def test_train_model(dataset_path: str, target_name: str, conf_path: str):
    """Training on the prepared split should produce a LogisticRegression."""
    params = read_training_pipeline_params(conf_path)
    raw = read_data(dataset_path)
    features, target = extract_target(raw, target_name)
    transformed = full_transform(features)
    train_x, _val_x, train_y, _val_y = split_train_val_data(
        transformed, target, params.splitting_params
    )
    fitted = train_model(train_x, train_y, params.train_params)
    assert isinstance(fitted, LogisticRegression)
def test_split_train_val_data(dataset_path: str, target_name: str, conf_path: str):
    """Every part of the train/validation split must be non-empty."""
    params = read_training_pipeline_params(conf_path)
    frame = read_data(dataset_path)
    features, target = extract_target(frame, target_name)
    parts = split_train_val_data(features, target, params.splitting_params)
    # parts = (X_train, X_test, y_train, y_test)
    for part in parts:
        assert len(part) > 0
def test_predict_model(dataset_path: str, target_name: str, conf_path: str):
    """Predictions must cover both classes and probabilities stay below 1."""
    params = read_training_pipeline_params(conf_path)
    frame = read_data(dataset_path)
    features, target = extract_target(frame, target_name)
    transformed = full_transform(features)
    train_x, test_x, train_y, _test_y = split_train_val_data(
        transformed, target, params.splitting_params
    )
    fitted = train_model(train_x, train_y, params.train_params)
    labels, proba = predict_model(fitted, test_x)
    assert len(set(labels)) == 2
    assert max(proba) < 1
def test_train_pipeline(dataset_path: str, target_name: str, conf_path: str):
    """End-to-end run: metrics must beat the trivial baselines."""
    params = read_training_pipeline_params(conf_path)
    frame = read_data(dataset_path)
    features, target = extract_target(frame, target_name)
    transformed = full_transform(features)
    train_x, test_x, train_y, test_y = split_train_val_data(
        transformed, target, params.splitting_params
    )
    fitted = train_model(train_x, train_y, params.train_params)
    labels, proba = predict_model(fitted, test_x)
    metrics = evaluate_model(test_y, labels, proba)
    assert metrics['accuracy'] > 0
    assert metrics['roc_auc_score'] > 0.5
def train_pipeline_command(config_path: str):
    """CLI entry point: load pipeline parameters and launch training."""
    run_train_pipeline(read_training_pipeline_params(config_path))
def build_transformer(features: FeatureParams) -> ColumnTransformer:
    """Assemble a ColumnTransformer covering the numeric and categorical columns
    named in *features*."""
    return ColumnTransformer([
        ('numerical_part', numerical_pipeline(), features.numerical_features),
        ('categorical_part', categorical_pipeline(), features.categorical_features),
    ])


def make_features(data: pd.DataFrame, transformer: ColumnTransformer) -> pd.DataFrame:
    """Apply an already-fitted *transformer* to *data*."""
    return transformer.transform(data)


def extract_target(data: pd.DataFrame, target: FeatureParams) -> pd.Series:
    """Return the target column named by ``target.target_feature``."""
    return data[target.target_feature]


if __name__ == '__main__':
    from data.make_dataset import read_data
    from entities.train_pipeline_params import read_training_pipeline_params

    path_data = '../data/heart_nan.csv'
    path_config = '../config/train_config_forest.yaml'
    params = read_training_pipeline_params(path_config)
    dataset = read_data(path_data).drop(params.feature_params.target_feature, axis=1)
    transformer = build_transformer(params.feature_params)
    transformer.fit(dataset)
    data_transformed = make_features(dataset, transformer)
    # NOTE(review): this holds only if the transform is shape-preserving
    # (no one-hot expansion or column drops) — confirm against the
    # numerical/categorical pipelines actually configured.
    assert data_transformed.shape == dataset.shape
def train_pipeline(config_path: str):
    """Load the training pipeline parameters from *config_path* and run them."""
    return train_pipeline_run(read_training_pipeline_params(config_path))
import pandas as pd
from entities.split_params import SplittingParams
from typing import Tuple
from sklearn.model_selection import train_test_split


def read_data(path: str) -> pd.DataFrame:
    """Load a CSV dataset from *path* into a DataFrame."""
    return pd.read_csv(path)


def split_train_val_data(
        data: pd.DataFrame,
        split_params: SplittingParams) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split *data* into train and validation frames.

    :param data: full dataset to split.
    :param split_params: carries ``val_size`` (validation fraction) and
        ``random_state`` (seed for the shuffle).
    :return: ``(train_set, val_set)`` tuple of DataFrames.

    BUG FIX: the original read ``random_state`` / ``val_size`` from the
    ``SplittingParams`` *class* instead of the ``split_params`` argument,
    so the caller's configured values were silently ignored in favour of
    the class-level defaults.
    """
    train_set, val_set = train_test_split(
        data,
        random_state=split_params.random_state,
        test_size=split_params.val_size,
    )
    return train_set, val_set


if __name__ == '__main__':
    from entities.train_pipeline_params import read_training_pipeline_params

    params = read_training_pipeline_params(
        '../config/train_config_forest.yaml')
    path = 'heart.csv'
    data = read_data(path)
    train, val = split_train_val_data(data, params.splitting_params)
    print(train.shape)