def test_train_model(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)

    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)
    model = train_model(X_train, y_train,
                        training_pipeline_params.train_params)
    assert isinstance(model, LogisticRegression)
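
These tests request dataset_path, target_name and conf_path as pytest fixtures; a minimal conftest.py sketch follows (the concrete paths and the label column name are assumptions, not taken from the source).

import pytest


@pytest.fixture()
def dataset_path() -> str:
    return 'data/heart.csv'  # hypothetical dataset location


@pytest.fixture()
def target_name() -> str:
    return 'target'  # hypothetical label column name


@pytest.fixture()
def conf_path() -> str:
    return 'config/train_config.yaml'  # hypothetical config path

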
def test_split_train_val_data(dataset_path: str, target_name: str,
                              conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)

    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X, y, training_pipeline_params.splitting_params)
    assert len(X_train) > 0
    assert len(X_test) > 0
    assert len(y_train) > 0
    assert len(y_test) > 0


def test_predict_model(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)

    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)

    model = train_model(X_train, y_train,
                        training_pipeline_params.train_params)
    pred_labels, pred_proba = predict_model(model, X_test)
    assert len(set(pred_labels)) == 2  # both classes should be predicted
    assert max(pred_proba) <= 1  # probabilities must not exceed 1
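
The test above unpacks predict_model into hard labels and probabilities; one plausible shape for that helper, assuming it returns the positive-class column of predict_proba, is:

def predict_model(model, features):
    # sketch only: hard labels plus positive-class probabilities
    pred_labels = model.predict(features)
    pred_proba = model.predict_proba(features)[:, 1]
    return pred_labels, pred_proba
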
Example #4

def test_train_pipeline(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)

    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)
    model = train_model(X_train, y_train,
                        training_pipeline_params.train_params)
    pred_labels, pred_proba = predict_model(model, X_test)

    res = evaluate_model(y_test, pred_labels, pred_proba)
    assert res['accuracy'] > 0
    assert res['roc_auc_score'] > 0.5
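
evaluate_model is asserted to return 'accuracy' and 'roc_auc_score' keys; a hypothetical implementation consistent with those checks could look like this:

from sklearn.metrics import accuracy_score, roc_auc_score


def evaluate_model(y_true, pred_labels, pred_proba):
    # sketch: the dict keys mirror those asserted in the test above
    return {
        'accuracy': accuracy_score(y_true, pred_labels),
        'roc_auc_score': roc_auc_score(y_true, pred_proba),
    }
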
def train_pipeline_command(config_path: str):
    params = read_training_pipeline_params(config_path)
    run_train_pipeline(params)
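
A thin script entry point for the command above might look like this (sys.argv handling is an assumption; the original project may use click or argparse):

import sys

if __name__ == '__main__':
    train_pipeline_command(sys.argv[1])
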
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer


def build_transformer(features: FeatureParams) -> ColumnTransformer:
    transformer = ColumnTransformer([
        ('numerical_part', numerical_pipeline(), features.numerical_features),
        ('categorical_part', categorical_pipeline(),
         features.categorical_features)
    ])
    return transformer
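
Plausible definitions for the two helper pipelines referenced above; the exact steps are assumptions based on common practice. OrdinalEncoder is used here because it keeps the column count unchanged, which the shape assertion in the __main__ block below relies on (one-hot encoding would widen the matrix):

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler


def numerical_pipeline() -> Pipeline:
    # impute missing values with the mean, then standardize
    return Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ])


def categorical_pipeline() -> Pipeline:
    # impute with the most frequent value, then ordinal-encode
    return Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OrdinalEncoder()),
    ])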


def make_features(data: pd.DataFrame,
                  transformer: ColumnTransformer) -> np.ndarray:
    # ColumnTransformer.transform returns an ndarray, not a DataFrame
    return transformer.transform(data)


def extract_target(data: pd.DataFrame, target: FeatureParams) -> pd.Series:
    return data[target.target_feature]
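
A minimal FeatureParams sketch consistent with the attributes accessed above; any field beyond these three is an assumption:

from dataclasses import dataclass, field
from typing import List


@dataclass()
class FeatureParams:
    numerical_features: List[str] = field(default_factory=list)
    categorical_features: List[str] = field(default_factory=list)
    target_feature: str = 'target'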


if __name__ == '__main__':
    from data.make_dataset import read_data
    from entities.train_pipeline_params import read_training_pipeline_params
    path_data = '../data/heart_nan.csv'
    path_config = '../config/train_config_forest.yaml'
    params = read_training_pipeline_params(path_config)
    dataset = read_data(path_data).drop(params.feature_params.target_feature,
                                        axis=1)
    transformer = build_transformer(params.feature_params)
    transformer.fit(dataset)
    data_transformed = make_features(dataset, transformer)
    # this only holds while the pipelines preserve the column count
    # (e.g. ordinal encoding); one-hot encoding would change the width
    assert data_transformed.shape == dataset.shape


def train_pipeline(config_path: str):
    training_pipeline_params = read_training_pipeline_params(config_path)
    return train_pipeline_run(training_pipeline_params)

Example #8

from typing import Tuple

import pandas as pd
from sklearn.model_selection import train_test_split

from entities.split_params import SplittingParams


def read_data(path: str) -> pd.DataFrame:
    data = pd.read_csv(path)
    return data


def split_train_val_data(
        data: pd.DataFrame,
        split_params: SplittingParams) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # read the settings from the passed-in instance, not the class itself
    random_state = split_params.random_state
    val_size = split_params.val_size
    train_set, val_set = train_test_split(data,
                                          random_state=random_state,
                                          test_size=val_size)
    return train_set, val_set
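
For reference, entities.split_params presumably defines something close to the following dataclass; the default values shown here are assumptions:

from dataclasses import dataclass


@dataclass()
class SplittingParams:
    val_size: float = 0.2
    random_state: int = 42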


if __name__ == '__main__':
    from entities.train_pipeline_params import read_training_pipeline_params
    params = read_training_pipeline_params(
        '../config/train_config_forest.yaml')
    path = 'heart.csv'
    data = read_data(path)
    train, val = split_train_val_data(data, params.splitting_params)
    print(train.shape)