Example #1
def test_split_dataset(dataset_path: str):
    # SplitParams() defaults to val_size = 0.25, so a 100-row dataset splits into 75/25
    splitting_params = SplitParams()
    data = read_data(dataset_path)
    train, val = split_train_val_data(data, splitting_params)
    assert train.shape[0] == 75
    assert val.shape[0] == 25
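The 75/25 assertion above only holds if the dataset at dataset_path has 100 rows and SplitParams defaults to val_size = 0.25. A minimal sketch of what SplitParams and split_train_val_data might look like under those assumptions (the internals are not taken from the original project):

from dataclasses import dataclass
from typing import Tuple

import pandas as pd
from sklearn.model_selection import train_test_split


@dataclass
class SplitParams:
    val_size: float = 0.25  # assumed default, matching the 75/25 split above
    random_state: int = 42


def split_train_val_data(data: pd.DataFrame,
                         params: SplitParams) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # thin wrapper around sklearn's train_test_split
    train, val = train_test_split(data,
                                  test_size=params.val_size,
                                  random_state=params.random_state)
    return train, val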
Example #2
def test_read_data_csv():
    generated_data = gen_synthetic_data(DEFAULT_DATA_SIZE)
    generated_data.to_csv(DEFAULT_GENERATED_DATA_NAME, index=False)
    data = read_data(DEFAULT_GENERATED_DATA_NAME)
    assert len(data) == DEFAULT_DATA_SIZE
    assert data.shape[1] == COLUMN_COUNT
    assert TARGET_COL_NAME in data.keys()
    os.remove(DEFAULT_GENERATED_DATA_NAME)
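For test_read_data_csv to pass, gen_synthetic_data has to return a frame with DEFAULT_DATA_SIZE rows, COLUMN_COUNT columns, and a TARGET_COL_NAME column. A rough sketch under those assumptions; the constant values and the feature columns are invented for illustration:

import numpy as np
import pandas as pd

DEFAULT_DATA_SIZE = 100
DEFAULT_GENERATED_DATA_NAME = "generated_data.csv"
TARGET_COL_NAME = "target"
COLUMN_COUNT = 3  # two features plus the target; the real project may differ


def gen_synthetic_data(size: int) -> pd.DataFrame:
    rng = np.random.default_rng(0)
    return pd.DataFrame({
        "feature_1": rng.normal(size=size),
        "feature_2": rng.normal(size=size),
        TARGET_COL_NAME: rng.integers(0, 2, size=size),
    })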
Example #3
def test_split_dataset(fake_dataset, dataset_size, config_test):
    splitting_params = SplittingParams(
        random_state=config_test.splitting_random_state,
        val_size=config_test.splitting_val_size,
    )
    data = read_data(fake_dataset)
    train, val = split_train_val_data(data, splitting_params)
    assert len(train) + len(val) == dataset_size
    assert len(train) > 0
    assert len(val) > 0
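The test above receives fake_dataset, dataset_size, and config_test as pytest fixtures. A hypothetical conftest.py sketch that would satisfy it; the fixture names match the test, but the generated columns and config values are assumptions:

from types import SimpleNamespace

import numpy as np
import pandas as pd
import pytest


@pytest.fixture()
def dataset_size() -> int:
    return 100


@pytest.fixture()
def fake_dataset(tmp_path, dataset_size) -> str:
    # write a small synthetic CSV and hand its path to the test
    rng = np.random.default_rng(42)
    df = pd.DataFrame({
        "feature": rng.normal(size=dataset_size),
        "target": rng.integers(0, 2, size=dataset_size),
    })
    path = tmp_path / "fake_dataset.csv"
    df.to_csv(path, index=False)
    return str(path)


@pytest.fixture()
def config_test():
    # minimal stand-in for the test config object used above
    return SimpleNamespace(splitting_random_state=42, splitting_val_size=0.2)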
Example #4
def test_split_dataset(tmpdir, dataset_path, strategy):
    val_size = 0.3
    random_state = 123
    data = read_data(dataset_path)

    splitting_params = OmegaConf.create({
        'val_size': val_size,
        'random_state': random_state
    })
    train, val = split_train_val_data(data, strategy, splitting_params)

    assert train.shape[0] > 5

    if strategy == 'holdout':
        assert val.shape[0] > 5
    else:
        assert val.shape[0] == 0
Example #5
def features_and_target(
    fake_dataset: str,
    target_col: str,
    categorical_features: List[str],
    numerical_features: List[str],
    features_to_drop: List[str],
) -> Tuple[pd.DataFrame, pd.Series]:
    params = FeatureParams(
        categorical_features=categorical_features,
        numerical_features=numerical_features,
        features_to_drop=features_to_drop,
        target_col=target_col,
    )
    data = read_data(fake_dataset)
    transformer = build_transformer(params)
    transformer.fit(data)
    features = make_features(transformer, data)
    target = extract_target(data, params)
    return features, target
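A minimal sketch of the feature helpers that features_and_target relies on, assuming an sklearn ColumnTransformer under the hood; the FeatureParams fields mirror the call above, the rank_features and normalize options used in the next test are omitted, and all internals are assumptions rather than the original implementation:

from dataclasses import dataclass, field
from typing import List

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


@dataclass
class FeatureParams:
    categorical_features: List[str] = field(default_factory=list)
    numerical_features: List[str] = field(default_factory=list)
    features_to_drop: List[str] = field(default_factory=list)
    target_col: str = "target"


def build_transformer(params: FeatureParams) -> ColumnTransformer:
    # columns listed in features_to_drop are simply never mentioned here,
    # so they do not reach the output
    return ColumnTransformer([
        ("categorical",
         Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                   ("ohe", OneHotEncoder(handle_unknown="ignore"))]),
         params.categorical_features),
        ("numerical",
         Pipeline([("impute", SimpleImputer(strategy="mean")),
                   ("scale", StandardScaler())]),
         params.numerical_features),
    ])


def make_features(transformer: ColumnTransformer,
                  data: pd.DataFrame) -> pd.DataFrame:
    transformed = transformer.transform(data)
    if hasattr(transformed, "toarray"):  # OneHotEncoder may return a sparse matrix
        transformed = transformed.toarray()
    return pd.DataFrame(transformed)


def extract_target(data: pd.DataFrame, params: FeatureParams) -> pd.Series:
    return data[params.target_col]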
Example #6
def test_make_features(
    fake_dataset: str,
    target_col: str,
    config_test,
):
    def generate_cols_params():
        column_names = config_test.column_names.copy()
        random.shuffle(column_names)
        indices = [random.randint(0, len(column_names)) for _ in range(3)]
        indices.sort()
        param = (
            column_names[slice(0, indices[0])],
            column_names[slice(indices[0], indices[1])],
            column_names[slice(indices[1], indices[2])],
            column_names[slice(indices[2], len(column_names))],
            bool(random.randint(0, 1)),
        )
        return param

    data = read_data(fake_dataset)

    for _ in range(10):
        (categorical_features, numerical_features, rank_features,
         features_to_drop, normalize) = generate_cols_params()
        feature_params = FeatureParams(
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            rank_features=rank_features,
            features_to_drop=features_to_drop,
            normalize=normalize,
            target_col=target_col,
        )
        transformer = build_transformer(feature_params)
        transformer.fit(data)
        features = make_features(transformer, data)
        assert not pd.isnull(features).any().any()
        assert all(x not in features.columns
                   for x in feature_params.features_to_drop)

        target = extract_target(data, feature_params)
        assert_allclose(data[feature_params.target_col].to_numpy(),
                        target.to_numpy())
Example #7
def train_pipeline(training_pipeline_params: TrainingPipelineParams):
    logger.info(f"Старт пайплайна с параметрами:\n{training_pipeline_params}")

    logger.info("Чтение данных...")
    data = read_data(training_pipeline_params.input_data_path)

    logger.info("Сплит данных...")
    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params)

    logger.info("Препроцессинг...")
    train_features = extract_features(train_df,
                                      training_pipeline_params.feature_params)
    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(train_features)
    serialize_transformer(transformer,
                          training_pipeline_params.output_transformer_path)

    train_features = process_features(transformer, train_features)
    train_target = extract_target(train_df,
                                  training_pipeline_params.feature_params)

    val_features = extract_features(val_df,
                                    training_pipeline_params.feature_params)
    val_features = process_features(transformer, val_features)
    val_target = extract_target(val_df,
                                training_pipeline_params.feature_params)

    logger.info("Обучение модели...")
    model = train_model(train_features, train_target,
                        training_pipeline_params.train_params)

    logger.info("Подсчет метрик...")
    preds = predict_model(model, val_features)
    metrics = evaluate_model(preds, val_target)
    with open(training_pipeline_params.metric_path, mode="w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"Значения метрик: {metrics}")

    logger.info("Сериализация модели...")
    serialize_model(model, training_pipeline_params.output_model_path)
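A hypothetical command-line wrapper for train_pipeline; read_training_pipeline_params is assumed to exist in the project and to parse a YAML config into TrainingPipelineParams:

import click


@click.command()
@click.argument("config_path")
def train_pipeline_command(config_path: str):
    # read_training_pipeline_params is an assumed project helper, not shown here
    params = read_training_pipeline_params(config_path)
    train_pipeline(params)


if __name__ == "__main__":
    train_pipeline_command()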
Example #8
def predict_pipeline(
        predicting_pipeline_params: PredictingPipelineParams) -> None:
    logger.info(
        f"Starting pipeline with parameters:\n{predicting_pipeline_params}")

    logger.info("Чтение данных...")
    features = read_data(predicting_pipeline_params.input_data_path)

    logger.info("Загрузка трансформера...")
    transformer = load_transformer(predicting_pipeline_params.transformer_path)

    logger.info("Применение трансформера...")
    features = process_features(transformer, features)

    logger.info("Загрузка модели...")
    model = load_model(predicting_pipeline_params.model_path)

    logger.info("Применение модели...")
    preds = predict_model(model, features)

    logger.info("Сохранение предсказаний...")
    save_preds(preds, predicting_pipeline_params.output_preds_path)
Example #9
def train_pipeline(params: TrainingPipelineParams):
    # logger.info('Downloading data from S3')
    # download_data(params.input_source_data, params.s3)

    logger.info('Reading data')
    input_data = read_data(params.input_source_data)

    logger.info('Preprocessing data')
    data = preprocess_data(input_data)
    logger.info(f'Data shape {data.shape}')

    # train_features, train_target = get_features_target(train_df)
    # val_features, val_target = get_features_target(val_df)

    logger.info('Data splitting')
    train_df, val_df = split_train_test_split(data, params.splitting_val_size)
    logger.info(f'train_df shape {train_df.shape}')
    logger.info(f'val_df shape {val_df.shape}')

    train_features, train_target = get_features_target(train_df, 'AHD')
    val_features, val_target = get_features_target(val_df, 'AHD')

    model_config = params.model
    logger.info(f'Training with model config: {model_config}')
    model = train_model(train_features, train_target, model_config)

    logger.info('Making predictions on validation data')
    predict = predict_model(model, val_features)
    logger.info('Calculating metrics')
    metrics = eval_model(predict, val_target)
    logger.info(f'Metrics {metrics}')

    logger.info('Saving metrics')
    save_metrics(metrics, model_config.save_folder, model_config.metrics_filename)

    logger.info('Saving model')
    serialize_model(model, model_config.save_folder, model_config.model_filename)
Example #10
def test_load_dataset(dataset_path: str, target_col: str):
    data = read_data(dataset_path)
    assert len(data) == 100
    assert target_col in data.keys()
Example #11
def test_load_dataset(fake_dataset, dataset_size):
    data = read_data(fake_dataset)
    assert len(data) == dataset_size
Example #12
def test_process_categorical_features(dataset_path: str,
                                      categorical_features: List[str]):
    data = read_data(dataset_path)
    # processing the categorical columns is expected to produce 24 feature columns
    df = process_categorical_features(data[categorical_features])
    assert df.shape[1] == 24
Example #13
def data():
    generated_data = gen_synthetic_data(200)
    generated_data.to_csv(DEFAULT_GENERATED_DATA_NAME, index=False)
    data = read_data(DEFAULT_GENERATED_DATA_NAME)
    os.remove(DEFAULT_GENERATED_DATA_NAME)
    return data
def test_load_dataset(dataset_path):
    data = read_data(dataset_path)
    assert len(data) > 10
    assert "target" in data.keys()