def test_split_dataset(dataset_path: str):
    # val_size = 0.25
    splitting_params = SplitParams()
    data = read_data(dataset_path)
    train, val = split_train_val_data(data, splitting_params)
    assert train.shape[0] == 75
    assert val.shape[0] == 25
def test_read_data_csv():
    generated_data = gen_synthetic_data(DEFAULT_DATA_SIZE)
    generated_data.to_csv(DEFAULT_GENERATED_DATA_NAME, index=False)
    data = read_data(DEFAULT_GENERATED_DATA_NAME)
    assert len(data) == DEFAULT_DATA_SIZE
    assert data.shape[1] == COLUMN_COUNT
    assert TARGET_COL_NAME in data.keys()
    os.remove(DEFAULT_GENERATED_DATA_NAME)
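# A minimal sketch of what gen_synthetic_data could look like, assuming it
# returns a pandas DataFrame with COLUMN_COUNT columns including a binary
# TARGET_COL_NAME column. The constant values and column layout here are
# assumptions for illustration, not the project's actual implementation.
import numpy as np
import pandas as pd

TARGET_COL_NAME = "target"   # assumed constant
COLUMN_COUNT = 3             # assumed: two features plus the target


def gen_synthetic_data(size: int) -> pd.DataFrame:
    rng = np.random.default_rng(42)
    return pd.DataFrame({
        "feature_num": rng.normal(size=size),            # numeric feature
        "feature_cat": rng.integers(0, 3, size=size),     # categorical feature
        TARGET_COL_NAME: rng.integers(0, 2, size=size),   # binary target
    })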
def test_split_dataset(fake_dataset, dataset_size, config_test):
    splitting_params = SplittingParams(
        random_state=config_test.splitting_random_state,
        val_size=config_test.splitting_val_size,
    )
    data = read_data(fake_dataset)
    train, val = split_train_val_data(data, splitting_params)
    assert len(train) + len(val) == dataset_size
    assert len(train) > 0
    assert len(val) > 0
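# The fixtures used above (fake_dataset, dataset_size, config_test) are not part
# of this excerpt. A minimal conftest.py sketch, assuming fake_dataset is the
# path to a small generated CSV and config_test carries the split settings;
# the names and values below are hypothetical.
import pandas as pd
import pytest
from types import SimpleNamespace


@pytest.fixture()
def dataset_size() -> int:
    return 100


@pytest.fixture()
def fake_dataset(tmp_path, dataset_size) -> str:
    # Write a tiny synthetic dataset to a temporary CSV and return its path.
    df = pd.DataFrame({
        "feature": range(dataset_size),
        "target": [i % 2 for i in range(dataset_size)],
    })
    path = tmp_path / "fake_dataset.csv"
    df.to_csv(path, index=False)
    return str(path)


@pytest.fixture()
def config_test():
    return SimpleNamespace(splitting_random_state=42, splitting_val_size=0.2)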
def test_split_dataset(tmpdir, dataset_path, strategy):
    val_size = 0.3
    random_state = 123
    data = read_data(dataset_path)
    splitting_params = OmegaConf.create({
        'val_size': val_size,
        'random_state': random_state,
    })
    train, val = split_train_val_data(data, strategy, splitting_params)
    assert train.shape[0] > 5
    if strategy == 'holdout':
        assert val.shape[0] > 5
    else:
        assert val.shape[0] == 0
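# A minimal sketch of a strategy-aware split_train_val_data compatible with the
# test above, assuming the 'holdout' strategy wraps sklearn's train_test_split
# and any other strategy keeps all rows for training with an empty validation
# frame. This is an illustration only; other snippets in this section use a
# two-argument variant without a strategy, so the real signature differs per repo.
import pandas as pd
from sklearn.model_selection import train_test_split


def split_train_val_data(data: pd.DataFrame, strategy: str, params):
    if strategy == 'holdout':
        train, val = train_test_split(
            data,
            test_size=params.val_size,
            random_state=params.random_state,
        )
        return train, val
    # e.g. cross-validation strategies: no held-out rows
    return data, data.iloc[0:0]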
def features_and_target(
    fake_dataset: str,
    target_col: str,
    categorical_features: List[str],
    numerical_features: List[str],
    features_to_drop: List[str],
) -> Tuple[pd.DataFrame, pd.Series]:
    params = FeatureParams(
        categorical_features=categorical_features,
        numerical_features=numerical_features,
        features_to_drop=features_to_drop,
        target_col=target_col,
    )
    data = read_data(fake_dataset)
    transformer = build_transformer(params)
    transformer.fit(data)
    features = make_features(transformer, data)
    target = extract_target(data, params)
    return features, target
def test_make_features(
    fake_dataset: str,
    target_col: str,
    config_test,
):
    def generate_cols_params():
        # Shuffle the column names and cut them into four random contiguous
        # groups (categorical / numerical / rank / dropped), plus a random
        # normalize flag.
        column_names = config_test.column_names.copy()
        random.shuffle(column_names)
        indices = [random.randint(0, len(column_names)) for _ in range(3)]
        indices.sort()
        param = (
            column_names[slice(0, indices[0])],
            column_names[slice(indices[0], indices[1])],
            column_names[slice(indices[1], indices[2])],
            column_names[slice(indices[2], len(column_names))],
            bool(random.randint(0, 1)),
        )
        return param

    data = read_data(fake_dataset)
    for _ in range(10):
        categorical_features, numerical_features, rank_features, features_to_drop, normalize = (
            generate_cols_params()
        )
        feature_params = FeatureParams(
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            rank_features=rank_features,
            features_to_drop=features_to_drop,
            normalize=normalize,
            target_col=target_col,
        )
        transformer = build_transformer(feature_params)
        transformer.fit(data)
        features = make_features(transformer, data)
        assert not pd.isnull(features).any().any()
        assert all(x not in features.columns for x in feature_params.features_to_drop)
        target = extract_target(data, feature_params)
        assert_allclose(data[feature_params.target_col].to_numpy(), target.to_numpy())
def train_pipeline(training_pipeline_params: TrainingPipelineParams):
    logger.info(f"Starting pipeline with parameters:\n{training_pipeline_params}")

    logger.info("Reading data...")
    data = read_data(training_pipeline_params.input_data_path)

    logger.info("Splitting data...")
    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params)

    logger.info("Preprocessing...")
    train_features = extract_features(train_df, training_pipeline_params.feature_params)
    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(train_features)
    serialize_transformer(transformer, training_pipeline_params.output_transformer_path)
    train_features = process_features(transformer, train_features)
    train_target = extract_target(train_df, training_pipeline_params.feature_params)

    val_features = extract_features(val_df, training_pipeline_params.feature_params)
    val_features = process_features(transformer, val_features)
    val_target = extract_target(val_df, training_pipeline_params.feature_params)

    logger.info("Training the model...")
    model = train_model(train_features, train_target, training_pipeline_params.train_params)

    logger.info("Computing metrics...")
    preds = predict_model(model, val_features)
    metrics = evaluate_model(preds, val_target)
    with open(training_pipeline_params.metric_path, mode="w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"Metric values: {metrics}")

    logger.info("Serializing the model...")
    serialize_model(model, training_pipeline_params.output_model_path)
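# The pipeline above pulls everything from a TrainingPipelineParams object. A
# minimal sketch of such a config container, inferred from the attribute
# accesses in the function; the exact fields, defaults, and nested classes are
# assumptions, not the project's actual definitions.
from dataclasses import dataclass, field
from typing import List


@dataclass
class SplittingParams:
    val_size: float = 0.2
    random_state: int = 42


@dataclass
class FeatureParams:
    categorical_features: List[str] = field(default_factory=list)
    numerical_features: List[str] = field(default_factory=list)
    features_to_drop: List[str] = field(default_factory=list)
    target_col: str = "target"


@dataclass
class TrainingParams:
    model_type: str = "LogisticRegression"
    random_state: int = 42


@dataclass
class TrainingPipelineParams:
    input_data_path: str
    output_model_path: str
    output_transformer_path: str
    metric_path: str
    splitting_params: SplittingParams
    feature_params: FeatureParams
    train_params: TrainingParams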
def predict_pipeline(
        predicting_pipeline_params: PredictingPipelineParams) -> None:
    logger.info(
        f"Starting pipeline with parameters:\n{predicting_pipeline_params}")

    logger.info("Reading data...")
    features = read_data(predicting_pipeline_params.input_data_path)

    logger.info("Loading the transformer...")
    transformer = load_transformer(predicting_pipeline_params.transformer_path)

    logger.info("Applying the transformer...")
    features = process_features(transformer, features)

    logger.info("Loading the model...")
    model = load_model(predicting_pipeline_params.model_path)

    logger.info("Applying the model...")
    preds = predict_model(model, features)

    logger.info("Saving predictions...")
    save_preds(preds, predicting_pipeline_params.output_preds_path)
def train_pipeline(params: TrainingPipelineParams):
    # logger.info('Downloading data from S3')
    # download_data(params.input_source_data, params.s3)

    logger.info('Reading data')
    input_data = read_data(params.input_source_data)

    logger.info('Preprocessing data')
    data = preprocess_data(input_data)
    logger.info(f'Data shape {data.shape}')

    # train_features, train_target = get_features_target(train_df)
    # val_features, val_target = get_features_target(val_df)

    logger.info('Data splitting')
    train_df, val_df = split_train_test_split(data, params.splitting_val_size)
    logger.info(f'train_df shape {train_df.shape}')
    logger.info(f'val_df shape {val_df.shape}')

    train_features, train_target = get_features_target(train_df, 'AHD')
    val_features, val_target = get_features_target(val_df, 'AHD')

    model_config = params.model
    logger.info(f'Training with model config: {model_config}')
    model = train_model(train_features, train_target, model_config)

    logger.info('Making predictions on validation data')
    predict = predict_model(model, val_features)

    logger.info('Calculating metrics')
    metrics = eval_model(predict, val_target)
    logger.info(f'Metrics {metrics}')

    logger.info('Saving metrics')
    save_metrics(metrics, model_config.save_folder, model_config.metrics_filename)

    logger.info('Saving model')
    serialize_model(model, model_config.save_folder, model_config.model_filename)
def test_load_dataset(dataset_path: str, target_col: str):
    data = read_data(dataset_path)
    assert len(data) == 100
    assert target_col in data.keys()
def test_load_dataset(fake_dataset, dataset_size):
    data = read_data(fake_dataset)
    assert len(data) == dataset_size
def test_process_categorical_features(dataset_path: str,
                                      categorical_features: List[str]):
    data = read_data(dataset_path)
    # print(data)
    df = process_categorical_features(data[categorical_features])
    assert df.shape[1] == 24
def data():
    generated_data = gen_synthetic_data(200)
    generated_data.to_csv(DEFAULT_GENERATED_DATA_NAME, index=False)
    data = read_data(DEFAULT_GENERATED_DATA_NAME)
    os.remove(DEFAULT_GENERATED_DATA_NAME)
    return data
def test_load_dataset(dataset_path):
    data = read_data(dataset_path)
    assert len(data) > 10
    assert "target" in data.keys()