예제 #1
0
def test_train_e2e(
    tmpdir: LocalPath,
    dataset_path: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
    features_to_drop: List[str],
):
    """Run the training pipeline end to end with a LogisticRegression model.

    All artifact paths are placed under ``tmpdir``. The test passes when the
    returned metrics contain a positive ``roc_auc`` and both the model file
    and the metrics file exist on disk afterwards.
    """
    model_file = tmpdir.join("model.pkl")
    metrics_file = tmpdir.join("metrics.json")
    predictions_file = tmpdir.join("data/predicted/predictions.csv")

    split_cfg = SplittingParams(val_size=0.2, random_state=1234)
    feature_cfg = FeatureParams(
        numerical_features=numerical_features,
        categorical_features=categorical_features,
        target_col=target_col,
        features_to_drop=features_to_drop,
        use_log_trick=False,
    )
    train_cfg = TrainingParams(model_type="LogisticRegression")

    params = TrainingPipelineParams(
        input_data_path=dataset_path,
        output_model_path=model_file,
        metric_path=metrics_file,
        # The pretrained-model path deliberately points at the same file the
        # pipeline writes its model to.
        pretrained_model_path=model_file,
        predictions_path=predictions_file,
        splitting_params=split_cfg,
        feature_params=feature_cfg,
        train_params=train_cfg,
    )

    real_model_path, metrics = train_pipeline(params, LogisticRegression())

    assert metrics["roc_auc"] > 0
    for artifact in (real_model_path, params.metric_path):
        assert os.path.exists(artifact)
def test_train_e2e(tmpdir: LocalPath, fake_dataset: str,
                   categorical_features: List[str],
                   numerical_features: List[str], target_col: str,
                   features_to_drop: List[str], config_test):
    """Train every configured model type end to end on the fake dataset.

    Columns listed as numerical are removed from both the categorical list
    and the drop list before the pipeline parameters are built. For each
    entry of ``config_test.model_types`` the pipeline is run and the test
    checks that the reported ``accuracy`` reaches ``config_test.min_accuracy``
    and that the model and metrics files exist.
    """
    numerical = set(numerical_features)
    # Filter with an order-preserving de-duplication instead of a set
    # difference: iterating a set of strings yields an order that changes
    # between interpreter runs (hash randomization), which would make the
    # feature column order -- and potentially the metrics -- nondeterministic.
    categorical_features = [
        name for name in dict.fromkeys(categorical_features)
        if name not in numerical
    ]
    features_to_drop = [
        name for name in dict.fromkeys(features_to_drop)
        if name not in numerical
    ]

    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_transformer_path = tmpdir.join("transformer.pkl")

    for model_type in config_test.model_types:
        params = TrainingPipelineParams(
            input_data_path=fake_dataset,
            input_data_url="",
            output_model_path=expected_output_model_path,
            metric_path=expected_metric_path,
            transformer_path=expected_transformer_path,
            splitting_params=SplittingParams(
                val_size=config_test.splitting_val_size,
                random_state=config_test.splitting_random_state,
            ),
            feature_params=FeatureParams(
                numerical_features=numerical_features,
                categorical_features=categorical_features,
                target_col=target_col,
                features_to_drop=features_to_drop,
            ),
            train_params=TrainingParams(model_type=model_type),
        )
        real_model_path, metrics = train_pipeline(params)

        # Name the model type in every failure message; without it a failing
        # iteration of this loop cannot be told apart from the others.
        assert metrics["accuracy"] >= config_test.min_accuracy, (
            f"model_type={model_type!r}: accuracy below configured minimum"
        )
        assert os.path.exists(real_model_path), (
            f"model_type={model_type!r}: model file was not written"
        )
        assert os.path.exists(params.metric_path), (
            f"model_type={model_type!r}: metrics file was not written"
        )
예제 #3
0
def parallel_train_pipeline(config, methods, env, eval_qnet, bhv_qnet, seedvec,
                            max_name_length):
    """Run one training pipeline and collect three error values per method.

    ``train_pipeline`` is called once; its ``results`` are matched to
    ``methods`` by position. For each method, ``error_info`` is given that
    method's result, the shared ``target`` and the method name left-justified
    to ``max_name_length``, and the three values it returns are stored.

    Returns:
        A tuple ``(mse, ind_mse, mse_w)`` of NumPy arrays, each with one
        entry per method.
    """
    n_methods = len(methods)
    mse = np.zeros(n_methods)
    ind_mse = np.zeros(n_methods)
    mse_w = np.zeros(n_methods)

    results, target = train_pipeline(env, config, eval_qnet, bhv_qnet, seedvec)

    for idx, method_name in enumerate(methods):
        padded_name = method_name.ljust(max_name_length)
        mse[idx], ind_mse[idx], mse_w[idx] = error_info(
            results[idx], target, padded_name)

    return mse, ind_mse, mse_w
예제 #4
0
    # Display names of the estimators being compared. The entries are matched
    # by position with the `results` returned from train_pipeline (see the
    # per-run loop below), so the order here must agree with that function.
    methods = ['Model', 'DR', 'WDR', 'Soft DR', 'Soft WDR',
               'Model Bsl', 'DR Bsl', 'WDR Bsl', 'Soft DR Bsl', 'Soft WDR Bsl',
               'Model MSE', 'DR MSE', 'WDR MSE', 'Soft DR MSE', 'Soft WDR MSE',
               'MRDR Q', 'MRDR', 'WMRDR', 'Soft MRDR', 'Soft WMRDR',
               'MRDR-w Q', 'MRDR-w', 'WMRDR-w', 'Soft MRDR-w', 'Soft WMRDR-w',
               'IS', 'WIS', 'Soft IS', 'Soft WIS', 'PDIS', 'WPDIS', 'Soft PDIS', 'Soft WPDIS']
    num_method = len(methods)
    # Length of the longest name; used to pad every name to a common width.
    max_name_length = len(max(methods,key=len))

    # One queue per method, each collecting one error value per run.
    mse = [deque() for method in methods]
    ind_mse = [deque() for method in methods]

    # Repeat the whole pipeline config.N times and record both error values
    # that error_info returns for every method in every run.
    for i_run in range(config.N):
        print('Run: {}'.format(i_run+1))
        results, target = train_pipeline(env,config,eval_qnet)
        for i_method in range(num_method):
            mse_1, mse_2 = error_info(results[i_method], target, methods[i_method].ljust(max_name_length))
            mse[i_method].append(mse_1)
            ind_mse[i_method].append(mse_2)

    # Summary table, one row per method. Columns:
    #   0: sqrt(mean of mse)      1: sqrt(std of mse)
    #   2: sqrt(mean of ind_mse)  3: sqrt(std of ind_mse)
    mse_table = np.zeros((num_method,4))
    print('Average result over {} runs:'.format(config.N))
    for i in range(num_method):
        # NOTE(review): the "±" terms are the square root of the standard
        # deviation of the collected values, not the standard deviation of
        # the root errors -- confirm this is the intended spread measure.
        print('{}: Root mse of mean is {:.3e}±{:.2e}, root mse of individual is {:.3e}±{:.2e}'
              .format(methods[i].ljust(max_name_length), np.sqrt(np.mean(mse[i])), np.sqrt(np.std(mse[i])),
                      np.sqrt(np.mean(ind_mse[i])), np.sqrt(np.std(ind_mse[i]))))
        mse_table[i, 0] = np.sqrt(np.mean(mse[i]))
        mse_table[i, 1] = np.sqrt(np.std(mse[i]))
        mse_table[i, 2] = np.sqrt(np.mean(ind_mse[i]))
        mse_table[i, 3] = np.sqrt(np.std(ind_mse[i]))