예제 #1
0
def test_impute_nan():
    """Fit ``dp.ImputeNaN`` on the training frame, then apply it to the test frame.

    Each output must contain zero null entries. Both outputs are also run
    through the ``_check_equal_rows`` and ``_check_same_cols_and_order``
    helpers against their inputs / each other.
    """
    train_in, test_in, _ = _setup()
    imputer = dp.ImputeNaN()

    # Training side: fit and transform in a single step.
    train_out = imputer.fit_transform(train_in)
    remaining_train_nulls = train_out.isnull().sum().sum()
    assert remaining_train_nulls == 0
    _check_equal_rows(train_in, train_out)

    # Test side: reuse the transformer fitted above (no refit).
    test_out = imputer.transform(test_in)
    remaining_test_nulls = test_out.isnull().sum().sum()
    assert remaining_test_nulls == 0
    _check_equal_rows(test_in, test_out)

    _check_same_cols_and_order(train_out, test_out)
예제 #2
0
def test_permutation_importance_test():
    """Run ``dp.PermutationImportanceTest`` behind imputation and target-mean encoding.

    After fitting on the training data, the 'Name' and 'PassengerId'
    columns are expected to be absent from the output. The train and test
    outputs are then compared with the shared row / column helpers.
    """
    train_in, test_in, target = _setup()

    steps = (
        dp.ImputeNaN(),
        dp.TargetMeanEncoding(),
        dp.PermutationImportanceTest(),
    )
    pipeline = make_pipeline(*steps)

    train_out = pipeline.fit_transform(train_in, target)
    _check_equal_rows(train_in, train_out)

    # Columns that must not be present in the fitted output.
    for absent_column in ('Name', 'PassengerId'):
        _check_col_does_not_exist_in_df(train_out, absent_column)

    test_out = pipeline.transform(test_in)
    _check_equal_rows(test_in, test_out)

    _check_same_cols_and_order(train_out, test_out)
예제 #3
0
def test_append_cluster_target_mean():
    """Run ``dp.AppendClusterTargetMean`` behind imputation and target-mean encoding.

    The fitted output must expose a 'cluster_mean' column, and both the
    train and test outputs are checked against a column count of 12.
    """
    # Column count expected by this test for both train and test outputs.
    expected_n_cols = 12

    train_in, test_in, target = _setup()

    pipeline = make_pipeline(
        dp.ImputeNaN(),
        dp.TargetMeanEncoding(),
        dp.AppendClusterTargetMean(),
    )

    train_out = pipeline.fit_transform(train_in, target)
    _check_equal_rows(train_in, train_out)
    _check_number_of_cols_equal(train_out, expected_n_cols)
    _check_col_exist_in_df(train_out, 'cluster_mean')

    test_out = pipeline.transform(test_in)
    _check_equal_rows(test_in, test_out)
    _check_number_of_cols_equal(test_out, expected_n_cols)

    _check_same_cols_and_order(train_out, test_out)
예제 #4
0
def test_append_classification_model():
    """Run ``dp.AppendClassificationModel`` with ``probability`` set to True and False.

    For each setting a new ``RandomForestClassifier`` is wrapped and placed
    after imputation and target-mean encoding. The fitted output must
    contain a 'Predicted_RandomForestClassifier' column.
    """
    train_in, test_in, target = _setup()

    for use_probability in (True, False):
        # A fresh model and transformer are built for every setting.
        appender = dp.AppendClassificationModel(
            model=RandomForestClassifier(),
            probability=use_probability,
        )
        pipeline = make_pipeline(
            dp.ImputeNaN(),
            dp.TargetMeanEncoding(),
            appender,
        )

        train_out = pipeline.fit_transform(train_in, target)
        _check_equal_rows(train_in, train_out)
        _check_col_exist_in_df(train_out, 'Predicted_RandomForestClassifier')

        test_out = pipeline.transform(test_in)
        _check_equal_rows(test_in, test_out)
        _check_same_cols_and_order(train_out, test_out)
예제 #5
0
def test_pipelines():
    """Run a long preprocessing pipeline for every scaler / encoder pairing.

    Each combination of one scaler and one categorical encoder is slotted
    into the same fixed sequence of ``dp`` steps. The pipeline is fitted on
    the training data and applied to the test data, and the outputs go
    through the shared row / column helpers.
    """
    train_in, test_in, target = _setup()

    encoders = [
        dp.OneHotEncoding(),
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
    ]
    scalers = [dp.StandardScaling(), dp.MinMaxScaling()]

    # Scaler-major ordering; the same candidate instances are reused
    # across pipelines rather than rebuilt per combination.
    combinations = [
        (scaler, encoder) for scaler in scalers for encoder in encoders
    ]

    for scaler, encoder in combinations:
        pipeline = make_pipeline(
            dp.DropColumns(drop_columns="PassengerId"),
            dp.DropNoVariance(),
            dp.GroupRareCategory(),
            dp.ClipData(),
            dp.DropHighCardinality(),
            dp.BinarizeNaN(),
            dp.CountRowNaN(),
            dp.ImputeNaN(),
            encoder,
            dp.DropNoVariance(),
            dp.DropHighCorrelation(),
            scaler,
            dp.AppendAnomalyScore(),
            dp.AppendCluster(),
            dp.AppendClusterDistance(),
            dp.AppendPrincipalComponent(),
            dp.DropHighCorrelation(),
            dp.DropLowAUC(),
        )

        train_out = pipeline.fit_transform(train_in, target)
        test_out = pipeline.transform(test_in)

        _check_equal_rows(train_in, train_out)
        _check_equal_rows(test_in, test_out)

        _check_same_cols_and_order(train_out, test_out)
예제 #6
0
def test_arithmetic_feature_generator():
    """Run ``dp.ArithmeticFeatureGenerator`` for every metric / operation pairing.

    Metrics tried are 'roc_auc' and 'accuracy'; operations are 'add',
    'subtract', 'multiply' and 'divide'. Each generator runs after
    imputation, and the train / test outputs go through the shared
    row / column helpers.
    """
    train_in, test_in, target = _setup()

    operations = ['add', 'subtract', 'multiply', 'divide']
    metrics = ['roc_auc', 'accuracy']

    # Metric-major ordering: all operations are tried for one metric
    # before moving to the next.
    settings = [(metric, op) for metric in metrics for op in operations]

    for metric, op in settings:
        generator = dp.ArithmeticFeatureGenerator(metric=metric, operation=op)
        # The imputer is constructed inline, after the generator.
        pipeline = make_pipeline(dp.ImputeNaN(), generator)

        train_out = pipeline.fit_transform(train_in, target)
        test_out = pipeline.transform(test_in)

        _check_equal_rows(train_in, train_out)
        _check_equal_rows(test_in, test_out)

        _check_same_cols_and_order(train_out, test_out)
예제 #7
0
def test_cascaded_encoders():
    """Chain every categorical encoder in one pipeline after imputation.

    The pipeline is fitted on the training data and applied to the test
    data; both outputs go through the shared row / column helpers.
    """
    train_in, test_in, target = _setup()

    # Imputation first, then each encoder stacked in sequence.
    steps = (
        dp.ImputeNaN(),
        dp.OneHotEncoding(),
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
    )
    pipeline = make_pipeline(*steps)

    train_out = pipeline.fit_transform(train_in, target)
    test_out = pipeline.transform(test_in)

    _check_equal_rows(train_in, train_out)
    _check_equal_rows(test_in, test_out)

    _check_same_cols_and_order(train_out, test_out)