def test_permutation_importance_test():
    """Exercise PermutationImportanceTest behind imputation and encoding.

    The pipeline is fitted on the training frame and then applied to the
    test frame.  Row counts are compared with the inputs, the 'Name' and
    'PassengerId' columns are expected to be absent from the fitted output,
    and both outputs are compared for identical columns and column order.
    """
    X, X_test, y = _setup()

    steps = (
        dp.ImputeNaN(),
        dp.TargetMeanEncoding(),
        dp.PermutationImportanceTest(),
    )
    pipeline = make_pipeline(*steps)

    # Fit + transform on the training data.
    train_out = pipeline.fit_transform(X, y)
    _check_equal_rows(X, train_out)
    for absent_col in ('Name', 'PassengerId'):
        _check_col_does_not_exist_in_df(train_out, absent_col)

    # Transform-only on the test data.
    test_out = pipeline.transform(X_test)
    _check_equal_rows(X_test, test_out)

    _check_same_cols_and_order(train_out, test_out)
def test_append_cluster_target_mean():
    """Exercise AppendClusterTargetMean behind imputation and encoding.

    Both the fitted training output and the transformed test output are
    expected to have 12 columns and unchanged row counts; the training
    output must contain a 'cluster_mean' column, and both outputs must
    share the same columns in the same order.
    """
    X, X_test, y = _setup()
    expected_cols = 12

    pipeline = make_pipeline(
        dp.ImputeNaN(),
        dp.TargetMeanEncoding(),
        dp.AppendClusterTargetMean(),
    )

    # Training side.
    train_out = pipeline.fit_transform(X, y)
    _check_equal_rows(X, train_out)
    _check_number_of_cols_equal(train_out, expected_cols)
    _check_col_exist_in_df(train_out, 'cluster_mean')

    # Test side.
    test_out = pipeline.transform(X_test)
    _check_equal_rows(X_test, test_out)
    _check_number_of_cols_equal(test_out, expected_cols)

    _check_same_cols_and_order(train_out, test_out)
def test_append_classification_model():
    """Exercise AppendClassificationModel with probability on and off.

    For each setting a fresh RandomForestClassifier is wrapped and placed
    after imputation and target-mean encoding.  The fitted output must keep
    the row count and contain 'Predicted_RandomForestClassifier'; the
    transformed test output must keep its row count and match the training
    output's columns and order.
    """
    X, X_test, y = _setup()
    predicted_col = 'Predicted_RandomForestClassifier'

    for use_probability in (True, False):
        appender = dp.AppendClassificationModel(
            model=RandomForestClassifier(),
            probability=use_probability,
        )
        pipeline = make_pipeline(
            dp.ImputeNaN(),
            dp.TargetMeanEncoding(),
            appender,
        )

        train_out = pipeline.fit_transform(X, y)
        _check_equal_rows(X, train_out)
        _check_col_exist_in_df(train_out, predicted_col)

        test_out = pipeline.transform(X_test)
        _check_equal_rows(X_test, test_out)
        _check_same_cols_and_order(train_out, test_out)
def test_pipelines():
    """Run a long preprocessing pipeline for every encoder/scaler pairing.

    Each of the six categorical encoders is combined with each of the two
    scalers (scaler varies slowest).  For every combination the pipeline is
    fitted on the training frame and applied to the test frame; row counts
    must match the inputs and both outputs must share columns and order.
    """
    X, X_test, y = _setup()

    encoders = [
        dp.OneHotEncoding(),
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
    ]
    scalers = [dp.StandardScaling(), dp.MinMaxScaling()]

    def build_pipeline(encoder, scaler):
        # Same step order for every combination; only the encoder and the
        # scaler slots vary.
        return make_pipeline(
            dp.DropColumns(drop_columns="PassengerId"),
            dp.DropNoVariance(),
            dp.GroupRareCategory(),
            dp.ClipData(),
            dp.DropHighCardinality(),
            dp.BinarizeNaN(),
            dp.CountRowNaN(),
            dp.ImputeNaN(),
            encoder,
            dp.DropNoVariance(),
            dp.DropHighCorrelation(),
            scaler,
            dp.AppendAnomalyScore(),
            dp.AppendCluster(),
            dp.AppendClusterDistance(),
            dp.AppendPrincipalComponent(),
            dp.DropHighCorrelation(),
            dp.DropLowAUC(),
        )

    # Outer loop over scalers, inner loop over encoders.
    combos = [(sc, enc) for sc in scalers for enc in encoders]
    for sc, enc in combos:
        pipeline = build_pipeline(enc, sc)

        train_out = pipeline.fit_transform(X, y)
        test_out = pipeline.transform(X_test)

        _check_equal_rows(X, train_out)
        _check_equal_rows(X_test, test_out)

        _check_same_cols_and_order(train_out, test_out)
def test_target_mean_encoding():
    """Check TargetMeanEncoding's encoded column means and output shape.

    The encoder is fitted on the training frame and the mean of each encoded
    column ('Name', 'Sex', 'Cabin', 'Ticket', 'Embarked') is compared with a
    reference value.  Row and column counts of both the training and the
    test outputs are compared with their inputs, and both outputs must share
    the same columns in the same order.

    The means are compared with a relative tolerance rather than ``==``:
    the reference literals carry 17 significant digits, and a floating-point
    mean can differ in the last digits between library versions or
    platforms without the encoder being wrong.
    """
    import math  # local import keeps this block self-contained

    X, X_test, y = _setup()
    trans = dp.TargetMeanEncoding()

    Xt = trans.fit_transform(X, y)

    expected_means = {
        'Name': 0.38383838383838975,
        'Sex': 0.38383838383838054,
        'Cabin': 0.35791513764516214,
        'Ticket': 0.4306411823436723,
        'Embarked': 0.38367351680115463,
    }
    for col, expected in expected_means.items():
        actual = Xt[col].mean()
        assert math.isclose(actual, expected, rel_tol=1e-9), (
            "mean of {!r} is {!r}, expected {!r}".format(col, actual, expected)
        )

    _check_equal_rows(X, Xt)
    _check_equal_cols(X, Xt)

    Xt_test = trans.transform(X_test)
    _check_equal_rows(X_test, Xt_test)
    _check_equal_cols(X_test, Xt_test)

    _check_same_cols_and_order(Xt, Xt_test)
def test_cascaded_encoders():
    """Chain every encoder after imputation and check the output shape.

    The pipeline is fitted on the training frame and applied to the test
    frame; row counts must match the inputs and both outputs must share the
    same columns in the same order.
    """
    X, X_test, y = _setup()

    encoder_chain = (
        dp.ImputeNaN(),
        dp.OneHotEncoding(),
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
    )
    pipeline = make_pipeline(*encoder_chain)

    train_out = pipeline.fit_transform(X, y)
    test_out = pipeline.transform(X_test)

    _check_equal_rows(X, train_out)
    _check_equal_rows(X_test, test_out)

    _check_same_cols_and_order(train_out, test_out)
def test_append_encoder():
    """Check AppendEncoder's output width for each wrapped encoder.

    Six encoders are each expected to yield 16 columns when wrapped in
    AppendEncoder; OneHotEncoding is expected to yield 1736.  In every case
    the row counts must match the inputs, the test output must have the
    same column count as the training output, and both must share the same
    columns in the same order.
    """
    X, X_test, y = _setup()

    def verify(encoder, expected_cols):
        # Wrap the encoder, fit on train, transform test, and run the
        # shared shape checks.
        appender = dp.AppendEncoder(encoder)
        train_out = appender.fit_transform(X, y)

        _check_equal_rows(X, train_out)
        _check_number_of_cols_equal(train_out, expected_cols)

        test_out = appender.transform(X_test)
        _check_equal_rows(X_test, test_out)
        _check_number_of_cols_equal(test_out, expected_cols)

        _check_same_cols_and_order(train_out, test_out)

    single_column_encoders = [
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
        dp.RankedEvaluationMetricEncoding(),
    ]
    for candidate in single_column_encoders:
        verify(candidate, 16)

    # One-hot encoding is checked separately with its own expected width.
    verify(dp.OneHotEncoding(), 1736)