Example #1
import pandas as pd
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import CountFrequencyEncoder, RareLabelEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
)


def clean_data(X):
    # drop rows with no label, then split the target off the features
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    # az_to_int is a project-specific helper (not shown here) that maps
    # the alphanumeric v22 codes to integers
    X['v22'] = X['v22'].apply(az_to_int)
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])
    # more cleaning
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y = undersampler.fit_resample(X, y)
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi-constant features to drop: {quasi_constant.features_to_drop_}")
    # Remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets: {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features: {duplicates.features_to_drop_}")
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets: {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlated features: {drop_corr.features_to_drop_}")
    X['target'] = y
    return X
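
For context, a hedged sketch of how clean_data might be invoked; the train.csv path and the final split are illustrative assumptions, not part of the original snippet.

# hypothetical input file; the original example does not show data loading
raw = pd.read_csv("train.csv")
cleaned = clean_data(raw)

# the returned frame is fully numeric, undersampled, and carries the target
X_train = cleaned.drop(columns="target")
y_train = cleaned["target"]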
def test_drop_constant_features_with_list_of_variables(df_constant_features):
    # test case 3: drop features whose predominant value exceeds the 0.7
    # threshold, checking only the given variable list
    transformer = DropConstantFeatures(
        tol=0.7, variables=["Name", "const_feat_num", "quasi_feat_num"])
    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "const_feat_cat": ["a", "a", "a", "a"],
        "quasi_feat_cat": ["a", "a", "a", "b"],
    })

    # init params
    assert transformer.tol == 0.7
    assert transformer.variables == [
        "Name", "const_feat_num", "quasi_feat_num"
    ]

    # fit attr
    assert transformer.constant_features_ == [
        "const_feat_num", "quasi_feat_num"
    ]
    assert transformer.input_shape_ == (4, 9)

    # transform params
    pd.testing.assert_frame_equal(X, df)
def test_error_if_all_constant_and_quasi_constant_features():
    # test case 7: input contains only constant and quasi-constant features
    with pytest.raises(ValueError):
        transformer = DropConstantFeatures(tol=0.7)
        transformer.fit_transform(
            pd.DataFrame({
                "col1": [1, 1, 1, 1],
                "col2": [1, 1, 1, 1],
                "col3": [1, 1, 1, 2],
                "col4": [1, 1, 1, 2],
            }))
def test_error_if_input_all_constant_features():
    # test case 6: input contains only constant features
    with pytest.raises(ValueError):
        DropConstantFeatures().fit(
            pd.DataFrame({
                "col1": [1, 1, 1],
                "col2": [1, 1, 1]
            }))
Example #5
def test_drop_constant_and_quasiconstant_features(df_constant_features):
    transformer = DropConstantFeatures(tol=0.7, variables=None)
    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "City": ["London", "Manchester", "Liverpool", "Bristol"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
            "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        }
    )

    # init params
    assert transformer.tol == 0.7
    assert transformer.variables == [
        "Name",
        "City",
        "Age",
        "Marks",
        "dob",
        "const_feat_num",
        "const_feat_cat",
        "quasi_feat_num",
        "quasi_feat_cat",
    ]

    # fit attr
    assert transformer.constant_features_ == [
        "const_feat_num",
        "const_feat_cat",
        "quasi_feat_num",
        "quasi_feat_cat",
    ]
    assert transformer.input_shape_ == (4, 9)

    # transform params
    pd.testing.assert_frame_equal(X, df)
def test_drop_constant_features(df_constant_features):
    transformer = DropConstantFeatures(tol=1, variables=None)
    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "quasi_feat_num": [1, 1, 1, 2],
        "quasi_feat_cat": ["a", "a", "a", "b"],
    })

    # fit attribute
    assert transformer.features_to_drop_ == [
        "const_feat_num", "const_feat_cat"
    ]

    # transform output
    pd.testing.assert_frame_equal(X, df)
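These tests pivot on tol: a feature is flagged when its most frequent value accounts for at least tol of the observations, so tol=1 keeps only strictly constant features. A minimal toy sketch, separate from the test suite:

import pandas as pd
from feature_engine.selection import DropConstantFeatures

toy = pd.DataFrame({
    "constant": [1, 1, 1, 1],   # one value in 100% of rows
    "quasi":    [1, 1, 1, 2],   # one value in 75% of rows
    "varied":   [1, 2, 3, 4],
})

# with tol=0.7, any feature whose top value covers >= 70% of rows is dropped
sel = DropConstantFeatures(tol=0.7).fit(toy)
print(sel.features_to_drop_)      # ['constant', 'quasi']
print(list(sel.transform(toy)))   # ['varied']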
def test_missing_values_param():

    df = {
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "const_feat_num": [1, 1, 1, np.nan],
        "const_feat_cat": ["a", "a", "a", "a"],
        "quasi_feat_num": [1, 1, 1, 2],
        "quasi_feat_cat": ["a", "a", "a", np.nan],
    }
    df = pd.DataFrame(df)

    # test raises error if there is na
    with pytest.raises(ValueError):
        transformer = DropConstantFeatures(missing_values="raise")
        transformer.fit(df)

    # test ignores na
    transformer = DropConstantFeatures(missing_values="ignore").fit(df)
    constant = ["const_feat_num", "const_feat_cat", "quasi_feat_cat"]
    assert transformer.constant_features_ == constant
    pd.testing.assert_frame_equal(df.drop(constant, axis=1),
                                  transformer.transform(df))

    # test includes na
    transformer = DropConstantFeatures(tol=0.7,
                                       missing_values="include").fit(df)
    qconstant = [
        "const_feat_num", "const_feat_cat", "quasi_feat_num", "quasi_feat_cat"
    ]
    assert transformer.constant_features_ == qconstant
    pd.testing.assert_frame_equal(df.drop(qconstant, axis=1),
                                  transformer.transform(df))
Example #8
def create_pipeline(params: dict = None):
    """
    Create sklearn.pipeline.Pipeline

    Parameters
    ----------
    params : dict, optional
        Dictionary of parameters to set on the pipeline via ``set_params``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """

    # pipeline for numeric variables
    p_num = Pipeline([("num_nan_ind", AddMissingIndicator(missing_only=True)),
                      ("rmmean", MeanMedianImputer()),
                      ("drop_quasi_constant", DropConstantFeatures(tol=0.97))])

    # pipeline for categorical variables
    p_cat = Pipeline([("fill_cat_nas",
                       CategoricalImputer(fill_value='MISSING')),
                      ("rlc", RareLabelEncoder()),
                      ("one_hot_encoder", OneHotEncoder())])

    # list of pipelines to combine
    transformers = [("num", p_num,
                     make_column_selector(dtype_include=np.number)),
                    ("cat", p_cat, make_column_selector(dtype_include=object))]

    # combine pipelines and add XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([("col_transformers", col_transforms),
                  ("xgb",
                   XGBClassifier(min_child_weight=1,
                                 gamma=0,
                                 objective='binary:logistic',
                                 nthread=4,
                                 scale_pos_weight=1,
                                 seed=1,
                                 gpu_id=0,
                                 tree_method='gpu_hist'))])

    if params:
        p.set_params(**params)
    return p
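A brief usage sketch of create_pipeline; the nested parameter names follow sklearn's step__param convention for set_params, and X_train/y_train/X_test are placeholders, not data from the original snippet.

# override a couple of XGBoost hyperparameters through the params dict
pipe = create_pipeline({
    "xgb__max_depth": 4,
    "xgb__n_estimators": 200,
})
pipe.fit(X_train, y_train)                # placeholder training data
scores = pipe.predict_proba(X_test)[:, 1]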
def test_non_fitted_error(df_constant_features):
    # test case 8: when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        transformer = DropConstantFeatures()
        transformer.transform(df_constant_features)
@pytest.mark.parametrize("tol", [0.05, 0.5, 0.8, 1])  # illustrative values; the original parametrization is not shown
def test_tol_init_param(tol):
    sel = DropConstantFeatures(tol=tol)
    assert sel.tol == tol
def test_error_if_missing_values_not_permitted():
    # test: missing_values must be one of 'raise', 'ignore' or 'include'
    with pytest.raises(ValueError):
        DropConstantFeatures(missing_values="hola")
def test_error_if_tol_is_string():
    # test case 5: tol is a string, not a number between 0 and 1
    with pytest.raises(ValueError):
        DropConstantFeatures(tol="hola")
def test_error_if_tol_out_of_range():
    # test case 5: threshold not between 0 and 1
    with pytest.raises(ValueError):
        DropConstantFeatures(tol=2)
def test_error_if_fit_input_not_df():
    # test case 4: input is not a dataframe
    with pytest.raises(TypeError):
        DropConstantFeatures().fit({"Name": ["Karthik"]})
@pytest.mark.parametrize("tol", [-0.5, 0, 1.5])  # illustrative disallowed values; the original parametrization is not shown
def test_error_if_tol_value_not_allowed(tol):
    # test case 5: threshold not between 0 and 1
    with pytest.raises(ValueError):
        DropConstantFeatures(tol=tol)
Example #16
import pytest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    DropFeatures,
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
]


@pytest.mark.parametrize(
    "Estimator",
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        SelectByShuffling(RandomForestClassifier(random_state=1),
                          scoring="accuracy"),
        SelectBySingleFeaturePerformance(
            RandomForestClassifier(random_state=1), scoring="accuracy"),
        RecursiveFeatureAddition(RandomForestClassifier(random_state=1),
                                 scoring="accuracy"),
        RecursiveFeatureElimination(RandomForestClassifier(random_state=1),
                                    scoring="accuracy"),
        SelectByTargetMeanPerformance(scoring="r2_score", bins=3),
    ],
)
def test_all_transformers(Estimator):