예제 #1
0
def clean_data(X):
    """Clean, encode, rebalance and prune a labelled feature DataFrame.

    Steps, in order: drop rows whose ``target`` is missing, split the target
    off, drop ``ID``, convert ``v22`` with ``az_to_int``, median-impute the
    numeric columns, fill / rare-label-group / frequency-encode the object
    columns, winsorize, randomly undersample, then drop quasi-constant,
    duplicated and highly correlated features.

    Args:
        X: DataFrame with at least the ``target``, ``ID`` and ``v22``
            columns. It is copied first, so the caller's frame is left
            untouched.

    Returns:
        A new DataFrame holding the surviving features plus a ``target``
        column for the rows kept by the undersampler.
    """
    # Work on a private copy: the in-place operations below would otherwise
    # silently remove rows and columns from the caller's DataFrame.
    X = X.copy()
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    X['v22'] = X['v22'].apply(az_to_int)

    # Column groups are computed after the v22 conversion so that v22 lands
    # in whichever group its converted dtype belongs to.
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")

    # Skip a stage when its column group is empty instead of handing the
    # transformers a zero-column frame.
    if con_cols:
        X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    if cat_cols:
        X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
        X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
        X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # Cap extreme values on both tails at the configured quantiles.
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    # Rebalance the classes; the seed keeps the sampled rows reproducible.
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)

    # Remove quasi-constant features.
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features.
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    # Remove features that are highly correlated with another feature.
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlated features {drop_corr.features_to_drop_}")

    # NOTE(review): this assignment aligns on the index, so it relies on the
    # resampled features and target sharing the same index — confirm.
    X['target'] = y_resampled
    return X
def test_fit_attributes(df_duplicate_features):
    """Check the attributes learned by ``fit`` on the duplicate-features fixture."""
    expected_to_drop = {"dob", "dob3", "City2", "Age2"}
    expected_groups = [
        {"dob", "dob2", "dob3"},
        {"City", "City2"},
        {"Age", "Age2"},
    ]

    selector = DropDuplicateFeatures()
    selector.fit(df_duplicate_features)

    # The flat set of columns to remove and the groups of mutually
    # duplicated columns must both match.
    assert selector.features_to_drop_ == expected_to_drop
    assert selector.duplicated_feature_sets_ == expected_groups
예제 #3
0
def test_drop_duplicates_features(df_duplicate_features):
    """Check that ``fit_transform`` returns the fixture without its duplicated columns."""
    transformer = DropDuplicateFeatures()
    X = transformer.fit_transform(df_duplicate_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        # "min" is the supported spelling of the minute frequency; the
        # one-letter alias "T" is deprecated in recent pandas releases.
        "dob2": pd.date_range("2020-02-24", periods=4, freq="min"),
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
    })
    pd.testing.assert_frame_equal(X, df)
예제 #4
0
def test_with_df_with_na(df_duplicate_features_with_na):
    """Check duplicate removal and learned attributes on a fixture containing NaN values."""
    transformer = DropDuplicateFeatures()
    X = transformer.fit_transform(df_duplicate_features_with_na)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack", np.nan],
        # "min" is the supported spelling of the minute frequency; the
        # one-letter alias "T" is deprecated in recent pandas releases.
        "dob2": pd.date_range("2020-02-24", periods=5, freq="min"),
        "City": ["London", "Manchester", "Liverpool", "Bristol", np.nan],
        "Age": [20, 21, np.nan, 18, 34],
        "Marks": [0.9, 0.8, 0.7, 0.6, 0.5],
    })
    pd.testing.assert_frame_equal(X, df)

    # NOTE(review): ``duplicated_features_`` and ``input_shape_`` are kept as
    # written; they presumably match the library version this test targets —
    # confirm against the installed release.
    assert transformer.duplicated_features_ == {"dob", "dob3", "City2", "Age2"}
    assert transformer.duplicated_feature_sets_ == [
        {"dob", "dob2", "dob3"},
        {"City", "City2"},
        {"Age", "Age2"},
    ]
    assert transformer.input_shape_ == (5, 9)
예제 #5
0
def test_non_fitted_error(df_duplicate_features):
    """Calling ``transform`` on a transformer that was never fitted must raise ``NotFittedError``."""
    with pytest.raises(NotFittedError):
        DropDuplicateFeatures().transform(df_duplicate_features)
예제 #6
0
def test_error_if_fit_input_not_dataframe():
    """``fit`` must raise ``TypeError`` when handed a plain dict."""
    with pytest.raises(TypeError):
        not_a_frame = {"Name": ["Karthik"]}
        selector = DropDuplicateFeatures()
        selector.fit(not_a_frame)
예제 #7
0
def test_variables_assigned_correctly(df_duplicate_features):
    """``variables`` is ``None`` before fitting and lists every input column afterwards."""
    selector = DropDuplicateFeatures()

    # Before fit: no variables have been selected yet.
    assert selector.variables is None

    selector.fit(df_duplicate_features)

    # After fit: every column of the fixture, in its original order.
    all_columns = list(df_duplicate_features.columns)
    assert selector.variables == all_columns
예제 #8
0
def fake_columns(var_list, df):
    """Return the columns among ``var_list`` that ``DropDuplicateFeatures`` flags for removal.

    Args:
        var_list: Column labels of ``df`` to inspect.
        df: DataFrame holding those columns.

    Returns:
        A list built from the ``features_to_drop_`` attribute learned by
        fitting ``DropDuplicateFeatures`` on ``df[var_list]``.
    """
    fitted_selector = DropDuplicateFeatures().fit(df[var_list])
    return list(fitted_selector.features_to_drop_)
예제 #9
0
    DropFeatures,
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

# Shared classifier handed as ``estimator`` to the model-based selectors below.
# NOTE(review): C=0.0001 and max_iter=2 look chosen to keep fitting cheap
# rather than accurate — confirm.
_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

# One configured instance of each feature selector.
# NOTE(review): presumably consumed by parametrized checks elsewhere in this
# file — confirm against the code that iterates over it.
_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),