from feature_engine.encoding import CountFrequencyEncoder, RareLabelEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.outliers import Winsorizer
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
)
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer


def clean_data(X):
    # drop rows without a label, separate the target, drop the identifier
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)

    # convert the alphanumeric 'v22' codes to integers
    X['v22'] = X['v22'].apply(az_to_int)

    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    # impute missing values, group rare labels, then encode categoricals by frequency
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10,
                                          replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # cap outliers at the 0.5% and 99.5% quantiles
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    # rebalance the classes
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y = undersampler.fit_resample(X, y)

    # drop quasi-constant features
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi-constant features to drop: {quasi_constant.features_to_drop_}")

    # remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets: {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features: {duplicates.features_to_drop_}")

    # remove highly correlated features
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95,
                                       missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Correlated feature sets: {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlated features: {drop_corr.features_to_drop_}")

    X['target'] = y
    return X
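# NOTE: az_to_int() is called above but not defined in this excerpt. A minimal
# sketch of what such a helper might look like, assuming the 'v22' column holds
# Excel-style letter codes (A ... Z, AA, AB, ...) that map to bijective base-26
# integers; the original helper may differ.
import numpy as np
import pandas as pd


def az_to_int(az):
    """Convert an Excel-style letter code to an integer (A=1, ..., Z=26, AA=27)."""
    if pd.isna(az):
        # keep missing codes missing so the median imputer can handle them later
        return np.nan
    result = 0
    for char in az:
        result = result * 26 + (ord(char.upper()) - ord('A') + 1)
    return result


# With a helper like this in place, the cleaning function can be applied to a
# raw training frame, e.g. train = clean_data(pd.read_csv("train.csv"))
# (the file name is illustrative).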
def test_drop_constant_features_with_list_of_variables(df_constant_features):
    # test case 3: drop features whose predominant value exceeds tol=0.7,
    # restricted to a user-passed variable list
    transformer = DropConstantFeatures(
        tol=0.7, variables=["Name", "const_feat_num", "quasi_feat_num"])

    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "const_feat_cat": ["a", "a", "a", "a"],
        "quasi_feat_cat": ["a", "a", "a", "b"],
    })

    # init params
    assert transformer.tol == 0.7
    assert transformer.variables == ["Name", "const_feat_num", "quasi_feat_num"]

    # fit attr
    assert transformer.constant_features_ == ["const_feat_num", "quasi_feat_num"]
    assert transformer.input_shape_ == (4, 9)

    # transform output
    pd.testing.assert_frame_equal(X, df)
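# NOTE: these tests share a df_constant_features fixture that this excerpt does
# not include. A sketch reconstructed from the expected outputs and the
# input_shape_ == (4, 9) assertions; the exact values of const_feat_num and the
# fixture scope are assumptions.
import pandas as pd
import pytest


@pytest.fixture
def df_constant_features():
    return pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "const_feat_num": [1, 1, 1, 1],          # fully constant numeric
        "const_feat_cat": ["a", "a", "a", "a"],  # fully constant categorical
        "quasi_feat_num": [1, 1, 1, 2],          # 75% single value
        "quasi_feat_cat": ["a", "a", "a", "b"],  # 75% single value
    })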
def test_error_if_all_constant_and_quasi_constant_features():
    # test case 7: when input contains only constant and quasi-constant features
    with pytest.raises(ValueError):
        transformer = DropConstantFeatures(tol=0.7)
        transformer.fit_transform(
            pd.DataFrame({
                "col1": [1, 1, 1, 1],
                "col2": [1, 1, 1, 1],
                "col3": [1, 1, 1, 2],
                "col4": [1, 1, 1, 2],
            }))
def test_error_if_input_all_constant_features():
    # test case 6: when input contains only constant features
    with pytest.raises(ValueError):
        DropConstantFeatures().fit(
            pd.DataFrame({
                "col1": [1, 1, 1],
                "col2": [1, 1, 1],
            }))
def test_drop_constant_and_quasiconstant_features(df_constant_features):
    transformer = DropConstantFeatures(tol=0.7, variables=None)

    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
    })

    # init params
    assert transformer.tol == 0.7
    assert transformer.variables == [
        "Name", "City", "Age", "Marks", "dob",
        "const_feat_num", "const_feat_cat",
        "quasi_feat_num", "quasi_feat_cat",
    ]

    # fit attr
    assert transformer.constant_features_ == [
        "const_feat_num", "const_feat_cat",
        "quasi_feat_num", "quasi_feat_cat",
    ]
    assert transformer.input_shape_ == (4, 9)

    # transform output
    pd.testing.assert_frame_equal(X, df)
def test_drop_constant_features(df_constant_features):
    transformer = DropConstantFeatures(tol=1, variables=None)

    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "quasi_feat_num": [1, 1, 1, 2],
        "quasi_feat_cat": ["a", "a", "a", "b"],
    })

    # fit attribute
    assert transformer.features_to_drop_ == ["const_feat_num", "const_feat_cat"]

    # transform output
    pd.testing.assert_frame_equal(X, df)
def test_missing_values_param():
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        "const_feat_num": [1, 1, 1, np.nan],
        "const_feat_cat": ["a", "a", "a", "a"],
        "quasi_feat_num": [1, 1, 1, 2],
        "quasi_feat_cat": ["a", "a", "a", np.nan],
    })

    # test raises error if there is na
    with pytest.raises(ValueError):
        transformer = DropConstantFeatures(missing_values="raise")
        transformer.fit(df)

    # test ignores na
    transformer = DropConstantFeatures(missing_values="ignore").fit(df)
    constant = ["const_feat_num", "const_feat_cat", "quasi_feat_cat"]
    assert transformer.constant_features_ == constant
    pd.testing.assert_frame_equal(df.drop(constant, axis=1),
                                  transformer.transform(df))

    # test includes na
    transformer = DropConstantFeatures(tol=0.7, missing_values="include").fit(df)
    qconstant = [
        "const_feat_num", "const_feat_cat", "quasi_feat_num", "quasi_feat_cat"
    ]
    assert transformer.constant_features_ == qconstant
    pd.testing.assert_frame_equal(df.drop(qconstant, axis=1),
                                  transformer.transform(df))
def create_pipeline(params: dict = None):
    """
    Create a sklearn.pipeline.Pipeline.

    Parameters
    ----------
    params : dict
        Dictionary of parameters for the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # pipeline for numeric variables
    p_num = Pipeline([
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ])

    # pipeline for categorical variables
    p_cat = Pipeline([
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ])

    # list of (name, pipeline, column selector) tuples to combine
    transformers = [
        ("num", p_num, make_column_selector(dtype_include=np.number)),
        ("cat", p_cat, make_column_selector(dtype_include=object)),
    ]

    # combine pipelines and add an XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([
        ("col_transformers", col_transforms),
        ("xgb", XGBClassifier(
            min_child_weight=1,
            gamma=0,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=1,
            gpu_id=0,
            tree_method='gpu_hist',
        )),
    ])

    if params:
        p.set_params(**params)

    return p
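# A usage sketch for create_pipeline(). The <step>__<param> keys follow the
# standard sklearn set_params convention for nested estimators; the concrete
# values and the X_train/y_train/X_test variables are illustrative assumptions.
pipe = create_pipeline(params={
    "xgb__n_estimators": 200,
    "col_transformers__num__drop_quasi_constant__tol": 0.99,
})
pipe.fit(X_train, y_train)
probs = pipe.predict_proba(X_test)[:, 1]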
def test_non_fitted_error(df_constant_features):
    # test case 8: when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        transformer = DropConstantFeatures()
        transformer.transform(df_constant_features)
def test_tol_init_param(tol):
    sel = DropConstantFeatures(tol=tol)
    assert sel.tol == tol
def test_error_if_missing_values_not_permitted():
    # test case: missing_values takes a value that is not permitted
    with pytest.raises(ValueError):
        DropConstantFeatures(missing_values="hola")
def test_error_if_tol_is_string():
    # test case 5: tol is a string, not a number between 0 and 1
    with pytest.raises(ValueError):
        DropConstantFeatures(tol="hola")
def test_error_if_tol_out_of_range():
    # test case 5: threshold not between 0 and 1
    with pytest.raises(ValueError):
        DropConstantFeatures(tol=2)
def test_error_if_fit_input_not_df():
    # test case 4: input is not a dataframe
    with pytest.raises(TypeError):
        DropConstantFeatures().fit({"Name": ["Karthik"]})
def test_error_if_tol_value_not_allowed(tol):
    # test case 5: threshold not between 0 and 1
    with pytest.raises(ValueError):
        DropConstantFeatures(tol=tol)
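# NOTE: test_tol_init_param and test_error_if_tol_value_not_allowed above take
# tol as a function argument, so they are presumably driven by
# pytest.mark.parametrize decorators that this excerpt omits. A minimal sketch
# of how they might be wired up; the concrete values are assumptions.
import pytest

from feature_engine.selection import DropConstantFeatures


@pytest.mark.parametrize("tol", [0.5, 0.8, 1.0])  # assumed valid thresholds
def test_tol_init_param(tol):
    sel = DropConstantFeatures(tol=tol)
    assert sel.tol == tol


@pytest.mark.parametrize("tol", [-1, 2, 1.5, "hola"])  # assumed invalid values
def test_error_if_tol_value_not_allowed(tol):
    with pytest.raises(ValueError):
        DropConstantFeatures(tol=tol)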
from sklearn.linear_model import LogisticRegression

from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    DropFeatures,
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy",
                                threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
import pytest
from sklearn.ensemble import RandomForestClassifier

from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    DropFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        SelectByShuffling(RandomForestClassifier(random_state=1),
                          scoring="accuracy"),
        SelectBySingleFeaturePerformance(RandomForestClassifier(random_state=1),
                                         scoring="accuracy"),
        RecursiveFeatureAddition(RandomForestClassifier(random_state=1),
                                 scoring="accuracy"),
        RecursiveFeatureElimination(RandomForestClassifier(random_state=1),
                                    scoring="accuracy"),
        SelectByTargetMeanPerformance(scoring="r2_score", bins=3),
    ],
)
def test_all_transformers(Estimator):