def test_feature_tools_transformer(self):
    """Smoke-test fit/transform with the numeric add/divide primitives.

    Fits a ``FeatureGenerationTransformer`` on the training part of an
    80/20 split of the first 100 rows returned by ``dsutils.load_bank()``
    and checks that ``transform`` returns something other than ``None``.
    """
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    # Remove the 'y' column from the inputs; its values are not used here.
    df.pop('y')
    # Only the training split is exercised by this test.
    X_train, _ = train_test_split(df.head(100), test_size=0.2, random_state=42)

    ftt = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric'],
    )
    ftt.fit(X_train)
    x_t = ftt.transform(X_train)

    assert x_t is not None
def test_datetime_derivation(self):
    """Check that year/month/week primitives derive columns from a datetime.

    A single-row frame with one datetime column ``x1`` is fitted and
    transformed; the output must contain ``YEAR(x1)``, ``MONTH(x1)`` and
    ``WEEK(x1)``.
    """
    frame = pd.DataFrame(data={"x1": [datetime.now()]})
    generator = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=["year", "month", "week"],
    )
    generator.fit(frame)
    x_t = generator.transform(frame)

    # One derived column is expected per requested primitive.
    for derived_name in ("YEAR(x1)", "MONTH(x1)", "WEEK(x1)"):
        assert derived_name in x_t
def test_feature_generation_with_selection(self):
    """Check feature generation combined with built-in feature selection.

    With ``feature_selection_args`` set, ``fit`` without ``y`` must raise
    an ``AssertionError`` carrying the expected message; ``fit`` with ``y``
    must succeed, and the transformed frame must have 35 columns.
    """
    df = dsutils.load_bank().head(1000)
    df.drop(['id'], axis=1, inplace=True)
    y = df.pop('y')
    cross_cat = CrossCategorical()
    ftt = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric', cross_cat],
        feature_selection_args={'ratio_select_cols': 0.2},
    )

    with pytest.raises(AssertionError) as err:
        ftt.fit(df)
    # ``err.value`` is the exception instance, so it never equals a str.
    # Inspect its text, and do it after the ``with`` block: statements
    # placed after the raising call inside the block are never executed.
    # A substring check tolerates any extra detail appended to the message.
    assert '`y` must be provided for feature selection.' in str(err.value)

    ftt.fit(df, y)
    x_t = ftt.transform(df)
    assert x_t.shape[1] == 35
def test_feature_selection(self):
    """Check standalone feature selection on generated features.

    Features are generated without selection, then a
    ``FeatureSelectionTransformer`` (ratio 0.2, reserving the generator's
    original columns) is fitted on them. The test pins the number of
    scored features (99) and of selected columns (35).
    """
    bank = dsutils.load_bank().head(1000)
    bank.drop(['id'], axis=1, inplace=True)
    target = bank.pop('y')

    # Step 1: generate candidate features; no selection at this stage.
    generator = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric', CrossCategorical()],
    )
    generator.fit(bank)
    generated = generator.transform(bank)

    # Step 2: select among the generated features.
    selector = FeatureSelectionTransformer(
        'classification',
        ratio_select_cols=0.2,
        reserved_cols=generator.original_cols,
    )
    selector.fit(generated, target)

    assert len(selector.scores_.items()) == 99
    assert len(selector.columns_) == 35

    selected = selector.transform(generated)
    assert selected.shape[1] == 35
def test_fix_input(self, fix_input: bool):
    """Check how the ``fix_input`` flag affects missing values.

    The input has a missing value in the first row of ``x1``. The sum and
    ratio features must always be produced. When ``fix_input`` is ``True``
    neither ``x1`` nor ``x1 / x2`` may be NaN in that row; otherwise both
    must be NaN.
    """
    frame = pd.DataFrame(data={"x1": [None, 2, 3], 'x2': [4, 5, 6]})
    generator = FeatureGenerationTransformer(
        task='classification',
        trans_primitives=['add_numeric', 'divide_numeric'],
        fix_input=fix_input,
    )
    generator.fit(frame)
    x_t = generator.transform(frame)

    assert "x1 + x2" in x_t
    assert "x1 / x2" in x_t

    # With fix_input enabled there should be no NaN in either the input
    # column or the derived one; without it, the NaN in x1 carries over
    # to the feature derived from it.
    expect_nan = fix_input is not True
    assert math.isnan(x_t["x1"][0]) == expect_nan
    assert math.isnan(x_t["x1 / x2"][0]) == expect_nan
def test_feature_tools_categorical_cross(self):
    """Check the columns produced by the ``CrossCategorical`` primitive.

    Fits and transforms the training part of an 80/20 split of the first
    100 rows returned by ``dsutils.load_bank()`` (with ``id`` dropped and
    ``y`` kept). Every output column must be one of the expected names
    below; the assertion message lists any column that is not.
    """
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    cross_cat = CrossCategorical()
    # Only the training split is exercised by this test.
    X_train, _ = train_test_split(df.head(100), test_size=0.2, random_state=42)

    ftt = FeatureGenerationTransformer(task='classification', trans_primitives=[cross_cat])
    ftt.fit(X_train)
    x_t = ftt.transform(X_train)

    expected_columns = {
        'job', 'marital', 'education', 'default', 'housing', 'loan',
        'contact', 'month', 'poutcome', 'y', 'age', 'balance', 'day',
        'duration', 'campaign', 'pdays', 'previous',
        'contact__marital', 'job__poutcome', 'contact__default',
        'housing__month', 'housing__marital', 'loan__y', 'housing__job',
        'loan__poutcome', 'month__poutcome', 'default__month',
        'default__education', 'education__loan', 'education__housing',
        'housing__loan', 'housing__poutcome', 'contact__housing',
        'contact__loan', 'marital__y', 'contact__job',
        'education__poutcome', 'default__marital', 'job__month', 'job__y',
        'default__loan', 'education__marital', 'default__poutcome',
        'default__y', 'contact__month', 'education__month',
        'contact__education', 'contact__poutcome', 'job__marital',
        'education__job', 'job__loan', 'contact__y', 'month__y',
        'default__housing', 'default__job', 'poutcome__y', 'loan__marital',
        'education__y', 'loan__month', 'marital__month', 'housing__y',
        'marital__poutcome',
    }

    # Same check as before (no output column outside the expected set),
    # but a failure now names the offending columns.
    unexpected = set(x_t.columns.to_list()) - expected_columns
    assert len(unexpected) == 0, f'unexpected columns: {sorted(unexpected)}'