def testNumeric(self):
    """Check that numeric columns are mapped only onto the fitted bin values.

    A ``LastochkaTransformer`` is fitted as the first step of a pipeline
    (followed by ``LogisticRegression``) on a synthetic classification
    data set.  For every column, the set of distinct values produced by
    ``transform`` on the train and on the test split must equal the set
    of values stored under ``bin_stats["woe_value"]`` of that column's
    optimizer.
    """
    n_samples = 200
    n_features = 10

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        random_state=42,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.3, random_state=42
    )

    column_names = ['X%i' % i for i in range(n_features)]
    D_train = pd.DataFrame(X_train, columns=column_names)
    D_test = pd.DataFrame(X_test, columns=column_names)

    lastochka = LastochkaTransformer(verbose=True, n_final=3, n_initial=10)
    log = LogisticRegression()
    pipe = Pipeline(steps=[
        ('lastochka', lastochka),
        ('log', log),
    ])

    # ``lastochka`` is fitted only through the pipeline; the same instance
    # is then used directly to transform both splits.
    pipe.fit(D_train, y_train)
    transformed_train = lastochka.transform(D_train)
    transformed_test = lastochka.transform(D_test)

    for variable in column_names:
        vt = lastochka.get_transformer(variable)
        acceptable_values = set(vt.optimizer_instance.bin_stats["woe_value"])

        # assertSetEqual reports which values are missing/unexpected,
        # unlike assertTrue(a == b) which only says "False is not true".
        self.assertSetEqual(
            acceptable_values, set(transformed_train[variable].unique())
        )
        # NOTE(review): strict equality on the test split requires every
        # fitted bin to be hit by at least one test row -- confirm this is
        # intended rather than a subset check.
        self.assertSetEqual(
            acceptable_values, set(transformed_test[variable].unique())
        )
def testSpecial(self):
    """Check that configured special values are recorded for the column.

    The transformer is fitted with ``specials={"PTRATIO": [20.2, 14.7]}``
    against a binary target (target >= its median).  After ``fit`` and
    ``transform`` the keys of ``specials_stats`` on the ``PTRATIO``
    transformer must equal the configured list, in the same order.
    """
    # NOTE(review): presumably sklearn.datasets.load_boston, which was
    # removed in scikit-learn 1.2 -- confirm the pinned version.
    _boston = load_boston()
    X = pd.DataFrame(_boston["data"], columns=_boston["feature_names"])
    y = (_boston["target"] >= np.median(_boston["target"])).astype(int)

    specs = {"PTRATIO": [20.2, 14.7]}
    lastochka = LastochkaTransformer(
        verbose=True, n_final=3, n_initial=10, specials=specs
    )
    lastochka.fit(X, y)
    lastochka.transform(X)

    recorded_specials = list(
        lastochka.get_transformer("PTRATIO").specials_stats.keys()
    )
    # assertListEqual gives an element-wise diff on failure instead of the
    # opaque "False is not true" from assertTrue(a == b).
    self.assertListEqual(recorded_specials, specs["PTRATIO"])
def testMissing(self):
    """Check that a column containing NaNs gets a missing-value entry.

    Values of the ``ZN`` column are replaced with NaN at randomly drawn
    row labels (seeded for reproducibility).  After ``fit`` and
    ``transform`` the ``ZN`` transformer's ``missing_woe_value`` must not
    be ``None``.
    """
    # NOTE(review): presumably sklearn.datasets.load_boston, which was
    # removed in scikit-learn 1.2 -- confirm the pinned version.
    _boston = load_boston()
    X = pd.DataFrame(_boston["data"], columns=_boston["feature_names"])

    np.random.seed(2)
    # np.random.choice samples with replacement by default, so the 200
    # draws may blank out fewer than 200 distinct rows.
    indexes = np.random.choice(X.index.tolist(), 200)
    X.loc[indexes, "ZN"] = np.nan

    y = (_boston["target"] >= np.median(_boston["target"])).astype(int)

    lastochka = LastochkaTransformer(verbose=True, n_final=3, n_initial=10)
    lastochka.fit(X, y)
    lastochka.transform(X)

    # assertIsNotNone states the intent and yields a clearer failure
    # message than assertTrue(x is not None).
    self.assertIsNotNone(lastochka.get_transformer("ZN").missing_woe_value)
def testCategory(self):
    """Check that a string-typed column is handled by CategoryOptimizer.

    ``RAD`` is replaced by its string representation under the name
    ``RAD_CAT``; after ``fit`` and ``transform`` the optimizer attached to
    the ``RAD_CAT`` transformer must be a ``CategoryOptimizer`` instance.
    """
    boston = load_boston()

    # Binary target: 1 where the target is at or above its median.
    target = boston["target"]
    labels = (target >= np.median(target)).astype(int)

    # Swap the numeric RAD column for a string-valued copy.
    frame = pd.DataFrame(boston["data"], columns=boston["feature_names"])
    frame["RAD_CAT"] = frame["RAD"].astype(str)
    frame = frame.drop("RAD", axis=1)

    transformer = LastochkaTransformer(verbose=True, n_final=3, n_initial=10)
    transformer.fit(frame, labels)
    transformer.transform(frame)

    self.assertIsInstance(
        transformer.get_transformer("RAD_CAT").optimizer_instance,
        CategoryOptimizer,
    )