def test_easy_preprocessor_cat_cols():
    """Check how EasyPreprocessor expands categorical columns.

    Builds a frame with three categorical columns (containing empty strings
    and NaNs) plus one continuous column, runs ``fit_transform``, and
    verifies both the resulting column count and the category sets that the
    inner one-hot encoder detected.
    """
    # Create dataframe with 1 numerical and 3 categorical features.
    cat_1_unique = ['a', 'b', 'c']
    cat_2_unique = ['D', 'E', 'F', 'G']
    cat_3_unique = [-2.0, -1.0]
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    data = pd.DataFrame({
        'cat_1': ['a', 'b', 'a', '', '', 'c', 'a', 'c', 'a', 'a'],
        'cat_2': ['D', 'D', 'E', np.nan, 'D', np.nan, 'E', 'F', 'F', 'G'],
        'cat_3': [-2., -1., -1., -2., -1., '', -1., -2., '', np.nan],
        'num': np.sin(range(10)),  # Valid continuous feature.
    })

    # Preprocess data, i.e. replace empty strings with NaNs, impute NaNs, and
    # encode categorical variables with OneHotEncoder.
    ep = EasyPreprocessor()
    data_t = ep.fit_transform(data)

    cat_all = [cat_1_unique, cat_2_unique, cat_3_unique]
    n_unique_cats = sum(len(cats) for cats in cat_all)

    # The number of features after preprocessing must be equal to
    # the number of unique categories within the dataframe + the number of
    # remaining valid features.
    assert data_t.shape[1] == n_unique_cats + 1

    cat_pipe = ep.ct_.named_transformers_['categorical']
    ohe = cat_pipe.named_steps['onehotencoder']

    # The category sets detected by OneHotEncoder inside EasyPreprocessor
    # must match the ones specified in cat_all.
    assert len(ohe.categories_) == len(cat_all)
    for cat_list, ohe_cat_list in zip(cat_all, ohe.categories_):
        assert set(cat_list) == set(ohe_cat_list)
def test_simple_preprocessor_imputed_features():
    """Check feature names for a categorical column that contains a NaN.

    Column ``A`` is forced to ``'categorical'`` through ``type_hints``; after
    fitting, the preprocessor must report one name per observed value plus
    the two ``A_imputed_*`` indicator names.
    """
    # Issue: 211
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    # NOTE(review): dtype=int combined with a NaN entry may behave
    # differently across pandas versions - confirm against the pinned pandas.
    data = pd.DataFrame({'A': [0, 1, 2, 1, np.nan]}, dtype=int)
    types = detect_types(data, type_hints={'A': 'categorical'})

    ep = EasyPreprocessor(types=types)
    ep.fit(data)

    expected_names = [
        'A_0', 'A_1', 'A_2',
        'A_imputed_False', 'A_imputed_True',
    ]
    assert ep.get_feature_names() == expected_names
def test_simple_preprocessor():
    """Fit and transform ``X_cat``, then fit on the iris data.

    NOTE(review): a second function with this same name is defined later in
    this file; at import time the later definition replaces this one.
    """
    cat_preprocessor = EasyPreprocessor()
    cat_preprocessor.fit(X_cat)
    transformed = cat_preprocessor.transform(X_cat)
    assert transformed.shape == (3, 7)  # FIXME should be 6?

    # Fitting on the iris data only has to complete without raising.
    iris_bunch = load_iris()
    iris_preprocessor = EasyPreprocessor()
    iris_preprocessor.fit(iris_bunch.data)
def test_boolean_and_nan(null_object):
    """Check handling of a boolean column that contains one null entry.

    Parameters
    ----------
    null_object
        The value placed in the last row as the "missing" marker.
        NOTE(review): presumably supplied by a pytest parametrization or
        fixture that is not visible here - confirm.
    """
    X = pd.DataFrame({'a': [True, False, True, False, null_object]})

    # The column must be detected as categorical.
    types = detect_types(X)
    assert types.categorical.a

    X_preprocessed = EasyPreprocessor().fit_transform(X)
    assert X_preprocessed.shape[1] == 4

    # The output must contain exactly the values 0 and 1. array_equal also
    # handles a unique-value array of a different length, yielding a plain
    # assertion failure instead of an error from comparing mismatched shapes.
    assert np.array_equal(np.unique(X_preprocessed), [0, 1])
def test_titanic_feature_names():
    """Compare feature names produced for the cleaned titanic data.

    NOTE(review): a second function with this same name is defined later in
    this file; at import time the later definition replaces this one.
    """
    here = os.path.dirname(__file__)
    csv_path = os.path.join(here, '../datasets/titanic.csv')
    passengers = pd.read_csv(csv_path)

    preprocessor = EasyPreprocessor()
    features = clean(passengers.drop('survived', axis=1))
    preprocessor.fit(features)

    expected_names = [
        # continuous columns
        'age_dabl_continuous', 'body_dabl_continuous', 'fare_dabl_continuous',
        # missing-value indicator columns
        'age_?_0.0', 'age_?_1.0',
        'body_?_0.0', 'body_?_1.0',
        # one-hot encoded categorical columns
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13',
        'boat_13 15', 'boat_13 15 B', 'boat_14', 'boat_15',
        'boat_15 16', 'boat_16', 'boat_2', 'boat_3', 'boat_4',
        'boat_5', 'boat_5 7', 'boat_5 9', 'boat_6', 'boat_7',
        'boat_8', 'boat_8 10', 'boat_9', 'boat_?', 'boat_A',
        'boat_B', 'boat_C', 'boat_C D', 'boat_D',
    ]
    assert preprocessor.get_feature_names() == expected_names
def test_titanic_feature_names():
    """Compare feature names for cleaned titanic data; check NaN-free output.

    The expected names are first tried with float-formatted indicator
    suffixes and, if that comparison fails, with integer-formatted ones.
    Afterwards the raw (uncleaned) frame is transformed and the result is
    checked to contain no NaN.
    """
    here = os.path.dirname(__file__)
    csv_path = os.path.join(here, '../datasets/titanic.csv')
    passengers = pd.read_csv(csv_path)

    preprocessor = EasyPreprocessor()
    cleaned_features = clean(passengers.drop('survived', axis=1))
    preprocessor.fit(cleaned_features)

    expected_names = [
        'sibsp', 'parch',
        'age_dabl_continuous', 'fare_dabl_continuous', 'body_dabl_continuous',
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
        'sibsp_8',
        'parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5',
        'parch_6', 'parch_9',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13',
        'boat_13 15', 'boat_13 15 B', 'boat_14', 'boat_15',
        'boat_15 16', 'boat_16', 'boat_2', 'boat_3', 'boat_4',
        'boat_5', 'boat_5 7', 'boat_5 9', 'boat_6', 'boat_7',
        'boat_8', 'boat_8 10', 'boat_9', 'boat_?', 'boat_A',
        'boat_B', 'boat_C', 'boat_C D', 'boat_D',
        # positions 57-60: the missing-value indicator names
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
    ]

    try:
        assert preprocessor.get_feature_names() == expected_names
    except AssertionError:
        # OHE uses int in newer versions
        expected_names[57:61] = [
            'age_?_0', 'age_?_1', 'body_?_0', 'body_?_1',
        ]
        assert preprocessor.get_feature_names() == expected_names

    # without clean
    raw_transformed = preprocessor.fit_transform(
        passengers.drop('survived', axis=1))
    # FIXME can't do that yet
    # assert ep.get_feature_names() == expected_names_no_clean
    nan_mask = np.isnan(raw_transformed)
    assert not nan_mask.any()
def test_simple_preprocessor():
    """Check the transformed shape of ``X_cat`` and fit on the iris data.

    NOTE(review): an earlier function with this same name exists in this
    file; this later definition replaces it at import time.
    """
    cat_preprocessor = EasyPreprocessor()
    transformed = cat_preprocessor.fit_transform(X_cat)
    assert transformed.shape == (3, 6)

    # Fitting on the iris data only has to complete without raising.
    iris_bunch = load_iris()
    iris_preprocessor = EasyPreprocessor()
    iris_preprocessor.fit(iris_bunch.data)
def test_easy_preprocessor_transform():
    """Run EasyPreprocessor inside a pipeline on a titanic train/val split.

    The test has no assertions: it passes when fitting on the training part
    and predicting on both parts completes without raising.
    """
    raw_titanic = load_titanic()
    cleaned = clean(raw_titanic)
    features = cleaned.drop("survived", axis=1)
    target = cleaned.survived

    X_train, X_val, y_train, _y_val = train_test_split(
        features, target, stratify=target, random_state=42)

    model = make_pipeline(EasyPreprocessor(), LogisticRegression(C=0.1))
    model.fit(X_train, y_train)

    # Predict on data seen during fit, then on the held-out part.
    for subset in (X_train, X_val):
        model.predict(subset)
def test_simple_preprocessor_dirty_float():
    """Check the columns EasyPreprocessor produces for a dirty float column.

    The data from ``make_dirty_float`` must transform to a (100, 3) result
    whose second and third columns sum to 1 and 9 respectively. A frame
    holding only clean numeric strings must then transform without raising.
    """
    dirty_frame = pd.DataFrame(make_dirty_float())

    preprocessor = EasyPreprocessor()
    preprocessor.fit(dirty_frame)
    transformed = preprocessor.transform(dirty_frame)
    assert transformed.shape == (100, 3)

    # Summing over axis 0 gives one total per output column.
    column_totals = transformed.sum(axis=0)
    # count of "garbage"
    assert column_totals[1] == 1
    # count of "missing"
    assert column_totals[2] == 9

    # make sure we can transform a clean column
    clean_frame = pd.DataFrame(['0', '1', '2'], columns=['a_column'])
    preprocessor.transform(clean_frame)