def test_simple_imputer_fill_value(data_type):
    """Constant-strategy imputation must replace missing entries with ``fill_value``.

    The check is run twice: once through ``fit`` followed by ``transform`` and
    once through ``fit_transform``. ``data_type == "numeric"`` selects a numeric
    fixture; any other value selects the categorical/object fixture.
    """
    if data_type == "numeric":
        fill_value = -1
        raw_columns = {
            "some numeric": [np.nan, 1, 0],
            "another numeric": [0, np.nan, 2],
        }
        filled_columns = {
            "some numeric": [-1, 1, 0],
            "another numeric": [0, -1, 2],
        }
    else:
        fill_value = "fill"
        raw_columns = {
            "categorical with nan": pd.Series([np.nan, "1", np.nan, "0", "3"], dtype='category'),
            "object with nan": ["b", "b", np.nan, "c", np.nan],
        }
        filled_columns = {
            "categorical with nan": pd.Series(["fill", "1", "fill", "0", "3"], dtype='category'),
            "object with nan": pd.Series(["b", "b", "fill", "c", "fill"], dtype='category'),
        }

    X = pd.DataFrame(raw_columns)
    expected = pd.DataFrame(filled_columns)
    y = pd.Series([0, 0, 1, 0, 1])

    # Path 1: separate fit and transform calls.
    two_step = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    two_step.fit(X, y)
    two_step_result = two_step.transform(X, y)
    assert_frame_equal(expected, two_step_result.to_dataframe(), check_dtype=False)

    # Path 2: a fresh imputer using the combined fit_transform call.
    one_step = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    one_step_result = one_step.fit_transform(X, y)
    assert_frame_equal(expected, one_step_result.to_dataframe(), check_dtype=False)
def test_simple_imputer_boolean_dtype(data_type, make_data_type):
    """A nullable-boolean column keeps the ``'boolean'`` dtype after imputation.

    The two missing entries are expected to come back as ``True``.
    """
    y = pd.Series([1, 0, 0, 1, 0])
    expected = pd.DataFrame([True, True, False, True, True], dtype='boolean')

    raw = pd.DataFrame([True, np.nan, False, np.nan, True], dtype='boolean')
    X = make_data_type(data_type, raw)

    imputer = SimpleImputer()
    imputer.fit(X, y)
    imputed = imputer.transform(X)

    assert_frame_equal(expected, imputed.to_dataframe())
def test_simple_imputer_transform_drop_all_nan_columns_empty():
    """An input made only of NaN columns yields an empty frame and is not mutated.

    Verified for both ``fit_transform`` and ``fit`` followed by ``transform``.
    """
    def all_nan_frame():
        # Build a fresh single-row, three-column frame of NaNs on every call.
        return pd.DataFrame([[np.nan, np.nan, np.nan]])

    X = all_nan_frame()

    # Combined fit_transform path.
    one_step = SimpleImputer(impute_strategy='most_frequent')
    assert one_step.fit_transform(X).to_dataframe().empty
    assert_frame_equal(X, all_nan_frame())

    # Separate fit / transform path with a new imputer.
    two_step = SimpleImputer(impute_strategy='most_frequent')
    two_step.fit(X)
    assert two_step.transform(X).to_dataframe().empty
    assert_frame_equal(X, all_nan_frame())
def test_simple_imputer_all_bool_return_original(data_type, make_data_type):
    """A fully populated ``bool`` column must come back unchanged from the imputer."""
    X = pd.DataFrame([True, True, False, True, True], dtype=bool)
    y = pd.Series([1, 0, 0, 1, 0])
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype=bool)

    imputer = SimpleImputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)

    # The other tests in this file convert the transform output with
    # ``to_dataframe()`` before comparing; ``assert_frame_equal`` rejects
    # anything that is not a pandas DataFrame.
    assert_frame_equal(X_expected_arr, X_t.to_dataframe())
def test_simple_imputer_bool_dtype_object(data_type):
    """Boolean values stored in an ``object`` column are imputed and returned as ``category``.

    When ``data_type == 'ww'`` the input is wrapped in a ``ww.DataTable``;
    otherwise the pandas frame is passed through directly.
    """
    X = pd.DataFrame([True, np.nan, False, np.nan, True], dtype=object)
    y = pd.Series([1, 0, 0, 1, 0])
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype='category')
    if data_type == 'ww':
        X = ww.DataTable(X)

    imputer = SimpleImputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)

    # The other tests in this file convert the transform output with
    # ``to_dataframe()`` before comparing; ``assert_frame_equal`` rejects
    # anything that is not a pandas DataFrame.
    assert_frame_equal(X_expected_arr, X_t.to_dataframe())
def test_simple_imputer_with_none():
    """``None`` entries are imputed like missing values; all-``None`` columns are dropped.

    First part: mean imputation on numeric columns. Second part: a
    default-configured imputer on category / boolean / object columns.
    """
    # One target value per row of X (both fixtures below have four rows).
    y = pd.Series([0, 0, 1, 0])

    X = pd.DataFrame({
        "int with None": [1, 0, 5, None],
        "float with None": [0.1, 0.0, 0.5, None],
        "all None": [None, None, None, None]
    })
    imputer = SimpleImputer(impute_strategy="mean")
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    # "all None" is absent from the expected output.
    expected = pd.DataFrame({
        "int with None": [1, 0, 5, 2],
        "float with None": [0.1, 0.0, 0.5, 0.2]
    })
    assert_frame_equal(expected, transformed.to_dataframe(), check_dtype=False)

    X = pd.DataFrame({
        "category with None": pd.Series(["b", "a", "a", None], dtype='category'),
        "boolean with None": pd.Series([True, None, False, True], dtype='boolean'),
        "object with None": ["b", "a", "a", None],
        "all None": [None, None, None, None]
    })
    imputer = SimpleImputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "category with None": pd.Series(["b", "a", "a", "a"], dtype='category'),
        "boolean with None": pd.Series([True, True, False, True], dtype='boolean'),
        "object with None": pd.Series(["b", "a", "a", "a"], dtype='category')
    })
    assert_frame_equal(expected, transformed.to_dataframe(), check_dtype=False)
def test_simple_imputer_does_not_reset_index():
    """Imputation must keep a non-default row index (1..9) on the output."""
    X = pd.DataFrame({'input_val': np.arange(10), 'target': np.arange(10)})
    X.loc[5, 'input_val'] = np.nan
    assert X.index.tolist() == list(range(10))

    # Remove the first row so the index no longer starts at zero.
    X = X.drop(0)
    y = X.pop('target')
    shifted_index = list(range(1, 10))

    before = pd.DataFrame(
        {'input_val': [1.0, 2, 3, 4, np.nan, 6, 7, 8, 9]},
        dtype=float,
        index=shifted_index,
    )
    pd.testing.assert_frame_equal(before, X)

    imputer = SimpleImputer(impute_strategy="mean")
    imputer.fit(X, y=y)
    transformed = imputer.transform(X)

    after = pd.DataFrame(
        {'input_val': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
        dtype=float,
        index=shifted_index,
    )
    pd.testing.assert_frame_equal(after, transformed.to_dataframe())
def test_simple_imputer_multitype_with_one_bool(data_type, make_data_type):
    """Impute a frame holding a nullable-boolean column next to a plain ``bool`` column.

    Both columns are expected back with the ``'boolean'`` dtype, with the
    missing entries filled with ``False``.
    """
    # NOTE(review): a later function in this file reuses this exact name, so
    # that definition replaces this one at import time - consider renaming one.
    y = pd.Series([1, 0, 0, 1, 0])
    expected = pd.DataFrame({
        "bool with nan": pd.Series([True, False, False, False, False], dtype='boolean'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype='boolean'),
    })

    raw = pd.DataFrame({
        "bool with nan": pd.Series([True, np.nan, False, np.nan, False], dtype='boolean'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
    })
    X_multi = make_data_type(data_type, raw)

    imputer = SimpleImputer()
    imputer.fit(X_multi, y)
    imputed = imputer.transform(X_multi)

    assert_frame_equal(expected, imputed.to_dataframe())
def test_simple_imputer_multitype_with_one_bool(data_type):
    """Impute a frame holding an ``object`` column of booleans next to a plain ``bool`` column.

    When ``data_type == 'ww'`` the input is wrapped in a ``ww.DataTable``;
    otherwise the pandas frame is passed through directly.
    """
    # NOTE(review): an earlier function in this file has this exact name; this
    # definition replaces it at import time, so only one of the two is
    # collected - consider renaming one.
    X_multi = pd.DataFrame({
        "bool with nan": pd.Series([True, np.nan, False, np.nan, False], dtype=object),
        "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
    })
    y = pd.Series([1, 0, 0, 1, 0])
    X_multi_expected_arr = pd.DataFrame({
        "bool with nan": pd.Series([True, False, False, False, False], dtype='category'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype=object),
    })
    if data_type == 'ww':
        X_multi = ww.DataTable(X_multi)

    imputer = SimpleImputer()
    imputer.fit(X_multi, y)
    X_multi_t = imputer.transform(X_multi)

    # The other tests in this file convert the transform output with
    # ``to_dataframe()`` before comparing; ``assert_frame_equal`` rejects
    # anything that is not a pandas DataFrame.
    assert_frame_equal(X_multi_expected_arr, X_multi_t.to_dataframe())
def test_simple_imputer_transform_drop_all_nan_columns():
    """``transform`` omits an all-NaN column from its output and leaves the input intact."""
    source_columns = {
        "all_nan": [np.nan, np.nan, np.nan],
        "some_nan": [np.nan, 1, 0],
        "another_col": [0, 1, 2],
    }
    X = pd.DataFrame(source_columns)

    transformer = SimpleImputer(impute_strategy='most_frequent')
    transformer.fit(X)
    imputed = transformer.transform(X).to_dataframe()

    # "all_nan" is absent from the expected output.
    expected = pd.DataFrame({
        "some_nan": [0, 1, 0],
        "another_col": [0, 1, 2],
    })
    assert_frame_equal(expected, imputed, check_dtype=False)

    # The caller's frame must still match a freshly built copy of the input.
    assert_frame_equal(X, pd.DataFrame(source_columns))
def test_simple_imputer_woodwork_custom_overrides_returned_by_components(
        X_df, has_nan, impute_strategy):
    """Check which logical type the imputer reports for a column with an overridden type.

    Each candidate logical type is applied to column ``0``. The output must be
    a ``ww.DataTable`` whose logical type equals the override when the
    strategy is ``"most_frequent"`` or the data has no NaN, and ``Double``
    otherwise.
    """
    y = pd.Series([1, 2, 1])
    if has_nan:
        # Blank out the last row of the first column.
        X_df.iloc[len(X_df) - 1, 0] = np.nan

    for logical_type in (Integer, Double, Categorical, NaturalLanguage, Boolean):
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            # Skip overrides that raise TypeError when the table is built.
            continue

        # NaturalLanguage and Categorical columns always use "most_frequent".
        if logical_type in [NaturalLanguage, Categorical]:
            strategy = "most_frequent"
        else:
            strategy = impute_strategy

        imputer = SimpleImputer(impute_strategy=strategy)
        imputer.fit(X, y)
        transformed = imputer.transform(X, y)
        assert isinstance(transformed, ww.DataTable)

        keeps_override = strategy == "most_frequent" or not has_nan
        expected_type = logical_type if keeps_override else Double
        assert transformed.logical_types == {0: expected_type}