def test_simple_imputer_fill_value(data_type):
    """Check that the "constant" strategy replaces missing entries with ``fill_value``.

    The same expectation is verified through two code paths: ``fit`` followed
    by ``transform``, and the combined ``fit_transform``.

    Args:
        data_type: ``"numeric"`` selects the numeric fixture (filled with -1);
            any other value selects the categorical/object fixture (filled
            with the string "fill").
            NOTE(review): presumably supplied by a parametrize decorator or a
            fixture that is not visible in this section - confirm.
    """
    if data_type == "numeric":
        fill_value = -1
        X = pd.DataFrame({
            "some numeric": [np.nan, 1, 0],
            "another numeric": [0, np.nan, 2],
        })
        expected = pd.DataFrame({
            "some numeric": [-1, 1, 0],
            "another numeric": [0, -1, 2],
        })
    else:
        fill_value = "fill"
        X = pd.DataFrame({
            "categorical with nan": pd.Series(
                [np.nan, "1", np.nan, "0", "3"], dtype="category"
            ),
            "object with nan": ["b", "b", np.nan, "c", np.nan],
        })
        # Both columns are expected back as categoricals, including the one
        # that was supplied with object dtype.
        expected = pd.DataFrame({
            "categorical with nan": pd.Series(
                ["fill", "1", "fill", "0", "3"], dtype="category"
            ),
            "object with nan": pd.Series(
                ["b", "b", "fill", "c", "fill"], dtype="category"
            ),
        })

    # NOTE(review): y has five entries while the numeric X has only three
    # rows; this looks like it relies on the imputer not validating y against
    # X - confirm.
    y = pd.Series([0, 0, 1, 0, 1])

    # Path 1: separate fit and transform calls.
    two_step = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    two_step.fit(X, y)
    two_step_result = two_step.transform(X, y)
    assert_frame_equal(expected, two_step_result.to_dataframe(), check_dtype=False)

    # Path 2: a fresh imputer using fit_transform.
    one_step = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
    one_step_result = one_step.fit_transform(X, y)
    assert_frame_equal(expected, one_step_result.to_dataframe(), check_dtype=False)
def test_simple_imputer_mean():
    """Check that the "mean" strategy fills NaNs with the column mean.

    In the fixture, column 0 holds (NaN, 1, 1) and column 3 holds (NaN, 2, 0),
    so both missing cells are expected to become 1.
    """
    features = pd.DataFrame([
        [np.nan, 0, 1, np.nan],
        [1, 2, 3, 2],
        [1, 2, 3, 0],
    ])
    expected = pd.DataFrame([
        [1, 0, 1, 1],
        [1, 2, 3, 2],
        [1, 2, 3, 0],
    ])

    imputer = SimpleImputer(impute_strategy="mean")
    imputed = imputer.fit_transform(features)

    assert_frame_equal(expected, imputed.to_dataframe(), check_dtype=False)
def test_simple_imputer_median():
    """Check that the "median" strategy fills NaNs with the column median.

    Medians over the non-missing values of the fixture:
    column 0 -> 8 (from 1, 6, 10, 10), column 2 -> 4 (from 1, 3, 5, 7),
    column 3 -> 2 (from 0, 2, 2). Column 1 has no missing values.
    """
    X = pd.DataFrame([
        [np.nan, 0, 1, np.nan],
        [1, 2, 3, 2],
        [10, 2, np.nan, 2],
        [10, 2, 5, np.nan],
        [6, 2, 7, 0],
    ])
    transformer = SimpleImputer(impute_strategy='median')
    X_expected_arr = pd.DataFrame([
        [8, 0, 1, 2],
        [1, 2, 3, 2],
        [10, 2, 4, 2],
        [10, 2, 5, 2],
        [6, 2, 7, 0],
    ])
    X_t = transformer.fit_transform(X)
    # Convert the imputer output to a pandas DataFrame before comparing, as the
    # other pandas-based tests in this file do: assert_frame_equal requires a
    # DataFrame on both sides and would fail on the raw return value.
    assert_frame_equal(X_expected_arr, X_t.to_dataframe(), check_dtype=False)
def test_simple_imputer_most_frequent():
    """Check the "most_frequent" strategy on mixed string/numeric columns.

    Column 0 mixes strings with a NaN; columns 2 and 3 are numeric with one
    NaN each. The expected frame encodes the fill values the test requires
    ("a" for column 0, 1 for column 2, 0 for column 3).
    """
    features = pd.DataFrame([
        [np.nan, 0, 1, np.nan],
        ["a", 2, np.nan, 3],
        ["b", 2, 1, 0],
    ])
    # The string column is compared as a categorical.
    expected = pd.DataFrame([
        ["a", 0, 1, 0],
        ["a", 2, 1, 3],
        ["b", 2, 1, 0],
    ]).astype({0: "category"})

    imputer = SimpleImputer(impute_strategy="most_frequent")
    imputed = imputer.fit_transform(features)

    assert_frame_equal(expected, imputed.to_dataframe(), check_dtype=False)
def test_simple_imputer_transform_drop_all_nan_columns_empty():
    """Check that an input made only of NaN columns yields an empty result.

    Verified for both ``fit_transform`` and ``fit`` + ``transform``; after each
    call the input frame is compared against a freshly built copy to confirm
    it was not modified.
    """

    def all_nan_frame():
        # Single row, three columns, every cell missing.
        return pd.DataFrame([[np.nan, np.nan, np.nan]])

    X = all_nan_frame()

    # Combined fit_transform path.
    one_step = SimpleImputer(impute_strategy="most_frequent")
    one_step_result = one_step.fit_transform(X)
    assert one_step_result.to_dataframe().empty
    assert_frame_equal(X, all_nan_frame())

    # Separate fit / transform path with a fresh imputer.
    two_step = SimpleImputer(impute_strategy="most_frequent")
    two_step.fit(X)
    two_step_result = two_step.transform(X)
    assert two_step_result.to_dataframe().empty
    assert_frame_equal(X, all_nan_frame())
def test_simple_imputer_numpy_input():
    """Check the "mean" strategy when the input is a NumPy array.

    The first column of the fixture is entirely NaN and the expected output
    has only three columns, i.e. the test expects that column to be absent
    from the result. The input array is also checked to be unchanged.
    """
    X = np.array([
        [np.nan, 0, 1, np.nan],
        [np.nan, 2, 3, 2],
        [np.nan, 2, 3, 0],
    ])
    expected = np.array([
        [0, 1, 1],
        [2, 3, 2],
        [2, 3, 0],
    ])

    imputer = SimpleImputer(impute_strategy="mean")
    imputed = imputer.fit_transform(X)
    # NOTE(review): the other tests in this file call .to_dataframe() on the
    # imputer output before comparing; confirm that the value returned for
    # ndarray input can be consumed by np.allclose directly.
    assert np.allclose(expected, imputed)

    # The caller's array must not have been modified in place.
    untouched = np.array([
        [np.nan, 0, 1, np.nan],
        [np.nan, 2, 3, 2],
        [np.nan, 2, 3, 0],
    ])
    np.testing.assert_almost_equal(X, untouched)
def test_simple_imputer_constant():
    """Check the "constant" strategy with an explicit numeric ``fill_value``.

    Every NaN, including the one in the string column, is expected to be
    replaced by the supplied value 3.
    """
    features = pd.DataFrame([
        [np.nan, 0, 1, np.nan],
        ["a", 2, np.nan, 3],
        ["b", 2, 3, 0],
    ])
    # Column 0 mixes the fill value with strings and is compared as a
    # categorical.
    expected = pd.DataFrame([
        [3, 0, 1, 3],
        ["a", 2, 3, 3],
        ["b", 2, 3, 0],
    ]).astype({0: "category"})

    imputer = SimpleImputer(impute_strategy="constant", fill_value=3)
    imputed = imputer.fit_transform(features)

    assert_frame_equal(expected, imputed.to_dataframe(), check_dtype=False)
def test_simple_imputer_fit_transform_drop_all_nan_columns():
    """Check that ``fit_transform`` omits a column that is entirely NaN.

    The expected frame contains only "some_nan" (with its NaN filled) and
    "another_col"; the "all_nan" column is absent. The input frame is then
    compared against a freshly built copy to confirm it was not modified.
    """

    def build_input():
        # Fresh copy of the fixture so the mutation check compares against
        # data the imputer never touched.
        return pd.DataFrame({
            "all_nan": [np.nan, np.nan, np.nan],
            "some_nan": [np.nan, 1, 0],
            "another_col": [0, 1, 2],
        })

    X = build_input()
    expected = pd.DataFrame({
        "some_nan": [0, 1, 0],
        "another_col": [0, 1, 2],
    })

    imputer = SimpleImputer(impute_strategy="most_frequent")
    imputed = imputer.fit_transform(X)

    assert_frame_equal(expected, imputed.to_dataframe(), check_dtype=False)
    assert_frame_equal(X, build_input())
def test_simple_imputer_col_with_non_numeric():
    """Check each strategy against a frame whose first column holds strings.

    * "mean" and "median" must raise ``ValueError`` from both ``fit_transform``
      and ``fit``.
    * "most_frequent" and "constant" must succeed and produce the expected
      frames, with the string column compared as a categorical.
    """
    # Column 0 contains strings plus one NaN; column 3 is numeric with one NaN.
    X = pd.DataFrame([
        ["a", 0, 1, np.nan],
        ["b", 2, 3, 3],
        ["a", 2, 3, 1],
        [np.nan, 2, 3, 0],
    ])

    # Numeric-only strategies: the same imputer instance is used for both the
    # fit_transform and the fit call, and both must be rejected.
    numeric_only_errors = (
        ("mean", "Cannot use mean strategy with non-numeric data"),
        ("median", "Cannot use median strategy with non-numeric data"),
    )
    for strategy, message in numeric_only_errors:
        imputer = SimpleImputer(impute_strategy=strategy)
        with pytest.raises(ValueError, match=message):
            imputer.fit_transform(X)
        with pytest.raises(ValueError, match=message):
            imputer.fit(X)

    # most_frequent: expected fills are "a" for column 0 and 0 for column 3.
    most_frequent_expected = pd.DataFrame([
        ["a", 0, 1, 0],
        ["b", 2, 3, 3],
        ["a", 2, 3, 1],
        ["a", 2, 3, 0],
    ]).astype({0: "category"})
    most_frequent_imputer = SimpleImputer(impute_strategy="most_frequent")
    most_frequent_result = most_frequent_imputer.fit_transform(X)
    assert_frame_equal(
        most_frequent_expected, most_frequent_result.to_dataframe(), check_dtype=False
    )

    # constant: every NaN becomes the supplied fill value 2.
    constant_expected = pd.DataFrame([
        ["a", 0, 1, 2],
        ["b", 2, 3, 3],
        ["a", 2, 3, 1],
        [2, 2, 3, 0],
    ]).astype({0: "category"})
    constant_imputer = SimpleImputer(impute_strategy="constant", fill_value=2)
    constant_result = constant_imputer.fit_transform(X)
    assert_frame_equal(
        constant_expected, constant_result.to_dataframe(), check_dtype=False
    )