def test_non_numeric_errors(non_numeric_df):
    """Check that mean and median strategies are rejected on string columns.

    For each strategy, both ``fit_transform`` and ``fit`` must raise a
    ``ValueError`` whose message names the offending strategy.
    """
    # test col with all strings
    X = non_numeric_df

    cases = [
        # (column, strategy, expected error message)
        ('A', "mean", "Cannot use mean strategy with non-numeric data"),
        ('B', "median", "Cannot use median strategy with non-numeric data"),
    ]
    for column, strategy, message in cases:
        strategies = {column: {"impute_strategy": strategy}}

        # Construct the transformer outside ``pytest.raises`` so that only
        # the call under test can satisfy the expected error.
        transformer = PerColumnImputer(impute_strategies=strategies)
        with pytest.raises(ValueError, match=message):
            transformer.fit_transform(X)

        # Use a fresh instance so the ``fit`` check starts from a clean state.
        transformer = PerColumnImputer(impute_strategies=strategies)
        with pytest.raises(ValueError, match=message):
            transformer.fit(X)
# Example no. 2
# 0
def test_non_numeric_valid(non_numeric_df):
    """Check that most_frequent and constant strategies accept string data.

    Each scenario configures a strategy for a single column, converts the
    transformer output with ``to_dataframe()`` and compares it against an
    expected frame whose columns are all categorical.
    """
    X = non_numeric_df

    # most frequent with all strings
    strategies = {'C': {"impute_strategy": "most_frequent"}}
    transformer = PerColumnImputer(impute_strategies=strategies)

    X_expected = pd.DataFrame({"A": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "B": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "C": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "D": pd.Series(["a", "b", "a", "a"], dtype="category")})

    X_t = transformer.fit_transform(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())

    # constant with all strings
    strategies = {'D': {"impute_strategy": "constant", "fill_value": 100}}
    transformer = PerColumnImputer(impute_strategies=strategies)

    # Column D is expected to carry the fill value 100 in its last row.
    X_expected = pd.DataFrame({"A": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "B": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "C": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "D": pd.Series(["a", "b", "a", 100], dtype="category")})
    X_t = transformer.fit_transform(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
def test_all_strategies():
    """Exercise mean, median, constant and most_frequent imputation together.

    Each of the four columns is given its own strategy, and the final
    all-NaN row of the input is expected to be filled accordingly.
    """
    column_names = ['A', 'B', 'C', 'D']
    observed_rows = [[2, 4, 6, "a"], [4, 6, 8, "a"], [6, 4, 8, "b"]]

    X = pd.DataFrame(observed_rows + [[np.nan, np.nan, np.nan, np.nan]],
                     columns=column_names)
    # Last row: mean of A, median of B, constant 100 for C, mode of D.
    X_expected = pd.DataFrame(observed_rows + [[4, 4, 100, "a"]],
                              columns=column_names)

    strategies = {
        'A': {"impute_strategy": "mean"},
        'B': {"impute_strategy": "median"},
        'C': {"impute_strategy": "constant", "fill_value": 100},
        'D': {"impute_strategy": "most_frequent"},
    }

    imputer = PerColumnImputer(impute_strategies=strategies)
    imputed = imputer.fit_transform(X)
    assert_frame_equal(X_expected, imputed, check_dtype=False)
def test_fit_transform_drop_all_nan_columns():
    """Verify fit_transform drops an all-NaN column and leaves its input intact."""

    def build_input():
        # Build the input from scratch each time so the final comparison is
        # made against an independently constructed frame.
        return pd.DataFrame({
            "all_nan": [np.nan, np.nan, np.nan],
            "some_nan": [np.nan, 1, 0],
            "another_col": [0, 1, 2],
        })

    X = build_input()
    column_names = ("all_nan", "some_nan", "another_col")
    strategies = {
        name: {"impute_strategy": "most_frequent"} for name in column_names
    }
    imputer = PerColumnImputer(impute_strategies=strategies)

    # "all_nan" is absent from the expected output, and the missing entry in
    # "some_nan" is expected to become 0.
    expected = pd.DataFrame({
        "some_nan": [0, 1, 0],
        "another_col": [0, 1, 2],
    })
    result = imputer.fit_transform(X)
    assert_frame_equal(expected, result, check_dtype=False)

    # The caller's frame must be unchanged by the transformation.
    assert_frame_equal(X, build_input())
# Example no. 5
# 0
def test_transform_drop_all_nan_columns_empty():
    """Verify an all-NaN frame yields an empty result without mutating the input.

    Both the ``fit_transform`` path and the separate ``fit``/``transform``
    path are checked.
    """
    # NOTE(review): the strategy key is the string '0' while the frame's
    # default column label is the integer 0 - confirm this is intentional.

    def all_nan_frame():
        # Single-row frame in which every value is missing.
        return pd.DataFrame([[np.nan, np.nan, np.nan]])

    X = all_nan_frame()

    # One-step path: fit_transform.
    imputer = PerColumnImputer(
        impute_strategies={'0': {"impute_strategy": "most_frequent"}})
    one_step = imputer.fit_transform(X)
    assert one_step.to_dataframe().empty
    assert_frame_equal(X, all_nan_frame())

    # Two-step path: fit, then transform, on a fresh transformer.
    imputer = PerColumnImputer(
        impute_strategies={'0': {"impute_strategy": "most_frequent"}})
    imputer.fit(X)
    two_step = imputer.transform(X)
    assert two_step.to_dataframe().empty
    assert_frame_equal(X, all_nan_frame())
def test_non_numeric_valid(non_numeric_df):
    """Check most_frequent and constant strategies on string data.

    The transformer output is compared directly against plain frames, with
    dtype checking disabled.
    """
    X = non_numeric_df
    labels = ['A', 'B', 'C', 'D']

    # most frequent with all strings
    imputer = PerColumnImputer(
        impute_strategies={'C': {"impute_strategy": "most_frequent"}})
    expected = pd.DataFrame(
        [["a"] * 4, ["b"] * 4, ["a"] * 4, ["a"] * 4],
        columns=labels)
    assert_frame_equal(expected, imputer.fit_transform(X), check_dtype=False)

    # constant with all strings
    imputer = PerColumnImputer(
        impute_strategies={'D': {"impute_strategy": "constant",
                                 "fill_value": 100}})
    # The last row of column D is expected to hold the fill value.
    expected = pd.DataFrame(
        [["a"] * 4, ["b"] * 4, ["a"] * 4, ["a", "a", "a", 100]],
        columns=labels)
    assert_frame_equal(expected, imputer.fit_transform(X), check_dtype=False)
def test_fit_transform():
    """Check that fit + transform and fit_transform agree on median imputation.

    The single column ``A`` holds 2, 4, 6 and one missing value, so the
    expected frame has the missing entry replaced by the median, 4.
    """
    X = pd.DataFrame([[2], [4], [6], [np.nan]])

    X_expected = pd.DataFrame([[2], [4], [6], [4]])

    X.columns = ['A']
    X_expected.columns = ['A']

    strategies = {'A': {"impute_strategy": "median"}}

    # Two-step path: fit, then transform.
    transformer = PerColumnImputer(impute_strategies=strategies)
    transformer.fit(X)
    X_t = transformer.transform(X)

    # One-step path on a fresh transformer.
    transformer = PerColumnImputer(impute_strategies=strategies)
    X_fit_transform = transformer.fit_transform(X)

    # Check against the expected values as well as against each other, so
    # the test cannot pass when both paths are wrong in the same way.
    assert_frame_equal(X_expected, X_t, check_dtype=False)
    assert_frame_equal(X_t, X_fit_transform, check_dtype=False)
# Example no. 8
# 0
def test_all_strategies():
    """Exercise all four imputation strategies, one per column.

    The transformer output is converted with ``to_dataframe()`` before being
    compared, with dtype checking disabled.
    """
    strategies = {
        'A': {"impute_strategy": "mean"},
        'B': {"impute_strategy": "median"},
        'C': {"impute_strategy": "constant", "fill_value": 100},
        'D': {"impute_strategy": "most_frequent"},
    }

    # Every column ends with a missing value for its strategy to fill.
    X = pd.DataFrame({
        "A": [2, 4, 6, np.nan],
        "B": [4, 6, 4, np.nan],
        "C": [6, 8, 8, np.nan],
        "D": ["a", "a", "b", np.nan],
    })

    # Column D is expected as categorical.
    X_expected = pd.DataFrame({
        "A": [2, 4, 6, 4],
        "B": [4, 6, 4, 4],
        "C": [6, 8, 8, 100],
        "D": pd.Series(["a", "a", "b", "a"], dtype="category"),
    })

    imputer = PerColumnImputer(impute_strategies=strategies)
    imputed = imputer.fit_transform(X)
    assert_frame_equal(X_expected, imputed.to_dataframe(), check_dtype=False)