Exemplo n.º 1
0
def test_ColumnsSelector_columns_not_present():
    """Fitting must raise ValueError when a requested column is absent.

    The selector is configured with a single column name that does not
    appear in the fixture DataFrame, and ``fit`` is expected to reject it.
    """
    # Fixture mixing categorical, numerical and text columns.
    fixture_columns = {
        "cat1": ["A", "B", "A", "D"],
        "cat2": ["toto", "tata", "truc", "toto"],
        "num1": [0, 1, 2, 3],
        "num2": [1.1, 1.5, -2, -3.5],
        "num3": [-1, 1, 25, 4],
        "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
        "text2": ["a z", "b e", "d t", "a b c"],
    }
    frame = pd.DataFrame(fixture_columns)

    missing_column_selector = ColumnsSelector(
        columns_to_use=["column_isnot_present"])

    # The requested column is not in the DataFrame, so fit must fail.
    with pytest.raises(ValueError):
        missing_column_selector.fit(frame)
Exemplo n.º 2
0
def test_ColumnsSelector():
    """Exercise ColumnsSelector on DataFrames and numpy arrays.

    Scenarios covered, in order:
      * selection by column name (list and numpy array), and the ValueError
        expected when ``transform`` receives something different from what
        was fitted;
      * the same checks with ``raise_if_shape_differs=False``;
      * columns missing at fit time or at transform time;
      * regex selection (string and compiled patterns);
      * selection by integer position;
      * ``columns_to_use="all"`` and ``columns_to_drop`` (names, numpy array,
        regex, positions, ``"all"``);
      * single-column selection, refitting on a second dataset, numpy input;
      * combining ``columns_to_use`` with ``columns_to_drop``;
      * ``get_feature_names`` with and without ``input_features``;
      * the ``"object"`` keyword for ``columns_to_use`` / ``columns_to_drop``.
    """

    # Fitting fixture: 2 categorical, 3 numerical and 2 text columns, 4 rows.
    dfX = pd.DataFrame({
        "cat1": ["A", "B", "A", "D"],
        "cat2": ["toto", "tata", "truc", "toto"],
        "num1": [0, 1, 2, 3],
        "num2": [1.1, 1.5, -2, -3.5],
        "num3": [-1, 1, 25, 4],
        "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
        "text2": ["a z", "b e", "d t", "a b c"],
    })

    # Second fixture with the same columns and 2 rows, used for transform.
    dfX2 = pd.DataFrame({
        "cat1": ["D", "B"],
        "cat2": ["toto", "newcat"],
        "num1": [5, 6],
        "num2": [0.1, -5.2],
        "num3": [2, -1],
        "text1": ["dd ee", "aa"],
        "text2": ["t a c", "z b"],
    })

    # --- Selection by column name, given as a list ---
    selector = ColumnsSelector(columns_to_use=["text1", "text2"])
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # --- Same selection, with the names given as a numpy array ---
    selector = ColumnsSelector(columns_to_use=np.array(["text1", "text2"]))
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # The selector was fitted on the 7-column frame; the inputs below differ
    # from it and are all expected to be rejected.
    with pytest.raises(ValueError):
        selector.transform(dfX2.loc[:, ["text2", "text1"]]
                           )  # Error because not correct number of columns

    with pytest.raises(ValueError):
        selector.transform(
            dfX2.loc[:, ["text3", "text1"]])  # Error because text2 not in df

    with pytest.raises(ValueError):
        selector.transform(dfX2.values)  # Error because type changes

    # This error might be ignored later

    ###  Same thing but with 'raise_if_shape_differs=False'
    selector = ColumnsSelector(columns_to_use=np.array(["text1", "text2"]),
                               raise_if_shape_differs=False)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # A 2-column frame holding only the selected columns is now accepted and
    # must give the same result as transforming the full frame.
    r3 = selector.transform(
        dfX2.loc[:, ["text2", "text1"]])  # Don't raise error anymore
    assert r3.shape == r2.shape
    assert (r3 == r2).all(axis=None)

    with pytest.raises(ValueError):
        r3 = selector.transform(
            dfX2.loc[:, ["text3", "text1"]]
        )  # Still raise an error : because text2 isn't present

    with pytest.raises(ValueError):
        selector.transform(dfX2.values)  # Error because type changes

    # --- Columns missing at fit time or at transform time ---
    selector = ColumnsSelector(columns_to_use=["text1", "text2", "text3"])
    with pytest.raises(ValueError):  # Error because 'text3' isn't present
        selector.fit(dfX)

    selector = ColumnsSelector(columns_to_use=["text1", "text2"])
    selector.fit(dfX)

    dfX3 = dfX2.copy()
    del dfX3["text1"]
    with pytest.raises(
            ValueError):  # Error because 'text1' is no longer present
        selector.transform(dfX3)

    # Same number of columns as the fitted frame, but 'text1' is renamed
    # to 'textAA': transform must still fail.
    dfX3 = dfX2.copy()
    dfX3.columns = ["cat1", "cat2", "num1", "num2", "num3", "textAA", "text2"]
    with pytest.raises(ValueError):
        selector.transform(dfX3)

    # --- Regex selection with a string pattern ---
    selector = ColumnsSelector(columns_to_use=["^text"], regex_match=True)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    # Same columns as dfX but in a different order; the transform result is
    # still expected to equal r1.
    dfX3 = dfX.loc[:,
                   ["text2", "cat1", "cat2", "num1", "num2", "num3", "text1"
                    ]].copy()

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert (selector.transform(dfX3) == r1).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # --- Regex selection with a pre-compiled pattern ---
    selector = ColumnsSelector(columns_to_use=[re.compile("^text")],
                               regex_match=True)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    dfX3 = dfX.loc[:,
                   ["text2", "cat1", "cat2", "num1", "num2", "num3", "text1"
                    ]].copy()

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert (selector.transform(dfX3) == r1).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # With regex_match=False the same pattern must make fitting fail.
    selector = ColumnsSelector(columns_to_use=["^text"], regex_match=False)
    # r1 / r2 are reused below as the expected output of positional selectors.
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]
    with pytest.raises(ValueError):
        selector.fit_transform(dfX)

    # --- Selection by integer position (5 and 6 are 'text1' and 'text2') ---
    selector2 = ColumnsSelector(columns_to_use=[5, 6])
    assert (selector2.fit_transform(dfX) == r1).all().all()
    assert (selector2.transform(dfX2) == r2).all().all()

    selector2b = ColumnsSelector(columns_to_use=np.array([5, 6]))
    assert (selector2b.fit_transform(dfX) == r1).all().all()
    assert (selector2b.transform(dfX2) == r2).all().all()

    with pytest.raises(ValueError):
        selector2b.transform(dfX.iloc[:, 0:-1])  # missing one column

    selector3 = ColumnsSelector(columns_to_use=[10, 5])
    with pytest.raises(ValueError):
        selector3.fit(dfX)  # Error because column 10 is not here

    # Fitted by position, then given a frame with one column removed.
    selector3 = ColumnsSelector(columns_to_use=[5, 6])
    selector3.fit(dfX)
    dfX_oneless_columns = dfX.copy()
    del dfX_oneless_columns["text1"]
    with pytest.raises(ValueError):
        selector3.transform(dfX_oneless_columns)

    # --- columns_to_use="all" returns the frame unchanged ---
    selector_none = ColumnsSelector(columns_to_use="all")
    assert (selector_none.fit_transform(dfX) == dfX).all().all()

    # --- columns_to_drop: by name (list), numpy array, then regex ---
    antiselector = ColumnsSelector(columns_to_drop=["cat1", "cat2"])
    assert (antiselector.fit_transform(dfX) ==
            dfX.loc[:,
                    ["num1", "num2", "num3", "text1", "text2"]]).all().all()
    assert antiselector.get_feature_names() == [
        "num1", "num2", "num3", "text1", "text2"
    ]

    antiselector = ColumnsSelector(columns_to_drop=np.array(["cat1", "cat2"]))
    assert (antiselector.fit_transform(dfX) ==
            dfX.loc[:,
                    ["num1", "num2", "num3", "text1", "text2"]]).all().all()
    assert antiselector.get_feature_names() == [
        "num1", "num2", "num3", "text1", "text2"
    ]

    antiselector = ColumnsSelector(columns_to_drop=["^cat"], regex_match=True)
    assert (antiselector.fit_transform(dfX) ==
            dfX.loc[:,
                    ["num1", "num2", "num3", "text1", "text2"]]).all().all()
    assert antiselector.get_feature_names() == [
        "num1", "num2", "num3", "text1", "text2"
    ]

    # --- Dropping every column: the result keeps its rows but has 0 columns ---
    cols = ["cat1", "cat2", "num1", "num2", "num3", "text1", "text2"]
    antiselector2 = ColumnsSelector(columns_to_drop=cols)
    assert antiselector2.fit_transform(dfX).shape == (4, 0)  # No column
    assert antiselector2.transform(dfX2).shape == (2, 0)
    assert antiselector2.get_feature_names() == []

    # Same check on numpy input, dropping by position (list, then array).
    cols = [0, 1, 2, 3, 4, 5, 6]
    antiselector3 = ColumnsSelector(columns_to_drop=cols)
    assert antiselector3.fit_transform(dfX.values).shape == (4, 0)  # No column
    assert antiselector3.transform(dfX2.values).shape == (2, 0)  # No column
    assert antiselector3.get_feature_names() == []

    cols = [0, 1, 2, 3, 4, 5, 6]
    antiselector3 = ColumnsSelector(columns_to_drop=np.array(cols))
    assert antiselector3.fit_transform(dfX.values).shape == (4, 0)  # No column
    assert antiselector3.transform(dfX2.values).shape == (2, 0)  # No column
    assert antiselector3.get_feature_names() == []

    # columns_to_drop="all" on numpy input, then on DataFrame input.
    antiselector4 = ColumnsSelector(columns_to_drop="all")
    assert antiselector4.fit_transform(dfX.values).shape == (4, 0)  # No column
    assert antiselector4.transform(dfX2.values).shape == (2, 0)
    assert antiselector4.get_feature_names() == []

    antiselector5 = ColumnsSelector(columns_to_drop="all")
    assert antiselector5.fit_transform(dfX).shape == (4, 0)  # No column
    assert antiselector5.transform(dfX2).shape == (2, 0)
    assert antiselector5.get_feature_names() == []

    # --- Single-column selection ---
    selector3 = ColumnsSelector(columns_to_use=["num1"])
    n1 = dfX.loc[:, ["num1"]]
    n2 = dfX2.loc[:, ["num1"]]

    # Selecting one column must still yield a DataFrame.
    r1 = selector3.fit_transform(dfX)
    r2 = selector3.transform(dfX2)

    assert isinstance(r1, pd.DataFrame)
    assert isinstance(r2, pd.DataFrame)

    assert (r1 == n1).all().all()
    assert (r2 == n2).all().all()

    # --- Dropping by name / by position; the selector is refitted on the
    # second dataset (fit_transform is called on both frames) ---
    dfrest = dfX.loc[:, ["num1", "num2", "num3", "text1", "text2"]]
    dfrest2 = dfX2.loc[:, ["num1", "num2", "num3", "text1", "text2"]]
    selector4 = ColumnsSelector(columns_to_drop=["cat1", "cat2"])

    assert (selector4.fit_transform(dfX) == dfrest).all().all()
    assert (selector4.fit_transform(dfX2) == dfrest2).all().all()

    selector5 = ColumnsSelector(columns_to_drop=[0, 1])
    assert (selector5.fit_transform(dfX) == dfrest).all().all()
    assert (selector5.fit_transform(dfX2) == dfrest2).all().all()

    # --- numpy input: positional selection keeps the first two columns ---
    selector6 = ColumnsSelector(columns_to_use=[0, 1])
    xx = np.random.randn(10, 5)
    xx2 = np.random.randn(3, 5)
    assert np.array_equal(selector6.fit_transform(xx), xx[:, 0:2])
    assert np.array_equal(selector6.fit_transform(xx2), xx2[:, 0:2])

    # String column names on a numpy array must be rejected at fit time.
    selector7 = ColumnsSelector(columns_to_use=["num1", "num2"])

    with pytest.raises(ValueError):
        selector7.fit(xx)

    # --- columns_to_use combined with columns_to_drop ---
    selector_and_antiselector = ColumnsSelector(
        columns_to_use=["num1", "num2", "num3"], columns_to_drop=["num3"])
    assert (selector_and_antiselector.fit_transform(dfX) ==
            dfX.loc[:, ["num1", "num2"]]).all().all()
    assert selector_and_antiselector.get_feature_names() == ["num1", "num2"]

    # Same combination expressed with regex patterns.
    selector_and_antiselector2 = ColumnsSelector(columns_to_use=["num"],
                                                 columns_to_drop=["3"],
                                                 regex_match=True)
    assert (selector_and_antiselector2.fit_transform(dfX) ==
            dfX.loc[:, ["num1", "num2"]]).all().all()
    assert selector_and_antiselector2.get_feature_names() == ["num1", "num2"]

    # --- get_feature_names on numpy input: positions by default, names taken
    # from input_features when it is provided ---
    X = np.random.randn(20, 10)
    input_features = [("COL_%d" % i) for i in range(10)]
    selector = ColumnsSelector(columns_to_use=[0, 1, 5, 9])
    Xsubset = selector.fit_transform(X)

    assert (Xsubset == X[:, [0, 1, 5, 9]]).all()
    assert selector.get_feature_names() == [0, 1, 5, 9]
    assert selector.get_feature_names(input_features=input_features) == [
        "COL_0", "COL_1", "COL_5", "COL_9"
    ]

    # --- "object" keyword: on this fixture it selects the string-valued
    # columns (cat* and text*) ---
    selector_with_type = ColumnsSelector(columns_to_use="object")

    r1 = dfX.loc[:, ["cat1", "cat2", "text1", "text2"]]
    r2 = dfX2.loc[:, ["cat1", "cat2", "text1", "text2"]]

    assert (selector_with_type.fit_transform(dfX) == r1).all().all()
    assert (selector_with_type.transform(dfX2) == r2).all().all()
    assert selector_with_type.get_feature_names() == [
        "cat1", "cat2", "text1", "text2"
    ]

    # Dropping "object" leaves only the numerical columns.
    selector_with_type = ColumnsSelector(columns_to_drop="object")

    r1 = dfX.loc[:, ["num1", "num2", "num3"]]
    r2 = dfX2.loc[:, ["num1", "num2", "num3"]]

    assert (selector_with_type.fit_transform(dfX) == r1).all().all()
    assert (selector_with_type.transform(dfX2) == r2).all().all()
    assert selector_with_type.get_feature_names() == ["num1", "num2", "num3"]

    # "object" selection combined with an explicit list of columns to drop.
    selector = ColumnsSelector(columns_to_use="object",
                               columns_to_drop=["text1", "text2"])
    r1 = dfX.loc[:, ["cat1", "cat2"]]
    r2 = dfX2.loc[:, ["cat1", "cat2"]]
    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["cat1", "cat2"]