示例#1
0
def test_ColumnsSelector_empty_column():
    """Selecting no column (``[]`` or ``None``) must give an empty DataFrame.

    For both the frame used to fit and a second frame, the output keeps the
    number of rows, has zero columns and is still a ``pd.DataFrame``; the
    selector reports no feature name.
    """
    train_data = {
        "cat1": ["A", "B", "A", "D"],
        "cat2": ["toto", "tata", "truc", "toto"],
        "num1": [0, 1, 2, 3],
        "num2": [1.1, 1.5, -2, -3.5],
        "num3": [-1, 1, 25, 4],
        "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
        "text2": ["a z", "b e", "d t", "a b c"],
    }
    new_data = {
        "cat1": ["D", "B"],
        "cat2": ["toto", "newcat"],
        "num1": [5, 6],
        "num2": [0.1, -5.2],
        "num3": [2, -1],
        "text1": ["dd ee", "aa"],
        "text2": ["t a c", "z b"],
    }
    train_frame = pd.DataFrame(train_data)
    new_frame = pd.DataFrame(new_data)

    for empty_selection in ([], None):
        selector = ColumnsSelector(columns_to_use=empty_selection)

        # Fit and transform on the training frame
        fitted = selector.fit_transform(train_frame)
        assert fitted.shape == (train_frame.shape[0], 0)
        assert isinstance(fitted, pd.DataFrame)
        assert selector.get_feature_names() == []

        # Transform only, on a frame not seen during fit
        transformed = selector.transform(new_frame)
        assert transformed.shape == (new_frame.shape[0], 0)
        assert isinstance(transformed, pd.DataFrame)
示例#2
0
def test_ColumnsSelector_columns_not_present():
    """Fitting must raise ``ValueError`` when a requested column is absent."""
    frame = pd.DataFrame(dict(
        cat1=["A", "B", "A", "D"],
        cat2=["toto", "tata", "truc", "toto"],
        num1=[0, 1, 2, 3],
        num2=[1.1, 1.5, -2, -3.5],
        num3=[-1, 1, 25, 4],
        text1=["aa bb", "bb bb cc", "dd aa cc", "ee"],
        text2=["a z", "b e", "d t", "a b c"],
    ))

    unknown_column_selector = ColumnsSelector(
        columns_to_use=["column_isnot_present"])

    # The requested column is not one of the DataFrame's columns
    with pytest.raises(ValueError):
        unknown_column_selector.fit(frame)
示例#3
0
def test_ColumnsSelector():
    """Exercise ``ColumnsSelector`` on DataFrames and numpy arrays.

    Sections, in order: selection by name (list, then numpy array), errors
    raised at transform time, ``raise_if_shape_differs=False``, regex
    selection, positional selection, ``columns_to_drop``, combined
    use/drop, feature names on numpy input, and selection with the string
    ``"object"``.
    """

    dfX = pd.DataFrame({
        "cat1": ["A", "B", "A", "D"],
        "cat2": ["toto", "tata", "truc", "toto"],
        "num1": [0, 1, 2, 3],
        "num2": [1.1, 1.5, -2, -3.5],
        "num3": [-1, 1, 25, 4],
        "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
        "text2": ["a z", "b e", "d t", "a b c"],
    })

    dfX2 = pd.DataFrame({
        "cat1": ["D", "B"],
        "cat2": ["toto", "newcat"],
        "num1": [5, 6],
        "num2": [0.1, -5.2],
        "num3": [2, -1],
        "text1": ["dd ee", "aa"],
        "text2": ["t a c", "z b"],
    })

    # --- Selection by name, columns given as a list ---
    selector = ColumnsSelector(columns_to_use=["text1", "text2"])
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # --- Selection by name, columns given as a numpy array ---
    selector = ColumnsSelector(columns_to_use=np.array(["text1", "text2"]))
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # Error because not correct number of columns
    with pytest.raises(ValueError):
        selector.transform(dfX2.loc[:, ["text2", "text1"]])

    # Error because text2 not in df.
    # 'text3' does not exist in dfX2, so the input is built with ``reindex``
    # (missing column filled with NaN): ``.loc`` with an unknown label raises
    # KeyError on pandas >= 1.0, before the selector would even be called.
    with pytest.raises(ValueError):
        selector.transform(dfX2.reindex(columns=["text3", "text1"]))

    # Error because type changes (fitted on a DataFrame, given a numpy array).
    # This error might be ignored later
    with pytest.raises(ValueError):
        selector.transform(dfX2.values)

    # --- Same thing but with 'raise_if_shape_differs=False' ---
    selector = ColumnsSelector(columns_to_use=np.array(["text1", "text2"]),
                               raise_if_shape_differs=False)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # Don't raise error anymore
    r3 = selector.transform(dfX2.loc[:, ["text2", "text1"]])
    assert r3.shape == r2.shape
    assert (r3 == r2).all(axis=None)

    # Still raise an error : because text2 isn't present
    # (``reindex`` for the same reason as above: 'text3' is not in dfX2)
    with pytest.raises(ValueError):
        selector.transform(dfX2.reindex(columns=["text3", "text1"]))

    # Error because type changes
    with pytest.raises(ValueError):
        selector.transform(dfX2.values)

    # --- Missing columns, at fit time and at transform time ---
    selector = ColumnsSelector(columns_to_use=["text1", "text2", "text3"])
    with pytest.raises(ValueError):  # Error because 'text3' isn't present
        selector.fit(dfX)

    selector = ColumnsSelector(columns_to_use=["text1", "text2"])
    selector.fit(dfX)

    dfX3 = dfX2.copy()
    del dfX3["text1"]
    # Error because 'text1' is no longer present
    with pytest.raises(ValueError):
        selector.transform(dfX3)

    # Same number of columns, but 'text1' has been renamed to 'textAA'
    dfX3 = dfX2.copy()
    dfX3.columns = ["cat1", "cat2", "num1", "num2", "num3", "textAA", "text2"]
    with pytest.raises(ValueError):
        selector.transform(dfX3)

    # --- Regex selection, pattern given as a string ---
    selector = ColumnsSelector(columns_to_use=["^text"], regex_match=True)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    # Same columns as dfX, in a different order
    dfX3 = dfX.loc[
        :, ["text2", "cat1", "cat2", "num1", "num2", "num3", "text1"]].copy()

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert (selector.transform(dfX3) == r1).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # --- Regex selection, pattern given as a compiled regex ---
    selector = ColumnsSelector(columns_to_use=[re.compile("^text")],
                               regex_match=True)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]

    dfX3 = dfX.loc[
        :, ["text2", "cat1", "cat2", "num1", "num2", "num3", "text1"]].copy()

    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert (selector.transform(dfX3) == r1).all().all()
    assert selector.get_feature_names() == ["text1", "text2"]

    # Without regex_match, "^text" is not the name of any column of dfX
    selector = ColumnsSelector(columns_to_use=["^text"], regex_match=False)
    r1 = dfX.loc[:, ["text1", "text2"]]
    r2 = dfX2.loc[:, ["text1", "text2"]]
    with pytest.raises(ValueError):
        selector.fit_transform(dfX)

    # --- Selection by integer position (5 and 6 are 'text1' and 'text2') ---
    selector2 = ColumnsSelector(columns_to_use=[5, 6])
    assert (selector2.fit_transform(dfX) == r1).all().all()
    assert (selector2.transform(dfX2) == r2).all().all()

    selector2b = ColumnsSelector(columns_to_use=np.array([5, 6]))
    assert (selector2b.fit_transform(dfX) == r1).all().all()
    assert (selector2b.transform(dfX2) == r2).all().all()

    with pytest.raises(ValueError):
        selector2b.transform(dfX.iloc[:, 0:-1])  # missing one column

    selector3 = ColumnsSelector(columns_to_use=[10, 5])
    with pytest.raises(ValueError):
        selector3.fit(dfX)  # Error because column 10 is not here

    selector3 = ColumnsSelector(columns_to_use=[5, 6])
    selector3.fit(dfX)
    dfX_oneless_columns = dfX.copy()
    del dfX_oneless_columns["text1"]
    with pytest.raises(ValueError):
        selector3.transform(dfX_oneless_columns)

    # --- columns_to_use="all" keeps every column ---
    selector_none = ColumnsSelector(columns_to_use="all")
    assert (selector_none.fit_transform(dfX) == dfX).all().all()

    # --- columns_to_drop: by name (list, numpy array) and by regex ---
    antiselector = ColumnsSelector(columns_to_drop=["cat1", "cat2"])
    assert (antiselector.fit_transform(dfX) ==
            dfX.loc[:,
                    ["num1", "num2", "num3", "text1", "text2"]]).all().all()
    assert antiselector.get_feature_names() == [
        "num1", "num2", "num3", "text1", "text2"
    ]

    antiselector = ColumnsSelector(columns_to_drop=np.array(["cat1", "cat2"]))
    assert (antiselector.fit_transform(dfX) ==
            dfX.loc[:,
                    ["num1", "num2", "num3", "text1", "text2"]]).all().all()
    assert antiselector.get_feature_names() == [
        "num1", "num2", "num3", "text1", "text2"
    ]

    antiselector = ColumnsSelector(columns_to_drop=["^cat"], regex_match=True)
    assert (antiselector.fit_transform(dfX) ==
            dfX.loc[:,
                    ["num1", "num2", "num3", "text1", "text2"]]).all().all()
    assert antiselector.get_feature_names() == [
        "num1", "num2", "num3", "text1", "text2"
    ]

    # --- Dropping every column leaves zero columns ---
    cols = ["cat1", "cat2", "num1", "num2", "num3", "text1", "text2"]
    antiselector2 = ColumnsSelector(columns_to_drop=cols)
    assert antiselector2.fit_transform(dfX).shape == (4, 0)  # No column
    assert antiselector2.transform(dfX2).shape == (2, 0)
    assert antiselector2.get_feature_names() == []

    cols = [0, 1, 2, 3, 4, 5, 6]
    antiselector3 = ColumnsSelector(columns_to_drop=cols)
    assert antiselector3.fit_transform(dfX.values).shape == (4, 0)  # No column
    assert antiselector3.transform(dfX2.values).shape == (2, 0)  # No column
    assert antiselector3.get_feature_names() == []

    cols = [0, 1, 2, 3, 4, 5, 6]
    antiselector3 = ColumnsSelector(columns_to_drop=np.array(cols))
    assert antiselector3.fit_transform(dfX.values).shape == (4, 0)  # No column
    assert antiselector3.transform(dfX2.values).shape == (2, 0)  # No column
    assert antiselector3.get_feature_names() == []

    antiselector4 = ColumnsSelector(columns_to_drop="all")
    assert antiselector4.fit_transform(dfX.values).shape == (4, 0)  # No column
    assert antiselector4.transform(dfX2.values).shape == (2, 0)
    assert antiselector4.get_feature_names() == []

    antiselector5 = ColumnsSelector(columns_to_drop="all")
    assert antiselector5.fit_transform(dfX).shape == (4, 0)  # No column
    assert antiselector5.transform(dfX2).shape == (2, 0)
    assert antiselector5.get_feature_names() == []

    # --- A single selected column still comes back as a DataFrame ---
    selector3 = ColumnsSelector(columns_to_use=["num1"])
    n1 = dfX.loc[:, ["num1"]]
    n2 = dfX2.loc[:, ["num1"]]

    r1 = selector3.fit_transform(dfX)
    r2 = selector3.transform(dfX2)

    assert isinstance(r1, pd.DataFrame)
    assert isinstance(r2, pd.DataFrame)

    assert (r1 == n1).all().all()
    assert (r2 == n2).all().all()

    # --- Dropping by name and by position gives the same remaining columns ---
    dfrest = dfX.loc[:, ["num1", "num2", "num3", "text1", "text2"]]
    dfrest2 = dfX2.loc[:, ["num1", "num2", "num3", "text1", "text2"]]
    selector4 = ColumnsSelector(columns_to_drop=["cat1", "cat2"])

    assert (selector4.fit_transform(dfX) == dfrest).all().all()
    assert (selector4.fit_transform(dfX2) == dfrest2).all().all()

    selector5 = ColumnsSelector(columns_to_drop=[0, 1])
    assert (selector5.fit_transform(dfX) == dfrest).all().all()
    assert (selector5.fit_transform(dfX2) == dfrest2).all().all()

    # --- numpy input: positional selection works, names do not ---
    selector6 = ColumnsSelector(columns_to_use=[0, 1])
    xx = np.random.randn(10, 5)
    xx2 = np.random.randn(3, 5)
    assert np.array_equal(selector6.fit_transform(xx), xx[:, 0:2])
    assert np.array_equal(selector6.fit_transform(xx2), xx2[:, 0:2])

    selector7 = ColumnsSelector(columns_to_use=["num1", "num2"])

    with pytest.raises(ValueError):
        selector7.fit(xx)

    # --- columns_to_use and columns_to_drop combined ---
    selector_and_antiselector = ColumnsSelector(
        columns_to_use=["num1", "num2", "num3"], columns_to_drop=["num3"])
    assert (selector_and_antiselector.fit_transform(dfX) ==
            dfX.loc[:, ["num1", "num2"]]).all().all()
    assert selector_and_antiselector.get_feature_names() == ["num1", "num2"]

    selector_and_antiselector2 = ColumnsSelector(columns_to_use=["num"],
                                                 columns_to_drop=["3"],
                                                 regex_match=True)
    assert (selector_and_antiselector2.fit_transform(dfX) ==
            dfX.loc[:, ["num1", "num2"]]).all().all()
    assert selector_and_antiselector2.get_feature_names() == ["num1", "num2"]

    # --- Feature names on numpy input, with and without input_features ---
    X = np.random.randn(20, 10)
    input_features = [("COL_%d" % i) for i in range(10)]
    selector = ColumnsSelector(columns_to_use=[0, 1, 5, 9])
    Xsubset = selector.fit_transform(X)

    assert (Xsubset == X[:, [0, 1, 5, 9]]).all()
    assert selector.get_feature_names() == [0, 1, 5, 9]
    assert selector.get_feature_names(input_features=input_features) == [
        "COL_0", "COL_1", "COL_5", "COL_9"
    ]

    # --- Selection with the string "object" (use, drop, then both) ---
    selector_with_type = ColumnsSelector(columns_to_use="object")

    r1 = dfX.loc[:, ["cat1", "cat2", "text1", "text2"]]
    r2 = dfX2.loc[:, ["cat1", "cat2", "text1", "text2"]]

    assert (selector_with_type.fit_transform(dfX) == r1).all().all()
    assert (selector_with_type.transform(dfX2) == r2).all().all()
    assert selector_with_type.get_feature_names() == [
        "cat1", "cat2", "text1", "text2"
    ]

    selector_with_type = ColumnsSelector(columns_to_drop="object")

    r1 = dfX.loc[:, ["num1", "num2", "num3"]]
    r2 = dfX2.loc[:, ["num1", "num2", "num3"]]

    assert (selector_with_type.fit_transform(dfX) == r1).all().all()
    assert (selector_with_type.transform(dfX2) == r2).all().all()
    assert selector_with_type.get_feature_names() == ["num1", "num2", "num3"]

    selector = ColumnsSelector(columns_to_use="object",
                               columns_to_drop=["text1", "text2"])
    r1 = dfX.loc[:, ["cat1", "cat2"]]
    r2 = dfX2.loc[:, ["cat1", "cat2"]]
    assert (selector.fit_transform(dfX) == r1).all().all()
    assert (selector.transform(dfX2) == r2).all().all()
    assert selector.get_feature_names() == ["cat1", "cat2"]
示例#4
0
def test_ColumnsSelector_sparse_matrix(sparse_type):
    """Check ``ColumnsSelector`` on a matrix built by ``sparse_type``.

    ``sparse_type`` is called on a nested list to build the input. For
    every selection the output must have the exact type of the input and
    the expected shape, and the number of feature names must equal the
    number of output columns.
    """
    mat = sparse_type([[0, 0, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]])
    n_rows = mat.shape[0]
    input_type = type(mat)

    # no columns
    for no_column in (None, []):
        selector = ColumnsSelector(columns_to_use=no_column)

        fitted = selector.fit_transform(mat)
        assert fitted.shape == (n_rows, 0)
        assert type(fitted) is input_type

        transformed = selector.transform(mat)
        assert type(transformed) is input_type
        assert transformed.shape == (n_rows, 0)

        assert len(selector.get_feature_names()) == fitted.shape[1]

    # all columns: the very same object is handed back, by fit_transform and
    # by transform
    selector = ColumnsSelector(columns_to_use="all")
    everything = selector.fit_transform(mat)

    assert everything.shape == mat.shape
    assert type(everything) is input_type
    assert (mat.toarray() == everything.toarray()).all()
    assert everything is mat
    everything_again = selector.transform(mat)
    assert everything_again is mat
    assert len(selector.get_feature_names()) == everything.shape[1]

    # 1 column, then 2 columns, selected by position
    for positions in ((1,), (1, 2)):
        selector = ColumnsSelector(columns_to_use=list(positions))
        subset = selector.fit_transform(mat)

        assert subset.shape == (n_rows, len(positions))
        assert type(subset) is input_type
        assert (mat.toarray()[:, list(positions)] == subset.toarray()).all()
        assert len(selector.get_feature_names()) == subset.shape[1]
示例#5
0
def test_ColumnsSelector_dataframe():
    """Check ``ColumnsSelector`` on a small integer DataFrame.

    Covers no column, all columns (the input object itself is returned),
    and one or two columns addressed either by name or by integer position.
    In every case the output type must be the type of the input and the
    number of feature names must equal the number of output columns.
    """
    df = pd.DataFrame(np.array([[0, 0, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]]),
                      columns=["a", "b", "c"])

    # no columns
    for col in (None, []):
        selector = ColumnsSelector(columns_to_use=col)
        df1 = selector.fit_transform(df)
        assert df1.shape == (df.shape[0], 0)
        assert type(df1) is type(df)
        df1_bis = selector.transform(df)
        assert type(df1_bis) is type(df)
        assert df1_bis.shape == (df.shape[0], 0)
        assert len(selector.get_feature_names()) == df1.shape[1]

    # all columns
    selector = ColumnsSelector(columns_to_use="all")
    df1 = selector.fit_transform(df)
    assert df1.shape == df.shape
    assert type(df1) is type(df)
    assert (df1 == df).all().all()
    assert df1 is df
    df1_bis = selector.transform(df)
    assert df1_bis is df
    assert len(selector.get_feature_names()) == df1.shape[1]

    # 1 columns, str
    selector = ColumnsSelector(columns_to_use=["a"])
    df2 = selector.fit_transform(df)
    assert df2.shape == (df.shape[0], 1)
    assert type(df2) is type(df)
    assert (df2 == df.loc[:, ["a"]]).all().all()
    assert len(selector.get_feature_names()) == df2.shape[1]

    # 1 columns, int
    selector = ColumnsSelector(columns_to_use=[0])
    df2 = selector.fit_transform(df)
    assert df2.shape == (df.shape[0], 1)
    assert type(df2) is type(df)
    assert (df2 == df.loc[:, ["a"]]).all().all()
    assert len(selector.get_feature_names()) == df2.shape[1]

    # 2 columns, str
    selector = ColumnsSelector(columns_to_use=["a", "c"])
    df3 = selector.fit_transform(df)
    assert df3.shape == (df.shape[0], 2)
    assert type(df3) is type(df)
    assert (df3 == df.loc[:, ["a", "c"]]).all().all()
    assert len(selector.get_feature_names()) == df3.shape[1]

    # 2 columns, int
    # Positions 0 and 2 are columns "a" and "c"; this section previously
    # repeated the string selection and never tested two integer positions.
    selector = ColumnsSelector(columns_to_use=[0, 2])
    df3 = selector.fit_transform(df)
    assert df3.shape == (df.shape[0], 2)
    assert type(df3) is type(df)
    assert (df3 == df.loc[:, ["a", "c"]]).all().all()
    assert len(selector.get_feature_names()) == df3.shape[1]
示例#6
0
def test_ColumnsSelector__get_list_of_columns():
    """Check what ``ColumnsSelector._get_list_of_columns`` returns or raises.

    Expected results on a four-column DataFrame: ``"all"`` gives ``None``,
    an explicit list is returned unchanged, ``None`` gives ``[]``, and a
    variable-type string gives the columns of that type.
    """
    get_columns = ColumnsSelector._get_list_of_columns

    X = pd.DataFrame({
        "a": [0, 1, 2],
        "b": ["AAA", "BBB", "CCC"],
        "c": ["xx", "yy", "zz"],
        "d": [0.1, 0.2, 0.3]
    })

    # "all" is signalled by None rather than by the full list of columns
    assert get_columns("all", X, regex_match=False) is None

    # explicit lists (names or positions) come back as they were given
    for wanted in (["a"], ["a", "b"], [0, 1, 2]):
        assert get_columns(list(wanted), X, regex_match=False) == wanted

    # no selection at all
    assert get_columns(None, X, regex_match=False) == []

    # "object" picks the two string-valued columns
    assert get_columns("object", X, regex_match=False) == ["b", "c"]

    # error : because no DataFrame
    # NOTE(review): the two positional arguments look swapped compared with
    # every other call in this test (columns first, then X); confirm whether
    # ("object", X.values) was intended.
    with pytest.raises(TypeError):
        get_columns(X.values, "object", regex_match=False)

    # a dict is not an accepted way of describing columns
    with pytest.raises(TypeError):
        get_columns({"type": "not recognized"}, X, regex_match=False)

    # error : because regex_match
    with pytest.raises(ValueError):
        get_columns("object", X, regex_match=True)

    # every known variable type selects exactly the columns guessed as that type
    for variable_type in TypeOfVariables.alls:
        expected = []
        for name in X.columns:
            if guess_type_of_variable(X[name]) == variable_type:
                expected.append(name)
        assert get_columns(variable_type, X) == expected