示例#1
0
def test_CountVectorizerWrapper_output_type():
    """Check the dtype of ``fit_transform`` output for default and explicit ``dtype``.

    A wrapper built without arguments is expected to return an ``int32``
    result, and one built with ``dtype="int64"`` an ``int64`` result. Each
    configuration is exercised on a one-column and on a two-column DataFrame.
    """
    one_column = {"a": ["AA", "AAA", "bb"]}
    two_columns = {"a": ["AA", "AAA", "bb"], "b": ["xxx", "zzz", "xxx"]}

    # (constructor keyword arguments, expected dtype of the result)
    scenarios = (
        ({}, "int32"),
        ({"dtype": "int64"}, "int64"),
    )

    for init_kwargs, expected_dtype in scenarios:
        for columns in (one_column, two_columns):
            # A fresh, unfitted wrapper is used for every case.
            vect = CountVectorizerWrapper(**init_kwargs)
            res = vect.fit_transform(pd.DataFrame(columns))
            assert res.dtype == expected_dtype
示例#2
0
def test_CountVectorizerWrapper_few_sample():
    """Fit the wrapper on a handful of rows and check the output row count.

    Uses the ``name`` and ``ticket`` columns of the first element returned by
    ``load_dataset("titanic")``, restricted to index labels 0 through 10.
    """
    text_columns = ["name", "ticket"]
    train_data = load_dataset("titanic")[0]

    # ``.loc`` slices by label and includes both endpoints, so 0:10 selects
    # 11 rows (assuming a default integer index -- TODO confirm).
    subset = train_data.loc[0:10, text_columns]

    vectorizer = CountVectorizerWrapper(min_df=1)
    transformed = vectorizer.fit_transform(subset)

    n_rows = transformed.shape[0]
    assert n_rows == 11
示例#3
0
def test_CountVectorizerWrapper_on_Serie():
    """Check the wrapper on a single column (presumably a pandas Series).

    The results of both ``fit_transform`` and ``transform`` must be
    two-dimensional, with one row per input entry and one column per name
    returned by ``get_feature_names()``.
    """
    X = get_sample_df(size=100, seed=123)["text_col"]
    vect = CountVectorizerWrapper()

    def _check_result(matrix):
        # Same three checks are applied to each result.
        n_dims = len(matrix.shape)
        assert n_dims == 2
        assert matrix.shape[0] == X.shape[0]
        assert matrix.shape[1] == len(vect.get_feature_names())

    _check_result(vect.fit_transform(X))

    # Re-apply the already fitted wrapper to the same input.
    _check_result(vect.transform(X))