def test_CountVectorizerWrapper_few_sample():
    """Fit-transform a handful of titanic rows with ``min_df=1`` and check the row count."""
    titanic_train = load_dataset("titanic")[0]

    # ``.loc`` slices by label and includes both endpoints, so ``0:10`` is
    # expected to give 11 rows (assumes the default integer index -- the
    # assertion below relies on it).
    text_sample = titanic_train.loc[0:10, ["name", "ticket"]]

    vectorizer = CountVectorizerWrapper(min_df=1)
    transformed = vectorizer.fit_transform(text_sample)

    assert transformed.shape[0] == 11
def test_graphpipeline_concat_names():
    """Check the column names produced when two branches are merged in a pass-through node.

    A column selector and a count vectorizer both feed ``"pt"``; the output
    columns must be the selector's columns followed by the bag-of-words
    columns, and ``get_feature_names`` must report the same list.
    """
    sample = get_sample_df(size=100, seed=123)

    models = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    edges = [("sel", "pt"), ("vec", "pt")]
    pipeline = GraphPipeline(models=models, edges=edges)

    pipeline.fit(sample)
    transformed = pipeline.transform(sample)
    result_columns = list(transformed.columns)

    expected_columns = [
        "float_col",
        "int_col",
        "text_col__BAG__aaa",
        "text_col__BAG__bbb",
        "text_col__BAG__ccc",
        "text_col__BAG__ddd",
        "text_col__BAG__eee",
        "text_col__BAG__fff",
        "text_col__BAG__jjj",
    ]
    assert result_columns == expected_columns
    assert pipeline.get_feature_names() == result_columns
def test_graphpipeline_blockselector_cv():
    """Cross-validate a block-selecting GraphPipeline on dict and BlockManager inputs.

    The test expects ``cross_val_score`` to raise ``ValueError`` when the
    input is a plain dict, and to return one score per fold when the same
    blocks are wrapped in a ``BlockManager``.
    """
    n_folds = 10
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame(
        {"text1": get_random_strings(100), "text2": get_random_strings(100)}
    )

    def build_pipeline():
        # Text block -> char-level count vectorizer -> tree; numeric block -> tree.
        return GraphPipeline(
            models={
                "BS_text": BlockSelector("text"),
                "CV": CountVectorizerWrapper(analyzer="char"),
                "BS_num": BlockSelector("num"),
                "RF": DecisionTreeClassifier(),
            },
            edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
        )

    ### X = dict
    blocks_as_dict = {"text": dfX_text, "num": Xnum}
    dict_pipeline = build_pipeline()

    from sklearn.model_selection import cross_val_score

    # Doesn't work: a dictionary can't be subset into folds.
    with pytest.raises(ValueError):
        cross_val_score(dict_pipeline, blocks_as_dict, y, scoring="accuracy", cv=n_folds)

    ### X = BlockManager
    blocks_as_manager = BlockManager({"text": dfX_text, "num": Xnum})
    manager_pipeline = build_pipeline()

    scores = cross_val_score(
        manager_pipeline, blocks_as_manager, y, scoring="accuracy", cv=n_folds
    )
    assert len(scores) == 10
def test_CountVectorizerWrapper_on_Serie():
    """Check that the wrapper accepts a single text column taken from the sample frame.

    Both ``fit_transform`` and ``transform`` must return a 2-D result with one
    row per input element and one column per feature name.
    """
    text_series = get_sample_df(size=100, seed=123)["text_col"]
    vectorizer = CountVectorizerWrapper()

    def assert_shape_matches(matrix):
        assert len(matrix.shape) == 2
        assert matrix.shape[0] == text_series.shape[0]
        assert matrix.shape[1] == len(vectorizer.get_feature_names())

    assert_shape_matches(vectorizer.fit_transform(text_series))
    assert_shape_matches(vectorizer.transform(text_series))
def test_graphpipeline_blockselector():
    """Exercise ``BlockSelector`` inside a GraphPipeline for several block containers.

    First a full classification pipeline is fitted on a dict of blocks, then
    the text and numeric blocks are concatenated through a pass-through node
    using a dict, a list and a ``BlockManager`` as input.
    """
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame(
        {"text1": get_random_strings(100), "text2": get_random_strings(100)}
    )
    text_columns = ["text1", "text2"]

    # --- prediction pipeline on a dict of blocks ---
    blocks = {"text": dfX_text, "num": Xnum}
    classifier_pipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )
    classifier_pipeline.fit(blocks, y)
    predictions = classifier_pipeline.predict(blocks)

    assert predictions.ndim == 1
    assert predictions.shape[0] == y.shape[0]

    def check_concatenation(container, text_key, num_key):
        # Select both blocks, merge them in a pass-through node and verify
        # that the result holds the text columns and the numeric values.
        concat_pipeline = GraphPipeline(
            models={
                "BS_text": BlockSelector(text_key),
                "BS_num": BlockSelector(num_key),
                "PT": DebugPassThrough(),
            },
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )
        merged = concat_pipeline.fit_transform(container)

        assert merged.shape[0] == dfX_text.shape[0]
        assert merged.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

        assert "text1" in merged.columns
        assert "text2" in merged.columns
        assert (merged.loc[:, text_columns] == dfX_text).all().all()

        numeric_columns = diff(list(merged.columns), ["text1", "text2"])
        assert (merged.loc[:, numeric_columns].values == Xnum).all()

    ### X = dict
    check_concatenation({"text": dfX_text, "num": Xnum}, "text", "num")

    ### X = list
    check_concatenation([dfX_text, Xnum], 0, 1)

    ### X = BlockManager
    check_concatenation(BlockManager({"text": dfX_text, "num": Xnum}), "text", "num")
def test_CountVectorizerWrapper():
    """Check the feature-name prefix for name-based and position-based column selection.

    On a DataFrame the prefix is expected to use the column name whether the
    column is given by name or by position; on the raw ``.values`` array it is
    expected to use the position (``"2"``).
    """
    sample = get_sample_df(size=100, seed=123)

    def check_prefix(data, columns, prefix):
        vectorizer = CountVectorizerWrapper(columns_to_use=columns)
        vectorizer.fit(data)
        feature_names = vectorizer.get_feature_names()
        assert all(name.startswith(prefix) for name in feature_names)

    # DataFrame input, column selected by name
    check_prefix(sample, ["text_col"], "text_col__BAG")

    # DataFrame input, column selected by position
    check_prefix(sample, [2], "text_col__BAG")

    # Array input: no column names available, so the position is used
    check_prefix(sample.values, [2], "2__BAG")
def test_CountVectorizerWrapper_output_type():
    """Check the dtype of the vectorizer output for one and two text columns.

    The test expects ``int32`` when no ``dtype`` is passed and ``int64`` when
    ``dtype="int64"`` is requested.
    """

    def one_text_column():
        return pd.DataFrame({"a": ["AA", "AAA", "bb"]})

    def two_text_columns():
        return pd.DataFrame({"a": ["AA", "AAA", "bb"], "b": ["xxx", "zzz", "xxx"]})

    # (constructor keyword arguments, expected dtype of the result)
    dtype_cases = [
        ({}, "int32"),
        ({"dtype": "int64"}, "int64"),
    ]

    for constructor_kwargs, expected_dtype in dtype_cases:
        for make_frame in (one_text_column, two_text_columns):
            vectorizer = CountVectorizerWrapper(**constructor_kwargs)
            result = vectorizer.fit_transform(make_frame())
            assert result.dtype == expected_dtype