def test_gpipeline_graphviz():
    """The ``graphviz`` property should return a Digraph, whether the pipeline is fitted or not."""

    def make_models():
        # fresh transformer instances for each pipeline, as in two independent constructions
        return {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": PassThrough(),
        }

    fitted_pipeline = GraphPipeline(make_models(), edges=[("ColNum", "Pt"), ("ColCat", "Pt")])
    fitted_pipeline.fit(dfX, y)
    assert isinstance(fitted_pipeline.graphviz, graphviz.dot.Digraph)

    # same graph with the edges declared in the other order, and no fit call:
    # graphviz should be available even before fit
    unfitted_pipeline = GraphPipeline(make_models(), edges=[("ColCat", "Pt"), ("ColNum", "Pt")])
    assert isinstance(unfitted_pipeline.graphviz, graphviz.dot.Digraph)
def test_graphpipeline_cycle():
    """Fitting a pipeline whose graph contains a cycle must raise ValueError."""
    nodes = {name: PassThrough() for name in "ABCD"}
    # edge ("C", "A") closes the cycle A -> B -> C -> A
    cyclic_pipeline = GraphPipeline(nodes, edges=[("A", "B", "C"), ("C", "A"), ("C", "D")])
    with pytest.raises(ValueError):
        cyclic_pipeline.fit(X, y)  # ValueError: The graph shouldn't have any cycle
def test_graphpipeline_set_params():
    """``set_params`` with the ``node__param`` syntax should reach the node's underlying model."""
    pipeline = GraphPipeline(
        {"A": PassThrough(), "B": PassThrough(), "C": DebugPassThrough(debug=True)},
        edges=[("A", "B", "C")],
    )
    assert pipeline.models["C"].debug is True
    pipeline.set_params(C__debug=False)
    assert pipeline.models["C"].debug is False
def test_graphpipeline_no_terminal_node():
    """A graph where every node has a successor (no terminal node) must fail on fit."""
    pipeline = GraphPipeline(
        {"A": PassThrough(), "B": PassThrough(), "C": PassThrough()},
        edges=[("A", "B", "C"), ("C", "A")],
    )
    # ValueError: the graph should have only one terminal node, instead i got 0
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
def test_graphpipeline_edge_not_in_models():
    """Referencing a node name in ``edges`` that is absent from ``models`` must raise at fit time."""
    pipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "PtNum": PassThrough(),
            "PtCat": PassThrough(),
        },
        edges=[("ColNum", "PtNummm"), ("ColCat", "PtCat")],  # "PtNummm" is a deliberate typo
    )
    # ValueError: "the node 'PtNummm' isn't in the dictionnary of models"
    with pytest.raises(ValueError):
        pipeline.fit(dfX, y)
def test_graphpipeline_more_than_one_terminal_node():
    """A graph with two disconnected branches has two terminal nodes and must fail on fit."""
    pipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "PtNum": PassThrough(),
            "PtCat": PassThrough(),
        },
        edges=[("ColNum", "PtNum"), ("ColCat", "PtCat")],  # two parallel chains, never merged
    )
    # ValueError: the graph should have only one terminal node, instead i got 2
    with pytest.raises(ValueError):
        pipeline.fit(dfX, y)
def test_gpipeline_regression():
    """End-to-end regression pipeline.

    Checks that:
    * the pipeline's predictions match the terminal Ridge model's predictions,
    * proba methods are absent (AttributeError) since the final model is a regressor,
    * feature-name introspection works for existing nodes and raises for unknown ones.
    """
    gpipeline = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])

    # renamed from ``X`` so the module-level ``X`` fixture is not shadowed
    X_num = dfX.loc[:, ["num1", "num2", "num3"]]
    gpipeline.fit(X_num, y)

    yhat = gpipeline.predict(X_num)
    yhat2 = gpipeline.models["Ridge"].predict(X_num)
    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    # a regressor exposes neither predict_proba nor predict_log_proba
    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X_num)
    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X_num)

    assert gpipeline.get_feature_names_at_node("PT") == list(X_num.columns)
    assert gpipeline.get_input_features_at_node("PT") == list(X_num.columns)
    assert gpipeline.get_input_features_at_node("Ridge") == list(X_num.columns)

    # fix: dropped the stray ``assert`` that was inside this raises-block — the call
    # raises before the assertion is evaluated, so the assert was dead code
    with pytest.raises(ValueError):
        gpipeline.get_feature_names_at_node("DONTEXIST")
def test_graphpipeline_concat_names():
    """Columns of merged branches are concatenated in edge-declaration order."""
    df = get_sample_df(size=100, seed=123)
    pipeline = GraphPipeline(
        models={
            "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
            "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
            "pt": PassThrough(),
        },
        edges=[("sel", "pt"), ("vec", "pt")],
    )
    pipeline.fit(df)
    result = pipeline.transform(df)

    # selected columns first (the "sel" branch), then the bag-of-word features ("vec" branch)
    expected_columns = ["float_col", "int_col"] + [
        "text_col__BAG__" + token for token in ("aaa", "bbb", "ccc", "ddd", "eee", "fff", "jjj")
    ]
    assert list(result.columns) == expected_columns
    assert pipeline.get_feature_names() == list(result.columns)
def test_gpipeline_raise_not_fitted():
    """Calling predict on a never-fitted pipeline must raise NotFittedError."""
    pipeline = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])
    with pytest.raises(NotFittedError):
        pipeline.predict(X)
def test_gpipeline_clustering():
    """Pipeline predictions must match the terminal clustering model's own predictions."""
    pipeline = GraphPipeline({"PT": PassThrough(), "kmeans": KMeans(n_clusters=2)}, [("PT", "kmeans")])
    pipeline.fit(X)
    labels_from_pipeline = pipeline.predict(X)
    labels_from_model = pipeline.models["kmeans"].predict(X)
    assert (labels_from_pipeline == labels_from_model).all()
def test_gpipeline_classification():
    """Pipeline probabilities and classes must match the terminal classifier's."""
    pipeline = GraphPipeline({"PT": PassThrough(), "Logit": LogisticRegression()}, [("PT", "Logit")])
    pipeline.fit(X, yc)
    proba_from_pipeline = pipeline.predict_proba(X)
    proba_from_model = pipeline.models["Logit"].predict_proba(X)
    assert proba_from_pipeline.shape == (X.shape[0], 2)
    assert (proba_from_pipeline == proba_from_model).all()
    assert list(pipeline.classes_) == [0, 1]
def test_gpipeline_clone():
    """``clone`` returns an unfitted copy whose model instances are new objects, not shared."""
    pipeline = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])
    pipeline.fit(X, y)

    pipeline_copy = clone(pipeline)
    with pytest.raises(NotFittedError):
        pipeline_copy.predict(X)  # the clone must not inherit the fitted state

    for name in pipeline.models.keys():
        assert name in pipeline_copy.models
        assert pipeline.models[name] is not pipeline_copy.models[name]
def test_GraphPipeline_from_sklearn():
    """``from_sklearn`` converts an sklearn Pipeline, both before and after it was fitted."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)

    sk_pipeline = Pipeline(steps=[("pt", PassThrough()), ("dt", DecisionTreeClassifier(random_state=123))])

    # Case 1: conversion of a NON-fitted sklearn Pipeline
    gpipeline = GraphPipeline.from_sklearn(sk_pipeline)
    assert isinstance(gpipeline, GraphPipeline)
    assert not gpipeline._already_fitted

    gpipeline.fit(X, y)
    pred_graph = gpipeline.predict(X)
    proba_graph = gpipeline.predict_proba(X)
    pred_sk = sk_pipeline.fit(X, y).predict(X)
    proba_sk = sk_pipeline.predict_proba(X)
    assert (pred_graph == pred_sk).all()
    assert (proba_graph == proba_sk).all()

    # Case 2: conversion of the ALREADY fitted pipeline — no refit needed
    gpipeline = GraphPipeline.from_sklearn(sk_pipeline)
    pred_graph = gpipeline.predict(X)
    proba_graph = gpipeline.predict_proba(X)
    pred_sk = sk_pipeline.predict(X)
    proba_sk = sk_pipeline.predict_proba(X)
    assert (pred_graph == pred_sk).all()
    assert (proba_graph == proba_sk).all()
def test_graphpipeline_other_input_syntaxes():
    """Every supported input syntax should produce the same internal complete graph."""

    def graph_of(pipeline):
        # trigger graph construction and return its (nodes, edges) as comparable sets
        pipeline._complete_init()
        return set(pipeline.complete_graph.nodes), set(pipeline.complete_graph.edges)

    # --- linear graph A -> B -> C ---
    expected = ({"A", "B", "C"}, {("A", "B"), ("B", "C")})

    # regular syntax: models dict + chained edge tuple
    pipeline = GraphPipeline({"A": PassThrough(), "B": PassThrough(), "C": PassThrough()}, edges=[("A", "B", "C")])
    assert graph_of(pipeline) == expected

    # sklearn-Pipeline-like syntax: list of (name, model) steps
    pipeline = GraphPipeline([("A", PassThrough()), ("B", PassThrough()), ("C", PassThrough())])
    assert graph_of(pipeline) == expected

    # --- graph with a merge: A -> B -> D <- C ---
    expected = ({"A", "B", "C", "D"}, {("A", "B"), ("B", "D"), ("C", "D")})

    # chained edge tuple plus a merging edge
    pipeline = GraphPipeline(
        {"A": PassThrough(), "B": PassThrough(), "C": PassThrough(), "D": PassThrough()},
        edges=[("A", "B", "D"), ("C", "D")],
    )
    assert graph_of(pipeline) == expected

    # explicit pairwise edges
    pipeline = GraphPipeline(
        {"A": PassThrough(), "B": PassThrough(), "C": PassThrough(), "D": PassThrough()},
        edges=[("A", "B"), ("B", "D"), ("C", "D")],
    )
    assert graph_of(pipeline) == expected

    # string syntax
    pipeline = GraphPipeline(
        {"A": PassThrough(), "B": PassThrough(), "C": PassThrough(), "D": PassThrough()},
        edges="A - B - D ; C - D",
    )
    assert graph_of(pipeline) == expected
def test_PassThrough():
    """PassThrough returns its input unchanged (the very same object) and tracks feature names."""
    df = get_sample_df(100, seed=123)

    pt = PassThrough()
    pt.fit(df)
    df_out = pt.transform(df)

    assert df_out.shape == df.shape
    assert (df_out == df).all().all()
    assert df_out is df  # no copy is made
    assert pt.get_feature_names() == list(df.columns)

    # once fitted on a DataFrame, a raw array or a column subset is rejected
    with pytest.raises(ValueError):
        pt.transform(df.values)
    with pytest.raises(ValueError):
        pt.transform(df.iloc[:, [0, 1]])

    X = np.random.randn(20, 5)
    input_features = ["COL_%d" % i for i in range(5)]

    pt = PassThrough()
    pt.fit(X)
    X_out = pt.transform(X)

    assert X.shape == X_out.shape  # same shape
    assert (X == X_out).all()  # same values
    assert X_out is X  # no copy

    # without names, numpy input yields positional feature names
    assert pt.get_feature_names() == [0, 1, 2, 3, 4]
    assert pt.get_feature_names(input_features=input_features) == ["COL_0", "COL_1", "COL_2", "COL_3", "COL_4"]
def test_graphpipeline_get_features_names():
    """Feature-name propagation through simple, merged, and chained graphs."""
    dfX = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )
    # input features of any entry node = every column of dfX
    all_columns = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]

    ### Test 1 ### one selector feeding a pass-through
    model = GraphPipeline({"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()}, edges=[("sel", "pt")])
    model.fit(dfX)

    assert model.get_feature_names() == ["cat1", "cat2"]  # features at the terminal node
    assert model.get_feature_names_at_node("pt") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel") == ["cat1", "cat2"]
    assert model.get_input_features_at_node("pt") == ["cat1", "cat2"]
    assert model.get_input_features_at_node("sel") == all_columns

    ### Test 2 ### two selectors merged into a pass-through
    model = GraphPipeline(
        {"sel1": ColumnsSelector(["cat1", "cat2"]), "sel2": ColumnsSelector(["num1", "num2"]), "pt": PassThrough()},
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(dfX)

    assert model.get_feature_names() == ["cat1", "cat2", "num1", "num2"]
    assert model.get_feature_names_at_node("pt") == ["cat1", "cat2", "num1", "num2"]
    assert model.get_feature_names_at_node("sel1") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel2") == ["num1", "num2"]
    assert model.get_input_features_at_node("pt") == ["cat1", "cat2", "num1", "num2"]
    assert model.get_input_features_at_node("sel1") == all_columns
    assert model.get_input_features_at_node("sel2") == all_columns

    ### Test 3 ### two selectors merged into a third selector, then a pass-through
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(dfX)

    assert model.get_feature_names() == ["cat1", "num1"]
    assert model.get_feature_names_at_node("pt") == ["cat1", "num1"]
    assert model.get_feature_names_at_node("sel12") == ["cat1", "num1"]
    assert model.get_feature_names_at_node("sel1") == ["cat1", "cat2"]
    assert model.get_feature_names_at_node("sel2") == ["num1", "num2"]
    assert model.get_input_features_at_node("pt") == ["cat1", "num1"]
    assert model.get_input_features_at_node("sel12") == ["cat1", "cat2", "num1", "num2"]
    assert model.get_input_features_at_node("sel1") == all_columns
    assert model.get_input_features_at_node("sel2") == all_columns
'AD_CONTENTS', 'CONTENTS_COVER', 'BUILDINGS_COVER', 'P1_MAR_STATUS', 'P1_POLICY_REFUSED', 'P1_SEX', 'APPR_ALARM', 'APPR_LOCKS', 'FLOODING', 'NEIGH_WATCH', 'OCC_STATUS', 'SAFE_INSTALLED', 'SEC_DISC_REQ', 'SUBSIDENCE', 'PAYMENT_METHOD', 'LEGAL_ADDON_PRE_REN', 'LEGAL_ADDON_POST_REN', 'HOME_EM_ADDON_PRE_REN', 'HOME_EM_ADDON_POST_REN', 'GARDEN_ADDON_PRE_REN', 'GARDEN_ADDON_POST_REN', 'KEYCARE_ADDON_PRE_REN', 'KEYCARE_ADDON_POST_REN', 'HP1_ADDON_PRE_REN', 'HP1_ADDON_POST_REN', 'HP2_ADDON_PRE_REN', 'HP2_ADDON_POST_REN', 'HP3_ADDON_PRE_REN', 'HP3_ADDON_POST_REN', 'MTA_FLAG'], desired_output_type='DataFrame', drop_unused_columns=True, drop_used_columns=True, encoding_type='dummy', max_cum_proba=0.95, max_modalities_number=100, max_na_percentage=0.05, min_modalities_number=20, min_nb_observations=10, regex_match=False) binary_columns_cleaner = BinaryColumnsCleaner() # this one does nothing but is used to use the pipeline without the classifier (for shap): pass_through = PassThrough() classifier = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0) pipeline = GraphPipeline(edges=[("ColumnsSelector", "NumImputer"), ("NumericalEncoder", "NumImputer", "BinaryColumnsCleaner", "PassThrough", "LGBMClassifier")], models={"ColumnsSelector": columns_selector, "NumericalEncoder": numerical_encoder, "NumImputer": imputer, "BinaryColumnsCleaner": binary_columns_cleaner, "PassThrough": pass_through,