Exemplo n.º 1
0
def test_graphpipeline_cycle():
    """Fitting a GraphPipeline whose edges contain a cycle must raise ``ValueError``."""
    nodes = {name: PassThrough() for name in ("A", "B", "C", "D")}
    # A -> B -> C -> A closes a loop; C -> D is the only edge leaving it.
    cyclic_edges = [("A", "B", "C"), ("C", "A"), ("C", "D")]
    gpipeline = GraphPipeline(nodes, edges=cyclic_edges)

    with pytest.raises(ValueError):
        # Expected error: "The graph shouldn't have any cycle"
        gpipeline.fit(X, y)
Exemplo n.º 2
0
def test_gpipeline_raise_not_fitted():
    """Calling ``predict`` on a pipeline that was never fitted must raise ``NotFittedError``."""
    nodes = {"PT": PassThrough(), "Ridge": Ridge()}
    edges = [("PT", "Ridge")]
    unfitted_pipeline = GraphPipeline(nodes, edges)

    with pytest.raises(NotFittedError):
        unfitted_pipeline.predict(X)
Exemplo n.º 3
0
def test_gpipeline_clustering():
    """A pipeline ending in KMeans must predict the same labels as its own fitted KMeans node."""
    nodes = {"PT": PassThrough(), "kmeans": KMeans(n_clusters=2)}
    gpipeline = GraphPipeline(nodes, [("PT", "kmeans")])
    gpipeline.fit(X)

    labels_from_pipeline = gpipeline.predict(X)
    labels_from_node = gpipeline.models["kmeans"].predict(X)

    assert (labels_from_pipeline == labels_from_node).all()
Exemplo n.º 4
0
def test_graphpipeline_set_params():
    """``set_params`` with the ``<node>__<param>`` syntax must update the targeted node."""
    nodes = {"A": PassThrough(), "B": PassThrough(), "C": DebugPassThrough(debug=True)}
    gpipeline = GraphPipeline(nodes, edges=[("A", "B", "C")])

    # Value given at construction time.
    assert gpipeline.models["C"].debug is True

    # Value after going through the pipeline-level setter.
    gpipeline.set_params(C__debug=False)
    assert gpipeline.models["C"].debug is False
Exemplo n.º 5
0
def test_gpipeline_classification():
    """A pipeline ending in a classifier exposes ``predict_proba`` and ``classes_`` of that classifier."""
    nodes = {"PT": PassThrough(), "Logit": LogisticRegression()}
    gpipeline = GraphPipeline(nodes, [("PT", "Logit")])
    gpipeline.fit(X, yc)

    proba_from_pipeline = gpipeline.predict_proba(X)
    proba_from_node = gpipeline.models["Logit"].predict_proba(X)

    n_samples = X.shape[0]
    assert proba_from_pipeline.shape == (n_samples, 2)
    assert (proba_from_pipeline == proba_from_node).all()
    assert list(gpipeline.classes_) == [0, 1]
Exemplo n.º 6
0
def test_graphpipeline_no_terminal_node():
    """Fitting a graph in which every node has a successor must raise ``ValueError``."""
    nodes = {name: PassThrough() for name in ("A", "B", "C")}
    # A -> B -> C -> A: no node is left without an outgoing edge.
    looping_edges = [("A", "B", "C"), ("C", "A")]
    gpipeline = GraphPipeline(nodes, edges=looping_edges)

    with pytest.raises(ValueError):
        # Expected error: "the graph should have only one terminal node, instead i got 0"
        gpipeline.fit(X, y)
Exemplo n.º 7
0
def test_gpipeline_clone():
    """Cloning a fitted pipeline yields an unfitted copy holding distinct model objects."""
    original = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])
    original.fit(X, y)

    cloned_gpipeline = clone(original)

    # The clone must not inherit the fitted state.
    with pytest.raises(NotFittedError):
        cloned_gpipeline.predict(X)

    # Same node names, but each model is a different object.
    for node_name, model in original.models.items():
        assert node_name in cloned_gpipeline.models
        assert cloned_gpipeline.models[node_name] is not model
Exemplo n.º 8
0
def test_graphpipeline_edge_not_in_models():
    """An edge that names a node missing from the models dictionary must raise ``ValueError`` at fit."""
    nodes = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "PtNum": PassThrough(),
        "PtCat": PassThrough(),
    }
    # "PtNummm" is deliberately misspelled: it is not a key of ``nodes``.
    edges_with_typo = [("ColNum", "PtNummm"), ("ColCat", "PtCat")]
    gpipeline = GraphPipeline(nodes, edges=edges_with_typo)

    with pytest.raises(ValueError):
        # Expected error: "the node 'PtNummm' isn't in the dictionnary of models"
        gpipeline.fit(dfX, y)
Exemplo n.º 9
0
def test_graphpipeline_more_than_one_terminal_node():
    """A graph made of two disconnected branches (two terminal nodes) must raise ``ValueError`` at fit."""
    nodes = {
        "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
        "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
        "PtNum": PassThrough(),
        "PtCat": PassThrough(),
    }
    # Two independent chains, each ending on its own pass-through node.
    disjoint_edges = [("ColNum", "PtNum"), ("ColCat", "PtCat")]
    gpipeline = GraphPipeline(nodes, edges=disjoint_edges)

    with pytest.raises(ValueError):
        # Expected error: "the graph should have only one terminal node, instead i got 2"
        gpipeline.fit(dfX, y)
Exemplo n.º 10
0
def test_graphpipeline_blockselector_cv():
    """``cross_val_score`` fails on a plain dict of blocks but works once it is wrapped in a BlockManager."""
    from sklearn.model_selection import cross_val_score

    def make_pipeline():
        # Text block -> char count-vectorizer, numeric block used as is, both feeding the tree.
        return GraphPipeline(
            models={
                "BS_text": BlockSelector("text"),
                "CV": CountVectorizerWrapper(analyzer="char"),
                "BS_num": BlockSelector("num"),
                "RF": DecisionTreeClassifier(),
            },
            edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
        )

    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    # Case 1: X is a plain dictionary -- it cannot be subset by the CV splitter.
    dict_blocks = {"text": dfX_text, "num": Xnum}
    with pytest.raises(ValueError):
        cross_val_score(make_pipeline(), dict_blocks, y, scoring="accuracy", cv=10)

    # Case 2: the same blocks wrapped in a BlockManager.
    managed_blocks = BlockManager({"text": dfX_text, "num": Xnum})
    fold_scores = cross_val_score(make_pipeline(), managed_blocks, y, scoring="accuracy", cv=10)

    assert len(fold_scores) == 10
Exemplo n.º 11
0
 def get_pipeline():
     """Build a four-node GraphPipeline of debug pass-throughs: pt1 and pt2 both feed pt3, which feeds pt4."""
     nodes = {}
     for idx in (1, 2, 3, 4):
         # Node "ptN" tags its columns with the prefix "PTN_".
         nodes["pt%d" % idx] = DebugPassThrough(column_prefix="PT%d_" % idx, debug=True)

     branches = [("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")]
     return GraphPipeline(nodes, edges=branches)
Exemplo n.º 12
0
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):
    """Check ``cross_validation`` in transform-only mode on a pass-through estimator.

    The four arguments are presumably supplied by ``pytest.mark.parametrize``
    decorators that are not visible in this excerpt -- confirm against the file.

    Parameters
    ----------
    x_data_type
        Target type handed to ``convert_generic``; compared against
        ``DataTypes.DataFrame`` to decide whether columns get named.
    shuffle
        When truthy, ``X`` and ``y`` are permuted with a fixed seed.
    graph_pipeline
        When truthy, the estimator is a two-node ``GraphPipeline`` of
        ``DebugPassThrough``; otherwise a bare ``DebugPassThrough``.
    with_groups
        When truthy, four groups of 25 samples are passed; otherwise ``None``.
    """
    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    ##################
    ### Score only ###
    ##################
    with pytest.raises(Exception):
        # Shouldn't work since DebugPassThrough can't be scored.
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    # Exact type equality is intended: the output container must match the input one.
    assert type(Xhat) == type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    # A pass-through must give the input back, up to numerical noise.
    tolerance = 10 ** (-10)

    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()
        # Fixed: the tolerance used to be ``10 ** (10 - 10)`` (== 1), which made
        # this check almost vacuous; it now matches the ndarray branch.
        assert np.abs(Xhat - X).max().max() <= tolerance
    else:
        assert np.max(np.abs(Xhat - X)) <= tolerance
Exemplo n.º 13
0
def test_GraphPipeline_from_sklearn():
    """``GraphPipeline.from_sklearn`` must reproduce the predictions of the source sklearn Pipeline."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)

    steps = [("pt", PassThrough()), ("dt", DecisionTreeClassifier(random_state=123))]
    sk_pipeline = Pipeline(steps=steps)

    # Case 1: conversion from a non-fitted sklearn Pipeline.
    gpipeline = GraphPipeline.from_sklearn(sk_pipeline)

    assert isinstance(gpipeline, GraphPipeline)
    assert not gpipeline._already_fitted

    gpipeline.fit(X, y)
    pred_graph = gpipeline.predict(X)
    proba_graph = gpipeline.predict_proba(X)

    # The sklearn pipeline is only fitted here, after the conversion above.
    sk_pipeline.fit(X, y)
    pred_sklearn = sk_pipeline.predict(X)
    proba_sklearn = sk_pipeline.predict_proba(X)

    assert (pred_graph == pred_sklearn).all()
    assert (proba_graph == proba_sklearn).all()

    # Case 2: conversion from the (now) already fitted pipeline; no extra fit call.
    gpipeline = GraphPipeline.from_sklearn(sk_pipeline)
    pred_graph = gpipeline.predict(X)
    proba_graph = gpipeline.predict_proba(X)

    pred_sklearn = sk_pipeline.predict(X)
    proba_sklearn = sk_pipeline.predict_proba(X)

    assert (pred_graph == pred_sklearn).all()
    assert (proba_graph == proba_sklearn).all()
Exemplo n.º 14
0
def test_estimator_type_GraphPipeline():
    """The sklearn estimator-type helpers must classify a GraphPipeline according to its last node."""
    # Last node is a classifier.
    pipe_c = GraphPipeline({"scale": StandardScaler(), "rf": RandomForestClassifier()}, edges=[("scale", "rf")])
    assert is_classifier(pipe_c)
    assert not is_regressor(pipe_c)
    assert not is_clusterer(pipe_c)

    # Last node is a regressor.
    pipe_r = GraphPipeline({"scale": StandardScaler(), "rf": RandomForestRegressor()}, edges=[("scale", "rf")])
    assert not is_classifier(pipe_r)
    assert not is_clusterer(pipe_r)
    assert is_regressor(pipe_r)

    # Last node is a plain transformer: none of the three types applies.
    pipe_t = GraphPipeline({"scale": StandardScaler(), "rf": StandardScaler()}, edges=[("scale", "rf")])
    assert not is_classifier(pipe_t)
    assert not is_clusterer(pipe_t)
    assert not is_regressor(pipe_t)

    # Last node is a clusterer.
    pipe_cluster = GraphPipeline({"scale": StandardScaler(), "kmeans": KMeans()}, edges=[("scale", "kmeans")])
    assert is_clusterer(pipe_cluster)
    assert not is_regressor(pipe_cluster)
    assert not is_classifier(pipe_cluster)
Exemplo n.º 15
0
def test_gpipeline_graphviz():
    """The ``graphviz`` attribute is a ``Digraph`` both after ``fit`` and on a pipeline never fitted."""
    fitted_pipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": PassThrough(),
        },
        edges=[("ColNum", "Pt"), ("ColCat", "Pt")],
    )
    fitted_pipeline.fit(dfX, y)

    drawing = fitted_pipeline.graphviz
    assert isinstance(drawing, graphviz.dot.Digraph)

    # Same nodes, edges listed in the other order, and no call to fit.
    unfitted_pipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": PassThrough(),
        },
        edges=[("ColCat", "Pt"), ("ColNum", "Pt")],
    )

    drawing = unfitted_pipeline.graphviz
    assert isinstance(drawing, graphviz.dot.Digraph)
Exemplo n.º 16
0
def test_graphpipeline_concat_names():
    """Columns of a merging node are the selector's columns followed by the vectorizer's bag columns."""
    df = get_sample_df(size=100, seed=123)

    nodes = {
        "sel": ColumnsSelector(columns_to_use=["float_col", "int_col"]),
        "vec": CountVectorizerWrapper(columns_to_use=["text_col"]),
        "pt": PassThrough(),
    }
    gpipeline = GraphPipeline(models=nodes, edges=[("sel", "pt"), ("vec", "pt")])

    gpipeline.fit(df)
    df_res = gpipeline.transform(df)
    result_columns = list(df_res.columns)

    tokens = ("aaa", "bbb", "ccc", "ddd", "eee", "fff", "jjj")
    expected_columns = ["float_col", "int_col"] + ["text_col__BAG__" + token for token in tokens]

    assert result_columns == expected_columns
    assert gpipeline.get_feature_names() == list(df_res.columns)
Exemplo n.º 17
0
def test_graphpipeline_no_concat():
    """A node listed in ``no_concat_nodes`` receives its parents' outputs keyed by parent name."""
    merge_edges = [("A", "C"), ("B", "C")]

    # Terminal node is a pass-through: the un-concatenated input comes out as a dict.
    gpipeline = GraphPipeline(
        {"A": DebugPassThrough(debug=True), "B": DebugPassThrough(debug=True), "C": DebugPassThrough(debug=True)},
        edges=merge_edges,
        no_concat_nodes={"C"},
    )

    as_dict = gpipeline.fit_transform(X)
    assert isinstance(as_dict, dict)
    assert set(as_dict.keys()) == {"A", "B"}
    for parent in ("A", "B"):
        assert (as_dict[parent] == X).all().all()

    # Terminal node converts what it receives into a BlockManager.
    gpipeline = GraphPipeline(
        {"A": DebugPassThrough(debug=True), "B": DebugPassThrough(debug=True), "C": TransformToBlockManager()},
        edges=merge_edges,
        no_concat_nodes={"C"},
    )

    as_blocks = gpipeline.fit_transform(X)
    assert isinstance(as_blocks, BlockManager)
    for parent in ("A", "B"):
        assert (as_blocks[parent] == X).all().all()
Exemplo n.º 18
0
def test_graphpipeline_passing_of_groups():
    """``groups`` given to ``fit`` must be forwarded to a node that refuses to fit without them."""
    nodes = {"A": TransformerFailNoGroups(), "B": DebugPassThrough(debug=True)}
    gpipeline = GraphPipeline(nodes, edges=[("A", "B")])

    # Without groups the first node is expected to fail.
    with pytest.raises(ValueError):
        gpipeline.fit(X, y)

    # With groups the fit must go through without raising.
    all_zero_groups = np.zeros(len(y))
    gpipeline.fit(X, y, all_zero_groups)
Exemplo n.º 19
0
def test_graphpipeline_fit_params():
    """A fit parameter written ``<node>__<name>`` reaches that node only."""
    nodes = {
        "A": DebugPassThrough(debug=True),
        "B": DebugPassThrough(debug=True),
        "C": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(nodes, edges=[("A", "B", "C")])

    # No fit parameter given: every node records an empty dict.
    gpipeline.fit(X, y)
    for node_name in ("A", "B", "C"):
        assert gpipeline.models[node_name].fit_params == {}

    # Parameter addressed to node A: only A records it.
    gpipeline.fit(X, y, A__fitparam_A="paramA")
    assert gpipeline.models["A"].fit_params == {"fitparam_A": "paramA"}
    for node_name in ("B", "C"):
        assert gpipeline.models[node_name].fit_params == {}
Exemplo n.º 20
0
def test_graphpipeline_get_features_names_with_input_features():
    """Feature names propagate through chained prefixing nodes, with or without explicit ``input_features``."""

    def make_model():
        # Two chained nodes, each one prefixing the feature names it receives.
        return GraphPipeline(
            {"pt1": PassThroughtWithFeatures(prefix="PT1"), "pt2": PassThroughtWithFeatures(prefix="PT2")},
            edges=[("pt1", "pt2")],
        )

    def prefixed(prefix, names):
        return [prefix + name for name in names]

    def check_with_explicit_input_features(fitted):
        # A fresh list is handed to every call so that no call can affect the next one.
        letters = ("a", "b", "c", "d", "e")
        after_pt1 = prefixed("PT1__", letters)
        after_pt2 = prefixed("PT2__", after_pt1)

        assert fitted.get_feature_names(input_features=list(letters)) == after_pt2
        assert fitted.get_feature_names_at_node("pt2", input_features=list(letters)) == after_pt2
        assert fitted.get_feature_names_at_node("pt1", input_features=list(letters)) == after_pt1

        assert fitted.get_input_features_at_node("pt2", input_features=list(letters)) == after_pt1
        assert fitted.get_input_features_at_node("pt1", input_features=list(letters)) == list(letters)

    xx = np.random.randn(10, 5)
    df = pd.DataFrame(xx, columns=["COL_%d" % j for j in range(xx.shape[1])])

    model = make_model()
    model.fit(df)

    ### Test 1 : without input_features, names come from the DataFrame columns ###
    raw_cols = ["COL_0", "COL_1", "COL_2", "COL_3", "COL_4"]
    cols_after_pt1 = prefixed("PT1__", raw_cols)
    cols_after_pt2 = prefixed("PT2__", cols_after_pt1)

    assert model.get_feature_names() == cols_after_pt2
    assert model.get_feature_names_at_node("pt2") == cols_after_pt2
    assert model.get_feature_names_at_node("pt1") == cols_after_pt1

    assert model.get_input_features_at_node("pt2") == cols_after_pt1
    assert model.get_input_features_at_node("pt1") == raw_cols

    ### Test 2 : with input features ###
    check_with_explicit_input_features(model)

    ### Test 3 : fitted on a numpy array, so no names are known unless given ###
    model = make_model()
    model.fit(xx)

    assert model.get_feature_names() is None
    assert model.get_feature_names_at_node("pt2") is None
    assert model.get_feature_names_at_node("pt1") is None
    assert model.get_input_features_at_node("pt2") is None
    assert model.get_input_features_at_node("pt1") is None

    check_with_explicit_input_features(model)
Exemplo n.º 21
0
def test_gpipeline_regression():
    """Check a pass-through + Ridge pipeline: predictions, missing proba methods, feature names."""
    gpipeline = GraphPipeline({"PT": PassThrough(), "Ridge": Ridge()}, [("PT", "Ridge")])

    # Only the numerical columns are used for the regression.
    X = dfX.loc[:, ["num1", "num2", "num3"]]

    gpipeline.fit(X, y)
    yhat = gpipeline.predict(X)
    yhat2 = gpipeline.models["Ridge"].predict(X)

    assert yhat.shape == y.shape
    assert (yhat == yhat2).all()

    # The pipeline must not expose the probability methods when the final model is Ridge.
    with pytest.raises(AttributeError):
        gpipeline.predict_proba(X)

    with pytest.raises(AttributeError):
        gpipeline.predict_log_proba(X)

    assert gpipeline.get_feature_names_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("PT") == list(X.columns)
    assert gpipeline.get_input_features_at_node("Ridge") == list(X.columns)

    # Asking for an unknown node must raise. The call is expected to raise on its
    # own, so the stray ``assert`` that used to wrap it has been removed.
    with pytest.raises(ValueError):
        gpipeline.get_feature_names_at_node("DONTEXIST")
Exemplo n.º 22
0
def test_graphpipeline_get_features_names():
    """Check output and input feature names at every node for three column-selecting graphs."""
    dfX = pd.DataFrame(
        {
            "text1": ["aa bb", "bb bb cc", "dd aa cc", "ee"],
            "text2": ["AAA ZZZ", "BBB EEE", "DDD TTT", "AAA BBB CCC"],
            "num1": [0, 1, 2, 3],
            "num2": [1.1, 1.5, -2, -3.5],
            "num3": [-1, 1, 25, 4],
            "cat1": ["A", "B", "A", "D"],
            "cat2": ["toto", "tata", "truc", "toto"],
        }
    )

    # Entry nodes see every column of the frame, in frame order.
    every_col = ["text1", "text2", "num1", "num2", "num3", "cat1", "cat2"]
    cat_cols = ["cat1", "cat2"]
    num_cols = ["num1", "num2"]

    ###  Test 1 : one selector followed by a pass-through  ###
    model = GraphPipeline({"sel": ColumnsSelector(["cat1", "cat2"]), "pt": PassThrough()}, edges=[("sel", "pt")])
    model.fit(dfX)

    assert model.get_feature_names() == cat_cols  # features at the ending node

    assert model.get_feature_names_at_node("pt") == cat_cols
    assert model.get_feature_names_at_node("sel") == cat_cols

    assert model.get_input_features_at_node("pt") == cat_cols
    assert model.get_input_features_at_node("sel") == every_col

    ###  Test 2 : two selectors merged into a pass-through  ###
    model = GraphPipeline(
        {"sel1": ColumnsSelector(["cat1", "cat2"]), "sel2": ColumnsSelector(["num1", "num2"]), "pt": PassThrough()},
        edges=[("sel1", "pt"), ("sel2", "pt")],
    )
    model.fit(dfX)

    merged_cols = cat_cols + num_cols

    assert model.get_feature_names() == merged_cols
    assert model.get_feature_names_at_node("pt") == merged_cols
    assert model.get_feature_names_at_node("sel1") == cat_cols
    assert model.get_feature_names_at_node("sel2") == num_cols

    assert model.get_input_features_at_node("pt") == merged_cols
    assert model.get_input_features_at_node("sel1") == every_col
    assert model.get_input_features_at_node("sel2") == every_col

    ###  Test 3 : two selectors merged into a third selector, then a pass-through  ###
    model = GraphPipeline(
        {
            "sel1": ColumnsSelector(["cat1", "cat2"]),
            "sel2": ColumnsSelector(["num1", "num2"]),
            "sel12": ColumnsSelector(["cat1", "num1"]),
            "pt": PassThrough(),
        },
        edges=[("sel1", "sel12", "pt"), ("sel2", "sel12", "pt")],
    )
    model.fit(dfX)

    kept_cols = ["cat1", "num1"]

    assert model.get_feature_names() == kept_cols

    assert model.get_feature_names_at_node("pt") == kept_cols
    assert model.get_feature_names_at_node("sel12") == kept_cols
    assert model.get_feature_names_at_node("sel1") == cat_cols
    assert model.get_feature_names_at_node("sel2") == num_cols

    assert model.get_input_features_at_node("pt") == kept_cols
    assert model.get_input_features_at_node("sel12") == cat_cols + num_cols
    assert model.get_input_features_at_node("sel1") == every_col
    assert model.get_input_features_at_node("sel2") == every_col
Exemplo n.º 23
0
def test_graphpipeline_blockselector():
    """``BlockSelector`` nodes must pick blocks out of a dict, a list or a BlockManager input."""
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    def make_concat_pipeline(text_key, num_key):
        # Both blocks are selected and merged into one debug pass-through.
        return GraphPipeline(
            models={"BS_text": BlockSelector(text_key), "BS_num": BlockSelector(num_key), "PT": DebugPassThrough()},
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )

    def check_concatenation(Xhat):
        # The result must hold the text columns unchanged plus the numeric block.
        assert Xhat.shape[0] == dfX_text.shape[0]
        assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

        assert "text1" in Xhat.columns
        assert "text2" in Xhat.columns
        assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

        cols = diff(list(Xhat.columns), ["text1", "text2"])
        assert (Xhat.loc[:, cols].values == Xnum).all()

    ### Full model: text -> count-vectorizer, numeric block as is, both into a tree ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}
    check_concatenation(make_concat_pipeline("text", "num").fit_transform(X))

    ### X = list (blocks selected by position) ###
    X = [dfX_text, Xnum]
    check_concatenation(make_concat_pipeline(0, 1).fit_transform(X))

    ### X = BlockManager ###
    X = BlockManager({"text": dfX_text, "num": Xnum})
    check_concatenation(make_concat_pipeline("text", "num").fit_transform(X))
Exemplo n.º 24
0
def test_approx_cross_validation_graphpipeline():
    """Count how often each node is fitted / transformed by ``approx_cross_validation``.

    The two callback pass-through classes keep class-level counters
    (``nb_fit``, ``nb_transform``, ``nb_fit_transform``) that are reset with
    ``reset_counters()`` before each scenario, so the order of the statements
    below matters.
    """

    X, y = make_classification(n_samples=100)
    X = pd.DataFrame(X, columns=["col_%d" % i for i in range(X.shape[1])])

    ## Fit ##
    # Baseline: a plain fit calls fit_transform exactly once on each transformer node.
    PassThroughWithCallback.reset_counters()
    PassThroughWithCallback2.reset_counters()

    gpipeline = GraphPipeline(
        models={"A": PassThroughWithCallback(), "B": PassThroughWithCallback2(), "C": LogisticRegression()},
        edges=[("A", "B", "C")],
    )

    gpipeline.fit(X, y)

    assert PassThroughWithCallback.nb_fit_transform == 1
    assert PassThroughWithCallback2.nb_fit_transform == 1

    assert PassThroughWithCallback.nb_fit == 0
    assert PassThroughWithCallback2.nb_fit == 0

    assert PassThroughWithCallback.nb_transform == 0
    assert PassThroughWithCallback2.nb_transform == 0

    ## approx cv ##
    # Every node is cross-validated: one fit_transform per fold on each transformer.
    PassThroughWithCallback.reset_counters()
    PassThroughWithCallback2.reset_counters()

    gpipeline = GraphPipeline(
        models={"A": PassThroughWithCallback(), "B": PassThroughWithCallback2(), "C": LogisticRegression()},
        edges=[("A", "B", "C")],
    )

    cv_res = gpipeline.approx_cross_validation(X, y, scoring=["neg_mean_squared_error"], cv=10, verbose=False)

    assert PassThroughWithCallback.nb_fit_transform == 10
    assert PassThroughWithCallback2.nb_fit_transform == 10

    assert PassThroughWithCallback.nb_fit == 0
    assert PassThroughWithCallback2.nb_fit == 0

    assert PassThroughWithCallback.nb_transform == 20  # 10 fold x 2 (for score in train and test)
    assert PassThroughWithCallback2.nb_transform == 20

    ## approx cv but skip nodes ##
    # Node A is excluded from cross-validation: it is fit_transformed once, B still once per fold.
    PassThroughWithCallback.reset_counters()
    PassThroughWithCallback2.reset_counters()

    gpipeline = GraphPipeline(
        models={"A": PassThroughWithCallback(), "B": PassThroughWithCallback2(), "C": LogisticRegression()},
        edges=[("A", "B", "C")],
    )

    cv_res = gpipeline.approx_cross_validation(
        X, y, scoring=["neg_mean_squared_error"], cv=10, verbose=1, nodes_not_to_crossvalidate=("A",)
    )

    assert cv_res is not None
    assert cv_res.shape[0] == 10

    assert PassThroughWithCallback.nb_fit_transform == 1
    assert PassThroughWithCallback2.nb_fit_transform == 10

    assert PassThroughWithCallback.nb_fit == 0
    assert PassThroughWithCallback2.nb_fit == 0

    assert PassThroughWithCallback.nb_transform == 0
    assert PassThroughWithCallback2.nb_transform == 20

    # Both A and B are excluded: each is fit_transformed once and never transformed.
    PassThroughWithCallback.reset_counters()
    PassThroughWithCallback2.reset_counters()

    gpipeline = GraphPipeline(
        models={"A": PassThroughWithCallback(), "B": PassThroughWithCallback2(), "C": LogisticRegression()},
        edges=[("A", "B", "C")],
    )

    cv_res = gpipeline.approx_cross_validation(
        X, y, scoring=["neg_mean_squared_error"], cv=10, verbose=1, nodes_not_to_crossvalidate=("A", "B")
    )

    assert cv_res is not None
    assert cv_res.shape[0] == 10

    assert PassThroughWithCallback.nb_fit_transform == 1
    assert PassThroughWithCallback2.nb_fit_transform == 1

    assert PassThroughWithCallback.nb_fit == 0
    assert PassThroughWithCallback2.nb_fit == 0

    assert PassThroughWithCallback.nb_transform == 0
    assert PassThroughWithCallback2.nb_transform == 0

    # Only B is requested to be skipped, but the counts below show that both A and B
    # are still run once per fold.
    # NOTE(review): presumably because the upstream node A cannot be cv-transformed,
    # which forces B to be cross-validated as well -- confirm against the implementation.
    PassThroughWithCallback_cant_cv_transform.reset_counters()
    PassThroughWithCallback2.reset_counters()

    gpipeline = GraphPipeline(
        models={
            "A": PassThroughWithCallback_cant_cv_transform(),
            "B": PassThroughWithCallback2(),
            "C": LogisticRegression(),
        },
        edges=[("A", "B", "C")],
    )

    cv_res = gpipeline.approx_cross_validation(
        X, y, scoring=["neg_mean_squared_error"], cv=10, verbose=1, nodes_not_to_crossvalidate={"B"}
    )

    assert cv_res is not None
    assert cv_res.shape[0] == 10

    assert PassThroughWithCallback_cant_cv_transform.nb_fit_transform == 10
    assert PassThroughWithCallback2.nb_fit_transform == 10

    assert PassThroughWithCallback_cant_cv_transform.nb_fit == 0
    assert PassThroughWithCallback2.nb_fit == 0

    assert PassThroughWithCallback_cant_cv_transform.nb_transform == 20
    assert PassThroughWithCallback2.nb_transform == 20
Exemplo n.º 25
0
def test_graphpipeline_merging_node():
    """A node with two parents receives their columns concatenated in the order the edges are listed."""
    num_cols = ["num1", "num2", "num3"]
    cat_cols = ["cat1", "cat2"]

    def make_pipeline(edges):
        return GraphPipeline(
            {
                "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
                "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
                "Pt": DebugPassThrough(debug=True),
            },
            edges=edges,
        )

    # Numerical branch listed first: numerical columns end up on the left.
    gpipeline = make_pipeline([("ColNum", "Pt"), ("ColCat", "Pt")])
    gpipeline.fit(dfX, y)

    merged_node = gpipeline.models["Pt"]
    assert merged_node._expected_columns == num_cols + cat_cols
    assert merged_node._expected_type == DataTypes.DataFrame
    assert merged_node._expected_nbcols == 5

    dfX_transformed = gpipeline.transform(dfX)
    assert (dfX_transformed == dfX.loc[:, num_cols + cat_cols]).all().all()

    assert gpipeline.get_feature_names() == num_cols + cat_cols
    assert gpipeline.get_feature_names_at_node("Pt") == num_cols + cat_cols
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == num_cols + cat_cols

    # Concatenation in the other order: categorical branch listed first.
    gpipeline = make_pipeline([("ColCat", "Pt"), ("ColNum", "Pt")])
    gpipeline.fit(dfX, y)

    merged_node = gpipeline.models["Pt"]
    assert merged_node._expected_columns == cat_cols + num_cols  # concatenation in the order of the edges
    assert merged_node._expected_type == DataTypes.DataFrame
    assert merged_node._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_cols + num_cols
    assert gpipeline.get_feature_names_at_node("Pt") == cat_cols + num_cols
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    assert gpipeline.get_input_features_at_node("ColNum") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("ColCat") == list(dfX.columns)
    assert gpipeline.get_input_features_at_node("Pt") == cat_cols + num_cols

    dfX_transformed = gpipeline.transform(dfX)
    assert (dfX_transformed == dfX.loc[:, cat_cols + num_cols]).all().all()
Exemplo n.º 26
0
def test_graphpipeline_other_input_syntaxes():
    """Every supported way of declaring models and edges must produce the same completed graph."""

    def check_graph(pipeline, nodes, edges):
        # The graph is only built once the initialisation is completed explicitly.
        pipeline._complete_init()
        assert set(pipeline.complete_graph.nodes) == nodes
        assert set(pipeline.complete_graph.edges) == edges

    ## Linear chain A -> B -> C
    expected_nodes = {"A", "B", "C"}
    expected_edges = {("A", "B"), ("B", "C")}

    # regular syntax: dict of models + one tuple describing the whole chain
    check_graph(
        GraphPipeline({"A": PassThrough(), "B": PassThrough(), "C": PassThrough()}, edges=[("A", "B", "C")]),
        expected_nodes,
        expected_edges,
    )

    # pipeline syntax: list of (name, model) steps, no edges given
    check_graph(
        GraphPipeline([("A", PassThrough()), ("B", PassThrough()), ("C", PassThrough())]),
        expected_nodes,
        expected_edges,
    )

    ## with a merge: A -> B -> D and C -> D
    expected_nodes = {"A", "B", "C", "D"}
    expected_edges = {("A", "B"), ("B", "D"), ("C", "D")}

    # the three spellings of the same edges: long tuple, pairs only, string
    edge_spellings = (
        [("A", "B", "D"), ("C", "D")],
        [("A", "B"), ("B", "D"), ("C", "D")],
        "A - B - D ; C - D",
    )
    for spelling in edge_spellings:
        gpipeline = GraphPipeline(
            {"A": PassThrough(), "B": PassThrough(), "C": PassThrough(), "D": PassThrough()}, edges=spelling
        )
        check_graph(gpipeline, expected_nodes, expected_edges)
Exemplo n.º 27
0
@author: Lionel Massoulard
"""




from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from aikit.pipeline import GraphPipeline
from aikit.transformers import CountVectorizerWrapper, TruncatedSVDWrapper
from aikit.transformers_categories import NumericalEncoder

# First example graph: two independent preprocessing nodes ("vect" on the text
# columns, "cat" on the categorical columns) that both feed the "rf" classifier.
gpipeline = GraphPipeline(models = {"vect" : CountVectorizerWrapper(analyzer="char",ngram_range=(1,4), columns_to_use=["text1","text2"]),
                                    "cat"  : NumericalEncoder(columns_to_use=["cat1","cat2"]) , 
                                    "rf"   : RandomForestClassifier(n_estimators=100)}  ,
                               edges = [("vect","rf"),("cat","rf")]
                               )


# Second example graph (rebinds `gpipeline`): two branches of two nodes each,
# "encoder" -> "imputer" and "vect" -> "svd", both ending in the "rf" classifier.
# Each edge tuple lists a whole path of node names.
# NOTE(review): NumImputer is not among the imports visible above -- confirm it
# is imported elsewhere in the original script.
gpipeline = GraphPipeline(models = {"encoder":NumericalEncoder(columns_to_use = ["cat1","cat2"]),
                                "imputer": NumImputer(),
                                "vect": CountVectorizerWrapper(analyzer="word",columns_to_use=["cat1","cat2"]),
                                "svd":TruncatedSVDWrapper(n_components=50),
                                "rf":RandomForestClassifier(n_estimators=100)
                                    },
                    edges = [("encoder","imputer","rf"),("vect","svd","rf")] )



gpipeline_mix3 = GraphPipeline(models = {"encoder" : NumericalEncoder(columns_to_use = ["cat1","cat2"],
Exemplo n.º 28
0
def test_graphpipeline_nodes_concat_order():
    """Check that the output columns follow the order in which edges are declared.

    Two nodes ("pt1" and "pt2") feed "pt3" (optionally followed by "pt4").
    The parent whose edge is declared first is expected on the left of the
    result, and ``get_feature_names`` must agree with the columns actually
    produced.
    """
    cols = list(dfX.columns)

    def make_models(nb_nodes):
        # "pt1" .. "pt<nb_nodes>", each one prefixing its columns with "PT<i>_"
        return {
            "pt%d" % i: DebugPassThrough(column_prefix="PT%d_" % i, debug=True)
            for i in range(1, nb_nodes + 1)
        }

    def check_columns(nb_nodes, edges, prefixes_left_to_right):
        pipeline = GraphPipeline(make_models(nb_nodes), edges=edges)
        Xres = pipeline.fit_transform(dfX)

        expected_columns = [prefix + c for prefix in prefixes_left_to_right for c in cols]
        assert list(Xres.columns) == expected_columns
        assert list(Xres.columns) == pipeline.get_feature_names()

    ### 1 : pt1 declared first => PT1 on the left, PT2 on the right
    check_columns(3, [("pt1", "pt3"), ("pt2", "pt3")], ["PT3__PT1__", "PT3__PT2__"])

    ### 2 : reverse order => PT2 on the left, PT1 on the right
    check_columns(3, [("pt2", "pt3"), ("pt1", "pt3")], ["PT3__PT2__", "PT3__PT1__"])

    ### 3 : with 4 nodes => PT1 on the left, PT2 on the right
    pt1_first = (
        [("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")],
        [("pt1", "pt3", "pt4"), ("pt2", "pt3")],
    )
    for edges in pt1_first:
        check_columns(4, edges, ["PT4__PT3__PT1__", "PT4__PT3__PT2__"])

    ### 4 : with 4 nodes, reverse order => PT2 on the left, PT1 on the right
    pt2_first = (
        [("pt2", "pt3", "pt4"), ("pt1", "pt3", "pt4")],
        [("pt2", "pt3", "pt4"), ("pt1", "pt3")],
    )
    for edges in pt2_first:
        check_columns(4, edges, ["PT4__PT3__PT2__", "PT4__PT3__PT1__"])
Exemplo n.º 29
0
    def fit_metric_model(self):
        """Fit the model that predicts the (transformed) metric from the job params.

        The aggregated results and the params are loaded through
        ``self.result_reader`` and inner-joined on ``job_id``.  The target is
        built from the ``test_<scorer>`` column(s): every scorer of
        ``self.job_config.scoring`` when ``self.avg_metrics`` is true (the
        transformed metrics are then averaged), otherwise only
        ``self.job_config.main_scorer``.  The params are encoded with a
        NumericalEncoder -> NumImputer GraphPipeline, then a
        RandomForestRegressor and a RandomForestVariance are fitted on them.

        Nothing is fitted when:

        * the number of results is not greater than ``self.min_nb_of_models``;
        * a model was already fitted (``self.params_training_columns`` is not
          None) and the number of results has not changed since the previous
          call.

        On a successful fit the attributes ``params_training_columns``,
        ``transformer_model``, ``random_forest`` and ``random_forest_variance``
        are replaced.  ``_nb_models_done`` is updated on every call.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``self.metric_transformation`` is not one of None, "rank",
            "normal" or "default".
        """
        logger.info("start computing metric model...")

        ### Load the results
        df_results = self.result_reader.load_all_results(aggregate=True)

        # Keep the count seen by the previous call *before* overwriting it:
        # comparing the new count with itself would always be true, and the
        # model would never be refitted once it has been fitted.
        previous_nb_models_done = getattr(self, "_nb_models_done", None)
        self._nb_models_done = len(df_results)

        if self._nb_models_done <= self.min_nb_of_models:
            return self

        if (previous_nb_models_done is not None
                and self._nb_models_done == previous_nb_models_done
                and self.params_training_columns is not None):
            # No new result since the last fit: the current model is up to date
            return self

        ### Load the params
        df_params = self.result_reader.load_all_params()

        df_merged_result = pd.merge(df_params,
                                    df_results,
                                    how="inner",
                                    on="job_id")

        training_cols = diff(list(df_params.columns), ["job_id"])

        # X dataframe for parameters
        dfX_params = df_merged_result.loc[:, training_cols]

        ### Retrieve the target metric
        if self.avg_metrics:
            scorers = self.job_config.scoring
        else:
            scorers = [self.job_config.main_scorer]  # only the main_scorer is used

        N = dfX_params.shape[0]
        all_y_params = []
        for scorer in scorers:
            y_params = df_merged_result["test_%s" % scorer]  # the raw metric
            # replace NaN by scorer's observed minimum score ; if y_params contains
            # only NaN -> won't work
            y_params = y_params.fillna(y_params.min()).values

            if self.metric_transformation is None:
                pass

            elif self.metric_transformation == "rank":
                ### Transform in non-parametric rank => behaves like a uniform law
                y_params = kde_transfo_quantile(y_params)

            elif self.metric_transformation == "normal":
                ### Transform into non-parametric normal => behaves like a normal law
                y_params = norm.ppf(kde_transfo_quantile(y_params))

            elif self.metric_transformation == "default":
                ### Transform using default transformation (log like function)
                try:
                    f = get_metric_default_transformation(scorer)
                except ValueError:
                    logger.info(
                        "I don't know how to transform this metric %s, I'll use default normal transformation",
                        str(scorer))
                    f = None

                if f is None:
                    y_params = norm.ppf(kde_transfo_quantile(y_params))
                else:
                    y_params = f(y_params)

                if self.avg_metrics:
                    # If I'm averaging I'd rather have something centered
                    y_std = np.std(y_params)
                    y_params = y_params - np.mean(y_params)
                    if y_std > 0:
                        # a constant metric has a null std: only center it,
                        # dividing would fill the target with NaN
                        y_params = y_params / y_std

            else:
                raise ValueError("I don't know this metric_transformation %s" %
                                 self.metric_transformation)

            all_y_params.append(y_params.reshape((N, 1)))

        if len(all_y_params) > 1:
            # average of the transformed metrics, one column per scorer
            y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
        else:
            y_params = all_y_params[0].reshape((N, ))

        # create model
        transformer_model = GraphPipeline(models={
            "encoder": NumericalEncoder(),
            "imputer": NumImputer()
        },
                                          edges=[("encoder", "imputer")])

        xx_params = transformer_model.fit_transform(dfX_params)

        random_forest = RandomForestRegressor(n_estimators=100,
                                              min_samples_leaf=5)

        random_forest.fit(xx_params, y_params)

        random_forest_variance = RandomForestVariance(random_forest)
        random_forest_variance.fit(xx_params, y_params)

        # Everything is stored only once all the fits succeeded
        # (self._nb_models_done was already updated at the top of the method)
        self.params_training_columns = training_cols
        self.transformer_model = transformer_model
        self.random_forest = random_forest
        self.random_forest_variance = random_forest_variance

        logger.info("metric model fitted")

        return self
Exemplo n.º 30
0
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    """Exercise ``score_from_params_clustering`` in its three modes.

    The modes are: scores only, scores + predicted labels, predicted labels
    only.  After each call the estimator passed in must still be unfitted.

    Parameters
    ----------
    x_data_type
        type the random input matrix is converted to
    shuffle : bool
        if True, the rows of the input are shuffled
    graph_pipeline : bool
        if True, the KMeans is wrapped in a GraphPipeline
    """
    np.random.seed(123)
    X = convert_generic(np.random.randn(100, 10), output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        row_order = np.arange(X.shape[0])
        np.random.shuffle(row_order)
        X = X.loc[row_order, :] if isinstance(X, pd.DataFrame) else X[row_order, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    kmeans = KMeans(n_clusters=3, random_state=123)
    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": kmeans}, edges=[("pt", "lg")])
    else:
        estimator = kmeans

    def check_scores(scores):
        # one row, with one "test_<scorer>" column per requested scorer
        assert isinstance(scores, pd.DataFrame)
        assert scores.shape[0] == 1
        for scorer_name in scoring:
            assert ("test_" + scorer_name) in set(scores.columns)

    def check_estimator_not_fitted():
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    ### Only score ###
    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)

    check_scores(res)
    check_estimator_not_fitted()

    ### Score + Prediction ###
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)

    check_scores(res)
    assert isinstance(label, np.ndarray)
    assert len(np.unique(label)) == 3
    check_estimator_not_fitted()

    ### Predict only ###
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert len(np.unique(label)) == 3
    assert res is None
    check_estimator_not_fitted()