Пример #1
0
    def _get_feature_names_at_node(self,
                                   node,
                                   input_features=None,
                                   entry=False):
        """Propagate feature names through the graph and return them at ``node``.

        The nodes are visited in ``self._nodes_order``; for each one the names
        entering the node are derived from its predecessors, and the names
        leaving it are asked to ``try_to_find_features_names``.

        Parameters
        ----------
        node : string or ..
            name of the node at which the feature names are wanted

        input_features : None or list
            feature names at the input of the graphpipeline; when None the
            names stored in ``self._Xinput_features`` are used

        entry : boolean, default = False
            if True, return the names at the ENTRY of the node's model,
            otherwise the names at the EXIT of the node's model

        Returns
        -------
        list of feature names for the given node, or None when they can't be
        determined

        Raises
        ------
        NotFittedError
            if the pipeline has not been fitted yet
        ValueError
            if ``node`` is not one of the nodes of ``self._nodes_order``
        """
        if not self._already_fitted:
            raise NotFittedError("Please fit the model before")

        if input_features is None:
            input_features = self._Xinput_features

        # Feature names at the exit of every node visited so far
        names_by_node = {}

        for current in self._nodes_order:
            parents = list(self.complete_graph.predecessors(current))
            nb_parents = len(parents)

            if nb_parents == 0:
                # Root node : it receives the pipeline input directly
                incoming = input_features
            elif nb_parents == 1:
                incoming = names_by_node[parents[0]]
            else:
                # Several parents : use the concatenation order saved in
                # self._all_concat_order
                parents = self._all_concat_order[current]
                per_parent = [names_by_node[parent] for parent in parents]
                if None in per_parent:
                    incoming = None
                else:
                    incoming = unlist(per_parent)

            model = self._models[current]

            # A single unknown name makes the whole set of names unknown
            if incoming is not None and None in incoming:
                incoming = None

            reached_target = not (current != node)
            if reached_target and entry:
                # Names at the entry of the requested node
                return incoming

            outgoing = try_to_find_features_names(model,
                                                  input_features=incoming)
            if outgoing is not None:
                outgoing = list(outgoing)
            names_by_node[current] = outgoing

            if reached_target:
                # Names at the exit of the requested node
                return outgoing

        raise ValueError("node %s isn't in the graph" % node)
Пример #2
0
def test_try_to_find_features_names():
    """Check try_to_find_features_names on sklearn objects and on dummy models.

    Covers a fitted CountVectorizer, a Pipeline ending with one, FeatureUnions
    of vectorizers and of pipelines, and three duck-typed models: one whose
    get_feature_names accepts input_features, one whose get_feature_names
    doesn't, and one without get_feature_names at all.
    """
    corpus = ["aa bb", "bb bb cc", "dd aa cc", "ee"]

    # Expected names for the word-level and (word + char)-level vectorizers
    word_names = ["aa", "bb", "cc", "dd", "ee"]
    union_names = ["bagword__" + word for word in word_names]
    union_names += ["bagchar__" + char for char in (" ", "a", "b", "c", "d", "e")]

    # A vectorizer alone
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(corpus)
    assert try_to_find_features_names(vectorizer) == word_names

    # A pipeline whose last step is a vectorizer
    word_pipeline = Pipeline([("nothing", DebugPassThrough()),
                              ("vec", CountVectorizer())])
    word_pipeline.fit_transform(corpus)
    assert try_to_find_features_names(word_pipeline) == word_names

    # A union of two vectorizers
    vectorizer_union = FeatureUnion(transformer_list=[
        ("bagword", CountVectorizer()),
        ("bagchar", CountVectorizer(analyzer="char")),
    ])
    vectorizer_union.fit_transform(corpus)
    assert try_to_find_features_names(vectorizer_union) == union_names

    # A union of two pipelines
    word_branch = Pipeline([("nothing", DebugPassThrough()),
                            ("vec", CountVectorizer())])
    char_branch = Pipeline([("nothing", DebugPassThrough()),
                            ("vec", CountVectorizer(analyzer="char"))])
    pipeline_union = FeatureUnion(transformer_list=[
        ("bagword", word_branch),
        ("bagchar", char_branch),
    ])
    pipeline_union.fit_transform(corpus)
    assert try_to_find_features_names(pipeline_union) == union_names

    class DummyModelAcceptInputFeature(object):
        def get_feature_names(self, input_features=None):
            if input_features is None:
                return [0, 1, 2, 3]
            else:
                return input_features

    class DummyModelDontInputFeature(object):
        def get_feature_names(self):
            return [0, 1, 2, 3]

    class DummyModelDoesntHaveGetFeatures(object):
        pass

    default_names = [0, 1, 2, 3]
    letters = ["a", "b", "c", "d"]

    # get_feature_names accepts input_features : they are given back
    accepting = DummyModelAcceptInputFeature()
    assert try_to_find_features_names(accepting) == default_names
    assert try_to_find_features_names(
        accepting, input_features=list(letters)) == letters

    # get_feature_names takes no argument : input_features have no effect
    refusing = DummyModelDontInputFeature()
    assert try_to_find_features_names(refusing) == default_names
    assert try_to_find_features_names(
        refusing, input_features=list(letters)) == default_names

    # no get_feature_names at all : nothing can be found
    featureless = DummyModelDoesntHaveGetFeatures()
    assert try_to_find_features_names(featureless) is None
    assert try_to_find_features_names(
        featureless, input_features=list(letters)) is None
Пример #3
0
    def _fit_transform(self,
                       X,
                       y=None,
                       groups=None,
                       method=None,
                       fit_params=None):
        """Main method of GraphPipeline, handles the fit and predict of object.

        The nodes are processed in ``self._nodes_order``. Each non-terminal
        node transforms (or fits and transforms) the data coming from its
        predecessors; ``method`` is applied on the model of
        ``self._terminal_node`` and its result is returned.

        Parameters
        ----------
        X : object
            input data given to the node(s) without predecessor; when fitting,
            its ``columns`` attribute (if any) is saved as the input feature
            names

        y : object, default = None
            target, forwarded to the fitting methods and to ``score``

        groups : object, default = None
            forwarded, by keyword, to the fitting methods that have a named
            argument ``groups``

        method : string
            one of "fit", "fit_transform", "fit_predict", "transform",
            "predict", "predict_proba", "predict_log_proba",
            "decision_function", "score"

        fit_params : None or dict
            fitting parameters whose keys have the form "<node>__<param>"

        Returns
        -------
        ``self`` when method is "fit", otherwise the result of ``method``
        called on the terminal model. If the debugging attribute
        ``_return_before_node`` is set to a node, the dictionary of the blocks
        computed before that node is returned instead.

        Raises
        ------
        NotFittedError
            if a non-fitting method is called before the pipeline is fitted
        ValueError
            if ``method`` is not one of the known methods
        """
        do_fit = method in ("fit", "fit_transform", "fit_predict")

        if not self._already_fitted and not do_fit:
            raise NotFittedError("Please fit the model before")

        # Split fit_params into a 'step-by-step' dictionary :
        # the key "<node>__<param>" ends up in fit_params_step["<node>"]["<param>"]
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        def call_fitting_method(fitting_method, data, step_params):
            # 'groups' is always passed by keyword : the check below only
            # guarantees that an argument NAMED 'groups' exists, not that it is
            # the third positional one (it may also be keyword-only)
            if groups is not None and function_has_named_argument(
                    fitting_method, "groups"):
                return fitting_method(data, y, groups=groups, **step_params)
            return fitting_method(data, y, **step_params)

        data_dico = {}  # Will contain transformed blocks at each node
        feature_dico = {}  # Will contain the get_feature_names() of each node

        if do_fit:
            input_features = getattr(X, "columns", None)
            if input_features is not None:
                input_features = list(input_features)

            self._Xinput_features = input_features

        else:
            input_features = self._Xinput_features

        ### Debugging Help ###
        # node before which the processing stops (None if attribute isn't set)
        return_before_node = getattr(self, "_return_before_node", None)

        nodes_done = set()
        for node in self._nodes_order:

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            if return_before_node is not None and return_before_node == node:
                return data_dico

            model = self._models[node]

            # Careful : the predecessors are not necessarily always given in
            # the same order, the concatenation order is fixed below
            predecessors = list(self.complete_graph.predecessors(node))

            # When False the model receives a dictionary of blocks instead of
            # a single (concatenated) block
            concat_at_this_node = (self.no_concat_nodes is None
                                   or node not in self.no_concat_nodes)

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################
                # ==> Apply on original data
                lastX = X if concat_at_this_node else {"_data": X}
                last_features = input_features

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################
                # ==> Apply on data coming out of last node
                only_predecessor = predecessors[0]
                if concat_at_this_node:
                    lastX = data_dico[only_predecessor]
                else:
                    lastX = {only_predecessor: data_dico[only_predecessor]}

                last_features = feature_dico[only_predecessor]

            else:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                # Sorted by edge number, then by node, at fit time; the same
                # order is re-used afterwards
                if do_fit:
                    edges_number = self._get_edges_number(predecessors, node)
                    predecessors = sorted(predecessors,
                                          key=lambda p:
                                          (edges_number.get(p, -1), p))
                    self._all_concat_order[node] = predecessors
                else:
                    predecessors = self._all_concat_order[node]

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]
                all_last_features = [
                    feature_dico[predecessor] for predecessor in predecessors
                ]

                # Identity test, so that the elements are never compared
                # to None with '=='
                if any(features is None for features in all_last_features):
                    last_features = None
                else:
                    last_features = unlist(all_last_features)

                if self.verbose:
                    print("start aggregation...")

                # The type of the concatenated block is guessed at fit time
                # and re-used afterwards
                if do_fit:
                    output_type = guess_output_type(all_lastX)
                    self._all_concat_type[node] = output_type
                else:
                    output_type = self._all_concat_type[node]

                if concat_at_this_node:
                    lastX = generic_hstack(all_lastX,
                                           output_type=output_type,
                                           all_columns_names=all_last_features)
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

            if node != self._terminal_node:
                # This is not the end of the graph
                if do_fit:
                    # TODO: the fit could be skipped for models given already
                    # fitted in the fit params (calling model.transform
                    # instead of model.fit_transform for those nodes)
                    data_dico[node] = call_fitting_method(
                        model.fit_transform, lastX, fit_params_step[node])
                else:
                    data_dico[node] = model.transform(lastX)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)

            else:
                # This is the last node of the Graph
                if method == "fit":
                    call_fitting_method(model.fit, lastX,
                                        fit_params_step[node])
                    result = self

                elif method in ("fit_predict", "fit_transform"):
                    result = call_fitting_method(getattr(model, method), lastX,
                                                 fit_params_step[node])

                elif method in ("transform", "predict", "predict_proba",
                                "predict_log_proba", "decision_function"):
                    result = getattr(model, method)(lastX)

                elif method == "score":
                    result = model.score(lastX, y)

                else:
                    raise ValueError("I don't know that kind of method '%s' " %
                                     method)

                # Kept from the original flow; the value only lands in the
                # local dictionary
                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)
                return result

            #######################
            #### Dico cleaning ####
            #######################
            # I'll do a step of cleaning to remove useless blocks in memory
            # I need to remove data in nodes that wont be accessed anymore
            still_usefull = set()
            for n in self.complete_graph.nodes:
                if n not in nodes_done:
                    still_usefull.update(self.complete_graph.predecessors(n))

            # Only values are replaced (no key added or removed), which is
            # allowed while iterating over the dictionary
            for n, block in data_dico.items():
                if block is not None and n not in still_usefull:
                    if self.verbose:
                        print("deleting useless node %s" % n)
                    data_dico[n] = None