示例#1
0
def edges_from_graph(G):
    """ Return the edges of a graph G, merged into maximal paths.

    Edges that can be chained (one ends where the other starts) are merged
    into longer tuples of nodes; isolated nodes are re-added as 1-tuples so
    that every node of G appears in the result.

    Parameters
    ----------
    G : networkx-like graph
        object exposing `nodes` and `edges`

    Returns
    -------
    list of tuples of nodes; rebuilding a graph from those edges yields
    exactly the nodes and edges of G (asserted below)
    """
    # Sort so the merge order (and hence the result) is deterministic.
    all_edges = sorted(set(G.edges))

    # Repeatedly merge the FIRST chainable pair found, until no pair remains.
    while True:
        pair = next(((e1, e2)
                     for e1, e2 in itertools.product(all_edges, all_edges)
                     if e1 != e2 and e1[-1] == e2[0]), None)
        if pair is None:
            break  # nothing left to merge

        e1, e2 = pair
        all_edges = [e for e in all_edges if e != e1 and e != e2]
        all_edges.append(tuple(e1[0:-1]) + tuple(e2))  # chain e1 -> e2

    # Re-add nodes not present in any edge, as singleton tuples.
    all_nodes_in_edge = unlist(all_edges)
    isolated_nodes = [n for n in sorted(G.nodes) if n not in all_nodes_in_edge]
    all_edges += [(n, ) for n in isolated_nodes]

    # Sanity check: the merged edges must reconstruct the original graph.
    G2 = graph_from_edges(*all_edges)

    assert set(G.nodes) == set(G2.nodes)
    assert set(G.edges) == set(G2.edges)

    return all_edges
示例#2
0
    def _get_feature_names_at_node(self,
                                   node,
                                   input_features=None,
                                   entry=False):
        """ Propagate feature names down the graphpipeline and retrieve the
        features at a given node.

        Parameters
        ----------
        node : string
            name of the node

        input_features : None or list
            if not None, the list of features at the input of the graphpipeline

        entry : boolean, default = False
            if True retrieve the features at the ENTRY of the given model,
            otherwise the features at the EXIT of the given model

        Returns
        -------
        list of features for the given node, or None

        Raises
        ------
        NotFittedError
            if the pipeline has not been fitted yet
        ValueError
            if `node` is not in the graph
        """

        if not self._already_fitted:
            raise NotFittedError("Please fit the model before")

        if input_features is None:
            input_features = self._Xinput_features

        feature_dico = {}
        for n in self._nodes_order:

            predecessors = list(self.complete_graph.predecessors(n))

            if len(predecessors) == 0:
                # Source node: features are the pipeline's input features.
                last_features = input_features

            elif len(predecessors) == 1:
                last_features = feature_dico[predecessors[0]]

            else:
                # Several predecessors: use the concatenation order that was
                # saved at fit time so the features line up with the data.
                predecessors = self._all_concat_order[n]
                all_last_features = [
                    feature_dico[predecessor] for predecessor in predecessors
                ]

                # (the list itself is never None, only its elements can be)
                if None in all_last_features:
                    last_features = None
                else:
                    last_features = unlist(all_last_features)

            model = self._models[n]

            if last_features is None or None in last_features:
                last_features = None

            if n == node and entry:
                # Entry: return the features at the ENTRY of the node.
                return last_features

            # Features at the EXIT of node `n` (computed once for both the
            # "intermediate node" and "requested node" cases).
            features = try_to_find_features_names(model,
                                                  input_features=last_features)
            if features is not None:
                features = list(features)
            feature_dico[n] = features

            if n == node:
                # Otherwise return the features at the EXIT of the node.
                return feature_dico[n]

        raise ValueError("node %s isn't in the graph" % node)
示例#3
0
    def _fit_transform(self,
                       X,
                       y=None,
                       groups=None,
                       method=None,
                       fit_params=None):
        """ Main method of GraphPipeline, handles the fit and predict of object.

        Walks the graph in topological order (`self._nodes_order`); at each
        intermediate node the data is (fit-)transformed, and at the terminal
        node `method` is applied.

        Parameters
        ----------
        X : data at the entry of the pipeline (DataFrame-like or other)

        y : target, default = None

        groups : group labels, default = None
            forwarded to a model's fit/fit_transform/fit_predict only when
            that method accepts a named 'groups' argument

        method : string
            one of "fit", "fit_transform", "fit_predict", "transform",
            "predict", "predict_proba", "predict_log_proba",
            "decision_function", "score"

        fit_params : dict or None
            per-step parameters, keys of the form "step__param"

        Returns
        -------
        result of `method` applied at the terminal node (`self` when
        method == "fit"); may return the intermediate data dictionary early
        when the debugging attribute `_return_before_node` is set
        """
        do_fit = method in ("fit", "fit_transform", "fit_predict")

        if not self._already_fitted and not do_fit:
            raise NotFittedError("Please fit the model before")

        # Split fit_params into a 'step-by-step' dictionary
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        data_dico = {}  # Will contain transformed blocks at each node
        feature_dico = {}  # Will contain the get_feature_names() of each node

        if do_fit:
            # At fit time, remember the input feature names (DataFrame
            # columns when X has them) for later feature-name propagation.
            input_features = getattr(X, "columns", None)
            if input_features is not None:
                input_features = list(input_features)

            self._Xinput_features = input_features

        else:
            input_features = self._Xinput_features

        nodes_done = set()
        for node in self._nodes_order:

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            ### Debugging Help ###
            # If '_return_before_node' is set, stop and return the
            # intermediate data dictionary just before processing that node.
            if (getattr(self, "_return_before_node", None) is not None
                    and getattr(self, "_return_before_node", None) == node):
                return data_dico

            model = self._models[node]

            predecessors = list(self.complete_graph.predecessors(node))
            # Careful : the predecessors are not necessarily always in the same order

            #### I'll use the order in which the edges were given

            # Should the predecessors' outputs be concatenated at this node?
            concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################
                if concat_at_this_node:
                    lastX = X

                else:
                    lastX = {"_data": X}
                # ==> Apply on original data

                last_features = input_features

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################

                # ==> Apply on data coming out of last node
                if concat_at_this_node:
                    lastX = data_dico[predecessors[0]]
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

                last_features = feature_dico[predecessors[0]]

            elif len(predecessors) > 1:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                # At fit time, order predecessors by edge number (then name)
                # and save that order; at predict time, reuse the saved order.
                if do_fit:
                    edges_number = self._get_edges_number(predecessors, node)
                    predecessors = sorted(predecessors,
                                          key=lambda p:
                                          (edges_number.get(p, -1), p))
                    self._all_concat_order[node] = predecessors
                else:
                    predecessors = self._all_concat_order[node]

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]
                all_last_features = [
                    feature_dico[predecessor] for predecessor in predecessors
                ]

                if all_last_features is None or None in all_last_features:
                    last_features = None
                else:
                    last_features = unlist(all_last_features)

                # all_columns_names = [try_to_find_features_names( self._models[predecessor], input_features = input_features)
                #        for predecessor, input_features in zip(predecessors, all_last_features)]

                # for predecessor, input_features in zip(predecessors,all_last_features):
                #    try_to_find_features_names( self._models[predecessor], input_features = input_features)

                if self.verbose:
                    print("start aggregation...")

                # Output type (DataFrame, array, sparse, ...) is guessed at
                # fit time and reused at predict time for consistency.
                if do_fit:
                    output_type = guess_output_type(all_lastX)
                    self._all_concat_type[node] = output_type
                else:
                    output_type = self._all_concat_type[node]

                if concat_at_this_node:
                    lastX = generic_hstack(all_lastX,
                                           output_type=output_type,
                                           all_columns_names=all_last_features)
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

            if node != self._terminal_node:
                # This is not the end of the graph
                if do_fit:
                    # Only pass 'groups' when the model's fit_transform
                    # actually accepts it as a named argument.
                    if groups is not None and function_has_named_argument(
                            model.fit_transform, "groups"):
                        data_dico[node] = model.fit_transform(
                            lastX, y, groups=groups, **fit_params_step[node])
                    else:
                        data_dico[node] = model.fit_transform(
                            lastX, y, **fit_params_step[node])

                    # NOTE: here we could skip the fit for some models given
                    # in the fit params. Something like:

                    # if node in preffited_models:
                    #
                    # self._model[node] = preffited_models[node]
                    # model = preffited_models[node]
                    # + copy model into pipeline

                    #    data_dico[node] = model.transform(lastX, y)
                    # else:
                    #    data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

                else:
                    data_dico[node] = model.transform(lastX)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)

            else:
                # This is the last node of the Graph
                if method == "fit":
                    # NOTE(review): 'groups' is passed positionally here but
                    # as a keyword in the fit_transform branch above — confirm
                    # both are equivalent for the models used.
                    if groups is not None and function_has_named_argument(
                            model.fit, "groups"):
                        model.fit(lastX, y, groups, **fit_params_step[node])
                    else:
                        model.fit(lastX, y, **fit_params_step[node])
                    result = self

                elif method == "fit_predict":
                    if groups is not None and function_has_named_argument(
                            model.fit_predict, "groups"):
                        result = model.fit_predict(lastX, y, groups,
                                                   **fit_params_step[node])
                    else:
                        result = model.fit_predict(lastX, y,
                                                   **fit_params_step[node])

                elif method == "fit_transform":
                    if groups is not None and function_has_named_argument(
                            model.fit_transform, "groups"):
                        result = model.fit_transform(lastX, y, groups,
                                                     **fit_params_step[node])
                    else:
                        result = model.fit_transform(lastX, y,
                                                     **fit_params_step[node])

                elif method == "transform":
                    result = model.transform(lastX)

                elif method == "predict":
                    result = model.predict(lastX)

                elif method == "predict_proba":
                    result = model.predict_proba(lastX)

                elif method == "predict_log_proba":
                    result = model.predict_log_proba(lastX)

                elif method == "decision_function":
                    result = model.decision_function(lastX)

                elif method == "score":
                    result = model.score(lastX, y)

                else:
                    raise ValueError("I don't know that kind of method '%s' " %
                                     method)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)
                return result

            #######################
            #### Dico cleaning ####
            #######################
            # I'll do a step of cleaning to remove useless blocks in memory
            # I need to remove data in nodes that wont be accessed anymore
            still_usefull = set()
            for n in self.complete_graph.nodes:
                if n in nodes_done:
                    continue

                p = list(self.complete_graph.predecessors(n))
                still_usefull.update(p)

            # Overwriting values while iterating over keys is safe (no keys
            # are added or removed).
            for n in data_dico.keys():
                if data_dico[n] is None:
                    continue
                if n not in still_usefull:
                    if self.verbose:
                        print("deleting useless node %s" % n)
                    data_dico[n] = None
示例#4
0
    def _fit_transform(self, X, y, do_fit, do_transform):
        """ Fit and/or apply character-ngram Word2Vec embeddings on text columns.

        Parameters
        ----------
        X : DataFrame of text columns

        y : ignored, kept for API consistency

        do_fit : boolean
            if True, (re)fit the text preprocessor and the Word2Vec model(s)

        do_transform : boolean
            if True, return the transformed data, otherwise return self

        Returns
        -------
        self when do_transform is False, otherwise a DataFrame of shape
        (n_rows, size * n_text_columns) holding, per column, the average of
        the embeddings of the words found in each sentence
        """

        ###############################
        ### 1) Create preprocessing ###
        ###############################
        if do_fit:
            if self.text_preprocess is None:
                self._text_preprocessor = None

            elif self.text_preprocess == "default":
                self._text_preprocessor = TextDefaultProcessing()

            elif self.text_preprocess == "digit":
                self._text_preprocessor = TextDigitAnonymizer()

            elif self.text_preprocess == "nltk":
                self._text_preprocessor = TextNltkProcessing()

            else:
                # Fail fast on an unknown option: the original code silently
                # left '_text_preprocessor' unset/stale, causing a confusing
                # AttributeError later.
                raise ValueError("Unknown text_preprocess : %s" %
                                 str(self.text_preprocess))

        ##############################
        ### 2) Apply preprocessing ###
        ##############################

        if self._text_preprocessor is not None:
            if do_fit:
                newX = self._text_preprocessor.fit_transform(X)
            else:
                newX = self._text_preprocessor.transform(X)
        else:
            newX = X

        if do_fit:
            self._nbcols = newX.shape[1]
        else:
            if newX.shape[1] != self._nbcols:
                # BUG FIX: the original lacked the '%' operator, so this path
                # raised "TypeError: 'str' object is not callable" instead of
                # the intended ValueError.
                raise ValueError(
                    "I don't have the correct number of columns %d, expected %d"
                    % (newX.shape[1], self._nbcols))

        #######################################################
        ### 2) get all sub string of length 'self.nb_chars' ###
        #######################################################

        # For each column: list (per row) of lists of rolling ngram parts.
        Xsplitted = [[
            _retrieve_all_rolling_string_parts(string, ngram=self.ngram)
            for string in newX.iloc[:, j]
        ] for j in range(self._nbcols)]

        #################################
        ### 3) fit Word2Vec embedding ###
        #################################
        if do_fit:
            if self.other_params is None:
                other_params = {}
            else:
                other_params = self.other_params

            if self.same_embedding_all_columns:
                ##############################################
                ### One embedding for ALL the text columns ###
                ##############################################

                Xsplitted_all = []
                for Xs in Xsplitted:
                    Xsplitted_all += unlist(Xs)

                model = Word2Vec(size=self.size,
                                 window=self.window,
                                 seed=self.random_state,
                                 **other_params)
                model.build_vocab(Xsplitted_all)
                model.train(Xsplitted_all,
                            total_examples=model.corpus_count,
                            epochs=model.epochs)

                # The same model, trained on everything, shared by every column.
                self.models = [model] * self._nbcols

            else:
                ######################################
                ### One embedding PER text columns ###
                ######################################
                self.models = []
                for jj, Xs in enumerate(Xsplitted):
                    # NOTE(review): falsy random_state (None but also 0) gives
                    # seed=None — confirm random_state=0 is not meant to seed.
                    seed = self.random_state + jj if self.random_state else None
                    uXs = unlist(Xs)

                    model = Word2Vec(size=self.size,
                                     window=self.window,
                                     seed=seed,
                                     **other_params)
                    model.build_vocab(uXs)
                    model.train(uXs,
                                total_examples=model.corpus_count,
                                epochs=model.epochs)

                    self.models.append(model)

            # One feature name per embedding dimension, per original column.
            self._features_names = []
            for j in range(self._nbcols):
                self._features_names += [
                    "%s__EMB__%d" % (X.columns[j], w) for w in range(self.size)
                ]

        if not do_transform:
            return self

        if self.models is None:
            raise NotFittedError("You must fit the model first")

        #########################
        ### 5) Apply Word2Vec ###
        #########################

        # TODO: this could be vectorized (or accelerated with numba).
        XXres = np.zeros((X.shape[0], self.size * self._nbcols),
                         dtype=np.float32)
        for j, (modelj, Xs) in enumerate(zip(self.models, Xsplitted)):

            for i, sentence in enumerate(Xs):
                count = 0
                for k, sub_sentence in enumerate(sentence):

                    for word in sub_sentence:
                        try:
                            emb = modelj.wv[word]
                        except KeyError:
                            # Out-of-vocabulary word: skip it.
                            emb = None

                        if emb is not None:
                            count += 1
                            XXres[i,
                                  (self.size * j):(self.size * (j + 1))] += emb

                # Average the accumulated embeddings for this sentence.
                if count > 0:
                    XXres[i, (self.size * j):(self.size * (j + 1))] /= count

        return pd.DataFrame(XXres, columns=self._features_names, index=X.index)
示例#5
0
def test_unlist():
    """unlist flattens one level of nesting, dropping empty sublists."""
    cases = [
        ([[1, 10], [32]], [1, 10, 32]),
        ([[10], [11], [], [45]], [10, 11, 45]),
    ]
    for nested, expected in cases:
        assert unlist(nested) == expected