Exemplo n.º 1
0
    def _approx_cross_validation_pre_calculation(
        self,
        X,
        y,
        groups,
        scoring,
        cv,
        verbose,
        fit_params_step,
        return_predict,
        method,
        no_scoring,
        stopping_round,
        stopping_threshold,
        nodes_not_to_crossvalidate,
        nodes_cant_cv_transform,
        kwargs_step,
    ):
        """ sub-method to loop through the nodes of the pipeline and pre-compute everything that can be pre-computed """

        data_dico = {}  # Will contain transformed blocks at each node

        nodes_done = set()
        for node in self._nodes_order:

            concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes
            if not concat_at_this_node:
                raise NotImplementedError(
                    "Approx cross-validation does't work if no concatenation (node %s)"
                    % str(node))

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            ### Debugging Help ###
            # if getattr(self,"_return_before_node",None) is not None and getattr(self,"_return_before_node",None) == node:
            #    return data_dico

            model = self._models[node]

            predecessors = list(self.complete_graph.predecessors(node))
            # Carefull : here it is not necessary always in the same order

            #### I'll use the order in which the edges were given

            # Concatenation : alphabetical order

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################

                # ==> Apply on original data
                lastX = X

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################

                # ==> Apply on data coming out of last node
                lastX = data_dico[predecessors[0]]
                # data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

            elif len(predecessors) > 1:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors,
                                      key=lambda p:
                                      (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]

                if self.verbose:
                    print("start aggregation...")

                # if do_fit:
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
                # else:
                #    output_type = self._all_concat_type[node]
                has_none = False
                for x in all_lastX:
                    if x is None:
                        has_none = True
                        break

                # None in all_lastX

                if has_none:
                    lastX = None
                else:
                    lastX = generic_hstack(all_lastX, output_type=output_type)

            if node != self._terminal_node and lastX is not None:
                # This is not the end of the graph

                if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
                    ### 1) Node should BE crossvalitaded  ...
                    ### 2) ... and we CAN use 'cv_transform'

                    if self.verbose:
                        print("do crossvalidation on %s" % node)

                    _, data_dico[node] = cross_validation(
                        model,
                        lastX,
                        y,
                        groups=groups,
                        cv=cv,
                        verbose=verbose,
                        fit_params=fit_params_step[node],
                        return_predict=True,
                        method="transform",
                        no_scoring=True,
                        stopping_round=None,
                        stopping_threshold=None,
                        **kwargs_step[node])

                elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
                    ### 1) Node should BE crossvalitated ...
                    ### 2) ... but we can't use 'cv_transform'

                    if self.verbose:
                        print("can't do node %s" % node)
                    data_dico[node] = None  # Can't compute this node

                else:
                    ### Node that shouldn't be cross-validated ###

                    if self.verbose:
                        print("skip crossvalidation on %s" % node)
                    cloned_model = clone(model)
                    if groups is not None and function_has_named_argument(
                            cloned_model.fit_transform, "groups"):
                        data_dico[node] = cloned_model.fit_transform(
                            lastX, y, groups, **fit_params_step[node])
                    else:
                        data_dico[node] = cloned_model.fit_transform(
                            lastX, y, **fit_params_step[node])

            elif lastX is not None:

                ### CV no matter what at the last node ###

                #                if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
                #
                #                    # This is the last node of the Graph
                #                    result = approx_cross_validation( model, lastX, y, groups = groups, scoring = scoring, cv = cv ,
                #                                                verbose = verbose, fit_params = fit_params_step[node],
                #                                                return_predict = return_predict , method = method, no_scoring = no_scoring,
                #                                                stopping_round = stopping_round, stopping_threshold = stopping_threshold,
                #                                                **kwargs_step[node])
                #
                #                elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
                #                    pass
                #
                #                else:

                # This is the last node of the Graph
                result = cross_validation(
                    model,
                    lastX,
                    y,
                    groups=groups,
                    scoring=scoring,
                    cv=cv,
                    verbose=verbose,
                    fit_params=fit_params_step[node],
                    return_predict=return_predict,
                    method=method,
                    no_scoring=no_scoring,
                    stopping_round=stopping_round,
                    stopping_threshold=stopping_threshold,
                    **kwargs_step[node])

                # Rmk : if we do that so column regarding the time of fit are 'false' : they will only account for the time spent in the last node

                return True, data_dico, result
            #                return result

            else:
                ###
                if self.verbose:
                    print("can't compute node %s because lastX is None" % node)
                data_dico[node] = None
                # return result

        return False, data_dico, None  # None : no result yet
Exemplo n.º 2
0
    def _fit_transform(self,
                       X,
                       y=None,
                       groups=None,
                       method=None,
                       fit_params=None):
        """ main method of GraphPipeline, handles the fit and predict of object """
        do_fit = method in ("fit", "fit_transform", "fit_predict")

        if not self._already_fitted and not do_fit:
            raise NotFittedError("Please fit the model before")

        # Split fit_params into a 'step-by-step' dictionnary
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        data_dico = {}  # Will contain transformed blocks at each node
        feature_dico = {}  # Will contain the get_feature_names() of each node

        if do_fit:
            input_features = getattr(X, "columns", None)
            if input_features is not None:
                input_features = list(input_features)

            self._Xinput_features = input_features

        else:
            input_features = self._Xinput_features

        nodes_done = set()
        for node in self._nodes_order:

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            ### Debugging Help ###
            if (getattr(self, "_return_before_node", None) is not None
                    and getattr(self, "_return_before_node", None) == node):
                return data_dico

            model = self._models[node]

            predecessors = list(self.complete_graph.predecessors(node))
            # Carefull : here it is not necessary always in the same order

            #### I'll use the order in which the edges were given

            # Concatenation : alphabetical order
            concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################
                if concat_at_this_node:
                    lastX = X

                else:
                    lastX = {"_data": X}
                # ==> Apply on original data

                last_features = input_features

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################

                # ==> Apply on data coming out of last node
                if concat_at_this_node:
                    lastX = data_dico[predecessors[0]]
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

                last_features = feature_dico[predecessors[0]]

            elif len(predecessors) > 1:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                if do_fit:
                    edges_number = self._get_edges_number(predecessors, node)
                    predecessors = sorted(predecessors,
                                          key=lambda p:
                                          (edges_number.get(p, -1), p))
                    self._all_concat_order[node] = predecessors
                else:
                    predecessors = self._all_concat_order[node]

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]
                all_last_features = [
                    feature_dico[predecessor] for predecessor in predecessors
                ]

                if all_last_features is None or None in all_last_features:
                    last_features = None
                else:
                    last_features = unlist(all_last_features)

                # all_columns_names = [try_to_find_features_names( self._models[predecessor], input_features = input_features)
                #        for predecessor, input_features in zip(predecessors, all_last_features)]

                # for predecessor, input_features in zip(predecessors,all_last_features):
                #    try_to_find_features_names( self._models[predecessor], input_features = input_features)

                if self.verbose:
                    print("start aggregation...")

                if do_fit:
                    output_type = guess_output_type(all_lastX)
                    self._all_concat_type[node] = output_type
                else:
                    output_type = self._all_concat_type[node]

                if concat_at_this_node:
                    lastX = generic_hstack(all_lastX,
                                           output_type=output_type,
                                           all_columns_names=all_last_features)
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

            if node != self._terminal_node:
                # This is not the end of the graph
                if do_fit:
                    if groups is not None and function_has_named_argument(
                            model.fit_transform, "groups"):
                        data_dico[node] = model.fit_transform(
                            lastX, y, groups=groups, **fit_params_step[node])
                    else:
                        data_dico[node] = model.fit_transform(
                            lastX, y, **fit_params_step[node])

                    # ICI : on pourrait sauté le fit pour certains models dans le fit params
                    # Quelque-chose comme :

                    # if node in preffited_models:
                    #
                    # self._model[node] = preffited_models[node]
                    # model = preffited_models[node]
                    # + copy model into pipeline

                    #    data_dico[node] = model.transform(lastX, y)
                    # else:
                    #    data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

                else:
                    data_dico[node] = model.transform(lastX)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)

            else:
                # This is the last node of the Graph
                if method == "fit":
                    if groups is not None and function_has_named_argument(
                            model.fit, "groups"):
                        model.fit(lastX, y, groups, **fit_params_step[node])
                    else:
                        model.fit(lastX, y, **fit_params_step[node])
                    result = self

                elif method == "fit_predict":
                    if groups is not None and function_has_named_argument(
                            model.fit_predict, "groups"):
                        result = model.fit_predict(lastX, y, groups,
                                                   **fit_params_step[node])
                    else:
                        result = model.fit_predict(lastX, y,
                                                   **fit_params_step[node])

                elif method == "fit_transform":
                    if groups is not None and function_has_named_argument(
                            model.fit_transform, "groups"):
                        result = model.fit_transform(lastX, y, groups,
                                                     **fit_params_step[node])
                    else:
                        result = model.fit_transform(lastX, y,
                                                     **fit_params_step[node])

                elif method == "transform":
                    result = model.transform(lastX)

                elif method == "predict":
                    result = model.predict(lastX)

                elif method == "predict_proba":
                    result = model.predict_proba(lastX)

                elif method == "predict_log_proba":
                    result = model.predict_log_proba(lastX)

                elif method == "decision_function":
                    result = model.decision_function(lastX)

                elif method == "score":
                    result = model.score(lastX, y)

                else:
                    raise ValueError("I don't know that kind of method '%s' " %
                                     method)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)
                return result

            #######################
            #### Dico cleaning ####
            #######################
            # I'll do a step of cleaning to remove useless blocks in memory
            # I need to remove data in nodes that wont be accessed anymore
            still_usefull = set()
            for n in self.complete_graph.nodes:
                if n in nodes_done:
                    continue

                p = list(self.complete_graph.predecessors(n))
                still_usefull.update(p)

            for n in data_dico.keys():
                if data_dico[n] is None:
                    continue
                if n not in still_usefull:
                    if self.verbose:
                        print("deleting useless node %s" % n)
                    data_dico[n] = None