def _approx_cross_validation_pre_calculation(
    self,
    X,
    y,
    groups,
    scoring,
    cv,
    verbose,
    fit_params_step,
    return_predict,
    method,
    no_scoring,
    stopping_round,
    stopping_threshold,
    nodes_not_to_crossvalidate,
    nodes_cant_cv_transform,
    kwargs_step,
):
    """Loop through the nodes of the pipeline and pre-compute everything that can be.

    Nodes are visited in ``self._nodes_order``. For every node the input is
    either ``X`` (no predecessor), the output of its single predecessor, or
    the horizontal concatenation of the outputs of all its predecessors.

    Parameters
    ----------
    X, y
        Data and target handed to the models / to ``cross_validation``.
    groups
        Forwarded to ``cross_validation``; also given (by keyword) to
        ``fit_transform`` of nodes that are not cross-validated, when
        ``function_has_named_argument`` reports a ``groups`` argument.
    scoring, return_predict, method, no_scoring, stopping_round, stopping_threshold
        Only forwarded to the ``cross_validation`` call of the terminal node.
    cv, verbose
        Forwarded to every ``cross_validation`` call.
    fit_params_step : dict
        Maps each node to the fit parameters of its model.
    nodes_not_to_crossvalidate
        Nodes fitted once on the whole input (on a clone of the model)
        instead of being cross-validated.
    nodes_cant_cv_transform
        Nodes that should be cross-validated but cannot be; their output is
        recorded as ``None``, which propagates to their successors.
    kwargs_step : dict
        Maps each node to extra keyword arguments for ``cross_validation``.

    Returns
    -------
    tuple
        ``(True, data_dico, result)`` when the terminal node was reached with
        usable input (``result`` is what ``cross_validation`` returned), or
        ``(False, data_dico, None)`` when no result could be computed.

    Raises
    ------
    NotImplementedError
        If a node is listed in ``self.no_concat_nodes``.

    Notes
    -----
    ``self._all_concat_order`` and ``self._all_concat_type`` are updated for
    every node that has more than one predecessor.
    """
    data_dico = {}  # Will contain transformed blocks at each node

    for node in self._nodes_order:
        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes
        if not concat_at_this_node:
            raise NotImplementedError(
                "Approx cross-validation does't work if no concatenation (node %s)" % str(node))

        if self.verbose:
            print("start processing node %s ..." % node)

        model = self._models[node]
        # Careful: the graph does not guarantee a stable order of predecessors,
        # the concatenation order is fixed explicitly below.
        predecessors = list(self.complete_graph.predecessors(node))

        if len(predecessors) == 0:
            # No predecessor ==> apply on the original data
            lastX = X

        elif len(predecessors) == 1:
            # One predecessor ==> apply on the data coming out of it
            lastX = data_dico[predecessors[0]]

        else:
            # More than one ==> concatenate the outputs of all predecessors.
            # Order: edge number first, node name as tie-breaker.
            edges_number = self._get_edges_number(predecessors, node)
            predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
            self._all_concat_order[node] = predecessors

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]

            if self.verbose:
                print("start aggregation...")

            output_type = guess_output_type(all_lastX)
            self._all_concat_type[node] = output_type

            # Identity test on purpose: 'None in all_lastX' would rely on '=='.
            if any(x is None for x in all_lastX):
                lastX = None
            else:
                lastX = generic_hstack(all_lastX, output_type=output_type)

        if node != self._terminal_node and lastX is not None:
            # This is not the end of the graph
            should_crossvalidate = node not in nodes_not_to_crossvalidate

            if should_crossvalidate and node not in nodes_cant_cv_transform:
                # 1) Node should be cross-validated ...
                # 2) ... and we can use 'cv_transform'
                if self.verbose:
                    print("do crossvalidation on %s" % node)

                _, data_dico[node] = cross_validation(
                    model,
                    lastX,
                    y,
                    groups=groups,
                    cv=cv,
                    verbose=verbose,
                    fit_params=fit_params_step[node],
                    return_predict=True,
                    method="transform",
                    no_scoring=True,
                    stopping_round=None,
                    stopping_threshold=None,
                    **kwargs_step[node])

            elif should_crossvalidate:
                # 1) Node should be cross-validated ...
                # 2) ... but we can't use 'cv_transform'
                if self.verbose:
                    print("can't do node %s" % node)

                data_dico[node] = None  # Can't compute this node

            else:
                # Node that shouldn't be cross-validated: fit a clone on everything
                if self.verbose:
                    print("skip crossvalidation on %s" % node)

                cloned_model = clone(model)
                if groups is not None and function_has_named_argument(
                        cloned_model.fit_transform, "groups"):
                    # Passed by keyword: the check above is about the *name* of
                    # the argument, not about its position in the signature.
                    data_dico[node] = cloned_model.fit_transform(
                        lastX, y, groups=groups, **fit_params_step[node])
                else:
                    data_dico[node] = cloned_model.fit_transform(
                        lastX, y, **fit_params_step[node])

        elif lastX is not None:
            # Last node of the graph: cross-validate no matter what
            result = cross_validation(
                model,
                lastX,
                y,
                groups=groups,
                scoring=scoring,
                cv=cv,
                verbose=verbose,
                fit_params=fit_params_step[node],
                return_predict=return_predict,
                method=method,
                no_scoring=no_scoring,
                stopping_round=stopping_round,
                stopping_threshold=stopping_threshold,
                **kwargs_step[node])

            # Remark (from the original author): the fit-time columns of
            # 'result' only account for the time spent in this last node.
            return True, data_dico, result

        else:
            if self.verbose:
                print("can't compute node %s because lastX is None" % node)

            data_dico[node] = None

    return False, data_dico, None  # None : no result yet
def _fit_transform(self, X, y=None, groups=None, method=None, fit_params=None):
    """Main method of GraphPipeline, handles the fit and predict of the object.

    Nodes are visited in ``self._nodes_order``. Each non-terminal node is
    fitted and/or applied to the output of its predecessors; the requested
    ``method`` is then called on the model of the terminal node.

    Parameters
    ----------
    X
        Input data. When fitting, its ``columns`` attribute (if any) is stored
        in ``self._Xinput_features``.
    y
        Target, passed to the fitting methods and to ``score``.
    groups
        Passed by keyword to a fitting method only when
        ``function_has_named_argument`` reports a ``groups`` argument.
    method : str
        One of "fit", "fit_transform", "fit_predict", "transform", "predict",
        "predict_proba", "predict_log_proba", "decision_function", "score".
    fit_params : dict or None
        Keys of the form ``"<node>__<param>"``; split per node before use.

    Returns
    -------
    object
        ``self`` for "fit", otherwise whatever the terminal model returned.
        If the debugging attribute ``_return_before_node`` matches a node, the
        dictionary of blocks computed so far is returned instead. ``None`` if
        the terminal node is never reached.

    Raises
    ------
    NotFittedError
        If a non-fitting method is requested before the pipeline was fitted.
    ValueError
        If a ``fit_params`` key has no "__" separator, or ``method`` is unknown.
    KeyError
        If a ``fit_params`` key refers to a node that is not in the graph.
    """
    do_fit = method in ("fit", "fit_transform", "fit_predict")

    if not self._already_fitted and not do_fit:
        raise NotFittedError("Please fit the model before")

    # Split fit_params into a 'step-by-step' dictionary
    fit_params_step = {name: {} for name in self.complete_graph.nodes}
    if fit_params is not None:
        for key, value in fit_params.items():
            step, separator, param = key.partition("__")
            if not separator:
                raise ValueError(
                    "fit_params key '%s' should be of the form '<node>__<param>'" % key)
            if step not in fit_params_step:
                raise KeyError(
                    "fit_params key '%s' refers to unknown node '%s'" % (key, step))
            fit_params_step[step][param] = value

    data_dico = {}  # Will contain transformed blocks at each node
    feature_dico = {}  # Will contain the get_feature_names() of each node

    if do_fit:
        input_features = getattr(X, "columns", None)
        if input_features is not None:
            input_features = list(input_features)
        self._Xinput_features = input_features
    else:
        input_features = self._Xinput_features

    # Debugging help: stop right before this node and return the blocks so far
    return_before_node = getattr(self, "_return_before_node", None)

    nodes_done = set()
    for node in self._nodes_order:
        nodes_done.add(node)

        if self.verbose:
            print("start processing node %s ..." % node)

        if return_before_node is not None and return_before_node == node:
            return data_dico

        model = self._models[node]
        # Careful: the graph does not guarantee a stable order of predecessors,
        # the concatenation order is fixed (and remembered) at fit time.
        predecessors = list(self.complete_graph.predecessors(node))

        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

        if len(predecessors) == 0:
            # No predecessor ==> apply on the original data
            if concat_at_this_node:
                lastX = X
            else:
                lastX = {"_data": X}
            last_features = input_features

        elif len(predecessors) == 1:
            # One predecessor ==> apply on the data coming out of it
            if concat_at_this_node:
                lastX = data_dico[predecessors[0]]
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}
            last_features = feature_dico[predecessors[0]]

        else:
            # More than one ==> combine the outputs of all predecessors.
            # Order: edge number first, node name as tie-breaker; decided at
            # fit time and replayed afterwards.
            if do_fit:
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors
            else:
                predecessors = self._all_concat_order[node]

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]
            all_last_features = [feature_dico[predecessor] for predecessor in predecessors]

            # Identity test on purpose: 'None in ...' relies on '==', which is
            # ambiguous if feature names come back as array-like objects.
            if any(features is None for features in all_last_features):
                last_features = None
            else:
                last_features = unlist(all_last_features)

            if self.verbose:
                print("start aggregation...")

            if do_fit:
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
            else:
                output_type = self._all_concat_type[node]

            if concat_at_this_node:
                lastX = generic_hstack(
                    all_lastX, output_type=output_type, all_columns_names=all_last_features)
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}

        if node != self._terminal_node:
            # This is not the end of the graph
            if do_fit:
                if groups is not None and function_has_named_argument(
                        model.fit_transform, "groups"):
                    data_dico[node] = model.fit_transform(
                        lastX, y, groups=groups, **fit_params_step[node])
                else:
                    data_dico[node] = model.fit_transform(
                        lastX, y, **fit_params_step[node])
                # TODO (original author's idea): the fit could be skipped here
                # for models supplied already fitted through the fit params.
            else:
                data_dico[node] = model.transform(lastX)

            feature_dico[node] = try_to_find_features_names(
                model, input_features=last_features)

        else:
            # This is the last node of the graph
            if do_fit:
                fit_method = getattr(model, method)
                if groups is not None and function_has_named_argument(fit_method, "groups"):
                    # Passed by keyword: the check above is about the *name* of
                    # the argument, not about its position in the signature.
                    output = fit_method(lastX, y, groups=groups, **fit_params_step[node])
                else:
                    output = fit_method(lastX, y, **fit_params_step[node])
                # 'fit' returns the pipeline itself, the others the model output
                result = self if method == "fit" else output

            elif method in ("transform", "predict", "predict_proba",
                            "predict_log_proba", "decision_function"):
                result = getattr(model, method)(lastX)

            elif method == "score":
                result = model.score(lastX, y)

            else:
                raise ValueError("I don't know that kind of method '%s' " % method)

            feature_dico[node] = try_to_find_features_names(
                model, input_features=last_features)
            return result

        #######################
        #### Dico cleaning ####
        #######################
        # Free the blocks that no remaining node will read: a block is still
        # useful only if it feeds a node that has not been processed yet.
        still_usefull = set()
        for n in self.complete_graph.nodes:
            if n in nodes_done:
                continue
            still_usefull.update(self.complete_graph.predecessors(n))

        for n in data_dico:
            if data_dico[n] is None:
                continue
            if n not in still_usefull:
                if self.verbose:
                    print("deleting useless node %s" % n)
                data_dico[n] = None