def test_function_has_named_argument():
    """Check 'function_has_named_argument' on several kinds of callables.

    Every callable below declares the named arguments 'a' and 'b' and never
    declares 'c' (even when it accepts '*args' / '**kwargs').
    """

    def only_named(a, b):
        pass

    def named_and_kwargs(a, b, **kwargs):
        pass

    def defaults_and_varargs(a=None, b=10, *args, **kwargs):
        pass

    class Holder(object):
        def method(self, a, b):
            pass

        @staticmethod
        def static_method(a, b):
            pass

    class CallableObject(object):
        def __call__(self, a, b):
            pass

    # plain functions, unbound / bound methods, static methods and a functor
    candidates = (
        only_named,
        named_and_kwargs,
        defaults_and_varargs,
        Holder.method,
        Holder().method,
        Holder.static_method,
        Holder().static_method,
        CallableObject(),
    )

    for candidate in candidates:
        for expected_name in ("a", "b"):
            assert function_has_named_argument(candidate, expected_name)
        assert not function_has_named_argument(candidate, "c")
def _multimetric_score_with_group(estimator, X_test, y_test, groups_test, scorers): """Return a dict of score for multimetric scoring""" # Copy of sklearn '_multimetric_score' but where the 'groups' can be passed to the scorer scores = {} for name, scorer in scorers.items(): has_group = groups_test is not None and function_has_named_argument( scorer, "groups") if y_test is None: if has_group: score = scorer(estimator, X_test, groups_test) else: score = scorer(estimator, X_test) else: if has_group: score = scorer(estimator, X_test, y_test, groups_test) else: score = scorer(estimator, X_test, y_test) if hasattr(score, 'item'): try: # e.g. unwrap memmapped scalars score = score.item() except ValueError: # non-scalar? pass scores[name] = score if not isinstance(score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s) " "instead. (scorer=%s)" % (str(score), type(score), name)) return scores
def fit_command(self, job_ids):
    """Launch the final fit of one (or more) model(s).

    It can be executed using the 'fit' command keyword followed by '--job_ids ***'

    For each job id it will:
    * reload the data
    * fit a model on all the data
    * save the pickled model (and its json description)

    Returns the list of fitted models, in the order of 'job_ids'.
    """
    fitted_models = []

    for job_id in job_ids:
        print("fitting of job_id '%s'" % job_id)
        self.reload()

        # Re-create the model from the json stored for that job
        job_param = self.data_persister.read(job_id, path="job_param", write_type=SavingType.json)
        model_json = job_param["model_json"]
        model = sklearn_model_from_param(model_json)

        print("start fitting...")
        # groups are only given to models whose 'fit' declares a 'groups' argument
        with_groups = function_has_named_argument(model.fit, "groups") and self.groups is not None
        if with_groups:
            model.fit(self.dfX, self.y, groups=self.groups)
        else:
            model.fit(self.dfX, self.y)
        print("...model fitted!")

        # Persist both the fitted object and the json that describes it
        self.data_persister.write(model, job_id, path="saved_models", write_type=SavingType.pickle)
        self.data_persister.write(model_json, job_id, path="saved_models", write_type=SavingType.json)
        print("model persisted")

        fitted_models.append(model)

    return fitted_models
def try_to_find_features_names(model, input_features=None):
    """Try to retrieve the names of the features produced by 'model'.

    Looks, in that order, at: a 'get_feature_names' method, a 'steps'
    attribute (pipeline: the last step is used), a 'transformer_list'
    attribute (names are prefixed with '<transformer name>__').

    Returns the names found, or None when they can't be found.
    """
    # TODO : this should take an 'input_features_names' field as input, to be passed to get_features_names
    # TODO : we should test whether the model accepts 'input_features_names'
    # TODO : for pipelines this should iterate with 'input_features_names' = get_features_names(last step)

    if hasattr(model, "get_feature_names"):
        # The model already has a 'get_feature_names' method.
        # 'input_features' is forwarded only if it was given AND the method accepts it.
        forward_input = input_features is not None and function_has_named_argument(
            model.get_feature_names, "input_features"
        )
        call_args = (input_features,) if forward_input else ()

        try:
            names = model.get_feature_names(*call_args)
        except (ValueError, AttributeError):
            names = None

        if names is not None:
            return names

    if hasattr(model, "steps"):
        # It is a pipeline : use its last step
        return try_to_find_features_names(model.steps[-1][1], input_features=input_features)

    if not hasattr(model, "transformer_list"):
        # I don't know
        return None

    # Rmk : FeatureUnion, already implemented
    prefixed_names = []
    for prefix, sub_model in model.transformer_list:
        sub_names = try_to_find_features_names(sub_model, input_features=input_features)
        if sub_names is None:
            return None
        prefixed_names.extend([prefix + "__" + sub_name for sub_name in sub_names])

    return prefixed_names
def _score_with_group(estimator, X_test, y_test, groups_test, scorer, is_multimetric=False): """Compute the score(s) of an estimator on a given test set. Will return a single float if is_multimetric is False and a dict of floats, if is_multimetric is True """ # Copy of sklearn '_score' but where the 'groups' can be passed to the scorer if isinstance(y_test, pd.DataFrame): y_test = y_test.values if is_multimetric: return _multimetric_score_with_group(estimator, X_test, y_test, groups_test, scorer) else: has_group = groups_test is not None and function_has_named_argument( scorer, "groups") # True if : # * group is passed to the function # * the scorer accepts a 'group' argument if y_test is None: if has_group: score = scorer(estimator, X_test, groups_test) else: score = scorer(estimator, X_test) else: if has_group: score = scorer(estimator, X_test, y_test, groups_test) else: score = scorer(estimator, X_test, y_test) if hasattr(score, "item"): try: # e.g. unwrap memmapped scalars score = score.item() except ValueError: # non-scalar? pass if not isinstance(score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s) " "instead. (scorer=%r)" % (str(score), type(score), scorer)) return score
def _compute_one_fold(
    fold_index,
    train,
    test,
    multi_output_proba,
    all_classes,
    classes,
    estimator,
    X,
    y,
    groups,
    scorers,
    verbose,
    fit_params,
    return_predict,
    method,
    no_scoring,
):
    """Fit, and optionally predict with and score, a clone of 'estimator' on one fold.

    Returns a tuple '(result, result_predict, test_scores_dictionary)' where:
    * result is an OrderedDict with the 'test_*' / 'train_*' scores (unless
      'no_scoring'), 'fit_time', 'score_time' (unless 'no_scoring'),
      'n_test_samples' and 'fold_nb'
    * result_predict is '(predictions, test)' if 'return_predict', None otherwise
    * test_scores_dictionary is the dict of test scores, None if 'no_scoring'
    """
    if verbose:
        print("cv %d started\n" % fold_index)

    ### Clone the estimator ###
    fold_estimator = sklearn.base.clone(estimator)

    ### split train test ###
    safe_split = sklearn.model_selection._validation._safe_split

    X_train, y_train = safe_split(estimator, X, y, train)
    groups_train = None
    if groups is not None:
        groups_train, _ = safe_split(estimator, groups, None, train)

    X_test, y_test = safe_split(estimator, X, y, test, train)
    groups_test = None
    if groups is not None:
        groups_test, _ = safe_split(estimator, groups, None, test, train)

    # Index used to label the re-aligned predictions
    index_test = X_test.index if hasattr(X_test, "index") else test

    if fit_params is None:
        fit_params = {}
    # Try to subset the fit_params if that is possible,
    # Ex : 'sample_weight=np.array(....)' should be subsetted but not 'epochs=10'
    fit_params = _check_fit_params(X, fit_params, train)

    # 'y' is only handed to 'fit' when there is one
    fit_args = (X_train,) if y_train is None else (X_train, y_train)

    ### Fit estimator ###
    start_fit = time()
    if groups_train is not None and function_has_named_argument(fold_estimator.fit, "groups"):
        fold_estimator.fit(*fit_args, groups=groups_train, **fit_params)
    else:
        fold_estimator.fit(*fit_args, **fit_params)
    fit_time = time() - start_fit

    ### Predict ###
    result_predict = None
    if return_predict:
        predictions = getattr(fold_estimator, method)(X_test)

        ## re-alignment with classes ##
        if method in ("predict_proba", "predict_log_proba", "decision_function"):

            def _realign(raw_predictions, wanted_classes, fitted_classes):
                # One column per wanted class, pre-filled with a default value;
                # the columns of the classes the fitted estimator holds are
                # then overwritten with its predictions.
                lowest_float = np.finfo(raw_predictions.dtype).min
                fill_values = {
                    "decision_function": lowest_float,
                    "predict_log_proba": lowest_float,
                    "predict_proba": 0,
                }
                aligned = pd.DataFrame(fill_values[method], index=index_test, columns=wanted_classes)
                for position, label in enumerate(fitted_classes):
                    aligned[label] = raw_predictions[:, position]
                return aligned

            if multi_output_proba:
                predictions = [
                    _realign(one_prediction, one_classes, one_fitted_classes)
                    for one_prediction, one_classes, one_fitted_classes in zip(
                        predictions, all_classes, fold_estimator.classes_
                    )
                ]
            else:
                predictions = _realign(predictions, classes, fold_estimator.classes_)

        result_predict = (predictions, test)

    result = OrderedDict()
    scoring_enabled = not no_scoring

    test_scores_dictionary = None
    if scoring_enabled:
        ### Score test ###
        # Here : scorers is a dictionary of scorers, hence is_multimetric = True
        start_score = time()
        test_scores_dictionary = _score_with_group(
            fold_estimator, X_test, y_test, groups_test, scorer=scorers, is_multimetric=True
        )
        score_time = time() - start_score

        ### Score train ###
        train_scores_dictionary = _score_with_group(
            fold_estimator, X_train, y_train, groups_train, scorer=scorers, is_multimetric=True
        )

        ### Put everything into a dictionary ###
        for score_name, score_value in test_scores_dictionary.items():
            result["test_%s" % score_name] = score_value
        for score_name, score_value in train_scores_dictionary.items():
            result["train_%s" % score_name] = score_value

    result["fit_time"] = fit_time
    if scoring_enabled:
        result["score_time"] = score_time

    result["n_test_samples"] = sklearn.model_selection._validation._num_samples(X_test)
    result["fold_nb"] = fold_index

    return result, result_predict, test_scores_dictionary
def _approx_cross_validation_pre_calculation(
    self,
    X,
    y,
    groups,
    scoring,
    cv,
    verbose,
    fit_params_step,
    return_predict,
    method,
    no_scoring,
    stopping_round,
    stopping_threshold,
    nodes_not_to_crossvalidate,
    nodes_cant_cv_transform,
    kwargs_step,
):
    """Loop through the nodes of the pipeline and pre-compute everything that can be pre-computed.

    For every non-terminal node, one of three things happens:
    * the node is cross-validated with 'cross_validation(..., method="transform")'
      and its out-of-fold output is stored
    * the node should be cross-validated but is in 'nodes_cant_cv_transform':
      its output is set to None (as well as, downstream, the output of any
      node that depends on it)
    * the node is in 'nodes_not_to_crossvalidate': a clone of its model is
      'fit_transform'-ed on all the data

    If the input of the terminal node could be computed, the terminal model is
    cross-validated with the scoring/predict options received.

    Parameters
    ----------
    X, y, groups :
        data, target and (optional) groups
    scoring, cv, verbose, return_predict, method, no_scoring, stopping_round, stopping_threshold :
        forwarded to 'cross_validation' for the terminal node ('groups', 'cv'
        and 'verbose' are also forwarded for the intermediate nodes)
    fit_params_step : dict
        fit parameters of each node, indexed by node
    nodes_not_to_crossvalidate : container of nodes
    nodes_cant_cv_transform : container of nodes
    kwargs_step : dict
        extra keyword arguments given to 'cross_validation', indexed by node

    Returns
    -------
    tuple (done, data_dico, result)
        * done : True if the terminal node was cross-validated
        * data_dico : transformed data at each node (None if not computable)
        * result : what 'cross_validation' returned for the terminal node,
          None if 'done' is False

    Raises
    ------
    NotImplementedError
        If a node is listed in 'self.no_concat_nodes'.
    """
    data_dico = {}  # Will contain transformed blocks at each node

    for node in self._nodes_order:
        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes
        if not concat_at_this_node:
            raise NotImplementedError(
                "Approx cross-validation does't work if no concatenation (node %s)" % str(node)
            )

        if self.verbose:
            print("start processing node %s ..." % node)

        model = self._models[node]
        # Careful : predecessors are not necessarily always returned in the same order
        predecessors = list(self.complete_graph.predecessors(node))

        if not predecessors:
            ### No predecessors ==> apply on original data ###
            lastX = X

        elif len(predecessors) == 1:
            ### One predecessor ==> apply on data coming out of the last node ###
            lastX = data_dico[predecessors[0]]

        else:
            ### More than one ==> concat all the predecessor nodes and apply on it ###

            # Fix concatenation order : edge number first, then node itself
            edges_number = self._get_edges_number(predecessors, node)
            predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
            self._all_concat_order[node] = predecessors

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]

            if self.verbose:
                print("start aggregation...")

            output_type = guess_output_type(all_lastX)
            self._all_concat_type[node] = output_type

            # Identity test on purpose ('None in all_lastX' would compare the
            # data blocks with '==')
            if any(x is None for x in all_lastX):
                lastX = None
            else:
                lastX = generic_hstack(all_lastX, output_type=output_type)

        if node != self._terminal_node and lastX is not None:
            # This is not the end of the graph
            should_cv = node not in nodes_not_to_crossvalidate

            if should_cv and node not in nodes_cant_cv_transform:
                ### 1) Node should BE cross-validated ...
                ### 2) ... and we CAN use 'cv_transform'
                if self.verbose:
                    print("do crossvalidation on %s" % node)

                _, data_dico[node] = cross_validation(
                    model,
                    lastX,
                    y,
                    groups=groups,
                    cv=cv,
                    verbose=verbose,
                    fit_params=fit_params_step[node],
                    return_predict=True,
                    method="transform",
                    no_scoring=True,
                    stopping_round=None,
                    stopping_threshold=None,
                    **kwargs_step[node]
                )

            elif should_cv:
                ### 1) Node should BE cross-validated ...
                ### 2) ... but we can't use 'cv_transform'
                if self.verbose:
                    print("can't do node %s" % node)

                data_dico[node] = None  # Can't compute this node

            else:
                ### Node that shouldn't be cross-validated ###
                if self.verbose:
                    print("skip crossvalidation on %s" % node)

                cloned_model = clone(model)
                if groups is not None and function_has_named_argument(cloned_model.fit_transform, "groups"):
                    # 'groups' is given by keyword so that it can't end up in
                    # another positional slot of 'fit_transform'
                    data_dico[node] = cloned_model.fit_transform(
                        lastX, y, groups=groups, **fit_params_step[node]
                    )
                else:
                    data_dico[node] = cloned_model.fit_transform(lastX, y, **fit_params_step[node])

        elif lastX is not None:
            ### This is the last node of the graph : CV no matter what ###
            result = cross_validation(
                model,
                lastX,
                y,
                groups=groups,
                scoring=scoring,
                cv=cv,
                verbose=verbose,
                fit_params=fit_params_step[node],
                return_predict=return_predict,
                method=method,
                no_scoring=no_scoring,
                stopping_round=stopping_round,
                stopping_threshold=stopping_threshold,
                **kwargs_step[node]
            )
            # Rmk : doing so, the columns regarding the fit time are 'false' :
            # they only account for the time spent in the last node
            return True, data_dico, result

        else:
            if self.verbose:
                print("can't compute node %s because lastX is None" % node)

            data_dico[node] = None

    return False, data_dico, None  # None : no result yet
def _fit_transform(self, X, y=None, groups=None, method=None, fit_params=None):
    """Main method of GraphPipeline, handles the fit and predict of the object.

    The nodes are processed in 'self._nodes_order'. Each non-terminal node
    transforms the (possibly concatenated) output of its predecessors; the
    terminal node applies 'method'.

    Parameters
    ----------
    X : data given to the nodes without predecessor
    y : target, given to the fitting methods and to 'score'
    groups : given as 'groups=...' to the fitting methods that declare a
        'groups' argument, ignored when None
    method : str
        one of 'fit', 'fit_transform', 'fit_predict', 'transform', 'predict',
        'predict_proba', 'predict_log_proba', 'decision_function', 'score'
    fit_params : dict or None
        keys are of the form '<node>__<param>'

    Returns
    -------
    'self' when method is 'fit', otherwise what the terminal model returned
    for 'method'. (If the debugging attribute '_return_before_node' is set to
    a node, the dictionary of transformed data is returned when that node is
    reached.)

    Raises
    ------
    NotFittedError
        If a non-fitting method is called before the pipeline was fitted.
    ValueError
        If 'method' is unknown, or if a 'fit_params' key has no '__'.
    """
    do_fit = method in ("fit", "fit_transform", "fit_predict")

    if not self._already_fitted and not do_fit:
        raise NotFittedError("Please fit the model before")

    # Split fit_params into a 'step-by-step' dictionary
    fit_params_step = {name: {} for name in self.complete_graph.nodes}
    if fit_params is not None:
        for key, value in fit_params.items():
            step, separator, param = key.partition("__")
            if not separator:
                # Explicit message instead of a cryptic unpacking error
                raise ValueError("fit_params keys should be of the form '<node>__<param>', got %r" % (key,))
            fit_params_step[step][param] = value

    data_dico = {}  # Will contain transformed blocks at each node
    feature_dico = {}  # Will contain the get_feature_names() of each node

    if do_fit:
        input_features = getattr(X, "columns", None)
        if input_features is not None:
            input_features = list(input_features)
        self._Xinput_features = input_features
    else:
        input_features = self._Xinput_features

    nodes_done = set()
    for node in self._nodes_order:
        nodes_done.add(node)

        if self.verbose:
            print("start processing node %s ..." % node)

        ### Debugging Help ###
        return_before_node = getattr(self, "_return_before_node", None)
        if return_before_node is not None and return_before_node == node:
            return data_dico

        model = self._models[node]
        # Careful : predecessors are not necessarily always returned in the same order
        predecessors = list(self.complete_graph.predecessors(node))

        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

        if not predecessors:
            ### No predecessors ==> apply on original data ###
            lastX = X if concat_at_this_node else {"_data": X}
            last_features = input_features

        elif len(predecessors) == 1:
            ### One predecessor ==> apply on data coming out of the last node ###
            if concat_at_this_node:
                lastX = data_dico[predecessors[0]]
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}
            last_features = feature_dico[predecessors[0]]

        else:
            ### More than one ==> concat all the predecessor nodes and apply on it ###

            # Fix concatenation order : decided at fit time, re-used afterwards
            if do_fit:
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors
            else:
                predecessors = self._all_concat_order[node]

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]
            all_last_features = [feature_dico[predecessor] for predecessor in predecessors]

            # Identity test on purpose : 'None in all_last_features' compares
            # with '==', which is ambiguous when the names are held in an array
            if any(features is None for features in all_last_features):
                last_features = None
            else:
                last_features = unlist(all_last_features)

            if self.verbose:
                print("start aggregation...")

            # Output type of the concatenation : decided at fit time, re-used afterwards
            if do_fit:
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
            else:
                output_type = self._all_concat_type[node]

            if concat_at_this_node:
                lastX = generic_hstack(all_lastX, output_type=output_type, all_columns_names=all_last_features)
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}

        if node != self._terminal_node:
            # This is not the end of the graph
            if do_fit:
                if groups is not None and function_has_named_argument(model.fit_transform, "groups"):
                    data_dico[node] = model.fit_transform(lastX, y, groups=groups, **fit_params_step[node])
                else:
                    data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node])
                # TODO : the fit could be skipped for some models given in the
                # fit params, something like : if the node is in a dictionary
                # of pre-fitted models, copy that model into the pipeline and
                # only call 'transform', otherwise 'fit_transform' as above
            else:
                data_dico[node] = model.transform(lastX)

            feature_dico[node] = try_to_find_features_names(model, input_features=last_features)

        else:
            # This is the last node of the Graph
            if do_fit:
                fit_method = getattr(model, method)
                if groups is not None and function_has_named_argument(fit_method, "groups"):
                    # 'groups' is given by keyword (as for the non-terminal
                    # nodes) so that it can't end up in another positional slot
                    result = fit_method(lastX, y, groups=groups, **fit_params_step[node])
                else:
                    result = fit_method(lastX, y, **fit_params_step[node])

                if method == "fit":
                    # 'fit' returns the pipeline, not what the model returned
                    result = self

            elif method in ("transform", "predict", "predict_proba", "predict_log_proba", "decision_function"):
                result = getattr(model, method)(lastX)

            elif method == "score":
                result = model.score(lastX, y)

            else:
                raise ValueError("I don't know that kind of method '%s' " % method)

            feature_dico[node] = try_to_find_features_names(model, input_features=last_features)
            return result

        #######################
        #### Dico cleaning ####
        #######################
        # Remove from memory the blocks that won't be accessed anymore :
        # a block is still useful if a node not processed yet has it as predecessor
        still_usefull = set()
        for n in self.complete_graph.nodes:
            if n in nodes_done:
                continue
            still_usefull.update(self.complete_graph.predecessors(n))

        for n in data_dico:
            if data_dico[n] is None:
                continue
            if n not in still_usefull:
                if self.verbose:
                    print("deleting useless node %s" % n)
                data_dico[n] = None