def _fit_transform_rest(self, X, transformed_part, is_fit, is_transform):
    """Take care of the rest of the data, i.e. the part that wasn't transformed.

    The untouched columns can either be:
      * dropped (default)          : 'keep_other_columns' == 'drop'
      * kept as is                 : 'keep_other_columns' == 'keep'
      * keep only not-used columns : 'keep_other_columns' == 'delta'

    Parameters
    ----------
    X : original input data
    transformed_part : result of the wrapped transformer on the used columns
    is_fit : bool, whether we are in a fit phase (selectors are created/fitted)
    is_transform : bool, whether a transformed result must be returned

    Returns
    -------
    The horizontally-stacked result when is_transform, otherwise self
    (or None in 'drop' mode, and transformed_part in 'delta' mode when
    columns_to_use is None).
    """
    if self.keep_other_columns == "keep":
        # In that case I'll keep the original columns as well
        if is_fit:
            # Remember the input column names (or positional indices) for later use
            if hasattr(X, "columns"):
                self._Xcolumns = list(getattr(X, "columns"))
            elif hasattr(X, "shape"):
                self._Xcolumns = [i for i in range(X.shape[1])]
            else:
                self._Xcolumns = None

        if is_transform:
            kept_features_names = self._get_rest_columns()
            # Original X on the left, transformed columns on the right
            Xcomplete_result = dsh.generic_hstack(
                [X, transformed_part],
                output_type=self.desired_output_type,
                all_columns_names=[kept_features_names, self._feature_names_for_transform],
            )
            return Xcomplete_result
        else:
            return self

    elif self.keep_other_columns == "drop":
        return None

    # "delta" mode, I'll keep only the columns that were not used
    if self.columns_to_use is None:
        # Everything was used: there is no 'rest' to add back
        return transformed_part

    if is_fit and is_transform:
        # Anti-selector: select everything EXCEPT the columns the transformer used
        self.anti_selector = ColumnsSelector(columns_to_drop=self.columns_to_use, regex_match=self.regex_match)
        Xother = self.anti_selector.fit_transform(X)
    elif is_transform:
        Xother = self.anti_selector.transform(X)
    elif is_fit:
        self.anti_selector = ColumnsSelector(columns_to_drop=self.columns_to_use, regex_match=self.regex_match)
        self.anti_selector.fit(X)

    if is_transform:
        kept_features_names = self._get_rest_columns()
        return dsh.generic_hstack(
            [Xother, transformed_part],
            output_type=self.desired_output_type,
            all_columns_names=[kept_features_names, self._feature_names_for_transform],
        )
        # Rmk : generic_hstack will handle the case where Xother has no columns
    else:
        return self
def test_generic_hstack():
    """generic_hstack of two DataFrames: columns are concatenated in order,
    a shared non-default index is preserved, conversion to a numpy array
    works on demand, and mismatched lengths raise ValueError."""
    left = pd.DataFrame({"a": list(range(10)), "b": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]})
    right = pd.DataFrame({"c": list(range(10)), "d": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]})

    stacked = generic_hstack((left, right))
    assert get_type(stacked) == DataTypes.DataFrame
    assert stacked.shape == (10, 4)
    assert list(stacked.columns) == ["a", "b", "c", "d"]

    # Same data, but the right part carries an explicit (odd) index
    left = pd.DataFrame({"a": list(range(10)), "b": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]})
    right = pd.DataFrame(
        {"c": list(range(10)), "d": ["aaaa", "bbbbb", "cccc"] * 3 + ["ezzzz"]},
        index=[1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    )

    stacked = generic_hstack((left, right))
    assert np.array_equal(stacked.index.values, np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19]))
    assert get_type(stacked) == DataTypes.DataFrame
    assert stacked.shape == (10, 4)
    assert list(stacked.columns) == ["a", "b", "c", "d"]

    stacked = generic_hstack((left, right), output_type=DataTypes.NumpyArray)
    assert get_type(stacked) == DataTypes.NumpyArray
    assert stacked.shape == (10, 4)

    # Length mismatch must fail, whatever the container types
    mismatched_pairs = (
        (left.head(3), right.head(4)),
        (left.head(3).values, right.head(4)),
        (left.head(3).values, right.head(4).values),
    )
    for pair in mismatched_pairs:
        with pytest.raises(ValueError):
            generic_hstack(pair)
def _transform_aggregat(self, X, target_aggregat, target_aggregat_global):
    """Apply the per-column target aggregates to X.

    For every column to encode, map each value through `self.get_value`
    (falling back to the global aggregate for unseen values), then hstack
    the encoded columns after the columns to keep.

    Parameters
    ----------
    X : DataFrame to transform
    target_aggregat : dict, per-column aggregate mapping
    target_aggregat_global : dict, per-column global fallback aggregate

    Returns
    -------
    DataFrame with kept columns (if any) followed by encoded columns;
    an empty DataFrame aligned on X.index when there is nothing to return.
    """
    all_results = []
    for col in self._columns_to_encode:
        # Replace missing values first when this column was flagged at fit time
        if self._na_to_null[col]:
            Xcol = self.na_remplacing(X[col])
        else:
            Xcol = X[col]

        result = Xcol.apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))
        all_results.append(result)

        assert len(result) == len(X)
        assert len(result.shape) == 2

    if len(all_results) == 0:
        if len(self._columns_to_keep) > 0:
            result_other = X.loc[:, self._columns_to_keep]
            return result_other
        else:
            # BUGFIX: use X.index instead of range(X.shape[0]) so the empty
            # result stays aligned with the input, consistent with the other
            # return branches (which all preserve X's index).
            return pd.DataFrame(index=X.index, columns=[])  # empty DataFrame

    all_results = pd.concat(all_results, axis=1)
    assert (all_results.index == X.index).all()

    if len(self._columns_to_keep) > 0:
        result_other = X.loc[:, self._columns_to_keep]
        return generic_hstack([result_other, all_results])
    else:
        return all_results
def transform(self, X):
    """Transform X (must be a DataFrame): encode the target columns and
    hstack them after the columns that are kept as-is."""
    if get_type(X) != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    encoded = self._transform_to_encode(X)

    # Nothing to keep: the encoded part alone is the result
    if len(self._columns_to_keep) == 0:
        return encoded

    kept = X.loc[:, self._columns_to_keep]
    return generic_hstack([kept, encoded])
def test_generic_hstack_sparse_and_category(with_cat, force_sparse):
    """generic_hstack of a (possibly categorical) DataFrame with a sparse
    array: the result is sparse when the cell threshold forces it, otherwise
    a DataFrame that preserves the category dtype."""
    frame = pd.DataFrame({"a": 10 + np.arange(10), "b": np.random.randn(10)})
    if with_cat:
        frame["a"] = frame["a"].astype("category")

    sparse_part = convert_to_sparsearray(np.random.randint(0, 1, size=(10, 2)))

    # A tiny threshold forces sparse output; a huge one allows dense output
    threshold = 10 + (1 - force_sparse) * 1000000
    stacked = generic_hstack((frame, sparse_part), max_number_of_cells_for_non_sparse=threshold)

    assert stacked.shape == (frame.shape[0], frame.shape[1] + sparse_part.shape[1])

    if force_sparse:
        assert get_type(stacked) == DataTypes.SparseArray
    elif with_cat:
        assert stacked.dtypes["a"] == "category"
        assert isinstance(stacked, pd.DataFrame)
def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
    """Internal method that handles the fit and the transform.

    Selects the columns to use, validates the input against what was seen
    at fit time, then applies the wrapped model either on all columns at
    once or column by column, and finally converts/indexes the result.

    Parameters
    ----------
    X : input data (DataFrame, array, ...)
    y : target, forwarded to the underlying model
    is_fit : bool, whether the wrapped model(s) must be fitted
    is_transform : bool, whether a transformed result must be returned
    fit_params : dict or None, forwarded to fit/fit_transform

    Returns
    -------
    transformed data when is_transform is True, otherwise self
    """
    if fit_params is None:
        fit_params = {}

    if is_fit:
        # 'auto' mode: let the wrapper pick default columns from the data
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
            columns = self._get_default_columns_to_use(X, y)
            self.selector = ColumnsSelector(columns_to_use=columns)
        else:
            self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

    if hasattr(X, "shape"):
        if X.shape[0] == 0:
            raise ValueError("the X object has 0 rows")

    Xindex = dsh._get_index(X)  # if X has an index retrieve it

    # if self.columns_to_use is not None:
    if is_fit:
        Xsubset = self.selector.fit_transform(X)
    else:
        Xsubset = self.selector.transform(X)

    # TODO (maybe): here allow a preprocessing pipeline
    # if self.has_preprocessing:
    #     if is_fit:
    #         self.preprocessing = self._get_preprocessing()
    #         Xsubset = self.preprocessing.fit_transform(Xsubset)
    #     else:
    #         Xsubset = self.preprocessing.transform(Xsubset)

    # Store columns and shape BEFORE any modification
    if self.selector is not None:
        Xsubset_columns = self.selector.get_feature_names()
    else:
        raise NotImplementedError("should not go there anymore")
        # Xsubset_columns = getattr(Xsubset, "columns", None)

    Xsubset_shape = getattr(Xsubset, "shape", None)

    # TODO: somehow make use of this here:
    # https://github.com/scikit-learn/scikit-learn/issues/6425

    if is_fit:
        # Remember what the input looked like so transform can validate it later
        self._expected_type = dsh.get_type(Xsubset)
        self._expected_nbcols = dsh._nbcols(Xsubset)
        self._expected_columns = dsh._get_columns(Xsubset)
    else:
        Xtype = dsh.get_type(Xsubset)
        if Xtype != self._expected_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        nbcols = dsh._nbcols(Xsubset)
        if nbcols != self._expected_nbcols:
            raise ValueError(
                "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                % (self._expected_nbcols, nbcols)
            )

        columns = dsh._get_columns(Xsubset)
        expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility
        if expected_columns is not None and columns is not None and columns != self._expected_columns:
            raise ValueError("I don't have the correct names of columns")

    # Convert the input when the wrapped model doesn't accept this type
    if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
        Xsubset = dsh.convert_generic(
            Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
        )

    if is_fit:
        self._verif_params()
        # Flag the degenerate case where there are no columns at all to transform
        self._empty_data = False
        s = getattr(Xsubset, "shape", None)
        if s is not None and len(s) > 1 and s[1] == 0:
            self._empty_data = True

    if self.all_columns_at_once or self._empty_data:

        if is_fit:
            self._model = self._get_model(Xsubset, y)

        ##############################################
        ### Apply the model on ALL columns at ONCE ###
        ##############################################
        if self.work_on_one_column_only:
            Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
        else:
            Xsubset = dsh.make2dimensions(Xsubset)

        # Call to underlying model
        Xres = None
        if is_fit and is_transform:
            ##############################
            ###   fit_transform method ###
            ##############################

            # test if the the data to transform actually has some columns
            if not self._empty_data:
                # normal case
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                # It means there is no columns to transform
                Xres = Xsubset  # don't do anything

        elif is_fit and not is_transform:
            ####################
            ###  fit method  ###
            ####################
            # Some models only expose feature names after a transform
            if self.must_transform_to_get_features_name:
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                self._model.fit(Xsubset, y, **fit_params)

        else:
            ####################
            ###  transform   ###
            ####################
            if not self._empty_data:
                Xres = self._model.transform(Xsubset)
            else:
                Xres = Xsubset

        if is_fit:
            self._columns_informations = {
                "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                output_columns=self._columns_informations["output_columns"],
                output_shape=self._columns_informations["output_shape"],
                input_columns=self._columns_informations["input_columns"],
                input_shape=self._columns_informations["input_shape"],
            )

            # self.kept_features_names = None  # for now

        if is_transform:
            Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    else:
        ########################################
        ### Apply the model COLUMN BY COLUMN ###
        ########################################
        if is_fit:
            self._models = []

        if is_transform or self.must_transform_to_get_features_name:
            all_Xres = []
        else:
            all_Xres = None

        Xsubset = dsh.make2dimensions(Xsubset)

        for j in range(self._expected_nbcols):
            # Column extraction depends on the container type
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                Xsubset_j = Xsubset.iloc[:, j]
            else:
                Xsubset_j = Xsubset[:, j]

            if is_fit:
                # One independent clone of the model per column
                sub_model = self._get_model(Xsubset, y)
                self._models.append(sub_model)
            else:
                sub_model = self._models[j]

            if not self.work_on_one_column_only:
                Xsubset_j = dsh.make2dimensions(Xsubset_j)

            if is_fit and is_transform:
                # fit_transform method
                Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                all_Xres.append(Xres_j)

            elif is_fit and not is_transform:
                # fit method
                if self.must_transform_to_get_features_name:
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                    all_Xres.append(Xres_j)
                else:
                    sub_model.fit(Xsubset_j, y, **fit_params)

            elif is_transform:
                # transform method
                Xres_j = sub_model.transform(Xsubset_j)
                all_Xres.append(Xres_j)

        if is_fit:
            self._columns_informations = {
                "all_output_columns": None
                if all_Xres is None
                else [getattr(Xres, "columns", None) for Xres in all_Xres],
                "all_output_shape": None
                if all_Xres is None
                else [getattr(Xres, "shape", None) for Xres in all_Xres],
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = list(
                self.try_to_find_feature_names_separate(
                    all_output_columns=self._columns_informations["all_output_columns"],
                    all_output_shape=self._columns_informations["all_output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )
            )

            # self.kept_features_names = None  # for now

        if is_transform:
            Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    if is_transform:
        if self._feature_names_for_transform is not None:
            ### NOTE: this doesn't work in transform !!!
            Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

    if is_transform:
        return Xres
    else:
        return self
def _fit_transform_rest(self, X, transformed_part, is_fit, is_transform):
    """Take care of what to do with the data around the wrapped transformer.

    There are 2 parts of the data:
      * the part that was used by the transformer    : columns_to_use
      * the part that wasn't used by the transformer : the rest
    We can keep or drop those 2 parts.
    """
    # There are four possibilities :
    # * drop_unused_columns = True and drop_used_columns = True
    #   => nothing to do. Nothing to ADD to the result of the wrapped transformer
    #
    # * drop_unused_columns = True and drop_used_columns = False
    #   => We need to add back the raw 'used' columns to the result
    #   => Add a 'selector' with 'columns_to_use' = 'columns_to_use'
    #
    # * drop_unused_columns = False and drop_used_columns = True
    #   => We need to add the 'unused' part of the data
    #   => Add an 'anti-selector' with 'columns_to_drop' = 'columns_to_use'
    #      This will select the rest of the columns
    #
    # * drop_unused_columns = False and drop_used_columns = False
    #   => We need to add the full data
    #   => ... don't add a selector (or a selector with columns_to_use = None)

    if is_fit:
        self.other_selector = None

    if self.drop_unused_columns and self.drop_used_columns:
        # Nothing to do
        if is_transform:
            return transformed_part
        else:
            return None

    if is_fit:
        # Remember the input column names (or positional indices)
        if hasattr(X, "columns"):
            self._Xcolumns = list(getattr(X, "columns"))
        elif hasattr(X, "shape"):
            self._Xcolumns = [i for i in range(X.shape[1])]
        else:
            self._Xcolumns = None

        # Build the selector that extracts the part to add back to the result
        if not self.drop_used_columns and self.drop_unused_columns:
            self.other_selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

        elif self.drop_used_columns and not self.drop_unused_columns:
            self.other_selector = ColumnsSelector(columns_to_drop=self.columns_to_use, regex_match=self.regex_match)

        elif not self.drop_used_columns and not self.drop_unused_columns:
            self.other_selector = ColumnsSelector(columns_to_use="all")  # Maybe we can 'by-pass' this

        else:
            self.other_selector = None  # we never go there, already out of the function

    if is_fit and is_transform:
        Xother = self.other_selector.fit_transform(X)
    elif is_transform:
        Xother = self.other_selector.transform(X)
    elif is_fit:
        self.other_selector.fit(X)

    if is_transform:
        kept_features_names = self.other_selector.get_feature_names()
        return dsh.generic_hstack(
            [Xother, transformed_part],
            output_type=self.desired_output_type,
            all_columns_names=[kept_features_names, self._feature_names_for_transform],
        )
    else:
        return self
def _approx_cross_validation_pre_calculation(
    self,
    X,
    y,
    groups,
    scoring,
    cv,
    verbose,
    fit_params_step,
    return_predict,
    method,
    no_scoring,
    stopping_round,
    stopping_threshold,
    nodes_not_to_crossvalidate,
    nodes_cant_cv_transform,
    kwargs_step,
):
    """Sub-method to loop through the nodes of the pipeline and pre-compute
    everything that can be pre-computed.

    Walks the graph in topological order; each non-terminal node is either
    cross-validation-transformed, fit-transformed (nodes excluded from CV),
    or marked None (nodes that cannot cv_transform). The terminal node, if
    reachable, is fully cross-validated and its result returned.

    Returns
    -------
    (finished, data_dico, result) :
        finished : bool, True when the terminal node's CV result was computed
        data_dico : dict node -> pre-computed transformed block (or None)
        result : the terminal node's cross_validation output, or None
    """
    data_dico = {}  # Will contain transformed blocks at each node

    nodes_done = set()
    for node in self._nodes_order:

        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes
        if not concat_at_this_node:
            raise NotImplementedError(
                "Approx cross-validation does't work if no concatenation (node %s)" % str(node))

        nodes_done.add(node)

        if self.verbose:
            print("start processing node %s ..." % node)

        ### Debugging Help ###
        # if getattr(self,"_return_before_node",None) is not None and getattr(self,"_return_before_node",None) == node:
        #     return data_dico

        model = self._models[node]
        predecessors = list(self.complete_graph.predecessors(node))
        # Careful : here it is not necessarily always in the same order
        #### I'll use the order in which the edges were given

        # Concatenation : alphabetical order

        if len(predecessors) == 0:
            #########################
            ###  No predecessors  ###
            #########################
            # ==> Apply on original data
            lastX = X

        elif len(predecessors) == 1:
            ########################
            ###  One predecessor ###
            ########################
            # ==> Apply on data coming out of last node
            lastX = data_dico[predecessors[0]]
            # data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node])

        elif len(predecessors) > 1:
            #######################
            ###  More than one  ###
            #######################
            # ==> concat all the predecessors node and apply it

            ### Fix concatenation order ###
            edges_number = self._get_edges_number(predecessors, node)
            predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
            self._all_concat_order[node] = predecessors

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]

            if self.verbose:
                print("start aggregation...")

            # if do_fit:
            output_type = guess_output_type(all_lastX)
            self._all_concat_type[node] = output_type
            # else:
            #     output_type = self._all_concat_type[node]

            # A None block means a predecessor couldn't be pre-computed:
            # the concatenation (and this node) can't be computed either
            has_none = False
            for x in all_lastX:
                if x is None:
                    has_none = True
                    break
            # None in all_lastX

            if has_none:
                lastX = None
            else:
                lastX = generic_hstack(all_lastX, output_type=output_type)

        if node != self._terminal_node and lastX is not None:
            # This is not the end of the graph
            if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
                ### 1) Node should BE cross-validated ...
                ### 2) ... and we CAN use 'cv_transform'
                if self.verbose:
                    print("do crossvalidation on %s" % node)

                # Out-of-fold transform: keep only the cross-validated predictions
                _, data_dico[node] = cross_validation(
                    model,
                    lastX,
                    y,
                    groups=groups,
                    cv=cv,
                    verbose=verbose,
                    fit_params=fit_params_step[node],
                    return_predict=True,
                    method="transform",
                    no_scoring=True,
                    stopping_round=None,
                    stopping_threshold=None,
                    **kwargs_step[node])

            elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
                ### 1) Node should BE cross-validated ...
                ### 2) ... but we can't use 'cv_transform'
                if self.verbose:
                    print("can't do node %s" % node)
                data_dico[node] = None  # Can't compute this node

            else:
                ### Node that shouldn't be cross-validated ###
                if self.verbose:
                    print("skip crossvalidation on %s" % node)

                cloned_model = clone(model)
                if groups is not None and function_has_named_argument(cloned_model.fit_transform, "groups"):
                    data_dico[node] = cloned_model.fit_transform(lastX, y, groups, **fit_params_step[node])
                else:
                    data_dico[node] = cloned_model.fit_transform(lastX, y, **fit_params_step[node])

        elif lastX is not None:
            ### CV no matter what at the last node ###

            # if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
            #
            #     # This is the last node of the Graph
            #     result = approx_cross_validation(
            #         model, lastX, y, groups=groups, scoring=scoring, cv=cv,
            #         verbose=verbose, fit_params=fit_params_step[node],
            #         return_predict=return_predict, method=method, no_scoring=no_scoring,
            #         stopping_round=stopping_round, stopping_threshold=stopping_threshold,
            #         **kwargs_step[node])
            #
            # elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
            #     pass
            #
            # else:

            # This is the last node of the Graph
            result = cross_validation(
                model,
                lastX,
                y,
                groups=groups,
                scoring=scoring,
                cv=cv,
                verbose=verbose,
                fit_params=fit_params_step[node],
                return_predict=return_predict,
                method=method,
                no_scoring=no_scoring,
                stopping_round=stopping_round,
                stopping_threshold=stopping_threshold,
                **kwargs_step[node])

            # Rmk : if we do that so column regarding the time of fit are 'false' :
            # they will only account for the time spent in the last node
            return True, data_dico, result
            # return result

        else:
            ###
            if self.verbose:
                print("can't compute node %s because lastX is None" % node)
            data_dico[node] = None
            # return result

    return False, data_dico, None  # None : no result yet
def _fit_transform(self, X, y=None, groups=None, method=None, fit_params=None):
    """Main method of GraphPipeline, handles the fit and predict of object.

    Walks the graph in topological order, propagating transformed blocks
    (and feature names) from node to node; the requested `method` is applied
    at the terminal node and its result returned.

    Parameters
    ----------
    X : input data
    y : target (optional)
    groups : optional group labels, forwarded to models that accept them
    method : one of fit / fit_transform / fit_predict / transform / predict /
        predict_proba / predict_log_proba / decision_function / score
    fit_params : dict of 'step__param' entries dispatched to each node
    """
    do_fit = method in ("fit", "fit_transform", "fit_predict")

    if not self._already_fitted and not do_fit:
        raise NotFittedError("Please fit the model before")

    # Split fit_params into a 'step-by-step' dictionnary
    fit_params_step = {name: {} for name in self.complete_graph.nodes}
    if fit_params is not None:
        for key, value in fit_params.items():
            step, param = key.split("__", 1)
            fit_params_step[step][param] = value

    data_dico = {}  # Will contain transformed blocks at each node
    feature_dico = {}  # Will contain the get_feature_names() of each node

    if do_fit:
        input_features = getattr(X, "columns", None)
        if input_features is not None:
            input_features = list(input_features)
        self._Xinput_features = input_features
    else:
        input_features = self._Xinput_features

    nodes_done = set()
    for node in self._nodes_order:

        nodes_done.add(node)

        if self.verbose:
            print("start processing node %s ..." % node)

        ### Debugging Help ###
        if (getattr(self, "_return_before_node", None) is not None
                and getattr(self, "_return_before_node", None) == node):
            return data_dico

        model = self._models[node]
        predecessors = list(self.complete_graph.predecessors(node))
        # Careful : here it is not necessarily always in the same order
        #### I'll use the order in which the edges were given

        # Concatenation : alphabetical order
        concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

        if len(predecessors) == 0:
            #########################
            ###  No predecessors  ###
            #########################
            if concat_at_this_node:
                lastX = X
            else:
                # no-concat nodes receive a dict of named blocks instead
                lastX = {"_data": X}
            # ==> Apply on original data
            last_features = input_features

        elif len(predecessors) == 1:
            ########################
            ###  One predecessor ###
            ########################
            # ==> Apply on data coming out of last node
            if concat_at_this_node:
                lastX = data_dico[predecessors[0]]
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}

            last_features = feature_dico[predecessors[0]]

        elif len(predecessors) > 1:
            #######################
            ###  More than one  ###
            #######################
            # ==> concat all the predecessors node and apply it

            ### Fix concatenation order ###
            if do_fit:
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors, key=lambda p: (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors
            else:
                # Reuse the order fixed at fit time
                predecessors = self._all_concat_order[node]

            all_lastX = [data_dico[predecessor] for predecessor in predecessors]
            all_last_features = [feature_dico[predecessor] for predecessor in predecessors]

            if all_last_features is None or None in all_last_features:
                last_features = None
            else:
                last_features = unlist(all_last_features)

            # all_columns_names = [try_to_find_features_names( self._models[predecessor], input_features = input_features)
            #                      for predecessor, input_features in zip(predecessors, all_last_features)]
            # for predecessor, input_features in zip(predecessors, all_last_features):
            #     try_to_find_features_names( self._models[predecessor], input_features = input_features)

            if self.verbose:
                print("start aggregation...")

            if do_fit:
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
            else:
                output_type = self._all_concat_type[node]

            if concat_at_this_node:
                lastX = generic_hstack(all_lastX, output_type=output_type, all_columns_names=all_last_features)
            else:
                lastX = {predecessor: data_dico[predecessor] for predecessor in predecessors}

        if node != self._terminal_node:
            # This is not the end of the graph
            if do_fit:
                if groups is not None and function_has_named_argument(model.fit_transform, "groups"):
                    data_dico[node] = model.fit_transform(lastX, y, groups=groups, **fit_params_step[node])
                else:
                    data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node])

                # NOTE(review): here we could skip the fit for some models given in fit_params
                # Something like :
                # if node in preffited_models:
                #     self._model[node] = preffited_models[node]
                #     model = preffited_models[node]  # + copy model into pipeline
                #     data_dico[node] = model.transform(lastX, y)
                # else:
                #     data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node])

            else:
                data_dico[node] = model.transform(lastX)

            feature_dico[node] = try_to_find_features_names(model, input_features=last_features)

        else:
            # This is the last node of the Graph : apply the requested method
            if method == "fit":
                if groups is not None and function_has_named_argument(model.fit, "groups"):
                    model.fit(lastX, y, groups, **fit_params_step[node])
                else:
                    model.fit(lastX, y, **fit_params_step[node])
                result = self

            elif method == "fit_predict":
                if groups is not None and function_has_named_argument(model.fit_predict, "groups"):
                    result = model.fit_predict(lastX, y, groups, **fit_params_step[node])
                else:
                    result = model.fit_predict(lastX, y, **fit_params_step[node])

            elif method == "fit_transform":
                if groups is not None and function_has_named_argument(model.fit_transform, "groups"):
                    result = model.fit_transform(lastX, y, groups, **fit_params_step[node])
                else:
                    result = model.fit_transform(lastX, y, **fit_params_step[node])

            elif method == "transform":
                result = model.transform(lastX)

            elif method == "predict":
                result = model.predict(lastX)

            elif method == "predict_proba":
                result = model.predict_proba(lastX)

            elif method == "predict_log_proba":
                result = model.predict_log_proba(lastX)

            elif method == "decision_function":
                result = model.decision_function(lastX)

            elif method == "score":
                result = model.score(lastX, y)

            else:
                raise ValueError("I don't know that kind of method '%s' " % method)

            feature_dico[node] = try_to_find_features_names(model, input_features=last_features)

            return result

        #######################
        #### Dico cleaning ####
        #######################
        # I'll do a step of cleaning to remove useless blocks in memory
        # I need to remove data in nodes that wont be accessed anymore
        still_usefull = set()
        for n in self.complete_graph.nodes:
            if n in nodes_done:
                continue
            p = list(self.complete_graph.predecessors(n))
            still_usefull.update(p)

        for n in data_dico.keys():
            if data_dico[n] is None:
                continue
            if n not in still_usefull:
                if self.verbose:
                    print("deleting useless node %s" % n)
                data_dico[n] = None