def fit(self, X, y=None):
    """Fit the encoder on X.

    Learns, for every column of X, the mapping modality -> integer index
    (via ``self.modalities_filter``) and pre-computes the output feature
    names for the chosen ``self.encoding_type``.

    Parameters
    ----------
    X : pd.DataFrame
        data to encode; every column will be encoded.
    y : ignored
        present for scikit-learn API compatibility.

    Returns
    -------
    self

    Raises
    ------
    TypeError
        if X is not a DataFrame.
    ValueError
        if a column to encode is missing from X.
    NotImplementedError
        if ``self.encoding_type`` is neither "num" nor "dummy".
    """
    Xtype = get_type(X)
    if Xtype != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    Xcolumns = list(X.columns)
    self._columns_to_encode = Xcolumns  # Force to encode everything now

    X = get_rid_of_categories(X)

    # Verif:
    if not isinstance(self._columns_to_encode, list):
        raise TypeError("_columns_to_encode should be a list")

    for c in self._columns_to_encode:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    # modality -> integer index, per column
    self.variable_modality_mapping = {col: self.modalities_filter(X[col]) for col in self._columns_to_encode}

    # NOTE: if we don't want the encoding to follow increasing order of the
    # modalities, we could randomize the numbers here.
    if self.encoding_type == "num":
        # one output column per input column, same names
        self._feature_names = self._columns_to_encode
        self.columns_mapping = {c: [c] for c in self._feature_names}

    elif self.encoding_type == "dummy":
        # one output column per (column, modality) pair, named "col__mod"
        self.columns_mapping = {}

        index_column = {}        # global dummy-column index -> output name
        self._variable_shift = {}  # column -> offset of its first dummy index
        cum_max = 0
        for col in self._columns_to_encode:
            self.columns_mapping[col] = []

            mapping = self.variable_modality_mapping[col]
            for mod, ind in mapping.items():
                index_column[ind + cum_max] = col + "__" + str(mod)
                self.columns_mapping[col].append(col + "__" + str(mod))

            self._variable_shift[col] = cum_max
            # BUGFIX: was `cum_max += i + 1` with `i` taken from the inner
            # enumerate loop, which raises NameError (or reuses a stale `i`)
            # when a column has zero modalities. len(mapping) is equivalent
            # for non-empty mappings and correct for empty ones.
            cum_max += len(mapping)

        self._dummy_size = cum_max
        self._dummy_feature_names = [index_column[i] for i in range(cum_max)]
        self._feature_names = self._dummy_feature_names

    else:
        raise NotImplementedError("I don't know that type of encoding %s" % self.encoding_type)

    return self
def transform(self, X):
    """Encode X using the target aggregates learned during fit.

    Parameters
    ----------
    X : pd.DataFrame
        data to transform.

    Returns
    -------
    pd.DataFrame with one column per fitted feature name.

    Raises
    ------
    TypeError
        if X is not a DataFrame.
    """
    if get_type(X) != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    encoded = self._transform_aggregat(
        get_rid_of_categories(X),
        self._target_aggregat,
        self._target_aggregat_global,
    )
    # sanity check: output width must match the declared feature names
    assert encoded.shape[1] == len(self.get_feature_names())

    return encoded
def transform(self, X):
    """Apply the fitted encoding to X.

    Parameters
    ----------
    X : pd.DataFrame
        data to transform.

    Returns
    -------
    transformed data, as produced by ``self._transform_to_encode``.

    Raises
    ------
    TypeError
        if X is not a DataFrame.
    """
    if get_type(X) != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    return self._transform_to_encode(get_rid_of_categories(X))
def test_get_rid_of_categories():
    """get_rid_of_categories should strip 'category' dtypes and nothing else."""
    base = get_sample_df()

    # no categorical column -> the very same object is returned
    assert get_rid_of_categories(base) is base

    # text column cast to category -> comes back as plain object dtype
    with_cat = base.copy()
    with_cat["text_col"] = with_cat["text_col"].astype("category")
    assert (with_cat.dtypes == "category").any()

    cleaned = get_rid_of_categories(with_cat)
    assert not (cleaned.dtypes == "category").any()
    assert cleaned["text_col"].dtype == "object"
    assert (cleaned["text_col"] == with_cat["text_col"]).all()

    # int column cast to category -> original dtypes are fully restored
    with_cat = base.copy()
    with_cat["int_col"] = with_cat["int_col"].astype("category")

    cleaned = get_rid_of_categories(with_cat)
    assert not (cleaned.dtypes == "category").any()
    assert (cleaned.dtypes == base.dtypes).all()
def fit_transform(self, X, y):
    """Fit the target encoder and return the (out-of-fold) encoded X.

    When ``self.cv`` is None the aggregates are fitted on the whole data
    (with ``self.noise_level`` noise) and applied directly. Otherwise the
    encoding is computed out-of-fold: for each CV split the aggregates are
    fitted on the train part and applied to the test part, then the pieces
    are concatenated and re-aligned on X's index.

    Parameters
    ----------
    X : pd.DataFrame
        data to encode.
    y : array-like or pd.Series
        target; required.

    Returns
    -------
    pd.DataFrame
        encoded data, same index as X, one column per feature name.

    Raises
    ------
    ValueError
        if y is None.
    """
    if y is None:
        raise ValueError("I need a value for 'y'")

    # normalize the target to a Series once, and use it everywhere below
    if not isinstance(y, pd.Series):
        sy = pd.Series(y)
    else:
        sy = y

    self.fit(X, sy)
    X = get_rid_of_categories(X)

    if self.cv is None:
        # No Cross Validation ...
        # CONSISTENCY FIX: pass `sy` (not the raw `y`) so _fit_aggregat
        # always receives a pd.Series, as in `fit` and the CV branch below.
        target_aggregat, target_aggregat_global = self._fit_aggregat(X, sy, noise_level=self.noise_level)

        all_results = self._transform_aggregat(X, target_aggregat, target_aggregat_global)
    else:
        cv = create_cv(self.cv, y=sy, classifier=not self.is_regression, random_state=123)
        all_results = []
        for train, test in cv.split(X, sy):
            # fit aggregates on the train fold only ...
            target_aggregat, target_aggregat_global = self._fit_aggregat(
                X.iloc[train, :], sy.iloc[train], noise_level=self.noise_level
            )
            # ... and encode the held-out fold, to avoid target leakage
            sub_result = self._transform_aggregat(X.iloc[test, :], target_aggregat, target_aggregat_global)
            all_results.append(sub_result)

        all_results = pd.concat(all_results, axis=0)
        # restore the original row order (folds come back shuffled)
        all_results = all_results.loc[X.index, :]

    assert len(all_results) == len(X)
    assert (all_results.index == X.index).all()
    assert all_results.shape[1] == len(self.get_feature_names())

    return all_results
def fit(self, X, y):
    """Fit the target encoder on X, y.

    Stores everything needed by ``transform``: the target statistics
    (``_target_aggregat`` / ``_target_aggregat_global``), per-column NaN
    handling flags (``_na_to_null``), and the output feature names.

    Parameters
    ----------
    X : pd.DataFrame
        data to encode; every column of X is encoded.
    y : array-like or pd.Series
        target; required.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        if y is None, or a column to encode is missing from X.
    TypeError
        if X is not a DataFrame.
    """
    if y is None:
        raise ValueError("I need a value for 'y'")

    # RNG used later when noise is added to the aggregates
    self._random_gen = check_random_state(self.random_state)

    Xtype = get_type(X)
    if Xtype != DataTypes.DataFrame:
        raise TypeError("X should be a DataFrame")

    Xcolumns = list(X.columns)

    # normalize the target to a Series
    if not isinstance(y, pd.Series):
        sy = pd.Series(y)
    else:
        sy = y

    # Columns to encode and to keep
    self._columns_to_encode = list(X.columns)

    X = get_rid_of_categories(X)

    # Verif:
    if not isinstance(self._columns_to_encode, list):
        raise TypeError("_columns_to_encode should be a list")

    for c in self._columns_to_encode:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    # NOTE(review): _columns_to_keep is always empty here, so the checks
    # below never trigger — kept as-is, presumably for symmetry / future use.
    self._columns_to_keep = []

    # Verif:
    if not isinstance(self._columns_to_keep, list):
        raise TypeError("_columns_to_keep should be a list")

    for c in self._columns_to_keep:
        if c not in Xcolumns:
            raise ValueError("column %s isn't in the DataFrame" % c)

    # Target information
    if self.is_regression:
        self.target_classes = None  # No target classes for Regressor
        # global spread of the target (np.std => population std, ddof=0)
        self.global_std = np.std(sy)
    else:
        # For classification I need to store it
        self.global_std = None
        self.target_classes = list(np.unique(sy))
        # binary target: keep only the positive class (one output column)
        if len(self.target_classes) == 2:
            self.target_classes = self.target_classes[1:]

    # Columns on which we want None to be a special modality
    # (True when the NaN share reaches max_na_percentage)
    self._na_to_null = dict()
    for col in self._columns_to_encode:
        ii_null = X[col].isnull()
        self._na_to_null[col] = ii_null.sum() >= self.max_na_percentage * len(X)

    # fit the per-modality target aggregates on the full data, without noise
    self._target_aggregat, self._target_aggregat_global = self._fit_aggregat(X, sy, noise_level=None)

    # Features names
    self._feature_names = [c for c in self._columns_to_keep]  # copy
    for col in self._columns_to_encode:
        self._feature_names += self._get_output_column_name(col=col, target_classes=self.target_classes)
        # self._feature_names += ["%s__target_%s" % (col,str(t)) for t in self.target_classes]

    return self