def inverse_transform(self, X):
    """Inverse ordinal-encode the columns in `X`

    Every column listed in ``self.categorical_columns_`` is read as integer
    category codes and replaced by a categorical column built from the
    categories / orderedness stored in ``self.dtypes_``.

    Parameters
    ----------
    X : array or dataframe
        Either the NumPy, dask, or pandas version

    Returns
    -------
    data : DataFrame
        Dask array or dataframe will return a Dask DataFrame.
        Numpy array or pandas dataframe will return a pandas DataFrame

    Notes
    -----
    The input is copied before columns are replaced. For pandas input the
    decoded columns are built on the index of `X`, so frames with a
    non-default index are decoded row for row instead of being re-aligned
    by label.
    """
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=self.columns_)

    elif isinstance(X, da.Array):
        # The divisions of the resulting dataframe are read below, so the
        # row chunk lengths have to be known. Computing them is suboptimal,
        # but I think unavoidable.
        unknown = np.isnan(X.chunks[0]).any()
        if unknown:
            lengths = blockwise(len, "i", X[:, 0], "i", dtype="i8").compute()
            X = X.copy()
            chunks = (tuple(lengths), X.chunks[1])
            X._chunks = chunks
        X = dd.from_dask_array(X, columns=self.columns_)

    big = isinstance(X, dd.DataFrame)

    if big:
        # Turn the divisions into per-partition lengths; the final division
        # is bumped by one before differencing.
        chunks = np.array(X.divisions)
        chunks[-1] = chunks[-1] + 1
        chunks = tuple(chunks[1:] - chunks[:-1])

    X = X.copy()
    for col in self.categorical_columns_:
        if _HAS_CTD:
            dtype = self.dtypes_[col]
            categories, ordered = dtype.categories, dtype.ordered
        else:
            # Without a categorical dtype object, a (categories, ordered)
            # pair is stored instead.
            categories, ordered = self.dtypes_[col]

        # use .values to avoid warning from pandas
        codes = X[col].values

        if big:
            # dask
            codes._chunks = (chunks,)
            # Need a Categorical.from_codes for dask
            series = (
                dd.from_dask_array(codes, columns=col)
                .astype("category")
                .cat.set_categories(np.arange(len(categories)), ordered=ordered)
                .cat.rename_categories(categories)
            )
            # Bug in pandas <= 0.20.3 lost name
            if series.name is None:
                series.name = col
            series.divisions = X.divisions
        else:
            # pandas
            # Reuse X's index: assigning a Series aligns on labels, so a
            # default RangeIndex would yield NaN for any other index.
            series = pd.Series(
                pd.Categorical.from_codes(codes, categories, ordered=ordered),
                index=X.index,
                name=col,
            )

        X[col] = series

    return X
def inverse_transform(self, X):
    """Inverse dummy-encode the columns in `X`

    For every column in ``self.categorical_columns_`` the matching block of
    indicator columns (``self.categorical_blocks_[col]``) is collapsed back
    into a single categorical column using the categories / orderedness
    stored in ``self.dtypes_``. The non-categorical columns are passed
    through, and the result is ordered like ``self.columns_``.

    Parameters
    ----------
    X : array or dataframe
        Either the NumPy, dask, or pandas version

    Returns
    -------
    data : DataFrame
        Dask array or dataframe will return a Dask DataFrame.
        Numpy array or pandas dataframe will return a pandas DataFrame

    Notes
    -----
    For pandas input the decoded columns are built on the index of `X`, so
    the final column-wise concat lines up row for row even when `X` has a
    non-default index.
    """
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=self.transformed_columns_)

    elif isinstance(X, da.Array):
        # later on we concat(..., axis=1), which requires
        # known divisions. Suboptimal, but I think unavoidable.
        unknown = np.isnan(X.chunks[0]).any()
        if unknown:
            lengths = blockwise(len, "i", X[:, 0], "i", dtype="i8").compute()
            X = X.copy()
            chunks = (tuple(lengths), X.chunks[1])
            X._chunks = chunks
        X = dd.from_dask_array(X, columns=self.transformed_columns_)

    big = isinstance(X, dd.DataFrame)

    if big:
        # Turn the divisions into per-partition lengths; the final division
        # is bumped by one before differencing.
        chunks = np.array(X.divisions)
        chunks[-1] = chunks[-1] + 1
        chunks = tuple(chunks[1:] - chunks[:-1])

    non_cat = X[list(self.non_categorical_columns_)]

    cats = []
    for col in self.categorical_columns_:
        slice_ = self.categorical_blocks_[col]
        if _HAS_CTD:
            dtype = self.dtypes_[col]
            categories, ordered = dtype.categories, dtype.ordered
        else:
            # Without a categorical dtype object, a (categories, ordered)
            # pair is stored instead.
            categories, ordered = self.dtypes_[col]

        # use .values to avoid warning from pandas
        cols_slice = list(X.columns[slice_])
        if big:
            inds = X[cols_slice].to_dask_array(lengths=chunks)
        else:
            inds = X[cols_slice].values
        # The position of the largest indicator in each row is its code.
        codes = inds.argmax(1)

        if self.drop_first:
            # The first category has no indicator column: shift the codes by
            # one and map rows whose indicators are all zero to code 0.
            codes += 1
            codes[(inds == 0).all(1)] = 0

        if big:
            # dask
            codes._chunks = (chunks,)
            # Need a Categorical.from_codes for dask
            series = (
                dd.from_dask_array(codes, columns=col)
                .astype("category")
                .cat.set_categories(np.arange(len(categories)), ordered=ordered)
                .cat.rename_categories(categories)
            )
            # Bug in pandas <= 0.20.3 lost name
            if series.name is None:
                series.name = col
            series.divisions = X.divisions
        else:
            # pandas
            # Reuse X's index: concat(axis=1) aligns on labels, so a default
            # RangeIndex would not line up with `non_cat` for other indexes.
            series = pd.Series(
                pd.Categorical.from_codes(codes, categories, ordered=ordered),
                index=X.index,
                name=col,
            )

        cats.append(series)

    if big:
        df = dd.concat([non_cat] + cats, axis=1)[list(self.columns_)]
    else:
        df = pd.concat([non_cat] + cats, axis=1)[self.columns_]
    return df