def fit(self, data):
    """Learn how null values must be handled for the given data.

    A fresh ``NullTransformer`` is built from ``self.nan`` and
    ``self.null_column``, stored in ``self.null_transformer`` and fitted.

    Args:
        data (pandas.Series or numpy.ndarray):
            Data to fit to. Arrays are wrapped in a ``pandas.Series`` first.
    """
    series = pd.Series(data) if isinstance(data, np.ndarray) else data

    self.null_transformer = NullTransformer(self.nan, self.null_column)
    self.null_transformer.fit(series)
def fit(self, data):
    """Learn the null fill value and fit the ``NullTransformer``.

    The data is converted with ``self._transform`` and, depending on
    ``self.nan``, the mean or the mode of the converted values is used as
    the fill value. Any other ``self.nan`` value is used as is.

    Args:
        data (pandas.Series or numpy.ndarray):
            Data to fit the transformer to. Arrays are wrapped in a
            ``pandas.Series`` first.
    """
    series = pd.Series(data) if isinstance(data, np.ndarray) else data

    # The conversion always runs, even when the fill value does not depend on it.
    numeric = self._transform(series)

    fill_value = self.nan
    if self.nan == 'mean':
        fill_value = numeric.mean()
    elif self.nan == 'mode':
        fill_value = numeric.mode(dropna=True)[0]

    self.null_transformer = NullTransformer(fill_value, self.null_column)
    # The null transformer is fitted on the original (non converted) data.
    self.null_transformer.fit(series)
def fit(self, data):
    """Fit the transformer to the data.

    Learns the output dtype, the optional minimum and maximum bounds, the
    number of rounding digits and the null handling.

    Args:
        data (pandas.Series or numpy.ndarray):
            Data to fit. Arrays are wrapped in a ``pandas.Series`` first.
    """
    if isinstance(data, np.ndarray):
        data = pd.Series(data)

    # An explicitly configured dtype wins over the dtype of the fit data.
    self._dtype = self.dtype or data.dtype
    self._min_value = data.min() if self.min_value == 'auto' else self.min_value
    self._max_value = data.max() if self.max_value == 'auto' else self.max_value

    if self.rounding == 'auto':
        self._rounding_digits = self._learn_rounding_digits(data)
    elif isinstance(self.rounding, int):
        self._rounding_digits = self.rounding
    else:
        # Reset the learned value so that refitting after ``rounding`` was
        # changed does not keep the digits from a previous fit.
        self._rounding_digits = None

    self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
    self.null_transformer.fit(data)
class BooleanTransformer(BaseTransformer):
    """Transformer for boolean data.

    This transformer replaces boolean values with their integer representation
    transformed to float.

    Null values are replaced using a ``NullTransformer``.

    Args:
        nan (int or None):
            Replace null values with the given value. If ``None``, do not replace them.
            Defaults to ``-1``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the fit data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
    """

    null_transformer = None

    def __init__(self, nan=-1, null_column=None):
        self.nan = nan
        self.null_column = null_column

    def fit(self, data):
        """Fit the transformer to the data.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit to.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        self.null_transformer = NullTransformer(self.nan, self.null_column)
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform boolean to float.

        The boolean values will be replaced by the corresponding integer
        representations as float values. The input is left unmodified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Work on a copy: the assignment below would otherwise overwrite the
        # caller's Series (or the array wrapped by ``pd.Series`` above).
        data = data.copy()
        data.loc[data.notnull()] = data.dropna().astype(int)

        return self.null_transformer.transform(data).astype(float)

    def reverse_transform(self, data):
        """Transform float values back to the original boolean values.

        The input is left unmodified.

        Args:
            data (numpy.ndarray):
                Data to revert.

        Returns:
            pandas.Series
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Copy before assigning so that the values passed in are not overwritten.
        data = data.copy()
        not_null = pd.notnull(data)
        # Round first so that values such as 0.4 or 0.6 map to False / True.
        data[not_null] = np.round(data[not_null]).astype(bool)

        return data
class DatetimeTransformer(BaseTransformer):
    """Transformer for datetime data.

    This transformer replaces datetime values with an integer timestamp
    transformed to float.

    Null values are replaced using a ``NullTransformer``.

    Args:
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
        strip_constant (bool):
            Whether to optimize the output values by finding the smallest time unit that
            is not zero on the training datetimes and dividing the generated numerical
            values by the value of the next smallest time unit. This, apart from reducing the
            orders of magnitude of the transformed values, ensures that reverted values always
            are zero on the lower time units.
            Defaults to ``False``.
    """

    null_transformer = None
    divider = None

    def __init__(self, nan='mean', null_column=None, strip_constant=False):
        self.nan = nan
        self.null_column = null_column
        self.strip_constant = strip_constant

    def _find_divider(self, transformed):
        """Find the largest unit that divides all the values and store it in ``divider``.

        The candidates grow by a factor of 10 nine times and then by 60, 60 and 24.
        The search stops at the first candidate that leaves a non zero remainder.
        """
        self.divider = 1
        multipliers = [10] * 9 + [60, 60, 24]
        for multiplier in multipliers:
            candidate = self.divider * multiplier
            if np.mod(transformed, candidate).any():
                break

            self.divider = candidate

    def _transform(self, datetimes):
        """Transform datetime values to integer timestamps stored as floats.

        Null values are kept as ``NaN``. When ``strip_constant`` is set, the
        values are divided by ``self.divider``. The divider is only learned
        here if none has been set yet, so values transformed after ``fit``
        share the scale that ``reverse_transform`` undoes.
        """
        nulls = datetimes.isnull()
        integers = np.zeros(len(datetimes))
        # Only the non null values are cast to integers.
        integers[~nulls] = datetimes[~nulls].astype(np.int64).astype(np.float64).values
        integers[nulls] = np.nan

        transformed = pd.Series(integers)

        if self.strip_constant:
            if self.divider is None:
                self._find_divider(transformed)

            transformed = transformed.floordiv(self.divider)

        return transformed

    def fit(self, data):
        """Fit the transformer to the data.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit the transformer to.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Forget any divider from a previous fit so that ``_transform``
        # learns it again from this data.
        self.divider = None
        transformed = self._transform(data)
        self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
        self.null_transformer.fit(transformed)

    def transform(self, data):
        """Transform datetime values to float values.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        data = self._transform(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert float values back to datetimes.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            pandas.Series or pandas.DatetimeIndex:
                Result of ``pandas.to_datetime`` on the reverted values.
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        if isinstance(data, np.ndarray) and (data.ndim == 2):
            # Keep only the first column of a two dimensional input.
            data = data[:, 0]

        not_null = pd.notnull(data)
        data[not_null] = np.round(data[not_null]).astype(np.int64)
        if self.strip_constant:
            # Undo the division applied in ``_transform``.
            data = data.astype(float) * self.divider

        return pd.to_datetime(data)
class NumericalTransformer(BaseTransformer):
    """Transformer for numerical data.

    Integer values are converted to their float equivalent, non null float
    values are passed through untouched and null values are handled by a
    ``NullTransformer``.

    Args:
        dtype (data type):
            Data type of the data to transform. It is used when reversing the
            transformation. If not provided, the dtype of the fit data is used.
            Defaults to ``None``.
        nan (int, str or None):
            What to do with the null values. An integer replaces them with that
            value, the strings ``'mean'`` or ``'mode'`` replace them with the
            corresponding aggregation and ``None`` leaves them as they are.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column indicating which values were null.
            If ``None``, create it only when the data contains null values.
            If ``True``, always create it. If ``False``, never create it.
            Defaults to ``None``.
    """

    null_transformer = None

    def __init__(self, dtype=None, nan='mean', null_column=None):
        self.nan = nan
        self.null_column = null_column
        self.dtype = dtype
        self._dtype = dtype

    def fit(self, data):
        """Learn the output dtype and the null handling from the data.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit.
        """
        series = pd.Series(data) if isinstance(data, np.ndarray) else data

        # A configured dtype takes precedence over the dtype of the data.
        self._dtype = self.dtype if self.dtype else series.dtype
        self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
        self.null_transformer.fit(series)

    def transform(self, data):
        """Transform numerical data.

        Integer values are replaced by their float equivalent. Non null float
        values are left unmodified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        series = pd.Series(data) if isinstance(data, np.ndarray) else data
        return self.null_transformer.transform(series)

    def reverse_transform(self, data):
        """Convert data back into the original format.

        Args:
            data (numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        reverted = data
        if self.nan is not None:
            reverted = self.null_transformer.reverse_transform(reverted)

        if np.dtype(self._dtype).kind != 'i':
            return reverted.astype(self._dtype)

        present = pd.notnull(reverted)
        if present.all():
            return reverted.round().astype(self._dtype)

        # Null values cannot be cast to an integer dtype, so only the non
        # null positions are rounded, in place, and the container is returned.
        reverted[present] = np.round(reverted[present]).astype(self._dtype)
        return reverted
class DatetimeTransformer(BaseTransformer):
    """Transformer for datetime data.

    This transformer replaces datetime values with an integer timestamp
    transformed to float.

    Null values are replaced using a ``NullTransformer``.

    Args:
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
    """

    null_transformer = None

    def __init__(self, nan='mean', null_column=None):
        self.nan = nan
        self.null_column = null_column

    @staticmethod
    def _transform(datetimes):
        """Transform datetime values to integer timestamps stored as floats.

        Null values are kept as ``NaN``.
        """
        nulls = datetimes.isnull()
        integers = np.zeros(len(datetimes))
        # Cast only the non null values, and always through a 64 bit integer:
        # the plain ``int`` dtype is 32 bits wide on some platforms, which is
        # too narrow for timestamps.
        integers[~nulls] = datetimes[~nulls].astype(np.int64).astype(np.float64).values
        integers[nulls] = np.nan

        return pd.Series(integers)

    def fit(self, data):
        """Fit the transformer to the data.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit the transformer to.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        transformed = self._transform(data)

        # The aggregations are computed over the converted values.
        if self.nan == 'mean':
            fill_value = transformed.mean()
        elif self.nan == 'mode':
            fill_value = transformed.mode(dropna=True)[0]
        else:
            fill_value = self.nan

        self.null_transformer = NullTransformer(fill_value, self.null_column)
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform datetime values to float values.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        data = self._transform(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert float values back to datetimes.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            pandas.Series or pandas.DatetimeIndex:
                Result of ``pandas.to_datetime`` on the reverted values.
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        not_null = pd.notnull(data)
        # 64 bit integers are required to hold the rounded timestamps.
        data[not_null] = np.round(data[not_null]).astype(np.int64)

        return pd.to_datetime(data)
class NumericalTransformer(BaseTransformer):
    """Transformer for numerical data.

    This transformer replaces integer values with their float equivalent.
    Non null float values are not modified.

    Null values are replaced using a ``NullTransformer``.

    Args:
        dtype (data type):
            Data type of the data to transform. It will be used when reversing the
            transformation. If not provided, the dtype of the fit data will be used.
            Defaults to ``None``.
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
        rounding (int, str or None):
            Define rounding scheme for data. If set to an int, values will be rounded
            to that number of decimal places. If ``None``, values will not be rounded.
            If set to ``'auto'``, the transformer will round to the maximum number of
            decimal places detected in the fitted data.
            Defaults to ``None``.
        min_value (int, str or None):
            Indicate whether or not to set a minimum value for the data. If an integer is given,
            reverse transformed data will be greater than or equal to it. If the string ``'auto'``
            is given, the minimum will be the minimum value seen in the fitted data. If ``None``
            is given, there won't be a minimum.
            Defaults to ``None``.
        max_value (int, str or None):
            Indicate whether or not to set a maximum value for the data. If an integer is given,
            reverse transformed data will be less than or equal to it. If the string ``'auto'``
            is given, the maximum will be the maximum value seen in the fitted data. If ``None``
            is given, there won't be a maximum.
            Defaults to ``None``.
    """

    null_transformer = None
    nan = None
    _dtype = None
    _rounding_digits = None
    _min_value = None
    _max_value = None

    def __init__(self, dtype=None, nan='mean', null_column=None, rounding=None,
                 min_value=None, max_value=None):
        self.nan = nan
        self.null_column = null_column
        self.dtype = dtype
        self.rounding = rounding
        self.min_value = min_value
        self.max_value = max_value

    @staticmethod
    def _learn_rounding_digits(data):
        """Find the number of digits that the data can be rounded to without changing it.

        Null and infinite values are ignored.

        Returns:
            int or None:
                Number of decimal places, zero or negative when all the values are whole
                numbers. ``None`` if there are no finite values or if the values need more
                than ``MAX_DECIMALS`` decimal places.
        """
        roundable_data = data[~(np.isinf(data) | pd.isnull(data))]
        if len(roundable_data) == 0:
            # Nothing to learn from: all the values are null or infinite.
            return None

        # check if data has any decimals
        if (roundable_data % 1 != 0).any():
            if not (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
                return None

            for decimal in range(MAX_DECIMALS + 1):
                if (roundable_data == roundable_data.round(decimal)).all():
                    return decimal

        else:
            maximum = np.abs(roundable_data).max()
            start = int(np.log10(maximum)) if maximum != 0 else 0
            # Try the coarsest rounding (largest power of ten) first.
            for decimal in range(-start, 1):
                if (roundable_data == roundable_data.round(decimal)).all():
                    return decimal

        return None

    def fit(self, data):
        """Fit the transformer to the data.

        Learns the output dtype, the optional minimum and maximum bounds, the
        number of rounding digits and the null handling.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        self._dtype = self.dtype or data.dtype
        self._min_value = data.min() if self.min_value == 'auto' else self.min_value
        self._max_value = data.max() if self.max_value == 'auto' else self.max_value

        if self.rounding == 'auto':
            self._rounding_digits = self._learn_rounding_digits(data)
        elif isinstance(self.rounding, int):
            self._rounding_digits = self.rounding
        else:
            # Reset the learned value so that refitting after ``rounding`` was
            # changed does not keep the digits from a previous fit.
            self._rounding_digits = None

        self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform numerical data.

        Integer values are replaced by their float equivalent. Non null float values
        are left unmodified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert data back into the original format.

        The values are clipped to the learned bounds, nulls are restored,
        and the result is rounded and cast to the learned dtype.

        Args:
            data (numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if self._min_value is not None or self._max_value is not None:
            if len(data.shape) > 1:
                # Only the first column is clipped when the input has two dimensions.
                data[:, 0] = data[:, 0].clip(self._min_value, self._max_value)
            else:
                data = data.clip(self._min_value, self._max_value)

        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        is_integer = np.dtype(self._dtype).kind == 'i'
        if self._rounding_digits is not None or is_integer:
            data = data.round(self._rounding_digits or 0)

        if pd.isnull(data).any() and is_integer:
            # Null values cannot be represented in an integer dtype.
            return data

        return data.astype(self._dtype)