示例#1
0
文件: boolean.py 项目: pythiac/RDT
class BooleanTransformer(BaseTransformer):
    """Transformer for boolean data.

    This transformer replaces boolean values with their integer representation
    transformed to float.

    Null values are replaced using a ``NullTransformer``.

    Args:
        nan (int or None):
            Replace null values with the given value. If ``None``, do not replace them.
            Defaults to ``-1``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the fit data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
    """

    # Created in ``fit``; stays ``None`` until the transformer has been fitted.
    null_transformer = None

    def __init__(self, nan=-1, null_column=None):
        self.nan = nan
        self.null_column = null_column

    def fit(self, data):
        """Fit the transformer to the data.

        Builds a ``NullTransformer`` from ``nan`` and ``null_column`` and fits
        it on the given data.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit to.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        self.null_transformer = NullTransformer(self.nan, self.null_column)
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform boolean to float.

        The boolean values will be replaced by the corresponding integer
        representations as float values. The input is not modified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray:
                Output of the fitted ``NullTransformer`` cast to float.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Work on a copy: the masked assignment below would otherwise overwrite
        # the caller's Series (or the array backing it) in place.
        data = data.copy()
        data.loc[data.notnull()] = data.dropna().astype(int)

        return self.null_transformer.transform(data).astype(float)

    def reverse_transform(self, data):
        """Transform float values back to the original boolean values.

        Non-null values are rounded to the nearest integer and cast to bool;
        null values are left as they are. The input is not modified.

        Args:
            data (numpy.ndarray):
                Data to revert.

        Returns:
            pandas.Series
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Copy before the masked write so that, when ``nan`` is ``None`` and the
        # caller's own Series reaches this point, it is left untouched.
        data = data.copy()
        not_null = pd.notnull(data)
        data[not_null] = np.round(data[not_null]).astype(bool)
        return data
示例#2
0
文件: datetime.py 项目: sbrugman/RDT
class DatetimeTransformer(BaseTransformer):
    """Transformer for datetime data.

    This transformer replaces datetime values with an integer timestamp
    transformed to float.

    Null values are replaced using a ``NullTransformer``.

    Args:
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
        strip_constant (bool):
            Whether to optimize the output values by finding the smallest time unit that
            is not zero on the training datetimes and dividing the generated numerical
            values by the value of the next smallest time unit. This, apart from reducing the
            orders of magnitude of the transformed values, ensures that reverted values always
            are zero on the lower time units.
            Defaults to ``False``.
    """

    # Created in ``fit``; stays ``None`` until the transformer has been fitted.
    null_transformer = None
    # Learned during ``fit`` when ``strip_constant`` is enabled.
    divider = None

    def __init__(self, nan='mean', null_column=None, strip_constant=False):
        self.nan = nan
        self.null_column = null_column
        self.strip_constant = strip_constant

    def _find_divider(self, transformed):
        """Store in ``self.divider`` the largest unit that divides all values.

        Starting from 1, the divider is multiplied by each factor in turn and
        kept only while every value is still an exact multiple of it.

        Args:
            transformed (pandas.Series):
                Numerical timestamps, possibly containing NaN.
        """
        self.divider = 1
        # Nine factors of 10 followed by 60, 60 and 24; presumably ns -> s ->
        # min -> h -> day for nanosecond timestamps -- TODO confirm the unit.
        multipliers = [10] * 9 + [60, 60, 24]
        for multiplier in multipliers:
            candidate = self.divider * multiplier
            if np.mod(transformed, candidate).any():
                break

            self.divider = candidate

    def _transform(self, datetimes):
        """Transform datetime values to float timestamps.

        Null entries become NaN. When ``strip_constant`` is enabled the values
        are floor-divided by ``self.divider``; the divider is only learned here
        if none is set yet, which is the case while fitting.

        Args:
            datetimes (pandas.Series):
                Datetime values to convert.

        Returns:
            pandas.Series
        """
        nulls = datetimes.isnull()
        integers = np.zeros(len(datetimes))
        # Only the non-null entries are cast to integer.
        integers[~nulls] = datetimes[~nulls].astype(np.int64).astype(np.float64).values
        integers[nulls] = np.nan

        transformed = pd.Series(integers)
        if self.strip_constant:
            # Re-learning the divider on every call would let ``transform`` on
            # new data change the scale that the fitted null transformer and
            # ``reverse_transform`` rely on, so reuse the fitted one.
            if self.divider is None:
                self._find_divider(transformed)

            transformed = transformed.floordiv(self.divider)

        return transformed

    def fit(self, data):
        """Fit the transformer to the data.

        Learns the divider (if ``strip_constant`` is enabled) and fits the
        ``NullTransformer`` on the numerical representation of the data.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit the transformer to.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Forget the divider of any previous fit so it is learned from this data.
        self.divider = None
        transformed = self._transform(data)
        self.null_transformer = NullTransformer(self.nan,
                                                self.null_column,
                                                copy=True)
        self.null_transformer.fit(transformed)

    def transform(self, data):
        """Transform datetime values to float values.

        Uses the divider learned during ``fit`` when ``strip_constant`` is
        enabled.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray:
                Output of the fitted ``NullTransformer``.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        data = self._transform(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert float values back to datetimes.

        The input is not modified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            Datetime values as produced by ``pandas.to_datetime``.
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        # For a 2-D array only the first column is used from here on.
        if isinstance(data, np.ndarray) and (data.ndim == 2):
            data = data[:, 0]

        # Copy before the masked write; ``data`` may still be (a view of) the
        # caller's array at this point.
        data = data.copy()
        not_null = pd.notnull(data)
        data[not_null] = np.round(data[not_null]).astype(np.int64)
        if self.strip_constant:
            # Undo the floor division applied in ``_transform``.
            data = data.astype(float) * self.divider

        return pd.to_datetime(data)
示例#3
0
文件: datetime.py 项目: pythiac/RDT
class DatetimeTransformer(BaseTransformer):
    """Transformer for datetime data.

    This transformer replaces datetime values with an integer timestamp
    transformed to float.

    Null values are replaced using a ``NullTransformer``.

    Args:
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
    """

    # Created in ``fit``; stays ``None`` until the transformer has been fitted.
    null_transformer = None

    def __init__(self, nan='mean', null_column=None):
        self.nan = nan
        self.null_column = null_column

    @staticmethod
    def _transform(datetimes):
        """Transform datetime values to float timestamps.

        Null entries become NaN in the output.

        Args:
            datetimes (pandas.Series):
                Datetime values to convert.

        Returns:
            pandas.Series:
                Float values with a fresh default index.
        """
        not_null = datetimes.notnull().values
        integers = np.full(len(datetimes), np.nan)
        # Cast only the non-null entries, and through an explicit 64-bit type:
        # the builtin ``int`` maps to a platform-dependent NumPy width, and
        # casting null datetimes to integer is not something to rely on.
        integers[not_null] = datetimes[not_null].astype(np.int64).astype(np.float64).values

        return pd.Series(integers)

    def fit(self, data):
        """Fit the transformer to the data.

        The fill value for nulls is computed here from the numerical
        representation of the data (``'mean'`` or ``'mode'``), or taken
        directly from ``nan`` otherwise.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit the transformer to.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        transformed = self._transform(data)

        if self.nan == 'mean':
            fill_value = transformed.mean()
        elif self.nan == 'mode':
            fill_value = transformed.mode(dropna=True)[0]
        else:
            fill_value = self.nan

        self.null_transformer = NullTransformer(fill_value, self.null_column)
        # NOTE(review): fitted on the raw datetimes rather than on
        # ``transformed``; presumably only null presence matters since the
        # fill value is already resolved -- confirm against NullTransformer.
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform datetime values to float values.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray:
                Output of the fitted ``NullTransformer``.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        data = self._transform(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert float values back to datetimes.

        Non-null values are rounded to whole timestamps before conversion.
        The input is not modified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            Datetime values as produced by ``pandas.to_datetime``.
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        # Copy before the masked write; when ``nan`` is ``None`` this is still
        # the caller's own object.
        data = data.copy()
        not_null = pd.notnull(data)
        data[not_null] = np.round(data[not_null]).astype(np.int64)
        return pd.to_datetime(data)
示例#4
0
class NumericalTransformer(BaseTransformer):
    """Transformer for numerical data.

    This transformer replaces integer values with their float equivalent.
    Non null float values are not modified.

    Null values are replaced using a ``NullTransformer``.

    Args:
        dtype (data type):
            Data type of the data to transform. It will be used when reversing the
            transformation. If not provided, the dtype of the fit data will be used.
            Defaults to ``None``.
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
    """

    # Created in ``fit``; stays ``None`` until the transformer has been fitted.
    null_transformer = None

    def __init__(self, dtype=None, nan='mean', null_column=None):
        self.nan = nan
        self.null_column = null_column
        self.dtype = dtype
        # Effective dtype used by ``reverse_transform``; refined in ``fit``.
        self._dtype = dtype

    def fit(self, data):
        """Fit the transformer to the data.

        Resolves the dtype used when reversing and fits the
        ``NullTransformer``.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Explicit ``None`` check so a user-supplied dtype is never discarded
        # just because the object happens to evaluate as falsy.
        self._dtype = self.dtype if self.dtype is not None else data.dtype
        self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform numerical data.

        Integer values are replaced by their float equivalent. Non null float values
        are left unmodified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray:
                Output of the fitted ``NullTransformer``.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert data back into the original format.

        For signed integer dtypes the values are rounded; if nulls remain the
        data cannot be cast to the integer dtype, so it is returned with the
        non-null values rounded but its dtype unchanged. The input is not
        modified.

        Args:
            data (numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        if np.dtype(self._dtype).kind == 'i':
            not_null = pd.notnull(data)
            if not_null.all():
                return data.round().astype(self._dtype)

            # Copy before the masked write; when ``nan`` is ``None`` this is
            # still the caller's own object.
            data = data.copy()
            data[not_null] = np.round(data[not_null]).astype(self._dtype)
            return data

        return data.astype(self._dtype)
示例#5
0
文件: numerical.py 项目: sdv-dev/RDT
class NumericalTransformer(BaseTransformer):
    """Transformer for numerical data.

    This transformer replaces integer values with their float equivalent.
    Non null float values are not modified.

    Null values are replaced using a ``NullTransformer``.

    Args:
        dtype (data type):
            Data type of the data to transform. It will be used when reversing the
            transformation. If not provided, the dtype of the fit data will be used.
            Defaults to ``None``.
        nan (int, str or None):
            Indicate what to do with the null values. If an integer is given, replace them
            with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
            them with the corresponding aggregation. If ``None`` is given, do not replace them.
            Defaults to ``'mean'``.
        null_column (bool):
            Whether to create a new column to indicate which values were null or not.
            If ``None``, only create a new column when the data contains null values.
            If ``True``, always create the new column whether there are null values or not.
            If ``False``, do not create the new column.
            Defaults to ``None``.
        rounding (int, str or None):
            Define rounding scheme for data. If set to an int, values will be rounded
            to that number of decimal places. If ``None``, values will not be rounded.
            If set to ``'auto'``, the transformer will round to the maximum number of
            decimal places detected in the fitted data.
        min_value (int, str or None):
            Indicate whether or not to set a minimum value for the data. If an integer is given,
            reverse transformed data will be greater than or equal to it. If the string ``'auto'``
            is given, the minimum will be the minimum value seen in the fitted data. If ``None``
            is given, there won't be a minimum.
        max_value (int, str or None):
            Indicate whether or not to set a maximum value for the data. If an integer is given,
            reverse transformed data will be less than or equal to it. If the string ``'auto'``
            is given, the maximum will be the maximum value seen in the fitted data. If ``None``
            is given, there won't be a maximum.
    """

    # Fitted state; everything below is resolved in ``fit``.
    null_transformer = None
    nan = None
    _dtype = None
    _rounding_digits = None
    _min_value = None
    _max_value = None

    def __init__(self, dtype=None, nan='mean', null_column=None, rounding=None,
                 min_value=None, max_value=None):
        self.nan = nan
        self.null_column = null_column
        self.dtype = dtype
        self.rounding = rounding
        self.min_value = min_value
        self.max_value = max_value

    @staticmethod
    def _learn_rounding_digits(data):
        """Find the number of digits the data can be rounded to without loss.

        Null and infinite values are ignored.

        Args:
            data (pandas.Series):
                Data to inspect.

        Returns:
            int or None:
                For data with decimals, the smallest number of decimal places
                (up to ``MAX_DECIMALS``) that reproduces every value. For
                whole-number data, the first value in ``-start..0`` (zero or
                negative) at which rounding leaves every value unchanged.
                ``None`` if no such number exists or there are no finite
                values to learn from.
        """
        roundable_data = data[~(np.isinf(data) | pd.isnull(data))]

        # check if data has any decimals
        if (roundable_data % 1 != 0).any():
            # Give up if even the maximum supported precision loses information.
            if not (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
                return None

            for decimal in range(MAX_DECIMALS + 1):
                if (roundable_data == roundable_data.round(decimal)).all():
                    return decimal

        # Guard against empty input: ``max`` of an empty sequence would raise.
        elif len(roundable_data) > 0:
            maximum = max(abs(roundable_data))
            # Order of magnitude of the largest value bounds the search.
            start = int(np.log10(maximum)) if maximum != 0 else 0
            for decimal in range(-start, 1):
                if (roundable_data == roundable_data.round(decimal)).all():
                    return decimal

        return None

    def fit(self, data):
        """Fit the transformer to the data.

        Resolves the dtype, the min/max bounds and the rounding digits, and
        fits the ``NullTransformer``.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to fit.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Explicit ``None`` check so a user-supplied dtype is never discarded
        # just because the object happens to evaluate as falsy.
        self._dtype = self.dtype if self.dtype is not None else data.dtype
        self._min_value = data.min() if self.min_value == 'auto' else self.min_value
        self._max_value = data.max() if self.max_value == 'auto' else self.max_value

        if self.rounding == 'auto':
            self._rounding_digits = self._learn_rounding_digits(data)
        elif isinstance(self.rounding, int):
            self._rounding_digits = self.rounding

        self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
        self.null_transformer.fit(data)

    def transform(self, data):
        """Transform numerical data.

        Integer values are replaced by their float equivalent. Non null float values
        are left unmodified.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray:
                Output of the fitted ``NullTransformer``.
        """
        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        return self.null_transformer.transform(data)

    def reverse_transform(self, data):
        """Convert data back into the original format.

        Values are clipped to the fitted bounds, nulls are restored, and the
        result is rounded and cast to the fitted dtype. Integer data that
        still contains nulls is returned rounded but without the dtype cast.
        The input is not modified.

        Args:
            data (numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if self._min_value is not None or self._max_value is not None:
            if len(data.shape) > 1:
                # Only the first column is clipped. Copy first so the caller's
                # array is not overwritten by the column assignment.
                data = data.copy()
                data[:, 0] = data[:, 0].clip(self._min_value, self._max_value)
            else:
                data = data.clip(self._min_value, self._max_value)

        if self.nan is not None:
            data = self.null_transformer.reverse_transform(data)

        is_integer = np.dtype(self._dtype).kind == 'i'
        if self._rounding_digits is not None or is_integer:
            # Integer dtypes without explicit digits round to whole numbers.
            data = data.round(self._rounding_digits or 0)

        # Nulls cannot be represented in an integer dtype, so skip the cast.
        if pd.isnull(data).any() and is_integer:
            return data

        return data.astype(self._dtype)