def test_transform_series(self):
    """Check that ``transform`` forwards a datetime series to ``_transform``.

    A ``Mock`` stands in for the transformer instance, so only the calls
    issued by ``DatetimeTransformer.transform`` are inspected.
    """
    # Setup
    raw_dates = pd.Series([None, '1996-10-17', '1965-05-23'])
    datetimes = pd.to_datetime(raw_dates)
    as_integers = pd.Series([np.nan, 845510400000000000, -145497600000000000])

    # Run
    instance = Mock()
    instance._transform.return_value = as_integers
    DatetimeTransformer.transform(instance, datetimes)

    # Asserts
    expected_input = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23']))
    # First positional argument of the recorded ``_transform`` call.
    received_input = instance._transform.call_args[0][0]
    pd.testing.assert_series_equal(received_input, expected_input)
    self.assertEqual(
        instance.null_transformer.transform.call_count,
        1,
        "NullTransformer.transform must be called only once.",
    )
def test_reverse_transform_all_none(self):
    """Reverse transforming an array holding only ``None`` must give ``NaT``."""
    # Setup
    fit_dates = pd.to_datetime(['2020-01-01'])
    transformer = DatetimeTransformer(strip_constant=True)
    transformer.fit(fit_dates)

    # Run
    reverted = transformer.reverse_transform(np.array([None]))

    # Asserts
    nat_index = pd.to_datetime(['NaT'])
    pd.testing.assert_series_equal(reverted.to_series(), nat_index.to_series())
def test_reverse_transform_2d_ndarray(self):
    """Reverse transform a single-column 2d ``numpy.ndarray`` back into dates."""
    # Setup
    fit_dates = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])
    transformer = DatetimeTransformer(strip_constant=True)
    transformer.fit(fit_dates)
    column_vector = np.array([[18262.], [18293.], [18322.]])

    # Run
    reverted = transformer.reverse_transform(column_vector)

    # Asserts
    expected_dates = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])
    pd.testing.assert_series_equal(reverted.to_series(), expected_dates.to_series())
def test_no_strip(self):
    """Round-trip datetimes through a transformer with ``strip_constant=False``."""
    # Setup
    transformer = DatetimeTransformer(strip_constant=False)
    datetimes = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23']))

    # Run
    encoded = transformer.fit_transform(datetimes.copy().to_numpy())
    decoded = transformer.reverse_transform(encoded)

    # Asserts
    expected_encoded = np.array([
        [350006400000000000, 1.0],
        [845510400000000000, 0.0],
        [-145497600000000000, 0.0],
    ])
    np.testing.assert_almost_equal(expected_encoded, encoded)
    pd.testing.assert_series_equal(decoded, datetimes)
def test_reverse_transform_nan_ignore(self):
    """Check ``reverse_transform`` when the instance's ``nan`` is ``None``.

    The null transformer's ``reverse_transform`` must not be called in
    this configuration.
    """
    # Setup
    integers = pd.Series([np.nan, 845510400000000000, -145497600000000000])

    # Run
    instance = Mock()
    instance.nan = None
    reverted = DatetimeTransformer.reverse_transform(instance, integers)

    # Asserts
    expected = pd.Series([
        np.nan,
        pd.to_datetime(845510400000000000),
        pd.to_datetime(-145497600000000000),
    ])
    pd.testing.assert_series_equal(reverted, expected)
    self.assertEqual(
        instance.null_transformer.reverse_transform.call_count,
        0,
        "NullTransformer.reverse_transform won't be called when nan is ignore",
    )
def test_fit_nan_mode_series(self):
    """Test fit with ``nan='mode'`` on datetimes built from a numpy array.

    The input is the result of ``pd.to_datetime`` applied to a
    ``numpy.array`` that contains one ``None`` and two distinct dates.
    """
    # Setup
    data = np.array([None, '1996-10-17', '1965-05-23'])
    data = pd.to_datetime(data)

    # Run
    transformer = DatetimeTransformer(nan='mode')
    transformer.fit(data)

    # Asserts
    expect_nan = 'mode'
    expect_fill_value = -145497600000000000

    self.assertEqual(transformer.nan, expect_nan, 'Unexpected nan')
    # The message previously said "mean", copied from the mean test; this
    # test checks the fill value produced by the "mode" strategy.
    self.assertEqual(
        transformer.null_transformer.fill_value,
        expect_fill_value,
        "Data mode is wrong",
    )
def test_fit_nan_mean_array(self):
    """Fit with ``nan='mean'`` on a ``numpy.array`` of datetimes."""
    # Setup
    raw_dates = np.array([None, '1996-10-17', '1965-05-23'])
    datetimes = pd.to_datetime(raw_dates).to_numpy()

    # Run
    instance = DatetimeTransformer(nan='mean')
    instance.fit(datetimes)

    # Asserts
    self.assertEqual(instance.nan, 'mean', 'Unexpected nan')
    self.assertEqual(
        instance.null_transformer.fill_value,
        350006400000000000,
        "Data mean is wrong",
    )
def _analyze(self, data):
    """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

    When ``self.dtypes`` is not set (``None`` or empty), the dtype of each
    column is inferred from its non-null values. Otherwise ``self.dtypes``
    is paired positionally with ``data.columns``.

    The transformer is chosen from the ``kind`` of the numpy dtype:

        - ``int`` (``'i'``): a ``NumericalTransformer`` is created with ``dtype=int``.
        - ``float`` (``'f'``): a ``NumericalTransformer`` is created with ``dtype=float``.
        - ``object`` (``'O'``): a ``CategoricalTransformer`` is created, using the
          ``anonymize`` setting registered for the column, if any.
        - ``bool`` (``'b'``): a ``BooleanTransformer`` is created.
        - ``datetime`` (``'M'``): a ``DatetimeTransformer`` is created.

    Any other ``dtype`` is not supported and raises a ``ValueError``.

    Args:
        data (pandas.DataFrame):
            Data used to analyze the ``pandas.DataFrame`` dtypes.

    Returns:
        dict:
            Mapping of column names and transformer instances.

    Raises:
        ValueError:
            if a ``dtype`` is not supported by the ``HyperTransformer``.
    """
    if self.dtypes:
        dtypes = self.dtypes
    else:
        # Drop nulls before inferring so that missing values alone do not
        # decide the dtype, and take ``.dtype`` explicitly instead of
        # relying on ``np.dtype`` to read it off a ``pandas.Series``.
        dtypes = [
            data[column].dropna().infer_objects().dtype
            for column in data.columns
        ]

    transformers = {}
    for name, dtype in zip(data.columns, dtypes):
        dtype = np.dtype(dtype)
        kind = dtype.kind
        if kind == 'i':
            transformer = NumericalTransformer(dtype=int)
        elif kind == 'f':
            transformer = NumericalTransformer(dtype=float)
        elif kind == 'O':
            # NOTE(review): the earlier docstring also listed ``category``;
            # only numpy kind 'O' is handled here -- confirm whether
            # categorical dtypes are converted before reaching this point.
            anonymize = self.anonymize.get(name)
            transformer = CategoricalTransformer(anonymize=anonymize)
        elif kind == 'b':
            transformer = BooleanTransformer()
        elif kind == 'M':
            transformer = DatetimeTransformer()
        else:
            raise ValueError('Unsupported dtype: {}'.format(dtype))

        transformers[name] = transformer

    return transformers
def test___init__(self):
    """Verify the attributes of a transformer created with no arguments."""
    # Run
    instance = DatetimeTransformer()

    # Asserts
    self.assertEqual(instance.nan, 'mean', "Unexpected nan")
    self.assertIsNone(
        instance.null_column,
        "null_column is None by default",
    )
    self.assertIsNone(
        instance.null_transformer,
        "null_transformer is None by default",
    )
def test__transform(self):
    """Check that ``_transform`` turns a datetime series into integer values."""
    # Setup
    datetimes = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23']))

    # Run
    as_integers = DatetimeTransformer._transform(datetimes)

    # Asserts
    expected = pd.Series([np.nan, 845510400000000000, -145497600000000000])
    pd.testing.assert_series_equal(as_integers, expected)
def test_reverse_transform_nan_not_ignore(self):
    """Check ``reverse_transform`` when the instance's ``nan`` is ``'mean'``.

    The null transformer's ``reverse_transform`` must be called exactly
    once in this configuration.
    """
    # Setup
    integers = pd.Series([np.nan, 845510400000000000, -145497600000000000])
    null_reverted = pd.Series([np.nan, 845510400000000000, -145497600000000000])

    # Run
    instance = Mock()
    instance.nan = 'mean'
    instance.null_transformer.reverse_transform.return_value = null_reverted
    DatetimeTransformer.reverse_transform(instance, integers)

    # Asserts
    self.assertEqual(
        instance.null_transformer.reverse_transform.call_count,
        1,
        "NullTransformer.reverse_transform must be called when nan is not ignore",
    )