Exemplo n.º 1
0
    def test__reverse_transfrom_by_matrix(self, psutil_mock):
        """Check ``_reverse_transform_by_matrix`` on numerical categories.

        The reverse-transformed values must equal the original data.

        Setup:
            A ``CategoricalTransformer`` has its ``means`` (4 categories) and ``dtype``
            assigned by hand. ``psutil_mock`` is configured to return an object whose
            ``available`` attribute is ``4 * 4 * 8 * 3 + 1``.
        Input:
            - transformed data with 4 rows
        Output:
            - the original data
        """
        # Setup
        original = pd.Series([1, 2, 3, 4])
        encoded = pd.Series([0.875, 0.625, 0.375, 0.125])

        instance = CategoricalTransformer()
        instance.means = pd.Series(
            [0.125, 0.375, 0.625, 0.875],
            index=[4, 3, 2, 1],
        )
        instance.dtype = original.dtype

        memory_stats = Mock()
        memory_stats.available = 4 * 4 * 8 * 3 + 1
        psutil_mock.return_value = memory_stats

        # Run
        decoded = instance._reverse_transform_by_matrix(encoded)

        # Assert
        pd.testing.assert_series_equal(original, decoded)
Exemplo n.º 2
0
def test_categoricaltransformer_mixed_low_virtual_memory(psutil_mock):
    """Round-trip mixed-type data through the CategoricalTransformer with low memory.

    The transformer is fitted on the data, the data is transformed and then
    reverse transformed, while ``psutil_mock`` reports ``available = 1``.
    The result must match the input.

    Input:
        - 4 rows of mixed data
    Output:
        - The reverse transformed data
    """
    # setup
    original = pd.Series([True, 'a', 1, None])
    instance = CategoricalTransformer()

    memory_stats = Mock()
    memory_stats.available = 1
    psutil_mock.return_value = memory_stats

    # run
    encoded = instance.fit_transform(original)
    decoded = instance.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(original, decoded)
Exemplo n.º 3
0
    def test_fit_series_no_anonymize(self):
        """Fit on a ``pandas.Series`` while ``anonymize`` is ``None``.

        ``_anonymize`` must not be called, and ``_get_intervals`` must be called
        exactly once with the input data.
        """
        # Setup
        series = pd.Series(['bar', 'foo', 'foo', 'tar'])
        fake_self = Mock()
        fake_self.anonymize = None

        # Run
        CategoricalTransformer.fit(fake_self, series)

        # Asserts
        self.assertEqual(
            fake_self._anonymize.call_count,
            0,
            "Anonymize must be called only when anonymize is something"
        )
        self.assertEqual(
            fake_self._get_intervals.call_count,
            1,
            "Get intervals will be called always in fit"
        )

        # First positional argument of the recorded ``_get_intervals`` call.
        intervals_arg = fake_self._get_intervals.call_args[0][0]
        pd.testing.assert_series_equal(
            intervals_arg,
            pd.Series(['bar', 'foo', 'foo', 'tar'])
        )
Exemplo n.º 4
0
    def test_fit_series_anonymize(self):
        """Fit on a ``pandas.Series`` while ``anonymize`` is set to ``'email'``.

        ``_anonymize`` and ``_get_intervals`` must each be called exactly once,
        and ``_get_intervals`` must receive the expected series.
        """
        # Setup
        series = pd.Series(['bar', 'foo', 'foo', 'tar'])
        fake_self = Mock()
        fake_self.anonymize = 'email'
        # The mocked ``_anonymize`` hands back the same values as the input.
        fake_self._anonymize.return_value = pd.Series(['bar', 'foo', 'foo', 'tar'])

        # Run
        CategoricalTransformer.fit(fake_self, series)

        # Asserts
        self.assertEqual(
            fake_self._anonymize.call_count,
            1,
            "Anonymize must be called only once"
        )
        self.assertEqual(
            fake_self._get_intervals.call_count,
            1,
            "Get intervals will be called always in fit"
        )

        # First positional argument of the recorded ``_get_intervals`` call.
        intervals_arg = fake_self._get_intervals.call_args[0][0]
        pd.testing.assert_series_equal(
            intervals_arg,
            pd.Series(['bar', 'foo', 'foo', 'tar'])
        )
Exemplo n.º 5
0
    def test__transform_by_row(self):
        """Check ``_transform_by_row`` on numerical data.

        Every value must be mapped to the third element of its interval tuple.

        Setup:
            A ``CategoricalTransformer`` with ``intervals`` set by hand for 4 categories.
        Input:
            - data with 4 rows
        Output:
            - the transformed data
        """
        # Setup
        width = 0.041666666666666664
        instance = CategoricalTransformer()
        instance.intervals = {
            4: (0, 0.25, 0.125, width),
            3: (0.25, 0.5, 0.375, width),
            2: (0.5, 0.75, 0.625, width),
            1: (0.75, 1.0, 0.875, width),
        }
        values = pd.Series([1, 2, 3, 4])

        # Run
        result = instance._transform_by_row(values)

        # Asserts
        wanted = np.array([0.875, 0.625, 0.375, 0.125])
        matches = result == wanted
        assert matches.all()
Exemplo n.º 6
0
    def test__get_faker_anonymize_category_not_exist(self):
        """``_get_faker`` must raise ``ValueError`` for a category that does not exist."""
        # Setup
        fake_self = Mock()
        fake_self.anonymize = 'SuP3R-P1Th0N-P0w3R'

        # Run & assert
        with self.assertRaises(ValueError):
            CategoricalTransformer._get_faker(fake_self)
Exemplo n.º 7
0
    def test__get_value_no_fuzzy(self):
        """``_get_value`` with ``fuzzy`` disabled.

        The expected result is the third element of the category's interval tuple.
        """
        # Setup
        instance = CategoricalTransformer(fuzzy=False)
        instance.fuzzy = False
        instance.intervals = {'foo': (0, 0.5, 0.25, 0.5 / 6)}

        # Run
        value = instance._get_value('foo')

        # Asserts
        assert value == 0.25
Exemplo n.º 8
0
    def test__normalize_no_clip(self):
        """``_normalize`` on a transformer built with ``clip=False``.

        The expected output is compared with ``assert_series_equal``.
        """
        # Setup
        instance = CategoricalTransformer(clip=False)
        raw = pd.Series([-0.43, 0.1234, 1.5, -1.31])

        # Run
        normalized = instance._normalize(raw)

        # Asserts
        wanted = pd.Series([0.57, 0.1234, 0.5, 0.69], dtype=float)
        pd.testing.assert_series_equal(normalized, wanted)
Exemplo n.º 9
0
    def test__get_value_fuzzy(self, rvs_mock):
        """``_get_value`` with ``fuzzy`` enabled.

        ``rvs_mock`` is configured to return ``0.2745``, and that same value is
        expected back from ``_get_value``.
        """
        # Setup
        rvs_mock.return_value = 0.2745

        instance = CategoricalTransformer(fuzzy=True)
        instance.intervals = {'foo': (0, 0.5, 0.25, 0.5 / 6)}

        # Run
        value = instance._get_value('foo')

        # Asserts
        assert value == 0.2745
Exemplo n.º 10
0
    def test___init__(self):
        """A transformer built without arguments must have a falsy ``anonymize``."""
        # Run
        instance = CategoricalTransformer()

        # Asserts
        anonymize_value = instance.anonymize
        self.assertFalse(anonymize_value, "Unexpected anonimyze default value")
Exemplo n.º 11
0
    def test__transform_by_row_called(self):
        """Check that ``transform`` delegates to ``_transform_by_row``.

        With 4 rows of data and 4 entries in ``means``, ``transform`` is expected
        to hand the data to ``_transform_by_row`` and return its result.

        Setup:
            A ``Mock`` stands in for the transformer, with ``means`` holding 4 values.
        Input:
            - data with 4 rows
        Output:
            - the output of ``_transform_by_row``
        Side effects:
            - ``_transform_by_row`` is called exactly once, with the input data
        """
        # Setup
        values = pd.Series([1, 2, 3, 4])

        fake_self = Mock()
        fake_self.means = pd.Series([0.125, 0.375, 0.625, 0.875])

        # Run
        result = CategoricalTransformer.transform(fake_self, values)

        # Asserts
        by_row = fake_self._transform_by_row
        by_row.assert_called_once_with(values)
        assert result == by_row.return_value
Exemplo n.º 12
0
    def test_fit(self):
        """``fit`` on a numpy array of strings must populate ``intervals``.

        The expected intervals are listed for ``foo`` (3 occurrences), ``bar``
        (2 occurrences) and ``tar`` (1 occurrence).
        """
        # Setup
        instance = CategoricalTransformer()
        values = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])

        # Run
        instance.fit(values)

        # Asserts
        wanted = {
            'foo': (0, 0.5, 0.25, 0.5 / 6),
            'bar': (
                0.5,
                0.8333333333333333,
                0.6666666666666666,
                0.05555555555555555,
            ),
            'tar': (
                0.8333333333333333,
                0.9999999999999999,
                0.9166666666666666,
                0.027777777777777776,
            ),
        }
        assert instance.intervals == wanted
Exemplo n.º 13
0
    def test_transform_array_no_anonymize(self, mock_maps):
        """Transform a ``numpy.array`` while ``anonymize`` is ``None``.

        ``mock_maps`` must not have been called by the time ``transform`` returns.
        """
        # Setup
        values = np.array(['bar', 'foo', 'foo', 'tar'])
        fake_self = Mock()
        fake_self.anonymize = None

        # Run
        CategoricalTransformer.transform(fake_self, values)

        # Asserts
        self.assertEqual(
            mock_maps.call_count,
            0,
            "Dont call to the map encoder when not anonymize"
        )
Exemplo n.º 14
0
    def _analyze(self, data):
        """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

        When ``self.dtypes`` is ``None``, use the dtypes from the input data.

        When ``dtype`` is:
            - ``int``: a ``NumericalTransformer`` is created with ``dtype=int``.
            - ``float``: a ``NumericalTransformer`` is created with ``dtype=float``.
            - ``object`` or ``category``: a ``CategoricalTransformer`` is created.
            - ``bool``: a ``BooleanTransformer`` is created.
            - ``datetime``: a ``DatetimeTransformer`` is created.

        Any other ``dtype`` is not supported and raises a ``ValueError``.

        Args:
            data (pandas.DataFrame):
                Data used to analyze the ``pandas.DataFrame`` dtypes.

        Returns:
            dict:
                Mapping of column names and transformer instances.

        Raises:
            ValueError:
                if a ``dtype`` is not supported by the `HyperTransformer``.
        """
        transformers = dict()
        dtypes = self.dtypes or data.dtypes
        if self.dtypes:
            dtypes = self.dtypes
        else:
            dtypes = [
                data[column].dropna().infer_objects()
                for column in data.columns
            ]

        for name, dtype in zip(data.columns, dtypes):
            dtype = np.dtype(dtype)
            if dtype.kind == 'i':
                transformer = NumericalTransformer(dtype=int)
            elif dtype.kind == 'f':
                transformer = NumericalTransformer(dtype=float)
            elif dtype.kind == 'O':
                anonymize = self.anonymize.get(name)
                transformer = CategoricalTransformer(anonymize=anonymize)
            elif dtype.kind == 'b':
                transformer = BooleanTransformer()
            elif dtype.kind == 'M':
                transformer = DatetimeTransformer()
            else:
                raise ValueError('Unsupported dtype: {}'.format(dtype))

            transformers[name] = transformer

        return transformers
Exemplo n.º 15
0
    def test___init__(self):
        """Constructor arguments must be kept as attributes of the same name."""
        # Run
        kwargs = {'fuzzy': 'fuzzy_value', 'clip': 'clip_value'}
        instance = CategoricalTransformer(**kwargs)

        # Asserts
        assert instance.fuzzy == 'fuzzy_value'
        assert instance.clip == 'clip_value'
Exemplo n.º 16
0
def test_categoricaltransformer_integers():
    """Round-trip integer data through the CategoricalTransformer.

    The transformer is fitted on the data, the data is transformed and then
    reverse transformed. The result must match the input.

    Input:
        - 4 rows of int data
    Output:
        - The reverse transformed data
    """
    # setup
    original = pd.Series([1, 2, 3, 2])
    instance = CategoricalTransformer()

    # run
    encoded = instance.fit_transform(original)
    decoded = instance.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(original, decoded)
Exemplo n.º 17
0
def test_categoricaltransformer_mixed():
    """Round-trip mixed-type data through the CategoricalTransformer.

    The transformer is fitted on the data, the data is transformed and then
    reverse transformed. The result must match the input.

    Input:
        - 4 rows of mixed data
    Output:
        - The reverse transformed data
    """
    # setup
    original = pd.Series([True, 'a', 1, None])
    instance = CategoricalTransformer()

    # run
    encoded = instance.fit_transform(original)
    decoded = instance.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(original, decoded)
Exemplo n.º 18
0
def test_categoricaltransformer_pickle_nans():
    """Check that a fitted CategoricalTransformer survives pickling when data has nans.

    The transformer is fitted on data containing nan values, pickled in memory
    and loaded back. The loaded copy must transform the data to the same values
    as the original instance.
    """
    # setup
    values = pd.Series([1, 2, float('nan'), np.nan])

    instance = CategoricalTransformer()
    instance.fit(values)
    before = instance.transform(values)

    # run: serialize to an in-memory byte string and load it back
    payload = pickle.dumps(instance)
    restored = pickle.loads(payload)

    # assert
    after = restored.transform(values)
    np.testing.assert_array_equal(after, before)
Exemplo n.º 19
0
    def test__get_faker_anonymize_list_type(self):
        """``_get_faker`` with a two-element list in ``anonymize``.

        The returned callable must produce a 16-character string.
        """
        # Setup
        fake_self = Mock()
        fake_self.anonymize = ['credit_card_number', 'visa']

        # Run
        generate = CategoricalTransformer._get_faker(fake_self)
        generated = generate()

        # Asserts
        assert isinstance(generated, str)
        assert len(generated) == 16
Exemplo n.º 20
0
    def test__normalize(self):
        """``_normalize`` called directly on the class with a series of floats.

        The expected output is compared with ``assert_series_equal``.
        """
        # Setup
        raw = pd.Series([-0.43, 0.1234, 1.5, -1.31])

        # Run
        normalized = CategoricalTransformer._normalize(raw)

        # Asserts
        wanted = pd.Series([0.43, 0.1234, 0.5, 0.31], dtype=float)
        pd.testing.assert_series_equal(normalized, wanted)
Exemplo n.º 21
0
    def test__get_value(self, scipy_mock):
        """``_get_value`` must turn a category into a number between 0 and 1.

        ``scipy_mock`` is not used in the body; presumably it is injected by a
        patch decorator that is not visible here.
        """
        # Setup
        fake_self = Mock()
        fake_self.intervals = {'foo': (0, 0.5)}

        # Run
        value = CategoricalTransformer._get_value(fake_self, 'foo')

        # Asserts
        assert value == 0.25
Exemplo n.º 22
0
def test_categoricaltransformer_strings_2_categories():
    """Round-trip string data with two equally frequent categories.

    The transformer is fitted on the data, the data is transformed and then
    reverse transformed. Both categories appear the same number of times.
    The result must match the input.

    Input:
        - 4 rows of string data
    Output:
        - The reverse transformed data
    """
    # setup
    original = pd.Series(['a', 'b', 'a', 'b'])
    instance = CategoricalTransformer()

    # run
    encoded = instance.fit_transform(original)
    decoded = instance.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(original, decoded)
Exemplo n.º 23
0
    def test__reverse_transform_by_category(self, psutil_mock):
        """Check ``_reverse_transform_by_category`` on numerical categories.

        The reverse-transformed values must equal the original data.

        Setup:
            A ``CategoricalTransformer`` has ``means``, ``intervals`` and ``dtype``
            assigned by hand for 4 categories. ``psutil_mock`` is configured to return
            an object whose ``available`` attribute is 1.
        Input:
            - transformed data with 5 rows
        Output:
            - the original data
        """
        # Setup
        original = pd.Series([1, 3, 3, 2, 1])
        encoded = pd.Series([0.875, 0.375, 0.375, 0.625, 0.875])
        width = 0.041666666666666664

        instance = CategoricalTransformer()
        instance.means = pd.Series(
            [0.125, 0.375, 0.625, 0.875],
            index=[4, 3, 2, 1],
        )
        instance.intervals = {
            4: (0, 0.25, 0.125, width),
            3: (0.25, 0.5, 0.375, width),
            2: (0.5, 0.75, 0.625, width),
            1: (0.75, 1.0, 0.875, width),
        }
        instance.dtype = original.dtype

        memory_stats = Mock()
        memory_stats.available = 1
        psutil_mock.return_value = memory_stats

        # Run
        decoded = instance._reverse_transform_by_category(encoded)

        # Assert
        pd.testing.assert_series_equal(original, decoded)
Exemplo n.º 24
0
    def test__get_faker_anonymize_not_tuple_or_list(self):
        """``_get_faker`` when ``anonymize`` is neither a tuple nor a list.

        The returned object must be named ``faker``.
        """
        # Setup
        fake_self = Mock()
        fake_self.anonymize = 'email'

        # Run
        faker_callable = CategoricalTransformer._get_faker(fake_self)

        # Asserts
        returned_name = faker_callable.__name__
        self.assertEqual(returned_name, 'faker', "Expected faker function")
Exemplo n.º 25
0
    def test__get_faker_anonymize_list(self):
        """``_get_faker`` when ``anonymize`` is a one-element list.

        The returned object must be named ``faker``.
        """
        # Setup
        fake_self = Mock()
        fake_self.anonymize = ['email']

        # Run
        faker_callable = CategoricalTransformer._get_faker(fake_self)

        # Asserts
        returned_name = faker_callable.__name__
        self.assertEqual(returned_name, 'faker', "Expected faker function")
Exemplo n.º 26
0
    def test__get_intervals(self):
        """``_get_intervals`` on a series with three categories.

        The first element of the result must equal the expected intervals dict.
        """
        # Setup
        values = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])

        # Run
        output = CategoricalTransformer._get_intervals(values)

        # Asserts
        wanted = {
            'foo': (0, 0.5, 0.25, 0.5 / 6),
            'bar': (
                0.5,
                0.8333333333333333,
                0.6666666666666666,
                0.05555555555555555,
            ),
            'tar': (
                0.8333333333333333,
                0.9999999999999999,
                0.9166666666666666,
                0.027777777777777776,
            ),
        }
        intervals = output[0]
        assert intervals == wanted
Exemplo n.º 27
0
    def test_reverse_transform_array(self):
        """``reverse_transform`` on a ``numpy.array`` after fitting on string categories.

        Both the fitted ``intervals`` and the reverse-transformed series are checked.
        """
        # Setup
        categories = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
        encoded = np.array([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2])
        instance = CategoricalTransformer()

        # Run
        instance.fit(categories)
        decoded = instance.reverse_transform(encoded)

        # Asserts
        wanted_intervals = {
            'foo': (0, 0.5, 0.25, 0.5 / 6),
            'bar': (
                0.5,
                0.8333333333333333,
                0.6666666666666666,
                0.05555555555555555,
            ),
            'tar': (
                0.8333333333333333,
                0.9999999999999999,
                0.9166666666666666,
                0.027777777777777776,
            ),
        }
        assert instance.intervals == wanted_intervals

        wanted_series = pd.Series(categories)
        pd.testing.assert_series_equal(decoded, wanted_series)
Exemplo n.º 28
0
    def test__get_intervals(self):
        """``_get_intervals`` must return the ``(start, end)`` interval of each category."""
        # Setup
        values = pd.Series(['bar', 'foo', 'foo', 'tar'])

        # Run
        intervals = CategoricalTransformer._get_intervals(values)

        # Asserts
        wanted = {
            'foo': (0, 0.5),
            'tar': (0.5, 0.75),
            'bar': (0.75, 1),
        }
        assert intervals == wanted
Exemplo n.º 29
0
    def test__get_faker_anonymize_tuple(self):
        """``_get_faker`` when ``anonymize`` is a one-element tuple.

        The returned object must be named ``faker``.
        """
        # Setup
        fake_self = Mock()
        fake_self.anonymize = ('email',)

        # Run
        faker_callable = CategoricalTransformer._get_faker(fake_self)

        # Asserts
        returned_name = faker_callable.__name__
        self.assertEqual(returned_name, 'faker', "Expected faker function")
Exemplo n.º 30
0
def test_categorical_numerical_nans():
    """Round-trip a column holding only numbers and nans through CategoricalTransformer.

    The transformer is fitted on the data, the data is transformed and then
    reverse transformed. The result must match the input.
    """
    # setup
    original = pd.Series([1, 2, float('nan'), np.nan])
    instance = CategoricalTransformer()

    # run
    instance.fit(original)
    encoded = instance.transform(original)
    decoded = instance.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(decoded, original)