Exemplo n.º 1
0
    def test_fit(self):
        """Test 'fit' on a table with one continuous and one discrete column.

        Setup:
            - Build a 'pandas.DataFrame' with 100 rows: column 'x' holds random
              floats and column 'y' holds random 'yes'/'no' strings.
            - Create a DataTransformer.
            - Replace '_fit_continuous' with a mock returning a canned
              'ColumnTransformInfo' for 'x' (1 + 3 output dimensions).
            - Replace '_fit_discrete' with a mock returning a canned
              'ColumnTransformInfo' for 'y' (2 output dimensions).

        Input:
            - raw_data = the dataframe described above.
            - discrete_columns = list with the name of the discrete column.

        Output:
            - None

        Side Effects checked:
            - '_fit_discrete' and '_fit_continuous' are each called exactly once.
            - 'self.output_dimensions' equals 6, the sum of the mocked
              per-column output dimensions.

        Note:
            Any other state that 'fit' may update is not asserted by this test.
        """
        frame = pd.DataFrame({
            "x": np.random.random(size=100),
            "y": np.random.choice(["yes", "no"], size=100),
        })

        transformer = DataTransformer()

        # Canned result for the continuous column: one tanh span plus a 3-wide softmax.
        continuous_info = ColumnTransformInfo(
            column_name="x",
            column_type="continuous",
            transform=None,
            transform_aux=None,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3,
        )
        transformer._fit_continuous = Mock(return_value=continuous_info)

        # Canned result for the discrete column: a single 2-wide softmax span.
        discrete_info = ColumnTransformInfo(
            column_name="y",
            column_type="discrete",
            transform=None,
            transform_aux=None,
            output_info=[SpanInfo(2, 'softmax')],
            output_dimensions=2,
        )
        transformer._fit_discrete = Mock(return_value=discrete_info)

        transformer.fit(frame, discrete_columns=["y"])

        # Each column must have been routed to its own fitting helper exactly once.
        transformer._fit_discrete.assert_called_once()
        transformer._fit_continuous.assert_called_once()
        assert transformer.output_dimensions == 6
Exemplo n.º 2
0
    def test_fit(self):
        """Test ``fit`` on a table with one continuous and one discrete column.

        Setup:
            - Create a ``DataTransformer``.
            - Replace ``_fit_continuous`` with a mock returning a canned
              ``ColumnTransformInfo`` for ``x`` (1 + 3 output dimensions).
            - Replace ``_fit_discrete`` with a mock returning a canned
              ``ColumnTransformInfo`` for ``y`` (2 output dimensions).
            - Build a ``pandas.DataFrame`` with 100 rows: ``x`` holds random floats
              and ``y`` holds random ``yes``/``no`` strings.

        Input:
            - The dataframe described above.
            - A list with the name of the discrete column.

        Side Effects checked:
            - ``_fit_discrete`` and ``_fit_continuous`` are each called exactly once.
            - ``self.output_dimensions`` equals 6, the sum of the mocked per-column
              output dimensions.

        Note:
            Any other state that ``fit`` may update is not asserted by this test.
        """
        # Setup
        transformer = DataTransformer()

        continuous_info = ColumnTransformInfo(
            column_name='x',
            column_type='continuous',
            transform=None,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
            output_dimensions=1 + 3,
        )
        transformer._fit_continuous = Mock(return_value=continuous_info)

        discrete_info = ColumnTransformInfo(
            column_name='y',
            column_type='discrete',
            transform=None,
            output_info=[SpanInfo(2, 'softmax')],
            output_dimensions=2,
        )
        transformer._fit_discrete = Mock(return_value=discrete_info)

        columns = {
            'x': np.random.random(size=100),
            'y': np.random.choice(['yes', 'no'], size=100),
        }
        frame = pd.DataFrame(columns)

        # Run
        transformer.fit(frame, discrete_columns=['y'])

        # Assert
        transformer._fit_discrete.assert_called_once()
        transformer._fit_continuous.assert_called_once()
        assert transformer.output_dimensions == 6
Exemplo n.º 3
0
    def test___fit_continuous_(self, MockBGM):
        """Test '_fit_continuous' on a simple continuous column.

        Setup:
            - 'MockBGM' is a mocked mixture-model class (presumably injected by a
              patch decorator outside this view -- confirm against the decorator).
            - Give the mocked instance a 'weights_' attribute with two values above
              the weight threshold and one below it.
            - Create DataTransformer with max_clusters=10 and weight_threshold=0.005.

        Input:
            - column_name = string
            - raw_column_data = numpy array of shape (100, 1) with normally
              distributed values

        Output:
            - ColumnTransformInfo
              - Check column name
              - Check that 'transform' is the mocked mixture-model instance
              - Check that output_dimensions is 3 (a 1-dim 'tanh' span plus a
                2-dim 'softmax' span, matching the two weights above threshold)

        Note:
            The call to the mocked 'fit' method is not asserted by this test.
        """
        bgm_instance = MockBGM.return_value
        # Two of the three weights exceed the 0.005 threshold used below.
        bgm_instance.weights_ = np.array([10.0, 5.0, 0.0])

        max_clusters = 10
        transformer = DataTransformer(max_clusters, weight_threshold=0.005)

        # 'size=' is required here: a bare tuple would be interpreted as 'loc'
        # and yield only two samples (means 100 and 1) instead of a 100x1 column.
        raw_column_data = np.random.normal(size=(100, 1))
        info = transformer._fit_continuous("column", raw_column_data)

        assert info.column_name == "column"
        assert info.transform == bgm_instance
        assert info.output_dimensions == 3
        assert info.output_info[0].dim == 1
        assert info.output_info[0].activation_fn == "tanh"
        assert info.output_info[1].dim == 2
        assert info.output_info[1].activation_fn == "softmax"
Exemplo n.º 4
0
    def test___fit_continuous(self, MockBGM):
        """Test ``_fit_continuous`` on a simple continuous column.

        Setup:
            - ``MockBGM`` is a mocked transformer class (presumably injected by a
              patch decorator outside this view -- confirm against the decorator).
            - Set ``valid_component_indicator`` on the mocked instance to
              ``[True, False, True]``.
            - Initialize a ``DataTransformer``.

        Input:
            - A dataframe with 100 rows and a single column named ``column``
              containing normally distributed float values.

        Output:
            - A ``ColumnTransformInfo`` object where:
              - ``column_name`` matches the column of the data.
              - ``transform`` is the mocked transformer instance.
              - ``output_dimensions`` is 3: a 1-dim ``tanh`` span plus a 2-dim
                ``softmax`` span (two ``True`` entries in
                ``valid_component_indicator``).
              - ``output_info`` assigns the correct activation functions.

        Note:
            The call to the mocked ``fit`` method is not asserted by this test.
        """
        # Setup
        bgm_instance = MockBGM.return_value
        bgm_instance.valid_component_indicator = [True, False, True]
        transformer = DataTransformer()
        # ``size=`` is required here: a bare tuple would be interpreted as ``loc``
        # and produce a 2-row frame (means 100 and 1) instead of 100 rows.
        data = pd.DataFrame(np.random.normal(size=(100, 1)), columns=['column'])

        # Run
        info = transformer._fit_continuous(data)

        # Assert
        assert info.column_name == 'column'
        assert info.transform == bgm_instance
        assert info.output_dimensions == 3
        assert info.output_info[0].dim == 1
        assert info.output_info[0].activation_fn == 'tanh'
        assert info.output_info[1].dim == 2
        assert info.output_info[1].activation_fn == 'softmax'