示例#1
0
    def perform1HotEncoding(self):
        """One-hot encode the loaded data splits in place.

        Columns whose entry in ``self.feat_type`` is ``'categorical'``
        (compared case-insensitively) are encoded; ``'binary'`` columns are
        encoded as well when ``self.info['has_missing']`` is truthy.  The
        encoder is fitted on ``self.data['X_train']`` and then applied to
        ``self.data['X_valid']`` and ``self.data['X_test']`` when those keys
        are present.

        Side effects: ``self.encoder_`` is always set (it stays ``None`` when
        no column needs encoding).  When encoding happens, the entries of
        ``self.data`` are replaced and ``self.info['is_sparse']`` is updated
        to 1 or 0.

        Raises:
            ValueError: if the object has no ``data`` attribute, or if it
                already has an ``encoder_`` attribute (the method ran before).
        """
        if not hasattr(self, "data"):
            raise ValueError("perform1HotEncoding can only be called when "
                             "data is loaded")
        if hasattr(self, "encoder_"):
            raise ValueError("perform1HotEncoding can only be called on "
                             "non-encoded data.")
        self.encoder_ = None

        use_sparse = bool(self.info['is_sparse'] == 1)
        missing_values = bool(self.info['has_missing'])

        # Binary columns are only encoded when the data has missing values
        # (presumably because a missing entry acts as a third category --
        # confirm against the encoder).
        if missing_values:
            encodable = ['categorical', 'binary']
        else:
            encodable = ['categorical']
        encoding_mask = [kind.lower() in encodable for kind in self.feat_type]
        categorical = [kind.lower() == 'categorical'
                       for kind in self.feat_type]

        # The estimate is divided by 1024 twice (presumably bytes -> MiB);
        # above the threshold the output is forced to be sparse.
        ram_estimate = float(
            data_util.predict_RAM_usage(self.data['X_train'], categorical))
        predicted_RAM_usage = ram_estimate / 1024 / 1024
        if predicted_RAM_usage > 1000:
            use_sparse = True

        if not any(encoding_mask):
            return

        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32,
                                sparse=use_sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        optional_splits = [key for key in ('X_valid', 'X_test')
                           if key in self.data]
        for key in optional_splits:
            self.data[key] = encoder.transform(self.data[key])

        # Dense output was requested but the training split is still sparse:
        # densify every split that is present.
        if not use_sparse and scipy.sparse.issparse(self.data['X_train']):
            for key in ['X_train'] + optional_splits:
                self.data[key] = self.data[key].todense()

        self.encoder_ = encoder
        self.info['is_sparse'] = 1 if use_sparse else 0
    def test_transform_with_unknown_value(self):
        """Check the encoded output when transform sees values absent at fit.

        The encoder is fitted on two identical columns holding 0..5 and then
        applied to data containing the unseen values 6 and 7.  The summed
        one-hot output is expected to be 5 for dense input and 3 for
        CSR-sparse input.
        """
        fit_column = (0, 1, 2, 3, 4, 5)
        unseen_columns = ((0, 1, 2, 6), (0, 1, 6, 7))

        # Dense input.
        dense_train = np.array((fit_column, fit_column)).transpose()
        dense_encoder = OneHotEncoder()
        dense_encoder.fit(dense_train)
        dense_query = np.array(unseen_columns).transpose()
        dense_result = dense_encoder.transform(dense_query).todense()
        self.assertEqual(5, np.sum(dense_result))

        # Sparse input, built from freshly created arrays.
        sparse_train = scipy.sparse.csr_matrix(
            np.array((fit_column, fit_column)).transpose())
        sparse_encoder = OneHotEncoder()
        sparse_encoder.fit(sparse_train)
        sparse_query = scipy.sparse.csr_matrix(
            np.array(unseen_columns).transpose())
        sparse_result = sparse_encoder.transform(sparse_query).todense()
        self.assertEqual(3, np.sum(sparse_result))
    def perform1HotEncoding(self):
        """One-hot encode the loaded data splits in place.

        Columns whose entry in ``self.feat_type`` is ``'categorical'``
        (compared case-insensitively) are encoded; ``'binary'`` columns are
        encoded as well when ``self.info['has_missing']`` is truthy.  The
        encoder is fitted on ``self.data['X_train']`` and then applied to
        ``self.data['X_valid']`` and ``self.data['X_test']`` when those keys
        are present.

        Side effects: ``self._encoder`` is reset to ``None``.  When at least
        one column is encoded, the entries of ``self.data`` are replaced, the
        fitted encoder is stored in ``self.encoder`` and
        ``self.info['is_sparse']`` is updated to 1 or 0.

        Raises:
            ValueError: if the object has no ``data`` attribute, or if the
                data has already been encoded (a fitted encoder is stored in
                ``encoder``, or an ``encoder_`` attribute exists).
        """
        if not hasattr(self, 'data'):
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        # The fitted encoder is stored under ``encoder`` at the end of this
        # method, so that attribute is what marks already-encoded data.  The
        # previous check looked only at ``encoder_``, which this method never
        # sets, so a second call silently re-encoded the data.  ``encoder_``
        # is still honoured for objects that use that name.
        already_encoded = (hasattr(self, 'encoder_')
                           or getattr(self, 'encoder', None) is not None)
        if already_encoded:
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')
        self._encoder = None

        sparse = bool(self.info['is_sparse'] == 1)
        has_missing = bool(self.info['has_missing'])

        # Binary columns are only encoded when the data has missing values
        # (presumably because a missing entry acts as a third category --
        # confirm against the encoder).
        to_encode = ['categorical']
        if has_missing:
            to_encode += ['binary']
        encoding_mask = [feat_type.lower() in to_encode
                         for feat_type in self.feat_type]

        categorical = [feat_type.lower() == 'categorical'
                       for feat_type in self.feat_type]

        # The estimate is divided by 1024 twice (presumably bytes -> MiB);
        # above the threshold the output is forced to be sparse.
        predicted_RAM_usage = float(predict_RAM_usage(
            self.data['X_train'], categorical)) / 1024 / 1024

        if predicted_RAM_usage > 1000:
            sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=sparse)
            self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
            if 'X_valid' in self.data:
                self.data['X_valid'] = encoder.transform(self.data['X_valid'])
            if 'X_test' in self.data:
                self.data['X_test'] = encoder.transform(self.data['X_test'])

            # Dense output was requested but the training split is still
            # sparse: densify every split that is present.
            if not sparse and scipy.sparse.issparse(self.data['X_train']):
                self.data['X_train'] = self.data['X_train'].todense()
                if 'X_valid' in self.data:
                    self.data['X_valid'] = self.data['X_valid'].todense()
                if 'X_test' in self.data:
                    self.data['X_test'] = self.data['X_test'].todense()

            self.encoder = encoder
            self.info['is_sparse'] = 1 if sparse else 0
    def fit_then_transform_dense(self, expected, input,
                                 categorical_features='all',
                                 minimum_fraction=None):
        """Assert that dense one-hot encoding of ``input`` equals ``expected``.

        The check is done twice with encoders built with ``sparse=False``:
        once through ``fit_transform`` and once through ``fit`` followed by
        ``transform``.  Each call receives its own copy of ``input``, and each
        result must be an ``np.ndarray``.

        Args:
            expected: array the encoded output is compared against.
            input: data to encode; only copies are passed to the encoder.
            categorical_features: forwarded to ``OneHotEncoder``.
            minimum_fraction: forwarded to ``OneHotEncoder``.
        """
        encoder_kwargs = dict(categorical_features=categorical_features,
                              sparse=False,
                              minimum_fraction=minimum_fraction)

        # Path 1: fit and transform in a single call.
        one_step_encoder = OneHotEncoder(**encoder_kwargs)
        one_step = one_step_encoder.fit_transform(input.copy())
        self.assertIsInstance(one_step, np.ndarray)
        assert_array_almost_equal(expected, one_step)

        # Path 2: fit first, then transform separately.
        two_step_encoder = OneHotEncoder(**encoder_kwargs)
        two_step_encoder.fit(input.copy())
        two_step = two_step_encoder.transform(input.copy())
        self.assertIsInstance(two_step, np.ndarray)
        assert_array_almost_equal(expected, two_step)
    def fit_then_transform(self, expected, input, categorical_features='all',
                           minimum_fraction=None):
        """Assert that sparse one-hot encoding of ``input`` equals ``expected``.

        The check is done twice: once through ``fit_transform`` and once
        through ``fit`` followed by ``transform``.  Each call receives its own
        copy of ``input``, each result must be a ``scipy.sparse.csr_matrix``,
        and its dense form is compared with ``expected``.

        Args:
            expected: array the densified output is compared against.
            input: data to encode; only copies are passed to the encoder.
            categorical_features: forwarded to ``OneHotEncoder``.
            minimum_fraction: forwarded to ``OneHotEncoder``.
        """
        encoder_kwargs = dict(categorical_features=categorical_features,
                              minimum_fraction=minimum_fraction)

        # Path 1: fit and transform in a single call.
        one_step_encoder = OneHotEncoder(**encoder_kwargs)
        one_step = one_step_encoder.fit_transform(input.copy())
        self.assertIsInstance(one_step, scipy.sparse.csr_matrix)
        # Only this comparison casts ``expected`` to float.
        assert_array_almost_equal(expected.astype(float), one_step.todense())

        # Path 2: fit first, then transform separately.
        two_step_encoder = OneHotEncoder(**encoder_kwargs)
        two_step_encoder.fit(input.copy())
        two_step = two_step_encoder.transform(input.copy())
        self.assertIsInstance(two_step, scipy.sparse.csr_matrix)
        assert_array_almost_equal(expected, two_step.todense())