def perform1HotEncoding(self):
    """One-hot encode the loaded data splits in place.

    ``'categorical'`` features are always encoded; ``'binary'`` features are
    encoded too when ``self.info['has_missing']`` is truthy.  The encoder is
    fitted on ``self.data['X_train']`` and reused for ``'X_valid'`` and
    ``'X_test'`` when those keys are present.  The fitted encoder is kept in
    ``self.encoder_``, which stays ``None`` if no feature needed encoding.

    Raises:
        ValueError: if ``self.data`` does not exist, or if ``self.encoder_``
            already exists (e.g. from an earlier call).
    """
    if not hasattr(self, "data"):
        raise ValueError("perform1HotEncoding can only be called when "
                         "data is loaded")
    if hasattr(self, "encoder_"):
        raise ValueError("perform1HotEncoding can only be called on "
                         "non-encoded data.")
    self.encoder_ = None

    use_sparse = bool(self.info['is_sparse'] == 1)
    encodable_types = ['categorical']
    if self.info['has_missing']:
        encodable_types.append('binary')

    encoding_mask = [kind.lower() in encodable_types
                     for kind in self.feat_type]
    categorical = [kind.lower() == 'categorical' for kind in self.feat_type]

    # Force sparse output when the estimate is too large.
    # NOTE(review): presumably bytes scaled to MiB -- confirm against
    # data_util.predict_RAM_usage.
    estimated_usage = data_util.predict_RAM_usage(self.data['X_train'],
                                                  categorical)
    if float(estimated_usage) / 1024 / 1024 > 1000:
        use_sparse = True

    if any(encoding_mask):
        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32,
                                sparse=use_sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        extra_splits = [key for key in ('X_valid', 'X_test')
                        if key in self.data]
        for key in extra_splits:
            self.data[key] = encoder.transform(self.data[key])

        # Dense output was requested but the encoder returned sparse
        # matrices: densify every split that was transformed.
        if not use_sparse and scipy.sparse.issparse(self.data['X_train']):
            for key in ['X_train'] + extra_splits:
                self.data[key] = self.data[key].todense()

        self.encoder_ = encoder
        self.info['is_sparse'] = 1 if use_sparse else 0
def test_transform_with_unknown_value(self):
    # Fit on two columns holding the values 0..5, then transform rows that
    # contain the values 6 and 7, which were not present at fit time, and
    # check how many output cells are set.

    def make_training_matrix():
        # Built fresh for every scenario so that no encoder ever sees an
        # array a previous fit/transform call may have touched.
        return np.array(((0, 1, 2, 3, 4, 5),
                         (0, 1, 2, 3, 4, 5))).transpose()

    def make_query_matrix():
        return np.array(((0, 1, 2, 6), (0, 1, 6, 7))).transpose()

    # Dense input: the five entries that were seen during fit are counted.
    dense_encoder = OneHotEncoder()
    dense_encoder.fit(make_training_matrix())
    dense_result = dense_encoder.transform(make_query_matrix()).todense()
    self.assertEqual(5, np.sum(dense_result))

    # CSR input: only three entries are counted.
    # NOTE(review): presumably the zeros are implicit in the sparse matrix
    # and therefore never encoded -- confirm against OneHotEncoder.
    sparse_encoder = OneHotEncoder()
    sparse_encoder.fit(scipy.sparse.csr_matrix(make_training_matrix()))
    sparse_query = scipy.sparse.csr_matrix(make_query_matrix())
    sparse_result = sparse_encoder.transform(sparse_query).todense()
    self.assertEqual(3, np.sum(sparse_result))
def perform1HotEncoding(self):
    """One-hot encode the loaded data splits in place.

    ``'categorical'`` features are always encoded; ``'binary'`` features are
    encoded too when ``self.info['has_missing']`` is truthy.  The encoder is
    fitted on ``self.data['X_train']`` and reused for ``'X_valid'`` and
    ``'X_test'`` when those keys are present.  The fitted encoder is stored
    via ``self.encoder``.

    Raises:
        ValueError: if ``self.data`` does not exist, or if an encoder has
            already been stored on this object (the data is then assumed
            to be encoded already).
    """
    if not hasattr(self, 'data'):
        raise ValueError('perform1HotEncoding can only be called when '
                         'data is loaded')
    # This method stores its result in ``self.encoder``, so the guard has to
    # look there.  Checking only ``encoder_`` (which is never assigned here)
    # let a second call silently re-encode already encoded data.  The
    # ``encoder_`` check is kept for objects that still carry that attribute.
    already_encoded = (hasattr(self, 'encoder_') or
                       getattr(self, 'encoder', None) is not None)
    if already_encoded:
        raise ValueError('perform1HotEncoding can only be called on '
                         'non-encoded data.')
    self._encoder = None

    sparse = bool(self.info['is_sparse'] == 1)
    to_encode = ['categorical']
    if self.info['has_missing']:
        to_encode.append('binary')

    encoding_mask = [feat_type.lower() in to_encode
                     for feat_type in self.feat_type]
    categorical = [feat_type.lower() == 'categorical'
                   for feat_type in self.feat_type]

    # Force sparse output when the estimate is too large.
    # NOTE(review): presumably bytes scaled to MiB -- confirm against
    # predict_RAM_usage.
    predicted_RAM_usage = float(predict_RAM_usage(
        self.data['X_train'], categorical)) / 1024 / 1024
    if predicted_RAM_usage > 1000:
        sparse = True

    if any(encoding_mask):
        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32,
                                sparse=sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        optional_splits = [name for name in ('X_valid', 'X_test')
                           if name in self.data]
        for name in optional_splits:
            self.data[name] = encoder.transform(self.data[name])

        # Dense output was requested but the encoder returned sparse
        # matrices: densify every split that was transformed.
        if not sparse and scipy.sparse.issparse(self.data['X_train']):
            for name in ['X_train'] + optional_splits:
                self.data[name] = self.data[name].todense()

        self.encoder = encoder
        self.info['is_sparse'] = 1 if sparse else 0
def fit_then_transform_dense(self, expected, input,
                             categorical_features='all',
                             minimum_fraction=None):
    """Check the dense encoder output against ``expected``.

    The comparison is done twice: once through ``fit_transform`` and once
    through ``fit`` followed by ``transform`` on a second encoder.  Each
    result must be an ``np.ndarray`` that is almost equal to ``expected``.
    Every encoder call receives its own copy of ``input``.
    """
    def build_encoder():
        return OneHotEncoder(categorical_features=categorical_features,
                             sparse=False,
                             minimum_fraction=minimum_fraction)

    # Path 1: fit and transform in a single call.
    one_shot = build_encoder().fit_transform(input.copy())
    self.assertIsInstance(one_shot, np.ndarray)
    assert_array_almost_equal(expected, one_shot)

    # Path 2: fit first, transform afterwards.
    two_step_encoder = build_encoder()
    two_step_encoder.fit(input.copy())
    two_step = two_step_encoder.transform(input.copy())
    self.assertIsInstance(two_step, np.ndarray)
    assert_array_almost_equal(expected, two_step)
def fit_then_transform(self, expected, input, categorical_features='all',
                       minimum_fraction=None):
    """Check the default (sparse) encoder output against ``expected``.

    The comparison is done twice: once through ``fit_transform`` and once
    through ``fit`` followed by ``transform`` on a second encoder.  Each
    result must be a ``scipy.sparse.csr_matrix`` whose dense form is almost
    equal to ``expected``.  Every encoder call receives its own copy of
    ``input``.
    """
    def build_encoder():
        return OneHotEncoder(categorical_features=categorical_features,
                             minimum_fraction=minimum_fraction)

    # Path 1: fit and transform in a single call.  Only this comparison
    # casts ``expected`` to float.
    one_shot = build_encoder().fit_transform(input.copy())
    self.assertIsInstance(one_shot, scipy.sparse.csr_matrix)
    assert_array_almost_equal(expected.astype(float), one_shot.todense())

    # Path 2: fit first, transform afterwards.
    two_step_encoder = build_encoder()
    two_step_encoder.fit(input.copy())
    two_step = two_step_encoder.transform(input.copy())
    self.assertIsInstance(two_step, scipy.sparse.csr_matrix)
    assert_array_almost_equal(expected, two_step.todense())