def setUp(self):
    """Load the test ARFF dataset and precompute metafeature helpers.

    Side effects: changes the working directory to the test directory
    (restored elsewhere via ``self.cwd``) and populates
    ``meta_features.helper_functions`` with precomputed values.
    """
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class.
    # A nominal attribute is encoded by liac-arff as a list of values;
    # use isinstance instead of a direct type comparison.
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    # StandardScaler cannot center sparse data without densifying it,
    # so only center when the matrix is dense.
    center = not scipy.sparse.isspmatrix(X_transformed)
    standard_scaler = StandardScaler(with_mean=center)
    X_transformed = standard_scaler.fit_transform(X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures:
    # after one-hot encoding, the encoded (categorical) columns come
    # first, followed by the numerical columns.
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def perform1HotEncoding(self):
    """One-hot encode the categorical features of the loaded data splits.

    Encodes ``X_train`` (fit) and, when present, ``X_valid``/``X_test``
    (transform) in place in ``self.data``. Falls back to a sparse output
    when the predicted dense memory usage exceeds 1000 MB. Stores the
    fitted encoder as ``self.encoder`` and updates
    ``self.info['is_sparse']``.

    Raises:
        ValueError: if no data is loaded, or if the data was already
            encoded by a previous call.
    """
    if not hasattr(self, 'data'):
        raise ValueError('perform1HotEncoding can only be called when '
                         'data is loaded')
    # BUG FIX: the guard previously tested hasattr(self, 'encoder_'),
    # but nothing in this method ever sets 'encoder_' (the fitted
    # encoder is stored as 'encoder'), so a second call silently
    # double-encoded the data instead of raising.
    if hasattr(self, 'encoder'):
        raise ValueError('perform1HotEncoding can only be called on '
                         'non-encoded data.')
    self._encoder = None

    sparse = self.info['is_sparse'] == 1
    has_missing = bool(self.info['has_missing'])

    to_encode = ['categorical']
    if has_missing:
        to_encode += ['binary']
    encoding_mask = [feat_type.lower() in to_encode
                     for feat_type in self.feat_type]
    categorical = [feat_type.lower() == 'categorical'
                   for feat_type in self.feat_type]

    # Predicted dense RAM usage in megabytes; switch to a sparse
    # representation when the dense encoding would be too large.
    predicted_RAM_usage = float(
        predict_RAM_usage(self.data['X_train'], categorical)) / 1024 / 1024
    if predicted_RAM_usage > 1000:
        sparse = True

    if any(encoding_mask):
        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32, sparse=sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        if 'X_valid' in self.data:
            self.data['X_valid'] = encoder.transform(self.data['X_valid'])
        if 'X_test' in self.data:
            self.data['X_test'] = encoder.transform(self.data['X_test'])

        # The encoder may emit sparse output even when a dense result
        # was requested; densify in that case.
        if not sparse and scipy.sparse.issparse(self.data['X_train']):
            self.data['X_train'] = self.data['X_train'].todense()
            if 'X_valid' in self.data:
                self.data['X_valid'] = self.data['X_valid'].todense()
            if 'X_test' in self.data:
                self.data['X_test'] = self.data['X_test'].todense()

        self.encoder = encoder
        self.info['is_sparse'] = 1 if sparse else 0
def test_transform_with_unknown_value(self):
    """Values unseen during fit must be encoded as all-zero rows.

    Checks both the dense and the sparse code paths by counting the
    total number of ones in the transformed output.
    """
    # Renamed local from 'input' to avoid shadowing the builtin.
    # Dense path: columns have values 0..5; 6 and 7 are unknown.
    train = np.array(((0, 1, 2, 3, 4, 5),
                      (0, 1, 2, 3, 4, 5))).transpose()
    ohe = OneHotEncoder()
    ohe.fit(train)
    test_data = np.array(((0, 1, 2, 6),
                          (0, 1, 6, 7))).transpose()
    output = ohe.transform(test_data).todense()
    # 8 cells total, 3 of them unknown -> 5 ones remain.
    self.assertEqual(5, np.sum(output))

    # Sparse path: same data fed through csr matrices.
    train = np.array(((0, 1, 2, 3, 4, 5),
                      (0, 1, 2, 3, 4, 5))).transpose()
    train_sparse = scipy.sparse.csr_matrix(train)
    ohe = OneHotEncoder()
    ohe.fit(train_sparse)
    test_data = np.array(((0, 1, 2, 6),
                          (0, 1, 6, 7))).transpose()
    test_sparse = scipy.sparse.csr_matrix(test_data)
    output = ohe.transform(test_sparse).todense()
    # In the sparse representation the zeros are implicit, so only
    # 3 ones are produced for the known values.
    self.assertEqual(3, np.sum(output))
def perform1HotEncoding(self):
    """One-hot encode categorical features of the loaded data in place.

    Fits on ``X_train`` and applies the same encoding to ``X_valid`` and
    ``X_test`` when present. Chooses sparse output when the predicted
    dense memory footprint exceeds 1000 MB. The fitted encoder is kept
    as ``self.encoder``; ``self.info['is_sparse']`` is updated.

    Raises:
        ValueError: when no data is loaded or the data is already encoded.
    """
    if not hasattr(self, 'data'):
        raise ValueError('perform1HotEncoding can only be called when '
                         'data is loaded')
    # BUG FIX: the original guard checked for 'encoder_', an attribute
    # this class never sets (the encoder is stored as 'encoder'), so
    # repeated calls were never rejected and re-encoded the data.
    if hasattr(self, 'encoder'):
        raise ValueError('perform1HotEncoding can only be called on '
                         'non-encoded data.')
    self._encoder = None

    sparse = self.info['is_sparse'] == 1
    has_missing = bool(self.info['has_missing'])

    to_encode = ['categorical']
    if has_missing:
        to_encode += ['binary']
    encoding_mask = [feat_type.lower() in to_encode
                     for feat_type in self.feat_type]
    categorical = [feat_type.lower() == 'categorical'
                   for feat_type in self.feat_type]

    # Predicted dense RAM usage in MB; force sparse output if too big.
    predicted_RAM_usage = float(predict_RAM_usage(
        self.data['X_train'], categorical)) / 1024 / 1024
    if predicted_RAM_usage > 1000:
        sparse = True

    if any(encoding_mask):
        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32, sparse=sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        if 'X_valid' in self.data:
            self.data['X_valid'] = encoder.transform(self.data['X_valid'])
        if 'X_test' in self.data:
            self.data['X_test'] = encoder.transform(self.data['X_test'])

        # Densify when a dense result was requested but the encoder
        # produced a sparse matrix anyway.
        if not sparse and scipy.sparse.issparse(self.data['X_train']):
            self.data['X_train'] = self.data['X_train'].todense()
            if 'X_valid' in self.data:
                self.data['X_valid'] = self.data['X_valid'].todense()
            if 'X_test' in self.data:
                self.data['X_test'] = self.data['X_test'].todense()

        self.encoder = encoder
        self.info['is_sparse'] = 1 if sparse else 0
def fit_then_transform_dense(self, expected, input,
                             categorical_features='all',
                             minimum_fraction=None):
    """Verify dense one-hot encoding through both encoder entry points.

    Checks that ``fit_transform`` and a separate ``fit`` followed by
    ``transform`` both yield a dense ndarray equal to *expected*.
    """
    # Path 1: single-call fit_transform.
    encoder = OneHotEncoder(categorical_features=categorical_features,
                            sparse=False,
                            minimum_fraction=minimum_fraction)
    result = encoder.fit_transform(input.copy())
    self.assertIsInstance(result, np.ndarray)
    assert_array_almost_equal(expected, result)

    # Path 2: fit first, then transform on a fresh encoder.
    encoder = OneHotEncoder(categorical_features=categorical_features,
                            sparse=False,
                            minimum_fraction=minimum_fraction)
    encoder.fit(input.copy())
    result = encoder.transform(input.copy())
    self.assertIsInstance(result, np.ndarray)
    assert_array_almost_equal(expected, result)
def perform_one_hot_encoding(sparse, categorical, data):
    """One-hot encode every matrix in *data* with a shared encoder.

    The encoder is fit on ``data[0]`` and applied to the remaining
    matrices. Output is forced sparse when the predicted dense memory
    usage exceeds 1000 MB. Returns the (possibly unchanged) list of
    matrices and the final sparsity flag.
    """
    # Predicted dense RAM usage in megabytes.
    estimated_mb = float(
        predict_RAM_usage(data[0], categorical)) / 1024 / 1024
    if estimated_mb > 1000:
        sparse = True

    if not any(categorical):
        # Nothing to encode; hand the input back untouched.
        return data, sparse

    encoder = OneHotEncoder(categorical_features=categorical,
                            dtype=np.float32, sparse=sparse)
    encoded = [encoder.fit_transform(data[0])]
    for other in data[1:]:
        encoded.append(encoder.transform(other))

    # Densify when dense output was requested but the encoder still
    # produced sparse matrices.
    if not sparse and scipy.sparse.issparse(encoded[0]):
        encoded = [matrix.todense() for matrix in encoded]

    return encoded, sparse
def perform_one_hot_encoding(sparse, categorical, data):
    """Encode the categorical columns of each matrix in *data*.

    Fits a single encoder on ``data[0]`` and transforms the rest with
    it. Switches to sparse output when the predicted dense footprint is
    above 1000 MB. Returns the encoded matrices (or the original *data*
    when no column is categorical) plus the final sparsity flag.
    """
    ram_mb = float(
        predict_RAM_usage(data[0], categorical)) / 1024 / 1024
    if ram_mb > 1000:
        # Too large for a dense representation.
        sparse = True

    if any(categorical):
        ohe = OneHotEncoder(categorical_features=categorical,
                            dtype=np.float32, sparse=sparse)
        outputs = [ohe.fit_transform(data[0])]
        outputs.extend(ohe.transform(d) for d in data[1:])
        # Densify if dense output was requested but not delivered.
        if not sparse and scipy.sparse.issparse(outputs[0]):
            outputs = [o.todense() for o in outputs]
    else:
        outputs = data
    return outputs, sparse
def fit_then_transform(self, expected, input, categorical_features='all',
                       minimum_fraction=None):
    """Verify sparse one-hot encoding through both encoder entry points.

    Checks that ``fit_transform`` and a separate ``fit`` followed by
    ``transform`` both return a csr matrix whose dense form equals
    *expected*.
    """
    # Test fit_transform
    ohe = OneHotEncoder(categorical_features=categorical_features,
                        minimum_fraction=minimum_fraction)
    transformation = ohe.fit_transform(input.copy())
    self.assertIsInstance(transformation, scipy.sparse.csr_matrix)
    assert_array_almost_equal(expected.astype(float),
                              transformation.todense())

    # Test fit, and afterwards transform
    ohe2 = OneHotEncoder(categorical_features=categorical_features,
                         minimum_fraction=minimum_fraction)
    ohe2.fit(input.copy())
    transformation = ohe2.transform(input.copy())
    self.assertIsInstance(transformation, scipy.sparse.csr_matrix)
    # CONSISTENCY FIX: cast expected to float here too, matching the
    # fit_transform branch above (integer arrays could otherwise mask
    # precision differences in the comparison).
    assert_array_almost_equal(expected.astype(float),
                              transformation.todense())
def test_classification_workflow(self):
    """End-to-end check: OneHotEncoder + decision tree inside a Pipeline.

    Downloads OpenML task 254, trains on half the data and requires
    perfect accuracy on both splits.
    """
    task = openml.tasks.get_task(254)
    X, y = task.get_X_and_y()

    ohe = OneHotEncoder(categorical_features=[True] * 22)
    tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
    # BUG FIX: Pipeline steps must be given as a list; passing a tuple
    # of (name, estimator) pairs is rejected by scikit-learn.
    pipeline = sklearn.pipeline.Pipeline([('ohe', ohe), ('tree', tree)])

    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                 train_size=0.5,
                                                 test_size=0.5)
    pipeline.fit(X_train, y_train)
    self.assertEqual(np.mean(y_train == pipeline.predict(X_train)), 1)
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    self.assertEqual(np.mean(y_test == pipeline.predict(X_test)), 1)
def setUp(self):
    """Load the test ARFF dataset (sparse variant) and precompute helpers.

    Side effects: changes the working directory to the test directory
    (restored elsewhere via ``self.cwd``) and populates the metafeature
    helper-function cache.
    """
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class.
    # liac-arff encodes nominal attributes as lists of values; use
    # isinstance instead of comparing types directly.
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    ohe = OneHotEncoder(self.categorical)
    X_transformed = X_sparse.copy()
    X_transformed = ohe.fit_transform(X_transformed)
    imp = SimpleImputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    # with_mean=False: sparse matrices cannot be centered without
    # densifying them.
    standard_scaler = StandardScaler(with_mean=False)
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures:
    # after one-hot encoding the encoded columns come first.
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X_sparse
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.mf.set_value(
        "NumberOfMissingValues",
        self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def setUp(self):
    """Load the test ARFF dataset as a csr matrix and precompute helpers.

    Side effects: changes the working directory to the test directory
    (restored elsewhere via ``self.cwd``) and fills the metafeature
    helper-function cache.
    """
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class.
    # Nominal attributes arrive as lists of values from liac-arff;
    # prefer isinstance over a raw type comparison.
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    ohe = OneHotEncoder(self.categorical)
    X_transformed = X_sparse.copy()
    X_transformed = ohe.fit_transform(X_transformed)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures:
    # the one-hot-encoded (categorical) columns precede the numerical.
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X_sparse
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.mf.set_value(
        "NumberOfMissingValues",
        self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures for a dataset.

    BUG FIX: this docstring used to appear *after* the first statement,
    which made it a no-op string expression instead of the function's
    docstring; it is now the first statement.

    Parameters
    ----------
    X, y : feature matrix (dense or sparse) and target vector.
    categorical : list of bool, one entry per column of X.
    dataset_name : str, used for logging and the result object.
    calculate : optional whitelist of metafeature names.
    dont_calculate : optional blacklist of metafeature names.
    densify_threshold : maximum size in MB up to which the transformed
        matrix is densified.

    Returns
    -------
    DatasetMetafeatures with all computed values.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            # These metafeatures need the encoded/imputed/scaled matrix;
            # build it lazily on first use.
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible
                # (no copy for sparse matrices because of wrong sparse
                # format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False,
                                  dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures: encoded columns come first.
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix when it is small enough.
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from
                # failing in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        # Resolve dependencies: helper functions are computed eagerly,
        # metafeature dependencies re-queue this entry until satisfied.
        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        # NOTE(review): 'visited' is only ever written, never read; kept
        # for backward compatibility in case external code inspects it.
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures for a dataset.

    BUG FIX: the string "Calculate all metafeatures." previously came
    after ``logger = get_logger(__name__)`` and therefore was a dead
    expression rather than a docstring; it has been moved to the top.

    Parameters
    ----------
    X, y : feature matrix (dense or sparse) and target vector.
    categorical : list of bool, one per column of X.
    dataset_name : str, for logging and the returned result object.
    calculate : optional whitelist of metafeature names.
    dont_calculate : optional blacklist of metafeature names.
    densify_threshold : maximum size in MB up to which the transformed
        matrix is densified.

    Returns
    -------
    DatasetMetafeatures holding all computed values.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            # Lazily build the encoded/imputed/scaled matrix the first
            # time a numpy-based metafeature is requested.
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible
                # (no copy for sparse matrices because of wrong sparse
                # format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False,
                                  dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures: the encoded columns come first.
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix if it fits the budget.
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = \
                        X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from
                # failing in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        # Dependency handling: helper functions are computed on the
        # spot; unmet metafeature dependencies re-queue this entry.
        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        # NOTE(review): 'visited' is write-only within this function;
        # retained in case callers or debuggers rely on it.
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_