def setUp(self):
    """Load the ARFF test dataset and precompute shared helper values.

    Remembers the current working directory in ``self.cwd``, changes into
    the directory containing this test module, decodes
    ``datasets/dataset.arff`` and stores on ``self``:

    * ``attribute_types`` / ``categorical`` -- per-feature type labels and
      the matching boolean mask (the class attribute is excluded),
    * ``X`` / ``y`` -- the raw feature matrix and the target column,
    * ``X_transformed`` / ``categorical_transformed`` -- the features after
      one-hot encoding, imputation and scaling, plus the matching mask,
    * ``mf`` / ``helpers`` -- the metafeature and helper function
      registries, with several helper values already computed.

    NOTE(review): ``self.cwd`` is presumably restored in ``tearDown``,
    which is not visible here -- confirm.
    """
    self.cwd = os.getcwd()
    # ``__file__`` may be a bare relative filename, in which case
    # ``os.path.dirname`` returns '' and ``os.chdir('')`` raises. Resolve it
    # to an absolute path first.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(tests_dir)

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    # An attribute whose type is a list (of values) is treated as nominal.
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures.
    # Coerce to bool so that ``~`` is a logical NOT even for an empty mask.
    # NOTE(review): assumes the encoder emits the one-hot columns first and
    # the numerical columns last -- confirm against OneHotEncoder.
    number_numerical = int(np.count_nonzero(
        ~np.asarray(self.categorical, dtype=bool)))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def setUp(self):
    """Prepare the dataset fixtures used by the metafeature tests.

    The current working directory is saved in ``self.cwd`` before changing
    into this module's directory, from where ``datasets/dataset.arff`` is
    read and decoded with nominal values encoded.

    Attributes set on ``self``: ``attribute_types``, ``categorical``, ``X``,
    ``y``, ``X_transformed``, ``categorical_transformed``, ``mf`` and
    ``helpers``. The helper registry additionally gets the values for
    ``PCA``, ``MissingValues``, ``NumSymbols``, ``ClassOccurences``,
    ``Skewnesses`` and ``Kurtosisses`` precomputed.

    NOTE(review): restoring ``self.cwd`` is presumably done by a
    ``tearDown`` outside this view -- confirm.
    """
    self.cwd = os.getcwd()
    # Make the module path absolute first: for a relative ``__file__``
    # without a directory part, ``dirname`` would be '' and ``os.chdir``
    # would fail.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(module_dir)

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    feature_attributes = dataset['attributes'][:-1]
    # A list-typed attribute is labelled nominal, anything else numeric.
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in feature_attributes
    ]
    self.categorical = [
        attribute == 'nominal' for attribute in self.attribute_types
    ]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1, ))

    # Preprocessing chain: one-hot encode -> impute -> scale -> densify.
    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures.
    # The mask is coerced to bool so the inversion stays a logical NOT even
    # when the mask is empty.
    # NOTE(review): relies on the transformed matrix listing one-hot columns
    # before numerical ones -- confirm against OneHotEncoder.
    is_categorical = np.asarray(self.categorical, dtype=bool)
    number_numerical = int(np.count_nonzero(~is_categorical))
    number_encoded = X_transformed.shape[1] - number_numerical
    self.categorical_transformed = \
        [True] * number_encoded + [False] * number_numerical

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures.

    Clears the module-level ``helper_functions`` and ``metafeatures``
    registries, then evaluates every registered metafeature (subject to
    ``calculate`` / ``dont_calculate``), resolving dependencies on helper
    functions and on other metafeatures on the way.

    Metafeatures listed in ``npy_metafeatures`` are computed on a
    preprocessed copy of the data (one-hot encoded, mean-imputed, scaled and
    shuffled with a fixed seed); all others see the raw ``X`` and ``y``.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Feature matrix.
    y : array
        Targets; must support indexing with an integer index array.
    categorical : sequence of bool
        One flag per column of ``X``, true for categorical features.
    dataset_name : str
        Used in log messages and passed to ``DatasetMetafeatures``.
    calculate : container of str, optional
        If given, only metafeatures named here are calculated.
    dont_calculate : container of str, optional
        If given, metafeatures named here are skipped.
    densify_threshold : number
        If ``X`` is dense but the transformed matrix is sparse, it is
        densified when its dense size in megabytes is below this value.

    Returns
    -------
    DatasetMetafeatures
        Wraps ``dataset_name`` and a dict mapping the name of every
        calculated metafeature -- and of every helper function evaluated as
        a dependency -- to its value.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper.
    ValueError
        If a dependency is registered as neither, or if a metafeature can
        never be calculated because the metafeature it depends on is
        excluded or part of a dependency cycle.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None
    categorical_transformed = None
    # Number of consecutive deferrals since the last calculated metafeature;
    # used to detect that the queue cannot make progress any more.
    deferred_in_a_row = 0

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no
                # copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures. Coerce to bool first: on an integer 0/1 mask
                # ``~`` would be a bitwise NOT and give a wrong count.
                # NOTE(review): assumes the one-hot columns come first and
                # the numerical columns last -- confirm against
                # OneHotEncoder.
                number_numerical = int(np.count_nonzero(
                    ~np.asarray(categorical, dtype=bool)))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = \
                        X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                # Retry once the rest of the queue has been processed.
                to_visit.appendleft(name)
                deferred_in_a_row += 1
                # Every queued entry was deferred without anything being
                # calculated in between: the dependency was excluded via
                # calculate/dont_calculate or is part of a cycle. Fail
                # instead of looping forever.
                if deferred_in_a_row >= len(to_visit):
                    raise ValueError(
                        "Cannot calculate metafeature %r: its dependency %r "
                        "is never calculated (excluded or cyclic)."
                        % (name, dependency))
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        deferred_in_a_row = 0

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
def _transform_for_npy_metafeatures(X, y, categorical, densify_threshold):
    """Return the preprocessed ``(X, y, categorical)`` for npy metafeatures.

    Applies one-hot encoding, mean imputation and scaling to ``X``, densifies
    the result when ``X`` was dense and the dense size in megabytes is below
    ``densify_threshold``, and finally shuffles rows of the data and ``y``
    with a fixed seed.
    """
    # TODO make sure this is done as efficient as possible (no copy for
    # sparse matrices because of wrong sparse format)
    sparse = scipy.sparse.issparse(X)
    ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
    X_transformed = ohe.fit_transform(X)
    imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
    X_transformed = imputer.fit_transform(X_transformed)
    standard_scaler = StandardScaler(copy=False)
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures.
    # The mask is coerced to bool because ``~`` on an integer 0/1 mask is a
    # bitwise NOT and would produce a wrong count.
    # NOTE(review): assumes one-hot columns precede the numerical columns in
    # the encoder output -- confirm against OneHotEncoder.
    is_categorical = np.asarray(categorical, dtype=bool)
    number_numerical = int(np.count_nonzero(~is_categorical))
    number_encoded = X_transformed.shape[1] - number_numerical
    categorical_transformed = \
        [True] * number_encoded + [False] * number_numerical

    # Densify the transformed matrix
    if not sparse and scipy.sparse.issparse(X_transformed):
        bytes_per_float = X_transformed.dtype.itemsize
        num_elements = X_transformed.shape[0] * X_transformed.shape[1]
        megabytes_required = num_elements * bytes_per_float / 1000 / 1000
        if megabytes_required < densify_threshold:
            X_transformed = X_transformed.todense()

    # This is not only important for datasets which are somehow
    # sorted in a strange way, but also prevents lda from failing in
    # some cases.
    # Because this is advanced indexing, a copy of the data is returned!!!
    X_transformed = check_array(X_transformed, force_all_finite=True,
                                accept_sparse='csr')
    rs = np.random.RandomState(42)
    indices = np.arange(X_transformed.shape[0])
    rs.shuffle(indices)
    # TODO Shuffle inplace
    X_transformed = X_transformed[indices]
    y_transformed = y[indices]
    return X_transformed, y_transformed, categorical_transformed


def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures.

    The module-level ``helper_functions`` and ``metafeatures`` registries are
    cleared first. Every registered metafeature is then evaluated unless it
    is filtered out by ``calculate`` / ``dont_calculate``. A helper function
    a metafeature depends on is evaluated on demand; a metafeature waiting
    for another metafeature is put back into the queue until that one has
    been calculated.

    Metafeatures in ``npy_metafeatures`` receive a preprocessed, shuffled
    version of the data, which is created once when first needed; all other
    metafeatures receive ``X``, ``y`` and ``categorical`` unchanged.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Feature matrix.
    y : array
        Targets; must support indexing with an integer index array.
    categorical : sequence of bool
        One flag per column of ``X``, true for categorical features.
    dataset_name : str
        Used in log messages and passed to ``DatasetMetafeatures``.
    calculate : container of str, optional
        Restrict the calculation to these metafeature names.
    dont_calculate : container of str, optional
        Metafeature names to skip.
    densify_threshold : number
        Upper bound (in megabytes of dense storage) below which a sparse
        transformed matrix is densified, provided ``X`` itself was dense.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict of all calculated values,
        keyed by metafeature name (helper functions evaluated as
        dependencies are included under their own name).

    Raises
    ------
    NotImplementedError
        If a dependency name is both a metafeature and a helper function.
    ValueError
        If a dependency name is unknown, or if the remaining metafeatures
        wait on metafeatures that are never calculated (excluded by the
        filters, or cyclic).
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    # Lazily created inputs for the npy metafeatures.
    npy_inputs = None
    # Deferrals since the last successful calculation; once this reaches the
    # queue length every queued entry is blocked and no progress is possible.
    stalled = 0

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if npy_inputs is None:
                npy_inputs = _transform_for_npy_metafeatures(
                    X, y, categorical, densify_threshold)
            X_, y_, categorical_ = npy_inputs
        else:
            X_, y_, categorical_ = X, y, categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                # Put it back and retry after the other queued entries.
                to_visit.appendleft(name)
                stalled += 1
                if stalled >= len(to_visit):
                    # Previously this situation looped forever.
                    raise ValueError(
                        "Cannot calculate metafeature %r: its dependency %r "
                        "is never calculated (excluded or cyclic)."
                        % (name, dependency))
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        stalled = 0

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_