def read(self) -> List[RawFeature]:
    # Fetch the OpenML task and the dataset it refers to.
    self.task = openml.tasks.get_task(self.task_id)
    dataset = openml.datasets.get_dataset(dataset_id=self.task.dataset_id)

    test_indices = np.array([])
    train_indices = np.array([])

    # Rotate the fold order so that a different subset of folds ends up in the test split.
    number_of_folds = self.task.get_split_dimensions()[1]
    circular_queue = deque(range(number_of_folds))
    circular_queue.rotate(self.rotate_test)

    for fold in range(number_of_folds):
        _, t_indices = self.task.get_train_test_split_indices(fold=circular_queue.popleft())
        if fold < self.test_folds:
            test_indices = np.concatenate((test_indices, t_indices), axis=None)
        else:
            train_indices = np.concatenate((train_indices, t_indices), axis=None)

    train_indices = np.array(train_indices, dtype=int)
    test_indices = np.array(test_indices, dtype=int)

    # Newer openml releases take dataset_format; older ones expect the return_* keyword
    # arguments instead, which makes the first call raise a TypeError.
    try:
        X, y, categorical_indicator, attribute_names = dataset.get_data(
            target=dataset.default_target_attribute, dataset_format='array')
    except TypeError:
        X, y, categorical_indicator, attribute_names = dataset.get_data(
            target=dataset.default_target_attribute,
            return_categorical_indicator=True,
            return_attribute_names=True)

    self.dataframe = pd.DataFrame(data=X, columns=attribute_names)

    self.splitted_values = {}
    self.splitted_target = {}
    self.splitted_target['train'] = y[train_indices]
    self.splitted_target['test'] = y[test_indices]
    self.splitted_values['train'] = X[train_indices]
    self.splitted_values['test'] = X[test_indices]

    # Build one RawFeature per column and record its derived properties.
    for attribute_i in range(self.dataframe.shape[1]):
        rf = RawFeature(self.dataframe.columns[attribute_i], attribute_i, {})
        rf.derive_properties(self.dataframe[self.dataframe.columns[attribute_i]].values)
        rf.properties['categorical'] = categorical_indicator[attribute_i]
        self.raw_features.append(rf)

    return self.raw_features
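# Minimal sketch (not part of the reader) of the fold rotation used above: rotating the
# fold order by `rotate_test` decides which folds land in the test split, so repeated runs
# with different rotations cycle through all folds. The fold count (10), rotation (1) and
# number of test folds (2) below are illustrative values only.
from collections import deque

number_of_folds = 10
rotate_test = 1
test_folds = 2

queue = deque(range(number_of_folds))
queue.rotate(rotate_test)
fold_order = [queue.popleft() for _ in range(number_of_folds)]
test_fold_ids = fold_order[:test_folds]    # e.g. [9, 0]
train_fold_ids = fold_order[test_folds:]   # the remaining folds form the training split
print(test_fold_ids, train_fold_ids)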
def read(self):
    openML_path = Config.get('openml.path')
    info_frame = pd.read_csv(openML_path + "/info.csv")
    assert info_frame[info_frame['name'] == self.name]['MLType'].values[0] == 'classification', \
        "it is not a classification task"

    # get the schema (column names) and the target
    with open(openML_path + "/data/" + self.name + "_columns.csv", mode='r') as file:
        json_schema = file.read()
    schema = json.loads(json_schema)
    names = [s['name'] for s in schema]

    self.dataframe = pd.read_csv(openML_path + "/data/" + self.name + ".csv")

    self.target_column_id = np.where(self.dataframe.columns == 'target')[0][0]

    # get target
    self.target_values = self.dataframe[self.dataframe.columns[self.target_column_id]].values
    self.dataframe.drop(self.dataframe.columns[self.target_column_id], axis=1, inplace=True)

    # get split of the data
    self.splitter.get_splitted_ids(self.dataframe, self.target_values)

    self.splitted_values = {}
    self.splitted_target = {}
    (self.splitted_target['train'],
     self.splitted_target['valid'],
     self.splitted_target['test']) = self.splitter.materialize_target(self.target_values)
    (self.splitted_values['train'],
     self.splitted_values['valid'],
     self.splitted_values['test']) = self.splitter.materialize_values(self.dataframe)

    for attribute_i in range(self.dataframe.shape[1]):
        properties = self.derive_properties(
            attribute_i, self.dataframe[self.dataframe.columns[attribute_i]].values)
        self.raw_features.append(
            RawFeature(self.dataframe.columns[attribute_i], attribute_i, properties))

    return self.raw_features
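# Sketch (an assumption inferred from the reader above, not a documented format) of the
# on-disk layout it expects under Config.get('openml.path'):
#   info.csv                  -> one row per dataset with 'name' and 'MLType' columns
#   data/<name>_columns.csv   -> JSON list of column descriptors, each with a 'name' field
#   data/<name>.csv           -> the data itself, including a 'target' column
# The dataset name 'toy' and all file contents below are hypothetical.
import json
import os

import pandas as pd

base = '/tmp/openml_export'
os.makedirs(base + '/data', exist_ok=True)

pd.DataFrame({'name': ['toy'], 'MLType': ['classification']}).to_csv(
    base + '/info.csv', index=False)

with open(base + '/data/toy_columns.csv', 'w') as f:
    json.dump([{'name': 'f0'}, {'name': 'f1'}, {'name': 'target'}], f)

pd.DataFrame({'f0': [1.0, 2.0], 'f1': [0.5, 0.25], 'target': [0, 1]}).to_csv(
    base + '/data/toy.csv', index=False)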
def read(self) -> List[RawFeature]:
    self.dataframe = pd.read_csv(self.file_name, na_filter=False)

    # get target
    self.target_values = self.dataframe[self.dataframe.columns[self.target_column_id]].values
    self.dataframe.drop(self.dataframe.columns[self.target_column_id], axis=1, inplace=True)

    # get split of the data
    self.splitter.get_splitted_ids(self.dataframe, self.target_values)

    self.splitted_values = {}
    self.splitted_target = {}
    (self.splitted_target['train'],
     self.splitted_target['valid'],
     self.splitted_target['test']) = self.splitter.materialize_target(self.target_values)
    (self.splitted_values['train'],
     self.splitted_values['valid'],
     self.splitted_values['test']) = self.splitter.materialize_values(self.dataframe)

    for attribute_i in range(self.dataframe.shape[1]):
        rf = RawFeature(self.dataframe.columns[attribute_i], attribute_i, {})
        rf.derive_properties(self.dataframe[self.dataframe.columns[attribute_i]].values)
        self.raw_features.append(rf)

    return self.raw_features
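# Minimal stand-in (for illustration only, an assumption about the interface) of the
# Splitter object the reader above relies on: get_splitted_ids() chooses the row ids of
# the train/valid/test partitions, and the two materialize_* methods return the
# corresponding (train, valid, test) triples. The real Splitter in
# fastsklearnfeature.splitting.Splitter may differ in details such as stratification.
import numpy as np


class SimpleSplitter:
    def __init__(self, train_fraction=0.6, valid_fraction=0.2, seed=42):
        self.train_fraction = train_fraction
        self.valid_fraction = valid_fraction
        self.seed = seed

    def get_splitted_ids(self, dataframe, target_values):
        # Shuffle the row ids once and cut them into three consecutive blocks.
        ids = np.random.RandomState(self.seed).permutation(len(dataframe))
        n_train = int(len(ids) * self.train_fraction)
        n_valid = int(len(ids) * self.valid_fraction)
        self.train_ids = ids[:n_train]
        self.valid_ids = ids[n_train:n_train + n_valid]
        self.test_ids = ids[n_train + n_valid:]

    def materialize_target(self, target_values):
        return (target_values[self.train_ids],
                target_values[self.valid_ids],
                target_values[self.test_ids])

    def materialize_values(self, dataframe):
        values = dataframe.values
        return (values[self.train_ids],
                values[self.valid_ids],
                values[self.test_ids])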
def read(self) -> List[RawFeature]:
    self.dataframe = pd.DataFrame(data=self.X_train, columns=self.feature_names)

    if self.feature_is_categorical is not None:
        # Categorical flags were provided: convert every non-categorical column to numeric.
        for feature_id in range(len(self.feature_is_categorical)):
            if not self.feature_is_categorical[feature_id]:
                self.dataframe[self.dataframe.columns[feature_id]] = pd.to_numeric(
                    self.dataframe[self.dataframe.columns[feature_id]])
    else:
        # No flags: try to convert each column and keep it unchanged when conversion fails.
        for feature_id in range(len(self.dataframe.columns)):
            try:
                self.dataframe[self.dataframe.columns[feature_id]] = pd.to_numeric(
                    self.dataframe[self.dataframe.columns[feature_id]])
            except (ValueError, TypeError):
                pass

    self.splitted_values = {}
    self.splitted_target = {}
    self.splitted_target['train'] = self.y_train
    self.splitted_target['test'] = []
    self.splitted_values['train'] = self.X_train
    self.splitted_values['test'] = []

    for attribute_i in range(self.dataframe.shape[1]):
        feature_name = 'Feature' + str(self.dataframe.columns[attribute_i])
        if self.feature_names is not None:
            feature_name = self.feature_names[attribute_i]
        rf = RawFeature(feature_name, attribute_i, {})
        rf.derive_properties(self.dataframe[self.dataframe.columns[attribute_i]].values)
        if not rf.is_numeric():
            rf.properties['categorical'] = True
        if self.feature_is_categorical is not None:
            rf.properties['categorical'] = self.feature_is_categorical[attribute_i]
        self.raw_features.append(rf)

    print(self.raw_features)
    return self.raw_features
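# Small, self-contained illustration of the per-column coercion used above: pd.to_numeric
# succeeds for columns holding numeric strings and raises for genuinely categorical ones,
# which is why the reader falls back to leaving those columns untouched. The example
# frame below is made up.
import pandas as pd

frame = pd.DataFrame({'age': ['23', '45', '31'], 'city': ['Berlin', 'Paris', 'Rome']})
for column in frame.columns:
    try:
        frame[column] = pd.to_numeric(frame[column])
    except (ValueError, TypeError):
        pass  # keep non-numeric columns as strings / categories
print(frame.dtypes)  # age -> int64, city -> object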
import numpy as np
from fastsklearnfeature.reader.Reader import Reader
from fastsklearnfeature.splitting.Splitter import Splitter
from fastsklearnfeature.configuration.Config import Config
from fastsklearnfeature.transformations.GroupByThenTransformation import GroupByThenTransformation
from fastsklearnfeature.candidates.CandidateFeature import CandidateFeature
from fastsklearnfeature.candidates.RawFeature import RawFeature

f0 = RawFeature('col0', 0, {})
f1 = RawFeature('col1', 1, {})

training = np.array([[6, 1],
                     [5, 1],
                     [4, 2],
                     [3, 2]])

print(training[0, 1])
print(training.shape)

c = CandidateFeature(GroupByThenTransformation(np.sum, 2), [f0, f1])
c.fit(training)
print(c.transform(training))

'''
raw_features[1].fit(training)
print(raw_features[1].transform(training))
raw_features[0].fit(training)
print(raw_features[0].transform(training))
'''
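# Conceptual sketch (an assumption, not the library's exact semantics) of what a
# group-by-then-sum over the two toy columns computes: aggregate one column within each
# group of the other and broadcast the aggregate back to every row of the group. The
# column roles and output shape of GroupByThenTransformation may differ from this.
import numpy as np
import pandas as pd

training = np.array([[6, 1], [5, 1], [4, 2], [3, 2]])
frame = pd.DataFrame(training, columns=['col0', 'col1'])
print(frame.groupby('col1')['col0'].transform('sum').values)  # [11 11 7 7]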