예제 #1
0
    def read(self) -> List[RawFeature]:
        """Download an OpenML task and split its dataset into train and test.

        The task's folds are visited in an order rotated by
        ``self.rotate_test``.  For every fold only the second index array
        returned by ``get_train_test_split_indices`` is used: the first
        ``self.test_folds`` folds in the rotated order contribute to the test
        rows, all remaining folds contribute to the training rows.

        Side effects: sets ``self.task``, ``self.dataframe``,
        ``self.splitted_values`` and ``self.splitted_target`` (keys
        ``'train'`` and ``'test'``) and appends one ``RawFeature`` per
        attribute to ``self.raw_features``.

        Returns:
            ``self.raw_features`` after the new features were appended.
        """
        self.task = openml.tasks.get_task(self.task_id)
        dataset = openml.datasets.get_dataset(dataset_id=self.task.dataset_id)

        # Entry 1 of the split dimensions is used as the number of folds.
        n_folds = self.task.get_split_dimensions()[1]

        fold_order = deque(range(n_folds))
        fold_order.rotate(self.rotate_test)

        test_parts = []
        train_parts = []
        for position, fold in enumerate(fold_order):
            _, fold_indices = self.task.get_train_test_split_indices(fold=fold)
            if position < self.test_folds:
                test_parts.append(fold_indices)
            else:
                train_parts.append(fold_indices)

        # Seeding with an empty integer array keeps the result a valid (empty)
        # index array when no fold was assigned to a side.  The builtin ``int``
        # replaces the ``np.int`` alias, which no longer exists in NumPy >= 1.24.
        no_indices = np.array([], dtype=int)
        train_indices = np.asarray(
            np.concatenate([no_indices, *train_parts], axis=None), dtype=int)
        test_indices = np.asarray(
            np.concatenate([no_indices, *test_parts], axis=None), dtype=int)

        try:
            X, y, categorical_indicator, attribute_names = dataset.get_data(
                target=dataset.default_target_attribute,
                dataset_format='array')
        except Exception:
            # Best-effort fallback to the alternative ``get_data`` signature.
            # ``Exception`` (instead of a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            X, y, categorical_indicator, attribute_names = dataset.get_data(
                target=dataset.default_target_attribute,
                return_categorical_indicator=True,
                return_attribute_names=True)

        self.dataframe = pd.DataFrame(data=X, columns=attribute_names)

        self.splitted_values = {}
        self.splitted_target = {}

        self.splitted_target['train'] = y[train_indices]
        self.splitted_target['test'] = y[test_indices]
        self.splitted_values['train'] = X[train_indices]
        self.splitted_values['test'] = X[test_indices]

        for attribute_i in range(self.dataframe.shape[1]):
            column_name = self.dataframe.columns[attribute_i]
            rf = RawFeature(column_name, attribute_i, {})
            rf.derive_properties(self.dataframe[column_name].values)
            rf.properties['categorical'] = categorical_indicator[attribute_i]
            self.raw_features.append(rf)

        return self.raw_features
    def read(self):
        """Load a locally stored OpenML classification dataset and split it.

        Files are looked up below the directory configured as
        ``openml.path``: ``info.csv`` must list ``self.name`` with ``MLType``
        equal to ``'classification'``, ``data/<name>_columns.csv`` must hold a
        JSON schema and ``data/<name>.csv`` the data, including a column
        called ``'target'``.

        Side effects: sets ``self.dataframe`` (without the target column),
        ``self.target_column_id``, ``self.target_values``,
        ``self.splitted_values`` and ``self.splitted_target`` (keys
        ``'train'``, ``'valid'``, ``'test'``) and appends one ``RawFeature``
        per remaining column to ``self.raw_features``.

        Returns:
            ``self.raw_features`` after the new features were appended.

        Raises:
            AssertionError: if the dataset is not marked as classification.
        """
        openML_path = Config.get('openml.path')

        info_frame = pd.read_csv(openML_path + "/info.csv")

        assert info_frame[info_frame['name'] == self.name][
            'MLType'].values == 'classification', "it is not a classification task"

        # The schema is not used further in this method; it is still read and
        # parsed so that a missing or malformed schema file fails right here.
        # The context manager closes the handle even when reading raises.
        with open(openML_path + "/data/" + self.name + "_columns.csv",
                  mode='r') as schema_file:
            json_schema = schema_file.read()
        json.loads(json_schema)

        self.dataframe = pd.read_csv(
            openML_path + "/data/" + self.name + ".csv", )

        self.target_column_id = np.where(
            self.dataframe.columns == 'target')[0][0]

        # Separate the target column from the feature columns.
        target_column = self.dataframe.columns[self.target_column_id]
        self.target_values = self.dataframe[target_column].values
        self.dataframe.drop(target_column, axis=1, inplace=True)

        # Let the splitter decide which rows go into which partition.
        self.splitter.get_splitted_ids(self.dataframe, self.target_values)

        self.splitted_values = {}
        self.splitted_target = {}

        (self.splitted_target['train'],
         self.splitted_target['valid'],
         self.splitted_target['test']) = self.splitter.materialize_target(
             self.target_values)
        (self.splitted_values['train'],
         self.splitted_values['valid'],
         self.splitted_values['test']) = self.splitter.materialize_values(
             self.dataframe)

        for attribute_i in range(self.dataframe.shape[1]):
            column_name = self.dataframe.columns[attribute_i]
            properties = self.derive_properties(
                attribute_i, self.dataframe[column_name].values)
            self.raw_features.append(
                RawFeature(column_name, attribute_i, properties))

        return self.raw_features
예제 #3
0
파일: Reader.py 프로젝트: BigDaMa/DFS
    def read(self) -> List[RawFeature]:
        """Read ``self.file_name`` as CSV, split it and build the raw features.

        The column at position ``self.target_column_id`` is stored in
        ``self.target_values`` and removed from ``self.dataframe``.  The
        splitter then produces the ``'train'``, ``'valid'`` and ``'test'``
        entries of ``self.splitted_target`` and ``self.splitted_values``.

        Returns:
            ``self.raw_features`` with one ``RawFeature`` appended per
            remaining column.
        """
        # na_filter=False: pandas does not convert empty fields to NaN.
        self.dataframe = pd.read_csv(self.file_name, na_filter=False)

        # Pull the target column out of the frame.
        target_label = self.dataframe.columns[self.target_column_id]
        self.target_values = self.dataframe[target_label].values
        self.dataframe.drop(target_label, axis=1, inplace=True)

        # Compute the row assignment first, then materialize both views.
        self.splitter.get_splitted_ids(self.dataframe, self.target_values)

        self.splitted_values = {}
        self.splitted_target = {}

        train_y, valid_y, test_y = self.splitter.materialize_target(
            self.target_values)
        self.splitted_target.update(train=train_y, valid=valid_y, test=test_y)

        train_x, valid_x, test_x = self.splitter.materialize_values(
            self.dataframe)
        self.splitted_values.update(train=train_x, valid=valid_x, test=test_x)

        for position, column_label in enumerate(self.dataframe.columns):
            feature = RawFeature(column_label, position, {})
            feature.derive_properties(self.dataframe[column_label].values)
            self.raw_features.append(feature)

        return self.raw_features
예제 #4
0
    def read(self) -> List[RawFeature]:
        """Wrap the in-memory training data and derive one feature per column.

        ``self.X_train`` is loaded into ``self.dataframe`` using
        ``self.feature_names`` as column labels.  When
        ``self.feature_is_categorical`` is given, every column flagged as
        non-categorical is converted with ``pd.to_numeric`` (conversion errors
        propagate).  Otherwise a numeric conversion is attempted for every
        column, and columns that cannot be converted are left unchanged.

        Side effects: sets ``self.dataframe``, ``self.splitted_values`` and
        ``self.splitted_target`` (``'train'`` holds the given data, ``'test'``
        is an empty list), appends to ``self.raw_features`` and prints that
        list.

        Returns:
            ``self.raw_features`` after the new features were appended.
        """
        self.dataframe = pd.DataFrame(data=self.X_train,
                                      columns=self.feature_names)

        if self.feature_is_categorical is not None:
            for feature_id in range(len(self.feature_is_categorical)):
                if not self.feature_is_categorical[feature_id]:
                    column = self.dataframe.columns[feature_id]
                    self.dataframe[column] = pd.to_numeric(
                        self.dataframe[column])
        else:
            for column in self.dataframe.columns:
                try:
                    self.dataframe[column] = pd.to_numeric(
                        self.dataframe[column])
                except (ValueError, TypeError):
                    # Not convertible to numbers: keep the column as it is.
                    # Only conversion failures are ignored, so interrupts and
                    # unrelated errors are no longer silently swallowed.
                    pass

        self.splitted_values = {}
        self.splitted_target = {}

        self.splitted_target['train'] = self.y_train
        self.splitted_target['test'] = []
        self.splitted_values['train'] = self.X_train
        self.splitted_values['test'] = []

        for attribute_i in range(self.dataframe.shape[1]):
            column = self.dataframe.columns[attribute_i]
            if self.feature_names is not None:
                feature_name = self.feature_names[attribute_i]
            else:
                feature_name = 'Feature' + str(column)

            rf = RawFeature(feature_name, attribute_i, {})
            rf.derive_properties(self.dataframe[column].values)
            if not rf.is_numeric():
                rf.properties['categorical'] = True
            # An explicit categorical flag overrides the numeric heuristic.
            if self.feature_is_categorical is not None:
                rf.properties['categorical'] = self.feature_is_categorical[
                    attribute_i]
            self.raw_features.append(rf)

        print(self.raw_features)

        return self.raw_features
예제 #5
0
파일: test_group.py 프로젝트: BigDaMa/DFS
import numpy as np
from fastsklearnfeature.reader.Reader import Reader
from fastsklearnfeature.splitting.Splitter import Splitter
from fastsklearnfeature.configuration.Config import Config
from fastsklearnfeature.transformations.GroupByThenTransformation import GroupByThenTransformation
from fastsklearnfeature.candidates.CandidateFeature import CandidateFeature
from fastsklearnfeature.candidates.RawFeature import RawFeature

# Manual smoke test: fit a GroupByThenTransformation candidate on a tiny
# two-column matrix and print what it produces.
first_column = RawFeature('col0', 0, {})
second_column = RawFeature('col1', 1, {})

training = np.array([[6, 1], [5, 1], [4, 2], [3, 2]])

# Sanity output: a single cell and the matrix dimensions.
print(training[0, 1])

print(training.shape)

# NOTE(review): the meaning of the second constructor argument (2) is not
# visible from this script - confirm against GroupByThenTransformation.
group_sum = GroupByThenTransformation(np.sum, 2)
candidate = CandidateFeature(group_sum, [first_column, second_column])

candidate.fit(training)

print(candidate.transform(training))
# The string literal below is disabled code and is never executed.
'''
raw_features[1].fit(training)
print(raw_features[1].transform(training))

raw_features[0].fit(training)
print(raw_features[0].transform(training))
'''