Пример #1
0
            def _add_extra_fields(self,
                                  aws_access_key=None,
                                  aws_secret_key=None):
                """Fill in every dataset metadata attribute that is still ``None``.

                The training data is loaded through ``load_data`` and used to
                derive ``n_examples``, ``k_classes``, ``d_features``,
                ``majority`` and ``size_kb``. Attributes that already hold a
                value are left untouched.

                Args:
                    aws_access_key: Forwarded unchanged to ``load_data``.
                    aws_secret_key: Forwarded unchanged to ``load_data``.
                """
                data = load_data(self.name, self.train_path, aws_access_key,
                                 aws_secret_key)

                if self.n_examples is None:
                    self.n_examples = len(data)

                if self.k_classes is None:
                    self.k_classes = len(np.unique(data[self.class_column]))

                if self.d_features is None:
                    # Every column except the class column counts as one feature.
                    total_features = data.shape[1] - 1
                    for column in data.columns:
                        # The class column is the target, not a feature: it was
                        # already excluded by the "- 1" above, so its distinct
                        # labels must not be added back into the feature count.
                        if column == self.class_column:
                            continue

                        if data[column].dtype == 'object':
                            # An object column is counted as one feature per
                            # distinct value (presumably mirroring a one-hot
                            # style expansion downstream -- confirm); one of
                            # those is already included in the base count.
                            total_features += len(np.unique(data[column])) - 1

                    self.d_features = total_features

                if self.majority is None:
                    # Fraction of rows that belong to the most frequent class.
                    counts = data[self.class_column].value_counts()
                    self.majority = float(max(counts)) / float(sum(counts))

                if self.size_kb is None:
                    # Size of the data as a numpy array, in units of 1000 bytes,
                    # truncated to an integer.
                    self.size_kb = int(np.array(data).nbytes / 1000)
Пример #2
0
            def load(self, test_size=0.3, random_state=0,
                     aws_access_key=None, aws_secret_key=None):
                """Load the dataset and return a train/test pair.

                The training data is always loaded through ``load_data``. If
                ``self.test_path`` is set, the test data is loaded from it as
                well; otherwise the training data is split with
                ``train_test_split``.

                Args:
                    test_size: Passed to ``train_test_split``; only used when
                        ``self.test_path`` is not set.
                    random_state: Passed to ``train_test_split``; only used when
                        ``self.test_path`` is not set.
                    aws_access_key: Forwarded unchanged to ``load_data``.
                    aws_secret_key: Forwarded unchanged to ``load_data``.

                Returns:
                    ``(data, test_data)`` when ``self.test_path`` is set,
                    otherwise the result of ``train_test_split``.
                """
                data = load_data(self.name, self.train_path, aws_access_key, aws_secret_key)

                if not self.test_path:
                    return train_test_split(data, test_size=test_size, random_state=random_state)

                csv_suffix = '.csv'
                if self.name.endswith(csv_suffix):
                    # Rewrite only the trailing extension. str.replace would
                    # also rewrite any earlier '.csv' inside the name.
                    test_name = self.name[:-len(csv_suffix)] + '_test' + csv_suffix
                else:
                    test_name = self.name + '_test'

                test_data = load_data(test_name, self.test_path,
                                      aws_access_key, aws_secret_key)
                return data, test_data