def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Download the APS failure files and expose one split as ``self.x`` / ``self.y``.

        Args:
            root: Base directory; files are stored under ``root/<self.name>``.
            split: Selector forwarded to ``get_split`` to choose which of the
                train / validation / test frames is kept.
            validation_size: Share of the training file passed to ``split_df``
                for the validation frame (the rest becomes the train frame).

        Raises:
            Exception: If the train and test frames do not have identical
                columns after ``clean_na_`` has been applied to each.
        """
        file_name_train = 'train.csv'
        file_name_test = 'test.csv'
        dataset_path = os.path.join(root, self.name)
        url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
        url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
        download_file(url_train, dataset_path, file_name_train)
        download_file(url_test, dataset_path, file_name_test)
        file_path_train = os.path.join(dataset_path, file_name_train)
        # Must point at the test file; using the train file name here would
        # silently evaluate on training data.
        file_path_test = os.path.join(dataset_path, file_name_test)
        # The first 20 lines of each file are skipped; 'na' marks missing values
        df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
        df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')

        # TODO This is risky business since test and train might be cleaned to have different columns
        clean_na_(df_train_valid)
        clean_na_(df_test)
        # Index.equals also copes with a different *number* of columns, where an
        # element-wise `==` fails with a length-mismatch error before this
        # message can be raised.
        if not df_train_valid.columns.equals(df_test.columns):
            raise Exception('Cleaning lead to different set of columns for train/test')

        y_columns = ['class']
        label_encode_df_([df_train_valid, df_test], y_columns[0])
        df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
        # The label column is excluded from normalisation
        normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
        # Keep only the first (single) label column
        self.y = self.y[:, 0]
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Statlog shuttle files and expose one split as ``self.x`` / ``self.y``.

     The training file is fetched as a ``.Z`` archive and decompressed with
     ``unlzw`` into ``train.csv``; the test file is fetched as-is.

     Args:
         root: Base directory; files are stored under ``root/<self.name>``.
         split: Selector forwarded to ``get_split`` to choose which of the
             train / validation / test frames is kept.
         validation_size: Share of the training file passed to
             ``split_classification_df`` for the validation frame.
     """
     dataset_path = os.path.join(root, self.name)
     url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z'
     url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst'
     file_name_train = 'train.csv'
     file_name_test = 'test.csv'
     file_path_train = os.path.join(dataset_path, file_name_train)
     file_path_test = os.path.join(dataset_path, file_name_test)
     file_name_z = 'train.z'
     fresh_download = download_file(url_train, dataset_path, file_name_z)
     # Decompress after a fresh download, and also when an earlier run left
     # the archive behind without producing the CSV.
     if fresh_download or not os.path.exists(file_path_train):
         path_z = os.path.join(dataset_path, file_name_z)
         with open(path_z, 'rb') as f_in:
             decompressed = unlzw(f_in.read())
         # Open the output only once decompression succeeded, so a failure
         # does not leave an empty train.csv behind.
         with open(file_path_train, 'wb') as f_out:
             f_out.write(decompressed)
     # Requested on every run rather than only after a fresh archive download,
     # so a missing test file is recovered.
     # NOTE(review): assumes download_file skips files that already exist, as
     # its return value suggests - confirm.
     download_file(url_test, dataset_path, file_name_test)
     df_train_valid = pd.read_csv(file_path_train, header=None, sep=' ')
     y_columns = [9]
     df_train, df_valid = split_classification_df(df_train_valid,
                                                  [1 - validation_size, validation_size],
                                                  y_columns[0])
     df_test = pd.read_csv(file_path_test, header=None, sep=' ')
     label_encode_df_([df_train, df_valid, df_test], y_columns[0])
     # The label column is excluded from normalisation
     normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
     df_res = get_split(df_train, df_valid, df_test, split)
     self.x, self.y = xy_split(df_res, y_columns)
     # Keep only the first (single) label column
     self.y = self.y[:, 0]
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the Adult train/test files and store the requested split.

        Both files are concatenated so that the one-hot and label encoders see
        a single frame, then separated again by index before the
        train/validation split and normalisation.

        Args:
            root: Base directory; files are stored under ``root/<self.name>``.
            split: Selector forwarded to ``get_split``.
            validation_size: Share of the training file passed to ``split_df``
                for the validation frame.
        """
        url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
        url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
        filename_train, filename_test = 'data_train.csv', 'data_test.csv'

        dataset_path = os.path.join(root, self.name)
        download_file(url_train, dataset_path, filename_train)
        download_file(url_test, dataset_path, filename_test)

        df_train_valid = pd.read_csv(os.path.join(dataset_path, filename_train), header=None, skiprows=0)
        # The first line of the test file is skipped
        df_test = pd.read_csv(os.path.join(dataset_path, filename_test), header=None, skiprows=1)

        # Values in column 14 of the test file end with a period; strip it
        df_test[14] = df_test[14].str.rstrip('.')

        # Shift the test index past the training rows so both parts can be
        # looked up again in the combined frame.
        df_test.index = df_test.index + len(df_train_valid)
        train_valid_index, test_index = df_train_valid.index, df_test.index

        df = pd.concat([df_train_valid, df_test])
        y_columns = df.columns[-1:]
        one_hot_encode_df_(df, skip_columns=y_columns)
        label_encode_df_(df, y_columns[0])

        df_train_valid = df.loc[train_valid_index]
        df_test = df.loc[test_index]
        fractions = [1 - validation_size, validation_size]
        df_train, df_valid = split_df(df_train_valid, fractions)
        normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])

        df_res = get_split(df_train, df_valid, df_test, split)
        features, labels = xy_split(df_res, y_columns)
        self.x = features
        self.y = labels[:, 0]  # Flatten for classification
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the Facebook comment-volume archive and store the requested split.

        Training rows are grouped on their first 29 columns to derive a page id,
        and the train/validation split is made on that id.

        Args:
            root: Base directory; the archive is unpacked under ``root/<self.name>``.
            split: Selector forwarded to ``get_split``.
            validation_size: Share passed to ``split_df_on_column`` for the
                validation frame.
        """
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip'
        archive_root = os.path.join(root, self.name)
        download_unzip(url, archive_root)
        dataset_path = os.path.join(archive_root, 'Dataset')

        # The 5th variant has the most data
        df_train_valid = pd.read_csv(
            os.path.join(dataset_path, 'Training', 'Features_Variant_5.csv'), header=None)
        df_test = pd.read_csv(
            os.path.join(dataset_path, 'Testing', 'Features_TestSet.csv'), header=None)
        # Taken before the helper 'page_id' column is added, so this is the last data column
        y_columns = df_train_valid.columns[-1:]

        # Page ID is not included, but can be derived. Page IDs can not be
        # in both training and validation sets
        page_columns = list(range(29))
        grouped = df_train_valid.groupby(page_columns)
        for page_id, (_, page_rows) in enumerate(grouped):
            df_train_valid.loc[page_rows.index, 'page_id'] = page_id

        fractions = [1 - validation_size, validation_size]
        df_train, df_valid = split_df_on_column(df_train_valid, fractions, 'page_id')
        # The helper column is only needed for the split
        for part in (df_train, df_valid):
            part.drop(columns='page_id', inplace=True)

        normalize_df_(df_train, other_dfs=[df_valid, df_test])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the BlogFeedback archive and store the requested split.

        All files whose name contains ``blogData_test`` are concatenated into
        the test frame. The target (column 280) is replaced by
        ``log(target + 0.01)`` in both train and test. Training rows are
        grouped on their first 50 columns to derive a page id, and the
        train/validation split is made on that id.

        Args:
            root: Base directory; the archive is unpacked under ``root/<self.name>``.
            split: Selector forwarded to ``get_split``.
            validation_size: Share passed to ``split_df_on_column`` for the
                validation frame.
        """
        file_name = 'blogData_train.csv'
        dataset_path = os.path.join(root, self.name)
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip'
        download_unzip(url, dataset_path)

        # Concatenate all test csv files into one DataFrame. os.listdir returns
        # entries in arbitrary order, so sort the names to make the row order
        # of the test set reproducible across runs and platforms.
        test_dfs = [
            pd.read_csv(os.path.join(dataset_path, fn), header=None)
            for fn in sorted(os.listdir(dataset_path))
            if 'blogData_test' in fn
        ]
        df_test = pd.concat(test_dfs)

        file_path = os.path.join(dataset_path, file_name)
        df_train_valid = pd.read_csv(file_path, header=None)
        y_columns = [280]
        # Log-transform the target; the 0.01 offset keeps log defined for zeros
        df_train_valid[y_columns[0]] = np.log(df_train_valid[y_columns[0]] + 0.01)
        df_test[y_columns[0]] = np.log(df_test[y_columns[0]] + 0.01)

        # Derive a page id from the first 50 columns so the split can be made on it
        page_columns = list(range(50))
        for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
            df_train_valid.loc[df_group.index, 'page_id'] = i
        df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'page_id')
        # The helper column is only needed for the split
        df_train.drop(columns='page_id', inplace=True)
        df_valid.drop(columns='page_id', inplace=True)

        normalize_df_(df_train, other_dfs=[df_valid, df_test])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
示例#6
0
    def test_normalize_regression(self):
        """Check mean and spread of ``self.df`` after cleaning, one-hot encoding and normalising."""
        frame = self.df
        clean_na_(frame)
        one_hot_encode_df_(frame)
        normalize_df_(frame)

        # The average of the per-column means should be (numerically) zero
        mean_of_column_means = frame.mean().mean()
        self.assertAlmostEqual(mean_of_column_means, 0, delta=1e-9)

        # Columns with original std > 0 should now have std = 1
        age_std = frame['age'].std()
        self.assertAlmostEqual(age_std, 1, delta=1e-1)
示例#7
0
    def test_normalize_classification(self):
        """Check that normalising with ``skip_column='sex'`` leaves the label column unscaled."""
        frame = self.df
        clean_na_(frame)
        label_encode_df_(frame, 'sex')
        normalize_df_(frame, skip_column='sex')

        # Label/omitted column should have mean > 0
        label_mean = frame['sex'].mean()
        self.assertGreater(label_mean, 0)

        age = frame['age']
        # Other columns should have mean = 0
        self.assertAlmostEqual(age.mean(), 0, delta=1e-6)
        # Columns with original std > 0 should now have std = 1
        self.assertAlmostEqual(age.std(), 1, delta=1e-6)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the abalone data file and store the requested split.

     A single file is split three ways: 20% test, and the remaining 80%
     divided between train and validation according to ``validation_size``.

     Args:
         root: Base directory; the file is stored under ``root/<self.name>``.
         split: Selector forwarded to ``get_split``.
         validation_size: Share of the non-test 80% used for validation.
     """
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
     filename = 'data.csv'
     dataset_path = os.path.join(root, self.name)
     download_file(url, dataset_path, filename)

     df = pd.read_csv(os.path.join(dataset_path, filename), header=None)
     # Last column of the raw file, captured before one-hot encoding
     y_columns = df.columns[-1:]
     one_hot_encode_df_(df)

     test_share = 0.2
     train_share = 0.8 - 0.8 * validation_size
     valid_share = 0.8 * validation_size
     df_test, df_train, df_valid = split_df(df, [test_share, train_share, valid_share])

     normalize_df_(df_train, other_dfs=[df_valid, df_test])
     chosen = get_split(df_train, df_valid, df_test, split)
     self.x, self.y = xy_split(chosen, y_columns)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the CT slice-localization archive and store the requested split.

     Rows with ``patientId`` below 80 form the train/validation pool and rows
     with ``patientId`` of 80 or more form the test frame; the pool is then
     split on ``patientId``.

     Args:
         root: Base directory; the archive is unpacked under ``root/<self.name>``.
         split: Selector forwarded to ``get_split``.
         validation_size: Share passed to ``split_df_on_column`` for the
             validation frame.
     """
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip'
     file_name = 'slice_localization_data.csv'
     y_columns = ['reference']

     dataset_path = os.path.join(root, self.name)
     download_unzip(url, dataset_path)
     df = pd.read_csv(os.path.join(dataset_path, file_name))

     # No patient should be in both train and test set.
     # Deep copies are taken because pandas complains if it is a view.
     train_valid_rows = df.loc[df.patientId < 80, :]
     df_train_valid = deepcopy(train_valid_rows)
     test_rows = df.loc[df.patientId >= 80, :]
     df_test = deepcopy(test_rows)

     fractions = [1 - validation_size, validation_size]
     df_train, df_valid = split_df_on_column(df_train_valid, fractions, 'patientId')
     normalize_df_(df_train, other_dfs=[df_valid, df_test])

     chosen = get_split(df_train, df_valid, df_test, split)
     # The grouping column is not a feature
     chosen = chosen.drop(columns='patientId')
     self.x, self.y = xy_split(chosen, y_columns)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the Avila archive and store the requested split.

     Args:
         root: Base directory; the archive is unpacked under ``root/<self.name>``.
         split: Selector forwarded to ``get_split``.
         validation_size: Share passed to ``split_classification_df`` for the
             validation frame.
     """
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip'
     dataset_path = os.path.join(root, self.name)
     download_unzip(url, dataset_path)

     extracted_dir = os.path.join(dataset_path, 'avila')
     df_train_valid = pd.read_csv(os.path.join(extracted_dir, 'avila-tr.txt'), header=None)
     df_test = pd.read_csv(os.path.join(extracted_dir, 'avila-ts.txt'), header=None)

     y_columns = [10]
     label_column = y_columns[0]
     # Assumes encoding will be identical for train/test
     label_encode_df_([df_train_valid, df_test], label_column)

     fractions = [1 - validation_size, validation_size]
     df_train, df_valid = split_classification_df(df_train_valid, fractions, label_column)
     # The label column is excluded from normalisation
     normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=label_column)

     chosen = get_split(df_train, df_valid, df_test, split)
     features, labels = xy_split(chosen, y_columns)
     self.x = features
     # Keep only the first (single) label column
     self.y = labels[:, 0]
示例#11
0
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the Parkinsons telemonitoring file and store the requested split.

        Rows with ``subject#`` up to 30 form the train/validation pool and rows
        with ``subject#`` above 30 form the test frame; the pool is then split
        on ``subject#``.

        Args:
            root: Base directory; the file is stored under ``root/<self.name>``.
            split: Selector forwarded to ``get_split``.
            validation_size: Share passed to ``split_df_on_column`` for the
                validation frame.
        """
        dataset_path = os.path.join(root, self.name)
        filename = 'data.csv'
        file_path: str = os.path.join(dataset_path, filename)
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' \
              'parkinsons/telemonitoring/parkinsons_updrs.data'
        download_file(url, dataset_path, filename)
        df = pd.read_csv(file_path)
        y_columns = ['motor_UPDRS', 'total_UPDRS']

        # Copy both partitions so later in-place processing never operates on
        # a slice of `df` (pandas complains when it is a view).
        df_train_valid = deepcopy(df[df['subject#'] <= 30])
        df_test = deepcopy(df[df['subject#'] > 30])

        df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'subject#')
        normalize_df_(df_train, other_dfs=[df_valid, df_test])
        df_res = get_split(df_train, df_valid, df_test, split)
        # The grouping column is not a feature
        df_res.drop(columns='subject#', inplace=True)
        self.x, self.y = xy_split(df_res, y_columns)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Fetch the Statlog satimage train/test files and store the requested split.

     Both files are concatenated so the label column (36) is encoded on a
     single frame, then separated again by index.

     Args:
         root: Base directory; files are stored under ``root/<self.name>``.
         split: Selector forwarded to ``get_split``.
         validation_size: Share passed to ``split_classification_df`` for the
             validation frame.
     """
     url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
     url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
     file_name_train, file_name_test = 'train.csv', 'test.csv'

     dataset_path = os.path.join(root, self.name)
     download_file(url_train, dataset_path, file_name_train)
     download_file(url_test, dataset_path, file_name_test)

     df_train_valid = pd.read_csv(os.path.join(dataset_path, file_name_train), sep=' ', header=None)
     df_test = pd.read_csv(os.path.join(dataset_path, file_name_test), sep=' ', header=None)

     # Shift the test index past the training rows so both parts can be
     # looked up again in the combined frame.
     df_test.index = df_test.index + len(df_train_valid)
     train_valid_index, test_index = df_train_valid.index, df_test.index

     df = pd.concat([df_train_valid, df_test])
     y_columns = [36]
     label_encode_df_(df, y_columns[0])
     df_train_valid = df.loc[train_valid_index, :]
     df_test = df.loc[test_index, :]

     fractions = [1 - validation_size, validation_size]
     df_train, df_valid = split_classification_df(df_train_valid, fractions, 36)
     # The label column is excluded from normalisation
     normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])

     chosen = get_split(df_train, df_valid, df_test, split)
     features, labels = xy_split(chosen, y_columns)
     self.x = features
     # Keep only the first (single) label column
     self.y = labels[:, 0]