def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Load the APS Failure dataset (UCI train/test CSVs).

    :param root: Root directory under which the dataset folder is created.
    :param split: Which split to expose (TRAIN / VAL / TEST).
    :param validation_size: Fraction of the training data held out for validation.
    """
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    dataset_path = os.path.join(root, self.name)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
    download_file(url_train, dataset_path, file_name_train)
    download_file(url_test, dataset_path, file_name_test)
    file_path_train = os.path.join(dataset_path, file_name_train)
    # BUG FIX: this previously joined file_name_train, so the "test" frame
    # silently re-read the training file.
    file_path_test = os.path.join(dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
    df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')

    # Cleaning each frame independently may drop different columns,
    # so verify they still agree afterwards.
    clean_na_(df_train_valid)
    clean_na_(df_test)
    if not (df_train_valid.columns == df_test.columns).all():
        raise Exception('Cleaning lead to different set of columns for train/test')

    y_columns = ['class']
    label_encode_df_([df_train_valid, df_test], y_columns[0])
    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]  # flatten the single label column for classification
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Load the Statlog Shuttle dataset.

     The UCI training file ships LZW-compressed (.Z) and is decompressed
     into ``train.csv`` on first download; the test file is plain text.

     :param root: Root directory under which the dataset folder is created.
     :param split: Which split to expose (TRAIN / VAL / TEST).
     :param validation_size: Fraction of the training data held out for validation.
     """
     dataset_path = os.path.join(root, self.name)
     url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z'
     url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst'
     file_name_train = 'train.csv'
     file_name_test = 'test.csv'
     file_path_train = os.path.join(dataset_path, file_name_train)
     file_path_test = os.path.join(dataset_path, file_name_test)
     file_name_z = 'train.z'
     fresh_download = download_file(url_train, dataset_path, file_name_z)
     if fresh_download:
         # Decompress the freshly downloaded .Z archive into train.csv.
         path_z = os.path.join(dataset_path, file_name_z)
         with open(path_z, 'rb') as f_in:
             with open(file_path_train, 'wb') as f_out:
                 f_out.write(unlzw(f_in.read()))
     # BUG FIX: this download used to run only inside the fresh_download
     # branch, so an interrupted first run (train archive present, test file
     # missing) could never recover. Download unconditionally; download_file
     # reports whether it actually fetched, which suggests it already skips
     # files that exist.
     download_file(url_test, dataset_path, file_name_test)
     df_train_valid = pd.read_csv(file_path_train, header=None, sep=' ')
     y_columns = [9]
     df_train, df_valid = split_classification_df(df_train_valid,
                                                  [1 - validation_size, validation_size],
                                                  y_columns[0])
     df_test = pd.read_csv(file_path_test, header=None, sep=' ')
     label_encode_df_([df_train, df_valid, df_test], y_columns[0])
     normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
     df_res = get_split(df_train, df_valid, df_test, split)
     self.x, self.y = xy_split(df_res, y_columns)
     self.y = self.y[:, 0]  # flatten the single label column
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Build the Adult (census income) dataset split requested by ``split``."""
        dataset_path = os.path.join(root, self.name)
        filename_train = 'data_train.csv'
        filename_test = 'data_test.csv'
        url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
        url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
        download_file(url_train, dataset_path, filename_train)
        download_file(url_test, dataset_path, filename_test)
        path_train = os.path.join(dataset_path, filename_train)
        path_test = os.path.join(dataset_path, filename_test)

        df_train_valid = pd.read_csv(path_train, header=None, skiprows=0)
        df_test = pd.read_csv(path_test, header=None, skiprows=1)

        # The test file suffixes each label with a trailing period; remove it
        # so train and test share the same label values.
        df_test[14] = df_test[14].str.rstrip('.')

        # Give test rows disjoint indices so both frames can be encoded
        # jointly and then re-separated via .loc.
        df_test.index += len(df_train_valid)
        df = pd.concat([df_train_valid, df_test])
        y_columns = df.columns[-1:]
        one_hot_encode_df_(df, skip_columns=y_columns)
        label_encode_df_(df, y_columns[0])
        df_train_valid = df.loc[df_train_valid.index]
        df_test = df.loc[df_test.index]
        df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
        normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
        self.y = self.y[:, 0]  # Flatten for classification
# Example #4
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """White wine quality dataset: download, parse and split."""
     dataset_path = os.path.join(root, self.name)
     csv_name = 'data.csv'
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
     download_file(url, dataset_path, csv_name)
     # Source file is semicolon-separated.
     df = pd.read_csv(os.path.join(dataset_path, csv_name), sep=';')
     y_columns = ['quality']
     self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Sensorless Drive Diagnosis dataset: label-encoded class in column 48."""
     dataset_path = os.path.join(root, self.name)
     data_file = 'data.csv'
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt'
     download_file(url, dataset_path, data_file)
     df = pd.read_csv(os.path.join(dataset_path, data_file), header=None, sep=' ')
     y_columns = [48]
     label_encode_df_(df, y_columns[0])
     self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Letter-recognition dataset: label-encoded class letter in column 0."""
     dataset_path = os.path.join(root, self.name)
     data_file = 'data.csv'
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
     download_file(url, dataset_path, data_file)
     df = pd.read_csv(os.path.join(dataset_path, data_file), header=None)
     y_columns = [0]
     label_encode_df_(df, y_columns[0])
     self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Default-of-credit-card-clients dataset (Excel source, 'ID' index)."""
     dataset_path = os.path.join(root, self.name)
     xls_name = 'data.xls'
     url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/00350/'
            'default%20of%20credit%20card%20clients.xls')
     download_file(url, dataset_path, xls_name)
     # skiprows=1: the first spreadsheet row is skipped (presumably a title
     # row above the real header).
     df = pd.read_excel(os.path.join(dataset_path, xls_name), skiprows=1, index_col='ID')
     y_columns = ['default payment next month']
     self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
# Example #8
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Airfoil self-noise dataset: tab-separated, target in column 5."""
        dataset_path = os.path.join(root, self.name)
        dat_name = 'airfoil_self_noise.dat'
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat'
        download_file(url, dataset_path, dat_name)
        df = pd.read_csv(os.path.join(dataset_path, dat_name), sep='\t', header=None)
        y_columns = [5]
        self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
# Example #9
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Real-estate valuation dataset (Excel, indexed by the 'No' column)."""
        dataset_path = os.path.join(root, self.name)
        xlsx_name = 'Real estate valuation data set.xlsx'
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx'
        download_file(url, dataset_path, xlsx_name)
        df = pd.read_excel(os.path.join(dataset_path, xlsx_name), index_col='No')
        y_columns = ['Y house price of unit area']
        self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
# Example #10
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Abalone dataset: one-hot encode features, fixed 20% test split."""
     dataset_path = os.path.join(root, self.name)
     data_file = 'data.csv'
     url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
     download_file(url, dataset_path, data_file)
     df = pd.read_csv(os.path.join(dataset_path, data_file), header=None)
     # Capture the target column before encoding mutates the frame.
     y_columns = df.columns[-1:]
     one_hot_encode_df_(df)
     # 20% test; the remaining 80% is divided between train and validation.
     fractions = [0.2, 0.8 - 0.8 * validation_size, 0.8 * validation_size]
     df_test, df_train, df_valid = split_df(df, fractions)
     normalize_df_(df_train, other_dfs=[df_valid, df_test])
     df_res = get_split(df_train, df_valid, df_test, split)
     self.x, self.y = xy_split(df_res, y_columns)
# Example #11
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Parkinson's telemonitoring dataset with a subject-based split."""
        dataset_path = os.path.join(root, self.name)
        csv_name = 'data.csv'
        data_path: str = os.path.join(dataset_path, csv_name)
        url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
               'parkinsons/telemonitoring/parkinsons_updrs.data')
        download_file(url, dataset_path, csv_name)
        df = pd.read_csv(data_path)
        y_columns = ['motor_UPDRS', 'total_UPDRS']

        # Subjects 1-30 form train/validation; the remaining subjects are
        # held out for test (copied so later mutation is safe).
        df_train_valid = df[df['subject#'] <= 30]
        df_test = deepcopy(df[df['subject#'] > 30])

        # Split on the subject column so one subject never straddles splits.
        df_train, df_valid = split_df_on_column(df_train_valid,
                                                [1 - validation_size, validation_size],
                                                'subject#')
        normalize_df_(df_train, other_dfs=[df_valid, df_test])
        df_res = get_split(df_train, df_valid, df_test, split)
        # The subject id is bookkeeping, not a feature.
        df_res.drop(columns='subject#', inplace=True)
        self.x, self.y = xy_split(df_res, y_columns)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Load the Statlog Satimage dataset from its UCI train/test files.

     :param root: Root directory under which the dataset folder is created.
     :param split: Which split to expose (TRAIN / VAL / TEST).
     :param validation_size: Fraction of the training data held out for validation.
     """
     dataset_path = os.path.join(root, self.name)
     file_name_train = 'train.csv'
     file_name_test = 'test.csv'
     url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
     url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
     download_file(url_train, dataset_path, file_name_train)
     download_file(url_test, dataset_path, file_name_test)
     file_path_train = os.path.join(dataset_path, file_name_train)
     file_path_test = os.path.join(dataset_path, file_name_test)
     df_train_valid = pd.read_csv(file_path_train, sep=' ', header=None)
     df_test = pd.read_csv(file_path_test, sep=' ', header=None)
     # Shift the test indices so the concatenated frame can be re-separated.
     df_test.index += len(df_train_valid)
     df = pd.concat([df_train_valid, df_test])
     y_columns = [36]
     label_encode_df_(df, y_columns[0])
     df_train_valid = df.loc[df_train_valid.index, :]
     df_test = df.loc[df_test.index, :]
     # CONSISTENCY FIX: use y_columns[0] instead of the hard-coded literal 36
     # so the label column is defined in exactly one place.
     df_train, df_valid = split_classification_df(df_train_valid,
                                                  [1 - validation_size, validation_size],
                                                  y_columns[0])
     normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
     df_res = get_split(df_train, df_valid, df_test, split)
     self.x, self.y = xy_split(df_res, y_columns)
     self.y = self.y[:, 0]  # flatten the single label column