import os

import numpy as np
import pandas as pd

# Repository-internal helpers (TRAIN, download_unzip, the split/normalise
# utilities, ...) are assumed to be importable from elsewhere in the package;
# hedged sketches of the undefined ones are interspersed below.


def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip'
    download_unzip(url, dataset_path)
    dataset_path = os.path.join(dataset_path, 'Dataset')

    # The 5th variant has the most data
    train_path = os.path.join(dataset_path, 'Training', 'Features_Variant_5.csv')
    test_path = os.path.join(dataset_path, 'Testing', 'Features_TestSet.csv')
    df_train_valid = pd.read_csv(train_path, header=None)
    df_test = pd.read_csv(test_path, header=None)
    y_columns = df_train_valid.columns[-1:]

    # The page ID is not given as a column, but rows from the same page share
    # the page-level feature columns, so an ID can be derived by grouping on
    # them. No page may appear in both the training and the validation set
    # (see the split_df_on_column sketch below).
    page_columns = list(range(29))
    for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
        df_train_valid.loc[df_group.index, 'page_id'] = i
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'page_id')
    df_train.drop(columns='page_id', inplace=True)
    df_valid.drop(columns='page_id', inplace=True)

    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
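
# split_df_on_column is used by several loaders here but not defined in this
# file. A minimal sketch of the assumed behaviour: randomly partition the
# unique values of `column` according to `fractions`, so that all rows sharing
# a value (e.g. one page) land in exactly one partition. The internals are an
# assumption, not the repository's actual implementation.
def split_df_on_column(df, fractions, column):
    values = df[column].unique()
    np.random.shuffle(values)
    # Cut points over the shuffled unique values, one per requested fraction
    bounds = (np.cumsum(fractions) * len(values)).astype(int)
    parts, start = [], 0
    for end in bounds:
        parts.append(df[df[column].isin(values[start:end])])
        start = end
    return parts
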

def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name = 'blogData_train.csv'
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip'
    download_unzip(url, dataset_path)

    # Iterate over all test csv files and concatenate them into one DataFrame
    test_dfs = []
    for fn in os.listdir(dataset_path):
        if 'blogData_test' not in fn:
            continue
        file_path = os.path.join(dataset_path, fn)
        test_dfs.append(pd.read_csv(file_path, header=None))
    df_test = pd.concat(test_dfs)

    file_path = os.path.join(dataset_path, file_name)
    df_train_valid = pd.read_csv(file_path, header=None)
    y_columns = [280]
    # The target (number of comments) is heavy-tailed and can be zero, so
    # log-transform it with a small offset
    df_train_valid[y_columns[0]] = np.log(df_train_valid[y_columns[0]] + 0.01)
    df_test[y_columns[0]] = np.log(df_test[y_columns[0]] + 0.01)

    # As above: derive a page ID from the page-level feature columns so that
    # no blog appears in both the training and the validation set
    page_columns = list(range(50))
    for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
        df_train_valid.loc[df_group.index, 'page_id'] = i
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'page_id')
    df_train.drop(columns='page_id', inplace=True)
    df_valid.drop(columns='page_id', inplace=True)

    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
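
# normalize_df_ is repo-internal; the trailing underscore suggests in-place
# mutation. A plausible sketch, assuming z-score normalisation with statistics
# computed on the first frame only, so nothing leaks from validation or test
# data into the scaling:
def normalize_df_(df, other_dfs=(), skip_column=None):
    columns = [c for c in df.columns if c != skip_column]
    mean = df[columns].mean()
    std = df[columns].std().replace(0, 1)  # Guard against constant columns
    df[columns] = (df[columns] - mean) / std
    for other in other_dfs:
        other[columns] = (other[columns] - mean) / std
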

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'train.csv')
    df = pd.read_csv(file_path)
    y_columns = ['critical_temp']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
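
# split_normalize_sequence bundles the split/normalise/select steps that the
# earlier loaders spell out by hand. Its real definition lives elsewhere in
# the repository; this sketch assumes a uniformly random row split, with the
# test fraction guessed to equal validation_size, and ignores type_ (which
# presumably distinguishes regression from classification targets).
def split_normalize_sequence(df, y_columns, validation_size, split, type_):
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)  # Shuffle rows
    n_valid = int(len(df) * validation_size)
    n_test = n_valid  # Assumption; the real helper may size the test set differently
    df_test = df.iloc[:n_test].copy()
    df_valid = df.iloc[n_test:n_test + n_valid].copy()
    df_train = df.iloc[n_test + n_valid:].copy()
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    return xy_split(df_res, y_columns)
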

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'CCPP', 'Folds5x2_pp.xlsx')
    df = pd.read_excel(file_path)
    y_columns = ['PE']  # Net hourly electrical energy output, the dataset's stated target
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'bank-additional', 'bank-additional-full.csv')
    df = pd.read_csv(file_path, sep=';')
    y_columns = ['y']
    # Categorical features become one-hot columns; the binary target ('yes'/'no')
    # is label-encoded to integers (see the sketches below)
    one_hot_encode_df_(df, skip_columns=y_columns)
    label_encode_df_(df, y_columns[0])
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
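
# one_hot_encode_df_ and label_encode_df_ are also repo helpers; minimal
# sketches under the obvious assumptions. One-hot encoding expands every
# object-dtype column in place via pd.get_dummies; label encoding maps the
# target's categories to integer codes, sharing one mapping across frames.
def one_hot_encode_df_(df, skip_columns=()):
    for column in [c for c in df.columns if c not in skip_columns]:
        if df[column].dtype == object:
            dummies = pd.get_dummies(df[column], prefix=column)
            df.drop(columns=column, inplace=True)
            df[dummies.columns] = dummies


def label_encode_df_(dfs, y_column):
    # Accepts a single frame or a list of frames (cf. the Avila loader below)
    if not isinstance(dfs, list):
        dfs = [dfs]
    categories = sorted(set().union(*(df[y_column] for df in dfs)))
    mapping = {category: code for code, category in enumerate(categories)}
    for df in dfs:
        df[y_column] = df[y_column].map(mapping)
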

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'OnlineNewsPopularity', 'OnlineNewsPopularity.csv')
    df = pd.read_csv(file_path)
    # Column names in this csv contain leading spaces, e.g. ' shares'
    df.drop(columns=['url', ' timedelta'], inplace=True)
    y_columns = [' shares']
    df[y_columns[0]] = np.log(df[y_columns[0]])  # Share counts are heavy-tailed
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip'
    download_unzip(url, dataset_path)
    file_name = 'slice_localization_data.csv'
    file_path = os.path.join(dataset_path, file_name)
    df = pd.read_csv(file_path)

    # No patient may appear in both the train and the test set; patients with
    # ID >= 80 form the test set. The .copy() calls keep pandas from warning
    # about (and silently ignoring) writes to a view.
    df_train_valid = df.loc[df.patientId < 80, :].copy()
    df_test = df.loc[df.patientId >= 80, :].copy()
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'patientId')
    y_columns = ['reference']
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    df_res = df_res.drop(columns='patientId')
    self.x, self.y = xy_split(df_res, y_columns)
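
# get_split and xy_split are used by every loader here but defined elsewhere.
# Plausible sketches, assuming VALID and TEST constants exist alongside TRAIN
# and that features and targets come back as float32 NumPy arrays:
def get_split(df_train, df_valid, df_test, split):
    return {TRAIN: df_train, VALID: df_valid, TEST: df_test}[split]


def xy_split(df, y_columns):
    x = df.drop(columns=y_columns).to_numpy(dtype=np.float32)
    y = df[y_columns].to_numpy(dtype=np.float32)
    return x, y
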

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip'
    download_unzip(url, dataset_path)
    file_path_train = os.path.join(dataset_path, 'avila', 'avila-tr.txt')
    file_path_test = os.path.join(dataset_path, 'avila', 'avila-ts.txt')
    df_train_valid = pd.read_csv(file_path_train, header=None)
    df_test = pd.read_csv(file_path_test, header=None)
    y_columns = [10]
    label_encode_df_([df_train_valid, df_test], y_columns[0])  # Assumes the encoding is identical for train/test
    df_train, df_valid = split_classification_df(df_train_valid,
                                                 [1 - validation_size, validation_size],
                                                 y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]  # Flatten the (n, 1) label column to shape (n,)
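
# split_classification_df is presumably the stratified counterpart of
# split_df_on_column: each class keeps roughly the requested proportions in
# every partition. A minimal sketch under that assumption:
def split_classification_df(df, fractions, y_column):
    parts = [[] for _ in fractions]
    for _, df_class in df.groupby(y_column):
        df_class = df_class.sample(frac=1, random_state=0)  # Shuffle within the class
        bounds = (np.cumsum(fractions) * len(df_class)).astype(int)
        start = 0
        for part, end in zip(parts, bounds):
            part.append(df_class.iloc[start:end])
            start = end
    return [pd.concat(part) for part in parts]
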

def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'AirQualityUCI.csv'
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, filename)

    # The csv is ';'-separated and padded with completely empty rows and
    # columns; dates are given as dd/mm/yyyy. Time is parsed manually below.
    df = pd.read_csv(file_path, sep=';', parse_dates=[0], dayfirst=True)
    df.dropna(axis=0, how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)

    df.Date = (df.Date - df.Date.min()).dt.days  # Days since the first measurement, as int
    df.Time = df.Time.apply(lambda x: int(x.split('.')[0]))  # Hour of day as int
    df['C6H6(GT)'] = df['C6H6(GT)'].apply(lambda x: float(x.replace(',', '.')))  # Target as float

    # Some floats are given with ',' instead of '.' as the decimal separator
    df = df.applymap(lambda x: float(x.replace(',', '.')) if type(x) is str else x)

    df = df[df['C6H6(GT)'] != -200]  # Drop all rows with missing target values
    # -200 encodes a missing value; shift it closer to the column's other values
    df.loc[df['CO(GT)'] == -200, 'CO(GT)'] = -10

    y_columns = ['C6H6(GT)']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
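
# download_unzip is the download helper every loader starts with. A sketch of
# the assumed behaviour: fetch the archive once, extract it into dataset_path,
# and skip the download when the directory already exists.
import io
import urllib.request
import zipfile


def download_unzip(url, dataset_path):
    if os.path.isdir(dataset_path):
        return  # Already downloaded and extracted
    os.makedirs(dataset_path, exist_ok=True)
    with urllib.request.urlopen(url) as response:
        archive = zipfile.ZipFile(io.BytesIO(response.read()))
    archive.extractall(dataset_path)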