def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip'
    download_unzip(url, dataset_path)
    dataset_path = os.path.join(dataset_path, 'Dataset')
    # The 5th variant has the most data
    train_path = os.path.join(dataset_path, 'Training', 'Features_Variant_5.csv')
    test_path = os.path.join(dataset_path, 'Testing', 'Features_TestSet.csv')
    df_train_valid = pd.read_csv(train_path, header=None)
    df_test = pd.read_csv(test_path, header=None)
    y_columns = df_train_valid.columns[-1:]
    # Page ID is not included, but can be derived. Page IDs cannot be
    # in both the training and validation sets.
    page_columns = list(range(29))
    for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
        df_train_valid.loc[df_group.index, 'page_id'] = i
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'page_id')
    df_train.drop(columns='page_id', inplace=True)
    df_valid.drop(columns='page_id', inplace=True)
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
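# split_df_on_column is used here and in later loaders to keep every row
# sharing a key value (page_id, patientId) in exactly one split. A minimal
# sketch of the presumed behaviour, assuming it shuffles the unique key
# values and allocates whole key groups across the requested fractions;
# this is a hypothetical stand-in, not the repository's implementation.
import numpy as np

def split_df_on_column_sketch(df, fractions, column, seed=0):
    # Allocate whole key groups, never individual rows, to each split
    rng = np.random.default_rng(seed)
    keys = df[column].unique()
    rng.shuffle(keys)
    bounds = (np.cumsum(fractions) * len(keys)).astype(int)
    splits, start = [], 0
    for end in bounds:
        chosen = set(keys[start:end])
        splits.append(df[df[column].isin(chosen)])
        start = end
    return splits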
def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name = 'blogData_train.csv'
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip'
    download_unzip(url, dataset_path)
    # Iterate over all test csv files and concatenate them into one DataFrame
    test_dfs = []
    for fn in os.listdir(dataset_path):
        if 'blogData_test' not in fn:
            continue
        file_path = os.path.join(dataset_path, fn)
        test_dfs.append(pd.read_csv(file_path, header=None))
    df_test = pd.concat(test_dfs)
    file_path = os.path.join(dataset_path, file_name)
    df_train_valid = pd.read_csv(file_path, header=None)
    y_columns = [280]
    # Log-transform the heavily skewed comment counts; the 0.01 offset keeps log(0) defined
    df_train_valid[y_columns[0]] = np.log(df_train_valid[y_columns[0]] + 0.01)
    df_test[y_columns[0]] = np.log(df_test[y_columns[0]] + 0.01)
    # Group rows by the 50 page-level features to derive a page id;
    # no page should appear in both the training and validation sets
    page_columns = list(range(50))
    for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
        df_train_valid.loc[df_group.index, 'page_id'] = i
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'page_id')
    df_train.drop(columns='page_id', inplace=True)
    df_valid.drop(columns='page_id', inplace=True)
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
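# The BlogFeedback target (comment counts) is heavily right-skewed and
# contains many zeros, hence the log(y + 0.01) above. A small illustrative
# demo of the compression (not part of the loading pipeline):
import numpy as np

def _log_offset_demo():
    counts = np.array([0, 1, 10, 100, 1000])
    return np.log(counts + 0.01)  # approx. [-4.61, 0.01, 2.30, 4.61, 6.91]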
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'train.csv')
    df = pd.read_csv(file_path)
    y_columns = ['critical_temp']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
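# split_normalize_sequence is the shared helper used by several loaders in
# this file. Its implementation is not shown here; the sketch below is a
# guess at the contract (shuffle, carve out test and validation parts,
# z-score features on training statistics, return the requested split as
# arrays). The 0.2 test fraction, the VALID/TEST constants, and the role
# of type_ are all assumptions for illustration.
def split_normalize_sequence_sketch(df, y_columns, validation_size, split, type_=None):
    df = df.sample(frac=1, random_state=0).reset_index(drop=True)
    n = len(df)
    n_test = int(n * 0.2)  # assumed fixed test fraction
    n_valid = int((n - n_test) * validation_size)
    parts = {TEST: df.iloc[:n_test],
             VALID: df.iloc[n_test:n_test + n_valid],
             TRAIN: df.iloc[n_test + n_valid:]}
    train_x = parts[TRAIN].drop(columns=y_columns)
    mean, std = train_x.mean(), train_x.std().replace(0, 1)
    chosen = parts[split]
    x = ((chosen.drop(columns=y_columns) - mean) / std).to_numpy()
    y = chosen[y_columns].to_numpy()
    return x, y  # type_ presumably governs dtype/task handling in the real helper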
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'CCPP', 'Folds5x2_pp.xlsx')
    df = pd.read_excel(file_path)
    y_columns = ['PE']  # Not clear if this is the aim of the dataset
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'bank-additional', 'bank-additional-full.csv')
    df = pd.read_csv(file_path, sep=';')
    y_columns = ['y']
    # One-hot encode the categorical features, leaving the target out,
    # then label-encode the binary target itself
    one_hot_encode_df_(df, skip_columns=y_columns)
    label_encode_df_(df, y_columns[0])
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
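# Minimal sketches of the two encoding helpers used above, assuming
# one_hot_encode_df_ replaces each non-numeric column (except skipped
# ones) with dummy columns in place, and label_encode_df_ maps a single
# column's categories to integer codes in place. Hypothetical stand-ins,
# not the repository's implementations.
import pandas as pd

def one_hot_encode_df_sketch(df, skip_columns=()):
    for col in list(df.columns):  # snapshot, since we mutate df while iterating
        if col in skip_columns or pd.api.types.is_numeric_dtype(df[col]):
            continue
        dummies = pd.get_dummies(df[col], prefix=str(col))
        df.drop(columns=col, inplace=True)
        df[dummies.columns] = dummies

def label_encode_df_sketch(df, column):
    df[column] = df[column].astype('category').cat.codes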
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'OnlineNewsPopularity', 'OnlineNewsPopularity.csv')
    df = pd.read_csv(file_path)
    # Note: the column names in this csv carry a leading space
    df.drop(columns=['url', ' timedelta'], inplace=True)
    y_columns = [' shares']
    df[y_columns[0]] = np.log(df[y_columns[0]])  # Log-transform the skewed share counts
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip'
    download_unzip(url, dataset_path)
    file_name = 'slice_localization_data.csv'
    file_path = os.path.join(dataset_path, file_name)
    df = pd.read_csv(file_path)
    # No patient should be in both the train and test sets; deepcopy makes
    # real copies so pandas does not complain about operating on views
    df_train_valid = deepcopy(df.loc[df.patientId < 80, :])
    df_test = deepcopy(df.loc[df.patientId >= 80, :])
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'patientId')
    y_columns = ['reference']
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    df_res = df_res.drop(columns='patientId')
    self.x, self.y = xy_split(df_res, y_columns)
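# Sketches of the remaining shared helpers, assuming normalize_df_
# z-scores columns in place using df_train's statistics (applied to the
# other frames as well, optionally skipping a label column) and xy_split
# separates features from targets. Hypothetical stand-ins, not the
# repository's implementations.
def normalize_df_sketch(df_train, other_dfs=(), skip_column=None):
    columns = [c for c in df_train.columns if c != skip_column]
    mean = df_train[columns].mean()
    std = df_train[columns].std().replace(0, 1)  # guard constant columns
    for df in [df_train, *other_dfs]:
        df[columns] = (df[columns] - mean) / std

def xy_split_sketch(df, y_columns):
    x = df.drop(columns=list(y_columns)).to_numpy()
    y = df[list(y_columns)].to_numpy()
    return x, y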
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip'
    download_unzip(url, dataset_path)
    file_path_train = os.path.join(dataset_path, 'avila', 'avila-tr.txt')
    file_path_test = os.path.join(dataset_path, 'avila', 'avila-ts.txt')
    df_train_valid = pd.read_csv(file_path_train, header=None)
    df_test = pd.read_csv(file_path_test, header=None)
    y_columns = [10]
    # Assumes the label encoding will be identical for train and test
    label_encode_df_([df_train_valid, df_test], y_columns[0])
    df_train, df_valid = split_classification_df(df_train_valid, [1 - validation_size, validation_size], y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
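# split_classification_df presumably splits stratified on the label so
# that both parts keep roughly the original class proportions. A
# hypothetical sketch of that behaviour:
import numpy as np
import pandas as pd

def split_classification_df_sketch(df, fractions, label_column, seed=0):
    rng = np.random.default_rng(seed)
    first, second = [], []
    for _, group in df.groupby(label_column):
        idx = rng.permutation(len(group))
        cut = int(len(group) * fractions[0])
        first.append(group.iloc[idx[:cut]])
        second.append(group.iloc[idx[cut:]])
    return pd.concat(first), pd.concat(second)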
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'AirQualityUCI.csv'
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, filename)
    df = pd.read_csv(file_path, sep=';', parse_dates=[0, 1])
    df.dropna(axis=0, how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df.Date = (df.Date - df.Date.min()).dt.days  # Days as int
    df.Time = df.Time.apply(lambda x: int(x.split('.')[0]))  # Hours as int
    df['C6H6(GT)'] = df['C6H6(GT)'].apply(lambda x: float(x.replace(',', '.')))  # Target as float
    # Some floats are given with ',' instead of '.'
    df = df.applymap(lambda x: float(x.replace(',', '.')) if isinstance(x, str) else x)
    df = df[df['C6H6(GT)'] != -200]  # Drop all rows with missing target values
    # -200 marks a missing value; shift it closer to the other values in this column
    df.loc[df['CO(GT)'] == -200, 'CO(GT)'] = -10
    y_columns = ['C6H6(GT)']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
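# -200 is this dataset's missing-value sentinel. A small hypothetical
# helper (not part of the loader) for gauging how prevalent the sentinel
# is per column before deciding how each column should be handled:
def missing_sentinel_report(df, sentinel=-200):
    counts = (df == sentinel).sum()
    return counts[counts > 0].sort_values(ascending=False)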