def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the superconductivity data and build normalized (x, y) arrays.

    The regression target is the material's critical temperature
    ('critical_temp' column of train.csv inside the archive).
    """
    data_dir = os.path.join(root, self.name)
    archive_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00464/superconduct.zip'
    download_unzip(archive_url, data_dir)
    frame = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    target_cols = ['critical_temp']
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the Combined Cycle Power Plant data and build (x, y) splits.

    Reads the Excel sheet shipped in the CCPP archive; 'PE' is used as the
    target column (NOTE(review): original author was unsure this is the
    intended target of the dataset).
    """
    data_dir = os.path.join(root, self.name)
    archive_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip'
    download_unzip(archive_url, data_dir)
    frame = pd.read_excel(os.path.join(data_dir, 'CCPP', 'Folds5x2_pp.xlsx'))
    target_cols = ['PE']
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the white wine quality data and build normalized (x, y) splits.

    The source CSV is semicolon-separated; the regression/ordinal target
    is the 'quality' column.
    """
    data_dir = os.path.join(root, self.name)
    local_name = 'data.csv'
    source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
    download_file(source_url, data_dir, local_name)
    frame = pd.read_csv(os.path.join(data_dir, local_name), sep=';')
    target_cols = ['quality']
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the Sensorless Drive Diagnosis data and build (x, y) splits.

    The file is space-separated with no header; column 48 holds the class
    label, which is label-encoded in place before splitting.
    """
    data_dir = os.path.join(root, self.name)
    local_name = 'data.csv'
    source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt'
    download_file(source_url, data_dir, local_name)
    frame = pd.read_csv(os.path.join(data_dir, local_name), header=None, sep=' ')
    target_cols = [48]
    label_encode_df_(frame, target_cols[0])
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the Letter Recognition data and build normalized (x, y) splits.

    Column 0 is the letter class; it is label-encoded in place before
    the split/normalize step.
    """
    data_dir = os.path.join(root, self.name)
    local_name = 'data.csv'
    source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
    download_file(source_url, data_dir, local_name)
    frame = pd.read_csv(os.path.join(data_dir, local_name), header=None)
    target_cols = [0]
    label_encode_df_(frame, target_cols[0])
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the credit-card default data and build normalized (x, y) splits.

    The Excel sheet has an extra header row (skipped) and an 'ID' index
    column; the binary target is 'default payment next month'.
    """
    data_dir = os.path.join(root, self.name)
    local_name = 'data.xls'
    source_url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/00350/'
                  'default%20of%20credit%20card%20clients.xls')
    download_file(source_url, data_dir, local_name)
    frame = pd.read_excel(os.path.join(data_dir, local_name), skiprows=1, index_col='ID')
    target_cols = ['default payment next month']
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the bank marketing data and build normalized (x, y) splits.

    Categorical feature columns are one-hot encoded (the target 'y' is
    skipped), then the target itself is label-encoded.
    """
    data_dir = os.path.join(root, self.name)
    archive_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip'
    download_unzip(archive_url, data_dir)
    frame = pd.read_csv(os.path.join(data_dir, 'bank-additional', 'bank-additional-full.csv'), sep=';')
    target_cols = ['y']
    one_hot_encode_df_(frame, skip_columns=target_cols)
    label_encode_df_(frame, target_cols[0])
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the airfoil self-noise data and build normalized (x, y) splits.

    The .dat file is tab-separated with no header; column 5 (scaled sound
    pressure level) is the regression target.
    """
    data_dir = os.path.join(root, self.name)
    local_name = 'airfoil_self_noise.dat'
    source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat'
    download_file(source_url, data_dir, local_name)
    frame = pd.read_csv(os.path.join(data_dir, local_name), sep='\t', header=None)
    target_cols = [5]
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the real-estate valuation data and build (x, y) splits.

    Reads the Excel sheet with 'No' as the index column; the regression
    target is 'Y house price of unit area'.
    """
    data_dir = os.path.join(root, self.name)
    local_name = 'Real estate valuation data set.xlsx'
    source_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx'
    download_file(source_url, data_dir, local_name)
    frame = pd.read_excel(os.path.join(data_dir, local_name), index_col='No')
    target_cols = ['Y house price of unit area']
    self.x, self.y = split_normalize_sequence(frame, target_cols, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the Online News Popularity data and build (x, y) splits.

    Drops the non-feature 'url' and ' timedelta' columns (note: the CSV's
    column names carry a leading space) and log-transforms the ' shares'
    target before splitting/normalizing.
    """
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00332/OnlineNewsPopularity.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, 'OnlineNewsPopularity', 'OnlineNewsPopularity.csv')
    # Fixed: stray trailing comma in the read_csv call.
    df = pd.read_csv(file_path)
    df.drop(columns=['url', ' timedelta'], inplace=True)
    y_columns = [' shares']
    # Share counts are log-transformed to use log(shares) as the target.
    df[y_columns[0]] = np.log(df[y_columns[0]])
    # Fixed: 'self. y' (stray space after the dot) -> 'self.y'.
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    """Download the UCI Air Quality data, clean it, and build (x, y) splits.

    The raw CSV is semicolon-separated, uses ',' as the decimal separator
    in some columns, and encodes missing values as -200. The regression
    target is the benzene concentration column 'C6H6(GT)'.
    """
    dataset_path = os.path.join(root, self.name)
    filename = 'AirQualityUCI.csv'
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip'
    download_unzip(url, dataset_path)
    file_path = os.path.join(dataset_path, filename)
    # Columns 0 and 1 are the Date and Time columns; parse both as dates.
    df = pd.read_csv(file_path, sep=';', parse_dates=[0, 1])
    # The raw file contains fully-empty trailing rows and columns; drop both.
    df.dropna(axis=0, how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    # Convert Date to integer days elapsed since the earliest date.
    df.Date = (df.Date - df.Date.min()).astype('timedelta64[D]')  # Days as int
    # Time strings look like 'HH.MM.SS'; keep only the hour as an int.
    df.Time = df.Time.apply(lambda x: int(x.split('.')[0]))  # Hours as int
    # The target column uses ',' as the decimal separator; convert to float.
    df['C6H6(GT)'] = df['C6H6(GT)'].apply(lambda x: float(x.replace(',', '.')))
    # Some other columns also use ',' instead of '.' for decimals; repair any
    # remaining string cells across the whole frame.
    df = df.applymap(lambda x: float(x.replace(',', '.')) if type(x) is str else x)
    # -200 is the dataset's missing-value sentinel; drop rows where the
    # target itself is missing.
    df = df[df['C6H6(GT)'] != -200]
    # For the CO(GT) feature, remap the -200 sentinel to -10 so the missing
    # marker sits closer to the column's real value range.
    df.loc[df['CO(GT)'] == -200, 'CO(GT)'] = -10
    y_columns = ['C6H6(GT)']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)