import io
import os
import urllib.request
import zipfile

import numpy as np
import pandas as pd
from tensorflow.keras import datasets

import dvrl_utils  # project utility module providing corrupt_label()


def load_image_data(data_name, dict_no, noise_rate):
  """Loads image datasets.

  Loads the CIFAR10 or CIFAR100 dataset and saves train.npz, valid.npz and
  test.npz files under the data_files directory. If noise_rate > 0.0, label
  noise is added to the training set.

  Args:
    data_name: 'cifar10' or 'cifar100'
    dict_no: dictionary with training, validation and test set sizes
    noise_rate: label corruption ratio

  Returns:
    noise_idx: indices of noisy samples
  """
  # Loads datasets
  if data_name == 'cifar10':
    (x_train, y_train), (x_test, y_test) = datasets.cifar10.load_data()
  elif data_name == 'cifar100':
    (x_train, y_train), (x_test, y_test) = datasets.cifar100.load_data()

  # Splits train, valid and test sets
  train_idx = np.random.permutation(len(x_train))
  valid_idx = train_idx[:dict_no['valid']]
  train_idx = train_idx[dict_no['valid']:(dict_no['train'] + dict_no['valid'])]
  test_idx = np.random.permutation(len(x_test))[:dict_no['test']]

  x_valid = x_train[valid_idx]
  x_train = x_train[train_idx]
  x_test = x_test[test_idx]

  y_valid = y_train[valid_idx].flatten()
  y_train = y_train[train_idx].flatten()
  y_test = y_test[test_idx].flatten()

  # Adds noise on training labels
  y_train, noise_idx = dvrl_utils.corrupt_label(y_train, noise_rate)

  # Saves data
  if not os.path.exists('data_files'):
    os.makedirs('data_files')

  np.savez_compressed('./data_files/train.npz',
                      x_train=x_train, y_train=y_train)
  np.savez_compressed('./data_files/valid.npz',
                      x_valid=x_valid, y_valid=y_valid)
  np.savez_compressed('./data_files/test.npz',
                      x_test=x_test, y_test=y_test)

  return noise_idx
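# Usage sketch (illustrative, not part of the original module): the split
# sizes and noise rate below are assumptions; any sizes that fit within
# CIFAR10's 50,000 training and 10,000 test images would work.
def _example_load_image_data():
  """Example: loads CIFAR10 with 20% label noise and reloads the saved file."""
  noise_idx = load_image_data(
      data_name='cifar10',
      dict_no={'train': 4000, 'valid': 1000, 'test': 2000},
      noise_rate=0.2)
  train_data = np.load('./data_files/train.npz')
  print('train images:', train_data['x_train'].shape)
  print('corrupted labels:', len(noise_idx))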
def load_tabular_data(data_name, dict_no, noise_rate):
  """Loads Adult Income and Blog Feedback datasets.

  Loads one of the two tabular datasets and saves train.csv, valid.csv and
  test.csv files under the data_files directory. If noise_rate > 0.0, label
  noise is added to the training set.

  UCI Adult data link: https://archive.ics.uci.edu/ml/datasets/Adult
  UCI Blog data link: https://archive.ics.uci.edu/ml/datasets/BlogFeedback

  Args:
    data_name: 'adult' or 'blog'
    dict_no: dictionary with training and validation set sizes
      (the remaining samples become the test set)
    noise_rate: label corruption ratio

  Returns:
    noise_idx: indices of noisy samples
  """
  # Loads datasets from the UCI repository
  uci_base_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/'

  # Adult Income dataset
  if data_name == 'adult':

    train_url = uci_base_url + 'adult/adult.data'
    test_url = uci_base_url + 'adult/adult.test'

    data_train = pd.read_csv(train_url, header=None)
    data_test = pd.read_csv(test_url, skiprows=1, header=None)

    df = pd.concat((data_train, data_test), axis=0)

    # Column names
    df.columns = ['Age', 'WorkClass', 'fnlwgt', 'Education', 'EducationNum',
                  'MaritalStatus', 'Occupation', 'Relationship', 'Race',
                  'Gender', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek',
                  'NativeCountry', 'Income']

    # Creates binary labels (the test split labels carry a trailing '.')
    df['Income'] = df['Income'].map({' <=50K': 0, ' >50K': 1,
                                     ' <=50K.': 0, ' >50K.': 1})

    # Changes string to float
    df.Age = df.Age.astype(float)
    df.fnlwgt = df.fnlwgt.astype(float)
    df.EducationNum = df.EducationNum.astype(float)
    # The original duplicated the EducationNum cast; HoursPerWeek is the
    # remaining numeric column it presumably intended to convert.
    df.HoursPerWeek = df.HoursPerWeek.astype(float)
    df.CapitalGain = df.CapitalGain.astype(float)
    df.CapitalLoss = df.CapitalLoss.astype(float)

    # One-hot encoding of categorical features
    df = pd.get_dummies(df, columns=['WorkClass', 'Education', 'MaritalStatus',
                                     'Occupation', 'Relationship', 'Race',
                                     'Gender', 'NativeCountry'])

    # Sets label name as Y
    df = df.rename(columns={'Income': 'Y'})
    df['Y'] = df['Y'].astype(int)

    # Resets index
    df = df.reset_index()
    df = df.drop(columns=['index'])

  # Blog Feedback dataset
  elif data_name == 'blog':

    resp = urllib.request.urlopen(uci_base_url + '00304/BlogFeedback.zip')
    zip_file = zipfile.ZipFile(io.BytesIO(resp.read()))

    # Loads train dataset
    train_file_name = 'blogData_train.csv'
    data_train = pd.read_csv(zip_file.open(train_file_name), header=None)

    # Loads test dataset (February 2012: one file per day)
    data_test = []
    for i in range(29):
      if i < 9:
        file_name = 'blogData_test-2012.02.0' + str(i + 1) + '.00_00.csv'
      else:
        file_name = 'blogData_test-2012.02.' + str(i + 1) + '.00_00.csv'

      temp_data = pd.read_csv(zip_file.open(file_name), header=None)

      if i == 0:
        data_test = temp_data
      else:
        data_test = pd.concat((data_test, temp_data), axis=0)

    # Loads test dataset (March 2012: files after the 25th use a 01_00 suffix)
    for i in range(31):
      if i < 9:
        file_name = 'blogData_test-2012.03.0' + str(i + 1) + '.00_00.csv'
      elif i < 25:
        file_name = 'blogData_test-2012.03.' + str(i + 1) + '.00_00.csv'
      else:
        file_name = 'blogData_test-2012.03.' + str(i + 1) + '.01_00.csv'

      temp_data = pd.read_csv(zip_file.open(file_name), header=None)
      data_test = pd.concat((data_test, temp_data), axis=0)

    df = pd.concat((data_train, data_test), axis=0)

    # Removes rows with missing data
    df = df.dropna()

    # Sets the label (column 280 thresholded at 0) and names it Y
    df.columns = df.columns.astype(str)
    df['280'] = 1 * (df['280'] > 0)
    df = df.rename(columns={'280': 'Y'})
    df['Y'] = df['Y'].astype(int)

    # Resets index
    df = df.reset_index()
    df = df.drop(columns=['index'])

  # Splits train, valid and test sets
  train_idx = range(len(data_train))
  train = df.loc[train_idx]

  test_idx = range(len(data_train), len(df))
  test = df.loc[test_idx]

  train_idx_final = np.random.permutation(len(train))[:dict_no['train']]

  temp_idx = np.random.permutation(len(test))
  valid_idx_final = temp_idx[:dict_no['valid']] + len(data_train)
  test_idx_final = temp_idx[dict_no['valid']:] + len(data_train)

  train = train.loc[train_idx_final]
  valid = test.loc[valid_idx_final]
  test = test.loc[test_idx_final]

  # Adds noise on training labels
  y_train = np.asarray(train['Y'])
  y_train, noise_idx = dvrl_utils.corrupt_label(y_train, noise_rate)
  train['Y'] = y_train

  # Saves data
  if not os.path.exists('data_files'):
    os.makedirs('data_files')

  train.to_csv('./data_files/train.csv', index=False)
  valid.to_csv('./data_files/valid.csv', index=False)
  test.to_csv('./data_files/test.csv', index=False)

  # Returns indices of noisy samples
  return noise_idx
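# Usage sketch (illustrative, not part of the original module): downloads the
# UCI Adult dataset, so it needs network access; the split sizes and noise
# rate below are assumptions.
def _example_load_tabular_data():
  """Example: loads Adult Income data with 10% label noise."""
  noise_idx = load_tabular_data(
      data_name='adult',
      dict_no={'train': 1000, 'valid': 400},
      noise_rate=0.1)
  train = pd.read_csv('./data_files/train.csv')
  print('train rows:', len(train))
  print('corrupted labels:', len(noise_idx))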