def _prepare_cifar10_data(data_path='/home/huwenp/Dataset/CIFAR/'):
    """Download (if needed) and load CIFAR-10, returning normalized arrays.

    Args:
        data_path: directory where the CIFAR-10 archive is stored and
            extracted. Defaults to the original hard-coded location, so
            existing callers are unaffected.

    Returns:
        (train_x, train_y, test_x, test_y): images scaled to [0, 1],
        L2-normalized per sample, mean-centered per sample plus a small
        positive offset, reshaped to (N, 3, 1024); labels as arrays.
    """
    url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    file_manager.create_dirname_if_not_exist(data_path)
    file_name = os.path.basename(url)
    full_path = os.path.join(data_path, file_name)
    folder = os.path.join(data_path, 'cifar-10-batches-py')
    if not os.path.isdir(folder):
        file_manager.download(url, data_path)
        # NOTE(review): extractall on a downloaded archive is exposed to
        # tar path-traversal; consider `f.extractall(path=..., filter='data')`
        # once on Python 3.12+.
        with tarfile.open(full_path) as f:
            f.extractall(path=data_path)

    pos = 0.006  # small positive offset applied after mean-centering

    def _normalize(x):
        # Shared pipeline for train and test: scale to [0, 1],
        # L2-normalize each flattened image, mean-center per image, shift.
        x = x / 255.0
        x = x / np.linalg.norm(x, axis=1, keepdims=True)
        return x - np.expand_dims(np.mean(x, 1), 1) + pos

    train_x = []
    train_y = []
    for i in range(1, 6):
        file_path = os.path.join(folder, 'data_batch_{0:d}'.format(i))
        data_dict = file_manager.unpickle(file_path)
        train_x.append(data_dict['data'])
        train_y.append(data_dict['labels'])
    train_x = _normalize(np.concatenate(train_x))
    train_y = np.concatenate(train_y)

    data_dict = file_manager.unpickle(os.path.join(folder, 'test_batch'))
    test_y = np.array(data_dict['labels'])
    test_x = _normalize(data_dict['data'])

    # (N, 3072) -> (N, 3, 1024): CIFAR stores channel-major flattened rows.
    train_x = train_x.reshape((train_x.shape[0], 3, -1))
    test_x = test_x.reshape((test_x.shape[0], 3, -1))
    return train_x, train_y, test_x, test_y
def _prepare_imagenet_test_data(self):
    """Load the downsampled-ImageNet validation split from `self.data_path`.

    Returns:
        (test_x, test_y): images scaled to [0, 1], reshaped to
        (N, 3, 32, 32); labels as a numpy array.

    Bug fix: the original computed both arrays but never returned them,
    so every caller received None.
    """
    data_dict = file_manager.unpickle(
        os.path.join(self.data_path, 'val_data'))
    test_x = data_dict['data'] / 255.0
    test_y = np.array(data_dict['labels'])
    # Kept channel-major (N, 3, 32, 32); the NHWC transpose stays disabled,
    # matching the original commented-out `.transpose([0, 2, 3, 1])`.
    test_x = test_x.reshape((test_x.shape[0], 3, 32, 32))
    return test_x, test_y
def _make_Negative_small_data():
    """Select 'negatives_small' samples from ImageNet batches and save them.

    Reads the ten training batches from the module-level `data_path`, keeps
    only rows whose label passes `_select_pOn_data(train_y, negatives_small)`,
    and writes the images to `data_path_s` as 'imagenet_batch_<i>.pt' in
    chunks of at most 20000 rows.

    Bug fix: the original save loop ran `range(10)` and silently discarded
    any rows beyond the tenth chunk; it now saves until the selection is
    exhausted, matching the sibling `_make_other_data`.
    """
    train_x = []
    train_y = []
    for i in range(1, 11):
        file_path = os.path.join(data_path,
                                 'train_data_batch_{0:d}'.format(i))
        data_dict = file_manager.unpickle(file_path)
        train_x.append(data_dict['data'])
        train_y.append(data_dict['labels'])
    train_x = np.concatenate(train_x) / 255.0
    train_y = np.concatenate(train_y)

    # presumably a boolean/index mask over labels — verify _select_pOn_data
    label = _select_pOn_data(train_y, negatives_small)
    train_x_ = train_x[label, :]

    max_num = 20000  # rows per saved chunk
    chunk = 0
    while train_x_.shape[0] > 0:
        save_n = min(max_num, train_x_.shape[0])
        torch.save(train_x_[:save_n, :],
                   data_path_s + 'imagenet_batch_' + str(chunk) + '.pt')
        train_x_ = train_x_[save_n:, :]
        chunk += 1
def _prepare_cifar50_data():
    """Load pre-computed CIFAR feature batches stored as pickled dicts.

    Expects 'traindata_batch_0..4.pt' and 'testdata_batch_0.pt' inside
    'cifar-10-batches-py-feature'; falls back to downloading/extracting
    the stock CIFAR-10 archive when the folder is missing.

    Returns:
        (train_x, train_y, test_x, test_y): images scaled to [0, 1] and
        reshaped to (N, 3, 32, 32); labels as arrays.

    Bug fix: removed a leftover `pdb.set_trace()` breakpoint that halted
    execution on every iteration of the batch-loading loop; also dropped
    dead commented-out code.
    """
    data_path = '/home/huwenp/Dataset/CIFAR/'
    url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    file_manager.create_dirname_if_not_exist(data_path)
    file_name = os.path.basename(url)
    full_path = os.path.join(data_path, file_name)
    # NOTE(review): the stock archive extracts to 'cifar-10-batches-py',
    # not '...-feature' — the feature batches look like the product of a
    # separate step; confirm before relying on this download fallback.
    folder = os.path.join(data_path, 'cifar-10-batches-py-feature')
    if not os.path.isdir(folder):
        file_manager.download(url, data_path)
        with tarfile.open(full_path) as f:
            f.extractall(path=data_path)
    train_x = []
    train_y = []
    for i in range(0, 5):
        file_path = os.path.join(folder,
                                 'traindata_batch_{0:d}.pt'.format(i))
        data_dict = file_manager.unpickle(file_path)
        train_x.append(data_dict['data'])
        train_y.append(data_dict['labels'])
    train_x = np.concatenate(train_x) / 255.0
    train_y = np.concatenate(train_y)
    data_dict = file_manager.unpickle(
        os.path.join(folder, 'testdata_batch_0.pt'))
    test_x = data_dict['data'] / 255.0
    test_y = np.array(data_dict['labels'])
    # Channel-major layout; NHWC transpose intentionally left disabled.
    train_x = train_x.reshape((train_x.shape[0], 3, 32, 32))
    test_x = test_x.reshape((test_x.shape[0], 3, 32, 32))
    return train_x, train_y, test_x, test_y
def _prepare_imagenet_data_all(data_path_t):
    """Load all ten downsampled-ImageNet training batches plus validation.

    Args:
        data_path_t: directory containing 'train_data_batch_1..10' and
            'val_data'.

    Returns:
        (train_x, train_y, test_x, test_y): images scaled to [0, 1] and
        reshaped to (N, 3, 32, 32); labels as arrays.

    Bug fix: `data_path_t` was accepted but ignored — the body read the
    module-level `data_path` instead, so the argument had no effect. The
    parameter is now actually used.
    """
    train_x = []
    train_y = []
    for i in range(1, 11):
        file_path = os.path.join(data_path_t,
                                 'train_data_batch_{0:d}'.format(i))
        data_dict = file_manager.unpickle(file_path)
        train_x.append(data_dict['data'])
        train_y.append(data_dict['labels'])
    train_x = np.concatenate(train_x) / 255.0
    train_y = np.concatenate(train_y)
    data_dict = file_manager.unpickle(
        os.path.join(data_path_t, 'val_data'))
    test_x = data_dict['data'] / 255.0
    test_y = np.array(data_dict['labels'])
    # Channel-major layout; NHWC transpose intentionally left disabled.
    train_x = train_x.reshape((train_x.shape[0], 3, 32, 32))
    test_x = test_x.reshape((test_x.shape[0], 3, 32, 32))
    return train_x, train_y, test_x, test_y
def _make_other_data():
    """Filter the ImageNet training batches with `_select_data` and save
    the surviving samples to `data_path_ot` in chunks of at most 5000 rows.

    Each chunk is a dict {'data': ..., 'labels': ...} written via
    `torch.save` as 'imagenet_batch_<i>.pt'.
    """
    images = []
    labels = []
    for batch_idx in range(1, 11):
        batch_file = os.path.join(
            data_path, 'train_data_batch_{0:d}'.format(batch_idx))
        batch = file_manager.unpickle(batch_file)
        images.append(batch['data'])
        labels.append(batch['labels'])
    images = np.concatenate(images) / 255.0
    labels = np.concatenate(labels)

    keep = _select_data(labels)
    images = images[keep, :]
    labels = labels[keep]

    max_num = 5000  # rows per saved chunk
    chunk_idx = 0
    while True:
        if images.shape[0] > max_num:
            payload = {'data': images[:max_num, :],
                       'labels': labels[:max_num]}
            torch.save(payload,
                       data_path_ot + 'imagenet_batch_' + str(chunk_idx) + '.pt')
            images = images[max_num:, :]
            labels = labels[max_num:]
            chunk_idx += 1
        else:
            payload = {'data': images, 'labels': labels}
            torch.save(payload,
                       data_path_ot + 'imagenet_batch_' + str(chunk_idx) + '.pt')
            break