import os

import numpy as np
import torch
from torch.utils.data import DataLoader

# The project-specific helpers used below (PreprocessedDataset,
# ClasswiseDataset, download_gsc_data) are assumed to be importable from the
# surrounding codebase; their exact module paths depend on the checkout.


def load_datasets(self):
    """
    GSC designates specific files to be used as training, test, and
    validation data. We assume the data has already been processed using the
    pre-processing scripts here:
    https://github.com/numenta/nupic.torch/tree/master/examples/gsc
    """
    validation_dataset = PreprocessedDataset(
        cachefilepath=self.data_dir,
        basename="gsc_valid",
        qualifiers=[""],
    )
    test_dataset = PreprocessedDataset(
        cachefilepath=self.data_dir,
        basename="gsc_test_noise",
        # Noise levels are encoded as two-digit percentages, e.g. 0.05 -> "05".
        # round() avoids float truncation such as int(100 * 0.35) == 34.
        qualifiers=["{:02d}".format(int(round(100 * n)))
                    for n in self.noise_values],
    )
    train_dataset = PreprocessedDataset(
        cachefilepath=self.data_dir,
        basename="gsc_train",
        # One pre-augmented training file per epoch: gsc_train0 .. gsc_train29.
        qualifiers=range(30),
    )

    self.train_loader = DataLoader(
        train_dataset, batch_size=self.batch_size, shuffle=True
    )
    self.validation_loader = DataLoader(
        validation_dataset, batch_size=self.batch_size, shuffle=False
    )
    self.test_loader = DataLoader(
        test_dataset, batch_size=self.batch_size, shuffle=False
    )
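def train_with_augmentations(experiment, train_epoch, num_epochs):
    """
    Hedged usage sketch, not part of the original code: run one training
    pass per epoch, then advance the training dataset to its next cached
    augmentation file via ``load_next()`` (see the warning in the
    ``preprocessed_gsc`` docstring below). ``experiment`` is any object
    carrying the ``train_loader`` built above; ``train_epoch`` is a
    hypothetical callable that trains on one DataLoader.
    """
    for _ in range(num_epochs):
        train_epoch(experiment.train_loader)
        # PreprocessedDataset cycles through 30 pre-augmented copies of the
        # training set; load_next() swaps in the next one so epochs differ.
        experiment.train_loader.dataset.load_next()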
def load_datasets(self):
    """
    GSC designates specific files to be used as training, test, and
    validation data. We assume the data has already been processed using the
    pre-processing scripts here:
    https://github.com/numenta/nupic.torch/tree/master/examples/gsc
    """
    validation_dataset = ClasswiseDataset(
        cachefilepath=self.data_dir,
        basename="data_valid",
        qualifiers=[""],
    )
    self.validation_loader = DataLoader(
        validation_dataset,
        batch_size=self.batch_size,
        shuffle=False,
        drop_last=True,
    )

    # Noise-free generalization test set ("00" = 0% noise).
    self.gen_test_dataset = PreprocessedDataset(
        cachefilepath=self.test_data_dir,
        basename="gsc_test_noise",
        qualifiers=["00"],
        transform=self.subtract_label_transform(),
    )
    self.gen_test_loader = DataLoader(
        self.gen_test_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        drop_last=True,
    )

    self.train_dataset = PreprocessedDataset(
        cachefilepath=self.test_data_dir,
        basename="gsc_train",
        qualifiers=range(30),
        transform=self.subtract_label_transform(),
    )
    self.full_train_loader = DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        drop_last=True,
    )

    # One test loader per class; labels in the cached files are 1-indexed.
    self.test_loader = []
    for class_ in np.arange(12):
        test_dataset = ClasswiseDataset(
            cachefilepath=self.data_dir,
            basename="data_test_0noise",
            qualifiers=[class_ + 1],
        )
        self.test_loader.append(
            DataLoader(
                test_dataset,
                batch_size=self.batch_size,
                shuffle=False,
                drop_last=True,
            )
        )
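def evaluate_per_class(experiment, evaluate_fn):
    """
    Hedged usage sketch, not part of the original code: ``test_loader``
    above is a *list* with one DataLoader per class, so per-class metrics
    are gathered by iterating over it. ``evaluate_fn`` is a hypothetical
    callable that returns a metric for a single loader.
    """
    return [evaluate_fn(loader) for loader in experiment.test_loader]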
def preprocessed_gsc(root, train=True, qualifiers=None, download=True):
    """
    Create a train or test dataset from preprocessed GSC data, downloading
    if necessary.

    .. warning:: Be sure to call ``dataset.load_next()`` following each
       epoch of training. Otherwise, no new augmentations will be loaded,
       and the same exact samples will be reused.

    .. note:: To load the preprocessed noise dataset, use noise levels
       ``["05", "10", ..., "50"]`` as qualifiers on the test dataset. Noise
       level "00" is equivalent to the regular "test" dataset.

    .. seealso:: PreprocessedDataset

    :param root: directory to store or load downloaded data
    :param train: whether to load train or test data
    :param qualifiers: list of qualifiers, one per preprocessed file in this
        dataset. If None, ``range(30)`` is used for training and ``["00"]``
        for testing.
    :param download: whether to download the data
    """
    root = os.path.expanduser(root)
    if download:
        download_gsc_data(root)

    if train:
        basename = "gsc_train"
        if qualifiers is None:
            qualifiers = range(30)
    else:
        # Load the test/noise dataset.
        basename = "gsc_test_noise"
        if qualifiers is None:
            qualifiers = ["00"]

    return PreprocessedDataset(
        cachefilepath=root,
        basename=basename,
        qualifiers=qualifiers,
    )
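# Hedged usage sketch based on the docstring above. The root path and batch
# size are placeholders; the qualifiers follow the documented noise levels.
def _example_gsc_loaders(root="~/data/gsc", batch_size=64):
    train_dataset = preprocessed_gsc(root, train=True)
    # Noisy test set: two-digit noise percentages "05", "10", ..., "50".
    noise_dataset = preprocessed_gsc(
        root,
        train=False,
        qualifiers=["{:02d}".format(p) for p in range(5, 55, 5)],
    )
    return (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(noise_dataset, batch_size=batch_size, shuffle=False),
    )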
def process_gsc_by_class(data_dir=None):
    """
    Split the preprocessed GSC train, validation, and test files into
    per-class files under ``data_classes/``, shifting labels from 1-12
    to 0-11.
    """
    if data_dir is None:
        data_dir = "/home/ec2-user/nta/data/"
    out_dir = os.path.join(data_dir, "data_classes")
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    class_min = 1
    class_max = 12

    # Training data: gather each class across all 30 pre-augmented files.
    ranges = [np.arange(k, k + 1) for k in range(30)]
    for k in range(class_min, class_max + 1):
        data_tensor = torch.zeros(0, 1, 32, 32)
        for qualifiers in ranges:
            dataset = PreprocessedDataset(
                cachefilepath=data_dir,
                basename="gsc_train",
                qualifiers=qualifiers,
            )
            class_indices = np.where(dataset.tensors[1] == k)[0]
            if len(class_indices) > 0:
                data_tensor = torch.cat((
                    data_tensor,
                    torch.Tensor(dataset.tensors[0][class_indices, :, :, :]),
                ))
        labels_tensor = torch.Tensor(data_tensor.shape[0] * [k - 1]).long()
        out_tensor = [data_tensor, labels_tensor]
        with open(os.path.join(out_dir, "data_train_{}.npz".format(k)), "wb") as f:
            torch.save(out_tensor, f)

    # Validation data: a single file containing all classes, as expected by
    # ClasswiseDataset(basename="data_valid", qualifiers=[""]) above.
    dataset = PreprocessedDataset(
        cachefilepath=data_dir,
        basename="gsc_valid",
        qualifiers=[""],
    )
    data_tensor = torch.zeros(0, 1, 32, 32)
    labels_tensor = torch.zeros(0, dtype=torch.long)
    for k in range(class_min, class_max + 1):
        class_indices = np.where(dataset.tensors[1] == k)[0]
        if len(class_indices) > 0:
            data_tensor = torch.cat((
                data_tensor,
                torch.Tensor(dataset.tensors[0][class_indices, :, :, :]),
            ))
            labels_tensor = torch.cat((
                labels_tensor,
                torch.Tensor(len(class_indices) * [k - 1]).long(),
            ))
    out_tensor = [data_tensor, labels_tensor]
    with open(os.path.join(out_dir, "data_valid.npz"), "wb") as f:
        torch.save(out_tensor, f)

    # Test data: one file per class and per noise level.
    noise_values = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25,
                    0.3, 0.35, 0.4, 0.45, 0.5]
    for k in range(class_min, class_max + 1):
        for n in noise_values:
            data_tensor = torch.zeros(0, 1, 32, 32)
            dataset = PreprocessedDataset(
                cachefilepath=data_dir,
                basename="gsc_test_noise",
                # round() avoids float truncation, e.g. int(100 * 0.35) == 34.
                qualifiers=["{:02d}".format(int(round(100 * n)))],
            )
            class_indices = np.where(dataset.tensors[1] == k)[0]
            if len(class_indices) > 0:
                data_tensor = torch.cat((
                    data_tensor,
                    torch.Tensor(dataset.tensors[0][class_indices, :, :, :]),
                ))
            # Labels always match the number of samples actually gathered.
            labels_tensor = torch.Tensor(data_tensor.shape[0] * [k - 1]).long()
            out_tensor = [data_tensor, labels_tensor]
            if n == 0.0:
                tensor_string = "data_test_0noise{}.npz".format(k)
            else:
                tensor_string = "data_test_{}_{}.npz".format(k, n)
            with open(os.path.join(out_dir, tensor_string), "wb") as f:
                torch.save(out_tensor, f)
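# Hedged usage sketch: the per-class split is a one-off preprocessing step,
# run after the standard GSC scripts have produced the gsc_train, gsc_valid,
# and gsc_test_noise files. The path below is a placeholder.
if __name__ == "__main__":
    process_gsc_by_class(data_dir=os.path.expanduser("~/nta/data/"))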