def __init__(self, in_dir, healpixes, raytracing_out_dirs, aperture_size, n_data, features, stop_mean_std_early=False, n_cores=20):
    """Concatenate one CosmoDC2GraphHealpix dataset per healpix.

    Parameters
    ----------
    in_dir : str
        Input directory for the catalog data.
    healpixes : list
        Healpix IDs, one per sub-dataset.
    raytracing_out_dirs : list
        Raytracing output directory per healpix (parallel to ``healpixes``).
    aperture_size : float
        Aperture size forwarded to each sub-dataset.
    n_data : list
        Number of examples per healpix (parallel to ``healpixes``).
    features : list
        Feature names forwarded to each sub-dataset.
    stop_mean_std_early : bool, optional
        Stored flag; consumed elsewhere when computing mean/std.
    n_cores : int, optional
        Worker count forwarded to each sub-dataset.
    """
    self.stop_mean_std_early = stop_mean_std_early
    self.n_datasets = len(healpixes)
    self.n_cores = n_cores
    datasets = []
    Y_list = []
    for i in range(self.n_datasets):
        graph_hp = CosmoDC2GraphHealpix(
            healpixes[i],
            in_dir,
            raytracing_out_dirs[i],
            aperture_size,
            n_data[i],
            features,
            n_cores=self.n_cores,
        )
        datasets.append(graph_hp)
        Y_list.append(graph_hp.Y)
    # Fix: ignore_index=True already produces a fresh RangeIndex, so the
    # original's trailing .reset_index(drop=True) was a no-op and is removed.
    self.Y = pd.concat(Y_list, ignore_index=True)
    ConcatDataset.__init__(self, datasets)
    self.transform_X = None
    self.transform_Y = None
    self.transform_Y_local = None
def build_dataset(cfg, default_args=None):
    """Build a dataset from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        default_args (dict, optional): Default initialization arguments.
            Default: None.

    Returns:
        Dataset: The constructed dataset.
    """
    from .dataset_wrappers import RepeatDataset

    # A bare list/tuple of configs means "build each and concatenate".
    if isinstance(cfg, (list, tuple)):
        return ConcatDataset([build_dataset(c, default_args) for c in cfg])

    cfg_type = cfg['type']
    if cfg_type == 'ConcatDataset':
        children = [build_dataset(c, default_args) for c in cfg['datasets']]
        return ConcatDataset(children)
    if cfg_type == 'RepeatDataset':
        inner = build_dataset(cfg['dataset'], default_args)
        return RepeatDataset(inner, cfg['times'])
    # A sequence of annotation files expands one config into several datasets.
    if isinstance(cfg.get('ann_file'), (list, tuple)):
        return _concat_dataset(cfg, default_args)
    return build_from_cfg(cfg, DATASETS, default_args)
def __init__(self, corpora: List[Corpus], name: str = 'multicorpus'):
    """Merge several corpora by concatenating their train/dev/test splits."""
    self.corpora = corpora
    train = ConcatDataset([c.train for c in self.corpora])
    dev = ConcatDataset([c.dev for c in self.corpora])
    test = ConcatDataset([c.test for c in self.corpora])
    super(MultiCorpus, self).__init__(train, dev, test, name=name)
def get_face_swap_iterators(bs):
    """DepthNet + GT <-> frontal GT faces.

    Domain A is the face-swapped CelebA set; domain B concatenates the
    side ('src_GT') and frontal ('tg_GT') GT faces from both VGG and CelebA.

    Parameters
    ----------
    bs : int
        Batch size for all four loaders.

    Returns
    -------
    tuple
        (loader_train_a, loader_train_b, loader_valid_a, loader_valid_b)
    """
    filename_vgg = "data/vgg/vgg.h5"
    filename_celeba = "data/celeba/celebA.h5"
    filename_celeba_swap = "data/celeba_faceswap/celeba_faceswap.h5"

    def _domain_b(train):
        # Fix: the original wrapped the filenames in pointless '%s' % x
        # formatting; the plain variables are used directly here.
        return ConcatDataset((
            H5Dataset(filename_vgg, 'src_GT', train=train),
            H5Dataset(filename_vgg, 'tg_GT', train=train),
            H5Dataset(filename_celeba, 'src_GT', train=train),
            H5Dataset(filename_celeba, 'tg_GT', train=train),
        ))

    a_train = H5Dataset(filename_celeba_swap, 'imgs', train=True)
    b_train = _domain_b(True)
    a_valid = H5Dataset(filename_celeba_swap, 'imgs', train=False)
    b_valid = _domain_b(False)
    loader_train_a = DataLoader(a_train, batch_size=bs, shuffle=True)
    loader_train_b = DataLoader(b_train, batch_size=bs, shuffle=True)
    # NOTE(review): validation loaders also shuffle, matching the original
    # behavior — confirm this is intentional.
    loader_valid_a = DataLoader(a_valid, batch_size=bs, shuffle=True)
    loader_valid_b = DataLoader(b_valid, batch_size=bs, shuffle=True)
    return loader_train_a, loader_train_b, loader_valid_a, loader_valid_b
def get(cls, args, splits=('train', 'val', 'val_video')):
    # Builds the combined CharadesEgo + Charades datasets.
    # args.train_file / args.val_file / args.data each contain two
    # ';'-separated entries: index 0 feeds CharadesEgoMeta, index 1 feeds
    # the parent (Charades) loader.
    newargs1 = copy.deepcopy(args)
    newargs2 = copy.deepcopy(args)
    vars(newargs1).update({
        'train_file': args.train_file.split(';')[0],
        'val_file': args.val_file.split(';')[0],
        'data': args.data.split(';')[0]
    })
    vars(newargs2).update({
        'train_file': args.train_file.split(';')[1],
        'val_file': args.val_file.split(';')[1],
        'data': args.data.split(';')[1]
    })
    if 'train' in splits or 'val' in splits:
        train_datasetego, val_datasetego, _ = CharadesEgoMeta.get(
            newargs1, splits=splits)
    else:
        train_datasetego, val_datasetego = None, None
    train_dataset, val_dataset, valvideo_dataset = super(
        CharadesEgoPlusCharades, cls).get(newargs2, splits=splits)
    if 'train' in splits:
        # NOTE(review): targets of the Charades half are negated —
        # presumably to distinguish them from the ego samples downstream;
        # confirm against the training loss.
        train_dataset.target_transform = transforms.Lambda(lambda x: -x)
        # The ego dataset is repeated 3x to roughly balance dataset sizes.
        train_dataset = ConcatDataset(
            [train_dataset] + [train_datasetego] * 3)  # magic number to balance
    if 'val' in splits:
        val_dataset.target_transform = transforms.Lambda(lambda x: -x)
        val_dataset = ConcatDataset([val_dataset] + [val_datasetego] * 3)
    return train_dataset, val_dataset, valvideo_dataset
def setup(self, stage):
    """Set up every child datamodule, then pool their datasets and samplers."""
    for dm in self.dms:
        dm.setup(stage)

    self.train_dataset = ConcatDataset([dm.train_dataset for dm in self.dms])
    self.val_dataset = ConcatDataset([dm.val_dataset for dm in self.dms])
    self.test_dataset = ConcatDataset([dm.test_dataset for dm in self.dms])

    # Tokenizer and collate function come from the first child datamodule.
    first = self.dms[0]
    self.tokenizer = first.tokenizer
    self.collate = functools.partial(
        first.train_dataset.collate,
        mlm_collator=first.mlm_collator,
    )

    if self.dist:
        # Distributed: shuffle train/val shards, keep test order stable.
        self.train_sampler = DistributedSampler(self.train_dataset, shuffle=True)
        self.val_sampler = DistributedSampler(self.val_dataset, shuffle=True)
        self.test_sampler = DistributedSampler(self.test_dataset, shuffle=False)
    else:
        self.train_sampler = None
        self.val_sampler = None
        self.test_sampler = None
def get_all_sentences(self) -> Dataset:
    """Return every available split concatenated into one dataset.

    Requires at least a train split; dev and test are included when present
    alongside all earlier splits.
    """
    have_train = self.train is not None
    have_dev = self.dev is not None
    have_test = self.test is not None
    if have_train and have_dev and have_test:
        parts = [self.train, self.dev, self.test]
    elif have_train and have_dev:
        parts = [self.train, self.dev]
    elif have_train:
        parts = [self.train]
    else:
        raise RuntimeError("Shouldn't be here")
    return ConcatDataset(parts)
def setup(self, stage: Optional[str] = None) -> None: if stage == "test" or stage == "predict": raise NotImplemented train_dataset = ConcatDataset(list(map(self._get_dataset_from_path, self.train_paths))) if self.test_paths is None or len(self.test_paths) == 0: train_len = int(.95 * len(train_dataset)) train_dataset, test_dataset = random_split(train_dataset, [train_len, len(train_dataset) - train_len]) else: test_dataset = ConcatDataset(list(map(self._get_dataset_from_path, self.test_paths))) self.train_set, self.val_set = train_dataset, test_dataset
def update_dataloaders(self, time):
    """Refresh each mini-dataset's index list for ``time`` and rebuild loaders."""
    # Propagate the new time step into every mini-dataset of every split.
    for split in self.datasets:
        for mini in self.datasets[split].datasets:
            mini.update_idx_list(time)

    def _loader(split):
        members = self.datasets[split].datasets
        return DataLoader(ConcatDataset(members), **self.dataLoader_kwargs)

    self.train = _loader('train')
    self.dev = _loader('dev')
    self.test = _loader('test')
def load_data_from_csv(train_csv, val_csv, input_size, transform_config):
    """Build train/val datasets from CSV lists plus a weighted train sampler.

    Parameters
    ----------
    train_csv, val_csv : list
        CSV paths, one dataset per file.
    input_size : int
        Forwarded to create_transform.
    transform_config : dict
        Forwarded to create_transform.

    Returns
    -------
    tuple
        (train_dataset, val_dataset, train_sampler)
    """
    tsfm = create_transform(input_size, transform_config)
    train_set = [CSVDataset(csv, transform=tsfm['train']) for csv in train_csv]
    val_set = [CSVDataset(csv, transform=tsfm['val']) for csv in val_csv]
    train_dataset = ConcatDataset(train_set)
    val_dataset = ConcatDataset(val_set)
    # Idiom fix: comprehension instead of a manual append loop.
    # NOTE(review): this still iterates the whole dataset (loading each
    # sample) just to collect labels; a label-only accessor would be cheaper.
    train_labels = [label for _, label in train_dataset]
    train_sampler = weighted_sampler(train_labels)
    return train_dataset, val_dataset, train_sampler
def test(self, test_data):
    """Evaluate the discriminator on ``test_data``.

    Accumulates a confusion matrix (rows = true class, columns = predicted
    class), saves it as a heatmap, and prints the overall accuracy.
    """
    self.discriminator.eval()
    dataset = ConcatDataset(test_data)
    data_loader = DataLoader(dataset, shuffle=True, batch_size=self.batch_size)
    total = len(dataset)
    correct = 0
    confusion_matrix = torch.zeros(self.trained_class_num, self.trained_class_num).type(torch.long)
    with torch.no_grad():
        for _, (index, x, y) in enumerate(data_loader):
            if self.use_gpu:
                x, y = x.cuda(self.device_num), y.cuda(self.device_num)
            output = self.discriminator(x)
            label = self.softmax(output).argmax(dim=1)
            for n, m in zip(y.view(-1, 1), label.view(-1, 1)):
                confusion_matrix[n, m] += 1
            correct += label.eq(y).long().cpu().sum() if self.use_gpu else label.eq(y).long().sum()
    confusion_matrix = confusion_matrix.numpy()
    df_cm = pd.DataFrame(confusion_matrix,
                         index=[i for i in range(self.trained_class_num)],
                         columns=[i for i in range(self.trained_class_num)])
    # Bug fix: create the figure *before* labelling it. The original called
    # plt.xlabel/plt.ylabel first and plt.figure() afterwards, so the labels
    # were attached to a different (auto-created) figure and never saved.
    plt.figure(figsize=(7 * self.trained_class_num // 10, 5 * self.trained_class_num // 10))
    plt.xlabel('real label')
    plt.ylabel('classification result')
    sn.heatmap(df_cm, annot=True)
    plt.savefig('./confusion_matrix/e2e/' + str(self.trained_class_num) + '_heatmap.png', dpi=300)
    print("Accuracy: {}/{} ({:.2f}%)".format(correct, total, 100. * correct / total))
def build_dataset(dataset_list, dataset_catalog, is_train=True):
    """
    Arguments:
        dataset_list (list[str]): Contains the names of the datasets, i.e.,
            coco_2014_trian, coco_2014_val, etc
        dataset_catalog (DatasetCatalog): contains the information on how to
            construct a dataset.
        is_train (bool): whether to setup the dataset for training or testing
    """
    if not isinstance(dataset_list, (list, tuple)):
        raise RuntimeError(
            "dataset_list should be a list of strings, got {}".format(
                dataset_list))
    datasets = []
    for dataset_name in dataset_list:
        data = dataset_catalog.get(dataset_name)
        kwargs = data["args"]
        kwargs["is_train"] = is_train
        # Resolve the factory class by name and instantiate the dataset.
        dataset_cls = globals()[data['factory']]
        datasets.append(dataset_cls(**kwargs))
    # Testing: hand back the individual datasets as a list.
    if not is_train:
        return datasets
    # Training: merge everything into a single dataset.
    merged = ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
    return [merged]
def __init__(self, pattern="/global_index_cluster_data.npy", root_dir='../results/VAE_fashion-mnist_64_62', transform=None, list_idx=[0], dsname="fashion-mnist", num_labels=10, num_cluster=5):
    """
    Args:
        pattern (string): file name of the npy file which stores the global
            index of each cluster/subdomain as a dictionary
        root_dir (string): Directory with all the images.
        transform (callable, optional): Optional transform to be applied
            on a sample.
        list_idx (list): the list of indexes of the cluster to choose as
            trainset or testset
            for example
            trainset = VGMMDataset(list_idx = [0,1, 2, 3])
            testset = VGMMDataset(list_idx = [4])
        dsname: currently dsname is fashion-mnist, but not used at all

    NOTE(review): ``list_idx=[0]`` is a mutable default argument; it is only
    read here, never mutated, so it is safe — but ``None`` + fallback would
    be the conventional fix.
    """
    self.root_dir = root_dir
    self.pattern = pattern
    self.transform = transform
    # If the cached index file is missing, rebuild the cluster index from the
    # raw data; otherwise load the cached dictionary.
    if not tf.gfile.Exists(self.root_dir + self.pattern):
        _, self.global_index = InputDataset.concatenate_data_from_dir(self.root_dir, num_labels=num_labels, num_clusters=num_cluster)
    else:
        self.global_index = np.load(self.root_dir + pattern, allow_pickle=True)
    self.list_idx = list_idx
    all_inds = []
    print('cluster index list:' + str(list_idx))
    for index in self.list_idx:
        # iterate all **chosen** clusters/subdomains
        # self.global_index is a dictionary of
        # {'0': [15352, 2152, 21, 25, ...], '1': [1121, 1252, 3195, ...]}
        to_append = self.global_index.item().get(str(index))
        print('\n size of cluster:' + str(np.shape(to_append)) + '\n')
        all_inds = np.append(all_inds, to_append)
    print(all_inds.shape)
    self.all_inds = all_inds.tolist()
    # np.append produced floats; round back to integer indices.
    self.all_inds = [round(x) for x in self.all_inds]  # make to be integer
    # self.all_inds = map(round, self.all_inds)
    trainset_temp = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
    testset_temp = torchvision.datasets.FashionMNIST(root='./data', train=False, download=False, transform=transform)
    # Index into the concatenation of train + test so global indices span
    # both splits.
    cd = ConcatDataset((trainset_temp, testset_temp))
    self.subset = torch.utils.data.Subset(cd, self.all_inds)
def make_dataset(data, for_evaluate=False):
    """Build the training dataset from padded and unpadded spectrogram views.

    When ``for_evaluate`` is set, the dataset is repeated
    ``config.evaluate_times`` times.
    """
    shared_kwargs = dict(
        inputs=data,
        sampling_length=config.sampling_length,
        min_not_silence_length=int(config.min_not_silence_rate * config.sampling_length),
    )
    padded = SpectrogramDataset(padding_length=config.padding_length * 2, **shared_kwargs)
    unpadded = SpectrogramDataset(padding_length=0, **shared_kwargs)
    dataset = TrainDataset(
        padded_spectrogram_dataset=padded,
        spectrogram_dataset=unpadded,
        latent_size=config.latent_size,
    )
    if for_evaluate:
        dataset = ConcatDataset([dataset] * config.evaluate_times)
    return dataset
def join(self, other, info=None):
    """Concatenate this dataset with one or more others.

    Parameters
    ----------
    other : Dataset or list
        Dataset(s) appended after ``self``.
    info : optional
        Metadata for the result; defaults to this dataset's info.

    Returns
    -------
    Dataset
        A new Dataset wrapping a ConcatDataset of all parts, named
        ``concat[name1,name2,...]``.
    """
    if type(other) is not list:
        other = [other]
    datasets = [self] + other
    info = info or datasets[0].info
    # Fix: the original mixed an f-string prefix without any placeholder
    # (f"concat[") with plain concatenation; use one real f-string.
    name = f"concat[{','.join(x.name for x in datasets)}]"
    return Dataset(ConcatDataset(datasets), info, name)
def _concat_dataset(cfg, default_args=None):
    """Expand a config whose ``ann_file`` is a sequence into a ConcatDataset."""
    types = cfg['type']
    ann_files = cfg['ann_file']
    img_prefixes = cfg.get('img_prefix', None)
    dataset_infos = cfg.get('dataset_info', None)
    num_joints = cfg['data_cfg'].get('num_joints', None)
    dataset_channel = cfg['data_cfg'].get('dataset_channel', None)

    datasets = []
    for i in range(len(ann_files)):
        child = copy.deepcopy(cfg)
        child['ann_file'] = ann_files[i]
        # Fields supplied as sequences are indexed per dataset; scalar
        # fields are shared across all of them.
        if isinstance(types, (list, tuple)):
            child['type'] = types[i]
        if isinstance(img_prefixes, (list, tuple)):
            child['img_prefix'] = img_prefixes[i]
        if isinstance(dataset_infos, (list, tuple)):
            child['dataset_info'] = dataset_infos[i]
        if isinstance(num_joints, (list, tuple)):
            child['data_cfg']['num_joints'] = num_joints[i]
        if is_seq_of(dataset_channel, list):
            child['data_cfg']['dataset_channel'] = dataset_channel[i]
        datasets.append(build_dataset(child, default_args))
    return ConcatDataset(datasets)
def build_data_loader_dump(manifest_list, batch_size=32, num_workers=16, shuffle=True,
                           drop_last=True, rate_min=0.9, rate_max=1.1, n_mels=80,
                           hop_length=160, win_length=400, n_fft=512, left_frames=0,
                           right_frames=0, skip_frames=0, vocab_path='testing_vocab.model',
                           min_duration=1, max_duration=10, given_rate=None):
    """Build a DataLoaderX over one AudioSet per manifest file, concatenated."""
    audio_sets = []
    for manifest in manifest_list:
        audio_sets.append(
            AudioSet(manifest, rate_min, rate_max, n_mels, hop_length, win_length,
                     n_fft, left_frames, right_frames, skip_frames, vocab_path,
                     min_duration, max_duration, given_rate))
    merged = ConcatDataset(audio_sets)
    return DataLoaderX(merged, batch_size, shuffle, num_workers=num_workers,
                       collate_fn=CollateFnDump(), drop_last=drop_last)
def __init__(self, root, transforms=None, segments=None):
    """
    :param root: the dataset root directory
    :type root: str
    :param transforms: the transformation to perform after loading the
           frames. A typical choice is ``torchvision.transforms.Totensor()``
           followed by ``Normalize``.
    :param segments: None to concatenate all segments as if they were a
           single video; otherwise, specify the name of the segment to read;
           or a list of names to concatenate
    :type segments: Optional[Union[str, Sequence[str]]]
    """
    super(VideoSegmentDataset, self).__init__(root)
    self.transforms = transforms
    if isinstance(segments, str):
        segments = [segments]
    # None keeps every segment in the HDF5 file; otherwise keep only the
    # requested names, preserving file order.
    kept = [name for name in self.h5file
            if segments is None or name in segments]
    self.segment_data = ConcatDataset(
        [_SegmentWrapper(self.h5file.get(name)) for name in kept])
def build_dataset(cfg, stage):
    """
    Build dataset. if several datasets are defined in the dict
    cfg.*_data_loader.datasets then create ConcatDataset
    """
    assert stage in ["train", "val", "test"]
    # Map the stage to its dataloader config key (e.g. "train" -> the key of
    # the train dataloader section).
    key = LyftDataset.name_2_dataloader_key[stage]
    cfg = cfg.copy()
    dset_cfg = cfg[key]
    if "datasets" in dset_cfg:
        # Multi-dataset case: build one dataset per named sub-config.
        datasets = []
        for dset_name, params in dset_cfg.datasets.items():
            cur_cfg = cfg.copy()
            # we take only the subconfig with the corresponding name!
            # Temporarily unlock the OmegaConf struct so the per-dataset
            # params can be merged in, then lock it again.
            OmegaConf.set_struct(cur_cfg, False)
            cur_cfg[key].update(params)
            OmegaConf.set_struct(cur_cfg, True)
            if cur_cfg[key].prerendered:
                dset_class = LyftDatasetPrerendered
            else:
                dset_class = LyftDataset
            datasets.append(dset_class(dset_name, cfg_data=cur_cfg))
        # A single entry is returned as-is; several are concatenated.
        if len(datasets) > 1:
            return ConcatDataset(datasets)
        else:
            return datasets[0]
    else:
        # Single-dataset case: class choice depends on the prerendered flag.
        if dset_cfg.prerendered:
            dset_class = LyftDatasetPrerendered
        else:
            dset_class = LyftDataset
        return dset_class(dset_cfg.dset_name, cfg_data=cfg)
def load_mnist(dataset_name, shuffle=True, seed=547):
    """Load Fashion-MNIST train+test as one (images, one-hot labels) pair.

    Parameters
    ----------
    dataset_name : str
        Unused; kept for interface compatibility.
    shuffle : bool
        Shuffle images and labels in unison using ``seed``.
    seed : int
        RNG seed for the paired shuffles.

    Returns
    -------
    tuple
        (X, yy): X in [0, 1] with shape (70000, 28, 28, 1) and one-hot
        labels yy with shape (70000, 10).
    """
    trainset_temp = torchvision.datasets.FashionMNIST(root='./data', train=True,
                                                      download=True, transform=transform)
    trX = trainset_temp.data.reshape((60000, 28, 28, 1))
    trY = trainset_temp.targets
    testset_temp = torchvision.datasets.FashionMNIST(root='./data', train=False,
                                                     download=False, transform=transform)
    teX = testset_temp.data.reshape((10000, 28, 28, 1))
    teY = testset_temp.targets
    # (The original built an unused ConcatDataset here; removed.)
    X = np.concatenate((trX, teX), axis=0)
    # Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement.
    y = np.concatenate((trY, teY), axis=0).astype(int)
    # One-hot encode the labels.
    yy = np.zeros((len(y), 10))
    yy[np.arange(len(y)), y] = 1
    if shuffle:
        # Re-seeding before each shuffle keeps X and yy aligned.
        np.random.seed(seed)
        np.random.shuffle(X)
        np.random.seed(seed)
        np.random.shuffle(yy)
    return X / 255., yy
def prepare_data(self, datasets): """Deal with making data ready for consumption. Parameters --- datasets: Dict[str, List of dataset names] Returns: tuple: """ # train_datasets = {dataset_name: dataset for dataset_name, dataset in zip(datasets["order"], datasets["train"])} train_datasets = datasets_dict(datasets["train"], datasets["order"]) val_datasets = datasets_dict(datasets["test"], datasets["order"]) eval_dataset = val_datasets[self.config.testing.eval_dataset] # split into training and testing point, assumes there is no meaningful difference in dataset order eval_train_dataset = eval_dataset.new(0, self.config.testing.n_samples) eval_eval_dataset = eval_dataset.new(self.config.testing.n_samples, -1) # sample a subset so validation doesn't take too long eval_eval_dataset = eval_eval_dataset.sample(min(self.config.testing.few_shot_validation_size, len(eval_dataset))) if self.config.data.alternating_order: order, n_samples = alternating_order(train_datasets, tasks=self.config.data.alternating_tasks, n_samples_per_switch=self.config.data.alternating_n_samples_per_switch, relative_frequencies=self.config.data.alternating_relative_frequencies) else: n_samples, order = n_samples_order(self.config.learner.samples_per_task, self.config.task_order, datasets["order"]) datas = get_continuum(train_datasets, order=order, n_samples=n_samples, eval_dataset=self.config.testing.eval_dataset, merge=False) # for logging extra things self.extra_dataloader = iter(DataLoader(ConcatDataset(train_datasets.values()), batch_size=self.mini_batch_size, shuffle=True)) return datas, order, n_samples, eval_train_dataset, eval_eval_dataset, eval_dataset
def __init__(self, corpora: List[Corpus], name: str = "multicorpus", **corpusargs):
    """Merge several corpora, concatenating whichever splits each one has.

    A split that no corpus provides becomes None.
    """
    self.corpora: List[Corpus] = corpora

    def _gather(attr):
        # Collect the given split from every corpus that actually has it.
        return [getattr(c, attr) for c in self.corpora if getattr(c, attr)]

    def _concat_or_none(parts):
        return ConcatDataset(parts) if parts else None

    super(MultiCorpus, self).__init__(
        _concat_or_none(_gather("train")),
        _concat_or_none(_gather("dev")),
        _concat_or_none(_gather("test")),
        name=name,
        **corpusargs,
    )
def train_model_crossval(data_transforms, kfold_dir, train_cfg, model_cfg, optimizer_cfg, scheduler_cfg, loss_fn=nn.CrossEntropyLoss(reduction='none'), cv=True, pseudo_scheduler=None, pseudo_dir=None, target_transform=None):
    """Run K-fold cross-validated training over ImageFolder folds.

    Each subdirectory of ``kfold_dir`` is one fold; fold ``i`` serves as the
    validation set while the remaining folds are concatenated for training.
    Results for each fold are accumulated and checkpointed to 'ckpoint.pt'
    after every fold. With ``cv=False`` only the first fold is trained.

    NOTE(review): ``loss_fn``'s default is evaluated once at import time
    (shared nn.CrossEntropyLoss instance) — harmless here since it is
    stateless, but worth knowing.
    """
    kfold_datasets = []
    for k in safe_listdir(kfold_dir):
        kfold_datasets.append(datasets.ImageFolder(os.path.join(kfold_dir, k)))
    kfold_result = []
    K = len(kfold_datasets)
    for i in range(K):
        print('K_Fold CV {}/{}'.format(i + 1, K))
        print('=' * 10)
        # Fold i is held out; all other folds train.
        train_sets = kfold_datasets[:i] + kfold_datasets[i + 1:]
        for s in train_sets:
            s.transform = data_transforms['train']
            if target_transform is not None:
                s.target_transform = target_transform
        val_set = kfold_datasets[i]
        val_set.transform = data_transforms['val']
        image_datasets = {'train': ConcatDataset(train_sets), 'val': val_set}
        if pseudo_scheduler is not None:
            # Optional pseudo-labeling set, loaded with the val transform.
            pseudo_set = datasets.ImageFolder(pseudo_dir)
            pseudo_set.transform = data_transforms['val']
            image_datasets['pseudo'] = pseudo_set
        class_names, dataloaders, dataset_sizes = helper_dataloaders(
            image_datasets, train_cfg['batch_size'])
        # Fresh model/optimizer/scheduler per fold.
        model, optimizer, scheduler = \
            helper_train(model_cfg, optimizer_cfg, scheduler_cfg)
        result = \
            train_model(class_names, dataset_sizes, dataloaders, model,
                        loss_fn, optimizer, scheduler, model_cfg['device'],
                        train_cfg['num_epochs'], train_cfg['batch_per_disp'],
                        pseudo_scheduler=pseudo_scheduler)
        kfold_result.append(result)
        # Checkpoint after every fold so partial runs are recoverable.
        ckpoint = {'kfold_result': kfold_result, 'class_names': class_names}
        torch.save(ckpoint, 'ckpoint.pt')
        if not cv:
            break
    return ckpoint
def get_dataset(replay_folder, cache_folder, limit, name=None):
    """Build (or load) a ConcatDataset of TensorDatasets from replay files.

    If ``name`` is given and a cached torch file exists under
    ``cache_folder``, it is loaded directly. Otherwise replays (or their
    pickled dataframe caches) are parsed, normalized, and converted to
    tensors; the result is optionally saved back under ``name``.
    """
    if name is not None:
        name = os.path.join(cache_folder, name)
        if os.path.exists(name):
            # Fast path: previously assembled dataset.
            return torch.load(name)
    # With no replay folder, read pre-parsed .pickle caches; otherwise scan
    # for raw .replay files.
    if replay_folder is None:
        files = [(dp, f) for dp, dn, fn in os.walk(cache_folder) for f in fn if f.endswith(".pickle")]
    else:
        files = [(dp, f) for dp, dn, fn in os.walk(replay_folder) for f in fn if f.endswith(".replay")]
    file_iter = tqdm.tqdm(enumerate(files[:limit]), desc="Load", total=limit, bar_format="{l_bar}{r_bar}")
    datasets = []
    for i, (dp, f) in file_iter:
        try:
            if replay_folder is None:
                out_path = os.path.join(dp, f)
                with open(out_path, "rb") as handle:
                    dfs = pickle.load(handle)
            else:
                in_path = os.path.join(dp, f)
                # Cache parsed replays as pickles next to the cache folder
                # so subsequent runs skip replay parsing.
                out_path = os.path.join(cache_folder, f[:-7] + ".pickle")
                if os.path.exists(out_path):
                    with open(out_path, "rb") as handle:
                        dfs = pickle.load(handle)
                else:
                    dfs = replay_to_dfs(in_path)
                    with open(out_path, "wb") as handle:
                        pickle.dump(dfs, handle)
            x_n, y_n = convert_dfs(dfs, tensors=True)
            assert x_n[2].shape == x_n[3].shape
            normalize(x_n)
            # Swap team sides for every other file to balance the data.
            swap_teams(x_n, y_n, slice(i % 2, None, 2))
            datasets.append(TensorDataset(*x_n, *y_n))
            # x_s, y_s = [v.copy() for v in x_n], [v.copy() for v in y_n]
            # swap_teams(x_s, y_s)
            # arrays.append((x_s, y_s))
        except Exception as e:
            # NOTE(review): deliberately best-effort — a corrupt replay is
            # reported and skipped rather than aborting the whole load.
            print(e)
            pass
    ds = ConcatDataset(datasets)
    if name is not None:
        torch.save(ds, name)
    return ds
def tdt_split(self):
    """Split self.df into train/dev/test MiniData ConcatDatasets.

    Uses the 'split' column when the dataframe has one; otherwise cuts the
    frame positionally according to the ratios in self.split.
    """
    n_rows = self.df.shape[0]
    end_train = int(n_rows * self.split[0])
    end_dev = int(end_train + n_rows * self.split[1])

    if 'split' in self.df.columns:
        df_train = self.df[self.df['split'] == 'train']
        df_dev = self.df[self.df['split'] == 'dev']
        df_test = self.df[self.df['split'] == 'test']
    else:
        df_train = self.df[:end_train]
        df_dev = self.df[end_train:end_dev]
        df_test = self.df[end_dev:]

    minidataKwargs = {
        'stft_window': self.stft_window,
        'stft_hop': self.stft_hop,
        'n_fft': self.n_fft,
        'hop_length': self.hop_length,
        'win_length': self.win_length
    }

    def _to_dataset(frame):
        # One MiniData per wav file, concatenated into a single dataset.
        return ConcatDataset([
            MiniData(row.wav, **minidataKwargs)
            for i, row in tqdm(frame.iterrows())
        ])

    return {
        'train': _to_dataset(df_train),
        'dev': _to_dataset(df_dev),
        'test': _to_dataset(df_test)
    }
def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False) -> DataLoader:
    """Build a DataLoader over the concatenation of one KiltDataset per task.

    Parameters
    ----------
    type_path : str
        Split name ('train', 'dev', ...). For 'dev', each dataset's
        id->target map is also merged into self.devsets.
    batch_size : int
        Loader batch size.
    shuffle : bool
        Whether to shuffle the loader.
    """
    datasets = []
    for d in self.dataset_list:
        datasets.append(
            KiltDataset(self.tokenizer, self.data_dir, d, type_path,
                        self.source_length, self.target_length, self.output_dir))
    if type_path == 'dev':
        # Record dev targets for later evaluation lookups.
        for x in datasets:
            self.devsets.update(x.id_targets)
    concat_dataset = ConcatDataset(datasets)
    dataloader = DataLoader(concat_dataset, batch_size=batch_size, shuffle=shuffle,
                            collate_fn=self.collate_fn)
    # Idiom fix: len(...) instead of calling __len__ directly.
    print(type_path, dataloader.batch_size, len(concat_dataset))
    return dataloader
def __init__(self, indices, transform=None):
    """Subset of the combined (train + test) FashionMNIST at ``indices``."""
    # Concatenate both splits so indices address the full 70k examples.
    full = ConcatDataset((
        torchvision.datasets.FashionMNIST(root='./data', train=True,
                                          download=True, transform=transform),
        torchvision.datasets.FashionMNIST(root='./data', train=False,
                                          download=False, transform=transform),
    ))
    self.subset = torch.utils.data.Subset(full, indices)
def _dataset(fns, for_test=False):
    """Assemble the feature dataset for the given file stems.

    Closure over the enclosing scope's path dictionaries
    (phoneme_list_paths, f0_paths, ...), ``config``, and ``speaker_ids``.
    Test datasets disable all masking and are repeated
    ``config.test_trial_num`` times.
    """
    # Lazily-loaded inputs, one per file stem.
    inputs = [
        LazyInput(
            phoneme_list_path=phoneme_list_paths[fn],
            start_accent_list_path=start_accent_list_paths[fn],
            end_accent_list_path=end_accent_list_paths[fn],
            start_accent_phrase_list_path=start_accent_phrase_list_paths[
                fn],
            end_accent_phrase_list_path=end_accent_phrase_list_paths[fn],
            f0_path=f0_paths[fn],
            volume_path=volume_paths[fn],
            phoneme_class=phoneme_type_to_class[config.phoneme_type],
        )
        for fn in fns
    ]
    if not for_test:
        # Training: apply the configured masking augmentation.
        dataset = FeatureDataset(
            inputs=inputs,
            sampling_length=config.sampling_length,
            f0_process_mode=F0ProcessMode(config.f0_process_mode),
            phoneme_mask_max_length=config.phoneme_mask_max_length,
            phoneme_mask_num=config.phoneme_mask_num,
            accent_mask_max_length=config.accent_mask_max_length,
            accent_mask_num=config.accent_mask_num,
            f0_mask_max_length=config.f0_mask_max_length,
            f0_mask_num=config.f0_mask_num,
        )
    else:
        # Testing: all mask lengths/counts zeroed, i.e. no masking.
        dataset = FeatureDataset(
            inputs=inputs,
            sampling_length=config.sampling_length,
            f0_process_mode=F0ProcessMode(config.f0_process_mode),
            phoneme_mask_max_length=0,
            phoneme_mask_num=0,
            accent_mask_max_length=0,
            accent_mask_num=0,
            f0_mask_max_length=0,
            f0_mask_num=0,
        )
    if speaker_ids is not None:
        # Attach per-file speaker IDs when available.
        dataset = SpeakerFeatureDataset(
            dataset=dataset,
            speaker_ids=[speaker_ids[fn] for fn in fns],
        )
    dataset = TensorWrapperDataset(dataset)
    if for_test:
        # Repeat so each test item is sampled multiple times.
        dataset = ConcatDataset([dataset] * config.test_trial_num)
    return dataset
def update_modules(self, trainloader, task_id):
    """Update module weights on the current task plus replayed memory tasks.

    Runs two backward passes per batch: one with the temporary module
    visible, one with it hidden, so shared modules stay useful without it.
    Structure parameters stay frozen; only module weights train here.
    """
    self.net.freeze_modules(freeze=False)
    self.net.freeze_structure(freeze=True)
    prev_reduction = self.loss.reduction
    self.loss.reduction = 'sum'  # make sure the loss is summed over instances
    # Tag the current task's data with its task_id so batches from the
    # merged loader can be routed to the right head.
    tmp_dataset = copy.copy(trainloader.dataset)
    tmp_dataset.tensors = tmp_dataset.tensors + (torch.full(
        (len(tmp_dataset), ), task_id, dtype=int), )
    # Merge replay memory from all previous tasks with the current data.
    mega_dataset = ConcatDataset(
        [loader.dataset for loader in self.memory_loaders.values()] +
        [tmp_dataset])
    tmp_loader = next(iter(self.memory_loaders.values()))
    batch_size = tmp_loader.batch_size
    mega_loader = torch.utils.data.DataLoader(mega_dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=0,
                                              pin_memory=True)
    for X, Y, t in mega_loader:
        X = X.to(self.net.device, non_blocking=True)
        Y = Y.to(self.net.device, non_blocking=True)
        # Pass 1: loss over the batch grouped by task, with the temporary
        # module active.
        l = 0.
        n = 0
        all_t = torch.unique(t)
        for task_id_tmp in all_t:
            Y_hat = self.net(X[t == task_id_tmp], task_id=task_id_tmp)
            l += self.loss(Y_hat, Y[t == task_id_tmp])
            n += X.shape[0]
        l /= n
        self.optimizer.zero_grad()
        l.backward()
        self.optimizer.step()
        # Pass 2: same batch with the temporary module hidden, so the
        # existing modules also learn to handle the data alone.
        l = 0.
        n = 0
        self.net.hide_tmp_module()
        for task_id_tmp in all_t:
            Y_hat = self.net(X[t == task_id_tmp], task_id=task_id_tmp)
            l += self.loss(Y_hat, Y[t == task_id_tmp])
            n += X.shape[0]
        l /= n
        self.optimizer.zero_grad()
        l.backward()
        self.optimizer.step()
        self.net.recover_hidden_module()
    # Restore reduction and the freeze configuration expected by callers.
    self.loss.reduction = prev_reduction
    self.net.freeze_modules(freeze=True)
    self.net.freeze_structure(
        freeze=False, task_id=task_id)  # unfreeze only current task's structure
def test_dataset_transform_override():
    """override_dataset_transform swaps transforms (even through nested
    ConcatDatasets) inside the context and restores them afterwards."""
    # given
    # pic(v) presumably builds an image whose [0, 0] pixel equals v — the
    # expected values below rely on that. Each MemoryDataset gets its own
    # transform: x2, x3, and +10 respectively.
    data1 = MemoryDataset({
        'x': [pic(1), pic(2), pic(3)],
        'y': ['a', 'b', 'c']
    }, transform=Lambda(lambda x: np.array(x)[0, 0] * 2))
    data2 = MemoryDataset({
        'x': [pic(4), pic(5), pic(6)],
        'y': ['d', 'e', 'f']
    }, transform=Lambda(lambda x: np.array(x)[0, 0] * 3))
    data3 = MemoryDataset({
        'x': [pic(7), pic(8), pic(9)],
        'y': ['g', 'h', 'i']
    }, transform=Lambda(lambda x: np.array(x)[0, 0] + 10))
    # Nesting checks that the override recurses into inner ConcatDatasets.
    ds = ConcatDataset([data1, ConcatDataset([data2, data3])])
    # when
    x1, y1 = zip(*[ds[i] for i in range(len(ds))])
    with override_dataset_transform(ds, Lambda(lambda x: np.array(x)[0, 0])) as ds_overriden:
        x2, y2 = zip(*[ds_overriden[i] for i in range(len(ds_overriden))])
    x3, y3 = zip(*[ds[i] for i in range(len(ds))])
    # then
    # Original transforms: [1,2,3]*2, [4,5,6]*3, [7,8,9]+10.
    assert np.array_equal(x1, [2, 4, 6, 12, 15, 18, 17, 18, 19])
    # Override strips the per-dataset transforms: raw pixel values.
    assert np.array_equal(x2, [1, 2, 3, 4, 5, 6, 7, 8, 9])
    assert np.array_equal(x3, x1)  # after everything is back to normal