def __init__(self, data: DataBase, replacement: bool = False, num_samples: Optional[int] = None):
    super().__init__(data)
    self._sampler = torch_sampler.RandomSampler(data, replacement, num_samples)

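# A minimal sketch (not from the original codebase) of what the wrapped torch
# RandomSampler yields: without replacement it is a permutation of dataset
# indices; with replacement=True it draws `num_samples` indices, repeats allowed.
import torch.utils.data.sampler as torch_sampler

toy_data = list(range(5))  # anything with __len__ can serve as data_source
print(list(torch_sampler.RandomSampler(toy_data)))  # e.g. [3, 0, 4, 1, 2]
print(list(torch_sampler.RandomSampler(toy_data, replacement=True, num_samples=8)))  # 8 indices
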
def visualize_data(dataset, imagedir='runtime-images', filename='sample', num_samples=10):
    """Saves random samples from the given dataset as individual images."""
    random_sampler = sampler.RandomSampler(dataset, replacement=True, num_samples=num_samples)
    my_path = os.path.join(os.path.dirname(__file__), imagedir)
    os.makedirs(my_path, exist_ok=True)
    for index in random_sampler:
        image, label = dataset[index]
        plt.figure()
        num_channels = image.shape[0]
        if num_channels == 1:
            image = image.squeeze()
        else:
            # Convert CHW to HWC for matplotlib (permute preserves pixel layout).
            image = image.permute(1, 2, 0)
        # cmap is an imshow argument, not a savefig one.
        plt.imshow(image.numpy(), cmap='gray' if num_channels == 1 else None)
        plt.savefig(
            os.path.join(my_path, filename + str(index) + '.png'),
            bbox_inches='tight',
        )
        plt.close()

def generate_sampler(dataset, sampler_option="random", n_bins=5): df = dataset.df n_labels = df[dataset.label].nunique() count = np.zeros(n_labels) for idx in df.index: label = df.loc[idx, dataset.label] key = dataset.label_fn(label) count[key] += 1 weight_per_class = 1 / np.array(count) weights = [] for idx, label in enumerate(df[dataset.label].values): key = dataset.label_fn(label) weights += [weight_per_class[key]] * dataset.elem_per_image if sampler_option == "random": return sampler.RandomSampler(weights) elif sampler_option == "weighted": return sampler.WeightedRandomSampler(weights, len(weights)) else: raise NotImplementedError( f"The option {sampler_option} for sampler on classification task is not implemented" )
def make_loader(self, batch_size=16, num_workers=0, shuffle=False,
                pin_memory=False, resize_rate=10, drop_last=False):
    """
    CommandLine:
        python ~/code/netharn/examples/yolo_voc.py YoloVOCDataset.make_loader

    Example:
        >>> # DISABLE_DOCTEST
        >>> torch.random.manual_seed(0)
        >>> self = YoloVOCDataset(split='train')
        >>> self.augmenter = None
        >>> loader = self.make_loader(batch_size=1, shuffle=True)
        >>> # training batches should have multiple shapes
        >>> shapes = set()
        >>> for batch in ub.ProgIter(iter(loader), total=len(loader)):
        >>>     inputs, labels = batch
        >>>     # test to see multiscale works
        >>>     shapes.add(inputs.shape[-1])
        >>>     if len(shapes) > 1:
        >>>         break
        >>> assert len(shapes) > 1
    """
    import torch.utils.data.sampler as torch_sampler
    assert len(self) > 0, 'must have some data'
    if shuffle:
        sampler = torch_sampler.RandomSampler(self)
        resample_freq = resize_rate
    else:
        sampler = torch_sampler.SequentialSampler(self)
        resample_freq = None

    # use custom sampler that does multiscale training
    batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
        sampler, batch_size=batch_size, resample_freq=resample_freq,
        drop_last=drop_last,
    )
    loader = torch_data.DataLoader(self, batch_sampler=batch_sampler,
                                   collate_fn=nh.data.collate.padded_collate,
                                   num_workers=num_workers,
                                   pin_memory=pin_memory)
    if loader.batch_size != batch_size:
        try:
            # Hack: make the loader report the requested batch size even
            # though batching is delegated to the batch_sampler.
            loader._DataLoader__initialized = False
            loader.batch_size = batch_size
            loader._DataLoader__initialized = True
        except Exception:
            pass
    return loader

def make_loader(self, batch_size=16, num_workers=0, shuffle=False,
                pin_memory=False):
    """
    Example:
        >>> torch.random.manual_seed(0)
        >>> dset = coco_api.CocoDataset(coco_api.demo_coco_data())
        >>> self = YoloCocoDataset(dset, train=1)
        >>> loader = self.make_loader(batch_size=1)
        >>> train_iter = iter(loader)
        >>> # training batches should have multiple shapes
        >>> shapes = set()
        >>> for batch in train_iter:
        >>>     shapes.add(batch[0].shape[-1])
        >>>     if len(shapes) > 1:
        >>>         break
        >>> #assert len(shapes) > 1
        >>> vali_iter = iter(loaders['vali'])
        >>> # vali batches should have one shape
        >>> shapes = set()
        >>> for batch, _ in zip(vali_iter, [1, 2, 3, 4]):
        >>>     shapes.add(batch[0].shape[-1])
        >>> assert len(shapes) == 1
    """
    assert len(self) > 0, 'must have some data'
    if shuffle:
        if True:  # dev toggle: balanced sampling vs plain random sampling
            # If the data is not balanced we need to balance it
            index_to_weight = self._training_sample_weights()
            num_samples = len(self)
            index_to_weight = index_to_weight[:num_samples]
            sampler = torch_sampler.WeightedRandomSampler(index_to_weight,
                                                          num_samples,
                                                          replacement=True)
            sampler.data_source = self  # hack for use with multiscale
        else:
            sampler = torch_sampler.RandomSampler(self)
        resample_freq = 10
    else:
        sampler = torch_sampler.SequentialSampler(self)
        resample_freq = None

    # use custom sampler that does multiscale training
    batch_sampler = multiscale_batch_sampler.MultiScaleBatchSampler(
        sampler, batch_size=batch_size, resample_freq=resample_freq,
    )
    loader = torch_data.DataLoader(self, batch_sampler=batch_sampler,
                                   collate_fn=nh.data.collate.padded_collate,
                                   num_workers=num_workers,
                                   pin_memory=pin_memory)
    if loader.batch_size != batch_size:
        try:
            # Hack: ensure the dataloader has the expected batch size attr.
            loader.batch_size = batch_size
        except Exception:
            pass
    return loader

def data_sampler(dataset, shuffle, distributed):
    if distributed:
        return DistributedSampler(dataset, shuffle=shuffle)
    if shuffle:
        return sampler.RandomSampler(dataset)
    else:
        return sampler.SequentialSampler(dataset)

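# Hypothetical usage of data_sampler above (TensorDataset is a stand-in; this
# assumes the surrounding module's `sampler` import is in scope): the returned
# sampler plugs straight into a DataLoader, and shuffle must stay unset there
# because `sampler` and `shuffle` are mutually exclusive.
import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.arange(10).float())
loader = DataLoader(ds, batch_size=4, sampler=data_sampler(ds, shuffle=True, distributed=False))
for (batch,) in loader:
    print(batch)  # three batches covering all ten elements in random order
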
def main():
    data_dir = sys.argv[1]
    hero2ix_dir = sys.argv[2]

    # import DataFrame and hero2ix dictionary
    heroes_df = pd.read_csv(data_dir, index_col=0)
    hero2ix_df = pd.read_csv(hero2ix_dir, index_col=0)
    heroes_df = heroes_df.dropna().reset_index(drop=True)
    hero2ix = dict(zip(hero2ix_df.hero, hero2ix_df.ID))
    # heroes = hero2ix_df['hero'].values

    # train test split
    split = int(len(heroes_df) * 0.9)
    heroes_train = heroes_df.iloc[:split]
    heroes_test = heroes_df.iloc[split:]

    # build dataset generators
    train_gen = DataFrameIterator(heroes_train, hero2ix)
    test_gen = DataFrameIterator(heroes_test, hero2ix)

    # Use the DataLoader class in pytorch to generate batched data
    batch_size = 16
    loader_train = DataLoader(train_gen, batch_size=batch_size,
                              sampler=sampler.RandomSampler(train_gen),
                              num_workers=4)
    loader_test = DataLoader(test_gen, batch_size=batch_size,
                             sampler=sampler.SequentialSampler(test_gen),
                             num_workers=4)

    # define model; three models in total are defined in hero2vec.py
    model = CBOH(embedding_dim=10, heropool_size=len(hero2ix))

    # define loss function
    loss_function = nn.CrossEntropyLoss()

    # run train
    losses = train(model=model, dataloader=loader_train,
                   loss_function=loss_function, init_lr=0.1, epochs=20,
                   lr_decay_epoch=8, print_epoch=2, gpu=False)

    # check test accuracy
    print('accuracy: ', accuracy(model, dataloader=loader_test,
                                 batch_size=batch_size, gpu=False))

    # save embeddings as numpy arrays
    output_dir = './output/hero/hero_embeddings.npy'
    save_embeddings(model, filename=output_dir)

    # pickle model
    pickle_dir = './output/hero/model.p'
    pickle.dump(obj=model, file=open(pickle_dir, 'wb'))

    # plot loss vs epoch
    plot_loss(losses, './output/hero/loss_history.png')

    # project embeddings onto a 2d plane
    plot_embeddings(model, hero2ix)

def __init__(self, opt, shared=None):
    opt['batch_sort'] = False
    super().__init__(opt, shared)
    self.use_batch_act = self.bsz > 1
    self.num_workers = opt['numworkers']
    self.batch_sort = opt.get('pytorch_teacher_batch_sort')
    self.batch_cache_type = opt.get('batch_sort_cache')
    self.batch_sort_field = opt.get('batch_sort_field')
    # One can specify a collate function to use for preparing a batch
    self.opt = opt.copy()
    self.is_shared = shared is not None
    dataset_classes = self.get_dataset_class(opt)
    self.ordered = ('ordered' in self.datatype or
                    ('stream' in self.datatype and not opt.get('shuffle')))

    if not shared:
        if len(dataset_classes) > 1:
            datasets = []
            for class_name, collate_fn, task_name in dataset_classes:
                opt['pytorch_teacher_task'] = task_name
                opt['task'] = task_name
                datasets.append(class_name(opt))
                self.collate_fn = collate_fn
            self.dataset = ParlAIConcatDataset(datasets)
        else:
            class_name, self.collate_fn, task_name = dataset_classes[0]
            self.dataset = class_name(opt)
        if self.ordered or not self.training:
            data_sampler = sampler.SequentialSampler(self.dataset)
            pin_memory = False
        else:
            data_sampler = sampler.RandomSampler(self.dataset)
            pin_memory = True
        self.pytorch_dataloader = DataLoader(
            self.dataset,
            batch_size=self.bsz,
            sampler=data_sampler,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=pin_memory,
            drop_last=False,
        )
        self.lastYs = [None] * self.bsz
        if self.batch_sort:
            self.loader_process = LoaderProcess(opt)
            self.loader_process.start()
        self.data = enumerate(self.pytorch_dataloader)
    else:
        self.dataset = shared['dataset']
        self.pytorch_dataloader = shared['pytorch_dataloader']
        self.lastYs = shared['lastYs']
        self.data = shared['data']

    self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
    self.reset()

def __init__(self, dataset, batch_size, shuffle=True, drop_last=False):
    # Group the dataset into buckets by context length.
    self.buckets = bucket(dataset)
    if shuffle:
        # Shuffle the order of the buckets.
        np.random.shuffle(self.buckets)
        random_samplers = [sampler.RandomSampler(b) for b in self.buckets]
    else:
        random_samplers = [sampler.SequentialSampler(b) for b in self.buckets]
    self.sampler = [sampler.BatchSampler(s, batch_size, drop_last)
                    for s in random_samplers]

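# Sketch (stand-in buckets, not the real bucket() output) of how the
# per-bucket batch samplers built above are consumed: each BatchSampler yields
# lists of indices local to its own bucket, so batches never mix context
# lengths across buckets.
import torch.utils.data.sampler as sampler

buckets = [list(range(7)), list(range(4))]
batch_samplers = [sampler.BatchSampler(sampler.RandomSampler(b), batch_size=3, drop_last=False)
                  for b in buckets]
for bucket_id, bs in enumerate(batch_samplers):
    for batch_indices in bs:
        print(bucket_id, batch_indices)  # e.g. 0 [5, 2, 6]
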
def get_dataloaders(train_batchsize, val_batchsize):
    kwargs = {
        'num_workers': 20,
        'pin_memory': True,
    }
    input_size = INFO['model-info']['input-size']
    base = '{}/{}'.format(os.environ['datadir-base'], INFO['dataset'])
    normalize = T.Normalize(mean=INFO['dataset-info']['normalization']['mean'],
                            std=INFO['dataset-info']['normalization']['std'])
    transform = {
        'train': T.Compose([
            T.Resize(tuple([int(x * (4 / 3)) for x in input_size])),  # upscale by 4/3 before cropping
            T.RandomResizedCrop(input_size),  # random crop, then resize
            T.RandomHorizontalFlip(0.5),  # random horizontal flip
            T.RandomVerticalFlip(0.5),  # random vertical flip
            T.RandomApply([T.RandomRotation(90)], 0.5),  # with p=0.5, rotate by a random angle in [-90, 90]
            T.RandomApply([T.RandomRotation(180)], 0.25),  # with p=0.25, rotate by a random angle in [-180, 180]
            # NOTE: the jitter ranges below are drawn once at construction
            # time, not per image.
            T.RandomApply([T.ColorJitter(brightness=np.random.random() / 5 + 0.9)], 0.5),  # random brightness
            T.RandomApply([T.ColorJitter(contrast=np.random.random() / 5 + 0.9)], 0.5),  # random contrast
            T.RandomApply([T.ColorJitter(saturation=np.random.random() / 5 + 0.9)], 0.5),  # random saturation
            T.ToTensor(),
            normalize,
        ]),
        'val': T.Compose([
            T.Resize(input_size),
            T.ToTensor(),
            normalize,
        ]),
    }

    train_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'), transform=transform['train'])
    train4val_dset = dset.ImageFolder('{}/{}'.format(base, 'Train'), transform=transform['val'])
    val_dset = dset.ImageFolder('{}/{}'.format(base, 'Val'), transform=transform['val'])

    labels = torch.from_numpy(np.array(train_dset.imgs)[:, 1].astype(int))
    num_of_images_by_class = torch.zeros(len(train_dset.classes))
    for i in range(len(train_dset.classes)):
        num_of_images_by_class[i] = (labels == i).sum().item()

    # Map train class indices to val class indices; unseen classes go to UNKNOWN.
    mapping = {}
    for c in train_dset.classes:
        if c in val_dset.classes:
            mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx[c]
        else:
            mapping[train_dset.class_to_idx[c]] = val_dset.class_to_idx['UNKNOWN']
    mapping[-1] = val_dset.class_to_idx['UNKNOWN']

    train_len = len(train_dset)
    val_len = len(val_dset)
    train_loader = DataLoader(train_dset, batch_size=train_batchsize,
                              sampler=sampler.RandomSampler(range(train_len)), **kwargs)
    train4val_loader = DataLoader(train4val_dset, batch_size=val_batchsize,
                                  sampler=sampler.SequentialSampler(range(train_len)), **kwargs)
    val_loader = DataLoader(val_dset, batch_size=val_batchsize,
                            sampler=sampler.SequentialSampler(range(val_len)), **kwargs)

    imgs = np.array(val_dset.imgs)
    return train_loader, train4val_loader, val_loader, num_of_images_by_class, mapping, imgs

def generate_sampler(dataset, sampler_option="random", n_bins=5):
    df = dataset.df
    # Uniform weights: the reconstruction task has no classes to balance.
    weights = [1] * len(df) * dataset.elem_per_image
    if sampler_option == "random":
        return sampler.RandomSampler(weights)
    else:
        raise NotImplementedError(
            f"The option {sampler_option} for sampler on reconstruction task is not implemented"
        )

def setup_dataloader(self, mode):
    # Build a DataLoader over dataset[mode] ('train' or 'test') with a
    # random sampler and the custom collate function.
    dataset = self.dataset[mode]  # VOCDataset object
    data_loader = dataloader.DataLoader(
        dataset,
        batch_size=self.config.BATCH_SIZE[mode],
        sampler=sampler.RandomSampler(dataset),
        collate_fn=self.collate_fn)
    return data_loader

def prepare_dataloader():
    """Make data loaders for train and dev"""
    global args
    logger.info('-' * 100)
    logger.info('Loading Datasets...')
    train_ex = utils.load_data(args.train_file)
    logger.info('{} train examples loaded'.format(len(train_ex)))
    test_ex = utils.load_data(args.test_file)
    logger.info('{} test examples loaded'.format(len(test_ex)))

    logger.info('Building feature dictionary...')
    feature_dict = utils.build_feature_dict(train_ex)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    logger.info('Build word dictionary...')
    word_dict = utils.build_word_dict(train_ex + test_ex)
    logger.info('Num words = %d' % len(word_dict))
    args.vocab_size = len(word_dict)

    logger.info('-' * 100)
    logger.info('Creating DataLoaders')
    if args.cuda:
        kwargs = {'num_workers': 0, 'pin_memory': True}
    else:
        kwargs = {'num_workers': args.data_workers}

    train_dataset = data.ImdbDataset(args, train_ex, word_dict, feature_dict)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              sampler=sampler.RandomSampler(train_dataset),
                              **kwargs)
    test_dataset = data.ImdbDataset(args, test_ex, word_dict, feature_dict)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             sampler=sampler.RandomSampler(test_dataset),
                             **kwargs)
    return train_loader, test_loader, word_dict, feature_dict

def main():
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f'Using device {device}')

    model = models.resnet18(pretrained=True)
    model = model.to(device=device)
    num_ftrs = model.fc.in_features
    model.fc = torch.nn.Linear(num_ftrs, NUM_AGE_BUCKETS).to(device=device)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.eval()

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5797703, 0.43427974, 0.38307136],
                             [0.25409877, 0.22383073, 0.21819368]),
    ])
    dataset = ChaLearnDataset(
        ['ChaLearn/images/test_1', 'ChaLearn/images/test_2'],
        'ChaLearn/gt/test_gt.csv',
        transform,
    )
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        num_workers=DATA_LOADER_NUM_WORKERS,
        sampler=sampler.RandomSampler(dataset),
    )

    # Test and write the results to a file.
    with torch.no_grad():
        with open(OUTPUT_FILE_NAME, 'w') as output:
            for x, file_names in loader:
                x = x.to(device=device)
                scores = model(x)
                num_classes = scores.size(1)
                # Expected age: probability-weighted sum over the age buckets.
                predicted_ages = (
                    (F.softmax(scores, dim=1) *
                     torch.arange(end=num_classes).to(device=device)).sum(dim=1))
                lines = [
                    f'{file_name},{age}\n'
                    for file_name, age in zip(file_names, predicted_ages)
                ]
                output.writelines(lines)

def __init__(self, data_source, shuffle=False, batch_size=16, drop_last=False,
             resample_frequency=10):
    if shuffle:
        self.sampler = torch_sampler.RandomSampler(data_source)
    else:
        self.sampler = torch_sampler.SequentialSampler(data_source)
    self.shuffle = shuffle
    self.batch_size = batch_size
    self.drop_last = drop_last
    self.num_scales = len(data_source.multi_scale_inp_size)
    self.resample_frequency = resample_frequency

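# Hedged sketch of the iteration scheme these attributes support (the real
# batch sampler's __iter__ is defined elsewhere): pair every index with a
# scale id and re-draw the scale every `resample_frequency` batches.
import random

def multiscale_batches(index_sampler, batch_size, num_scales, resample_frequency):
    scale = random.randrange(num_scales)
    batch, emitted = [], 0
    for idx in index_sampler:
        batch.append((scale, idx))
        if len(batch) == batch_size:
            yield batch
            batch, emitted = [], emitted + 1
            if emitted % resample_frequency == 0:
                scale = random.randrange(num_scales)
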
def evaluate(model, test_dataset):
    model.eval()
    criterion = tnn.MSELoss()
    test_sampler = tsampler.RandomSampler(test_dataset)
    # Load the whole test set as a single batch.
    test_loader = tdata.DataLoader(test_dataset,
                                   batch_size=len(test_dataset),
                                   sampler=test_sampler)
    d = next(iter(test_loader))
    data_var = _make_variable(d[0], volatile=True)
    target_var = _make_variable(d[1], volatile=True)
    output_f, output_g = model.forward(data_var)
    loss = criterion(output_g, target_var)
    return loss

def __init__(self, input_tensor, input_lengths, labels_tensor, batch_size,
             sequence_length=2665):
    self.input_tensor = input_tensor
    self.input_lengths = input_lengths
    self.labels_tensor = labels_tensor
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.sampler = splr.BatchSampler(
        splr.RandomSampler(self.labels_tensor),
        self.batch_size,
        False)
    self.sampler_iter = iter(self.sampler)

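# Toy usage (stand-in tensors) of the pattern above: each next() on the
# BatchSampler iterator yields a list of row indices that can slice the
# stored tensors into one batch.
import torch
import torch.utils.data.sampler as splr

labels = torch.arange(10)
batch_sampler = splr.BatchSampler(splr.RandomSampler(labels), batch_size=4, drop_last=False)
indices = next(iter(batch_sampler))  # e.g. [7, 1, 3, 9]
print(labels[indices])
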
def create_dataloader(config, data, mode):
    dataset = create_dataset(config, data, mode)
    if mode == 'train':
        # create sampler
        if dist.is_available() and dist.is_initialized():
            train_sampler = distributed.DistributedSampler(dataset)
        else:
            train_sampler = sampler.RandomSampler(dataset, replacement=False)
        train_batch_sampler = sampler.BatchSampler(
            train_sampler,
            batch_size=config.train.batch_size,
            drop_last=config.train.dataloader.drop_last)
        # augmentation happens in the collate function
        collator = get_collate_fn(config)
        data_loader = DataLoader(dataset=dataset,
                                 batch_sampler=train_batch_sampler,
                                 collate_fn=collator,
                                 pin_memory=config.train.dataloader.pin_memory,
                                 num_workers=config.train.dataloader.work_nums)
    elif mode == 'val':
        if dist.is_available() and dist.is_initialized():
            val_sampler = distributed.DistributedSampler(dataset)
        else:
            val_sampler = sampler.SequentialSampler(dataset)
        val_batch_sampler = sampler.BatchSampler(
            val_sampler,
            batch_size=config.val.batch_size,
            drop_last=config.val.dataloader.drop_last)
        data_loader = DataLoader(dataset,
                                 batch_sampler=val_batch_sampler,
                                 pin_memory=config.val.dataloader.pin_memory,
                                 num_workers=config.val.dataloader.work_nums)
    else:
        if dist.is_available() and dist.is_initialized():
            test_sampler = distributed.DistributedSampler(dataset)
        else:
            # None falls back to the DataLoader's default sequential sampling.
            test_sampler = None
        data_loader = DataLoader(dataset,
                                 sampler=test_sampler,
                                 batch_size=config.test.batch_size,
                                 pin_memory=config.val.dataloader.pin_memory,
                                 num_workers=config.val.dataloader.work_nums)
    return data_loader

def generate_sampler(dataset, sampler_option='random', step=1):
    """
    Returns sampler according to the wanted options

    :param dataset: (MRIDataset) the dataset to sample from
    :param sampler_option: (str) choice of sampler
    :param step: (int) step to discretize ages and give a weight per class
    :return: (Sampler)
    """
    df = dataset.df
    min_age = np.min(df.age)
    max_age = np.max(df.age)
    if (max_age - min_age) % step == 0:
        max_age += step
    bins = np.arange(min_age, max_age, step)
    count = np.zeros(len(bins))
    for idx in df.index:
        age = df.loc[idx, "age"]
        key = np.argmax(np.logical_and(age - step < bins, age >= bins)).astype(int)
        count[key] += 1
    # weight_per_class = (1 / np.array(count)) if count.any() != 0 else 0.
    weight_per_class = np.zeros_like(count).astype(float)
    np.divide(1., count, out=weight_per_class, where=count != 0)
    weights = [0] * len(df)
    for idx, age in enumerate(df.age.values):
        # assign each sample the weight of its age bin, using the same
        # binning rule as above
        key = np.argmax(np.logical_and(age - step < bins, age >= bins)).astype(int)
        weights[idx] = weight_per_class[key]
    weights = torch.FloatTensor(weights)
    if sampler_option == 'random':
        s = sampler.RandomSampler(dataset, replacement=False)
    elif sampler_option == 'weighted':
        s = sampler.WeightedRandomSampler(weights, len(weights))
    else:
        raise NotImplementedError(
            "The option %s for sampler is not implemented" % sampler_option)
    return s

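# Toy walk-through (made-up ages) of the binning above: ages fall into
# `step`-wide bins and each sample is weighted by the inverse of its bin's
# population, so rare age ranges are drawn more often by the weighted sampler.
import numpy as np

ages = np.array([20, 21, 22, 40])
step = 5
bins = np.arange(ages.min(), ages.max() + step, step)  # [20, 25, 30, 35, 40]
count = np.zeros(len(bins))
for age in ages:
    key = np.argmax(np.logical_and(age - step < bins, age >= bins))
    count[key] += 1
print(count)  # [3. 0. 0. 0. 1.] -> weight 1/3 for the twenties, 1.0 for age 40
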
def get_dataloader(dataset, balance_data, batch_size, num_workers, shuffle=True):
    if balance_data:
        weights = dataset.get_data_weights(balance_data)
        sampler_ = sampler.WeightedRandomSampler(weights, len(weights))
    elif shuffle:
        sampler_ = sampler.RandomSampler(dataset)
    else:
        # None keeps the DataLoader's default sequential order.
        sampler_ = None
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_,
                                             num_workers=num_workers)
    return dataloader

def __init__(self, opt, shared=None):
    opt['batch_sort'] = False
    super().__init__(opt, shared)
    self.use_batch_act = self.bsz > 1
    self.num_workers = opt['numworkers']
    self.batch_cache_type = opt.get('batch_sort_cache')
    # One can specify a collate function to use for preparing a batch
    self.opt = copy.deepcopy(opt)
    self.is_shared = shared is not None
    dataset_class, self.collate_fn = self.get_dataset_class(opt)
    opt['dataset_class'] = dataset_class
    opt['collate_fn'] = self.collate_fn

    if not shared:
        self.dataset = dataset_class(opt)
        if self.datatype == 'train' and not isinstance(self.dataset, StreamDataset):
            data_sampler = sampler.RandomSampler(self.dataset)
        else:
            data_sampler = sampler.SequentialSampler(self.dataset)
        pin_memory = not isinstance(self.dataset, StreamDataset)
        self.pytorch_dataloader = DataLoader(
            self.dataset,
            batch_size=self.bsz,
            shuffle=False,
            sampler=data_sampler,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=pin_memory,
            drop_last=False,
        )
        self.lastYs = [None] * self.bsz
        if self.batch_cache_type != 'none':
            self.loader_process = LoaderProcess(opt)
            self.loader_process.start()
        self.data = enumerate(self.pytorch_dataloader)
    else:
        self.dataset = shared['dataset']
        self.pytorch_dataloader = shared['pytorch_dataloader']
        self.lastYs = shared['lastYs']
        self.data = shared['data']

    self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
    self.reset()

def create_dataloaders(self, train_labels, balance_weights: bool = True):
    '''create dataloaders based on split from StratifiedKFold'''
    sampler = (data_loaders.balanced_sampler(train_labels)
               if balance_weights else None)
    train_loader = data_loaders.create_loader(self.train_data,
                                              batch_size=self.batch_size,
                                              sampler=sampler)
    if config.VALID_SIZE < 0.01:
        # use all data for training - no val loader
        val_loader = None
    else:
        val_sampler = samp.RandomSampler(self.val_data)
        val_loader = data_loaders.create_loader(self.val_data,
                                                batch_size=100,
                                                sampler=val_sampler)
        if config.SAVE_MODEL:
            data_loaders.save_valloader(self.val_data)

    dataloaders_dict = {"train": train_loader, "val": val_loader}
    return dataloaders_dict

def __init__(self, storage, sampler=None, num_batches=10, batch_size=None,
             batch_size_bounds=None, replacement=True, verbose=0):
    """
    Initialize the storage sampler.

    Args:
        storage (Storage): storage to sample from.
        sampler (Sampler, None): if None, a sampler that randomly samples
            batches of the storage is used. By default it samples
            :attr:`num_batches`.
        num_batches (int): number of batches.
        batch_size (int, None): size of the batch. If None, it is computed
            from the size of the storage as
            batch_size = size(storage) // num_batches. Note that the batch
            size must be smaller than the size of the storage itself;
            num_batches * batch_size can however exceed the storage size if
            :attr:`replacement` is True.
        batch_size_bounds (tuple of int, None): if :attr:`batch_size` is
            None, lower and upper bounds for the computed batch size. For
            instance, with `(16, 128)` and `batch_size=None`, the computed
            `batch_size = size(storage) // num_batches` is clipped to 16 if
            smaller and to 128 if bigger.
        replacement (bool): if True, the same element can be sampled
            multiple times; if False, each element is sampled at most once.
        verbose (int, bool): verbosity level in {0, 1, 2}. 0=False prints
            nothing, 1=True prints basic information about the sampler, and
            2 prints detailed information.
    """
    # set the storage
    self.storage = storage

    # set variables
    self._num_batches = num_batches
    self._replacement = bool(replacement)
    self._batch_size_bounds = batch_size_bounds
    self._batch_size_given = batch_size is not None
    self._verbose = verbose

    # set the sampler
    if sampler is None:
        # compute the batch size if necessary
        if batch_size is None:
            batch_size = self.size // num_batches
            # clip the batch size to its bounds
            if isinstance(batch_size_bounds, (tuple, list)) and len(batch_size_bounds) == 2:
                if batch_size < batch_size_bounds[0]:
                    batch_size = batch_size_bounds[0]
                elif batch_size > batch_size_bounds[1]:
                    batch_size = batch_size_bounds[1]

        # check the batch size * number of batches wrt the storage size
        if batch_size * num_batches > self.size and not self.replacement:
            raise ValueError(
                "Expecting the batch size (={}) * num_batches (={}) to be smaller than the size of "
                "the storage (={}), if we can not use replacement.".format(
                    batch_size, num_batches, self.size))

        # subsampler
        if replacement:
            subsampler = torch_sampler.RandomSampler(
                data_source=range(self.size), replacement=replacement,
                num_samples=self.size)
        else:
            subsampler = torch_sampler.SubsetRandomSampler(
                indices=range(self.size))

        # create sampler
        sampler = torch_sampler.BatchSampler(sampler=subsampler,
                                             batch_size=batch_size,
                                             drop_last=True)

    self.sampler = sampler

    if verbose:
        print("\nCreating sampler with size: {} - num batches: {} - batch size: {}"
              .format(self.size, num_batches, self.batch_size))

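# Minimal sketch (stand-in storage size) of the sampler composition built
# above: a BatchSampler over a with-replacement RandomSampler yields
# fixed-size index batches and drops the ragged tail.
import torch.utils.data.sampler as torch_sampler

size, batch_size = 10, 4
subsampler = torch_sampler.RandomSampler(range(size), replacement=True, num_samples=size)
batches = torch_sampler.BatchSampler(sampler=subsampler, batch_size=batch_size, drop_last=True)
for b in batches:
    print(b)  # two batches of four indices; the last two draws are dropped
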
def train_model(args):
    with open("tacotron/config.toml") as file:
        cfg = toml.load(file)

    tensorboard_path = Path("tensorboard") / args.checkpoint_dir
    checkpoint_dir = Path(args.checkpoint_dir)
    writer = SummaryWriter(tensorboard_path)

    tacotron = Tacotron(**cfg["model"]).cuda()
    optimizer = optim.Adam(tacotron.parameters(), lr=cfg["train"]["optimizer"]["lr"])
    scaler = amp.GradScaler()
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer,
        milestones=cfg["train"]["scheduler"]["milestones"],
        gamma=cfg["train"]["scheduler"]["gamma"],
    )

    if args.resume is not None:
        global_step = load_checkpoint(
            tacotron=tacotron,
            optimizer=optimizer,
            scaler=scaler,
            scheduler=scheduler,
            load_path=args.resume,
        )
    else:
        global_step = 0

    root_path = Path(args.dataset_dir)
    text_path = Path(args.text_path)

    dataset = TTSDataset(root_path, text_path)
    sampler = samplers.RandomSampler(dataset)
    batch_sampler = BucketBatchSampler(
        sampler=sampler,
        batch_size=cfg["train"]["batch_size"],
        drop_last=True,
        sort_key=dataset.sort_key,
        bucket_size_multiplier=cfg["train"]["bucket_size_multiplier"],
    )
    collate_fn = partial(
        pad_collate,
        reduction_factor=cfg["model"]["decoder"]["reduction_factor"])
    loader = DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        collate_fn=collate_fn,
        num_workers=cfg["train"]["n_workers"],
        pin_memory=True,
    )

    n_epochs = cfg["train"]["n_steps"] // len(loader) + 1
    start_epoch = global_step // len(loader) + 1

    for epoch in range(start_epoch, n_epochs + 1):
        average_loss = 0
        for i, (mels, texts, mel_lengths, text_lengths, attn_flag) in enumerate(tqdm(loader), 1):
            mels, texts = mels.cuda(), texts.cuda()

            optimizer.zero_grad()

            with amp.autocast():
                ys, alphas = tacotron(texts, mels)
                loss = F.l1_loss(ys, mels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(tacotron.parameters(), cfg["train"]["clip_grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            global_step += 1
            # running average of the loss over the epoch
            average_loss += (loss.item() - average_loss) / i

            if global_step % cfg["train"]["checkpoint_interval"] == 0:
                save_checkpoint(
                    tacotron=tacotron,
                    optimizer=optimizer,
                    scaler=scaler,
                    scheduler=scheduler,
                    step=global_step,
                    checkpoint_dir=checkpoint_dir,
                )

            if attn_flag:
                index = attn_flag[0]
                alpha = alphas[index, :text_lengths[index], :mel_lengths[index] // 2]
                alpha = alpha.detach().cpu().numpy()
                y = ys[index, :, :].detach().cpu().numpy()
                log_alignment(alpha, y, cfg["preprocess"], writer, global_step)

        writer.add_scalar("loss", average_loss, global_step)
        print(f"epoch {epoch} : loss {average_loss:.4f} : {scheduler.get_last_lr()}")

def train(model, name, train_dataset, test_dataset):
    train_sampler = tsampler.RandomSampler(train_dataset)
    # The whole training set is loaded as a single batch.
    train_loader = tdata.DataLoader(train_dataset,
                                    batch_size=len(train_dataset),
                                    sampler=train_sampler)
    optimizer = toptim.Adadelta(model.parameters(), weight_decay=0.001)
    last_loss = 1e10
    last_test_loss = 1e19
    epoch = 0
    while True:
        for i, d in enumerate(train_loader):
            model.train()
            if hasattr(model, 'before_epoch'):
                model.before_epoch(epoch)
            epoch += 1
            data_var = _make_variable(d[0])
            target_var = _make_variable(d[1]).resize(len(d[1]))
            criterion = tnn.MSELoss()
            optimizer.zero_grad()
            output_f, output_g = model.forward(data_var)
            print(output_g.data.shape, ' target shape:', target_var.data.shape)
            if epoch % 10 == 0:
                print(((output_g - target_var) ** 2).sum() / len(output_g))
            loss = criterion(output_g, target_var)
            loss.backward()
            optimizer.step()
            # NOTE: loss.data[0] is the pre-0.4 PyTorch idiom for loss.item().
            print('Batch {} | loss {}'.format(epoch, loss.data[0]))
            if (loss.data[0] < model.LOSS_LIMIT and loss.data[0] < last_loss) or epoch % 10 == 0:
                test_loss = evaluate(model, test_dataset)
                test_loss_num = test_loss.data[0]
                print('Evaluated test loss {}'.format(test_loss_num))
                if (test_loss_num < model.LOSS_LIMIT and test_loss_num < last_test_loss) or epoch % 50 == 0:
                    last_test_loss = test_loss_num
                    os.makedirs('saved_model/' + name, exist_ok=True)
                    timestamp = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
                    torch.save(
                        model.state_dict(),
                        os.path.join('saved_model', name,
                                     '{}_loss_{}_test_{}_{}.t7'.format(
                                         epoch, loss.data[0], test_loss_num, timestamp)))
                    torch.save(
                        model,
                        os.path.join('saved_model', name,
                                     'model_{}_loss_{}_test_{}_{}.t7'.format(
                                         epoch, loss.data[0], test_loss_num, timestamp)))
            last_loss = loss.data[0]

def train_sup(self, epoch_lim, data, valid_data, early_stopping_lim,
              batch_size, num_workers, track_embeddings, validation_rate,
              loss_weight_base=1, value_weight=0, value_ratio=0):
    """
    Training loop

    :param epoch_lim: total number of training epochs
    :param data: training data
    :param valid_data: validation data
    :param early_stopping_lim: number of epochs to run without validation
        improvement before stopping; if None, never stop early
    :param batch_size: training batch size
    :param num_workers: number of CPU workers to use for data loading
    :param track_embeddings: save out embedding information at end of run
    :param validation_rate: check validation performance every
        validation_rate training epochs
    :param loss_weight_base: a constant between 0 and 1 used to interpolate
        between Single (=0) and Multi (=1) Step forecasting
    :param value_weight: a constant multiplier for the real-value loss,
        set to 0 in the paper
    :param value_ratio: the proportion of loss used for the MSE loss term
        (as opposed to the cross-entropy loss), set to 0 in the paper
    :return: loss array, model
    """
    if early_stopping_lim is None:
        early_stopping_lim = epoch_lim
    train_sampler = sampler.RandomSampler(np.arange(len(data)))
    data_train = DataLoader(data, batch_size=batch_size,
                            sampler=train_sampler, drop_last=True)
    valid_sampler = sampler.SequentialSampler(np.arange(len(valid_data)))
    data_valid = DataLoader(valid_data, batch_size=batch_size,
                            sampler=valid_sampler)
    step = 0
    bsf_loss = np.inf
    epochs_without_improvement = 0
    improvements = []
    for epoch in range(epoch_lim):
        if epochs_without_improvement > early_stopping_lim:
            print('Exceeded early stopping limit, stopping')
            break
        if epoch % validation_rate == 0:
            valid_loss = self.validation(data_valid=data_valid, step=step,
                                         data=data,
                                         loss_weight_base=loss_weight_base,
                                         value_weight=value_weight,
                                         value_ratio=value_ratio)
            (bsf_loss,
             epochs_without_improvement,
             improvements) = self.manage_early_stopping(
                bsf_loss=bsf_loss,
                early_stopping_lim=early_stopping_lim,
                epochs_without_improvement=epochs_without_improvement,
                valid_loss=valid_loss,
                validation_rate=validation_rate,
                improvements=improvements)
        running_train_loss = 0
        for inp, out, out_real, lens in tqdm(data_train):
            loss, y_p = forecast_model.get_loss(inp=inp, out=out, lens=lens,
                                                cuda=True, gn=self.model,
                                                glucose_dat=data,
                                                criterion=self.criterion,
                                                base=loss_weight_base,
                                                out_real=out_real,
                                                value_weight=value_weight,
                                                value_ratio=value_ratio)
            step += 1
            running_train_loss += loss.data.cpu().numpy()[0]
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        running_train_loss = running_train_loss / len(data_train)
        self.writer.add_scalar(tag='train_loss',
                               scalar_value=running_train_loss,
                               global_step=step)
    torch.save(self.model.state_dict(), '{}/final_sup.pt'.format(self.model_dir))
    if track_embeddings:
        self.embed(data_valid, step, embed_batch=100)
    return improvements

def __init__(self, opt, shared=None):
    opt['batch_sort'] = False
    super().__init__(opt, shared)
    self.use_batch_act = self.bsz > 1
    self.num_workers = opt['numworkers']
    self.batch_sort = (opt.get('pytorch_teacher_batch_sort') and
                       'train' in self.datatype)
    self.batch_cache_type = opt.get('batch_sort_cache_type')
    self.batch_sort_field = opt.get('batch_sort_field')
    # One can specify a collate function to use for preparing a batch
    self.opt = opt.copy()
    self.is_shared = shared is not None
    dataset_classes = self.get_dataset_class(opt)
    self.ordered = ('ordered' in self.datatype or
                    ('stream' in self.datatype and not opt.get('shuffle')))
    if self.ordered:
        # force index for ordered, so that we see every example
        warn_once('\nNote: You are using PytorchDataTeacher with ordered '
                  'examples. Please specify `--shuffle` if you would like '
                  'to have examples loaded in randomized order.\n')
        self.batch_cache_type = 'index'

    if not shared:
        BatchSortCache.create()
        if len(dataset_classes) > 1:
            datasets = []
            for class_name, collate_fn, task_name in dataset_classes:
                dataset_opt = opt.copy()
                dataset_opt['pytorch_teacher_task'] = task_name
                dataset_opt['task'] = task_name
                datasets.append(class_name(dataset_opt))
                self.collate_fn = collate_fn
            self.id = ','.join([d[2] for d in dataset_classes])
            self.dataset = ParlAIConcatDataset(datasets)
        else:
            class_name, self.collate_fn, task_name = dataset_classes[0]
            self.id = task_name
            self.dataset = class_name(opt)
        if self.ordered or not self.training:
            data_sampler = sampler.SequentialSampler(self.dataset)
        else:
            data_sampler = sampler.RandomSampler(self.dataset)
        self.pytorch_dataloader = DataLoader(
            self.dataset,
            batch_size=self.bsz,
            sampler=data_sampler,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=False,
            drop_last=False,
        )
        self.lastYs = [None] * self.bsz
        if self.batch_sort:
            self.loader_process = LoaderProcess(opt)
            self.loader_process.start()
        self.data = enumerate(self.pytorch_dataloader)
    else:
        self.dataset = shared['dataset']
        self.pytorch_dataloader = shared['pytorch_dataloader']
        self.lastYs = shared['lastYs']
        self.data = shared['data']
        self.id = shared['id']

    self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
    self.reset()

                     num_channels=1))
dataset = ConcatDataset(dataset) if len(dataset) > 1 else dataset[0]

target_type = params['target_type'] if params['target_type'] != 'spatial_bootstrap' else 'psa'
val_dataset = WSJ0(folder=params['validation_folder'],
                   length='full',
                   n_fft=params['n_fft'],
                   hop_length=params['hop_length'],
                   output_type=target_type,
                   create_cache=True,  # params['create_cache'],
                   num_channels=1)

if args.sample_strategy == 'sequential':
    sample_strategy = sampler.SequentialSampler(dataset)
elif args.sample_strategy == 'random':
    sample_strategy = sampler.RandomSampler(dataset)

dataloader = DataLoader(dataset,
                        batch_size=params['batch_size'],
                        num_workers=params['num_workers'],
                        sampler=sample_strategy)

# Probe one example to fill in the shape-dependent parameters.
dummy_input, _, _, _, _, dummy_one_hot = dataset[0]
params['num_attractors'] = dummy_one_hot.shape[-1]
params['num_sources'] = params['num_attractors']
params['sample_rate'] = dataset.sr

dataset.reorder_sources = args.reorder_sources
val_dataset.reorder_sources = args.reorder_sources
pp.pprint(params)

def __init__(self, dataset_path=const.DATASET_PATH, train=True, test=False, load=None,
             num_classes=const.NUM_JOINTS, backbone_name=const.BACKBONE_NAME,
             backbone_pre_trained=const.PRE_TRAINED, target_size=const.TARGET_SIZE,
             stride=const.STRIDE, p_h=None, p_w=None, spacial_factor=const.SPACIAL_FACTOR,
             lr=const.LR_RATE, w_d=const.WEIGHT_DECAY, step_size=const.STEP_SIZE,
             gamma=const.GAMMA, reg_loss_fac=const.REG_LOSS_FACTOR, bs=const.BATCH_SIZE,
             max_epoch=const.MAX_EPOCH, save_path=const.SAVE_PATH,
             save_freq=const.SAVE_FREQ, train_split=const.TRAIN_VAL_SPLIT):
    print("Setting up model...")
    self.dataset_path = dataset_path
    self.save_path = save_path
    self.num_classes = num_classes
    self.backbone_name = backbone_name
    self.train = train
    self.test = test
    self.max_epoch = max_epoch
    self.save_freq = save_freq
    self.model = A2J(num_joints=num_classes, backbone_name=backbone_name,
                     backbone_pretrained=backbone_pre_trained)
    self.reg_loss_factor = reg_loss_fac
    self.post_process = PostProcess(shape=(target_size[1] // 16, target_size[0] // 16),
                                    stride=stride, p_h=p_h, p_w=p_w)
    self.optim = torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=w_d)
    self.sched = torch.optim.lr_scheduler.StepLR(self.optim, step_size=step_size, gamma=gamma)
    self.criterion = A2JLoss(shape=(target_size[1] // 16, target_size[0] // 16),
                             stride=stride, spacial_factor=spacial_factor,
                             p_h=p_h, p_w=p_w)

    if load:
        print(f"Loading model...{load}")
        check_point = torch.load(load, map_location=torch.device('cpu'))
        self.num_classes = check_point["num_classes"]
        self.model = A2J(num_joints=num_classes, backbone_name=backbone_name,
                         backbone_pretrained=backbone_pre_trained)
        self.model.load_state_dict(check_point["model"])
        self.optim.load_state_dict(check_point["optim"])
        self.sched.load_state_dict(check_point["sched"])
        self.epoch = check_point["epoch"] + 1
    else:
        self.epoch = 0

    self.train_data = A2J_NYU_DataLoader(train=True, dataset_path=self.dataset_path)
    self.valid_data = A2J_NYU_DataLoader(train=False, dataset_path=self.dataset_path)
    self.test_data = A2J_NYU_DataLoader(train=False, dataset_path=self.dataset_path)
    self.load_train = DataLoader(
        self.train_data,
        batch_size=bs,
        sampler=sampler.RandomSampler(self.train_data),
        drop_last=False,
        num_workers=8,
    )
    self.load_valid = DataLoader(
        self.valid_data,
        batch_size=bs // 4,
        sampler=sampler.RandomSampler(self.valid_data),
        drop_last=False,
        num_workers=8,
    )
    self.load_test = DataLoader(
        self.test_data,
        batch_size=bs // 8,
        sampler=sampler.RandomSampler(self.test_data),
        drop_last=False,
        num_workers=8,
    )
    print("Model setup finished!")

def train_model(model, train, val, learning_rate, batch_size=16, epochs=1,
                dtype=torch.float32, device=DEFAULT_DEVICE, verbose=False):
    model.cuda(device)
    _, nonempty_train = split_empty_nonempty(train)
    _, nonempty_val = split_empty_nonempty(val)
    if verbose:
        print(f'{len(nonempty_train)}/{len(train)} training examples are non-empty')
        print(f'{len(nonempty_val)}/{len(val)} validation examples are non-empty')
    nonempty_train_ds = Subset(train, nonempty_train)
    nonempty_val_ds = Subset(val, nonempty_val)
    # Sample with replacement so each epoch still draws len(train) examples.
    train_loader = DataLoader(nonempty_train_ds,
                              batch_size=batch_size,
                              sampler=sampler.RandomSampler(nonempty_train_ds, True, len(train)))

    for l in model:
        if hasattr(l, '_should_init'):
            nn.init.kaiming_normal_(l.weight)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_history, train_history, val_history = [], empty_results_history(3), empty_results_history(3)
    it = 0
    for e in range(epochs):
        if verbose:
            print(f'Epoch {e + 1}')
        n_iters = (len(train) + batch_size - 1) // batch_size
        for t, (x, y) in enumerate(train_loader):
            model.train()
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            loss = F.cross_entropy(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_history.append(loss.item())
            if it % 100 == 0 or (e + 1 == epochs and t + 1 == n_iters):
                train_result = evaluate_model(model, nonempty_train_ds, [0, 1, 2], 1000)
                record_results(train_history, train_result)
                val_result = evaluate_model(model, nonempty_val_ds, [0, 1, 2], 1000)
                record_results(val_history, val_result)
                if verbose:
                    print(f'Iteration {t}, loss={loss.item()}, ' +
                          f'prec={format_metric(train_result.precision)}/{format_metric(val_result.precision)}, ' +
                          f'recall={format_metric(train_result.recall)}/{format_metric(val_result.recall)}, ' +
                          f'acc={format_metric(train_result.accuracy)}/{format_metric(val_result.accuracy)}, ' +
                          f'f1={format_metric(train_result.f1)}/{format_metric(val_result.f1)}, ' +
                          f'kappa={format_metric(train_result.kappa)}/{format_metric(val_result.kappa)}, ')
            it += 1
    return loss_history, train_history, val_history, nonempty_train