train_loader = AudioDataLoader(train_dataset, num_workers=args.num_workers, batch_sampler=train_sampler) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True) if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad: print("Shuffling batches for the following epochs") train_sampler.shuffle(start_epoch) if args.tensorboard and generate_graph: # TO DO get some audios also with torch.no_grad(): # sla vai que ne inputs, targets, input_percentages, target_sizes = next(iter(train_loader)) input_sizes = input_percentages.mul_(int(inputs.size(3))).int() tensorboard_logger.add_image(inputs, input_sizes, targets, network=model) # add graph doesn't work if model is in gpu if freeze_conv: model.conv.requires_grad_(requires_grad=False) # Free batch norm layer to learn running average model.conv.seq_module[1].requires_grad_(requires_grad=True) model.conv.seq_module[4].requires_grad_(requires_grad=True) if freeze_rnns: model.rnns.requires_grad_(requires_grad=False) for i in range(1, len(model.rnns)): model.rnns[i].batch_norm.requires_grad_(requires_grad=True) if remove_bn_conv: model.conv.seq_module[1] = Identity() model.conv.seq_module[4] = Identity()
def train(self, **kwargs):
    """
    Run optimization to train the model.

    Builds the training/validation loaders, the optimizer and the AMP
    wrappers, then iterates over epochs: one pass over the training
    loader, one pass over the validation loader, logging, best-model
    tracking, learning-rate annealing and batch reshuffling.

    Parameters
    ----------
    world_size : int, optional
        Number of participating processes (default 1). Distributed
        training is enabled when it is greater than 1.
    gpu_rank : int, optional
        CUDA device index for this process (default 0). When truthy and
        distributed training is enabled it is passed to
        ``torch.cuda.set_device``.
    rank : int, optional
        Rank of this process (default 0). Rank 0 is treated as the main
        process, the only one that writes to TensorBoard.
    dist_backend : str, optional
        Backend handed to ``dist.init_process_group`` (default 'nccl').
    dist_url : str, optional
        ``init_method`` handed to ``dist.init_process_group``
        (default None).

    Raises
    ------
    ValueError
        If ``self.sampler_type`` (non-distributed runs) or
        ``self.update_rule`` is not one of the supported values.

    Notes
    -----
    Mutates ``self``: sets ``distributed``, ``model``, ``optimizer`` and
    (for 'cross_entropy_loss') ``criterion``; appends to ``loss_epochs``,
    ``accuracy_train_epochs`` and ``accuracy_val_epochs``; updates
    ``best_acc_val`` and ``best_params``.
    """
    world_size = kwargs.pop('world_size', 1)
    gpu_rank = kwargs.pop('gpu_rank', 0)
    rank = kwargs.pop('rank', 0)
    dist_backend = kwargs.pop('dist_backend', 'nccl')
    dist_url = kwargs.pop('dist_url', None)

    # NOTE(review): these overwrite any rendezvous address/port already
    # present in the environment - confirm this is intended.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '1234'

    main_proc = True
    self.distributed = world_size > 1

    if self.distributed:
        # Use the ``gpu_rank`` argument that is actually handed to
        # ``set_device`` (the guard previously read ``self.gpu_rank``).
        if gpu_rank:
            torch.cuda.set_device(int(gpu_rank))
        dist.init_process_group(backend=dist_backend,
                                init_method=dist_url,
                                world_size=world_size,
                                rank=rank)
        print('Initiated process group')
        main_proc = rank == 0  # Only the first proc should save models

    # The logger only exists on the main process, so every later use is
    # guarded by the same flag instead of ``self.tensorboard`` alone.
    use_tensorboard = main_proc and self.tensorboard
    tensorboard_logger = None
    if use_tensorboard:
        tensorboard_logger = TensorBoardLogger(self.id,
                                               self.log_dir,
                                               self.log_params,
                                               comment=self.sufix)

    if self.distributed:
        train_sampler = DistributedBucketingSampler(
            self.data_train,
            batch_size=self.batch_size,
            num_replicas=world_size,
            rank=rank)
    elif self.sampler_type == 'bucketing':
        train_sampler = BucketingSampler(self.data_train,
                                         batch_size=self.batch_size,
                                         shuffle=True)
    elif self.sampler_type == 'random':
        train_sampler = RandomBucketingSampler(
            self.data_train, batch_size=self.batch_size)
    else:
        # Fail here with a clear message rather than with an unbound
        # ``train_sampler`` a few lines below.
        raise ValueError(
            "Unknown sampler_type: {!r} (expected 'bucketing' or "
            "'random')".format(self.sampler_type))

    print("Shuffling batches for the following epochs..")
    train_sampler.shuffle(self.start_epoch)

    train_loader = AudioDataLoader(self.data_train,
                                   num_workers=self.num_workers,
                                   batch_sampler=train_sampler)
    val_loader = AudioDataLoader(self.data_val,
                                 batch_size=self.batch_size_val,
                                 num_workers=self.num_workers,
                                 shuffle=True)

    if use_tensorboard and self.generate_graph:
        # TODO: also log some audio samples.
        with torch.no_grad():
            inputs, targets, input_percentages, target_sizes = next(
                iter(train_loader))
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            tensorboard_logger.add_image(inputs,
                                         input_sizes,
                                         targets,
                                         network=self.model)

    self.model = self.model.to(self.device)
    parameters = self.model.parameters()

    if self.update_rule == 'adam':
        optimizer = torch.optim.Adam(parameters,
                                     lr=self.lr,
                                     weight_decay=self.reg)
    elif self.update_rule == 'sgd':
        optimizer = torch.optim.SGD(parameters,
                                    lr=self.lr,
                                    weight_decay=self.reg)
    else:
        # Same reasoning as for the sampler: avoid an unbound local.
        raise ValueError(
            "Unknown update_rule: {!r} (expected 'adam' or "
            "'sgd')".format(self.update_rule))

    self.model, self.optimizer = amp.initialize(
        self.model,
        optimizer,
        opt_level=self.opt_level,
        keep_batchnorm_fp32=self.keep_batchnorm_fp32,
        loss_scale=self.loss_scale)

    # Restore optimizer / AMP state when resuming.
    if self.optim_state is not None:
        self.optimizer.load_state_dict(self.optim_state)

    if self.amp_state is not None:
        amp.load_state_dict(self.amp_state)

    if self.distributed:
        self.model = DistributedDataParallel(self.model)

    print(self.model)

    if self.criterion_type == 'cross_entropy_loss':
        self.criterion = torch.nn.CrossEntropyLoss()

    # Per-iteration history; collected but not persisted anywhere yet.
    accuracies_train_iters = []
    losses_iters = []

    batch_time = AverageMeter()
    epoch_time = AverageMeter()
    losses = AverageMeter()

    start_training = time.time()
    for epoch in range(self.start_epoch, self.num_epochs):
        print("Start epoch..")

        # Put model in train mode
        self.model.train()

        # Reset the running loss every epoch so that one epoch's average
        # does not leak into the next one.
        avg_loss = 0

        # Collect per-batch arrays and concatenate once per epoch.
        y_true_chunks = []
        y_pred_chunks = []

        epoch_start = time.time()
        for i, data in enumerate(train_loader, start=0):
            start_batch = time.time()
            print('Start batch..')

            # NOTE(review): unclear why this guard is needed (the loader
            # is driven by ``train_sampler``); kept from the original.
            if i == len(train_sampler):
                break

            inputs, targets, input_percentages, _ = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

            inputs = inputs.to(self.device)
            targets = targets.to(self.device)

            output, loss_value = self._step(inputs, input_sizes, targets)
            print('Step finished.')

            avg_loss += loss_value

            with torch.no_grad():
                y_pred = self.decoder.decode(output.detach()).cpu().numpy()

            targets_np = targets.cpu().numpy()
            y_true_chunks.append(targets_np)
            y_pred_chunks.append(y_pred)

            inputs_size = inputs.size(0)
            del output, inputs, input_percentages

            if self.intra_epoch_sanity_check:
                with torch.no_grad():
                    acc, _ = self.check_accuracy(targets_np, y_pred=y_pred)
                    accuracies_train_iters.append(acc)
                    losses_iters.append(loss_value)

                    cm = confusion_matrix(targets_np,
                                          y_pred,
                                          labels=self.labels)
                    print('[it %i/%i] Confusion matrix train step:' %
                          ((i + 1, len(train_sampler))))
                    print(pd.DataFrame(cm))

                    # ``acc`` only exists inside the sanity check, so the
                    # per-iteration logging lives here as well.
                    if use_tensorboard:
                        tensorboard_logger.update(
                            len(train_loader) * epoch + i + 1, {
                                'Loss/through_iterations': loss_value,
                                'Accuracy/train_through_iterations': acc
                            })

            del targets

            batch_time.update(time.time() - start_batch)
            epoch_time.update(time.time() - epoch_start)
            losses.update(loss_value, inputs_size)

            # Write elapsed time (and loss) to terminal
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Epoch {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_sampler),
                      batch_time=batch_time,
                      data_time=epoch_time,
                      loss=losses))

        # Seeding with an empty float array keeps the resulting dtype the
        # same as the former batch-by-batch concatenation.
        y_true_train_epoch = np.concatenate([np.array([])] + y_true_chunks)
        y_pred_train_epoch = np.concatenate([np.array([])] + y_pred_chunks)

        # Loss log
        avg_loss /= len(train_sampler)
        self.loss_epochs.append(avg_loss)

        # Accuracy train log
        acc_train, _ = self.check_accuracy(y_true_train_epoch,
                                           y_pred=y_pred_train_epoch)
        self.accuracy_train_epochs.append(acc_train)

        # Accuracy val log
        with torch.no_grad():
            y_pred_val_chunks = []
            targets_val_chunks = []
            for data in val_loader:
                inputs, targets, input_percentages, _ = data
                input_sizes = input_percentages.mul_(int(
                    inputs.size(3))).int()
                targets_np = targets.cpu().numpy()
                _, y_pred_val_batch = self.check_accuracy(
                    targets_np, inputs=inputs, input_sizes=input_sizes)
                y_pred_val_chunks.append(y_pred_val_batch)
                targets_val_chunks.append(targets_np)
                del inputs, targets, input_percentages

            y_pred_val = np.concatenate([np.array([])] + y_pred_val_chunks)
            targets_val = np.concatenate(
                [np.array([])] + targets_val_chunks)

            acc_val, y_pred_val = self.check_accuracy(targets_val,
                                                      y_pred=y_pred_val)

        self.accuracy_val_epochs.append(acc_val)
        cm = confusion_matrix(targets_val, y_pred_val, labels=self.labels)
        print('Confusion matrix validation:')
        print(pd.DataFrame(cm))

        # Write epoch stuff to tensorboard
        if use_tensorboard:
            tensorboard_logger.update(
                epoch + 1, {'Loss/through_epochs': avg_loss},
                parameters=self.model.named_parameters)

            tensorboard_logger.update(epoch + 1, {
                'train': acc_train,
                'validation': acc_val
            },
                                      together=True,
                                      name='Accuracy/through_epochs')

        # Keep track of the best model
        if acc_val > self.best_acc_val:
            self.best_acc_val = acc_val
            # Detach before cloning so the snapshot is a plain copy of the
            # values and does not stay attached to the autograd graph.
            # TODO: actually copy the model and save it later?
            self.best_params = {
                k: v.detach().clone()
                for k, v in self.model.named_parameters()
            }

        # Anneal learning rate.
        # TODO: find a better, per-parameter way to do this (as cs231n does).
        for g in self.optimizer.param_groups:
            g['lr'] = g['lr'] / self.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

        # Shuffle batches order
        print("Shuffling batches...")
        train_sampler.shuffle(epoch)

        # Rechoose batches elements
        if self.sampler_type == 'random':
            train_sampler.recompute_bins()

    end_training = time.time()

    if use_tensorboard:
        tensorboard_logger.close()

    # Reported value is in minutes.
    print('Elapsed time in training: %.02f ' %
          ((end_training - start_training) / 60.0))