def train_max_epochs(self, args, train0, train1, dev0, dev1, vocab, no_of_epochs,
                     writer, time, save_epochs_flag=False, save_batch_flag=False,
                     save_batch=5):
    print("No of epochs: ", no_of_epochs)
    self.train()
    self.enc_optim = optim.AdamW(self.encoder.parameters(),
                                 lr=args.learning_rate,
                                 betas=(self.beta1, self.beta2))
    self.gen_optim = optim.AdamW(self.generator.parameters(),
                                 lr=args.learning_rate,
                                 betas=(self.beta1, self.beta2))
    self.discrim1_optim = optim.AdamW(self.discriminator1.parameters(),
                                      lr=args.learning_rate,
                                      betas=(self.beta1, self.beta2))
    self.discrim2_optim = optim.AdamW(self.discriminator2.parameters(),
                                      lr=args.learning_rate,
                                      betas=(self.beta1, self.beta2))
    Path(args.saves_path).mkdir(parents=True, exist_ok=True)
    saves_path = os.path.join(args.saves_path, utils.get_filename(args, time, "model"))
    Path(saves_path).mkdir(parents=True, exist_ok=True)
    flag = True
    with autograd.detect_anomaly():
        for epoch in range(no_of_epochs):
            random.shuffle(train0)
            random.shuffle(train1)
            batches0, batches1, _1, _2 = utils.get_batches(train0, train1, vocab.word2id,
                                                           args.batch_size, noisy=True)
            dev_batches0 = []
            dev_batches1 = []
            if self.args.dev:
                dev_batches0, dev_batches1, _, _ = utils.get_batches(dev0, dev1, vocab.word2id,
                                                                     args.batch_size, noisy=True)
            # batches0, batches1, _1, _2 = utils.get_batches_bpe(train0, train1, vocab.word2id,
            #                                                    args.batch_size, noisy=True)
            random.shuffle(batches0)
            random.shuffle(batches1)
            print("Epoch: ", epoch)
            self.logger.info("Epoch: " + str(epoch))
            train_flag = self(args, batches0, batches1, dev_batches0, dev_batches1, vocab,
                              no_of_epochs, epoch, writer, time, save_epochs_flag=False,
                              save_batch_flag=False, save_batch=5)
            if train_flag:
                break
def loss_function(self, forward_ret, labels=None):
    x_gen, x_real = forward_ret
    if self.debug:
        debug_context = autograd.detect_anomaly()
    else:
        debug_context = contextlib.nullcontext()
    with debug_context:
        d_p = self.disc(x_real)
        d_q = self.disc(x_gen)
        if self.train_disc():
            if self.flags.gan_loss == 'bce':
                loss = F.binary_cross_entropy_with_logits(d_p, torch.ones_like(d_p)) + \
                       F.binary_cross_entropy_with_logits(d_q, torch.zeros_like(d_q))
            elif self.flags.gan_loss == 'wgan':
                grad_penalty = self.gradient_penalty(x_real, x_gen, context=debug_context)
                loss = -d_p.mean() + d_q.mean() + (10.0 * grad_penalty) + 1e-3 * (d_p**2).mean()
            self.d_loss = loss.item()
        else:
            if self.flags.gan_loss == 'bce':
                loss = F.binary_cross_entropy_with_logits(d_p, torch.zeros_like(d_p)) + \
                       F.binary_cross_entropy_with_logits(d_q, torch.ones_like(d_q))
            elif self.flags.gan_loss == 'wgan':
                loss = d_p.mean() - d_q.mean()
            self.g_loss = loss.item()
    return loss, self.g_loss, self.d_loss
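# The `self.gradient_penalty` helper used in the WGAN branch above is not shown here.
# A minimal sketch of a standard WGAN-GP penalty (interpolating between real and
# generated samples) could look like the following; the names `disc`, `x_real`, and
# `x_gen` are illustrative, not the original implementation.
def gradient_penalty(disc, x_real, x_gen):
    # one random interpolation coefficient per example, broadcast over the remaining dims
    eps = torch.rand(x_real.size(0), *([1] * (x_real.dim() - 1)), device=x_real.device)
    x_hat = (eps * x_real + (1 - eps) * x_gen).requires_grad_(True)
    d_hat = disc(x_hat)
    # gradient of the critic output w.r.t. the interpolated input
    grads = torch.autograd.grad(outputs=d_hat.sum(), inputs=x_hat, create_graph=True)[0]
    # penalise deviation of the per-sample gradient norm from 1
    return ((grads.flatten(1).norm(2, dim=1) - 1) ** 2).mean()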
def train(self):
    train_loss = np.array([])
    valid_loss = np.array([])
    print("start train")
    for epoch in range(self.epoch_num):
        with detect_anomaly():
            print('epoch{0}'.format(epoch))
            start = time.time()

            # training
            self.model.train()
            tmp_train_loss = self._run(self.model, self.criterion, self.train_data_loader,
                                       self.train_batch_size, mode='train')
            train_loss = np.append(train_loss, tmp_train_loss.clone().numpy())

            # validation
            self.model.eval()
            with torch.no_grad():
                tmp_valid_loss = self._run(self.model, self.criterion, self.valid_data_loader,
                                           self.valid_batch_size, mode='validation')
                valid_loss = np.append(valid_loss, tmp_valid_loss.clone().numpy())

            if (epoch + 1) % 10 == 0:
                torch.save(self.model.state_dict(),
                           self.save_path + 'wave_u_net{0}.ckpt'.format(epoch + 1))

            end = time.time()
            print('----execute time: {0}'.format(end - start))
    plt.plot(train_loss)
    print(train_loss)
    plt.show()
def loss_function(self, input, output, mean, logvar):
    """ Given input, output, mean and logvar, compute and return loss """
    # to prevent log of 0
    epsilon = 1e-6
    with autograd.detect_anomaly():
        L_recon = -1 * torch.sum(input * torch.log(output + epsilon) +
                                 (1 - input) * torch.log(1 - output + epsilon), dim=1)
        # L_reg = -0.5 * torch.sum(1 + std - mean.pow(2) - std.exp())
        # L_recon = torch.nn.functional.binary_cross_entropy(output, input)
        L_reg = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
        # L_reg = torch.sum(torch.sum(-1*torch.log(std) + ((std.pow(2) + mean.pow(2)) - 1) * 0.5, dim=1), dim=0)

        # Normalise by the same number of elements as in the reconstruction if we average recon
        L_reg /= input.size(dim=0)  # * self.image_dim -- check whether this is needed again

        # get total loss
        total_loss = torch.mean(L_recon + L_reg, dim=0)  # may need to be the sum
        return total_loss
def train(epoch, batch_logger, train_loader):
    model.train()
    if epoch == 120:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.0001
    if epoch == 150:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.00001
    for i, data in enumerate(train_loader):
        with autograd.detect_anomaly():
            data = data.to(device)
            # print(data.y)
            optimizer.zero_grad()
            end_point = model(data)
            loss = F.nll_loss(end_point, data.y)
            pred = end_point.max(1)[1]
            acc = (pred.eq(data.y).sum().item()) / len(data.y)
            loss.backward()
            optimizer.step()
            if i % 10 == 0:
                batch_logger.log({'epoch': epoch,
                                  'batch': i + 1,
                                  'loss': loss.item(),
                                  'acc': acc})
def train(self, xs, ys):
    with autograd.detect_anomaly():
        xs = [self._t_cuda(x) for x in xs]
        ys = [self._t_cuda(y) for y in ys]

        # zero the gradients of the loss w.r.t. the weights
        self.optimizer.zero_grad()

        # calling the network runs forward() under the hood and computes the losses
        loss, focal_loss, pull_loss, push_loss, off_loss = self.network(xs, ys)
        loss = loss.mean()

        # added
        focal_loss = focal_loss.mean()
        pull_loss = pull_loss.mean()
        push_loss = push_loss.mean()
        off_loss = off_loss.mean()

        # backpropagate to compute the gradients
        loss.backward()

        # update the parameters (w <- w - lr * grad)
        self.optimizer.step()

    return loss, focal_loss, pull_loss, push_loss, off_loss
def train(model, loader, optimizer, n_iter):
    model.train()
    err = 0.0
    i = 0
    pbar = tqdm(total=len(loader), desc='records loaded')
    for i, (seq, prof, _, dmat, pdb, *_) in enumerate(batch_generator(loader, prepare_xu_batch)):
        optimizer.zero_grad()
        cmap_hat = predict(model, seq, prof)
        if n_iter % UPLOAD_IMAGE_EVERY == 0:
            dmat_hat = cmap_to_dmat(cmap_hat)
            upload_images(dmat, dmat_hat, pdb, n_iter, '%s/%s' % (model.name, 'train'))
        loss = get_loss(cmap_hat, dmat)
        err += loss.item()
        e = err / (i + 1.)
        writer.add_scalars('%s/Loss' % model.name, {"train": loss.item()}, n_iter)
        try:
            with autograd.detect_anomaly():
                loss.backward()
        except RuntimeError as e:
            raise e
        optimizer.step_and_update_lr(loss.item())
        lr = optimizer.lr
        pbar.set_description("Training Loss:%.6f, LR: %.6f (L=%d)" % (e, lr, seq.size(1)))
        pbar.update(seq.size(0))
        n_iter += 1
    pbar.close()
    return n_iter
def train_epoch(self, epoch):
    """ Train the model on the training set for one epoch. """
    t1 = time()
    output = {'tp': [], 'fp': [], 'fn': [], 'tn': [], 'loss': [], 'preds': []}
    train_info = []
    self.model = self.model.train()
    train_iter = self.iterator(self.data['train'],
                               batch_size=self.params['batch'],
                               shuffle_=self.params['shuffle_data'])
    self.optimizer.zero_grad()
    for batch_idx, batch in enumerate(train_iter):
        batch = self.convert_batch(batch)

        with autograd.detect_anomaly():
            loss, stats, predictions, select = self.model(batch)
            loss.backward()  # backward computation

        output['loss'] += [loss.item()]
        output['tp'] += [stats['tp'].to('cpu').data.numpy()]
        output['fp'] += [stats['fp'].to('cpu').data.numpy()]
        output['fn'] += [stats['fn'].to('cpu').data.numpy()]
        output['tn'] += [stats['tn'].to('cpu').data.numpy()]
        output['preds'] += [predictions.to('cpu').data.numpy()]
        train_info += [batch['info'][select[0].to('cpu').data.numpy(),
                                     select[1].to('cpu').data.numpy(),
                                     select[2].to('cpu').data.numpy()]]

        # Accumulate gradients (by Yuwei Xu)
        if (batch_idx + 1) % self.accumulation_steps == 0:
            nn.utils.clip_grad_norm_(self.model.parameters(), self.gc)  # gradient clipping
            self.optimizer.step()
            self.optimizer.zero_grad()
    t2 = time()

    if self.window:
        total_loss, scores = self.subdocs_performance(output['loss'], output['preds'], train_info)
    else:
        total_loss, scores = self.performance(output)

    self.train_res['loss'] += [total_loss]
    self.train_res['score'] += [scores[self.primary_metric]]
    print('Epoch: {:02d} | TRAIN | LOSS = {:.05f}, '.format(epoch, total_loss), end="")
    print_results(scores, [], self.show_class, t2 - t1)
def train(
    self,
    epochs: int,
    val_frequency: int,
    print_frequency: int = 20,
    log_frequency: int = 5,
    start_epoch: int = 0,
):
    self.model.train()
    with autograd.detect_anomaly():
        for epoch in range(start_epoch, epochs):
            self.model.train()
            data_load_start_time = time.time()
            # Iterate over the samples in the training set
            for batch, labels, fnames in self.train_loader:
                batch = batch.to(self.device)
                labels = labels.to(self.device)
                data_load_end_time = time.time()
                batch = batch.float()
                labels = labels.float()

                # Forward pass of the CNN
                logits = self.model.forward(batch)
                labels = torch.unsqueeze(labels, dim=1)

                # Calculate the loss
                loss = self.criterion(logits, labels)

                # Backpropagate error
                loss.backward()

                # Update the optimiser
                self.optimizer.step()
                self.optimizer.zero_grad()

                # Log times and loss
                data_load_time = data_load_end_time - data_load_start_time
                step_time = time.time() - data_load_end_time
                if ((self.step + 1) % log_frequency) == 0:
                    self.log_metrics(epoch, loss, data_load_time, step_time)
                if ((self.step + 1) % print_frequency) == 0:
                    self.print_metrics(epoch, loss, data_load_time, step_time)

                self.step += 1
                data_load_start_time = time.time()

            self.summary_writer.add_scalar("epoch", epoch, self.step)
            if ((epoch + 1) % val_frequency) == 0:
                self.validate(epoch)
                # self.validate() will put the model in validation mode,
                # so we have to switch back to train mode afterwards
                self.model.train()
def UCE_loss(self, alpha, soft_output):
    with autograd.detect_anomaly():
        alpha_0 = alpha.sum(1).unsqueeze(-1).repeat(1, self.output_dim)
        entropy_reg = Dirichlet(alpha).entropy()
        UCE_loss = torch.sum(soft_output * (torch.digamma(alpha_0) - torch.digamma(alpha))) - \
            self.regr * torch.sum(entropy_reg)
        return UCE_loss
def test_g1(self):
    with autograd.detect_anomaly():
        inp = torch.rand(5, 5, requires_grad=True)
        x = inp[1:3, 1:3]
        x = x**2
        d = x.sum()
        print(d.grad)
        d.backward(retain_graph=True)
        print(d.grad)
        print(inp.grad)
def _fwd(self, X):
    with torch.no_grad():
        with autograd.detect_anomaly():
            try:
                X = X.to(self.devices[0])
                y = self.tcn(X).detach().cpu().numpy()
            except Exception as e:
                os.system('clear')
                Tools.pyout(e, force=True)
                sys.exit(0)
    return y
def train_epoch(self, epoch):
    """
    Train model on the training set for 1 epoch,
    estimate performance and average loss.
    """
    t1 = time.time()
    output_tr = {'tp': [], 'fp': [], 'fn': [], 'tn': [], 'loss': [],
                 'preds': [], 'truth': [], 'probs': []}
    self.model = self.model.train()
    train_iter = self.iterator(self.data['train'],
                               batch_size=self.params['batch'],
                               shuffle_=True)
    for batch in train_iter:
        batch = self.convert_batch(batch)

        with autograd.detect_anomaly():
            self.optimizer.zero_grad()
            loss, stats, probs, preds, truth, att_scores = self.model(batch)

            output_tr['preds'] += preds.to('cpu').data.tolist()
            output_tr['probs'] += probs.to('cpu').data.tolist()
            output_tr['truth'] += truth.to('cpu').data.tolist()
            output_tr['loss'] += [loss.item()]
            output_tr['tp'] += [stats['tp'].to('cpu').data.numpy()]
            output_tr['fp'] += [stats['fp'].to('cpu').data.numpy()]
            output_tr['fn'] += [stats['fn'].to('cpu').data.numpy()]
            output_tr['tn'] += [stats['tn'].to('cpu').data.numpy()]

            loss.backward()  # backward computation
            nn.utils.clip_grad_norm_(self.model.parameters(), self.params['gc'])  # gradient clipping
            self.optimizer.step()  # update
    t2 = time.time()

    # estimate performance
    total_loss, scores = self.performance(output_tr)
    self.train_res['loss'] += [total_loss]
    self.train_res['score'] += [scores[self.primary_metric]]
    print('Epoch: {:02d} | TRAIN | LOSS = {:.04f}'.format(epoch, total_loss), end="")
    print_results(scores, self.show_class, t2 - t1)
def test_g2(self):
    with autograd.detect_anomaly():
        x = torch.zeros(5, 5, requires_grad=True)
        inp = torch.tensor([1, 3, 1, 3], requires_grad=True)
        x[inp[0]:inp[1], inp[2]:inp[3]] = 1
        x = x**2
        d = x.sum()
        print(d.grad)
        d.backward(retain_graph=True)
        print(d.grad)
        print(inp.grad)
def train(model, loader, optimizer, n_iter):
    model.train()
    err = 0.0
    i = 0
    pbar = tqdm(total=len(loader), desc='records loaded')
    for i, (seq, beta, prof, dmat, dssp, pdb, *_) in enumerate(batch_generator(loader, prepare_pdb_batch)):
        optimizer.zero_grad()
        cmap_hat, dssp_hat = predict(model, seq, beta, prof)
        cmap = get_contact_map(dmat)
        losses = get_loss(cmap_hat, cmap, dssp_hat, dssp)
        loss = sum(losses)
        err += loss.item()
        e = err / (i + 1.)
        writer.add_scalars('M3/Loss', {"train": loss.item()}, n_iter)
        writer.add_scalars('M3/CMAP_BCE', {"train": losses[0].item()}, n_iter)
        writer.add_scalars('M3/SYM_L1', {"train": losses[1].item()}, n_iter)
        if len(losses) == 3:
            writer.add_scalars('M3/SS_CE', {"train": losses[2].item()}, n_iter)
        try:
            with autograd.detect_anomaly():
                loss.backward()
        except RuntimeError as e:
            raise e
        if n_iter % UPLOAD_IMAGE_EVERY == 0:
            for cm1, cm2, dm, pdb_id in zip(cmap_hat.data.cpu().numpy(),
                                            cmap.float().data.cpu().numpy(),
                                            dmat.data.cpu().numpy(), pdb):
                writer.add_image('M3/%s/cmap_pred' % pdb_id, to_colormap_image(cm1),
                                 n_iter, dataformats='HWC')
                writer.add_image('M3/%s/cmap_true' % pdb_id, to_colormap_image(cm2),
                                 n_iter, dataformats='HWC')
                # writer.add_image('M3/%s/dmap_true' % pdb_id, to_colormap_image(dm),
                #                  n_iter, dataformats='HWC')
        optimizer.step_and_update_lr(loss.item())
        lr = optimizer.lr
        pbar.set_description("Training Loss:%.6f, LR: %.6f (L=%d)" % (e, lr, seq.size(1)))
        pbar.update(seq.size(0))
        n_iter += 1
    pbar.close()
    return n_iter
def train(train_data):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    total_aux_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, train_data.size(1) - 1, BPTT)):
        with autograd.detect_anomaly():
            data, targets = get_batch(train_data, i)  # data is [35, 20], targets is [700]
            trg_mask = create_mask(data).to(device)
            optimizer.zero_grad()
            for submodule in model.modules():
                submodule.register_forward_hook(nan_hook)
            output, aux_loss = model(src=None, trg=data, src_mask=None,
                                     trg_mask=trg_mask, is_lm=True)
            output = output.view(-1, ntokens)
            loss = criterion(output, targets)
            final_loss = loss + aux_loss
            final_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step_and_update_lr()

            model_has_nan = check_for_nans(model)
            if model_has_nan:
                print("Nans have been identified")

            total_loss += loss.item()
            total_aux_loss += aux_loss.item()
            if batch == 0:
                print("Running without errors")
            if batch % LOG_INTERVAL == 0 and batch > 0:
                cur_loss = total_loss / LOG_INTERVAL  # cur_loss is independent of the aux loss
                curr_aux_loss = total_aux_loss / LOG_INTERVAL
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | aux_loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch, train_data.size(1) // BPTT, LR,
                          elapsed * 1000 / LOG_INTERVAL, cur_loss, curr_aux_loss,
                          math.exp(cur_loss)))
                total_loss = 0.
                total_aux_loss = 0.
                start_time = time.time()
def run_experiment(dataset_folder, ratio_0, ratio_p1, use_cce, batch_size, lr, epochs,
                   early_stopping_patience, emb_dim, lstm_hidden_dim, emb_dropout_p,
                   lstm_dropout_p, n_classes, checkpoint_file, device):
    tb_writer = SummaryWriter(comment=f"_r0_{ratio_0}_rp1_{ratio_p1}_cce_{use_cce}")
    vocab_size, train_it, val_it, test_it = read_data(dataset_folder, ratio_0, ratio_p1,
                                                      batch_size, device)
    model = BiLstmClassifier(vocab_size, emb_dim, lstm_hidden_dim, emb_dropout_p,
                             lstm_dropout_p, n_classes).to(device)
    opt = optim.Adam(model.parameters(), lr)
    criterion = complement_cross_entropy_loss if use_cce else one_hot_cross_entropy_loss
    with autograd.detect_anomaly():
        model = train_model(model, n_classes, criterion, opt, train_it, val_it, epochs,
                            early_stopping_patience, checkpoint_file, tb_writer)
        score = evaluate_model(model, n_classes, test_it, tb_writer)
    return score
def train(self, states, critic):
    with autograd.detect_anomaly():
        # transform to torch tensors
        states = torch.from_numpy(states).float().to(self._device)

        self._optimizer.zero_grad()
        # compute actions taken in these states by the actor
        _actionsPred = self._backbone([states])
        # compose the critic over the actor outputs (sandwich), which effectively does g(f(x))
        _lossActor = -critic(states, _actionsPred).mean()
        _lossActor.backward()
        # take a step with the optimizer
        self._optimizer.step()
def train(self, loss, clip_grad_norm=None):
    assert self.is_training()
    self.optimizer.zero_grad()
    if self.debug:
        debug_context = autograd.detect_anomaly()
    else:
        debug_context = contextlib.nullcontext()
    with debug_context:
        loss.backward()
    if clip_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.model.parameters(), clip_grad_norm)
    self.optimizer.step()
    self.increment_train_steps()
def _forward_pass_with_anomaly_detection(
        self,
        patches: torch.Tensor,
        mask: torch.Tensor = None,
        labels: torch.Tensor = None) -> SegmentationForwardPass.Result:
    if self.detect_anomaly:
        with autograd.detect_anomaly():
            result = self._forward_pass(patches, mask, labels)
            if result.loss is not None and (math.isnan(result.loss) or math.isinf(result.loss)):
                raise RuntimeError(f"The loss computation returned {result.loss}")
            return result
    return self._forward_pass(patches, mask, labels)
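# Note: besides the context-manager form used in the snippets here, torch also provides
# a global switch, torch.autograd.set_detect_anomaly(True), which enables the same
# checks for everything that follows. A tiny illustration with assumed toy tensors:
import torch

torch.autograd.set_detect_anomaly(True)   # enable globally
x = torch.randn(3, requires_grad=True)
y = (x * 2).sum()
y.backward()                              # backward ops are checked for NaN/Inf
torch.autograd.set_detect_anomaly(False)  # disable again when done debugging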
def predict_test(testset):
    # Create dataset loader
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              drop_last=False)

    # restore checkpoint
    restore_checkpoint(args)

    # Set loss function
    args.criterion = torch.nn.MSELoss()

    # Predict all test elements and measure
    run_loss = 0
    for batch_idx, batch in enumerate(test_loader, 1):
        # Unpack batch
        inputs, targets = batch

        # Send to device
        inputs = inputs.to(args.device)
        targets = targets.to(args.device)

        # Calculate gradients and update
        with autograd.detect_anomaly():
            # forward
            outputs = args.net(inputs)

            # get maximum from each layer
            print(outputs.shape)
            idx_inpt = get_max(inputs, dim=(2, 3))
            idx_otpt = get_max(outputs, dim=(2, 3))
            print(idx_inpt.shape)
            print(idx_inpt)
            print(idx_otpt.shape)
            print(idx_otpt)
            input()

            # calculate loss
            loss = args.criterion(outputs, targets)
            run_loss += loss.item()

        if batch_idx < 10:
            # Plot predictions
            # img = imshow_bboxes(inputs, targets, args, t_outputs)
            # args.writer.add_image('Test/predicted', img, batch_idx)
            pass
        else:
            break
def test_LinearOperator_radon_gradcheck(self):
    # Set image size.
    image_size = (5, 4)

    # Define angles.
    nangles = 180
    angles = np.linspace(0, np.pi, nangles, False)

    # Create operators.
    R, Radj, ndet = radon.radon2d(*image_size, angles)
    data_size = (nangles, ndet)

    # Create instances for use with torch.
    K = radon.RadonTransform(R, Radj, data_size)
    Kadj = radon.BackProjection(R, Radj, image_size)

    # Apply to dummy input.
    x = torch.randn((1, 1, *image_size), requires_grad=True, dtype=torch.double)
    f = K(x)

    # Check for simple loss.
    loss = f.sum()
    loss.backward()
    torch.allclose(x.grad, Kadj(x.new_ones(1, 1, *data_size)))

    def op_fun(x):
        out = LinearOperator.apply(x, K, Kadj)
        return out.sum()

    # Check for anomalies.
    with tag.detect_anomaly():
        x = torch.randn(1, 1, *image_size, requires_grad=True, dtype=torch.double)
        out = op_fun(x)
        out.backward()

    # Check numerical gradient up to certain tolerance.
    # Due to inaccuracy of adjoint this check fails.
    x = torch.randn(1, 1, *image_size, requires_grad=True, dtype=torch.double)
    tag.gradcheck(lambda t: K(t), x)
def train(args, model, device, train_loader, optimizer, epoch, is_display):
    model.train()
    total_loss = 0
    for batch_idx, (imgs, labels, gt) in enumerate(train_loader):
        with autograd.detect_anomaly():
            if args.using_contrastive_loss:
                target = gt.type(torch.FloatTensor).view(gt.shape[0], 1)
                target = target.view(-1)
                imgs[0], imgs[1], imgs[2], imgs[3], target = imgs[0].to(device), imgs[1].to(device), \
                    imgs[2].to(device), imgs[3].to(device), target.to(device)
                optimizer.zero_grad()
                _, A, B, C, D = model(imgs[0], imgs[1], imgs[2], imgs[3])
                embs = [A, B, C, D]
                loss = criteria(embs, target)
                total_loss += loss.item()
                sum_loss = loss
                if sub_criteria is not None:
                    sub_loss = sub_criteria(embs, target)
                    total_loss += sub_loss.item()
                    sum_loss += sub_loss
                if sub_criteria_2 is not None:
                    sub_loss = sub_criteria_2(embs, target)
                    total_loss += sub_loss.item()
                    sum_loss += sub_loss
            else:
                target = gt.type(torch.FloatTensor).view(gt.shape[0], 1)
                imgs[0], imgs[1], imgs[2], imgs[3], target = imgs[0].to(device), imgs[1].to(device), \
                    imgs[2].to(device), imgs[3].to(device), target.to(device)
                optimizer.zero_grad()
                output, emb_a, emb_b, emb_c, emb_d = model(imgs[0], imgs[1], imgs[2], imgs[3])
                loss = criteria(output, target)
                total_loss += loss.item()
                sum_loss = loss

            sum_loss.backward()
            optimizer.step()

        if batch_idx % args.log_interval == 0:
            logging.info('[Epoch {}/{}] [Batch {}/{}] [loss: {:.6f}]'.format(
                epoch, args.epoch, batch_idx, len(train_loader), sum_loss.item()))

    total_loss /= len(train_loader)
    logging.info('[Epoch {}/{}] [loss: {:.6f}]'.format(epoch, args.epoch, total_loss))
    return total_loss
def train(self):
    LOGGER.addHandler(logging.FileHandler(
        os.path.join(self.args.checkpoint_dir, 'logger_train.log')))
    writer = SummaryWriter(log_dir=self.args.checkpoint_dir)
    start_time = time()

    # setup dataset/data loader
    loader = {k: self.__setup_loader(k) for k in ['train', 'valid']}
    LOGGER.info('data_loader: %s' % str(list(loader.keys())))

    # start training
    LOGGER.info('*** start training from step %i, epoch %i ***' % (self.__step, self.__epoch))
    try:
        with detect_anomaly():
            while True:
                data_loader, _ = loader['train']
                if_training_finish = self.__epoch_train(data_loader, writer=writer)
                self.release_cache()

                data_loader, info_loader = loader['valid']
                self.__epoch_valid(data_loader, info_loader=info_loader, writer=writer,
                                   prefix='valid', sample_n=SAMPLE_N)
                self.release_cache()

                if if_training_finish:
                    break
                self.__epoch += 1
    except RuntimeError:
        LOGGER.exception('*** RuntimeError (NaN found, see above log in detail) ***')
    except KeyboardInterrupt:
        LOGGER.info('*** KeyboardInterrupt ***')

    self.__save()
    LOGGER.info('[training completed, %0.2f sec in total]' % (time() - start_time))
    writer.close()
    LOGGER.info('ckpt saved at %s' % self.args.checkpoint_dir)
def test_LinearOperator_radon_cuda(self):
    # Set image size.
    image_size = 5, 4

    # Define angles.
    nangles = 180
    angles = np.linspace(0, np.pi, nangles, False)

    # Check if GPU is available.
    cuda = torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')

    # Create operators.
    R, Radj, ndet = radon.radon2d(*image_size, angles, cuda)
    data_size = (nangles, ndet)

    # Create instances for use with torch.
    K = radon.RadonTransform(R, Radj, data_size)
    Kadj = radon.BackProjection(R, Radj, image_size)

    # Apply to dummy input.
    x = torch.randn((1, 1, *image_size), requires_grad=True,
                    dtype=torch.double, device=device)
    f = K(x)

    # Check for simple loss.
    loss = f.sum()
    loss.backward()
    torch.allclose(x.grad, Kadj(x.new_ones(1, 1, *data_size)))

    def op_fun(x):
        out = LinearOperator.apply(x, K, Kadj)
        return out.sum()

    # Check for anomalies.
    with tag.detect_anomaly():
        x = torch.randn(1, 1, *image_size, requires_grad=True,
                        dtype=torch.double, device=device)
        out = op_fun(x)
        out.backward()
def train():
    ds = GlazeCompositionDataset()
    train_loader, val_loader = get_data_loaders(ds)
    out_D = len(ds.compounds)
    print('Out Dimension is %i' % out_D)
    model = Net(out_D)
    loss = torch.nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.2, momentum=0.5)
    trainer = create_supervised_trainer(model, optimizer, loss)
    metrics = {'MSE': ignite.metrics.RootMeanSquaredError()}
    evaluator = create_supervised_evaluator(model, metrics)
    saver = ignite.handlers.ModelCheckpoint('./checkpoints/models', 'chkpoint',
                                            save_interval=2, n_saved=4,
                                            create_dir=True, require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, saver, {'glaze_net_3': model})
    print(model.state_dict().keys())

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        iter = (trainer.state.iteration - 1) % len(train_loader) + 1
        if iter % 10 == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.10f}".format(
                trainer.state.epoch, iter, len(train_loader), trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print("Training Results - Epoch: {} MSE: {:.2f}".format(
            trainer.state.epoch, metrics['MSE']))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        print("Validation Results - Epoch: {} MSE: {:.2f}".format(
            trainer.state.epoch, metrics['MSE']))

    with autograd.detect_anomaly():
        trainer.run(train_loader, max_epochs=100)

    return model
def run(self):
    try:
        # display configuration
        self.logger.debug('>>> configuration: \n' + conf().dump().strip())

        # load pre-trained model
        current_epoch = self.load_model()

        # train
        with autograd.detect_anomaly():
            self.train(current_epoch)

        # evaluate
        self.evaluate()
    except Exception as e:
        self.logger.error(e, exc_info=True)
        raise e
def run_batch(self, data, visualize=False):
    """If visualize is True, a visualize method of the model module is called."""
    if not isinstance(data, list) and not isinstance(data, tuple):
        data = [data]
    if self.is_training():
        context = contextlib.nullcontext()
    else:
        context = torch.no_grad()
    if self.debug:
        debug_context = autograd.detect_anomaly()
    else:
        debug_context = contextlib.nullcontext()
    with context, debug_context:
        if not visualize:
            return self.model(*data)
        else:
            return self.model.visualize(*data)
def train(model, loader, optimizer, n_iter):
    model.train()
    err = 0.0
    i = 0
    pbar = tqdm(total=len(loader), desc='pairs loaded')
    for i, (s1, s2, b1, b2, p1, p2, m1, m2, idx, pdb1, pdb2, *_) in \
            enumerate(batch_generator(loader, prepare_pairs_batch)):
        optimizer.zero_grad()
        assert s1.shape == s2.shape
        assert m1.shape == m2.shape
        assert p1.shape == p2.shape
        ddm_hat, ddm = predict_2ways(model, m1, m2, s1, s2, b1, b2, p1, p2, idx)
        loss = get_loss(ddm_hat, ddm)
        err += loss.item()
        e = err / (i + 1.)
        writer.add_scalars('M1/Loss', {"train": e}, n_iter)
        try:
            with autograd.detect_anomaly():
                loss.backward()
        except RuntimeError as e:
            raise e
        if n_iter % UPLOAD_IMAGE_EVERY == 0:
            write_true_pred_pairs("M1", n_iter, pdb1, pdb2,
                                  ddm.data.cpu().numpy(), ddm_hat.data.cpu().numpy())
            write_dist_mats_pairs("M1", n_iter, pdb1, pdb2,
                                  m1.data.cpu().numpy(), m2.data.cpu().numpy())
        optimizer.step_and_update_lr(loss.item())
        lr = optimizer.lr
        pbar.set_description("Training Loss:%.6f, LR: %.6f (L=%d)" % (e, lr, s1.size(1)))
        pbar.update(len(idx))
        n_iter += 1
    pbar.close()
    return n_iter
def loss_function(self, forward_ret, labels=None):
    if self.is_training():
        forward_batch, backward_batch = self.get_disc_batches(forward_ret)
        if self.debug:
            debug_context = autograd.detect_anomaly()
        else:
            debug_context = nullcontext()
        with debug_context:
            d_ps = self.disc(*forward_batch)
            d_qs = self.disc(*backward_batch)
            if self.train_disc():
                if self.flags.loss == 'wasserstein':
                    if self.flags.gp:
                        grad_penalty = self.gradient_penalty(backward_batch, forward_batch,
                                                             context=debug_context)
                    else:
                        grad_penalty = 0
                    loss = -d_ps.mean() + d_qs.mean() + (10.0 * grad_penalty) + \
                        self.flags.wasserstein_nodrift * ((d_ps + d_qs)**2).mean()
                else:
                    loss = (F.binary_cross_entropy_with_logits(d_ps, torch.ones_like(d_ps)) +
                            F.binary_cross_entropy_with_logits(d_qs, torch.zeros_like(d_qs)))
                self.d_loss = loss.item()
            else:
                if self.flags.loss == 'wasserstein':
                    loss = d_ps.mean() - d_qs.mean()
                else:
                    loss = (F.binary_cross_entropy_with_logits(d_ps, torch.zeros_like(d_ps)) +
                            F.binary_cross_entropy_with_logits(d_qs, torch.ones_like(d_qs)))
                self.g_loss = loss.item()
        g_loss = self.g_loss
        d_loss = self.d_loss
    else:
        loss, g_loss, d_loss = 0.0, 0.0, 0.0
    return loss, g_loss, d_loss
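# All of the snippets above follow the same basic pattern: wrap the forward and/or
# backward pass in torch.autograd's anomaly-detection context so that a NaN/Inf
# produced anywhere in the backward graph raises a RuntimeError pointing at the
# offending op. A minimal, self-contained sketch of that pattern; the names `model`,
# `criterion`, `optimizer`, and `loader` are placeholders, and the `debug` flag mirrors
# the gating several snippets use, since anomaly detection adds noticeable overhead.
import contextlib

import torch
from torch import autograd, nn, optim

model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
loader = [(torch.randn(4, 10), torch.randn(4, 1)) for _ in range(3)]
debug = True  # enable anomaly detection only while debugging

for inputs, targets in loader:
    debug_context = autograd.detect_anomaly() if debug else contextlib.nullcontext()
    with debug_context:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()  # raises RuntimeError here if a NaN/Inf is detected in the graph
    optimizer.step()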