def test_backward(self):
    dy = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
    x = np.array([[1, 2, 3], [4, 5, 6]])
    w = np.array([[1, 0, 0, 0], [-1, 0, 1, 0], [0, 2, -3, 1]])
    dx, dw, db = utils.backward(dy, x, w)
    self.assertTrue(np.allclose(dx, [[1, 2, -1], [5, 2, -1]]))
    self.assertTrue(
        np.allclose(
            dw, [[21, 26, 31, 36], [27, 34, 41, 48], [33, 42, 51, 60]]))
    self.assertTrue(np.allclose(db, [[6, 8, 10, 12]]))
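# utils.backward itself is not shown in this excerpt. A minimal sketch of what
# it plausibly computes for a fully connected layer y = x @ w + b, inferred
# from the expected values in the test above (dx = dy @ w.T, dw = x.T @ dy,
# db = column sums of dy). The body is an assumption, not the project's
# actual implementation.
import numpy as np


def affine_backward_sketch(dy, x, w):
    dx = dy @ w.T                        # gradient w.r.t. the layer input
    dw = x.T @ dy                        # gradient w.r.t. the weights
    db = dy.sum(axis=0, keepdims=True)   # gradient w.r.t. the bias
    return dx, dw, db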
def backward(self, dy):
    # Gradient accumulators for the whole sequence (these were referenced
    # below but never initialized in the original snippet).
    dV = np.zeros_like(self.V)
    dW = np.zeros_like(self.W)
    dU = np.zeros_like(self.U)
    dby, dbh = 0.0, 0.0
    for t in reversed(range(self.l)):
        # Gradients for one time point only.
        dWt = np.zeros_like(self.W)
        dUt = np.zeros_like(self.U)
        dyt = dy[t]
        dst, dVt, dbyt = utils.backward(dyt, self.s[t], self.V)
        dht = dst * (1 - self.h[t]**2)
        dbht = dht
        # Backpropagate through time from step t towards step 0.
        for i in reversed(range(t)):
            dxi, dUi, _ = utils.backward(dht, self.x[i], self.U)
            if i > 0:
                dsi, dWi, _ = utils.backward(dht, self.s[i - 1], self.W)
                dht = np.clip((dht @ self.W.T) * (1 - self.h[i - 1]**2), -self.clip, self.clip)
            else:
                dsi, dWi, _ = utils.backward(dht, self.s0, self.W)
            # Accumulate the per-step contributions (assumed intent; dUi and
            # dWi were computed but otherwise unused in the original).
            dUt += dUi
            dWt += dWi
            if np.mean(dht) < 0.001:
                dht *= 10  # crude counter-measure against vanishing gradients
                # print('vanishing gradient')
        dV += dVt
        dW += dWt
        dU += dUt
        dby += dbyt
        dbh += dbht
    dV = np.clip(dV, -self.clip, self.clip)
    dW = np.clip(dW, -self.clip, self.clip)
    dU = np.clip(dU, -self.clip, self.clip)
    dby = np.clip(dby, -self.clip, self.clip)
    dbh = np.clip(dbh, -self.clip, self.clip)
    return (dV, dW, dU, dby, dbh)
def backward(self, dy):
    N = dy.shape[0]
    dx = np.zeros([N, self.dim_in])
    dw = np.zeros([self.dim_k, self.dout])
    db = np.zeros([1, self.dout])
    for i in range(N):
        dyi = dy[i, :].reshape(self.dout, -1).T
        dfxi, dwi, dbi = utils.backward(dyi, self.fx[i], self.w)
        dx[i] = utils.unflatten(dfxi, self.shape_in, self.shape_k,
                                self.pad, self.stride, self.indice).ravel()
        dw += dwi
        db += dbi
    self.dw_cache = dw
    self.db_cache = db
    return dx
if not args.use_data:
    _, batch = next(pretrain_dataloader_iter)
else:
    batch = data_batches[batch_idx]
    data_batches[batch_idx] = None

inst_pass += list(batch.values())[0].size(0)
summary = {}

for percent, inputs in utils.partition_inputs(batch, accumu_steps, True):
    outputs = model(**inputs)
    for key in outputs:
        outputs[key] = outputs[key].mean() * percent
    utils.backward(outputs["loss"], amp_scaler)
    utils.add_output_to_summary(outputs, summary)

utils.optimizer_step(optimizer, lr_scheduler, amp_scaler)

del batch

t1 = time.time()

summary["idx"] = epoch * pretraining_config["batches_per_epoch"] + batch_idx
summary["batch_idx"] = batch_idx
summary["epoch"] = epoch
summary["time"] = round(t1 - t0, 4)
summary["inst_pass"] = inst_pass
summary["learning_rate"] = round(optimizer.param_groups[0]["lr"], 8)
summary["time_since_start"] = round(time.time() - init_t, 4)
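# The helpers utils.backward and utils.optimizer_step used above are not shown
# in this excerpt. A minimal sketch of what they plausibly do, assuming mixed
# precision via torch.cuda.amp.GradScaler (an assumption, not the project's
# actual implementation):
import torch


def backward(loss, amp_scaler=None):
    # Scale the loss before backward to avoid fp16 gradient underflow.
    if amp_scaler is not None:
        amp_scaler.scale(loss).backward()
    else:
        loss.backward()


def optimizer_step(optimizer, lr_scheduler, amp_scaler=None):
    # Step the optimizer (unscaling gradients if needed), advance the
    # learning-rate schedule, and clear the accumulated gradients.
    if amp_scaler is not None:
        amp_scaler.step(optimizer)
        amp_scaler.update()
    else:
        optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()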
def main():
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    cur_timestamp = str(datetime.now())[:-3]  # include ms to reduce the probability of a name collision
    model_width = {'linear': '', 'cnn': args.n_filters_cnn, 'lenet': '', 'resnet18': ''}[args.model]
    model_str = '{}{}'.format(args.model, model_width)
    model_name = '{} dataset={} model={} eps={} attack={} m={} attack_init={} fgsm_alpha={} epochs={} pgd={}-{} grad_align_cos_lambda={} lr_max={} seed={}'.format(
        cur_timestamp, args.dataset, model_str, args.eps, args.attack, args.minibatch_replay, args.attack_init,
        args.fgsm_alpha, args.epochs, args.pgd_alpha_train, args.pgd_train_n_iters, args.grad_align_cos_lambda,
        args.lr_max, args.seed)
    if not os.path.exists('models'):
        os.makedirs('models')
    logger = utils.configure_logger(model_name, args.debug)
    logger.info(args)
    half_prec = args.half_prec
    n_cls = 2 if 'binary' in args.dataset else 10

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    double_bp = True if args.grad_align_cos_lambda > 0 else False
    n_eval_every_k_iter = args.n_eval_every_k_iter
    args.pgd_alpha = args.eps / 4
    eps, pgd_alpha, pgd_alpha_train = args.eps / 255, args.pgd_alpha / 255, args.pgd_alpha_train / 255

    train_data_augm = False if args.dataset in ['mnist'] else True
    train_batches = data.get_loaders(args.dataset, -1, args.batch_size, train_set=True, shuffle=True, data_augm=train_data_augm)
    train_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size, train_set=True, shuffle=False, data_augm=False)
    test_batches = data.get_loaders(args.dataset, args.n_final_eval, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False)
    test_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False)

    model = models.get_model(args.model, n_cls, half_prec, data.shapes_dict[args.dataset], args.n_filters_cnn).cuda()
    model.apply(utils.initialize_weights)
    model.train()

    if args.model == 'resnet18':
        opt = torch.optim.SGD(model.parameters(), lr=args.lr_max, momentum=0.9, weight_decay=args.weight_decay)
    elif args.model == 'cnn':
        opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay)
    elif args.model == 'lenet':
        opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay)
    else:
        raise ValueError('decide about the right optimizer for the new model')

    if half_prec:
        if double_bp:
            amp.register_float_function(torch, 'batch_norm')
        model, opt = amp.initialize(model, opt, opt_level="O1")

    if args.attack == 'fgsm':  # needed here only for Free-AT
        delta = torch.zeros(args.batch_size, *data.shapes_dict[args.dataset][1:]).cuda()
        delta.requires_grad = True

    lr_schedule = utils.get_lr_schedule(args.lr_schedule, args.epochs, args.lr_max)
    loss_function = nn.CrossEntropyLoss()

    train_acc_pgd_best, best_state_dict = 0.0, copy.deepcopy(model.state_dict())
    start_time = time.time()
    time_train, iteration, best_iteration = 0, 0, 0
    for epoch in range(args.epochs + 1):
        train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0
        for i, (X, y) in enumerate(train_batches):
            if i % args.minibatch_replay != 0 and i > 0:  # take new inputs only every `minibatch_replay` iterations
                X, y = X_prev, y_prev
            time_start_iter = time.time()
            # epoch=0 runs only for one iteration (to check the training stats at init)
            if epoch == 0 and i > 0:
                break
            X, y = X.cuda(), y.cuda()
            lr = lr_schedule(epoch - 1 + (i + 1) / len(train_batches))  # epoch - 1 since the 0th epoch is skipped
            opt.param_groups[0].update(lr=lr)

            if args.attack in ['pgd', 'pgd_corner']:
                pgd_rs = True if args.attack_init == 'random' else False
                n_eps_warmup_epochs = 5
                n_iterations_max_eps = n_eps_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size
                eps_pgd_train = min(iteration / n_iterations_max_eps * eps, eps) if args.dataset == 'svhn' else eps
                delta = utils.attack_pgd_training(
                    model, X, y, eps_pgd_train, pgd_alpha_train, opt, half_prec, args.pgd_train_n_iters, rs=pgd_rs)
                if args.attack == 'pgd_corner':
                    delta = eps * utils.sign(delta)  # project to the corners
                    delta = clamp(X + delta, 0, 1) - X

            elif args.attack == 'fgsm':
                if args.minibatch_replay == 1:
                    if args.attack_init == 'zero':
                        delta = torch.zeros_like(X, requires_grad=True)
                    elif args.attack_init == 'random':
                        delta = utils.get_uniform_delta(X.shape, eps, requires_grad=True)
                    else:
                        raise ValueError('wrong args.attack_init')
                else:  # if Free-AT, we just reuse the existing delta from the previous iteration
                    delta.requires_grad = True

                X_adv = clamp(X + delta, 0, 1)
                output = model(X_adv)
                loss = F.cross_entropy(output, y)
                if half_prec:
                    with amp.scale_loss(loss, opt) as scaled_loss:
                        grad = torch.autograd.grad(scaled_loss, delta, create_graph=True if double_bp else False)[0]
                        grad /= scaled_loss / loss  # reverse back the scaling
                else:
                    grad = torch.autograd.grad(loss, delta, create_graph=True if double_bp else False)[0]
                grad = grad.detach()

                argmax_delta = eps * utils.sign(grad)

                n_alpha_warmup_epochs = 5
                n_iterations_max_alpha = n_alpha_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size
                fgsm_alpha = min(iteration / n_iterations_max_alpha * args.fgsm_alpha, args.fgsm_alpha) if args.dataset == 'svhn' else args.fgsm_alpha
                delta.data = clamp(delta.data + fgsm_alpha * argmax_delta, -eps, eps)
                delta.data = clamp(X + delta.data, 0, 1) - X

            elif args.attack == 'random_corner':
                delta = utils.get_uniform_delta(X.shape, eps, requires_grad=False)
                delta = eps * utils.sign(delta)

            elif args.attack == 'none':
                delta = torch.zeros_like(X, requires_grad=False)
            else:
                raise ValueError('wrong args.attack')

            # extra FP+BP to calculate the gradient to monitor it
            if args.attack in ['none', 'random_corner', 'pgd', 'pgd_corner']:
                grad = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='none',
                                      backprop=args.grad_align_cos_lambda != 0.0)

            delta = delta.detach()

            output = model(X + delta)
            loss = loss_function(output, y)

            reg = torch.zeros(1).cuda()[0]  # for .item() to run correctly
            if args.grad_align_cos_lambda != 0.0:
                grad2 = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='random_uniform', backprop=True)
                grads_nnz_idx = ((grad**2).sum([1, 2, 3])**0.5 != 0) * ((grad2**2).sum([1, 2, 3])**0.5 != 0)
                grad1, grad2 = grad[grads_nnz_idx], grad2[grads_nnz_idx]
                grad1_norms, grad2_norms = l2_norm_batch(grad1), l2_norm_batch(grad2)
                grad1_normalized = grad1 / grad1_norms[:, None, None, None]
                grad2_normalized = grad2 / grad2_norms[:, None, None, None]
                cos = torch.sum(grad1_normalized * grad2_normalized, (1, 2, 3))
                reg += args.grad_align_cos_lambda * (1.0 - cos.mean())

            loss += reg

            if epoch != 0:
                opt.zero_grad()
                utils.backward(loss, opt, half_prec)
                opt.step()

            time_train += time.time() - time_start_iter
            train_loss += loss.item() * y.size(0)
            train_reg += reg.item() * y.size(0)
            train_acc += (output.max(1)[1] == y).sum().item()
            train_n += y.size(0)

            with torch.no_grad():  # no grad for the stats
                grad_norm_x += l2_norm_batch(grad).sum().item()
                delta_final = clamp(X + delta, 0, 1) - X  # we should measure delta after the projection onto [0, 1]^d
                avg_delta_l2 += ((delta_final ** 2).sum([1, 2, 3]) ** 0.5).sum().item()

            if iteration % args.eval_iter_freq == 0:
                train_loss, train_reg = train_loss / train_n, train_reg / train_n
                train_acc, avg_delta_l2 = train_acc / train_n, avg_delta_l2 / train_n

                # it'd be incorrect to recalculate the BN stats on the test sets and for clean / adversarial points
                utils.model_eval(model, half_prec)

                test_acc_clean, _, _ = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                test_acc_fgsm, test_loss_fgsm, fgsm_deltas = rob_acc(test_batches_fast, model, eps, eps, opt, half_prec, 1, 1, rs=False)
                test_acc_pgd, test_loss_pgd, pgd_deltas = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1)
                cos_fgsm_pgd = utils.avg_cos_np(fgsm_deltas, pgd_deltas)
                train_acc_pgd, _, _ = rob_acc(train_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1)  # needed for early stopping

                grad_x = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=False)
                grad_eta = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=True)
                cos_x_eta = utils.avg_cos_np(grad_x, grad_eta)

                time_elapsed = time.time() - start_time
                train_str = '[train] loss {:.3f}, reg {:.3f}, acc {:.2%} acc_pgd {:.2%}'.format(train_loss, train_reg, train_acc, train_acc_pgd)
                test_str = '[test] acc_clean {:.2%}, acc_fgsm {:.2%}, acc_pgd {:.2%}, cos_x_eta {:.3}, cos_fgsm_pgd {:.3}'.format(
                    test_acc_clean, test_acc_fgsm, test_acc_pgd, cos_x_eta, cos_fgsm_pgd)
                logger.info('{}-{}: {} {} ({:.2f}m, {:.2f}m)'.format(epoch, iteration, train_str, test_str, time_train / 60, time_elapsed / 60))

                if train_acc_pgd > train_acc_pgd_best:  # catastrophic overfitting can be detected on the training set
                    best_state_dict = copy.deepcopy(model.state_dict())
                    train_acc_pgd_best, best_iteration = train_acc_pgd, iteration

                utils.model_train(model, half_prec)
                train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0

            iteration += 1
            X_prev, y_prev = X.clone(), y.clone()  # needed for Free-AT

        if epoch == args.epochs:
            torch.save({'last': model.state_dict(), 'best': best_state_dict}, 'models/{} epoch={}.pth'.format(model_name, epoch))

            # disable global conversion to fp16 from amp.initialize() (https://github.com/NVIDIA/apex/issues/567)
            context_manager = amp.disable_casts() if half_prec else utils.nullcontext()
            with context_manager:
                last_state_dict = copy.deepcopy(model.state_dict())
                half_prec = False  # final eval is always in fp32
                model.load_state_dict(last_state_dict)
                utils.model_eval(model, half_prec)
                opt = torch.optim.SGD(model.parameters(), lr=0)

                attack_iters, n_restarts = (50, 10) if not args.debug else (10, 3)
                test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts)
                logger.info('[last: test on 10k points] acc_clean {:.2%}, pgd_rr {:.2%}'.format(test_acc_clean, test_acc_pgd_rr))

                if args.eval_early_stopped_model:
                    model.load_state_dict(best_state_dict)
                    utils.model_eval(model, half_prec)
                    test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                    test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts)
                    logger.info('[best: test on 10k points][iter={}] acc_clean {:.2%}, pgd_rr {:.2%}'.format(
                        best_iteration, test_acc_clean, test_acc_pgd_rr))

        utils.model_train(model, half_prec)

    logger.info('Done in {:.2f}m'.format((time.time() - start_time) / 60))
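# utils.backward(loss, opt, half_prec) in the training step above is not
# defined in this excerpt. Given the explicit amp.scale_loss usage earlier in
# main(), it plausibly dispatches to apex loss scaling when half precision is
# enabled; this sketch is an assumption, not the repository's actual helper.
def backward(loss, opt, half_prec):
    if half_prec:
        with amp.scale_loss(loss, opt) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()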
def train_epoch(self, img_data_iter: List[data_utils.DataLoader], step: int, saving_path: str = None,
                img_dev_data_iter: List[data_utils.DataLoader] = None, max_step: int = 300000,
                lex_dict=None, **kwargs):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
    batch_zip, shortest = self.get_batch_zip(img_data_iter, None, None)

    model = self.model.module if hasattr(self.model, "module") else self.model

    for i, batches in enumerate(batch_zip):
        for batch in batches:
            try:
                self.optimizer.zero_grad()
                captions = [b["captions"] for b in batch]
                caption_pad_mask = [b["caption_mask"] for b in batch]
                langs = [b["langs"] for b in batch]
                with torch.no_grad():
                    image_encoding = self.caption_model(batch=batch, encode_only=True)
                    image_encoding = image_encoding.view(image_encoding.size(0), -1)

                predictions = self.model(src_inputs=captions, src_mask=caption_pad_mask, src_langs=langs)
                l2_loss = torch.dist(predictions, image_encoding, 2) / predictions.size(0)
                backward(l2_loss, self.optimizer, self.fp16)

                loss = float(l2_loss.data)
                tokens += int(predictions.size(0))
                total_tokens += int(predictions.size(0))
                total_loss += loss
                cur_loss += loss

                # We accumulate the gradients for both tasks!
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
                self.optimizer.step()
                step += 1

                if step % 50 == 0 and tokens > 0:
                    elapsed = time.time() - start
                    print(datetime.datetime.now(),
                          "Epoch Step: %d Loss: %f Image per Sec: %f " % (step, cur_loss / tokens, tokens / elapsed))

                    if step % 500 == 0:
                        if img_dev_data_iter is not None and step % 5000 == 0:
                            loss = self.eval(img_dev_data_iter)
                            print("Dev Loss:", loss)

                        model.save(saving_path + ".latest")
                        with open(os.path.join(saving_path + ".latest", "optim"), "wb") as fp:
                            pickle.dump(self.optimizer, fp)

                    start, tokens, cur_loss = time.time(), 0, 0

                if step >= max_step:
                    break
                if i == shortest - 1:
                    break
            except RuntimeError as err:
                print(repr(err))
                torch.cuda.empty_cache()

    try:
        print("Total loss in this epoch: %f" % (total_loss / total_tokens))
        model.save(saving_path + ".latest")
        loss = self.eval(img_dev_data_iter)
        print("Dev Loss:", loss)
    except RuntimeError as err:
        print(repr(err))

    return step
def backward(self, dy):
    N = dy.shape[0]
    dx, dw, db = utils.backward(dy, self.x, self.w)
    self.dw = dw
    self.db = db
    return dx
import pygame
import pickle
import numpy as np

from game import init, iterate
from ann import NeuralNetwork
import utils

# Architecture (specify the network architecture here).
network = NeuralNetwork(layers=[7, 14, 14, 7, 1],
                        activations=['sigmoid', 'sigmoid', 'sigmoid', 'tanh'])
lr = 0.1
losses = []

screen, font = init()

# Game loop / train loop.
frame_count, score, _, _, x = iterate.iterate(screen, font, 0, 0)
game = True
run = True
prediction = 0
while run:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            run = False
    prediction = utils.forward(x, network)
    frame_count, score, game, run, x = iterate.iterate(screen, font, frame_count, score,
                                                       game, run, prediction)
    loss = utils.backward(prediction, x, lr, network)
    losses.append(loss)

pygame.quit()
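# pickle is imported above but unused in this excerpt; presumably the script
# persists the trained network and/or the loss history after the game loop.
# A minimal sketch of that step (the file names are assumptions):
with open('network.pkl', 'wb') as f:
    pickle.dump(network, f)
with open('losses.pkl', 'wb') as f:
    pickle.dump(losses, f)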
def train_epoch(self, step: int, saving_path: str = None,
                mt_dev_iter: List[data_utils.DataLoader] = None,
                mt_train_iter: List[data_utils.DataLoader] = None, max_step: int = 300000,
                src_neg_iter: data_utils.DataLoader = None,
                dst_neg_iter: data_utils.DataLoader = None, **kwargs):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
    batch_zip, shortest = self.get_batch_zip(None, None, mt_train_iter)

    model = self.model.module if hasattr(self.model, "module") else self.model

    for i, batches in enumerate(batch_zip):
        for batch in batches:
            self.optimizer.zero_grad()
            try:
                src_inputs = batch["src_texts"].squeeze(0)
                src_mask = batch["src_pad_mask"].squeeze(0)
                tgt_inputs = batch["dst_texts"].squeeze(0)
                tgt_mask = batch["dst_pad_mask"].squeeze(0)
                src_langs = batch["src_langs"].squeeze(0)
                dst_langs = batch["dst_langs"].squeeze(0)

                src_neg_batch = next(iter(src_neg_iter))
                src_neg_inputs = src_neg_batch["src_texts"].squeeze(0)
                src_neg_mask = src_neg_batch["src_pad_mask"].squeeze(0)
                src_neg_langs = src_neg_batch["langs"].squeeze(0)
                dst_neg_batch = next(iter(dst_neg_iter))
                tgt_neg_inputs = dst_neg_batch["src_texts"].squeeze(0)
                tgt_neg_mask = dst_neg_batch["src_pad_mask"].squeeze(0)
                dst_neg_langs = dst_neg_batch["langs"].squeeze(0)

                if src_inputs.size(0) < self.num_gpu:
                    continue

                loss = self.model(src_inputs=src_inputs, tgt_inputs=tgt_inputs,
                                  src_mask=src_mask, tgt_mask=tgt_mask,
                                  src_langs=src_langs, tgt_langs=dst_langs,
                                  src_neg_inputs=src_neg_inputs, tgt_neg_inputs=tgt_neg_inputs,
                                  src_neg_mask=src_neg_mask, tgt_neg_mask=tgt_neg_mask,
                                  src_neg_langs=src_neg_langs, tgt_neg_langs=dst_neg_langs,
                                  normalize=True)
                nSens = src_inputs.size(0)
                backward(loss, self.optimizer, self.fp16)

                loss = float(loss.data) * nSens
                tokens += nSens
                total_tokens += nSens
                total_loss += loss
                cur_loss += loss

                # We accumulate the gradients for both tasks!
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
                self.optimizer.step()
                step += 1

                if step % 50 == 0 and tokens > 0:
                    elapsed = time.time() - start
                    print(datetime.datetime.now(),
                          "Epoch Step: %d Loss: %f Tokens per Sec: %f " % (step, cur_loss / tokens, tokens / elapsed))

                    if step % 500 == 0:
                        if mt_dev_iter is not None and step % 5000 == 0:
                            dev_loss = self.eval(mt_dev_iter, saving_path)
                            print("Dev Loss:", dev_loss)

                        model.save(saving_path + ".latest")
                        with open(os.path.join(saving_path + ".latest", "optim"), "wb") as fp:
                            pickle.dump(self.optimizer, fp)

                    start, tokens, cur_loss = time.time(), 0, 0
            except RuntimeError as err:
                print(repr(err))
                torch.cuda.empty_cache()

        if i == shortest - 1:
            break
        if step >= max_step:
            break

    try:
        print("Total loss in this epoch: %f" % (total_loss / total_tokens))
        model.save(saving_path + ".latest")
        if mt_dev_iter is not None:
            dev_loss = self.eval(mt_dev_iter, saving_path)
            print("Dev Loss:", dev_loss)
    except RuntimeError as err:
        print(repr(err))

    return step
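# Note on the negative sampling above: next(iter(src_neg_iter)) rebuilds a
# fresh DataLoader iterator on every step, which is slow and, without
# shuffling, always returns the same first batch. A common alternative (a
# sketch, not part of the original code) keeps one persistent batch stream:
def infinite_batches(loader):
    # Yield batches forever, restarting the DataLoader whenever it is exhausted.
    while True:
        for batch in loader:
            yield batch

# usage:
#   src_neg_stream = infinite_batches(src_neg_iter)
#   src_neg_batch = next(src_neg_stream)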
if isData():
    c = sys.stdin.read(1)
    print(c)

if c == 's':
    stop_state = True
elif c == 'h':
    happy_state = not happy_state
    c = ''
else:  # c == '' or anything else.
    if c == 'w':
        forward(pwm, pwm)
        stop_state = False
    elif c == 'x':
        backward(pwm, pwm)
        stop_state = False
    elif c == 'a':
        spin_left(pwm, pwm)
        stop_state = False
    elif c == 'd':
        spin_right(pwm, pwm)
        stop_state = False
    elif c == 'q':
        pwm = pwm + pwm_increment if pwm <= max_speed - pwm_increment else pwm
        print("pwm: ", pwm)
    elif c == 'e':
        pwm = pwm - pwm_increment if pwm >= min_speed + pwm_increment else pwm
        print("pwm: ", pwm)
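# isData() is not defined in this excerpt. A common implementation (an
# assumption, not necessarily this project's code) polls stdin without
# blocking so the control loop keeps running between key presses:
import select
import sys
import termios
import tty


def isData():
    # True if a key press is waiting on stdin (zero-timeout poll).
    return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])


# Typical setup: put the terminal in cbreak mode so single characters are
# delivered immediately, then restore the old settings when done.
old_settings = termios.tcgetattr(sys.stdin)
tty.setcbreak(sys.stdin.fileno())
# ... control loop ...
termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)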
def train_epoch(self, img_data_iter: List[data_utils.DataLoader], step: int, saving_path: str = None,
                img_dev_data_iter: List[data_utils.DataLoader] = None, max_step: int = 300000,
                lex_dict=None, accum=1, mt_train_iter: List[data_utils.DataLoader] = None,
                mt_dev_iter: List[data_utils.DataLoader] = None, mtl_weight=0.1, **kwargs):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
    batch_zip, shortest = self.get_batch_zip(img_data_iter, None, mt_train_iter)

    model = self.model.module if hasattr(self.model, "module") else self.model

    for i, batches in enumerate(batch_zip):
        for batch in batches:
            try:
                is_img_batch = isinstance(batch, list) and "captions" in batch[0]
                if is_img_batch:  # Captioning training data.
                    captions = [b["captions"] for b in batch]
                    caption_pad_mask = [b["caption_mask"] for b in batch]
                    proposals = [b["proposal"] for b in batch] if lex_dict is not None else None
                    langs = [b["langs"] for b in batch]
                    if len(batch) < self.num_gpu:
                        continue

                    predictions = self.model(tgt_inputs=captions, tgt_mask=caption_pad_mask,
                                             pad_idx=model.text_processor.pad_token_id(),
                                             tgt_langs=langs, batch=batch, proposals=proposals,
                                             log_softmax=True)
                    targets = torch.cat(list(map(lambda c: c[:, 1:].contiguous().view(-1), captions)))
                    tgt_mask_flat = torch.cat(list(map(lambda c: c[:, 1:].contiguous().view(-1), caption_pad_mask)))
                    targets = targets[tgt_mask_flat]
                else:  # MT data!
                    src_inputs = batch["src_texts"].squeeze(0)
                    src_mask = batch["src_pad_mask"].squeeze(0)
                    tgt_inputs = batch["dst_texts"].squeeze(0)
                    tgt_mask = batch["dst_pad_mask"].squeeze(0)
                    src_langs = batch["src_langs"].squeeze(0)
                    dst_langs = batch["dst_langs"].squeeze(0)
                    proposals = batch["proposal"].squeeze(0) if lex_dict is not None else None
                    if src_inputs.size(0) < self.num_gpu:
                        continue

                    predictions = self.model(src_inputs=src_inputs, tgt_inputs=tgt_inputs,
                                             src_pads=src_mask, tgt_mask=tgt_mask,
                                             src_langs=src_langs, tgt_langs=dst_langs,
                                             proposals=proposals,
                                             pad_idx=model.text_processor.pad_token_id(),
                                             log_softmax=True)
                    targets = tgt_inputs[:, 1:].contiguous().view(-1)
                    tgt_mask_flat = tgt_mask[:, 1:].contiguous().view(-1)
                    targets = targets[tgt_mask_flat]

                ntokens = targets.size(0)
                if ntokens > 0:
                    if self.num_gpu == 1:
                        targets = targets.to(predictions.device)

                    loss = self.criterion(predictions, targets).mean()
                    weight = 1 if is_img_batch else mtl_weight
                    backward(loss * weight, self.optimizer, self.fp16)

                    loss = float(loss.data) * ntokens
                    tokens += ntokens
                    total_tokens += ntokens
                    total_loss += loss
                    cur_loss += loss

                # We accumulate the gradients for both tasks!
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
                step += 1
                if step % accum == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                if step % 50 == 0 and tokens > 0:
                    elapsed = time.time() - start
                    print(datetime.datetime.now(),
                          "Epoch Step: %d Loss: %f Tokens per Sec: %f " % (step, cur_loss / tokens, tokens / elapsed))

                    if step % 500 == 0:
                        if img_dev_data_iter is not None and step % 5000 == 0:
                            bleu = self.eval_bleu(img_dev_data_iter, saving_path)
                            print("Captioning BLEU:", bleu)
                        if mt_dev_iter is not None and step % 5000 == 0:
                            bleu = super().eval_bleu(mt_dev_iter, saving_path)
                            print("MT BLEU:", bleu)

                        model.save(saving_path + ".latest")
                        with open(os.path.join(saving_path + ".latest", "optim"), "wb") as fp:
                            pickle.dump(self.optimizer, fp)

                    start, tokens, cur_loss = time.time(), 0, 0

                if step >= max_step:
                    break
                if i == shortest - 1:
                    break
            except RuntimeError as err:
                print(repr(err))
                torch.cuda.empty_cache()

    try:
        if img_dev_data_iter is not None:
            bleu = self.eval_bleu(img_dev_data_iter, saving_path)
            print("Captioning BLEU:", bleu)
        if mt_dev_iter is not None:
            bleu = super().eval_bleu(mt_dev_iter, saving_path)
            print("MT BLEU:", bleu)
        print("Total loss in this epoch: %f" % (total_loss / total_tokens))
        model.save(saving_path + ".latest")
    except RuntimeError as err:
        print(repr(err))

    return step