def epoch(epoch_idx, is_train=True):
    """Run one training or validation pass; relies on module-level
    model, optimizer, criterion, recorder, device and args."""
    if is_train:
        model.train()
    else:
        model.eval()
    loader = train_loader if is_train else val_loader
    if is_train and args.distributed:
        # Reshuffle the DistributedSampler for this epoch.
        loader.sampler.set_epoch(epoch_idx)
    if recorder:
        recorder.epoch_start(epoch_idx, is_train, loader)
    for batch_idx, batch in enumerate(loader):
        batch_size = batch['response'].size(0)
        batch = {key: val.to(device) for key, val in batch.items()}
        optimizer.zero_grad()
        output, pointer_prob = model(batch)
        # Pointer target is 1 where the response token has an associated triple,
        # 0 otherwise; padded positions are overwritten with PAD_IDX.
        pointer_prob_target = (batch['response_triple'] != NAF_IDX).all(-1).to(torch.float)
        pointer_prob_target.data.masked_fill_(batch['response'] == 0, PAD_IDX)
        loss, nll_loss = criterion(output, batch['response'][:, 1:],
                                   pointer_prob, pointer_prob_target[:, 1:])
        pp = perplexity(nll_loss)
        if is_train:
            loss.backward()
            optimizer.step()
        if recorder:
            recorder.batch_end(batch_idx, batch_size, loss.item(), pp.item())
    if recorder:
        recorder.log_text(output, batch)
        recorder.epoch_end()
    return recorder.epoch_loss
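# Illustrative driver for epoch(): a minimal sketch, assuming an `args.epochs`
# option; `args.epochs` and the print formatting are assumptions, not taken
# from the original script.
for epoch_idx in range(args.epochs):
    epoch(epoch_idx, is_train=True)               # training pass
    val_loss = epoch(epoch_idx, is_train=False)   # validation pass
    print('epoch %d finished, validation loss %.4f' % (epoch_idx, val_loss))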
def test_relative_gradient(self):
    # Stop after iterations_max iterations, or once the cost function's
    # gradient magnitude falls below gtol.
    crit = criterion(iterations_max=1000, gtol=0.1)
    state = {'iteration': 1001, 'function': Function(1.), 'new_parameters': numpy.zeros((1,))}
    assert crit(state)
    state = {'iteration': 5, 'function': Function(1.), 'new_parameters': numpy.zeros((1,))}
    assert not crit(state)
    state = {'iteration': 5, 'function': Function(0.09), 'new_parameters': numpy.zeros((1,))}
    assert crit(state)
def test_relative_parameters(self):
    # Stop after iterations_max iterations, or once the parameter update is
    # small relative to xtol.
    crit = criterion(iterations_max=1000, xtol=0.1)
    state = {'iteration': 1001, 'old_parameters': numpy.ones((1,)), 'new_parameters': numpy.zeros((1,))}
    assert crit(state)
    state = {'iteration': 5, 'old_parameters': numpy.ones((1,)), 'new_parameters': numpy.zeros((1,))}
    assert not crit(state)
    state = {'iteration': 5, 'old_parameters': numpy.ones((1,)), 'new_parameters': numpy.ones((1,)) * 0.9}
    assert crit(state)
def test_relative_value(self):
    # Stop after iterations_max iterations, or once the objective value change
    # is small relative to ftol.
    crit = criterion(iterations_max=1000, ftol=0.1)
    state = {'iteration': 1001, 'old_value': 1., 'new_value': 0.}
    assert crit(state)
    state = {'iteration': 5, 'old_value': 1., 'new_value': 0.}
    assert not crit(state)
    state = {'iteration': 5, 'old_value': 1., 'new_value': 0.9}
    assert crit(state)
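# The three tests above exercise the same composite stopping rule: an
# iteration cap plus an optional tolerance on the gradient (gtol), the
# parameter step (xtol), or the objective value (ftol). The sketch below is
# inferred from the test cases only and is not the library's implementation;
# the Function stub is a hypothetical test helper.
import numpy

class Function:
    """Stub cost function whose gradient has a fixed magnitude."""
    def __init__(self, grad_norm):
        self.grad_norm = grad_norm

    def gradient(self, parameters):
        return numpy.full_like(parameters, self.grad_norm, dtype=float)

def criterion(iterations_max, gtol=None, xtol=None, ftol=None):
    """Return a callable that decides, from the optimizer state, whether to stop."""
    def should_stop(state):
        if state['iteration'] > iterations_max:
            return True
        if gtol is not None:
            grad = state['function'].gradient(state['new_parameters'])
            if numpy.linalg.norm(grad) < gtol:
                return True
        if xtol is not None:
            step = numpy.linalg.norm(state['new_parameters'] - state['old_parameters'])
            if step < xtol * numpy.linalg.norm(state['old_parameters']):
                return True
        if ftol is not None:
            if abs(state['new_value'] - state['old_value']) < ftol * abs(state['old_value']):
                return True
        return False
    return should_stop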
def find_lr(init_value=1e-8, final_value=10., beta=0.98):
    """Learning-rate range test: sweep the learning rate exponentially from
    init_value to final_value over one pass of `dataset`, recording the
    exponentially smoothed loss at each step. Relies on module-level
    model, optimizer, criterion, dataset, device and args."""
    num = len(dataset) - 1
    mult = (final_value / init_value) ** (1 / num)
    lr = init_value
    optimizer.param_groups[0]['lr'] = lr
    avg_loss = 0.
    best_loss = 0.
    batch_num = 0
    losses = []
    log_lrs = []
    for data in dataset:
        batch_num += 1
        # As before, get the loss for this mini-batch of inputs/outputs
        inputs, labels = data
        inputs = inputs.to(device)
        optimizer.zero_grad()
        a_indices, anchors, positives, negatives, _ = model(inputs)
        anchors = anchors.view(anchors.size(0), args.att_heads, -1)
        positives = positives.view(positives.size(0), args.att_heads, -1)
        negatives = negatives.view(negatives.size(0), args.att_heads, -1)
        l_div, l_homo, l_heter = criterion.criterion(anchors, positives, negatives)
        loss = l_div + l_homo + l_heter
        # Compute the smoothed loss (bias-corrected exponential moving average)
        avg_loss = beta * avg_loss + (1 - beta) * loss.item()
        smoothed_loss = avg_loss / (1 - beta ** batch_num)
        # Stop if the loss is exploding
        if batch_num > 1 and smoothed_loss > 4 * best_loss:
            return log_lrs, losses
        # Record the best loss
        if smoothed_loss < best_loss or batch_num == 1:
            best_loss = smoothed_loss
        # Store the values
        losses.append(smoothed_loss)
        log_lrs.append(math.log10(lr))
        # Do the SGD step
        loss.backward()
        optimizer.step()
        # Update the lr for the next step
        lr *= mult
        optimizer.param_groups[0]['lr'] = lr
    return log_lrs, losses
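# Typical use of find_lr(): plot the smoothed loss against log10(lr) and pick
# a learning rate a bit below the point where the loss starts to rise. The
# plotting below is an illustrative sketch, not part of the original script;
# trimming the first and last few points is a common convention to hide the
# noisy start and the divergent tail.
import matplotlib.pyplot as plt

log_lrs, losses = find_lr()
plt.plot(log_lrs[10:-5], losses[10:-5])
plt.xlabel('log10(learning rate)')
plt.ylabel('smoothed loss')
plt.savefig('lr_finder.png')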
# Unpack the model output and reshape the embeddings into
# (batch, att_heads, features) for the multi-head attention loss.
a_indices, anchors, positives, negatives, _ = out
anchors = anchors.view(anchors.size(0), args.att_heads, -1)
positives = positives.view(positives.size(0), args.att_heads, -1)
negatives = negatives.view(negatives.size(0), args.att_heads, -1)
if args.cycle:
    # One-cycle schedule: update learning rate and momentum every step.
    lr, mom = one_cycle.calc()
    update_lr(optimizer, lr)
    update_mom(optimizer, mom)
optimizer.zero_grad()
l_div, l_homo, l_heter = criterion.criterion(anchors, positives, negatives)
l = l_div + l_homo + l_heter
l.backward()
optimizer.step()
loss_homo += l_homo.item()
loss_heter += l_heter.item()
loss_div += l_div.item()
if i % 100 == 0:
    print('LR:', get_lr(optimizer))
    print('\tBatch %d\tloss div: %.4f (%.3f)\tloss homo: %.4f (%.3f)\tloss heter: %.4f (%.3f)' %
          (i, l_div.item(), loss_div / (i + 1),
           l_homo.item(), loss_homo / (i + 1),
           l_heter.item(), loss_heter / (i + 1)))
if i % 1000 == 0:
    writer.add_figure('grad_flow',
                      util.plot_grad_flow_v2(model.named_parameters()),
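# The training loop above relies on small helpers (get_lr, update_lr,
# update_mom) that read and write the optimizer's parameter groups. A minimal
# sketch, assuming a momentum-based optimizer such as torch.optim.SGD:
def get_lr(optimizer):
    # Learning rate of the first parameter group (used for logging).
    return optimizer.param_groups[0]['lr']

def update_lr(optimizer, lr):
    # Apply the same learning rate to every parameter group.
    for group in optimizer.param_groups:
        group['lr'] = lr

def update_mom(optimizer, mom):
    # Apply the same momentum to every parameter group; Adam-style optimizers
    # store this in 'betas' instead, which this sketch does not handle.
    for group in optimizer.param_groups:
        group['momentum'] = mom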