Example #1
def epoch(epoch_idx, is_train=True):
    # Switch between training and evaluation mode (affects dropout, batch norm, etc.).
    if is_train:
        model.train()
    else:
        model.eval()
    loader = train_loader if is_train else val_loader
    if is_train and args.distributed:
        loader.sampler.set_epoch(epoch_idx)
    if recorder:
        recorder.epoch_start(epoch_idx, is_train, loader)
    for batch_idx, batch in enumerate(loader):
        batch_size = batch['response'].size()[0]
        batch = {key: val.to(device) for key, val in batch.items()}
        optimizer.zero_grad()
        output, pointer_prob = model(batch)
        # Pointer target: 1.0 where the response token is grounded in a real triple
        # (i.e. the triple is not the NAF placeholder), 0.0 elsewhere.
        pointer_prob_target = (batch['response_triple'] != NAF_IDX).all(-1).to(torch.float)
        # Overwrite padded response positions (token id 0) with PAD_IDX.
        pointer_prob_target.data.masked_fill_(batch['response'] == 0, PAD_IDX)
        # Drop the first target token so targets line up with next-token predictions.
        loss, nll_loss = criterion(output, batch['response'][:, 1:], pointer_prob, pointer_prob_target[:, 1:])
        pp = perplexity(nll_loss)
        if is_train:
            loss.backward()
            optimizer.step()
        if recorder:
            recorder.batch_end(batch_idx, batch_size, loss.item(), pp.item())
    if recorder:
        recorder.log_text(output, batch)
        recorder.epoch_end()
        return recorder.epoch_loss
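
For context, a caller would typically run this helper once for training and once for validation in each epoch. The driver loop below is a hypothetical sketch, assuming the same globals the function relies on (model, optimizer, the two loaders, recorder) plus an args.epochs setting that is not part of the original example.

import torch

# Hypothetical driver loop; args.epochs and the surrounding globals are assumed.
for epoch_idx in range(args.epochs):
    train_loss = epoch(epoch_idx, is_train=True)    # one pass over train_loader
    with torch.no_grad():                           # no gradient bookkeeping during validation
        val_loss = epoch(epoch_idx, is_train=False) # one pass over val_loader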
Example #2
 def test_relative_gradient(self):
   crit = criterion(iterations_max = 1000, gtol = 0.1)
   state = {'iteration' : 1001, 'function' : Function(1.), 'new_parameters' : numpy.zeros((1,))}
   assert(crit(state))
   state = {'iteration' : 5, 'function' : Function(1.), 'new_parameters' : numpy.zeros((1,))}
   assert(not crit(state))
   state = {'iteration' : 5, 'function' : Function(0.09), 'new_parameters' : numpy.zeros((1,))}
   assert(crit(state))
Example #3
 def test_relative_parameters(self):
   crit = criterion(iterations_max = 1000, xtol = 0.1)
   state = {'iteration' : 1001, 'old_parameters' : numpy.ones((1,)), 'new_parameters' : numpy.zeros((1,))}
   assert(crit(state))
   state = {'iteration' : 5, 'old_parameters' : numpy.ones((1,)), 'new_parameters' : numpy.zeros((1,))}
   assert(not crit(state))
   state = {'iteration' : 5, 'old_parameters' : numpy.ones((1,)), 'new_parameters' : numpy.ones((1,)) * 0.9}
   assert(crit(state))
Example #4
 def test_relative_value(self):
   crit = criterion(iterations_max = 1000, ftol = 0.1)
   state = {'iteration' : 1001, 'old_value' : 1., 'new_value' : 0.}
   assert(crit(state))
   state = {'iteration' : 5, 'old_value' : 1., 'new_value' : 0.}
   assert(not crit(state))
   state = {'iteration' : 5, 'old_value' : 1., 'new_value' : 0.9}
   assert(crit(state))
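
The three tests above exercise the same stopping-rule factory with a gradient, a parameter, and a value tolerance; in each case the rule fires once the iteration cap is exceeded or the monitored quantity changes by less than the tolerance. The class below is a minimal stand-alone sketch of the relative-value case only, written so that the last test would pass; it is an illustration, not the library's actual criterion class, and the name RelativeValueCriterion is hypothetical.

# Hypothetical sketch of a relative-value stop criterion; not the library's own implementation.
class RelativeValueCriterion:
    def __init__(self, iterations_max=1000, ftol=0.1):
        self.iterations_max = iterations_max
        self.ftol = ftol

    def __call__(self, state):
        # Stop unconditionally once the iteration budget is exhausted.
        if state['iteration'] > self.iterations_max:
            return True
        old, new = state['old_value'], state['new_value']
        # Stop when the relative change of the objective value is within ftol.
        return abs(old - new) <= self.ftol * max(abs(old), 1e-12)

crit = RelativeValueCriterion(iterations_max=1000, ftol=0.1)
assert crit({'iteration': 5, 'old_value': 1., 'new_value': 0.9})
assert not crit({'iteration': 5, 'old_value': 1., 'new_value': 0.})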
Example #5
import math  # math.log10 is used below to record learning rates on a log scale

def find_lr(init_value=1e-8, final_value=10., beta=0.98):
    num = len(dataset) - 1
    mult = (final_value / init_value)**(1 / num)
    lr = init_value
    optimizer.param_groups[0]['lr'] = lr
    avg_loss = 0.
    best_loss = 0.
    batch_num = 0
    losses = []
    log_lrs = []
    for data in dataset:
        batch_num += 1
        #As before, get the loss for this mini-batch of inputs/outputs
        inputs, labels = data
        inputs = inputs.to(device)
        optimizer.zero_grad()
        a_indices, anchors, positives, negatives, _ = model(inputs)
        anchors = anchors.view(anchors.size(0), args.att_heads, -1)
        positives = positives.view(positives.size(0), args.att_heads, -1)
        negatives = negatives.view(negatives.size(0), args.att_heads, -1)

        l_div, l_homo, l_heter = criterion.criterion(anchors, positives,
                                                     negatives)
        loss = l_div + l_homo + l_heter
        #Compute the smoothed loss
        avg_loss = beta * avg_loss + (1 - beta) * loss.item()
        smoothed_loss = avg_loss / (1 - beta**batch_num)
        #Stop if the loss is exploding
        if batch_num > 1 and smoothed_loss > 4 * best_loss:
            return log_lrs, losses
        #Record the best loss
        if smoothed_loss < best_loss or batch_num == 1:
            best_loss = smoothed_loss
        #Store the values
        losses.append(smoothed_loss)
        log_lrs.append(math.log10(lr))
        #Do the SGD step
        loss.backward()
        optimizer.step()
        #Update the lr for the next step
        lr *= mult
        optimizer.param_groups[0]['lr'] = lr
    return log_lrs, losses
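
The returned log learning rates and smoothed losses are usually plotted to pick a learning rate just before the loss curve turns upward. The snippet below is a usage sketch only; it assumes matplotlib is available and that the globals find_lr depends on (dataset, model, optimizer, criterion, device, args) are already set up.

import matplotlib.pyplot as plt  # assumption: matplotlib is installed

log_lrs, losses = find_lr()       # relies on the globals described above
plt.plot(log_lrs, losses)         # smoothed loss vs. log10(learning rate)
plt.xlabel('log10(learning rate)')
plt.ylabel('smoothed loss')
plt.show()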
Example #6
                a_indices, anchors, positives, negatives, _ = out

                anchors = anchors.view(anchors.size(0), args.att_heads, -1)
                positives = positives.view(positives.size(0), args.att_heads,
                                           -1)
                negatives = negatives.view(negatives.size(0), args.att_heads,
                                           -1)

                if args.cycle:
                    lr, mom = one_cycle.calc()
                    update_lr(optimizer, lr)
                    update_mom(optimizer, mom)
                optimizer.zero_grad()

                l_div, l_homo, l_heter = criterion.criterion(
                    anchors, positives, negatives)
                l = l_div + l_homo + l_heter
                l.backward()
                optimizer.step()

                loss_homo += l_homo.item()
                loss_heter += l_heter.item()
                loss_div += l_div.item()
                if i % 100 == 0:
                    print('LR:', get_lr(optimizer))
                    print('\tBatch %d\tloss div: %.4f (%.3f)\tloss homo: %.4f (%.3f)\tloss heter: %.4f (%.3f)'%\
                        (i, l_div.item(), loss_div/(i+1), l_homo.item(), loss_homo/(i+1), l_heter.item(), loss_heter/(i+1)))
                if i % 1000 == 0:
                    writer.add_figure('grad_flow',
                                      util.plot_grad_flow_v2(
                                          model.named_parameters()),