def tldr_loss(model, batch, args):
    """Compute the cosine-weighted token loss for one flat sequence.

    The negative log-probability of every target token is scaled by
    ``(cos(pi * p) + 1) ** args.focal_gamma``, where ``p`` is the probability
    the model assigns to that token, and the scaled terms are summed.

    Args:
        model: Called as ``model(input_ids=...)``; element 0 of its output is
            used as the logits.
        batch: Sequence whose first element is a token-id tensor. It is moved
            to ``args.gpu``.
        args: Namespace providing ``gpu``, ``train_batch_size`` and
            ``focal_gamma``.

    Returns:
        A ``(loss, logging_output)`` tuple. ``loss`` is the weighted sum
        divided by the number of input tokens. ``logging_output`` is the dict
        returned by ``TrainingMetrics.ranking_metrics`` extended with the
        summed plain NLL (``'loss'``), the un-normalised weighted loss
        (``'tldr_loss'``) and the token counts.
    """
    longer_sample = batch[0].to(args.gpu)
    # Input and target are the same window shifted by one position.
    inp = longer_sample[:, :args.train_batch_size]
    target = longer_sample[:, 1:args.train_batch_size + 1]

    logits = model(input_ids=inp)[0]
    lprobs = F.log_softmax(logits, dim=-1)
    assert lprobs.size(0) == 1, 'We work on flat sequences'

    # Plain summed NLL is only logged; it is not the optimised quantity.
    nll_loss = F.nll_loss(lprobs[0], target[0], reduction='sum')

    # Log-probability of each target token, picked without leaving the device.
    lprobs_y = lprobs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    # The weight goes to 0 as p -> 1, so confident tokens contribute little.
    token_weight = (torch.cos(np.pi * lprobs_y.exp()) + 1) ** args.focal_gamma
    loss = (token_weight * (-lprobs_y)).sum()

    # Raw (un-normalised) logit of each target token, used by the metrics.
    true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')
    ntokens = inp.numel()

    logging_output = TrainingMetrics.ranking_metrics(
        logits[0].float(), true_token_logits, None, ntokens, target[0])
    logging_output['loss'] = nll_loss.item()
    logging_output['tldr_loss'] = loss.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens

    loss = loss / ntokens
    return loss, logging_output
def eval_singletoken_argmax(model, args, dataset_paths, config, train_iter=None, batch_size=None):
    """Evaluate greedy (argmax) next-token prediction on ``args.eval_split``.

    Args:
        model: Called as ``model(input_ids=...)``; element 0 of its output is
            used as the logits. It is switched to eval mode and left there.
        args: Namespace providing ``batch_size_singletoken``, ``eval_split``
            and ``gpu``.
        dataset_paths: Forwarded to ``get_datasets``.
        config: Object whose ``to_dict()`` is passed to
            ``save_singletoken_metrics``.
        train_iter: Forwarded to ``save_singletoken_metrics``.
        batch_size: Number of tokens fed to the model per sample. Defaults to
            ``args.batch_size_singletoken``.

    Returns:
        The aggregated metrics dict, extended with ``'ppl'``, ``'uniq'``
        (distinct predicted tokens) and ``'human_uniq'`` (distinct target
        tokens).
    """
    batch_size = batch_size if batch_size is not None else args.batch_size_singletoken
    datasets = get_datasets(dataset_paths, max_len=batch_size)
    eval_split = datasets[args.eval_split]
    eval_dataloader = DataLoader(
        eval_split, sampler=SequentialSampler(eval_split), batch_size=1)

    model.eval()

    logging_outputs = []
    predicted_tokens = []
    target_tokens = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating", total=len(eval_dataloader)):
            longer_sample = batch[0].to(args.gpu)
            # Slice with the resolved batch_size (the one the dataset was
            # built with) and bound the target to the same window, so logits
            # and targets always have matching lengths.
            inp = longer_sample[:, :batch_size]
            target = longer_sample[:, 1:batch_size + 1]

            logits = model(input_ids=inp)[0]
            lprobs = F.log_softmax(logits, dim=-1)
            assert lprobs.size(0) == 1, 'We work on flat sequences'

            loss = F.nll_loss(lprobs[0], target[0], reduction='sum')
            # Raw logit of each target token, used by the ranking metrics.
            true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')

            predicted_tokens.extend(lprobs.argmax(dim=-1).view(-1).tolist())

            ntokens = inp.numel()
            logging_output = TrainingMetrics.ranking_metrics(
                logits[0].float(), true_token_logits, None, ntokens, target[0])
            logging_output['loss'] = loss.item()
            logging_output['normalizer'] = ntokens
            logging_output['sample_size'] = ntokens
            logging_output['ntokens'] = ntokens
            logging_outputs.append(logging_output)

            # for human uniq
            target_tokens.extend(target.view(-1).tolist())

    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
        logging_outputs)
    # The aggregated loss is expressed in base 2, hence the power of two.
    logging_average['ppl'] = 2 ** logging_average['loss']
    logging_average['uniq'] = len(set(predicted_tokens))
    logging_average['human_uniq'] = len(set(target_tokens))

    save_singletoken_metrics(
        logging_average, config.to_dict(), args, train_iter=train_iter)
    return logging_average
def mle_loss(model, batch, args):
    """Compute the maximum-likelihood (next-token) loss for a batch.

    Args:
        model: Called as ``model(input_ids)``; element 0 of its output is
            used as the logits.
        batch: Object with a ``pre_tru`` token-id tensor of shape
            ``(bsz, length)``. It is replaced in place by the result of
            ``truncate_batch(args, batch.pre_tru)``.
        args: Forwarded to ``truncate_batch``.

    Returns:
        A ``(loss, logging_output)`` tuple. ``loss`` is the mean NLL per
        target token. ``logging_output`` is the dict returned by
        ``TrainingMetrics.ranking_metrics`` extended with the summed NLL
        (``'loss'``) and the token counts.
    """
    batch.pre_tru = truncate_batch(args, batch.pre_tru)
    tokens = batch.pre_tru

    # Shift by one: position t of the input predicts token t + 1. Feeding the
    # full sequence while dropping the first target made logits and targets
    # differ in length.
    inp = tokens[:, :-1]
    target = tokens[:, 1:].clone().detach()
    bsz, seqlen = target.shape

    logits = model(inp)[0]  # bsz, seqlen, vocabsize
    vocabsize = logits.size(-1)
    lprobs = F.log_softmax(logits, dim=-1)

    flat_target = target.reshape(-1)
    # Summed NLL is what gets logged, because the aggregation step divides
    # the logged loss by sample_size itself.
    loss_sum = F.nll_loss(
        lprobs.reshape(-1, vocabsize), flat_target, reduction='sum')

    # Raw logit of each target token; flattened for nll_loss, then restored
    # to (bsz, seqlen).
    true_token_logits = -F.nll_loss(
        logits.reshape(-1, vocabsize), flat_target, reduction='none')
    assert len(true_token_logits) == seqlen * bsz
    true_token_logits = true_token_logits.view(bsz, seqlen)

    ntokens = inp.numel()

    # NOTE(review): the ranking metrics are computed on the first sequence
    # only, like logits[0] / target[0]; confirm whether ranking_metrics is
    # meant to see the whole batch.
    logging_output = TrainingMetrics.ranking_metrics(
        logits[0], true_token_logits[0], None, ntokens, target[0])
    logging_output['loss'] = loss_sum.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens

    # Mean over target tokens (ntokens equals the number of targets here).
    loss = loss_sum / ntokens
    return loss, logging_output
def alpha_entmax_loss(model, batch, args):
    """Compute the alpha-entmax loss for one flat sequence.

    The per-token loss is ``sum((probs - one_hot(target)) * logits)`` plus
    ``alpha_entropy(probs, args.alpha)``, where ``probs`` comes from
    ``entmax_bisect``.

    Args:
        model: Called as ``model(input_ids=...)``; element 0 of its output is
            used as the logits.
        batch: Sequence whose first element is a token-id tensor. It is moved
            to ``args.gpu``.
        args: Namespace providing ``gpu``, ``train_batch_size``, ``alpha``
            and ``laplas_eps``.

    Returns:
        A ``(loss, logging_output)`` tuple. ``loss`` is the summed token loss
        divided by the number of input tokens. ``logging_output`` is the dict
        returned by ``TrainingMetrics.ranking_metrics`` extended with the
        summed loss, a Laplace-smoothed NLL, the mean Jensen-Shannon
        divergence and the token counts.
    """
    longer_sample = batch[0].to(args.gpu)
    # Input and target are the same window shifted by one position.
    inp = longer_sample[:, :args.train_batch_size]
    target = longer_sample[:, 1:args.train_batch_size + 1]

    logits = model(input_ids=inp)[0]

    alpha = torch.tensor([args.alpha], requires_grad=True,
                         device=torch.device(args.gpu))
    probs = entmax_bisect(logits, alpha)

    one_hot_target = F.one_hot(target, num_classes=probs.size(-1))
    loss = ((probs - one_hot_target) * logits).sum(-1)
    loss += alpha_entropy(probs, args.alpha)
    loss = loss.sum()

    # Raw logit of each target token, used by the ranking metrics.
    true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')
    ntokens = inp.numel()

    # Probability assigned to each target token, picked without leaving the
    # device.
    next_token_probs = probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    voc_sizes = probs.size(-1)
    # Additive smoothing keeps the log finite when a target token gets
    # exactly zero probability.
    smoothed_nll = -torch.mean(
        torch.log((next_token_probs + args.laplas_eps)
                  / (1 + args.laplas_eps * voc_sizes)))

    logging_output = TrainingMetrics.ranking_metrics(
        logits[0].float(), true_token_logits, None, ntokens, target[0])
    logging_output['loss'] = loss.item()
    logging_output['smoothed_nll_loss'] = smoothed_nll.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens
    logging_output['js_div'] = jensen_shannon_divergence(probs, target).mean().item()

    loss = loss / ntokens
    return loss, logging_output
def mle_loss(model, batch, args):
    """Compute the maximum-likelihood (next-token) loss for one flat sequence.

    Args:
        model: Called as ``model(input_ids)``; element 0 of its output is
            used as the logits.
        batch: Sequence whose first element is a token-id tensor. It is moved
            to the current CUDA device.
        args: Namespace providing ``train_batch_size``.

    Returns:
        A ``(loss, logging_output)`` tuple. ``loss`` is the summed NLL
        divided by the number of input tokens. ``logging_output`` is the dict
        returned by ``TrainingMetrics.ranking_metrics`` extended with the
        summed NLL (``'loss'``) and the token counts.
    """
    longer_sample = batch[0].cuda()
    # Input and target are the same window shifted by one position. Bounding
    # the target keeps it the same length as the logits when the sample holds
    # more than train_batch_size + 1 tokens.
    inp = longer_sample[:, :args.train_batch_size]
    target = longer_sample[:, 1:args.train_batch_size + 1]

    logits = model(inp)[0]
    lprobs = F.log_softmax(logits, dim=-1)
    assert lprobs.size(0) == 1, 'We work on flat sequences'

    loss = F.nll_loss(lprobs[0], target[0], reduction='sum')
    # Raw logit of each target token, used by the ranking metrics.
    true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')
    ntokens = inp.numel()

    logging_output = TrainingMetrics.ranking_metrics(
        logits[0], true_token_logits, None, ntokens, target[0])
    logging_output['loss'] = loss.item()
    logging_output['normalizer'] = ntokens
    logging_output['sample_size'] = ntokens
    logging_output['ntokens'] = ntokens

    loss = loss / ntokens
    return loss, logging_output
def aggregate_logging_outputs(logging_outputs):
    """Combine per-worker logging dicts into a single summary dict.

    ``'loss'`` is the summed loss divided by the summed sample size and by
    ``ln 2`` (0.0 when the sample size is zero). Every entry returned by
    ``TrainingMetrics.aggregate_and_normalize`` is merged on top, and
    ``'nll_loss'`` is added when sample size and token count differ.
    """

    def _total(key):
        # Entries lacking the key count as zero.
        return sum(entry.get(key, 0) for entry in logging_outputs)

    loss_sum = _total('loss')
    ntokens = _total('ntokens')
    nsentences = _total('nsentences')
    sample_size = _total('sample_size')

    if sample_size > 0:
        avg_loss = loss_sum / sample_size / math.log(2)
    else:
        avg_loss = 0.

    agg_output = {
        'loss': avg_loss,
        'ntokens': ntokens,
        'nsentences': nsentences,
        'sample_size': sample_size,
    }

    from fairseq.custom.metrics import TrainingMetrics
    agg_output.update(TrainingMetrics.aggregate_and_normalize(logging_outputs))

    if sample_size != ntokens:
        if ntokens > 0:
            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
        else:
            agg_output['nll_loss'] = 0.
    return agg_output
def forward(self, model, sample, reduce=True, compute_custom_metrics=True):
    """Run the model on ``sample`` and compute its loss.

    Returns a 3-tuple:
    1) the loss from ``self.compute_loss``
    2) the sample size, used as the denominator for the gradient
    3) a dict of logging values to display while training; the ranking
       metrics are merged in when ``compute_custom_metrics`` is true
    """
    net_output = model(**sample['net_input'])

    # Flatten logits and targets to one row / entry per token position.
    raw_logits = net_output[0]
    logits = raw_logits.view(-1, raw_logits.size(-1))
    target = model.get_targets(sample, net_output).view(-1)

    loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce)

    if self.args.sentence_avg:
        sample_size = sample['target'].size(0)
    else:
        sample_size = sample['ntokens']

    # Raw logit of each target token.
    # NOTE(review): the original author wondered whether this reduction
    # should be 'mean' for the batched case.
    true_token_logits = -F.nll_loss(
        logits,
        target,
        ignore_index=self.padding_idx,
        reduction='none',
    )

    # Number of target tokens left after padding is stripped.
    ntokens = utils.strip_pad(target, self.padding_idx).numel()

    logging_output = {
        'loss': utils.item(loss.data) if reduce else loss.data,
        'ntokens': sample['ntokens'],
        'nsentences': sample['target'].size(0),
        'sample_size': sample_size,
    }
    if compute_custom_metrics:
        logging_output.update(
            TrainingMetrics.ranking_metrics(
                logits, true_token_logits, sample, ntokens, target))
    return loss, sample_size, logging_output
def eval_single_token_prediction(model, itr, dictionary, singletoken_topp=0.0, singletoken_topk=1):
    """Evaluate greedy next-token prediction over every sample in ``itr``.

    Returns ``(predicted_tokens, target_tokens, custom_metrics)``. The two
    lists hold one list of token ids per sample. ``custom_metrics`` is the
    output of ``TrainingMetrics.aggregate_and_normalize`` extended with
    ``'ppl'`` and ``'avg_wrong_mass'``.
    """
    predicted_tokens, target_tokens, logging_outputs = [], [], []
    mle_loss_sum = 0
    num_samples_sum = 0
    wrong_mass_sum = 0

    for _, sample in tqdm(enumerate(itr)):
        sample = utils.move_to_cuda(sample)
        net_output = model(**sample['net_input'])

        # Work on the first sequence's logits. The padding symbol is masked
        # in place so it can never be the argmax; this happens before the
        # normalised probabilities are requested from net_output below.
        logits = net_output[0][0]
        logits[:, dictionary.pad()] = -1e19
        predicted_tokens.append(logits.argmax(1).tolist())

        target = sample['target'].view(-1)
        target_tokens.append(target.tolist())

        # -- mle loss
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        per_token_nll = F.nll_loss(
            lprobs,
            target,
            ignore_index=dictionary.pad_index,
            reduction='none',
        )
        # Raw logit of each target token.
        true_token_logits = -F.nll_loss(
            logits,
            target,
            ignore_index=dictionary.pad_index,
            reduction='none',
        )

        ntokens = utils.strip_pad(target, dictionary.pad_index).numel()
        mle_loss_sum += per_token_nll.sum().item()
        num_samples_sum += ntokens

        logging_output = TrainingMetrics.ranking_metrics(
            logits, true_token_logits, sample, ntokens, target,
            topk=singletoken_topk, topp=singletoken_topp)

        # Probability mass placed on tokens scored above the true token.
        outranks_truth = (logits > true_token_logits[:, None]).float()
        wrong_mass_sum += (outranks_truth * (F.softmax(logits, dim=1))).sum()

        logging_outputs.append(logging_output)

    ppl = math.pow(2, mle_loss_sum / num_samples_sum / math.log(2))
    custom_metrics = TrainingMetrics.aggregate_and_normalize(logging_outputs)
    custom_metrics['ppl'] = ppl
    custom_metrics['avg_wrong_mass'] = (wrong_mass_sum / num_samples_sum).item()
    return predicted_tokens, target_tokens, custom_metrics
def eval_singletoken(model, args, dataset_paths, config, top_k=1, top_p=0.0, t=1.0, train_iter=None, batch_size=None):
    """Evaluate sampled next-token prediction on ``args.eval_split``.

    For each sample a next-token distribution is built, either by
    top-k/top-p filtered softmax (when ``args.alpha_entmax`` is ``False``) or
    by ``entmax_bisect``. One token per position is sampled from it and
    scored with the ranking metrics.

    Args:
        model: Called as ``model(input_ids=...)``; element 0 of its output is
            used as the logits. It is switched to eval mode and left there.
        args: Namespace providing ``alpha_entmax``, ``batch_size_singletoken``,
            ``eval_split``, ``gpu``, ``top_k``, ``top_p``, ``alpha``,
            ``laplas_eps`` and ``token_loss``.
        dataset_paths: Forwarded to ``get_datasets``.
        config: Object whose ``to_dict()`` is passed to
            ``save_singletoken_sampling_metrics``.
        top_k: Only forwarded to ``save_singletoken_sampling_metrics``;
            filtering itself uses ``args.top_k``.
        top_p: Only forwarded to ``save_singletoken_sampling_metrics``;
            filtering itself uses ``args.top_p``.
        t: Unused in this function.
        train_iter: Forwarded to ``save_singletoken_sampling_metrics``.
        batch_size: Number of tokens fed to the model per sample. Defaults to
            ``args.batch_size_singletoken``.

    Returns:
        The aggregated metrics dict, extended with ``'e_ppl'``, ``'ppl'``,
        ``'human_uniq'``, ``'uniq'``, ``'wrep'``, ``'rep'``, ``'js_div'`` and,
        when ``args.token_loss == 'alpha_entmax'``, ``'alpha_entmax_loss'``.
    """
    alpha_entmax = args.alpha_entmax
    batch_size = batch_size if batch_size is not None else args.batch_size_singletoken
    datasets = get_datasets(dataset_paths, max_len=batch_size)
    eval_split = datasets[args.eval_split]
    eval_dataloader = DataLoader(
        eval_split, sampler=SequentialSampler(eval_split), batch_size=1)

    model.eval()

    logging_outputs = []
    predicted_tokens = []
    target_tokens = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating", total=len(eval_dataloader)):
            longer_sample = batch[0].to(args.gpu)
            # Slice with the resolved batch_size (the one the dataset was
            # built with) and bound the target to the same window, so logits
            # and targets always have matching lengths.
            inp = longer_sample[:, :batch_size]
            target = longer_sample[:, 1:batch_size + 1]

            logits = model(input_ids=inp)[0]
            log_softmax_probs = F.log_softmax(logits, dim=-1)

            nll = F.nll_loss(log_softmax_probs[0], target[0], reduction='sum')
            # Raw logit of each target token, used by the ranking metrics.
            true_token_logits = -F.nll_loss(logits[0], target[0], reduction='none')

            if alpha_entmax is False:
                filtered_logits = top_k_top_p_filtering(
                    logits.squeeze(0), top_k=args.top_k, top_p=args.top_p).unsqueeze(0)
                # NOTE(review): this draw is never used below, but it advances
                # the global RNG; it is kept so that seeded runs keep
                # producing the same samples as before.
                _ = F.softmax(
                    filtered_logits.view(filtered_logits.shape[1:]),
                    dim=-1).multinomial(num_samples=1).unsqueeze(0).squeeze(-1)
                probs = F.softmax(filtered_logits, dim=-1)
            else:
                probs = entmax_bisect(logits, torch.tensor(
                    [args.alpha], requires_grad=True,
                    device=torch.device(args.gpu)).float())

            # Probability assigned to each target token.
            arange = np.arange(logits.size(1))
            next_token_probs = probs[:, arange, target.squeeze().tolist()]
            voc_sizes = probs.size(-1)
            # Additive smoothing keeps the log finite when a target token
            # gets exactly zero probability.
            smoothed_nll = -torch.mean(torch.log(
                (next_token_probs + args.laplas_eps)
                / (1 + args.laplas_eps * voc_sizes)))

            # Sample one token per position from the distribution.
            pred = probs.view(-1, probs.size(-1)).multinomial(
                num_samples=1).view(probs.shape[:-1])
            predicted_tokens.extend(pred.view(-1).tolist())

            ntokens = inp.numel()

            # One-hot "logits" of the sampled tokens, so the ranking metrics
            # score the sample instead of the raw model scores.
            rep_logits = torch.zeros_like(logits)
            rep_logits[:, arange, pred.squeeze().tolist()] = 1

            logging_output = TrainingMetrics.ranking_metrics(
                rep_logits[0].float(), true_token_logits, None, ntokens, target[0])
            logging_output['loss'] = nll.item()
            logging_output['smoothed_nll_loss'] = smoothed_nll.item()
            logging_output['normalizer'] = ntokens
            logging_output['sample_size'] = ntokens
            logging_output['ntokens'] = ntokens
            logging_output['js_div'] = jensen_shannon_divergence(
                probs, target).mean().item()

            if args.token_loss == 'alpha_entmax':
                loss = ((probs - F.one_hot(target, num_classes=probs.size(-1)))
                        * logits).sum(-1)
                loss += alpha_entropy(probs, args.alpha)
                logging_output['alpha_entmax_loss'] = loss.mean().item()

            logging_outputs.append(logging_output)

            # for human uniq
            target_tokens.extend(target.view(-1).tolist())

    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
        logging_outputs)
    logging_average['e_ppl'] = np.exp(
        np.mean([x['smoothed_nll_loss'] for x in logging_outputs]))
    # aggregate_logging_outputs does division by log(2) of loss
    logging_average['ppl'] = 2 ** logging_average['loss']
    logging_average['human_uniq'] = len(set(target_tokens))
    logging_average['uniq'] = len(set(predicted_tokens))
    logging_average['wrep'] = np.mean(
        [v for k, v in logging_average.items() if k.startswith('wrong_repeat')])
    logging_average['rep'] = np.mean(
        [v for k, v in logging_average.items() if k.startswith('repeat')])
    logging_average['js_div'] = np.mean([x['js_div'] for x in logging_outputs])
    if args.token_loss == 'alpha_entmax':
        logging_average['alpha_entmax_loss'] = np.mean(
            [x['alpha_entmax_loss'] for x in logging_outputs])

    save_singletoken_sampling_metrics(
        logging_average, config.to_dict(), args,
        top_k=top_k, top_p=top_p, train_iter=train_iter)
    return logging_average
def forward(self, model, sample, reduce=True, compute_custom_metrics=True):
    """Compute the MLE loss plus a penalty on previously seen tokens.

    The penalty term is ``-log(1 - p(x))`` summed over "negative targets":
    for every position, the tokens that occurred at earlier positions of the
    flattened target, excluding the token that is the target at that
    position. The total is ``mle_loss + self.rank_alpha * custom_loss``.

    Returns a 3-tuple:
    1) the combined loss
    2) the sample size (number of sentences when ``self.args.sentence_avg``
       is set, otherwise the number of non-padding target tokens)
    3) a dict of logging values; ``'loss'`` holds the MLE part only, and the
       ranking metrics are merged in when ``compute_custom_metrics`` is true

    Raises:
        NotImplementedError: if ``self.candidate_type`` is not
            ``'prev_context'``.
    """
    net_output = model(**sample['net_input'])
    target = model.get_targets(sample, net_output)
    nsentences = target.size(0)
    target = target.view(-1)

    # -- mle loss
    lprobs = model.get_normalized_probs(net_output, log_probs=True)
    lprobs = lprobs.view(-1, lprobs.size(-1))
    true_token_lprobs = F.nll_loss(
        lprobs,
        target,
        ignore_index=self.padding_idx,
        reduction='none',
    )
    mle_loss = true_token_lprobs.sum()

    # -- custom loss
    # Maximize (1 - p(x_nt)) for negative target tokens x_nt
    # (equivalently minimize -log(1 - p(x_nt))).
    if self.candidate_type != 'prev_context':
        raise NotImplementedError('candidate type %s' % self.candidate_type)

    # - form negative targets
    with torch.no_grad():
        # E.g. DABCC | D | EFFGD => {A,B,C} are negative targets.
        # Row i of the square matrix holds the tokens at positions < i; the
        # diagonal and everything after it is filled with the padding index.
        # The fill is written explicitly: building it as a product of two
        # padded matrices yields padding_idx ** 2 there, which only equals
        # padding_idx when it is 0 or 1.
        num_positions = target.size(0)
        ctx_cands = target.unsqueeze(0).expand(num_positions, num_positions)
        not_yet_seen = torch.full_like(ctx_cands, self.padding_idx).triu()
        ctx_cands = ctx_cands.tril(-1) + not_yet_seen

        # Don't include the target for that timestep as a negative target.
        ctx_cands = ctx_cands.masked_fill(
            ctx_cands == target.unsqueeze(1), self.padding_idx)
        negative_targets = torch.zeros_like(lprobs).scatter_(1, ctx_cands, 1)

    # - compute loss
    # The clamp keeps the log finite when a probability reaches 1.
    one_minus_probs = torch.clamp((1.0 - lprobs.exp()), min=1e-5)
    custom_loss = (-torch.log(one_minus_probs) * negative_targets).sum()

    loss = mle_loss + self.rank_alpha * custom_loss

    # -- metrics
    logits = net_output[0].view(-1, net_output[0].size(-1))
    # Raw logit of each target token.
    true_token_logits = -F.nll_loss(
        logits,
        target,
        ignore_index=self.padding_idx,
        reduction='none',
    )

    ntokens = utils.strip_pad(target, self.padding_idx).numel()
    sample_size = sample['target'].size(0) if self.args.sentence_avg else ntokens

    logging_output = {
        'custom_loss': utils.item(custom_loss.data),
        'loss': utils.item(mle_loss.data),
        'ntokens': ntokens,
        'nsentences': nsentences,
        'sample_size': sample_size,
    }
    if compute_custom_metrics:
        custom_output = TrainingMetrics.ranking_metrics(
            logits, true_token_logits, sample, ntokens, target)
        for k, v in custom_output.items():
            logging_output[k] = v

    return loss, sample_size, logging_output