def Reward_bleu_fin(self, trg, hyp, show=False):
    """
    To use as the self.Reward function.
    Returns an array of per-step rewards in which the sentence-level score
    (e.g. BLEU) is assigned to the final step only, as proposed in the paper.

    :param trg: target sequence.
    :param hyp: predicted sequence.
    :param show: if True, display the computation of the rewards
    :return: array of rewards (zero everywhere except the last step)
    """
    rew = np.zeros(len(hyp[0]))

    decoded_valid_tar = self.model.trg_vocab.arrays_to_sentences(
        arrays=trg, cut_at_eos=True)
    decoded_valid_hyp = self.model.trg_vocab.arrays_to_sentences(
        arrays=hyp, cut_at_eos=True)

    # evaluate with metric on each src, target and hypothesis
    join_char = " " if self.level in ["word", "bpe"] else ""
    valid_references = [join_char.join(t) for t in decoded_valid_tar]
    valid_hypotheses = [join_char.join(t) for t in decoded_valid_hyp]

    # post-process
    if self.level == "bpe":
        valid_references = [bpe_postprocess(v) for v in valid_references]
        valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

    # if references are given, evaluate against them
    if valid_references:
        assert len(valid_hypotheses) == len(valid_references)

        current_valid_score = 0
        if self.eval_metric.lower() == 'bleu':
            # this version does not use any tokenization
            current_valid_score = bleu(valid_hypotheses, valid_references)
        elif self.eval_metric.lower() == 'chrf':
            current_valid_score = chrf(valid_hypotheses, valid_references)
        elif self.eval_metric.lower() == 'token_accuracy':
            current_valid_score = token_accuracy(valid_hypotheses,
                                                 valid_references,
                                                 level=self.level)
        elif self.eval_metric.lower() == 'sequence_accuracy':
            current_valid_score = sequence_accuracy(valid_hypotheses,
                                                    valid_references)
    else:
        current_valid_score = -1

    # sentence-level score only at the last decoding step
    rew[-1] = current_valid_score
    final_rew = rew[1:]

    if show:
        print("\n Sample-------------Target vs Eval_net prediction:"
              "--Raw---and---Decoded-----")
        print("Target: ", trg, decoded_valid_tar)
        print("Eval  : ", hyp, decoded_valid_hyp)
        print("Reward: ", final_rew, "\n")

    return final_rew
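# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): how the terminal
# reward produced by Reward_bleu_fin is laid out. The sentence-level score
# is placed only at the last decoding step; all earlier steps get 0.
# `toy_sentence_score` is a hypothetical stand-in for bleu()/chrf().
import numpy as np

def toy_sentence_score(hyp_tokens, ref_tokens):
    # hypothetical stand-in metric: fraction of matching positions, in percent
    matches = sum(h == r for h, r in zip(hyp_tokens, ref_tokens))
    return 100.0 * matches / max(len(ref_tokens), 1)

def terminal_reward(ref_tokens, hyp_tokens):
    rew = np.zeros(len(hyp_tokens))
    rew[-1] = toy_sentence_score(hyp_tokens, ref_tokens)
    return rew[1:]   # same slicing as Reward_bleu_fin: drop the first step

# e.g. terminal_reward(["a", "b", "c", "</s>"], ["a", "x", "c", "</s>"])
# -> array([ 0.,  0., 75.])
# --------------------------------------------------------------------------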
def validate_on_data(model: Model, data: Dataset, batch_size: int,
                     use_cuda: bool, max_output_length: int,
                     level: str, eval_metric: Optional[str],
                     n_gpu: int, batch_class: Batch = Batch,
                     compute_loss: bool = False,
                     beam_size: int = 1, beam_alpha: int = -1,
                     batch_type: str = "sentence",
                     postprocess: bool = True,
                     bpe_type: str = "subword-nmt",
                     sacrebleu: dict = None,
                     n_best: int = 1) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array]):
    """
    Generate translations for the given data.
    If `compute_loss` is True and references are given, also compute the loss.

    :param model: model module
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param batch_class: class type of batch
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, e.g. "bleu"
    :param n_gpu: number of GPUs
    :param compute_loss: whether to compute a scalar loss for given inputs
        and targets
    :param beam_size: beam size for validation.
        If <2 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1 (default).
    :param batch_type: validation batch type (sentence or token)
    :param postprocess: if True, remove BPE segmentation from translations
    :param bpe_type: bpe type, one of {"subword-nmt", "sentencepiece"}
    :param sacrebleu: sacrebleu options
    :param n_best: number of candidates to return

    :return:
        - current_valid_score: current validation score [eval_metric],
        - valid_loss: validation loss,
        - valid_ppl: validation perplexity,
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses
    """
    assert batch_size >= n_gpu, "batch_size must be bigger than n_gpu."
    if sacrebleu is None:  # assign default value
        sacrebleu = {"remove_whitespace": True, "tokenize": "13a"}
    if batch_size > 1000 and batch_type == "sentence":
        logger.warning(
            "WARNING: Are you sure you meant to work on huge batches like "
            "this? 'batch_size' is > 1000 for sentence-batching. "
            "Consider decreasing it or switching to "
            "'eval_batch_type: token'.")
    valid_iter = make_data_iter(dataset=data, batch_size=batch_size,
                                batch_type=batch_type, shuffle=False,
                                train=False)
    valid_sources_raw = data.src
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # disable dropout
    model.eval()
    # don't track gradients during validation
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        total_loss = 0
        total_ntokens = 0
        total_nseqs = 0
        for valid_batch in iter(valid_iter):
            # run as during training to get validation loss (e.g. xent)
            batch = batch_class(valid_batch, pad_index, use_cuda=use_cuda)
            # sort batch now by src length and keep track of order
            reverse_index = batch.sort_by_src_length()
            sort_reverse_index = expand_reverse_index(reverse_index, n_best)

            # run as during training with teacher forcing
            if compute_loss and batch.trg is not None:
                batch_loss, _, _, _ = model(return_type="loss", **vars(batch))
                if n_gpu > 1:
                    batch_loss = batch_loss.mean()  # average on multi-gpu
                total_loss += batch_loss
                total_ntokens += batch.ntokens
                total_nseqs += batch.nseqs

            # run as during inference to produce translations
            output, attention_scores = run_batch(
                model=model, batch=batch, beam_size=beam_size,
                beam_alpha=beam_alpha, max_output_length=max_output_length,
                n_best=n_best)

            # sort outputs back to original order
            all_outputs.extend(output[sort_reverse_index])
            valid_attention_scores.extend(
                attention_scores[sort_reverse_index]
                if attention_scores is not None else [])

        assert len(all_outputs) == len(data) * n_best

        if compute_loss and total_ntokens > 0:
            # total validation loss
            valid_loss = total_loss
            # exponent of token-level negative log prob
            valid_ppl = torch.exp(total_loss / total_ntokens)
        else:
            valid_loss = -1
            valid_ppl = -1

        # decode back to symbols
        decoded_valid = model.trg_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)

        # evaluate with metric on full dataset
        join_char = " " if level in ["word", "bpe"] else ""
        valid_sources = [join_char.join(s) for s in data.src]
        valid_references = [join_char.join(t) for t in data.trg]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid]

        # post-process
        if level == "bpe" and postprocess:
            valid_sources = [
                bpe_postprocess(s, bpe_type=bpe_type) for s in valid_sources
            ]
            valid_references = [
                bpe_postprocess(v, bpe_type=bpe_type) for v in valid_references
            ]
            valid_hypotheses = [
                bpe_postprocess(v, bpe_type=bpe_type) for v in valid_hypotheses
            ]

        # if references are given, evaluate against them
        if valid_references:
            assert len(valid_hypotheses) == len(valid_references)

            current_valid_score = 0
            if eval_metric.lower() == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references,
                                           tokenize=sacrebleu["tokenize"])
            elif eval_metric.lower() == 'chrf':
                current_valid_score = chrf(
                    valid_hypotheses, valid_references,
                    remove_whitespace=sacrebleu["remove_whitespace"])
            elif eval_metric.lower() == 'token_accuracy':
                current_valid_score = token_accuracy(
                    # supply List[List[str]]
                    list(decoded_valid), list(data.trg))
            elif eval_metric.lower() == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(valid_hypotheses,
                                                        valid_references)
        else:
            current_valid_score = -1

    return current_valid_score, valid_loss, valid_ppl, valid_sources, \
        valid_sources_raw, valid_references, valid_hypotheses, \
        decoded_valid, valid_attention_scores
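# --------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original code). It assumes a
# trained `model` and a loaded `dev_data` Dataset already exist; the names
# and argument values are hypothetical.
#
# score, loss, ppl, srcs, srcs_raw, refs, hyps, raw_hyps, attn = \
#     validate_on_data(model=model, data=dev_data, batch_size=32,
#                      use_cuda=True, max_output_length=100, level="bpe",
#                      eval_metric="bleu", n_gpu=1, compute_loss=True,
#                      beam_size=5, beam_alpha=1, batch_type="sentence",
#                      postprocess=True, bpe_type="subword-nmt",
#                      sacrebleu={"remove_whitespace": True,
#                                 "tokenize": "13a"},
#                      n_best=1)
# --------------------------------------------------------------------------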
def validate_on_data(model: Model, data: Dataset, logger: Logger,
                     batch_size: int, use_cuda: bool, max_output_length: int,
                     level: str, eval_metric: Optional[str],
                     loss_function: torch.nn.Module = None,
                     beam_size: int = 1, beam_alpha: int = -1,
                     batch_type: str = "sentence",
                     postprocess: bool = True) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array]):
    """
    Generate translations for the given data.
    If `loss_function` is not None and references are given,
    also compute the loss.

    :param model: model module
    :param logger: logger
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, e.g. "bleu"
    :param loss_function: loss function that computes a scalar loss
        for given inputs and targets
    :param beam_size: beam size for validation.
        If <2 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1 (default).
    :param batch_type: validation batch type (sentence or token)
    :param postprocess: if True, remove BPE segmentation from translations

    :return:
        - current_valid_score: current validation score [eval_metric],
        - valid_loss: validation loss,
        - valid_ppl: validation perplexity,
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses
    """
    if batch_size > 1000 and batch_type == "sentence":
        logger.warning(
            "WARNING: Are you sure you meant to work on huge batches like "
            "this? 'batch_size' is > 1000 for sentence-batching. "
            "Consider decreasing it or switching to "
            "'eval_batch_type: token'.")
    valid_iter = make_data_iter(dataset=data, batch_size=batch_size,
                                batch_type=batch_type, shuffle=False,
                                train=False)
    valid_sources_raw = data.src
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # disable dropout
    model.eval()
    # don't track gradients during validation
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        total_loss = 0
        total_ntokens = 0
        total_nseqs = 0
        for valid_batch in iter(valid_iter):
            # run as during training to get validation loss (e.g. xent)
            batch = Batch(valid_batch, pad_index, use_cuda=use_cuda)
            # sort batch now by src length and keep track of order
            sort_reverse_index = batch.sort_by_src_lengths()

            # run as during training with teacher forcing
            if loss_function is not None and batch.trg is not None:
                batch_loss = model.get_loss_for_batch(
                    batch, loss_function=loss_function)
                total_loss += batch_loss
                total_ntokens += batch.ntokens
                total_nseqs += batch.nseqs

            # run as during inference to produce translations
            output, attention_scores = model.run_batch(
                batch=batch, beam_size=beam_size, beam_alpha=beam_alpha,
                max_output_length=max_output_length)

            # sort outputs back to original order
            all_outputs.extend(output[sort_reverse_index])
            valid_attention_scores.extend(
                attention_scores[sort_reverse_index]
                if attention_scores is not None else [])

        assert len(all_outputs) == len(data)

        if loss_function is not None and total_ntokens > 0:
            # total validation loss
            valid_loss = total_loss
            # exponent of token-level negative log prob
            valid_ppl = torch.exp(total_loss / total_ntokens)
        else:
            valid_loss = -1
            valid_ppl = -1

        # decode back to symbols
        decoded_valid = model.trg_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)

        # evaluate with metric on full dataset
        join_char = " " if level in ["word", "bpe"] else ""
        valid_sources = [join_char.join(s) for s in data.src]
        valid_references = [join_char.join(t) for t in data.trg]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid]

        # post-process
        if level == "bpe" and postprocess:
            valid_sources = [bpe_postprocess(s) for s in valid_sources]
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        # if references are given, evaluate against them
        if valid_references:
            assert len(valid_hypotheses) == len(valid_references)

            current_valid_score = 0
            if eval_metric.lower() == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif eval_metric.lower() == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif eval_metric.lower() == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=level)
            elif eval_metric.lower() == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(valid_hypotheses,
                                                        valid_references)
        else:
            current_valid_score = -1

    return current_valid_score, valid_loss, valid_ppl, valid_sources, \
        valid_sources_raw, valid_references, valid_hypotheses, \
        decoded_valid, valid_attention_scores
def get_rl_loss_for_batch(self, batch: Batch, loss_function,
                          rl_weight: float, beta_entropy: float,
                          use_cuda: bool, max_output_length: int,
                          level: str) -> Tensor:
    """
    Compute the combined MLE + reinforcement-learning loss for a batch.
    If `rl_weight` < 1, the regular teacher-forced loss is computed with
    `loss_function`; if `rl_weight` > 0, translations are sampled and scored
    with BLEURT to obtain a REINFORCE-style loss, and both terms are
    interpolated with `rl_weight`.

    :param batch: batch to compute the loss for
    :param loss_function: loss function that computes a scalar loss
        for given inputs and targets
    :param rl_weight: interpolation weight between the MLE loss (0.0)
        and the RL loss (1.0)
    :param beta_entropy: weight of the entropy bonus
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"

    :return:
        - loss: interpolated total loss,
        - batch_rl_loss: RL loss (detached, for logging),
        - mle_loss: MLE loss (detached, for logging),
        - entropy: entropy of the sampled outputs (detached, for logging),
        - mean_bleurt: mean BLEURT score of the sampled translations
    """
    # sort batch now by src length and keep track of order
    sort_reverse_index = batch.sort_by_src_lengths()

    if rl_weight != 1:
        # note: not yet normalized by tokens
        # TODO: check here to make sure that we do not delete mle_loss here
        loss = self.get_loss_for_batch(batch, loss_function)
        # for logging
        mle_loss = loss.cpu().detach().item()
    else:
        loss = 0
        mle_loss = 0

    if rl_weight > 0.0:
        # run as during inference to produce translations & RL score
        output, transposed_log_probs, entropy = self.run_rl_batch(
            batch=batch, max_output_length=max_output_length)

        # sort outputs back to original order
        output = output[sort_reverse_index]
        log_probs = torch.stack(transposed_log_probs).T[
            sort_reverse_index]  # T x B -> B x T as Tensor

        # decode back to symbols
        decoded_src = self.src_vocab.arrays_to_sentences(arrays=batch.src,
                                                         cut_at_eos=True)
        decoded_trg = self.trg_vocab.arrays_to_sentences(arrays=batch.trg,
                                                         cut_at_eos=True)
        decoded_hyp = self.trg_vocab.arrays_to_sentences(arrays=output,
                                                         cut_at_eos=True)

        # evaluate with metric on full batch
        join_char = " " if level in ["word", "bpe"] else ""
        train_sources = [join_char.join(s) for s in decoded_src]
        train_references = [join_char.join(t) for t in decoded_trg]
        train_hypotheses = [join_char.join(t) for t in decoded_hyp]

        # post-process
        if level == "bpe":
            train_sources = [bpe_postprocess(s) for s in train_sources]
            train_references = [bpe_postprocess(v) for v in train_references]
            train_hypotheses = [bpe_postprocess(v) for v in train_hypotheses]

        # references must match hypotheses one to one
        assert len(train_hypotheses) == len(train_references)

        reinforce_scores = self.bleurt_scorer(references=train_references,
                                              hypotheses=train_hypotheses)
        reinforce_scores = torch.tensor(reinforce_scores).unsqueeze(-1)
        if use_cuda:
            reinforce_scores = reinforce_scores.cuda()
            log_probs = log_probs.cuda()

        reward_adjusted_log_probs = torch.mul(log_probs, reinforce_scores)

        # minimize the reward-adjusted cost and maximize entropy
        # (or "multiply entropy by -1 and minimize");
        # note: this is not normalized by the number of tokens yet
        batch_rl_loss = reward_adjusted_log_probs.sum() \
            - beta_entropy * entropy

        loss = loss * (1 - rl_weight) + rl_weight * batch_rl_loss

        batch_rl_loss = batch_rl_loss.cpu().detach().item()
        entropy = entropy.cpu().detach().item()
        mean_bleurt = torch.mean(reinforce_scores).item()
    else:
        batch_rl_loss = 0
        entropy = 0
        mean_bleurt = 0

    return loss, batch_rl_loss, mle_loss, entropy, mean_bleurt
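# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the loss interpolation
# used in get_rl_loss_for_batch, on dummy tensors. The RL term multiplies the
# per-step log-probabilities by a sequence-level reward (BLEURT above) and
# subtracts an entropy bonus; it is then mixed with the MLE loss by rl_weight.
import torch

def combined_loss(mle_loss, log_probs, rewards, entropy,
                  rl_weight=0.5, beta_entropy=0.01):
    # log_probs: B x T, rewards: B x 1 (one score per sampled sequence)
    rl_loss = (log_probs * rewards).sum() - beta_entropy * entropy
    return (1 - rl_weight) * mle_loss + rl_weight * rl_loss

# dummy usage with random values
mle = torch.tensor(3.2)
lp = -torch.rand(4, 7)      # fake per-step log-probabilities
rw = torch.rand(4, 1)       # fake sequence-level rewards
ent = torch.tensor(1.5)
print(combined_loss(mle, lp, rw, ent))
# --------------------------------------------------------------------------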
def validate_on_data(model, data, batch_size, use_cuda, max_output_length,
                     level, eval_metric, criterion, beam_size=0,
                     beam_alpha=-1):
    """
    Generate translations for the given data.
    If `criterion` is not None and references are given, also compute the loss.

    :param model: model module
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, e.g. "bleu"
    :param criterion: loss criterion for the validation loss
    :param beam_size: beam size for validation; if 0, use greedy decoding
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1
    :return: current validation score, loss, perplexity, sources,
        raw sources, references, hypotheses, raw hypotheses,
        attention scores
    """
    valid_iter = make_data_iter(dataset=data, batch_size=batch_size,
                                shuffle=False, train=False)
    valid_sources_raw = [s for s in data.src]
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # disable dropout
    model.eval()
    # don't track gradients during validation
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        total_loss = 0
        total_ntokens = 0
        for valid_i, valid_batch in enumerate(iter(valid_iter), 1):
            # run as during training to get validation loss (e.g. xent)
            batch = Batch(valid_batch, pad_index, use_cuda=use_cuda)
            # sort batch now by src length and keep track of order
            sort_reverse_index = batch.sort_by_src_lengths()

            # TODO save computation: forward pass is computed twice
            # run as during training with teacher forcing
            if criterion is not None and batch.trg is not None:
                batch_loss = model.get_loss_for_batch(batch,
                                                      criterion=criterion)
                total_loss += batch_loss
                total_ntokens += batch.ntokens

            # run as during inference to produce translations
            output, attention_scores = model.run_batch(
                batch=batch, beam_size=beam_size, beam_alpha=beam_alpha,
                max_output_length=max_output_length)

            # sort outputs back to original order
            all_outputs.extend(output[sort_reverse_index])
            valid_attention_scores.extend(
                attention_scores[sort_reverse_index]
                if attention_scores is not None else [])

        assert len(all_outputs) == len(data)

        if criterion is not None and total_ntokens > 0:
            # total validation loss
            valid_loss = total_loss
            # exponent of token-level negative log prob
            valid_ppl = torch.exp(total_loss / total_ntokens)
        else:
            valid_loss = -1
            valid_ppl = -1

        # decode back to symbols
        decoded_valid = arrays_to_sentences(arrays=all_outputs,
                                            vocabulary=model.trg_vocab,
                                            cut_at_eos=True)

        # evaluate with metric on full dataset
        join_char = " " if level in ["word", "bpe"] else ""
        valid_sources = [join_char.join(s) for s in data.src]
        valid_references = [join_char.join(t) for t in data.trg]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid]

        # post-process
        if level == "bpe":
            valid_sources = [bpe_postprocess(s) for s in valid_sources]
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        # if references are given, evaluate against them
        if len(valid_references) > 0:
            assert len(valid_hypotheses) == len(valid_references)

            current_valid_score = 0
            if eval_metric.lower() == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif eval_metric.lower() == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif eval_metric.lower() == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=level)
            elif eval_metric.lower() == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(valid_hypotheses,
                                                        valid_references)
        else:
            current_valid_score = -1

    return current_valid_score, valid_loss, valid_ppl, valid_sources, \
        valid_sources_raw, valid_references, valid_hypotheses, \
        decoded_valid, valid_attention_scores
def validate_on_data(model: Model, data: Dataset, batch_size: int,
                     use_cuda: bool, max_output_length: int,
                     src_level: str, trg_level: str,
                     eval_metrics: Optional[Sequence[str]],
                     attn_metrics: Optional[Sequence[str]],
                     loss_function: torch.nn.Module = None,
                     beam_size: int = 0, beam_alpha: int = 0,
                     batch_type: str = "sentence",
                     save_attention: bool = False,
                     log_sparsity: bool = False,
                     apply_mask: bool = True  # hmm
                     ) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array]):
    """
    Generate translations for the given data.
    If `loss_function` is not None and references are given,
    also compute the loss.

    :param model: model module
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param src_level: source segmentation level, one of "char", "bpe", "word"
    :param trg_level: target segmentation level, one of "char", "bpe", "word"
    :param eval_metrics: evaluation metrics, e.g. "bleu"
    :param attn_metrics: attention metrics, e.g. "support"
    :param loss_function: loss function that computes a scalar loss
        for given inputs and targets
    :param beam_size: beam size for validation.
        If 0 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to 0 (default).
    :param batch_type: validation batch type (sentence or token)

    :return:
        - valid_scores: dictionary of validation scores (loss, ppl, metrics),
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses,
        - scores_by_lang: evaluation scores broken down by language,
        - by_language: reference/hypothesis pairs grouped by language
    """
    eval_funcs = {
        "bleu": bleu,
        "chrf": chrf,
        "token_accuracy": partial(token_accuracy, level=trg_level),
        "sequence_accuracy": sequence_accuracy,
        "wer": wer,
        "cer": partial(character_error_rate, level=trg_level)
    }
    selected_eval_metrics = {name: eval_funcs[name] for name in eval_metrics}

    valid_iter = make_data_iter(dataset=data, batch_size=batch_size,
                                batch_type=batch_type, shuffle=False,
                                train=False)
    valid_sources_raw = [s for s in data.src]
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # disable dropout
    model.eval()

    scorer = partial(len_penalty, alpha=beam_alpha) if beam_alpha > 0 else None

    # don't track gradients during validation
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = defaultdict(list)
        total_loss = 0
        total_ntokens = 0
        total_nseqs = 0
        total_attended = defaultdict(int)
        greedy_steps = 0
        greedy_supported = 0
        for valid_batch in iter(valid_iter):
            # run as during training to get validation loss (e.g. xent)
            batch = Batch(valid_batch, pad_index, use_cuda=use_cuda)
            # sort batch now by src length and keep track of order
            sort_reverse_index = batch.sort_by_src_lengths()

            # run as during training with teacher forcing
            if loss_function is not None and batch.trg is not None:
                batch_loss = model.get_loss_for_batch(
                    batch, loss_function=loss_function)
                total_loss += batch_loss
                total_ntokens += batch.ntokens
                total_nseqs += batch.nseqs

            # run as during inference to produce translations
            output, attention_scores, probs = model.run_batch(
                batch=batch, beam_size=beam_size, scorer=scorer,
                max_output_length=max_output_length,
                log_sparsity=log_sparsity, apply_mask=apply_mask)

            if log_sparsity:
                lengths = torch.LongTensor(
                    (output == model.trg_vocab.stoi[EOS_TOKEN]).argmax(axis=1)
                ).unsqueeze(1)
                batch_greedy_steps = lengths.sum().item()
                greedy_steps += lengths.sum().item()

                ix = torch.arange(output.shape[1]).unsqueeze(0).expand(
                    output.shape[0], -1)
                mask = ix <= lengths
                supp = probs.exp().gt(0).sum(dim=-1).cpu()  # batch x len
                supp = torch.where(mask, supp, torch.tensor(0)).sum()
                greedy_supported += supp.float().item()

            # sort outputs back to original order
            all_outputs.extend(output[sort_reverse_index])

            if attention_scores is not None:
                # is attention_scores ever None?
                if save_attention:
                    # beam search currently does not support attention logging
                    for k, v in attention_scores.items():
                        valid_attention_scores[k].extend(
                            v[sort_reverse_index])
                if attn_metrics:
                    # add to total_attended
                    for k, v in attention_scores.items():
                        total_attended[k] += (v > 0).sum()

        assert len(all_outputs) == len(data)

        if log_sparsity:
            print(greedy_supported / greedy_steps)

        valid_scores = dict()
        if loss_function is not None and total_ntokens > 0:
            # total validation loss
            valid_loss = total_loss
            valid_scores["loss"] = total_loss
            valid_scores["ppl"] = torch.exp(total_loss / total_ntokens)

        # decode back to symbols
        decoded_valid = model.trg_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)

        # evaluate with metric on full dataset
        src_join_char = " " if src_level in ["word", "bpe"] else ""
        trg_join_char = " " if trg_level in ["word", "bpe"] else ""
        valid_sources = [src_join_char.join(s) for s in data.src]
        valid_references = [trg_join_char.join(t) for t in data.trg]
        valid_hypotheses = [trg_join_char.join(t) for t in decoded_valid]

        if attn_metrics:
            decoded_ntokens = sum(len(t) for t in decoded_valid)
            for attn_metric in attn_metrics:
                assert attn_metric == "support"
                for attn_name, tot_attended in total_attended.items():
                    score_name = attn_name + "_" + attn_metric
                    # this is not the right denominator
                    valid_scores[score_name] = tot_attended / decoded_ntokens

        # post-process
        if src_level == "bpe":
            valid_sources = [bpe_postprocess(s) for s in valid_sources]
        if trg_level == "bpe":
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        languages = [language for language in data.language]
        by_language = defaultdict(list)
        seqs = zip(valid_references, valid_hypotheses) \
            if valid_references else valid_hypotheses
        if languages:
            examples = zip(languages, seqs)
            for lang, seq in examples:
                by_language[lang].append(seq)
        else:
            by_language[None].extend(seqs)

        # if references are given, evaluate against them
        # scores_by_lang = {name: dict() for name in selected_eval_metrics}
        scores_by_lang = dict()
        if valid_references and eval_metrics is not None:
            assert len(valid_hypotheses) == len(valid_references)

            for eval_metric, eval_func in selected_eval_metrics.items():
                score_by_lang = dict()
                for lang, pairs in by_language.items():
                    # each pair is (reference, hypothesis)
                    lang_refs, lang_hyps = zip(*pairs)
                    lang_score = eval_func(lang_hyps, lang_refs)
                    score_by_lang[lang] = lang_score

                # unweighted macro-average over languages
                score = sum(score_by_lang.values()) / len(score_by_lang)
                valid_scores[eval_metric] = score
                scores_by_lang[eval_metric] = score_by_lang

    if not languages:
        scores_by_lang = None
    return valid_scores, valid_sources, \
        valid_sources_raw, valid_references, valid_hypotheses, \
        decoded_valid, valid_attention_scores, scores_by_lang, by_language
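# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the per-language
# macro-average used above. Each language's score is computed separately and
# the reported score is their unweighted mean. The values are hypothetical.
score_by_lang = {"de": 28.0, "fr": 34.0, "ro": 22.0}
score = sum(score_by_lang.values()) / len(score_by_lang)
assert abs(score - 28.0) < 1e-9
# --------------------------------------------------------------------------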
def dev_network(self):
    """
    Show the current performance on the dev dataset, by means of the
    total reward and the BLEU score.

    :return: current BLEU score
    """
    freeze_model(self.eval_net)
    for data_set_name, data_set in self.data_to_dev.items():
        valid_iter = make_data_iter(dataset=data_set, batch_size=1,
                                    batch_type=self.batch_type,
                                    shuffle=False, train=False)
        valid_sources_raw = data_set.src

        # don't track gradients during validation
        r_total = 0
        roptimal_total = 0
        all_outputs = []
        i_sample = 0

        for valid_batch in iter(valid_iter):
            # run as during training to get validation loss (e.g. xent)
            batch = Batch(valid_batch, self.pad_index,
                          use_cuda=self.use_cuda)

            encoder_output, encoder_hidden = self.model.encode(
                batch.src, batch.src_lengths, batch.src_mask)

            # if maximum output length is not globally specified,
            # adapt it to the source length
            if self.max_output_length is None:
                self.max_output_length = int(
                    max(batch.src_lengths.cpu().numpy()) * 1.5)

            batch_size = batch.src_mask.size(0)
            prev_y = batch.src_mask.new_full(size=[batch_size, 1],
                                             fill_value=self.bos_index,
                                             dtype=torch.long)
            output = []
            hidden = self.model.decoder._init_hidden(encoder_hidden)
            prev_att_vector = None
            finished = batch.src_mask.new_zeros((batch_size, 1)).byte()

            # pylint: disable=unused-variable
            for t in range(self.max_output_length):
                # decode one single step
                logits, hidden, att_probs, prev_att_vector = \
                    self.model.decoder(
                        encoder_output=encoder_output,
                        encoder_hidden=encoder_hidden,
                        src_mask=batch.src_mask,
                        trg_embed=self.model.trg_embed(prev_y),
                        hidden=hidden,
                        prev_att_vector=prev_att_vector,
                        unroll_steps=1)

                # greedy decoding: the eval_net scores the current state
                # and we choose the arg max over the vocabulary in each step
                if self.state_type == 'hidden':
                    state = torch.cat(hidden,
                                      dim=2).squeeze(1).detach().cpu()[0]
                else:
                    state = torch.FloatTensor(
                        prev_att_vector.squeeze(1).detach().cpu().numpy()[0])

                logits = self.eval_net(state)
                logits = logits.reshape([1, 1, -1])
                next_word = torch.argmax(logits, dim=-1)
                a = next_word.squeeze(1).detach().cpu().numpy()[0]
                output.append(next_word.squeeze(1).detach().cpu().numpy())
                prev_y = next_word

                # check if previous symbol was <eos>
                is_eos = torch.eq(next_word, self.eos_index)
                finished += is_eos
                # stop predicting if <eos> reached for all elements in batch
                if (finished >= 1).sum() == batch_size:
                    break

            stacked_output = np.stack(output, axis=1)  # batch, time

            # decode back to symbols
            decoded_valid_in = self.model.trg_vocab.arrays_to_sentences(
                arrays=batch.src, cut_at_eos=True)
            decoded_valid_out_trg = self.model.trg_vocab.arrays_to_sentences(
                arrays=batch.trg, cut_at_eos=True)
            decoded_valid_out = self.model.trg_vocab.arrays_to_sentences(
                arrays=stacked_output, cut_at_eos=True)

            hyp = stacked_output
            r = self.Reward(batch.trg, hyp, show=False)

            if i_sample == 0 or i_sample == 3 or i_sample == 6:
                print("\n Sample ", i_sample,
                      "-------------Target vs Eval_net prediction:"
                      "--Raw---and---Decoded-----")
                print("Target: ", batch.trg, decoded_valid_out_trg)
                print("Eval  : ", stacked_output, decoded_valid_out, "\n")
                print("Reward: ", r)

            r_total += sum(r[np.where(r > 0)])
            if i_sample == 0:
                roptimal = self.Reward(batch.trg, batch.trg, show=False)
                roptimal_total += sum(roptimal[np.where(roptimal > 0)])

            all_outputs.extend(stacked_output)
            i_sample += 1

        assert len(all_outputs) == len(data_set)

        # decode back to symbols
        decoded_valid = self.model.trg_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)

        # evaluate with metric on full dataset
        join_char = " " if self.level in ["word", "bpe"] else ""
        valid_sources = [join_char.join(s) for s in data_set.src]
        valid_references = [join_char.join(t) for t in data_set.trg]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid]

        # post-process
        if self.level == "bpe":
            valid_sources = [bpe_postprocess(s) for s in valid_sources]
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        # if references are given, evaluate against them
        if valid_references:
            assert len(valid_hypotheses) == len(valid_references)

            current_valid_score = 0
            if self.eval_metric.lower() == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif self.eval_metric.lower() == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif self.eval_metric.lower() == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=self.level)
            elif self.eval_metric.lower() == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(valid_hypotheses,
                                                        valid_references)
        else:
            current_valid_score = -1

        self.dev_network_count += 1
        self.tb_writer.add_scalar("dev/dev_reward",
                                  r_total, self.dev_network_count)
        self.tb_writer.add_scalar("dev/dev_bleu",
                                  current_valid_score, self.dev_network_count)

        print(self.dev_network_count, ' r_total and score: ',
              r_total, current_valid_score)

    unfreeze_model(self.eval_net)
    return current_valid_score
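# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the <eos> bookkeeping
# used in the greedy loop above. A per-sequence counter is incremented when
# <eos> is produced; decoding stops once every sequence has emitted it.
# The index 3 and the dummy token tensors are hypothetical.
import torch

eos_index = 3
finished = torch.zeros(2, 1).byte()              # batch of 2 sequences
for next_word in (torch.tensor([[5], [3]]),      # sequence 2 emits <eos>
                  torch.tensor([[3], [7]])):     # sequence 1 emits <eos>
    finished += torch.eq(next_word, eos_index)
    if (finished >= 1).sum() == finished.size(0):
        break                                    # all sequences are done
# --------------------------------------------------------------------------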
def Reward_lin(self, trg, hyp, show=False):
    """
    To use as the self.Reward function.
    Returns an array of per-step rewards based on the current score of a
    predicted sequence of length T. A reward is given at each of the T steps,
    but only when the predicted word is in the right place.

    :param trg: target sequence.
    :param hyp: predicted sequence.
    :param show: if True, display the computation of the rewards
    :return: array of per-step rewards
    """
    tar_len = trg.shape[1]
    hyp_len = hyp.shape[1]
    final_rew = -1 * np.ones(hyp_len - 1)

    len_temp = 0
    if tar_len > hyp_len:
        len_temp = hyp_len
    else:
        len_temp = tar_len

    hyp2com = np.zeros([1, tar_len])
    hyp2com[0, :len_temp] = hyp[0, :len_temp]

    equal = (trg.numpy() == hyp2com)
    # equal = np.invert(equal)*np.ones(equal.size)*0.2
    # ind1, ind2 = np.where(equal == False)
    # if len(ind1) != 0:
    #     equal[ind1[0]:, ind2[0]:] = False

    decoded_valid_tar = self.model.trg_vocab.arrays_to_sentences(
        arrays=trg, cut_at_eos=True)
    decoded_valid_hyp = self.model.trg_vocab.arrays_to_sentences(
        arrays=hyp, cut_at_eos=True)

    if show:
        print('decoded target list: ', decoded_valid_tar)
        print('decoded hypothesis list: ', decoded_valid_hyp)

    # evaluate with metric on each src, target and hypothesis
    join_char = " " if self.level in ["word", "bpe"] else ""
    valid_references = [join_char.join(t) for t in decoded_valid_tar]
    valid_hypotheses = [join_char.join(t) for t in decoded_valid_hyp]

    # post-process
    if self.level == "bpe":
        valid_references = [bpe_postprocess(v) for v in valid_references]
        valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

    # if references are given, evaluate against them
    if valid_references:
        assert len(valid_hypotheses) == len(valid_references)

        current_valid_score = 0
        if self.eval_metric.lower() == 'bleu':
            # this version does not use any tokenization
            current_valid_score = bleu(valid_hypotheses, valid_references)
        elif self.eval_metric.lower() == 'chrf':
            current_valid_score = chrf(valid_hypotheses, valid_references)
        elif self.eval_metric.lower() == 'token_accuracy':
            current_valid_score = token_accuracy(valid_hypotheses,
                                                 valid_references,
                                                 level=self.level)
        elif self.eval_metric.lower() == 'sequence_accuracy':
            current_valid_score = sequence_accuracy(valid_hypotheses,
                                                    valid_references)
    else:
        current_valid_score = -1

    # linearly increasing position weights; cumulative sums are masked by
    # the positions where hypothesis and target agree
    k = sum(np.arange(tar_len))
    a_i = np.arange(1, tar_len) / k
    VSa_i = [sum(a_i[:i]) for i in np.arange(1, tar_len, dtype='int')]
    VSa_i = np.multiply(
        np.asanyarray(VSa_i).reshape([1, tar_len - 1]),
        equal).reshape([tar_len - 1])

    final_rew[:len_temp - 1] = np.multiply(VSa_i,
                                           current_valid_score)[:len_temp]

    if show:
        print('Reward is: ', final_rew)
        print('sum: ', sum(final_rew))

    return final_rew
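# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the position weights
# used in Reward_lin. With k = 0 + 1 + ... + (T-1), the weights
# a_i = i / k (i = 1..T-1) are increasing and sum to 1, so later correct
# positions contribute a larger share of the sentence-level score.
import numpy as np

tar_len = 5
k = sum(np.arange(tar_len))          # 0 + 1 + 2 + 3 + 4 = 10
a_i = np.arange(1, tar_len) / k      # [0.1, 0.2, 0.3, 0.4]
assert np.isclose(a_i.sum(), 1.0)
cumulative = np.cumsum(a_i)          # same values as the VSa_i list above
# --------------------------------------------------------------------------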
def Reward_bleu_diff(self, trg, hyp, show=False):
    """
    To use as the self.Reward function.
    Returns an array of per-step rewards based on the differences of the
    BLEU scores of consecutive partial hypotheses, as proposed in the paper.

    :param trg: target sequence.
    :param hyp: predicted sequence.
    :param show: if True, display the computation of the rewards
    :return: array of per-step reward differences
    """
    smooth = 0.001
    rew = np.zeros([len(hyp[0])])

    discount_ini_token = 1
    discount_fin_token = 1
    if trg[0, 0] != hyp[0, 0]:
        discount_ini_token = 0.5
    if len(hyp[0]) > len(trg[0]):
        discount_ini_token = 0.5

    for t in np.arange(len(hyp[0]) - 1):
        hyp_sub = hyp[:, :t + 1]

        decoded_valid_tar = self.model.trg_vocab.arrays_to_sentences(
            arrays=trg, cut_at_eos=True)
        decoded_valid_hyp = self.model.trg_vocab.arrays_to_sentences(
            arrays=hyp_sub, cut_at_eos=True)

        # evaluate with metric on each src, target and hypothesis
        join_char = " " if self.level in ["word", "bpe"] else ""
        valid_references = [join_char.join(t) for t in decoded_valid_tar]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid_hyp]

        # post-process
        if self.level == "bpe":
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        # if references are given, evaluate against them
        if valid_references:
            assert len(valid_hypotheses) == len(valid_references)

            current_valid_score = sacrebleu.corpus_bleu(
                valid_hypotheses[0], valid_references[0],
                smooth_method='floor', smooth_value=smooth,
                use_effective_order=True).score

            if t == 0:
                current_valid_score *= discount_ini_token
            if t > len(trg[0]):
                current_valid_score *= discount_fin_token
            if t > self.max_output_length - 1:
                current_valid_score = -10

        rew[t + 1] = current_valid_score

    if show:
        print('rew: ', rew)

    # per-step reward: difference of consecutive partial BLEU scores
    rew[:-1] = np.diff(rew)
    final_rew = rew

    if show:
        print("\n Sample-------------Target vs Eval_net prediction:"
              "--Raw---and---Decoded-----")
        print("Target: ", trg, decoded_valid_tar)
        print("Eval  : ", hyp, decoded_valid_hyp)
        print("Reward: ", final_rew, "\n")

    return final_rew
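# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the "difference of
# partial scores" idea behind Reward_bleu_diff, with a hypothetical stand-in
# metric instead of sacrebleu. Each prefix of the hypothesis is scored and
# first differences give the per-step gain; the last entry keeps the final
# prefix score, mirroring the in-place update above.
import numpy as np

def toy_prefix_score(hyp_prefix, ref_tokens):
    # hypothetical stand-in metric: matching positions, in percent
    matches = sum(h == r for h, r in zip(hyp_prefix, ref_tokens))
    return 100.0 * matches / len(ref_tokens)

def stepwise_rewards(ref_tokens, hyp_tokens):
    scores = np.zeros(len(hyp_tokens))
    for t in range(len(hyp_tokens) - 1):
        scores[t + 1] = toy_prefix_score(hyp_tokens[:t + 1], ref_tokens)
    rewards = scores.copy()
    rewards[:-1] = np.diff(scores)   # same in-place pattern as above
    return rewards

# e.g. stepwise_rewards(["a", "b", "c", "</s>"], ["a", "b", "x", "</s>"])
# -> prefix scores [0, 25, 50, 50]; rewards [25, 25, 0, 50]
# --------------------------------------------------------------------------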
def validate_on_data(model: Model, data: Dataset, batch_size: int,
                     use_cuda: bool, max_output_length: int,
                     level: str, eval_metric: Optional[str],
                     loss_function: torch.nn.Module = None,
                     beam_size: int = 0, beam_alpha: int = -1,
                     batch_type: str = "sentence",
                     kb_task=None, valid_kb: Dataset = None,
                     valid_kb_lkp: list = [], valid_kb_lens: list = [],
                     valid_kb_truvals: Dataset = None,
                     valid_data_canon: Dataset = None,
                     report_on_canonicals: bool = False,
                     ) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array]):
    """
    Generate translations for the given data.
    If `loss_function` is not None and references are given,
    also compute the loss.

    :param model: model module
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, e.g. "bleu"
    :param loss_function: loss function that computes a scalar loss
        for given inputs and targets
    :param beam_size: beam size for validation.
        If 0 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1 (default).
    :param batch_type: validation batch type (sentence or token)
    :param kb_task: is not None if kb_task should be executed
    :param valid_kb: MonoDataset holding the loaded valid kb data
    :param valid_kb_lkp: list mapping valid example indices to
        corresponding kb indices
    :param valid_kb_lens: list with the number of triples per kb
    :param valid_kb_truvals: dataset holding the loaded valid kb true values
    :param valid_data_canon: TranslationDataset of valid data but with
        canonized target data (for loss reporting)
    :param report_on_canonicals: if True, report entity scores on
        canonical targets

    :return:
        - current_valid_score: current validation score [eval_metric],
        - valid_loss: validation loss,
        - valid_ppl: validation perplexity,
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses,
        - valid_kb_att_scores: attention scores over the knowledgebase,
        - valid_ent_f1: entity F1 score,
        - valid_ent_mcc: entity MCC score
    """
    print(f"\n{'-'*10} ENTER VALIDATION {'-'*10}\n")
    print(f"\n{'-'*10} VALIDATION DEBUG {'-'*10}\n")

    print("---data---")
    print(dir(data[0]))
    print([[
        getattr(example, attr) for attr in dir(example)
        if hasattr(getattr(example, attr), "__iter__") and "kb" in attr
        or "src" in attr or "trg" in attr
    ] for example in data[:3]])
    print(batch_size)
    print(use_cuda)
    print(max_output_length)
    print(level)
    print(eval_metric)
    print(loss_function)
    print(beam_size)
    print(beam_alpha)
    print(batch_type)
    print(kb_task)
    print("---valid_kb---")
    print(dir(valid_kb[0]))
    print([[
        getattr(example, attr) for attr in dir(example)
        if hasattr(getattr(example, attr), "__iter__") and "kb" in attr
        or "src" in attr or "trg" in attr
    ] for example in valid_kb[:3]])
    print(len(valid_kb_lkp), valid_kb_lkp[-5:])
    print(len(valid_kb_lens), valid_kb_lens[-5:])
    print("---valid_kb_truvals---")
    print(len(valid_kb_truvals), valid_kb_lens[-5:])
    print([[
        getattr(example, attr) for attr in dir(example)
        if hasattr(getattr(example, attr), "__iter__") and "kb" in attr
        or "src" in attr or "trg" in attr or "trv" in attr
    ] for example in valid_kb_truvals[:3]])
    print("---valid_data_canon---")
    print(len(valid_data_canon), valid_data_canon[-5:])
    print([[
        getattr(example, attr) for attr in dir(example)
        if hasattr(getattr(example, attr), "__iter__") and "kb" in attr
        or "src" in attr or "trg" in attr or "trv" in attr or "can" in attr
    ] for example in valid_data_canon[:3]])
    print(report_on_canonicals)
    print(f"\n{'-'*10} END VALIDATION DEBUG {'-'*10}\n")

    if not kb_task:
        valid_iter = make_data_iter(dataset=data, batch_size=batch_size,
                                    batch_type=batch_type, shuffle=False,
                                    train=False)
    else:
        # knowledgebase version of make_data_iter;
        # also provide canonized target data
        # data: for bleu/ent f1
        # canon_data: for loss
        valid_iter = make_data_iter_kb(data, valid_kb, valid_kb_lkp,
                                       valid_kb_lens, valid_kb_truvals,
                                       batch_size=batch_size,
                                       batch_type=batch_type,
                                       shuffle=False, train=False,
                                       canonize=model.canonize,
                                       canon_data=valid_data_canon)

    valid_sources_raw = data.src
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # disable dropout
    model.eval()
    # don't track gradients during validation
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        valid_kb_att_scores = []
        total_loss = 0
        total_ntokens = 0
        total_nseqs = 0
        for valid_batch in iter(valid_iter):
            # run as during training to get validation loss (e.g. xent)
            batch = Batch(valid_batch, pad_index, use_cuda=use_cuda) \
                if not kb_task else \
                Batch_with_KB(valid_batch, pad_index, use_cuda=use_cuda)

            assert hasattr(batch, "kbsrc") == bool(kb_task)

            # sort batch now by src length and keep track of order
            if not kb_task:
                sort_reverse_index = batch.sort_by_src_lengths()
            else:
                sort_reverse_index = list(range(batch.src.shape[0]))

            # run as during training with teacher forcing
            if loss_function is not None and batch.trg is not None:
                ntokens = batch.ntokens
                if hasattr(batch, "trgcanon") and batch.trgcanon is not None:
                    # normalize loss with num canonical tokens for perplexity
                    ntokens = batch.ntokenscanon
                # do a loss calculation without grad updates, just to report
                # the valid loss; we can only do this when batch.trg exists,
                # so not during actual translation/deployment
                batch_loss = model.get_loss_for_batch(
                    batch, loss_function=loss_function)
                # keep track of metrics for reporting
                total_loss += batch_loss
                total_ntokens += ntokens  # gold target tokens
                total_nseqs += batch.nseqs

            # run as during inference to produce translations
            output, attention_scores, kb_att_scores = model.run_batch(
                batch=batch, beam_size=beam_size, beam_alpha=beam_alpha,
                max_output_length=max_output_length)

            # sort outputs back to original order
            all_outputs.extend(output[sort_reverse_index])
            valid_attention_scores.extend(
                attention_scores[sort_reverse_index]
                if attention_scores is not None else [])
            valid_kb_att_scores.extend(
                kb_att_scores[sort_reverse_index]
                if kb_att_scores is not None else [])

        assert len(all_outputs) == len(data)

        if loss_function is not None and total_ntokens > 0:
            # total validation loss
            valid_loss = total_loss
            # exponent of token-level negative log likelihood;
            # can be seen as 2^(cross_entropy of model on valid set),
            # normalized by num tokens;
            # see https://en.wikipedia.org/wiki/Perplexity#Perplexity_per_word
            valid_ppl = torch.exp(valid_loss / total_ntokens)
        else:
            valid_loss = -1
            valid_ppl = -1

        # decode back to symbols
        decoding_vocab = model.trg_vocab if not kb_task else model.trv_vocab
        decoded_valid = decoding_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)
        print(f"decoding_vocab.itos: {decoding_vocab.itos}")
        print(decoded_valid)

        # evaluate with metric on full dataset
        join_char = " " if level in ["word", "bpe"] else ""
        valid_sources = [join_char.join(s) for s in data.src]
        # TODO replace valid_references with uncanonicalized dev.car data;
        # requires writing a new Dataset in data.py
        valid_references = [join_char.join(t) for t in data.trg]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid]

        # post-process
        if level == "bpe":
            valid_sources = [bpe_postprocess(s) for s in valid_sources]
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        # if references are given, evaluate against them
        if valid_references:
            assert len(valid_hypotheses) == len(valid_references)
            print(list(zip(valid_sources, valid_references,
                           valid_hypotheses)))

            current_valid_score = 0
            if eval_metric.lower() == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif eval_metric.lower() == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif eval_metric.lower() == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=level)
            elif eval_metric.lower() == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(valid_hypotheses,
                                                        valid_references)

            if kb_task:
                valid_ent_f1, valid_ent_mcc = calc_ent_f1_and_ent_mcc(
                    valid_hypotheses, valid_references,
                    vocab=model.trv_vocab, c_fun=model.canonize,
                    report_on_canonicals=report_on_canonicals)
            else:
                valid_ent_f1, valid_ent_mcc = -1, -1
        else:
            current_valid_score = -1
            valid_ent_f1, valid_ent_mcc = -1, -1

    print(f"\n{'-'*10} EXIT VALIDATION {'-'*10}\n")
    return current_valid_score, valid_loss, valid_ppl, valid_sources, \
        valid_sources_raw, valid_references, valid_hypotheses, \
        decoded_valid, valid_attention_scores, valid_kb_att_scores, \
        valid_ent_f1, valid_ent_mcc
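# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the perplexity
# reported by these validation functions is the exponential of the
# token-averaged negative log-likelihood accumulated over the dev set.
# The numbers below are hypothetical.
import torch

total_loss = torch.tensor(690.0)   # summed NLL over the validation set
total_ntokens = 300                # number of gold target tokens
valid_ppl = torch.exp(total_loss / total_ntokens)   # exp(2.3) ~ 9.97
# --------------------------------------------------------------------------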