def _compute_bleu(self, cur_valid_out, valid_trees):
    """Compute the BLEU score of the current output on a set of validation trees.
    If the validation set is a tuple (two paraphrases), both are used for the
    BLEU computation.

    @param cur_valid_out: the current system output on the validation DAs
    @param valid_trees: the gold trees for the validation DAs (one or two paraphrases)
    @return: BLEU score, as a float between 0 and 1
    """
    evaluator = BLEUMeasure()
    for pred_tree, gold_trees in zip(cur_valid_out, valid_trees):
        evaluator.append(pred_tree, gold_trees)
    return evaluator.bleu()
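
# Hedged usage sketch, not part of the original module: it shows the
# accumulate-then-score pattern that _compute_bleu relies on. BLEUMeasure,
# append() and bleu() are taken from this file; the (lemma, formeme) token
# pairs and their values are invented purely for illustration.
def _bleu_usage_example():
    evaluator = BLEUMeasure()
    pred = [('hello', None), ('world', None)]     # one predicted sentence
    golds = ([('hello', None), ('world', None)],  # first gold paraphrase
             [('hi', None), ('world', None)])     # optional second paraphrase
    evaluator.append(pred, golds)  # accumulate n-gram statistics for one instance
    return evaluator.bleu()        # corpus-level BLEU, a float between 0 and 1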
def eval_tokens(das, eval_tokens, gen_tokens):
    """Evaluate generated tokens and print out statistics."""
    postprocess_tokens(eval_tokens, das)
    postprocess_tokens(gen_tokens, das)

    evaluator = BLEUMeasure()
    for pred_sent, gold_sents in zip(gen_tokens, eval_tokens):
        evaluator.append(pred_sent, gold_sents)
    log_info("BLEU score: %.4f" % (evaluator.bleu() * 100))

    evaluator = Evaluator()
    for pred_sent, gold_sents in zip(gen_tokens, eval_tokens):
        for gold_sent in gold_sents:  # effectively an average over all gold paraphrases
            evaluator.append(gold_sent, pred_sent)
    log_info("TOKEN precision: %.4f, Recall: %.4f, F1: %.4f"
             % evaluator.p_r_f1(EvalTypes.TOKEN))
    log_info("Sentence length stats:\n * GOLD %s\n * PRED %s\n * DIFF %s"
             % evaluator.size_stats())
    log_info("Common subphrase stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s"
             % evaluator.common_substruct_stats())
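
# Hedged example call, not part of the original module: eval_tokens expects,
# for each DA, a list of gold paraphrases and a single predicted sentence,
# with sentences as lists of token pairs. The shapes below mirror the zip()
# loops above; the DAs and token values are placeholders, not real data.
def _eval_tokens_example(das):
    gold = [[[('the', None), ('end', None)]]]  # 1 DA, 1 gold paraphrase
    pred = [[('the', None), ('end', None)]]    # 1 predicted sentence
    eval_tokens(das, gold, pred)               # logs BLEU, token P/R/F1, length stats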
def _rerank_paths(self, paths, da):
    """Rerank the n-best decoded paths according to the reranking classifier
    and/or BLEU against the context."""
    trees = [self.tree_embs.ids_to_tree(np.array(path.dec_inputs).transpose()[0])
             for path in paths]

    # rerank using BLEU against the context, if set to do so
    if self.context_bleu_weight:
        bm = BLEUMeasure(max_ngram=2)
        bleus = []
        for path, tree in zip(paths, trees):
            bm.reset()
            bm.append([(n.t_lemma, None) for n in tree.nodes[1:]], [da[0]])
            bleu = (bm.ngram_precision()
                    if self.context_bleu_metric == 'ngram_prec'
                    else bm.bleu())
            bleus.append(bleu)
            path.logprob += self.context_bleu_weight * bleu
        log_debug(("BLEU for context: %s\n\n" % " ".join([form for form, _ in da[0]])) +
                  "\n".join([("%.5f\t" % b) + " ".join([n.t_lemma for n in t.nodes[1:]])
                             for b, t in zip(bleus, trees)]))

    # subtract misfit distances from the logprob so that trees not fitting
    # the current DA are heavily penalized
    if self.classif_filter:
        self.classif_filter.init_run(da)
        fits = self.classif_filter.dist_to_cur_da(trees)
        for path, fit in zip(paths, fits):
            path.logprob -= self.misfit_penalty * fit
        log_debug(("Misfits for DA: %s\n\n" % str(da)) +
                  "\n".join([("%.5f\t" % fit) +
                             " ".join([unicode(n.t_lemma) for n in tree.nodes[1:]])
                             for fit, tree in zip(fits, trees)]))

    # normalize path scores for length, if set to do so
    if self.length_norm_weight:
        for path in paths:
            path.logprob /= len(path) ** self.length_norm_weight

    return sorted(paths, key=lambda path: path.logprob, reverse=True)
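
# Hedged sketch, not part of the original module: the scoring rule that
# _rerank_paths applies to each path, written out as a pure function. The
# weight names mirror the attributes used above (context_bleu_weight,
# misfit_penalty, length_norm_weight); the function itself is illustrative.
def _rerank_score(logprob, bleu, fit, path_len,
                  context_bleu_weight=0.0, misfit_penalty=0.0,
                  length_norm_weight=0.0):
    score = logprob
    if context_bleu_weight:
        score += context_bleu_weight * bleu      # reward similarity to the context
    if misfit_penalty:
        score -= misfit_penalty * fit            # penalize distance from the current DA
    if length_norm_weight:
        score /= path_len ** length_norm_weight  # normalize for path length
    return score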