def test_score():
    cand = "中华人民共和国"
    ref = "中华人民共和国公民"
    bleu = Bleu(N_SIZE)
    bleu.add_inst(cand, ref)
    s = bleu.get_score()
    print('score: {}'.format(s))
def test_add_inst():
    cand = '13'
    ref = '13'
    bleu = Bleu(N_SIZE)
    bleu.add_inst(cand, ref)
    match_ngram = bleu.match_ngram
    candi_ngram = bleu.candi_ngram
    print('match_ngram: {}'.format(match_ngram))
    print('candi_ngram: {}'.format(candi_ngram))
def test_score():
    # init all arguments
    data = read_json()
    rouge_eval = RougeL()
    bleu_eval = Bleu()
    for idx, (ref_key, cand_key) in enumerate(data):
        ref_sent = data[idx][ref_key]
        cand_sent = data[idx][cand_key]
        rouge_eval.add_inst(cand_sent, ref_sent)
        bleu_eval.add_inst(cand_sent, ref_sent)
    bleu_score = bleu_eval.get_score()
    rouge_score = rouge_eval.get_score()
    print('bleu score: {}, rouge score: {}'.format(bleu_score, rouge_score))
def eval_captions(gt_captions, res_captions):
    """
    gt_captions = ground truth captions; 5 per image
    res_captions = captions generated by the model to be evaluated
    """
    print('ground truth captions')
    print(gt_captions)
    print('RES CAPTIONS')
    print(res_captions)
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
    ]
    res = []
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gt_captions, res_captions)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                print("%s: %0.3f" % (m, sc))
                res.append((m, sc))
        else:
            print("%s: %0.3f" % (method, score))
            res.append((method, score))
    return res
def main():
    from ngram import Ngram
    from model import Model
    from forest import Forest

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")

    argv = FLAGS(sys.argv)
    [outfile] = argv[1:]

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()
    false_decoder = CYKDecoder(weights, lm)
    out = utility.getfile(outfile, 1)
    old_bleu = Bleu()
    new_bleu = Bleu()

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
        oracle_forest, oracle_item = oracle_extracter(forest, weights, false_decoder, 100, 2, extract=100)
        print >> sys.stderr, "processed sent %s " % i
        oracle_forest.dump(out)

        bleu, hyp, fv, edgelist = forest.compute_oracle(weights, 0.0, 1)

        forest.bleu.rescore(hyp)
        old_bleu += forest.bleu
        forest.bleu.rescore(oracle_item[0].full_derivation)
        new_bleu += forest.bleu

        bad_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1)

        #for i in range(min(len(oracle_item), 5)):
        #    print >> sys.stderr, "Oracle Trans: %s %s %s" % (oracle_item[i].full_derivation, oracle_item[i].score, str(oracle_item[i].score[2]))
        #    print >> sys.stderr, "Oracle BLEU Score: %s" % (forest.bleu.rescore(oracle_item[i].full_derivation))

        print >> sys.stderr, "Oracle BLEU Score: %s" % (forest.bleu.rescore(oracle_item[0].full_derivation))
        print >> sys.stderr, "Worst new Oracle BLEU Score: %s" % (bad_bleu)
        print >> sys.stderr, "Old Oracle BLEU Score: %s" % (bleu)
        print >> sys.stderr, "Running Oracle BLEU Score: %s" % (new_bleu.compute_score())
        print >> sys.stderr, "Running Old Oracle BLEU Score: %s" % (old_bleu.compute_score())
def evaluate(self):
    cap = open(r'results.txt')
    cap_ = []
    for line in cap:
        line = line.split(' ')
        line[len(line) - 1] = '.'
        del line[0]
        print(line)
        cap_.append(line)

    gts = {}
    res = {}
    f = open("cap_flickr30k.json")
    captions = json.load(f)
    f1 = open("dic_flickr30k.json")
    dics = json.load(f1)
    dics = dics['images']
    pos = 0
    for i in range(0, len(dics), 1):
        if dics[i]['split'] == 'test':
            caption_1 = []
            caption_2 = []
            caption_1.append(captions[i][0]['caption'])
            res[dics[i]['id']] = caption_1
            caption_2.append(cap_[pos])
            caption_2.append(cap_[pos])
            gts[dics[i]['id']] = caption_2
            pos = pos + 1

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Cider(), "CIDEr")
    ]

    # =================================================
    # Compute scores
    # =================================================
    eval = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)
            print("%s: %0.3f" % (method, score))
def compute_bleu_rouge(pred_dict, ref_dict, bleu_order=4):
    """
    Compute bleu and rouge scores.
    """
    assert set(pred_dict.keys()) == set(ref_dict.keys()), \
        "missing keys: {}".format(set(ref_dict.keys()) - set(pred_dict.keys()))
    scores = {}
    bleu_scores, _ = Bleu(bleu_order).compute_score(ref_dict, pred_dict)
    for i, bleu_score in enumerate(bleu_scores):
        bleu_score *= 100
        scores['Bleu-%d' % (i + 1)] = bleu_score
    return scores
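# A minimal usage sketch for compute_bleu_rouge above (not from the original
# source). It assumes the imported Bleu is a pycocoevalcap-style scorer whose
# compute_score takes {id: [sentence, ...]} dicts with one prediction per id;
# the ids and sentences here are made up for illustration.
pred_dict = {'q1': ['the cat sat on the mat']}
ref_dict = {'q1': ['a cat is sitting on the mat']}
print(compute_bleu_rouge(pred_dict, ref_dict))  # e.g. {'Bleu-1': ..., ..., 'Bleu-4': ...}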
def evaluate(self):
    imgIds = self.params['image_id']
    # imgIds = self.coco.getImgIds()
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = self.coco.imgToAnns[imgId]
        res[imgId] = self.cocoRes.imgToAnns[imgId]

    # =================================================
    # Set up scorers
    # =================================================
    print('tokenization...')
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    # =================================================
    # Set up scorers
    # =================================================
    print('setting up scorers...')
    '''
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    '''
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]

    # =================================================
    # Compute scores
    # =================================================
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, scs, m in zip(score, scores, method):
                self.setEval(sc, m)
                self.setImgToEvalImgs(scs, gts.keys(), m)
                print("%s: %0.3f" % (m, sc))
        else:
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, gts.keys(), method)
            print("%s: %0.3f" % (method, score))
    self.setEvalImgs()
def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
def cal_avg_B4(custom_gts, custom_res):
    # input: tested sentences and the (top_N - 1) corresponding 'gt' sentences
    # return: the per-sentence BLEU-4 scores
    # calculate BLEU scores in the traditional way
    gts = tokenizer.tokenize(custom_gts)
    res = tokenizer.tokenize(custom_res)
    print('setting up scorers...')
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
    imgToEval = {}
    for scorer, method in scorers:
        print('computing %s score...' % (scorer.method()))
        if type(method) == list:
            score, scores, subgraph_training_bleu = scorer.compute_score(gts, res)
            for sc, scs, m in zip(score, scores, method):
                setImgToEvalImgs(scs, list(gts.keys()), m, imgToEval)
                print("%s: %0.3f" % (m, sc))
    B_4s = [imgToEval[sen_id]['Bleu_4'] for sen_id in custom_gts.keys()]
    return B_4s
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Rouge(), "ROUGE_L"),
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
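# Hypothetical call to score() above, following its docstring (not from the
# original source). The Bleu/Rouge classes in scope are assumed to be the
# pycocoevalcap-style scorers that accept {id: [sentence, ...]} dicts with a
# single hypothesis per id.
ref = {'0': ['a dog runs across the field', 'a dog is running on the grass']}
hypo = {'0': ['a dog running in a field']}
print(score(ref, hypo))  # e.g. {'Bleu_1': ..., 'Bleu_4': ..., 'ROUGE_L': ...}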
def __init__(self, alpha=0.5):
    self.simple_meteor = SimpleMeteor(alpha=alpha, beta=0.16)
    self.tri_bleu = Bleu(3)
    self.four_bleu = Bleu(4, beta=0.13)
    self.p = Preprocessor()
class MeteorBleu:
    """
    Prints features for all versions of Meteor and BLEU for every input sentence
    """

    def __init__(self, alpha=0.5):
        self.simple_meteor = SimpleMeteor(alpha=alpha, beta=0.16)
        self.tri_bleu = Bleu(3)
        self.four_bleu = Bleu(4, beta=0.13)
        self.p = Preprocessor()

    def features(self, tokline, posline):
        """
        The workhorse function
        Takes lists of tokens and postags for [h1, h2, ref]
        Returns feature values for h1, h2, h1-h2
        """
        features = []

        # Simple Meteor
        h1p, h2p, refp = self.p.preprocess(tokline, stem=False, lowercase=False)
        h1score = self.simple_meteor.score(h1p, refp)
        h2score = self.simple_meteor.score(h2p, refp)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # Simple Meteor, lowercased
        h1p, h2p, refp = self.p.preprocess(tokline, stem=False, lowercase=True)
        h1score = self.simple_meteor.score(h1p, refp)
        h2score = self.simple_meteor.score(h2p, refp)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # Simple Meteor, lowercased, stemmed
        h1p, h2p, refp = self.p.preprocess(tokline, stem=True, lowercase=True)
        h1score = self.simple_meteor.score(h1p, refp)
        h2score = self.simple_meteor.score(h2p, refp)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # Simple Meteor referencing sequence of postags, lowercased, stemmed
        h1p, h2p, refp = self.p.preprocess(tokline, stem=True, lowercase=True)
        h1pos, h2pos, refpos = self.p.preprocess(posline)
        h1score = self.simple_meteor.score(h1p, refp, postags=True, hpos=h1pos, refpos=refpos)
        h2score = self.simple_meteor.score(h2p, refp, postags=True, hpos=h2pos, refpos=refpos)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # trigram BLEU, lowercased, stemmed
        h1p, h2p, refp = self.p.preprocess(tokline, stem=True, lowercase=True)
        h1score = self.tri_bleu.score(h1p, refp)
        h2score = self.tri_bleu.score(h2p, refp)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # postag-smoothed 4-gram BLEU
        h1p, h2p, refp = self.p.preprocess(tokline, stem=False, lowercase=False)
        h1pos, h2pos, refpos = self.p.preprocess(posline)
        h1score = self.four_bleu.score(h1p, refp, postag=True, hpos=h1pos, refpos=refpos)
        h2score = self.four_bleu.score(h2p, refp, postag=True, hpos=h2pos, refpos=refpos)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # postag-smoothed 4-gram BLEU, lowercased
        h1p, h2p, refp = self.p.preprocess(tokline, stem=False, lowercase=True)
        h1pos, h2pos, refpos = self.p.preprocess(posline)
        h1score = self.four_bleu.score(h1p, refp, postag=True, hpos=h1pos, refpos=refpos)
        h2score = self.four_bleu.score(h2p, refp, postag=True, hpos=h2pos, refpos=refpos)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # postag-smoothed 4-gram BLEU, lowercased, stemmed
        h1p, h2p, refp = self.p.preprocess(tokline, stem=True, lowercase=True)
        h1pos, h2pos, refpos = self.p.preprocess(posline)
        h1score = self.four_bleu.score(h1p, refp, postag=True, hpos=h1pos, refpos=refpos)
        h2score = self.four_bleu.score(h2p, refp, postag=True, hpos=h2pos, refpos=refpos)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        # postag-smoothed 4-gram BLEU, lowercased, stemmed, weighted
        w = [10, 5, 2, 1]
        h1p, h2p, refp = self.p.preprocess(tokline, stem=True, lowercase=True)
        h1pos, h2pos, refpos = self.p.preprocess(posline)
        h1score = self.four_bleu.score(h1p, refp, postag=True, hpos=h1pos, refpos=refpos, wts=w)
        h2score = self.four_bleu.score(h2p, refp, postag=True, hpos=h2pos, refpos=refpos, wts=w)
        h1_h2 = h1score - h2score
        features += [h1score, h2score, h1_h2]

        return features

    def evaluate(self, h1score, h2score):
        """
        Scores hypothesis sentences based on scores
        Prints output
        """
        if h1score > h2score:
            print -1
        elif h1score == h2score:
            print 0
        else:
            print 1
help="print result for each sentence", default=False) optparser.add_option("", "--defaultnbest", dest="defaultnbest", help="default nbests", metavar="FILE", default=None) (opts, args) = optparser.parse_args() if opts.weights is not None: weights = get_weights(opts.weights) else: weights = Vector("lm1=2 gt_prob=1") extra_feats = None # prep_features(args) decoder = LocalDecoder() #BUDecoder(opts.k, extra_feats, check_feats=False) decoder.set_feats(extra_feats) all_pp = Bleu() # Parseval(), now BLEU decode_time, parseval_time = 0, 0 sum_score = 0 if opts.defaultnbest: defaultnbests = defaultdict(lambda : []) for line in open(opts.defaultnbest): defaultnbests[int(line.split()[0])].append(line.strip()) for i, forest in enumerate(decoder.load("-")): if forest is None: print >> logs, "forest %d is empty" % (i+1) if opts.defaultnbest: for line in defaultnbests[i][:opts.k]: print line
from ngram import Ngram  # defines --lm and --order

argv = FLAGS(sys.argv)

if FLAGS.prob is None and FLAGS.ratio is None:
    print >> logs, "Error: must specify pruning threshold by -p or ratio by -r" + str(FLAGS)
    sys.exit(1)

weights = Model.cmdline_model()
lm = Ngram.cmdline_ngram()  # if FLAGS.lm is None then returns None
if lm:
    weights["lm1"] = weights["lm"] * FLAGS.lmratio

onebestscores = 0
onebestbleus = Bleu()
myscores = 0
myoraclebleus = Bleu()

total_nodes = total_edges = old_nodes = old_edges = 0

for i, forest in enumerate(Forest.load("-", lm=lm), 1):
    if forest is None:
        print
        continue

    prune(forest, weights, FLAGS.prob, FLAGS.ratio)

    score, hyp, fv = forest.root.bestres
    forest.bleu.rescore(hyp)
def test_count_bp():
    cand = '我是中国人'
    ref = '重视啊啊啊啊我啊啊我了'
    bleu = Bleu(N_SIZE)
    bp = bleu.count_bp(cand, ref)
    print('BP: {}'.format(bp))
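# For reference, a standalone sketch of the standard BLEU brevity penalty,
# BP = 1 if c > r else exp(1 - r/c), with c the candidate length and r the
# reference length. Whether Bleu.count_bp above follows exactly this
# definition is an assumption; this helper is illustrative only.
import math

def brevity_penalty(cand_len, ref_len):
    if cand_len == 0:
        return 0.0
    return 1.0 if cand_len > ref_len else math.exp(1.0 - float(ref_len) / cand_len)

print(brevity_penalty(len('我是中国人'), len('重视啊啊啊啊我啊啊我了')))  # 5 vs 10 -> exp(-1) ~= 0.37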
bylinefile = open(opts.byline)

reffiles = [open(f) for f in args]  ## the remaining inputs are assumed to be refs

## print >> logs, "rules file %s" % rulefile
## print >> logs, "source file %s" % srcfile
## print >> logs, "byline file %s" % bylinefile
## print >> logs, "re files %s" % " ".join(map(str, reffiles))

# lhuang: n-gram order = 4
theoracle = oracle.Oracle(4, variant="ibm")

hopebleus = collections.defaultdict(lambda: Bleu())
hopescores = collections.defaultdict(lambda: [])

onebestbleus = Bleu()
onebestscores = []

for i, (srcline, byline, forestline) in \
        enumerate(itertools.izip(srcfile, bylinefile, forestfile)):

    reflines = [f.readline() for f in reffiles]

    rules = read_rules(opts.rules)
    if opts.extrarules:
        rules = read_rules(opts.extrarules, rules)

    if forestline.strip() == "":  ## empty forest (pure byline)
        forestline = "(0<gt_prob:0> )"

    f = forest_from_text(forestline)
def main():
    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()
    LMState.init(lm, weights)

    decoder = Decoder()

    tot_bleu = Bleu()
    tot_score = 0.
    tot_time = 0.
    tot_len = tot_fnodes = tot_fedges = 0
    tot_steps = tot_states = tot_edges = tot_stacks = 0

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
        t = time.time()

        best, final_items = decoder.beam_search(forest, b=FLAGS.beam)
        score, trans, fv = best.score, best.trans(), best.get_fvector()

        t = time.time() - t
        tot_time += t
        tot_score += score

        forest.bleu.rescore(trans)
        tot_bleu += forest.bleu

        fnodes, fedges = forest.size()
        tot_len += len(forest.sent)
        tot_fnodes += fnodes
        tot_fedges += fedges
        tot_steps += decoder.max_step
        tot_states += decoder.num_states
        tot_edges += decoder.num_edges
        tot_stacks += decoder.num_stacks

        print >> logs, ("sent %d, b %d\tscore %.4f\tbleu+1 %s" + \
                        "\ttime %.3f\tsentlen %-3d fnodes %-4d fedges %-5d\tstep %d states %d edges %d stacks %d") % \
                       (i, FLAGS.beam, score, forest.bleu.score_ratio_str(), t, len(forest.sent), fnodes, fedges,
                        decoder.max_step, decoder.num_states, decoder.num_edges, decoder.num_stacks)

        if FLAGS.k > 1 or FLAGS.forest:
            lmforest = best.toforest(forest)
            if FLAGS.forest:
                lmforest.dump()
            if FLAGS.k > 1:
                lmforest.lazykbest(FLAGS.k, weights=weights)
                klist = lmforest.root.klist
                if not FLAGS.mert:
                    for j, (sc, tr, fv) in enumerate(klist, 1):
                        print >> logs, "k=%d score=%.4f fv=%s\n%s" % (j, sc, fv, tr)
        else:
            klist = [(best.score, best.trans(), best.get_fvector())]

        if FLAGS.mert:
            # <score>... <hyp> ...
            print >> logs, '<sent No="%d">' % i
            print >> logs, "<Chinese>%s</Chinese>" % " ".join(forest.cased_sent)
            for sc, tr, fv in klist:
                print >> logs, "<score>%.3lf</score>" % sc
                print >> logs, "<hyp>%s</hyp>" % tr
                print >> logs, "<cost>%s</cost>" % fv
            print >> logs, "</sent>"

        if not FLAGS.forest:
            print trans

    print >> logs, "avg %d sentences, first pass score: %.4f, bleu: %s" % \
          (i, decoder.firstpassscore/i, decoder.firstpassbleus.score_ratio_str())

    print >> logs, ("avg %d sentences, b %d\tscore %.4lf\tbleu %s\ttime %.3f" + \
                    "\tsentlen %.1f fnodes %.1f fedges %.1f\tstep %.1f states %.1f edges %.1f stacks %.1f") % \
                   (i, FLAGS.beam, tot_score/i, tot_bleu.score_ratio_str(), tot_time/i,
                    tot_len/i, tot_fnodes/i, tot_fedges/i,
                    tot_steps/i, tot_states/i, tot_edges/i, tot_stacks/i)

    print >> logs, LMState.cachehits, LMState.cachemiss
best_devscore = -1

print >> logs, "starting perceptron at", time.ctime()

for it in xrange(opts.iterations):

    print >> logs, "iteration %d" % (it+1), "= = " * 20
    print >> logs, "hope weight on modelcost = %lf" % opts.hope

    iterstart = time.time()

    if opts.shuffle:
        ## TODO: randomize
        pass

    parseval = Bleu()
    num_updates = 0
    avgtime = 0

    decoder.reset()

    if not preloaded:
        trainforests = decoder.load(opts.trainfile)

    for i, forest in enumerate(trainforests):

        decoder.do_oracle(forest, weights)
        print >> logs, " iteration %d, example %d" % (it+1, i+1), "-" * 5, "oracle = %.4lf" % forest.oracle_bleu_score,

        updated, pp, deltafv = one_example(forest, weights)
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
def __init__(self):
    self.scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])]
def test_score():
    cand = "中华人民共和国"
    ref = "中华人民共和国公民"
    bleu = Bleu(N_SIZE)
    s = bleu.score(cand, ref)
    print('score: {}'.format(s))
from rouge import Rouge
import argparse
import logging
from ReadingComprehension.IterativeReattentionAligner.e2e_encoder import MnemicReader as e2e_MnemicReader
import cProfile, pstats, io
from utils import *
from InformationRetrieval.AttentionRM.modules import AttentionRM
from EndToEndModel.modules import EndToEndModel
from nltk.translate.bleu_score import sentence_bleu
import re
import pickle
from CSMrouge import RRRouge
from bleu import Bleu

stoplist = set(['.', ',', '...', '..'])
bleu_obj = Bleu(4)

def add_arguments(parser):
    parser.add_argument("train_file", help="File that contains training data")
    parser.add_argument("dev_file", help="File that contains dev data")
    parser.add_argument("embedding_file", help="File that contains pre-trained embeddings")
    parser.add_argument('--dicts_dir', type=str, default=None, help='Directory containing the word dictionaries')
    parser.add_argument('--seed', type=int, default=6, help='Random seed for the experiment')
    parser.add_argument('--epochs', type=int, default=20, help='Train data iterations')
    parser.add_argument('--train_batch_size', type=int, default=32, help='Batch size for training')
    parser.add_argument('--dev_batch_size', type=int, default=32, help='Batch size for dev')
    parser.add_argument('--hidden_size', type=int, default=100, help='Hidden size for LSTM')
    parser.add_argument('--num_layers', type=int, default=1, help='Number of layers for LSTM')
    parser.add_argument('--char_emb_size', type=int, default=50, help='Embedding size for characters')
    parser.add_argument('--pos_emb_size', type=int, default=50, help='Embedding size for pos tags')
    parser.add_argument('--ner_emb_size', type=int, default=50, help='Embedding size for ner')