def compute_bleu(net, word_dict, index_dict, tokens, initial=None, IM=None):
    """ Return BLEU scores for reference tokens
        For each reference caption, a candidate caption is sampled from net
    """
    bleu_scores = np.zeros((len(tokens), 3))
    for i, ref in enumerate(tokens):
        if initial is not None:
            init = copy.deepcopy(initial)
        else:
            init = None
        ref = ref[net.context:][:-1]
        if IM is not None:
            can = sample(net, word_dict, index_dict, len(ref), IM[i], initial=init)
        else:
            can = sample(net, word_dict, index_dict, len(ref), initial=init)
        # Compute BLEU using n = (1, 2, 3)
        n1 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs([ref], n=1), n=1)], n=1)
        n2 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs([ref], n=2), n=2)], n=2)
        n3 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs([ref], n=3), n=3)], n=3)
        bleu_scores[i] = [n1, n2, n3]
    return bleu_scores
def batch_bleu(cans, refs):
    """ cans : [ 'XXX', 'XXX', ... ]
        refs : [ ['XXX', 'XXX', ... ], ['XXX', 'XXX', ... ], ... ]
    """
    bleu_scores = np.zeros((len(cans), 3))
    for i, can in enumerate(cans):
        n1 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs(refs[i], n=1), n=1)], n=1)
        n2 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs(refs[i], n=2), n=2)], n=2)
        n3 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs(refs[i], n=3), n=3)], n=3)
        bleu_scores[i] = [n1, n2, n3]
    return bleu_scores
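# Minimal usage sketch for batch_bleu (illustrative, not from the original
# source). It assumes numpy is imported as np and the local cmert bleu module
# is importable; candidates and references are plain strings here, matching
# the shapes described in the docstring above.
cans = ["the cat sat on the mat", "a dog barked loudly"]
refs = [["the cat sat on a mat", "a cat was sitting on the mat"],
        ["the dog barked loudly"]]
scores = batch_bleu(cans, refs)  # scores[i] holds BLEU-1, BLEU-2, BLEU-3 for cans[i]
print scores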
def main():
    sys.path.append("../scripts/training/cmert-0.5")
    import bleu
    data_dir = "test_scorer_data"
    nbest_file = os.path.join(data_dir, "nbest.out")
    ref_file = os.path.join(data_dir, "reference.txt")
    bleu.preserve_case = False
    bleu.eff_ref_len = "shortest"
    bleu.nonorm = 0

    ref_fh = open(ref_file)
    cookedrefs = []
    for ref in ref_fh:
        cookedref = bleu.cook_refs([ref])
        cookedrefs.append(cookedref)
    ref_fh.close()

    nbest_fh = open(nbest_file)
    tests = []
    i = -1
    for line in nbest_fh:
        fields = line.split("||| ")
        current_i = int(fields[0])
        text = fields[1]
        if i != current_i:
            tests.append([])
            i = current_i
        tests[-1].append(text)
    nbest_fh.close()

    # score with first best
    cookedtests = []
    for i in range(len(tests)):
        sentence = tests[i][0]
        cookedtest = bleu.cook_test(sentence, cookedrefs[i])
        stats = " ".join(["%d %d" % (c, g) for (c, g) in
                          zip(cookedtest['correct'], cookedtest['guess'])])
        print " %s %d" % (stats, cookedtest['reflen'])
        cookedtests.append(cookedtest)
    bleu1 = bleu.score_cooked(cookedtests)

    # vary the selected hypotheses, and score again
    cookedtests = []
    for i in range(len(tests)):
        sentence = tests[i][0]
        if i == 7:
            sentence = tests[i][8]
        elif i == 1:
            # original read "sentences = tests[i][2]" (typo), so it never took effect
            sentence = tests[i][2]
        cookedtest = bleu.cook_test(sentence, cookedrefs[i])
        cookedtests.append(cookedtest)
    bleu2 = bleu.score_cooked(cookedtests)
    print "Bleus: ", bleu1, bleu2
        bleu.preserve_case = True
    if opt == "-a":
        bleu.eff_ref_len = "average"
    if opt == "-s":
        bleu.eff_ref_len = "shortest"
    if opt == "-e":
        bleu.eff_ref_len = "closest"
    if opt == "-n":
        bleu.nonorm = 1

print args

cookedrefs = []
reffiles = [file(name) for name in args[:-1]]
print reffiles
for refs in itertools.izip(*reffiles):
    cookedrefs.append(bleu.cook_refs(refs))

outprefix = args[-1]
featsfile = file(outprefix + "feats.opt", "w")
candsfile = file(outprefix + "cands.opt", "w")

cur_sentnum = None
testsents = set()
progress = 0
infile = sys.stdin

# function that recognizes floats
re_float = re.compile(r'^-?[-0-9.e\+]+$')
is_float = lambda x: re_float.match(x)
def mbr_best(lines, nbest, cost_weighting, rank_limit, rank_weight, sys_weights,
             sys_cost_bases=None, fast=False, addprec=1, eff_ref_len="average",
             n=4, cost_base=None, normalize_cost_base=False, per_system_norm=True):
    if sys_weights is None or len(sys_weights) == 0:
        no_syswt = True
        sys_weights = {}
    else:
        no_syswt = False
        sumsw = sum(sys_weights.itervalues())
        sys_weights = dict((s, (w / sumsw)) for s, w in sys_weights.iteritems())
    use_cost_base = (cost_base is not None or sys_cost_bases is not None)
    if use_cost_base and sys_cost_bases is None:
        sys_cost_bases = {}
    if nbest:
        start = 6  # added system also to beginning
    else:
        start = 1
    entries_per_system = {}
    # these are set by first/last in input, rather than assuming more positive -> better
    best_system_score = {}
    worst_system_score = {}
    max_system_cost = {}
    # cookedrefs = []
    ref_probs = []
    ref_syss = []
    splits = [line.split(None, start) for line in lines]
    hyps = [bleu.precook(s[start]) for s in splits]
    sump_sys = {}
    for split_ref in splits:
        sysname = split_ref[0]
        entries_per_system[sysname] = entries_per_system.get(sysname, 0) + 1
        if nbest:
            score = float(split_ref[5])
            cost = -score
            if sysname not in best_system_score:
                best_system_score[sysname] = score
            worst_system_score[sysname] = score
            if max_system_cost.get(sysname, 0) < cost:
                max_system_cost[sysname] = cost
    # diff_system_score = dict((s,w-best_system_score[s]) for s,w in worst_system_score.iteritems())
    sump = 0
    for split_ref in splits:
        p = 1.0
        sysname = split_ref[0]
        if not per_system_norm and sysname in sys_weights:
            # pdb.set_trace()
            p *= sys_weights[sysname]
        if nbest:
            if rank_weight is not None:
                rank = int(split_ref[4])
                p *= 1. / (rank_weight + rank)
            if use_cost_base and sysname in best_system_score:
                bss = best_system_score[sysname]
                score = float(split_ref[5]) - bss
                if normalize_cost_base:
                    diff = bss - worst_system_score[sysname]
                    # diff = diff_system_score[sysname]
                    if diff != 0:
                        score /= diff
                p *= math.pow(sys_cost_bases.get(sysname, cost_base), score)
            if cost_weighting and max_system_cost.get(sysname, 0) > 0.0:
                cost = -float(split_ref[5])
                p *= cost / max_system_cost[sysname]
        sump += p
        sump_sys[sysname] = sump_sys.get(sysname, 0) + p
        ref_probs.append(p)
        ref_syss.append(sysname)
    if per_system_norm:
        if no_syswt:
            wsys = 1. / len(sump_sys)
            sys_weights = dict.fromkeys(sump_sys.iterkeys(), wsys)
        # dump(sys_weights)
        assert isclose(sum(sys_weights.itervalues()), 1)
        mult_sys = dict((n, (sys_weights[n] / s)) for n, s in sump_sys.iteritems())
        # dump([(sum([p for n,p in zip(ref_syss,ref_probs) if n==sn]),sn,sump_sys[sn],mult_sys[sn]) for sn in sump_sys.keys()])
        ref_probs = [mult_sys[sn] * p for sn, p in itertools.izip(ref_syss, ref_probs)]
        dump(sum(ref_probs))
    elif sump != 1.0:
        oos = 1. / sump
        ref_probs = [p * oos for p in ref_probs]
    if fast:
        expected_ref = cook_expected_ref(hyps, n=n, p_refs=ref_probs, eff_ref_len=eff_ref_len)
    else:
        cookedrefs = [bleu.cook_refs([s[start]], n=n) for s in splits]
    max_items = []
    avg_bleu = []
    N = len(hyps)
    for test in xrange(N):
        split_test = splits[test]
        avg_test_bleu = 0.0
        if nbest and rank_limit is not None:
            test_rank = int(split_test[4])
            if test_rank > rank_limit:
                avg_bleu.append(avg_test_bleu)
                continue
        if fast:
            avg_test_bleu = score_vs_cooked(hyps[test], expected_ref, n=n, addprec=addprec)
        else:
            for ref in xrange(N):
                split_ref = splits[ref]
                factor = ref_probs[ref]
                # score=1.
                if ref != test:
                    score = score_vs_cooked(hyps[test], cookedrefs[ref], n=n, addprec=addprec)
                else:
                    # each system gets to vote for itself
                    score = 1.
                avg_test_bleu += ref_probs[ref] * score
        avg_bleu.append(avg_test_bleu)
        if len(max_items) == 0 or avg_test_bleu == avg_bleu[max_items[0]]:
            max_items.append(test)
        elif avg_test_bleu > avg_bleu[max_items[0]]:
            max_items = []
            max_items.append(test)
    dump(avg_bleu)
    # dump([x/avg_bleu[0] for x in avg_bleu])
    return max_items, avg_bleu
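# Minimal usage sketch for mbr_best (illustrative, not from the original
# source). It assumes the helpers it relies on (dump, isclose, score_vs_cooked,
# cook_expected_ref) and the local bleu module are defined elsewhere in this
# file. In the non-nbest mode each input line is taken as
# "<sysname> <hypothesis text>", so split(None, 1) yields system name and text.
lines = [
    "sysA the cat sat on the mat",
    "sysB a cat sat on a mat",
    "sysC the cat is on the mat",
]
best, expected = mbr_best(lines, nbest=False, cost_weighting=False,
                          rank_limit=None, rank_weight=None, sys_weights=None)
print "MBR consensus picks:", [lines[i].split(None, 1)[1] for i in best]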
    logbleu += min(0, 1 - (float(comps['reflen'] + 1)) / (comps['testlen'] + 1))
    return math.exp(logbleu)

if __name__ == "__main__":
    optparser = optparse.OptionParser()
    optparser.add_option("-m", "--map-file", dest="mapfilename",
                         help="map file indicating sentence number in reference set for each line of input")
    optparser.add_option("-b", "--brevity-penalty", dest="brevitypenalty",
                         action="store_true", help="assess brevity penalty")
    (opts, args) = optparser.parse_args()

    n = 4
    cookedrefs = []
    for lines in itertools.izip(*[file(filename) for filename in args[1:]]):
        cookedrefs.append(bleu.cook_refs(lines, n=n))

    if opts.mapfilename is not None:
        linemap = []
        for line in file(opts.mapfilename):
            linemap.append(int(line))
    else:
        linemap = range(len(cookedrefs))

    if args[0] == "-":
        infile = sys.stdin
    else:
        infile = open(args[0])
    test1 = []
    for (line, i) in itertools.izip(infile, linemap):
        test1.append(bleu.cook_test(line, cookedrefs[i], n=n))
bleu.normalize = normalize

# usage: bleu+1.py <test> <ref>+
if __name__ == "__main__":
    optparser = optparse.OptionParser()
    optparser.add_option("-m", "--map-file", dest="mapfilename",
                         help="map file indicating sentence number in reference set for each line of input")
    optparser.add_option("-b", "--brevity-penalty", dest="brevitypenalty",
                         action="store_true", help="assess brevity penalty")
    (opts, args) = optparser.parse_args()

    n = 4
    cookedrefs = []
    for lines in itertools.izip(*[file(filename) for filename in args[1:]]):
        cookedrefs.append(bleu.cook_refs([line.split() for line in lines], n=n))

    if opts.mapfilename is not None:
        linemap = []
        for line in file(opts.mapfilename):
            linemap.append(int(line))
    else:
        linemap = range(len(cookedrefs))

    test1 = []
    for (line, i) in itertools.izip(file(args[0]), linemap):
        test1.append(bleu.cook_test(line.split(), cookedrefs[i], n=n))

    total = 0.
    n_sent = 0
(opts, args) = getopt.getopt(sys.argv[1:], "rctpv", [])
for (opt, parm) in opts:
    if opt == "-c":
        bleu.preserve_case = True
    elif opt == "-t":
        bleu.nist_tokenize = False
    elif opt == "-p":
        bleu.clip_len = True
    elif opt == "-v":
        verbose = True

test1 = []
test2 = []
for lines in itertools.izip(*[file(filename) for filename in args]):
    cookedrefs = bleu.cook_refs(lines[2:])
    test1.append(bleu.cook_test(lines[0], cookedrefs))
    test2.append(bleu.cook_test(lines[1], cookedrefs))

score1 = bleu.score_cooked(test1)
print "System 1: %f" % score1
print "System 2: %f" % bleu.score_cooked(test2)

better = worse = 0
fake = test1[:]
for i in xrange(len(fake)):
    fake[i] = test2[i]
    fake_score = bleu.score_cooked(fake)
    if fake_score > score1:
        better += 1
modelweights = dict()
for wname, wval in weights.iteritems():
    if wname not in args.feats:
        modelweights[wname] = float(wval)

# write extra random weights to temp file
for point in range(args.randoms):
    startweights.write(' '.join(map(str, np.random.rand(len(args.feats) + 1))) + "\n")
startweights.close()

# cook references for comps
cookedrefs = []
for lines in itertools.izip(*(args.reference)):
    cookedrefs.append(bleu.cook_refs([line.split() for line in lines], n=bleun))

for line in infile:
    prefeats = parse_nbest(line.strip())
    feats = dd(lambda: "0")
    feats.update(prefeats)
    hyp = feats[hypkey].lstrip("{").rstrip("}")
    sent = int(feats[sentkey]) - 1
    # write hyp to temp file
    hypfile.write(hyp + "\n")
    # write id, components, features to tuning file
    tunefile.write("%d ||| " % sent)