def __init__(self, opts):
    """Load the models and the input sentences named by ``opts``.

    Args:
        opts: Options object; this method reads ``opts.tm``, ``opts.lm``
            and ``opts.input``, each of which is passed on as a path.

    Side effects:
        Sets ``self.opts``, ``self.tm``, ``self.lm`` and ``self.french``
        (a list with one tuple of whitespace-separated tokens per input
        line), and adds a pass-through entry to ``self.tm`` for every
        input word that has no single-word entry.
    """
    self.opts = opts
    # sys.maxsize replaces sys.maxint (removed in Python 3) and exists on
    # Python 2.6+ as well.  Presumably the second argument is an upper
    # bound on translations per phrase, i.e. "no limit" -- verify in models.
    self.tm = models.TM(opts.tm, sys.maxsize)
    self.lm = models.LM(opts.lm)

    # Use a context manager so the input file is closed deterministically.
    with open(opts.input) as infile:
        self.french = [tuple(line.strip().split()) for line in infile]

    # tm should translate unknown words as-is with probability 1
    # (built with a generator instead of sum(..., ()) to avoid quadratic
    # tuple concatenation).
    vocabulary = set(word for sentence in self.french for word in sentence)
    for word in vocabulary:
        if (word,) not in self.tm:
            self.tm[(word,)] = [models.phrase(word, 0.0)]
def main():
    """Sum ``sent_logp`` over paired French/English sentences and print it.

    French sentences are read from the ``--input`` file and English
    sentences from standard input, one sentence per line.  If the two
    line counts differ, an error is written to stderr and the process
    exits with status 1.  Otherwise the total is written to stdout as a
    single ``%f``-formatted line.
    """
    parser = argparse.ArgumentParser(
        description=
        'Compute unnormalized translation probability by marginalizing over alignments.'
    )
    parser.add_argument(
        '-i',
        '--input',
        dest='input',
        default='data/input',
        help='File containing sentences to translate (default=data/input)')
    parser.add_argument(
        '-t',
        '--translation-model',
        dest='tm',
        default='data/tm',
        help='File containing translation model (default=data/tm)')
    parser.add_argument(
        '-l',
        '--language-model',
        dest='lm',
        default='data/lm',
        help='File containing ARPA-format language model (default=data/lm)')
    opts = parser.parse_args()

    # sys.maxsize replaces sys.maxint (removed in Python 3) and is also
    # available on Python 2.6+.
    # NOTE(review): tm and lm are locals that are never passed on; confirm
    # how sent_logp (defined elsewhere) is expected to obtain the models.
    tm = models.TM(opts.tm, sys.maxsize)
    lm = models.LM(opts.lm)

    # Context manager so the input file is closed once it has been read.
    with open(opts.input) as infile:
        french_sents = [tuple(line.strip().split()) for line in infile]
    english_sents = [tuple(line.strip().split()) for line in sys.stdin]

    if len(french_sents) != len(english_sents):
        sys.stderr.write(
            "ERROR: French and English files are not the same length! Only complete output can be graded!\n"
        )
        sys.exit(1)

    total_logprob = 0.0
    # NOTE(review): this counter is never incremented inside main(), so the
    # error report below cannot fire from here -- confirm whether sent_logp
    # was meant to report unaligned sentences back to this function.
    unaligned_sentences = 0
    for sent_num, (f, e) in enumerate(zip(french_sents, english_sents)):
        total_logprob += sent_logp(sent_num, (f, e))

    if unaligned_sentences > 0:
        sys.stderr.write(
            "ERROR: There were %d unaligned sentences! Only sentences that align under the model can be graded!\n"
            % unaligned_sentences)

    sys.stdout.write("%f\n" % total_logprob)
import optparse
import sys
import models
from collections import namedtuple

# Command-line interface.
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input",
                     help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm",
                     help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm",
                     help="File containing ARPA-format language model (default=data/lm)")
# sys.maxsize replaces sys.maxint (removed in Python 3) and also exists on
# Python 2.6+; it is only used as a slice bound meaning "all sentences".
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxsize, type="int",
                     help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int",
                     help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int",
                     help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,
                     help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# NOTE(review): this unconditionally overrides whatever was given with -k,
# so the command-line value (and its documented default of 1) is ignored.
# Kept as-is to preserve current behaviour; confirm whether it is intended.
opts.k = 4

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)

# Read at most opts.num_sents sentences, one tuple of tokens per line, and
# close the input file as soon as it has been read.
with open(opts.input) as infile:
    french = [tuple(line.strip().split())
              for line in infile.readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
# (generator instead of sum(french, ()) avoids quadratic tuple concatenation).
for word in set(word for sentence in french for word in sentence):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# values for the model parameter
dd = 5
nn = -4
beta = 2
type="float", help="weight for language model") optparser.add_option("-b", "--beta", dest="beta", default=1.0, type="float", help="weight for translation model") optparser.add_option("-m", "--mute", dest="mute", default=0, type="int", help="mute the output") opts = optparser.parse_args()[0] tm = models.TM(opts.tm, opts.k, opts.mute) lm = models.LM(opts.lm, opts.mute) french = [ tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents] ] bound_width = float(opts.bwidth) hypothesis = namedtuple( "hypothesis", "lm_state, logprob, coverage, end, predecessor, phrase") def bitmap(sequence): """ Generate a coverage bitmap for a sequence of indexes """ return reduce(lambda x, y: x | y, map(lambda i: long('1' + '0' * i, 2), sequence), 0)
help="Try to resegment unknown words into two known words (default=off)") optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)") opts = optparser.parse_args()[0] lm = models.LM(opts.lm) # Training weights = [1.0 / 7] * 7 for i in range(opts.loop): tm = models.TM(opts.tmdev, opts.k, weights[:4], simpmode=opts.simplify) nbest_list = list( decode.get_candidates(opts.input, tm, lm, weights, stack_size=opts.s, verbose=opts.verbose, simpmode=opts.simplify, separate_unknown_words=opts.reseg_unknown)) weights = trainreranker.train(nbest_list, opts.reference.split(), weights) print weights results = rerank.rerank(weights, nbest_list) print >> sys.stderr, "TRAINING LOOP %d BLEU SCORE: %f:" % \ (i, scorereranker.score(results, opts.reference.split()))
help="File containing ARPA-format language model (default=data/lm)") argparser.add_argument("-o", "--output", dest="output", default="output", help="Ouput result file") argparser.add_argument("-n", "--num_sentences", dest="num_sents", default=2**64, type=int, help="Number of sentences to decode (default=2^64)") argparser.add_argument("-k", "--translations-per-phrase", dest="k", default=2**64, type=int, help="Limit on number of translations to consider per phrase (default=2^64)") argparser.add_argument("-b", "--beam-size", dest="beam_size", default=1000, type=int, help="Maximum beam size (default=1000)") argparser.add_argument("-dl", "--distortion-limit", dest="distortion_limit", default=10, type=int, help="Hard distortion limit (default=10)") argparser.add_argument("-dp", "--distortion-parameter", dest="distortion_parameter", default=-0.01, type=float, help="Soft distortion parameter (default=-0.01)") args = argparser.parse_args() tm = models.TM(args.tm, args.k) lm = models.LM(args.lm) french = [tuple(line.strip().split()) for line in open(args.input, 'r', encoding='utf-8').readlines()[:args.num_sents]] output = open(args.output, 'w', encoding='utf-8') # tm should translate unknown words as-is with probability 1 for word in set(sum(french,())): if (word,) not in tm: tm[(word,)] = [models.phrase(word, 0.0)] hypothesis = namedtuple("hypothesis", "logprob, state, predecessor, phrase") state = namedtuple("state", "e1, e2, bitString, r") ph = namedtuple("ph", "s, t, phrase") for f in french: initial_state = state(None, lm.begin(), 0, 0.0)
default=1, type="int", help="Verbosity level, 0-3 (default=1)") optparser.add_option("-o", "--logfile", dest="logfile", default=None, help="filename for logging output") opts = optparser.parse_args()[0] if opts.logfile: logging.basicConfig(filename=opts.logfile, filemode='w', level=logging.INFO) tm = models.TM(opts.tm, sys.maxint) lm = models.LM(opts.lm) french = [tuple(line.strip().split()) for line in open(opts.input).readlines()] english = [tuple(line.strip().split()) for line in sys.stdin] # tm should translate unknown words as-is with probability 1 for word in set(sum(french, ())): if (word, ) not in tm: tm[(word, )] = [models.phrase(word, 0.0)] def maybe_write(s, verbosity): if opts.logfile: logging.info(s) if opts.verbosity > verbosity: sys.stdout.write(s)
dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)") optparser.add_option( "-f", "--feedback-loop", dest="loop", default=10, help= "The number of times the weight vector loops between decoder and reranker") opts = optparser.parse_args()[0] weights = [1, 1, 1, 1, 1] lm = models.LM(opts.lm) tm = models.TM(opts.tmtest, opts.k, weights[0:-1]) nbest_list = decode.get_candidates(opts.eval, tm, lm, weights) results = [] pt = -1 for cand in nbest_list: (i, sentence, features) = cand.strip().split("|||") i = int(i) if i != pt: pt = i results += [sentence] file = open("output_d", "w") file.write("\n".join(results)) file.close()
opts.lm = "data/lm/en.gigaword.3g.filtered.train_dev_test.arpa.gz" opts.tm = "data/large/phrase-table/dev-filtered/rules_cnt.final.out" elif opts.dataset == "test": opts.input = "data/test/all.cn-en.cn" opts.lm = "data/lm/en.gigaword.3g.filtered.train_dev_test.arpa.gz" opts.tm = "data/large/phrase-table/test-filtered/rules_cnt.final.out" if opts.weights is None: weights = [1. / number_of_features] * number_of_features else: with open(opts.weights) as weights_file: weights = [float(line.strip()) for line in weights_file] # weights = map(lambda x: 1.0 if math.isnan(x) or x == float("-inf") or x == float("inf") or x == 0.0 else x, w) assert len(weights) == number_of_features if opts.simpmode: tm = models.TM(opts.tm, opts.k, weights, simpmode=True) else: tm = models.TM(opts.tm, opts.k, weights, simpmode=False) lm = models.LM(opts.lm) candidates = get_candidates(opts.input, tm, lm, weights, stack_size=opts.s, nbest=opts.nbest, simpmode=opts.simpmode, separate_unknown_words=opts.reseg_unknown, verbose=opts.verbose) for i in candidates: print i
def init(tmpath, lmpath):
    """Construct the translation model and language model from the given paths.

    The previous body ignored both parameters and read ``opts.tm`` and
    ``opts.lm`` from a global ``opts`` object instead; the arguments are
    now honoured.

    Args:
        tmpath: Passed as the first argument to ``models.TM``.
        lmpath: Passed as the only argument to ``models.LM``.

    Returns:
        A ``(tm, lm)`` tuple of the two constructed model objects.
    """
    # sys.maxsize replaces sys.maxint (removed in Python 3) and is also
    # available on Python 2.6+.
    tm = models.TM(tmpath, sys.maxsize)
    lm = models.LM(lmpath)
    return tm, lm