#(imports required by the code in this section)
import argparse
import glob
import itertools
import os
import pickle
import shutil
import socket
import subprocess
import sys
import time
import xmlrpc.client
from urllib.parse import unquote_plus

import timbl #python-timbl
from colibricore import ClassEncoder, ClassDecoder, IndexedCorpus, IndexedPatternModel, PatternModelOptions

#NOTE (assumption): AlignmentModel, PhraseTable, GizaModel, Configuration and
#EXEC_MOSES are used below but defined elsewhere in the codebase; their import
#paths are not shown in this section.


def main():
    parser = argparse.ArgumentParser(description="Wrapper around the Moses Decoder that adds support for context features through classifiers.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--inputfile', type=str, help="Input text file; the test corpus (plain text, tokenised, one sentence per line), may be specified multiple times for each factor", action='append', required=False)
    parser.add_argument('-d', '--devinputfile', type=str, help="Extra input text file to consider when training classifiers; the development corpus (plain text, tokenised, one sentence per line)", action='store', required=False)
    parser.add_argument('-S', '--sourceclassfile', type=str, help="Source class file", action='store', required=True)
    parser.add_argument('-T', '--targetclassfile', type=str, help="Target class file", action='store', required=True)
    parser.add_argument('-a', '--alignmodelfile', type=str, help="Colibri alignment model (made from phrase translation table)", action='store', default="", required=False)
    parser.add_argument('-w', '--workdir', type=str, help="Working directory, should contain classifier training files", action='store', default="", required=True)
    parser.add_argument('--train', help="Train classifiers", action="store_true", default=False)
    #parser.add_argument('-O','--timbloptions', type=str, help="Options for the Timbl classifier", action="store", default="-a 0 -k 1")
    parser.add_argument('--ta', type=str, help="Timbl algorithm", action="store", default="0")
    parser.add_argument('--tk', type=str, help="Timbl k value", action="store", default="1")
    parser.add_argument('--tw', type=str, help="Timbl weighting", action="store", default="gr")
    parser.add_argument('--tm', type=str, help="Timbl feature metrics", action="store", default="O")
    parser.add_argument('--td', type=str, help="Timbl distance metric", action="store", default="Z")
    parser.add_argument('-I', '--ignoreclassifier', help="Ignore classifier (for testing bypass method)", action="store_true", default=False)
    parser.add_argument('-H', '--scorehandling', type=str, help="Score handling, can be 'append' (default), 'replace', or 'weighed'", action="store", default="append")
    parser.add_argument('--mosesinclusive', help="Pass full sentences through the Moses server using XML input (will start a Moses server, requires --lm). Classifier output competes with the normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true', default=False)
    parser.add_argument('--mosesexclusive', help="Pass full sentences through the Moses server using XML input (will start a Moses server, requires --lm). Classifier does NOT compete with the normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true', default=False)
    parser.add_argument('--mosesdir', type=str, help='Path to Moses directory (required for MERT)', default="")
    parser.add_argument('--mert', type=int, help="Do MERT parameter tuning, set to number of MERT runs to perform", required=False, default=0)
    parser.add_argument('--threads', type=int, default=1, help="Number of threads to use for Moses or MERT")
    parser.add_argument('--reordering', type=str, action="store", help="Reordering type (use with --reorderingtable)", required=False)
    parser.add_argument('--reorderingtable', type=str, action="store", help="Use reordering table (use with --reordering)", required=False)
    parser.add_argument('--ref', type=str, action="store", help="Reference corpus (target corpus, plain text)", required=False)
    parser.add_argument('--lm', type=str, help="Language Model", action="store", default="", required=False)
    parser.add_argument('--lmorder', type=int, help="Language Model order", action="store", default=3, required=False)
    parser.add_argument('--lmweight', type=float, help="Language Model weight", action="store", default=0.5, required=False)
    parser.add_argument('--dweight', type=float, help="Distortion Model weight", action="store", default=0.3, required=False)
    parser.add_argument('--wweight', type=float, help="Word penalty weight", action="store", default=-1, required=False)
    parser.add_argument('--tweight', type=float, help="Translation Model weight (may be specified multiple times for each score making up the translation model)", action="append", required=False)
    parser.add_argument('--reorderingweight', type=float, help="Reordering Model weight (may be specified multiple times for each score making up the reordering model)", action="append", required=False)
    parser.add_argument('--pweight', type=float, help="Phrase penalty", default=0.2, action="store", required=False)
    parser.add_argument('--classifierdir', type=str, help="Trained classifiers, intermediate phrase-table and test file will be written here (only specify if you want a different location than the work directory)", action='store', default="", required=False)
    parser.add_argument('--decodedir', type=str, help="Moses output will be written here (only specify if you want a different location than the work directory)", action='store', default="", required=False)
    parser.add_argument('--skipdecoder', action="store_true", default=False)
    parser.add_argument('--ignoreerrors', action="store_true", help="Attempt to ignore errors", default=False)
    parser.add_argument('--mosesport', type=int, help="Port for Moses server (will be started for you), if --mosesinclusive or --mosesexclusive is enabled", action='store', default=8080)
    args = parser.parse_args()
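    # Example invocations (illustrative only; every file name below is hypothetical):
    #
    #   1) train the classifiers, constrained by the test set:
    #      contextmoses.py --train -w work -S train.en.colibri.cls -T train.fr.colibri.cls \
    #          -a train.colibri.alignmodel -f test.en.txt
    #
    #   2) build the intermediate phrase-table and decode:
    #      contextmoses.py -w work -S train.en.colibri.cls -T train.fr.colibri.cls \
    #          -a train.colibri.alignmodel -f test.en.txt --lm target.lm --lmorder 3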
    if not os.path.isdir(args.workdir) or not os.path.exists(args.workdir + '/classifier.conf'):
        print("Work directory " + args.workdir + " or classifier configuration therein does not exist. Did you extract features and create classifier training files using colibri-extractfeatures?", file=sys.stderr)
        sys.exit(2)

    if args.classifierdir:
        classifierdir = args.classifierdir
    else:
        classifierdir = args.workdir
    if not classifierdir:
        classifierdir = os.getcwd()
    elif classifierdir[0] != '/':
        classifierdir = os.getcwd() + '/' + classifierdir

    if args.mert and not args.mosesdir:
        print("--mert requires --mosesdir to be set", file=sys.stderr)
        sys.exit(2)
    if args.mert and not args.ref:
        print("--mert requires --ref to be set", file=sys.stderr)
        sys.exit(2)

    if args.decodedir:
        decodedir = args.decodedir
    else:
        decodedir = args.workdir
    if not decodedir:
        decodedir = os.getcwd()
    elif decodedir[0] != '/':
        decodedir = os.getcwd() + '/' + decodedir

    print("Loading configuration (training corpora and class decoders)", file=sys.stderr)
    with open(args.workdir + '/classifier.conf', 'rb') as f:
        classifierconf = pickle.load(f)
    print("Configuration: ", classifierconf, file=sys.stderr)

    if args.inputfile:
        if len(classifierconf['featureconf']) > len(args.inputfile):
            raise Exception("Number of input files (" + str(len(args.inputfile)) + ") is less than the number of factor-features in configuration (" + str(len(classifierconf['featureconf'])) + "), you need to specify all")

    #one for each factor
    sourceencoders = []
    if args.inputfile:
        l = []
        for i, (inputfile, conf) in enumerate(zip(args.inputfile, classifierconf['featureconf'])):
            trainclassfile = conf['classdecoder']
            print("Processing factor #" + str(i), file=sys.stderr)
            #process inputfile
            corpusfile = os.path.basename(inputfile).replace('.txt', '') + '.colibri.dat'
            classfile = os.path.basename(inputfile).replace('.txt', '') + '.colibri.cls'
            #if os.path.exists(corpusfiles[i]) and os.path.exists(classfiles[i]):
            #    print("Notice: Re-using previously generated corpusfile and classfile",file=sys.stderr)
            #    print("Loading source class encoder and decoder",file=sys.stderr)
            #    sourceencoders.append( ClassEncoder(classfiles[i]) )
            #    sourcedecoders.append( ClassDecoder(classfiles[i]) )
            #else:
            print("Loading and extending source class encoder, from " + trainclassfile + " to " + classfile, file=sys.stderr)
            sourceencoders.append(ClassEncoder(trainclassfile))
            sourceencoders[i].processcorpus(inputfile)
            if i == 0 and args.devinputfile:
                print("(including development corpus in extended class encoder)", file=sys.stderr)
                sourceencoders[i].processcorpus(args.devinputfile)
            sourceencoders[i].buildclasses()
            sourceencoders[i].save(classfile)
            print("Encoding test corpus, from " + inputfile + " to " + corpusfile, file=sys.stderr)
            sourceencoders[i].encodefile(inputfile, corpusfile)
            if i == 0 and args.devinputfile:
                print("Encoding development corpus, from " + args.devinputfile + " to " + args.devinputfile + '.colibri.dat', file=sys.stderr)
                sourceencoders[i].encodefile(args.devinputfile, args.devinputfile + '.colibri.dat')
            print("Loading source class decoder " + classfile, file=sys.stderr)
            sourcedecoder = ClassDecoder(classfile)
            print("Loading test corpus " + corpusfile, file=sys.stderr)
            l.append(Configuration(IndexedCorpus(corpusfile), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext']))
        classifierconf['featureconf'] = l
    else:
        print("Loading source class decoders", file=sys.stderr)
        l = []
        for conf in classifierconf['featureconf']:
            sourcedecoder = ClassDecoder(conf['classdecoder'])
            l.append(Configuration(IndexedCorpus(), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext']))
        classifierconf['featureconf'] = l

    if args.inputfile and args.alignmodelfile:
        print("Loading target encoder " + args.targetclassfile, file=sys.stderr)
        targetencoder = ClassEncoder(args.targetclassfile)
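        # The target class encoder/decoder map between target-language words and the
        # compact integer class codes that colibri-core uses internally (stored in
        # the .colibri.cls file).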
        print("Loading target decoder " + args.targetclassfile, file=sys.stderr)
        targetdecoder = ClassDecoder(args.targetclassfile)
        print("Loading alignment model " + args.alignmodelfile, file=sys.stderr)
        alignmodel = AlignmentModel(args.alignmodelfile)
        print("\tAlignment model has " + str(len(alignmodel)) + " source patterns", file=sys.stderr)

        print("Building patternmodel on test corpus " + classifierconf['featureconf'][0].corpus.filename(), file=sys.stderr)
        options = PatternModelOptions(mintokens=1, maxlength=12, debug=True)
        testmodel = IndexedPatternModel(reverseindex=classifierconf['featureconf'][0].corpus)
        testmodel.train("", options, alignmodel)
        print("\tTest model has " + str(len(testmodel)) + " source patterns", file=sys.stderr)
        #saving just so we can inspect it for debug purposes:
        testmodel.write(decodedir + '/test.colibri.indexedpatternmodel')

        if args.devinputfile:
            print("Building patternmodel on development corpus " + args.devinputfile + ".colibri.dat", file=sys.stderr)
            devcorpus = IndexedCorpus(args.devinputfile + ".colibri.dat")
            print("Development corpus has " + str(devcorpus.sentences()) + " sentences", file=sys.stderr)
            devmodel = IndexedPatternModel(reverseindex=devcorpus)
            devmodel.train("", options, alignmodel)
            print("\tDevelopment model has " + str(len(devmodel)) + " source patterns", file=sys.stderr)
            #saving just so we can inspect it for debug purposes:
            devmodel.write(decodedir + '/dev.colibri.indexedpatternmodel')
        else:
            devmodel = {}

        if args.reorderingtable:
            print("Loading reordering model (may take a while)", file=sys.stderr)
            rtable = PhraseTable(args.reorderingtable) #TODO: convert to colibri alignmodel
    elif args.train and args.inputfile:
        if not args.alignmodelfile:
            print("No alignment model specified (-a)", file=sys.stderr)
            sys.exit(2)
    elif not args.train:
        if not args.inputfile:
            print("No input file specified (-f)", file=sys.stderr)
        if not args.alignmodelfile:
            print("No alignment model specified (-a)", file=sys.stderr)
        sys.exit(2)

    if args.train:
        #training mode
        if args.inputfile:
            print("Training classifiers (constrained by test data)", file=sys.stderr)
        else:
            print("Training all classifiers (you may want to constrain by test data using -f)", file=sys.stderr)

        if 'monolithic' in classifierconf and classifierconf['monolithic']:
            #monolithic
            trainfile = args.workdir + "/train"
            #build a classifier
            print("Training monolithic classifier " + trainfile, file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            if args.classifierdir:
                #ugly hack since we want ibases in a different location
                trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                shutil.copyfile(trainfile + ".train", trainfilecopy + ".train")
                trainfile = trainfilecopy
            classifier = timbl.TimblClassifier(trainfile, timbloptions)
            classifier.train()
            classifier.save()
            if args.classifierdir:
                #remove copy
                os.unlink(trainfile + ".train")
            trained = 1
        else:
            #experts
            trained = 0
            for trainfile in itertools.chain(glob.glob(args.workdir + "/*.train"), glob.glob(args.workdir + "/.*.train")): #explicitly add 'dotfiles', will be skipped by default
                if args.inputfile:
                    sourcepattern_s = unquote_plus(os.path.basename(trainfile.replace('.train', '')))
                    sourcepattern = sourceencoders[0].buildpattern(sourcepattern_s)
                    if sourcepattern not in testmodel and sourcepattern not in devmodel:
                        print("Skipping " + trainfile + " (\"" + sourcepattern_s + "\" not in test/dev model)", file=sys.stderr)
                        continue
                #build a classifier
                print("Training " + trainfile, file=sys.stderr)
                trained += 1
                timbloptions = gettimbloptions(args, classifierconf)
                if args.classifierdir:
                    #ugly hack since we want ibases in a different location
                    trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                    shutil.copyfile(trainfile, trainfilecopy)
                    trainfile = trainfilecopy
                classifier = timbl.TimblClassifier(trainfile.replace('.train', ''), timbloptions)
                classifier.train()
                classifier.save()
                if args.classifierdir:
                    #remove copy
                    os.unlink(trainfile)
                if not os.path.exists(trainfile.replace(".train", ".ibase")):
                    raise Exception("Resulting instance base " + trainfile.replace(".train", ".ibase") + " not found!")
        with open(classifierdir + '/trained', 'w', encoding='utf-8') as f:
            f.write(str(trained) + "\n")
    else:
        #TEST
        if not args.inputfile:
            print("Specify an input file (-f)", file=sys.stderr)
            sys.exit(2)

        if not args.mosesinclusive and not args.mosesexclusive:
            print("Writing intermediate test data to " + decodedir + "/test.txt", file=sys.stderr)
            #write intermediate test data (consisting only of indices AND unknown words)
            f = open(decodedir + "/test.txt", 'w', encoding='utf-8')
            for sentencenum, line in enumerate(classifierconf['featureconf'][0].corpus.sentences()):
                sentenceindex = sentencenum + 1
                print("@" + str(sentenceindex), file=sys.stderr)
                tokens = [] #actual string representations
                for tokenindex, pattern in enumerate(line): #will yield only unigrams
                    #is this an uncovered word that does not appear in the phrasetable?
                    #check using the alignment model and keep the word untranslated if so
                    if pattern not in alignmodel:
                        print(" Found OOV at @" + str(sentenceindex) + ":" + str(tokenindex) + ": " + pattern.tostring(classifierconf['featureconf'][0].classdecoder), file=sys.stderr)
                        tokens.append(pattern.tostring(classifierconf['featureconf'][0].classdecoder))
                    else:
                        tokens.append(str(sentenceindex) + "_" + str(tokenindex))
                f.write(" ".join(tokens) + "\n")
            f.close()

        classifierindex = set()
        if classifierconf['monolithic']:
            print("Loading classifier index for monolithic classifier", file=sys.stderr)
            with open(args.workdir + "/sourcepatterns.list", 'r', encoding='utf-8') as f:
                for line in f:
                    classifierindex.add(line.strip())
            print("Loading monolithic classifier " + classifierdir + "/train.train", file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            classifier = timbl.TimblClassifier(classifierdir + "/train", timbloptions)
        else:
            classifier = None

        if args.reorderingtable:
            print("Creating intermediate phrase-table and reordering-table", file=sys.stderr)
            freordering = open(decodedir + "/reordering-table", 'w', encoding='utf-8')
        else:
            print("Creating intermediate phrase-table", file=sys.stderr)
            freordering = None

        if args.mosesinclusive or args.mosesexclusive:
            #Use mosesserver with XML input method
            if not args.tweight:
                lentweights = 4
                tweights = " ".join([str(1 / (lentweights + 1))] * lentweights)
            else:
                tweights = " ".join([str(x) for x in args.tweight])
                lentweights = len(args.tweight)
            if os.path.exists(decodedir + "/moses.ini"):
                os.unlink(decodedir + "/moses.ini")
            print("Writing " + decodedir + "/moses.ini", file=sys.stderr)
            if args.reordering:
                reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
                reorderingweight = "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
            else:
                reorderingfeature = ""
                reorderingweight = ""
            #write moses.ini
            f = open(decodedir + '/moses.ini', 'w', encoding='utf-8')
            f.write("""
#Moses INI, produced by contextmoses.py

[input-factors]
0

[mapping]
0 T 0

[distortion-limit]
6

[feature]
UnknownWordPenalty
WordPenalty
PhrasePenalty
PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
{reorderingfeature}
Distortion
SRILM name=LM0 factor=0 path={lm} order={lmorder}

[weight]
UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
{reorderingweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight=args.lmweight, dweight=args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight=args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))
            f.close()

            print("Starting Moses Server", file=sys.stderr)
            if args.mosesdir:
                cmd = args.mosesdir + '/bin/mosesserver'
            else:
                cmd = 'mosesserver'
            if args.mosesinclusive:
                cmd += " -xml-input inclusive" #compete with phrase-table
            elif args.mosesexclusive:
                cmd += " -xml-input exclusive" #only used for passing verbatim L2 (tested whether it makes a difference with inclusive baseline on en-es data, it doesn't)
            cmd += ' -f ' + decodedir + '/moses.ini'
            print("Calling mosesserver: " + cmd, file=sys.stderr)
            p = subprocess.Popen(cmd, shell=True)
            mosesserverpid = p.pid
            while True:
                time.sleep(5)
                try:
                    s = socket.socket()
                    s.connect(("localhost", args.mosesport))
                    break
                except Exception as e:
                    print("Waiting for Moses server....", e, file=sys.stderr)
            print("Connecting to Moses Server", file=sys.stderr)
            mosesclient = xmlrpc.client.ServerProxy("http://*****:*****") #(server URL redacted in source)

            # [Gap in the source: the span following the XML-RPC connection was lost to
            #  redaction. It opened the intermediate phrase-table (ftable) and looped over
            #  the source patterns of the test set, defining i, sourcepatterncount,
            #  sentenceindex, tokenindex, tokenspan, sourcepattern, sourcepattern_s,
            #  featurevector and translationcount, which are used below.]
            print("@" + str(i + 1) + "/" + str(sourcepatterncount) + " -- Processing " + str(sentenceindex) + ":" + str(tokenindex) + " " + sourcepattern_s + " -- Features: " + str(repr(featurevector)), file=sys.stderr)

            if classifier and not args.ignoreclassifier:
                if not classifierconf['monolithic'] or (classifierconf['monolithic'] and sourcepattern_s in classifierindex):
                    print("\tClassifying", file=sys.stderr)
                    #call classifier
                    classlabel, distribution, distance = classifier.classify(featurevector)
                    #process classifier result
                    for targetpattern_s, score in distribution.items():
                        targetpattern = targetencoder.buildpattern(targetpattern_s)
                        if (sourcepattern, targetpattern) in alignmodel:
                            scorevector = [x for x in alignmodel[(sourcepattern, targetpattern)] if isinstance(x, int) or isinstance(x, float)] #make a copy
                        else:
                            continue
                        if args.scorehandling == 'append':
                            scorevector.append(score)
                        elif args.scorehandling == 'replace':
                            scorevector[2] = score
                        else:
                            raise NotImplementedError #TODO: implement 'weighed'!
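                        # Score handling, illustrated (made-up numbers): with a score vector
                        # [0.5, 0.4, 0.3, 0.2] (by the usual Moses convention index 2 is the
                        # direct phrase probability p(target|source)) and classifier score 0.9:
                        #   append  -> [0.5, 0.4, 0.3, 0.2, 0.9]  (extra feature; 5 tweights needed)
                        #   replace -> [0.5, 0.4, 0.9, 0.2]       (classifier score overrides index 2)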
                        translationcount += 1
                        #write phrasetable entries
                        ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in scorevector]) + "\n")
                        if freordering:
                            reordering_scores = None
                            try:
                                for t, sv in rtable[sourcepattern_s]:
                                    if t == targetpattern_s:
                                        reordering_scores = sv
                            except KeyError:
                                if args.ignoreerrors:
                                    print("******* ERROR ********* Source pattern not found in reordering table: " + sourcepattern_s, file=sys.stderr)
                                    continue
                                else:
                                    raise Exception("Source pattern not found in reordering table: " + sourcepattern_s)
                            if reordering_scores:
                                freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                            else:
                                if args.ignoreerrors:
                                    print("******** ERROR ********* Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")", file=sys.stderr)
                                    continue
                                else:
                                    raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")
                    if translationcount == 0:
                        print("\tNo overlap between classifier translations (" + str(len(distribution)) + ") and phrase table. Falling back to statistical baseline.", file=sys.stderr)
                        statistical = True
                    else:
                        print("\t\t" + str(translationcount) + " translation options written", file=sys.stderr)
                        statistical = False
                else:
                    print("\tNot in classifier. Falling back to statistical baseline.", file=sys.stderr)
                    statistical = True
            else:
                statistical = True

            if statistical:
                print("\tPhrasetable lookup", file=sys.stderr)
                #ignore classifier or no classifier present for this item
                for targetpattern in alignmodel.targetpatterns(sourcepattern):
                    scorevector = [x for x in alignmodel[(sourcepattern, targetpattern)] if isinstance(x, int) or isinstance(x, float)] #make a copy
                    if args.scorehandling == 'append':
                        scorevector.append(scorevector[2])
                    elif args.scorehandling == 'replace':
                        pass #nothing to do, scorevector is okay as it is
                    elif args.scorehandling == 'weighed':
                        raise NotImplementedError #TODO: implement 'weighed'!
                    translationcount += 1
                    #write phrasetable entries
                    targetpattern_s = targetpattern.tostring(targetdecoder)
                    ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in scorevector]) + "\n")
                    if freordering:
                        reordering_scores = None
                        try:
                            for t, sv in rtable[sourcepattern_s]:
                                if t == targetpattern_s:
                                    reordering_scores = sv
                        except KeyError:
                            if args.ignoreerrors:
                                print("******** ERROR ******* Source pattern not found in reordering table: " + sourcepattern_s, file=sys.stderr)
                                continue
                            else:
                                raise Exception("Source pattern not found in reordering table: " + sourcepattern_s)
                        if reordering_scores:
                            freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                        else:
                            if args.ignoreerrors:
                                print("******* ERROR ****** Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")", file=sys.stderr)
                                continue
                            else:
                                raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")
                print("\t\t" + str(translationcount) + " translation options written", file=sys.stderr)
            prevpattern = None

        ftable.close()
        if freordering:
            freordering.close()

        if not args.tweight:
            if args.scorehandling == "append":
                lentweights = 5
            else:
                lentweights = 4
            tweights = " ".join([str(1 / (lentweights + 1))] * lentweights)
        else:
            tweights = " ".join([str(x) for x in args.tweight])
            lentweights = len(args.tweight)

        print("Writing " + decodedir + "/moses.ini", file=sys.stderr)
        if args.reordering:
            reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
            if not args.reorderingweight:
                reorderingweight = "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
            else:
                reorderingweight = "LexicalReordering0= " + " ".join([str(x) for x in args.reorderingweight])
        else:
            reorderingfeature = ""
            reorderingweight = ""
        #write moses.ini
        f = open(decodedir + '/moses.ini', 'w', encoding='utf-8')
        f.write("""
#Moses INI, produced by contextmoses.py

[input-factors]
0

[mapping]
0 T 0

[distortion-limit]
6

[feature]
UnknownWordPenalty
WordPenalty
PhrasePenalty
PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
{reorderingfeature}
Distortion
SRILM name=LM0 factor=0 path={lm} order={lmorder}

[weight]
UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
{reorderingweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight=args.lmweight, dweight=args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight=args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))
        f.close()

        if not args.skipdecoder:
            if args.mert:
                if args.ref[0] == '/':
                    ref = args.ref
                else:
                    ref = os.getcwd() + '/' + args.ref
                for mertrun in range(1, args.mert + 1):
                    if os.path.exists(decodedir + "/mert-work-" + str(mertrun) + "/moses.ini"):
                        print("Mert run #" + str(mertrun) + " already ran, skipping...", file=sys.stderr)
                    else:
                        #invoke mert
                        cmd = args.mosesdir + "/scripts/training/mert-moses.pl --working-dir=" + decodedir + "/mert-work-" + str(mertrun) + " --mertdir=" + args.mosesdir + '/mert/' + ' --decoder-flags="-threads ' + str(args.threads) + '" ' + decodedir + "/test.txt " + ref + " `which moses` " + decodedir + "/moses.ini --threads=" + str(args.threads)
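                        # The resulting command looks like (illustrative paths):
                        #   <mosesdir>/scripts/training/mert-moses.pl --working-dir=<decodedir>/mert-work-1 \
                        #       --mertdir=<mosesdir>/mert/ --decoder-flags="-threads 4" \
                        #       <decodedir>/test.txt <ref> `which moses` <decodedir>/moses.ini --threads=4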
                        print("Contextmoses calling mert #" + str(mertrun) + ": " + cmd, file=sys.stderr)
                        r = subprocess.call(cmd, shell=True)
                        if r != 0:
                            print("Contextmoses called mert #" + str(mertrun) + " but failed!", file=sys.stderr)
                            sys.exit(1)
                        print("DONE: Contextmoses calling mert #" + str(mertrun) + ": " + cmd, file=sys.stderr)
            else:
                #invoke moses
                cmd = EXEC_MOSES + " -threads " + str(args.threads) + " -f " + decodedir + "/moses.ini < " + decodedir + "/test.txt > " + decodedir + "/output.txt"
                print("Contextmoses calling moses: " + cmd, file=sys.stderr)
                r = subprocess.call(cmd, shell=True)
                if r != 0:
                    print("Contextmoses called moses but failed!", file=sys.stderr)
                    sys.exit(1)
                print("DONE: Contextmoses calling moses: " + cmd, file=sys.stderr)
        else:
            print("Contextmoses skipping decoder", file=sys.stderr)
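
# gettimbloptions() is called above but is not defined in this section. A minimal
# sketch under the assumption that it merely composes a TiMBL option string from
# the --ta/--tk/--tw/--tm/--td arguments (the real helper may also consult
# classifierconf):
def gettimbloptions(args, classifierconf):
    #-a algorithm, -k neighbour count, -w weighting, -m feature metrics, -d distance metric
    return "-a " + args.ta + " -k " + args.tk + " -w " + args.tw + " -m " + args.tm + " -d " + args.td
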
def extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
    if DEBUG: print("Loading phrase-table", file=sys.stderr)
    #ttable = PhraseTable(ttablefile,False, False, "|||", 3, 0,None, None, lambda x: x[0] * x[2] > joinedprobabilitythreshold)
    ttable = AlignmentModel()
    ttable.load(ttablefile)

    if DEBUG: print("Loading GIZA model (s->t)", file=sys.stderr)
    gizamodel_s2t = GizaModel(gizamodelfile_s2t)
    if DEBUG: print("Loading GIZA model (t->s)", file=sys.stderr)
    gizamodel_t2s = GizaModel(gizamodelfile_t2s)

    if DEBUG: print("Loading decoders", file=sys.stderr)
    classdecoder_source = ClassDecoder(classfile_source)
    classencoder_source = ClassEncoder(classfile_source)
    classdecoder_target = ClassDecoder(classfile_target)
    classencoder_target = ClassEncoder(classfile_target)

    if DEBUG: print("Loading source pattern model " + patternmodelfile_source, file=sys.stderr)
    options = PatternModelOptions()
    #options.DOREVERSEINDEX = False
    patternmodel_source = IndexedPatternModel(patternmodelfile_source, options)
    if DEBUG: print("Loading target pattern model " + patternmodelfile_target, file=sys.stderr)
    patternmodel_target = IndexedPatternModel(patternmodelfile_target, options)

    #with open(sourcecorpusfile, 'r', encoding='utf-8') as f:
    #    sourcecorpus = [x.strip() for x in f.readlines()]
    #with open(targetcorpusfile, 'r', encoding='utf-8') as f:
    #    targetcorpus = [x.strip() for x in f.readlines()]

    iter_s2t = iter(gizamodel_s2t)
    iter_t2s = iter(gizamodel_t2s)

    if DEBUG: print("Iterating over all sentence pairs", file=sys.stderr)
    #iterate over all sentences in the parallel corpus (GIZA alignment acts as source)
    while True:
        try:
            s2t = next(iter_s2t)
            t2s = next(iter_t2s)
        except StopIteration:
            print("WARNING: No more GIZA alignments, breaking", file=sys.stderr)
            break

        sentence = s2t.index
        assert t2s.index == s2t.index
        if DEBUG:
            print("(extractpatterns) s2t.source=", s2t.source, file=sys.stderr)
            print("(extractpatterns) t2s.target=", t2s.target, file=sys.stderr)
            print("(extractpatterns) t2s.source=", t2s.source, file=sys.stderr)
            print("(extractpatterns) s2t.target=", s2t.target, file=sys.stderr)

        intersection = s2t.intersect(t2s)
        if not intersection:
            continue

        #gather all target patterns found in this sentence
        sourcepatterns = list(patternmodel_source.reverseindex_bysentence(sentence))
        targetpatterns = [targetpattern.tostring(classdecoder_target) for targetpattern in patternmodel_target.reverseindex_bysentence(sentence)]
        if DEBUG:
            print("(extractpatterns) processing sentence " + str(sentence) + ", collected " + str(len(sourcepatterns)) + " source patterns and " + str(len(targetpatterns)) + " target patterns", file=sys.stderr)
            for targetpattern in targetpatterns:
                print("(extractpatterns) -- identified target pattern " + str(targetpattern), file=sys.stderr)

        #iterate over all source patterns found in this sentence
        for sourcepattern in sourcepatterns:
            sourcepattern_s = sourcepattern.tostring(classdecoder_source)
            if any(noword(x) for x in sourcepattern_s.split()):
                continue
            sourceindices = [(x, y) for x, y in patternmodel_source[sourcepattern] if x == sentence]
            source_n = sourcepattern_s.count(" ") + 1
            assert bool(sourceindices)
            if sourcepattern_s in ttable:
                if DEBUG: print("(extractpatterns) -- source pattern candidate " + str(sourcepattern_s) + " (occurring " + str(len(sourceindices)) + " time(s)), has " + str(len(ttable[sourcepattern_s])) + " translation options in phrase-table", file=sys.stderr)
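                # Pruning logic used below (illustrative numbers): options are compared by
                # joint probability p(t|s)*p(s|t); with divergencefrombestthreshold=0.5 and
                # a best joint score of 0.08, any option scoring below 0.04 is skipped.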
                sourcesentence = s2t.source
                targetsentence = s2t.target
                #sort target options by joint probability, descending, so index 0 holds the best option
                targetoptions = sorted(((targetpattern_s, scores) for targetpattern_s, scores in ttable[sourcepattern_s]), key=lambda x: x[1][0] * x[1][2], reverse=True)
                bestscore = targetoptions[0][1][0] * targetoptions[0][1][2]
                #iterate over the target patterns in the phrasetable
                for targetpattern_s, scores in ttable[sourcepattern_s]:
                    if DEBUG: print("(extractpatterns) -- considering target pattern from phrase-table: " + str(targetpattern_s), file=sys.stderr)
                    if targetpattern_s in targetpatterns:
                        if any(noword(x) for x in targetpattern_s.split()):
                            continue
                        joinedprob = scores[0] * scores[2]
                        if joinedprob < bestscore * divergencefrombestthreshold:
                            continue
                        #we have a pair, occurring in pattern models and phrase table
                        target_n = targetpattern_s.count(" ") + 1
                        #obtain positional offsets for source and target in sentence
                        targetindices = [(x, y) for x, y in patternmodel_target[classencoder_target.buildpattern(targetpattern_s)] if x == sentence]
                        assert bool(targetindices)
                        if DEBUG: print("(extractpatterns) --- found target pattern candidate " + str(targetpattern_s) + " (occurring " + str(len(targetindices)) + " time(s))", file=sys.stderr)
                        #yield the pair and full context
                        for _, sourceoffset in sourceindices:
                            for _, targetoffset in targetindices:
                                #check that the offsets do not violate the word alignment
                                valid = True
                                for i in range(sourceoffset, sourceoffset + source_n):
                                    target, foundindex = intersection.getalignedtarget(i)
                                    if isinstance(foundindex, tuple):
                                        targetl = foundindex[1]
                                        foundindex = foundindex[0]
                                    if foundindex < targetoffset or foundindex >= targetoffset + target_n:
                                        valid = False
                                        if DEBUG: print("(extractpatterns) --- violates word alignment", file=sys.stderr)
                                        break
                                if valid:
                                    if DEBUG: print("(extractpatterns) --- ok", file=sys.stderr)
                                    yield sourcepattern_s, targetpattern_s, sourceoffset, targetoffset, tuple(sourcesentence), tuple(targetsentence), sentence
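
# noword() is used by extractpairs() but is not defined in this section. A plausible
# minimal version (an assumption, not the original implementation): flag tokens that
# contain no alphabetic character (punctuation, numbers), so patterns containing
# such tokens are skipped.
def noword(token):
    return not any(c.isalpha() for c in token)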