def test001_alignmodel(self):
    """Checking alignment model: load the en-nl alignment model and verify its contents.

    Loads the pre-built alignment model and asserts that exactly the expected
    (source, target) pattern pairs are present.
    """
    options = colibricore.PatternModelOptions(mintokens=1, doreverseindex=False)
    s = colibricore.ClassEncoder("test-en-nl/test-en-train.colibri.cls")
    t = colibricore.ClassEncoder("test-en-nl/test-nl-train.colibri.cls")
    sdec = colibricore.ClassDecoder("test-en-nl/test-en-train.colibri.cls")
    tdec = colibricore.ClassDecoder("test-en-nl/test-nl-train.colibri.cls")
    print("Loading alignment model", file=sys.stderr)
    model = AlignmentModel()
    model.load("test-en-nl/test-en-nl.colibri.alignmodel", options)
    print("Loaded", file=sys.stderr)
    model.output(sdec, tdec)
    print("Testing contents", file=sys.stderr)
    # Expected (source, target) pattern pairs. A data-driven loop with
    # assertIn replaces the fifteen copy-pasted assertTrue(... in model)
    # calls: same membership semantics, but failures now report which
    # pair is missing instead of a bare "False is not true".
    expected_pairs = [
        ('a', 'een'),
        ('just', 'maar'),
        ('only', 'maar'),
        ('bank', 'oever'),
        ('bank', 'bank'),
        ('bank', 'sturen'),
        ('couch', 'bank'),
        ('the bank', 'de oever'),
        ('the bank', 'de bank'),
        ('the couch', 'de bank'),
        ('I see', 'Ik zie'),
        ('He', 'Hij'),
        ('sits', 'zit'),
        ('on', 'on'),
        ('today', 'vandaag'),
    ]
    for source_s, target_s in expected_pairs:
        self.assertIn((s.buildpattern(source_s), t.buildpattern(target_s)), model)
    # The model must contain exactly these 15 alignments and nothing more.
    self.assertEqual(len(list(model.triples())), 15)
def test001_alignmodel(self):
    """Checking alignment model"""
    options = colibricore.PatternModelOptions(mintokens=1, doreverseindex=False)
    src_encoder = colibricore.ClassEncoder("test-en-nl/test-en-train.colibri.cls")
    tgt_encoder = colibricore.ClassEncoder("test-en-nl/test-nl-train.colibri.cls")
    src_decoder = colibricore.ClassDecoder("test-en-nl/test-en-train.colibri.cls")
    tgt_decoder = colibricore.ClassDecoder("test-en-nl/test-nl-train.colibri.cls")
    print("Loading alignment model", file=sys.stderr)
    model = AlignmentModel()
    model.load("test-en-nl/test-en-nl.colibri.alignmodel", options)
    print("Loaded", file=sys.stderr)
    model.output(src_decoder, tgt_decoder)
    print("Testing contents", file=sys.stderr)
    # Every expected (English, Dutch) alignment pair must be present in the
    # loaded model; checked in the same order as the original assertions.
    for en, nl in (('a', 'een'), ('just', 'maar'), ('only', 'maar'),
                   ('bank', 'oever'), ('bank', 'bank'), ('bank', 'sturen'),
                   ('couch', 'bank'), ('the bank', 'de oever'),
                   ('the bank', 'de bank'), ('the couch', 'de bank'),
                   ('I see', 'Ik zie'), ('He', 'Hij'), ('sits', 'zit'),
                   ('on', 'on'), ('today', 'vandaag')):
        self.assertTrue((src_encoder.buildpattern(en), tgt_encoder.buildpattern(nl)) in model)
    # And nothing beyond those 15 alignments.
    self.assertEqual(len(list(model.triples())), 15)
def extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
    """Yield phrase-pair occurrences consistent with the intersected word alignment.

    Walks all sentence pairs of a parallel corpus via two GIZA alignment
    models (source->target and target->source). For every source pattern
    occurring both in the sentence (per the source pattern model) and in the
    phrase-translation table, each co-occurring target pattern whose position
    does not violate the intersected word alignment is yielded.

    Yields tuples of:
        (sourcepattern_s, targetpattern_s, sourceoffset, targetoffset,
         sourcesentence, targetsentence, sentence)

    NOTE(review): joinedprobabilitythreshold is only referenced by the
    commented-out PhraseTable call below; the active code path never uses it.
    """
    if DEBUG: print("Loading phrase-table", file=sys.stderr)
    #ttable = PhraseTable(ttablefile,False, False, "|||", 3, 0,None, None, lambda x: x[0] * x[2] > joinedprobabilitythreshold)
    ttable = AlignmentModel()
    ttable.load(ttablefile)
    if DEBUG: print("Loading GIZA model (s->t)", file=sys.stderr)
    gizamodel_s2t = GizaModel(gizamodelfile_s2t)
    if DEBUG: print("Loading GIZA model (t->s)", file=sys.stderr)
    gizamodel_t2s = GizaModel(gizamodelfile_t2s)
    if DEBUG: print("Loading decoders", file=sys.stderr)
    classdecoder_source = ClassDecoder(classfile_source)
    classencoder_source = ClassEncoder(classfile_source)  # NOTE(review): appears unused below — verify against full original
    classdecoder_target = ClassDecoder(classfile_target)
    classencoder_target = ClassEncoder(classfile_target)
    if DEBUG: print("Loading source pattern model " + patternmodelfile_source, file=sys.stderr)
    options = PatternModelOptions()
    #options.DOREVERSEINDEX = False
    patternmodel_source = IndexedPatternModel(patternmodelfile_source, options)
    if DEBUG: print("Loading target pattern model " + patternmodelfile_target, file=sys.stderr)
    patternmodel_target = IndexedPatternModel(patternmodelfile_target, options)
    #with open(sourcecorpusfile, 'r', encoding='utf-8') as f:
    #    sourcecorpus = [x.strip() for x in f.readlines()]
    #with open(targetcorpusfile, 'r', encoding='utf-8') as f:
    #    targetcorpus = [x.strip() for x in f.readlines()]
    iter_s2t = iter(gizamodel_s2t)
    iter_t2s = iter(gizamodel_t2s)
    if DEBUG: print("Iterating over all sentence pairs", file=sys.stderr)
    #iterate over all sentences in the parallel corpus (GIZA alignment acts as source)
    while True:
        # Advance both GIZA models in lockstep; they must cover the same sentences.
        try:
            s2t = next(iter_s2t)
            t2s = next(iter_t2s)
        except StopIteration:
            print("WARNING: No more GIZA alignments, breaking", file=sys.stderr)
            break
        sentence = s2t.index
        assert t2s.index == s2t.index
        if DEBUG:
            print("(extractpatterns) s2t.source=", s2t.source, file=sys.stderr)
            print("(extractpatterns) t2s.target=", t2s.target, file=sys.stderr)
            print("(extractpatterns) t2s.source=", t2s.source, file=sys.stderr)
            print("(extractpatterns) s2t.target=", s2t.target, file=sys.stderr)
        # Keep only alignment points that both directions agree on.
        intersection = s2t.intersect(t2s)
        if not intersection: continue
        #gather all target patterns found in this sentence
        sourcepatterns = list(patternmodel_source.reverseindex_bysentence(sentence))
        targetpatterns = [ targetpattern.tostring(classdecoder_target) for targetpattern in patternmodel_target.reverseindex_bysentence(sentence) ]
        if DEBUG: print("(extractpatterns) processing sentence " + str(sentence) + ", collected " + str(len(sourcepatterns)) + " source patterns and " + str(len(targetpatterns)) + " target patterns", file=sys.stderr)
        if DEBUG:
            for targetpattern in targetpatterns:
                if DEBUG: print("(extractpatterns) -- identified target pattern " + str(targetpattern) , file=sys.stderr)
        #iterate over all source patterns found in this sentence
        for sourcepattern in sourcepatterns:
            sourcepattern_s = sourcepattern.tostring(classdecoder_source)
            # Skip patterns containing any non-word token (per the noword() helper).
            if any(( noword(x) for x in sourcepattern_s.split() ) ): continue
            # Occurrences of this pattern restricted to the current sentence.
            sourceindices = [ (x,y) for x,y in patternmodel_source[sourcepattern] if x == sentence ]
            source_n = sourcepattern_s.count(" ") + 1  # pattern length in tokens
            assert bool(sourceindices)
            if sourcepattern_s in ttable:
                if DEBUG: print("(extractpatterns) -- source pattern candidate " + str(sourcepattern_s) + " (occuring " + str(len(sourceindices)) + " time(s)), has " + str(len(ttable[sourcepattern_s])) + " translation options in phrase-table" , file=sys.stderr)
                sourcesentence = s2t.source
                targetsentence = s2t.target
                targetoptions = sorted( ( (targetpattern_s, scores) for targetpattern_s, scores in ttable[sourcepattern_s] ) , key=lambda x: x[1] )
                # NOTE(review): sorted() is ascending, so targetoptions[0] holds the
                # *lowest*-sorting score vector; if 'bestscore' is meant to be the
                # highest joint probability, reverse=True (or max()) looks intended
                # — confirm against the full original.
                bestscore = targetoptions[0][1][0] * targetoptions[0][1][2]
                #iterate over the target patterns in the phrasetable
                for targetpattern_s, scores in ttable[sourcepattern_s]:
                    if DEBUG:
                        print("(extractpatterns) -- considering target pattern from phrase-table: " + str(targetpattern_s) , file=sys.stderr)
                    if targetpattern_s in targetpatterns:
                        if any(( noword(x) for x in targetpattern_s.split() ) ): continue
                        # scores[0]/scores[2] presumably are the two translation
                        # probabilities (p(t|s), p(s|t)) — TODO confirm ordering.
                        joinedprob = scores[0] * scores[2]
                        # Prune options diverging too far from the 'best' score.
                        if joinedprob < bestscore * divergencefrombestthreshold: continue
                        #we have a pair, occurring in pattern models and phrase table
                        target_n = targetpattern_s.count(" ") + 1
                        #obtain positional offsets for source and target in sentence
                        targetindices = [ (x,y) for x,y in patternmodel_target[classencoder_target.buildpattern(targetpattern_s)] if x == sentence]
                        assert bool(targetindices)
                        if DEBUG: print("(extractpatterns) --- found target pattern candidate " + str(targetpattern_s) + " (occuring " + str(len(targetindices)) + " time(s))" , file=sys.stderr)
                        #yield the pair and full context
                        for _, sourceoffset in sourceindices:
                            for _, targetoffset in targetindices:
                                #check if offsets don't violate the word alignment
                                valid = True
                                for i in range(sourceoffset, sourceoffset + source_n):
                                    target, foundindex = intersection.getalignedtarget(i)
                                    if isinstance(foundindex, tuple):
                                        # Aligned to a span: (start, length); use the start index.
                                        targetl = foundindex[1]
                                        foundindex = foundindex[0]
                                    # Every source token must align inside the target pattern's span.
                                    if foundindex < targetoffset or foundindex >= targetoffset + target_n:
                                        valid = False
                                        if DEBUG: print("(extractpatterns) --- violates word alignment", file=sys.stderr)
                                        break
                                if valid:
                                    if DEBUG: print("(extractpatterns) --- ok", file=sys.stderr)
                                    yield sourcepattern_s, targetpattern_s, sourceoffset, targetoffset, tuple(sourcesentence), tuple(targetsentence), sentence
def main():
    """Command-line entry point: wrap the Moses decoder with context-feature
    classifier support (Timbl). In --train mode, trains classifiers from the
    work directory; otherwise builds an intermediate per-occurrence
    phrase-table (or drives a Moses server via XML input) and optionally
    invokes Moses / MERT on the result.

    NOTE(review): this function was recovered from a whitespace-mangled
    source; indentation is reconstructed. One span appears to have been
    lost/redacted (see the ServerProxy NOTE below) — treat nesting after that
    point as best-effort reconstruction.
    """
    parser = argparse.ArgumentParser(description="Wrapper around the Moses Decoder that adds support for context features through classifiers.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f','--inputfile', type=str,help="Input text file; the test corpus (plain text, tokenised, one sentence per line), may be specified multiple times for each factor", action='append',required=False)
    parser.add_argument('-d','--devinputfile', type=str,help="Extra input text file to consider when training classifiers; the development corpus (plain text, tokenised, one sentence per line)", action='store',required=False)
    parser.add_argument('-S','--sourceclassfile', type=str, help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile', type=str, help="Target class file", action='store',required=True)
    parser.add_argument('-a','--alignmodelfile', type=str,help="Colibri alignment model (made from phrase translation table)", action='store',default="",required=False)
    parser.add_argument('-w','--workdir', type=str,help="Working directory, should contain classifier training files", action='store',default="",required=True)
    parser.add_argument('--train', help="Train classifiers", action="store_true", default=False)
    #parser.add_argument('-O','--timbloptions', type=str, help="Options for the Timbl classifier", action="store", default="-a 0 -k 1")
    parser.add_argument('--ta', type=str, help="Timbl algorithm", action="store", default="0")
    parser.add_argument('--tk', type=str, help="Timbl k value", action="store", default="1")
    parser.add_argument('--tw', type=str, help="Timbl weighting", action="store", default="gr")
    parser.add_argument('--tm', type=str, help="Timbl feature metrics", action="store", default="O")
    parser.add_argument('--td', type=str, help="Timbl distance metric", action="store", default="Z")
    parser.add_argument('-I','--ignoreclassifier', help="Ignore classifier (for testing bypass method)", action="store_true", default=False)
    parser.add_argument('-H','--scorehandling', type=str, help="Score handling, can be 'append' (default), 'replace', or 'weighed'", action="store", default="append")
    parser.add_argument('--mosesinclusive',help="Pass full sentences through through Moses server using XML input (will start a moses server, requires --moseslm). Classifier output competes with normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true',default=False)
    parser.add_argument('--mosesexclusive',help="Pass full sentences through through Moses server using XML input (will start a moses server, requires --moseslm). Classifier does NOT compete with normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true',default=False)
    parser.add_argument('--mosesdir', type=str,help='Path to Moses directory (required for MERT)', default="")
    parser.add_argument('--mert', type=int,help="Do MERT parameter tuning, set to number of MERT runs to perform", required=False, default=0)
    parser.add_argument('--threads', type=int, default=1, help="Number of threads to use for Moses or Mert")
    parser.add_argument('--reordering', type=str,action="store",help="Reordering type (use with --reorderingtable)", required=False)
    parser.add_argument('--reorderingtable', type=str,action="store",help="Use reordering table (use with --reordering)", required=False)
    parser.add_argument('--ref', type=str,action="store",help="Reference corpus (target corpus, plain text)", required=False)
    parser.add_argument('--lm', type=str, help="Language Model", action="store", default="", required=False)
    parser.add_argument('--lmorder', type=int, help="Language Model order", action="store", default=3, required=False)
    parser.add_argument('--lmweight', type=float, help="Language Model weight", action="store", default=0.5, required=False)
    parser.add_argument('--dweight', type=float, help="Distortion Model weight", action="store", default=0.3, required=False)
    parser.add_argument('--wweight', type=float, help="Word penalty weight", action="store", default=-1, required=False)
    parser.add_argument('--tweight', type=float, help="Translation Model weight (may be specified multiple times for each score making up the translation model)", action="append", required=False)
    parser.add_argument('--reorderingweight', type=float, help="Reordering Model weight (may be specified multiple times for each score making up the reordering model)", action="append", required=False)
    parser.add_argument('--pweight', type=float, help="Phrase penalty", default=0.2, action="store", required=False)
    parser.add_argument('--classifierdir', type=str,help="Trained classifiers, intermediate phrase-table and test file will be written here (only specify if you want a different location than the work directory)", action='store',default="",required=False)
    parser.add_argument('--decodedir', type=str,help="Moses output will be written here (only specify if you want a different location than the work directory)", action='store',default="",required=False)
    parser.add_argument('--skipdecoder',action="store_true",default=False)
    parser.add_argument('--ignoreerrors',action="store_true",help="Attempt to ignore errors",default=False)
    parser.add_argument('--mosesport',type=int, help="Port for Moses server (will be started for you), if -Z is enabled",action='store',default=8080)
    args = parser.parse_args()
    #args.storeconst, args.dataset, args.num, args.bar
    # Sanity-check the work directory produced by colibri-extractfeatures.
    if not os.path.isdir(args.workdir) or not os.path.exists(args.workdir + '/classifier.conf'):
        print("Work directory " + args.workdir + " or classifier configuration therein does not exist. Did you extract features and create classifier training files using colibri-extractfeatures?" ,file=sys.stderr)
        sys.exit(2)
    # Resolve the classifier directory to an absolute path (defaults to workdir).
    if args.classifierdir:
        classifierdir = args.classifierdir
    else:
        classifierdir = args.workdir
    if not classifierdir:
        classifierdir = os.getcwd()
    elif classifierdir and classifierdir[0] != '/':
        classifierdir = os.getcwd() + '/' + classifierdir
    if args.mert and not args.mosesdir:
        print("--mert requires --mosesdir to be set",file=sys.stderr)
        sys.exit(2)
    if args.mert and not args.ref:
        print("--mert requires --ref to be set",file=sys.stderr)
        sys.exit(2)
    # Resolve the decode output directory likewise.
    if args.decodedir:
        decodedir = args.decodedir
    else:
        decodedir = args.workdir
    if not decodedir:
        decodedir = os.getcwd()
    elif decodedir and decodedir[0] != '/':
        decodedir = os.getcwd() + '/' + decodedir
    print("Loading configuration (training corpora and class decoders)",file=sys.stderr)
    f = open(args.workdir + '/classifier.conf','rb')
    # Pickled configuration written by colibri-extractfeatures (trusted, project-local).
    classifierconf = pickle.load(f)
    f.close()
    print("Configuration: ", classifierconf,file=sys.stderr)
    if args.inputfile:
        # One input file is required per factor-feature in the configuration.
        if len(classifierconf['featureconf']) > len(args.inputfile):
            raise Exception("Number of input files (" + str(len(args.inputfile)) + ") is less than the number of factor-features in configuration (" + str(len(classifierconf['featureconf'])) + "), you need to specify all")
    #one for each factor
    sourceencoders = []
    if args.inputfile:
        l = []
        for i, (inputfile, conf) in enumerate(zip(args.inputfile, classifierconf['featureconf'])):
            trainclassfile = conf['classdecoder']
            print("Processing factor #" + str(i),file=sys.stderr)
            #process inputfile
            corpusfile = os.path.basename(inputfile).replace('.txt','') + '.colibri.dat'
            classfile = os.path.basename(inputfile).replace('.txt','') + '.colibri.cls'
            #if os.path.exists(corpusfiles[i]) and os.path.exists(classfiles[i]):
            #    print("Notice: Re-using previously generated corpusfile and classfile",file=sys.stderr)
            #    print("Loading source class encoder and decoder",file=sys.stderr)
            #    sourceencoders.append( ClassEncoder(classfiles[i]) )
            #    sourcedecoders.append( ClassDecoder(classfiles[i]) )
            #else:
            print("Loading and extending source class encoder, from " + trainclassfile + " to " + classfile,file=sys.stderr)
            sourceencoders.append( ClassEncoder(trainclassfile) )
            sourceencoders[i].processcorpus(inputfile)
            if i == 0 and args.devinputfile:
                # The development corpus only extends the first factor's encoder.
                print("(including development corpus in extended class encoder)",file=sys.stderr)
                sourceencoders[i].processcorpus(args.devinputfile)
            sourceencoders[i].buildclasses()
            sourceencoders[i].save(classfile)
            print("Encoding test corpus, from " + inputfile + " to " + corpusfile,file=sys.stderr)
            sourceencoders[i].encodefile(inputfile, corpusfile)
            if i == 0 and args.devinputfile:
                print("Encoding development corpus, from " + args.devinputfile + " to " + args.devinputfile + '.colibri.dat',file=sys.stderr)
                sourceencoders[i].encodefile(args.devinputfile, args.devinputfile + '.colibri.dat')
            print("Loading source class decoder " + classfile,file=sys.stderr)
            sourcedecoder = ClassDecoder(classfile)
            print("Loading test corpus " + corpusfile,file=sys.stderr)
            l.append( Configuration( IndexedCorpus(corpusfile), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext']) )
        classifierconf['featureconf'] = l
    else:
        print("Loading source class decoders",file=sys.stderr)
        l = []
        for conf in classifierconf['featureconf']:
            sourcedecoder = ClassDecoder(conf['classdecoder'])
            l.append( Configuration( IndexedCorpus(), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext'] ) )
        # NOTE(review): unlike the branch above, 'l' is never assigned back to
        # classifierconf['featureconf'] here — verify against the full original.
    if args.inputfile and args.alignmodelfile:
        print("Loading target encoder " + args.targetclassfile,file=sys.stderr)
        targetencoder = ClassEncoder(args.targetclassfile)
        print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
        targetdecoder = ClassDecoder(args.targetclassfile)
        print("Loading alignment model " + args.alignmodelfile ,file=sys.stderr)
        alignmodel = AlignmentModel(args.alignmodelfile)
        print("\tAlignment model has " + str(len(alignmodel)) + " source patterns",file=sys.stderr)
        print("Building patternmodel on test corpus " + classifierconf['featureconf'][0].corpus.filename() ,file=sys.stderr)
        options = PatternModelOptions(mintokens=1, maxlength=12, debug=True)
        # Pattern model over the test corpus, constrained by the alignment model.
        testmodel = IndexedPatternModel(reverseindex=classifierconf['featureconf'][0].corpus)
        testmodel.train( "", options, alignmodel)
        print("\tTest model has " + str(len(testmodel)) + " source patterns",file=sys.stderr)
        #saving just so we can inspect it for debug purposes:
        testmodel.write( decodedir + '/test.colibri.indexedpatternmodel' )
        if args.devinputfile:
            print("Building patternmodel on development corpus " + args.devinputfile + ".colibri.dat" ,file=sys.stderr)
            devcorpus = IndexedCorpus(args.devinputfile + ".colibri.dat")
            print("Development corpus has " + str(devcorpus.sentences()) + " sentences")
            devmodel = IndexedPatternModel(reverseindex=devcorpus)
            devmodel.train( "", options, alignmodel)
            # NOTE(review): prints len(testmodel), not len(devmodel) — looks like a copy/paste slip.
            print("\tDevelopment model has " + str(len(testmodel)) + " source patterns",file=sys.stderr)
            #saving just so we can inspect it for debug purposes:
            devmodel.write( decodedir + '/dev.colibri.indexedpatternmodel' )
        else:
            # Empty dict so 'pattern in devmodel' checks below are always False.
            devmodel = {}
        if args.reorderingtable:
            print("Loading reordering model (may take a while)",file=sys.stderr)
            rtable = PhraseTable(args.reorderingtable) #TODO: convert to colibri alignmodel
    elif args.train and args.inputfile:
        if not args.alignmodelfile:
            print("No alignment model specified (-a)",file=sys.stderr)
            sys.exit(2)
    elif not args.train:
        # Test mode requires both an input file and an alignment model.
        if not args.inputfile:
            print("No input file specified (-f)",file=sys.stderr)
        if not args.alignmodelfile:
            print("No alignment model specified (-a)",file=sys.stderr)
        sys.exit(2)
    if args.train:
        #training mode
        if args.inputfile:
            print("Training classifiers (constrained by test data)",file=sys.stderr)
        else:
            print("Training all classifiers (you may want to constrain by test data using -f)",file=sys.stderr)
        if 'monolithic' in classifierconf and classifierconf['monolithic']:
            #monolithic: one classifier over all source patterns
            trainfile = args.workdir + "/train"
            #build a classifier
            print("Training monolithic classifier " + trainfile,file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            if args.classifierdir:
                #ugly hack since we want ibases in a different location
                trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                shutil.copyfile(trainfile+".train", trainfilecopy+".train")
                trainfile = trainfilecopy
            classifier = timbl.TimblClassifier(trainfile, timbloptions)
            classifier.train()
            classifier.save()
            if args.classifierdir:
                #remove copy
                os.unlink(trainfile+".train")
            trained = 1
        else:
            #experts: one classifier per source pattern (one .train file each)
            trained = 0
            for trainfile in itertools.chain(glob.glob(args.workdir + "/*.train"), glob.glob(args.workdir + "/.*.train")): #explicitly add 'dotfiles', will be skipped by default
                if args.inputfile:
                    # Skip classifiers whose source pattern never occurs in the test/dev data.
                    sourcepattern_s = unquote_plus(os.path.basename(trainfile.replace('.train','')))
                    sourcepattern = sourceencoders[0].buildpattern(sourcepattern_s)
                    if not sourcepattern in testmodel and not sourcepattern in devmodel:
                        print("Skipping " + trainfile + " (\"" + sourcepattern_s + "\" not in test/dev model)",file=sys.stderr)
                        continue
                #build a classifier
                print("Training " + trainfile,file=sys.stderr)
                trained += 1
                timbloptions = gettimbloptions(args, classifierconf)
                if args.classifierdir:
                    #ugly hack since we want ibases in a different location
                    trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                    shutil.copyfile(trainfile, trainfilecopy)
                    trainfile = trainfilecopy
                classifier = timbl.TimblClassifier(trainfile.replace('.train',''), timbloptions)
                classifier.train()
                classifier.save()
                if args.classifierdir:
                    #remove copy
                    os.unlink(trainfile)
                if not os.path.exists(trainfile.replace(".train",".ibase")):
                    raise Exception("Resulting instance base " + trainfile.replace(".train",".ibase") + " not found!")
        # Record how many classifiers were trained.
        with open(args.classifierdir + '/trained','w',encoding='utf-8') as f:
            f.write(str(trained)+"\n")
    else:
        #TEST
        if not args.inputfile:
            print("Specify an input file (-f)",file=sys.stderr)
            sys.exit(2)
        if not args.mosesinclusive and not args.mosesexclusive:
            print("Writing intermediate test data to " + decodedir + "/test.txt",file=sys.stderr)
            #write intermediate test data (consisting only of indices AND unknown words) and
            f = open(decodedir + "/test.txt",'w',encoding='utf-8')
            for sentencenum, line in enumerate(classifierconf['featureconf'][0].corpus.sentences()):
                sentenceindex = sentencenum + 1  # 1-based sentence numbering
                print("@" + str(sentenceindex),file=sys.stderr)
                tokens = [] #actual string representations
                for tokenindex,pattern in enumerate(line): #will yield only unigrams
                    #is this an uncovered word that does not appear in the phrasetable? check using alignment model and keep the word untranslated if so
                    if not pattern in alignmodel:
                        print(" Found OOV at @" + str(sentenceindex) + ":" + str(tokenindex) + ": " + pattern.tostring(classifierconf['featureconf'][0].classdecoder), file=sys.stderr)
                        tokens.append(pattern.tostring(classifierconf['featureconf'][0].classdecoder))
                    else:
                        # Covered words are replaced by a sentence_token placeholder
                        # that the intermediate phrase-table keys on.
                        tokens.append(str(sentenceindex) + "_" + str(tokenindex))
                f.write(" ".join(tokens) + "\n")
            f.close()
        classifierindex = set()
        if classifierconf['monolithic']:
            print("Loading classifier index for monolithic classifier",file=sys.stderr)
            with open(args.workdir + "/sourcepatterns.list",'r',encoding='utf-8') as f:
                for line in f:
                    classifierindex.add(line.strip())
            print("Loading monolithic classifier " + classifierdir + "/train.train",file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            classifier = timbl.TimblClassifier(classifierdir + "/train", timbloptions)
        else:
            classifier = None
        if args.reorderingtable:
            print("Creating intermediate phrase-table and reordering-table",file=sys.stderr)
            freordering = open(decodedir + "/reordering-table", 'w',encoding='utf-8')
        else:
            print("Creating intermediate phrase-table",file=sys.stderr)
            freordering = None
        if args.mosesinclusive or args.mosesexclusive:
            #Use mosesserver with XML input method
            #write mos
            if not args.tweight:
                # Default to uniform translation-model weights.
                lentweights = 4
                tweights = " ".join([str(1/(lentweights+1))]*lentweights)
            else:
                tweights = " ".join([ str(x) for x in args.tweight])
                lentweights = len(args.tweight)
            if os.path.exists(decodedir + "/moses.ini"):
                os.unlink(decodedir+"/moses.ini")
            print("Writing " + decodedir + "/moses.ini",file=sys.stderr)
            if args.reordering:
                reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
                reorderingweight = "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
            else:
                reorderingfeature = ""
                reorderingweight = ""
            #write moses.ini
            f = open(decodedir + '/moses.ini','w',encoding='utf-8')
            f.write("""
#Moses INI, produced by contextmoses.py

[input-factors]
0

[mapping]
0 T 0

[distortion-limit]
6

[feature]
UnknownWordPenalty
WordPenalty
PhrasePenalty
PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
{reorderingfeature}
Distortion
SRILM name=LM0 factor=0 path={lm} order={lmorder}

[weight]
UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
{reorderingweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight = args.lmweight, dweight = args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight = args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))
            f.close()
            print("Starting Moses Server",file=sys.stderr)
            if args.mosesdir:
                cmd = args.mosesdir + '/bin/mosesserver'
            else:
                cmd = 'mosesserver'
            # NOTE(review): 'args.moses' is not defined by this argument parser —
            # this would raise AttributeError at runtime; confirm against the
            # full original (possibly a lost/renamed flag).
            if args.moses:
                if args.mosesinclusive:
                    cmd += " -xml-input inclusive" #compete with phrase-table
                elif args.mosesexclusive:
                    cmd += " -xml-input exclusive" #only used for passing verbatim L2 (tested whether it makes a difference with inclusive baseline on en-es data, it doesn't)
            cmd += ' -f ' + decodedir + '/moses.ini'
            print("Calling mosesserver: " + cmd,file=sys.stderr)
            p = subprocess.Popen(cmd,shell=True)
            mosesserverpid = p.pid
            # Poll until the server accepts TCP connections on the given port.
            while True:
                time.sleep(5)
                try:
                    s = socket.socket()
                    s.connect( ("localhost", args.mosesport) )
                    break
                except Exception as e:
                    print("Waiting for Moses server....", e, file=sys.stderr)
            print("Connecting to Moses Server",file=sys.stderr)
            # NOTE(review): the source appears corrupted/redacted at this point: a
            # ServerProxy URL was masked out and everything between it and a later
            # progress print — the loop over sentences/tokens that defines
            # sourcepatterncount, sentenceindex, tokenindex, sourcepattern,
            # sourcepattern_s, featurevector, tokenspan, ftable, translationcount,
            # prevpattern and 'i' — is missing. The statement below is the fused
            # residue, kept verbatim; indentation from here on is reconstructed
            # and several names are undefined in this recovered view.
            mosesclient = xmlrpc.client.ServerProxy("http://*****:*****@" + str(i+1) + "/" + str(sourcepatterncount) + " -- Processing " + str(sentenceindex) + ":" + str(tokenindex) + " " + sourcepattern_s + " -- Features: " + str(repr(featurevector)),file=sys.stderr)
        if classifier and not args.ignoreclassifier:
            # Use the classifier unless the pattern has no expert / is not indexed.
            if not classifierconf['monolithic'] or (classifierconf['monolithic'] and sourcepattern_s in classifierindex):
                print("\tClassifying",file=sys.stderr)
                #call classifier
                classlabel, distribution, distance = classifier.classify(featurevector)
                #process classifier result
                for targetpattern_s, score in distribution.items():
                    targetpattern = targetencoder.buildpattern(targetpattern_s)
                    if (sourcepattern, targetpattern) in alignmodel:
                        scorevector = [ x for x in alignmodel[(sourcepattern,targetpattern)] if isinstance(x,int) or isinstance(x,float) ] #make a copy
                    else:
                        # Classifier suggested a translation absent from the phrase-table; skip it.
                        continue
                    if args.scorehandling == 'append':
                        scorevector.append(score)
                    elif args.scorehandling == 'replace':
                        # Overwrite the third score with the classifier score.
                        scorevector[2] = score
                    else:
                        raise NotImplementedError #TODO: implemented weighed!
                    translationcount += 1
                    #write phrasetable entries
                    ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in scorevector]) + "\n")
                    if freordering:
                        reordering_scores = None
                        try:
                            for t, sv in rtable[sourcepattern_s]:
                                if t == targetpattern_s:
                                    reordering_scores = sv
                        except KeyError:
                            if args.ignoreerrors:
                                print("******* ERROR ********* Source pattern notfound in reordering table: " + sourcepattern_s,file=sys.stderr)
                                continue
                            else:
                                raise Exception("Source pattern notfound in reordering table: " + sourcepattern_s)
                        if reordering_scores:
                            freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                        else:
                            if args.ignoreerrors:
                                print("******** ERROR ********* Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")",file=sys.stderr)
                                continue
                            else:
                                raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")
                if translationcount == 0:
                    print("\tNo overlap between classifier translations (" + str(len(distribution)) + ") and phrase table. Falling back to statistical baseline.",file=sys.stderr)
                    statistical = True
                else:
                    print("\t\t" + str(translationcount) + " translation options written",file=sys.stderr)
                    statistical = False
            else:
                print("\tNot in classifier. Falling back to statistical baseline.",file=sys.stderr)
                statistical = True
        else:
            statistical = True
        if statistical:
            print("\tPhrasetable lookup",file=sys.stderr)
            #ignore classifier or no classifier present for this item
            for targetpattern in alignmodel.targetpatterns(sourcepattern):
                scorevector = [ x for x in alignmodel[(sourcepattern,targetpattern)] if isinstance(x,int) or isinstance(x,float) ] #make a copy
                if args.scorehandling == 'append':
                    # Baseline: duplicate the third score in place of a classifier score.
                    scorevector.append(scorevector[2])
                elif args.scorehandling == 'replace':
                    pass #nothing to do, scorevector is okay as it is
                elif args.scorehandling == 'weighed':
                    raise NotImplementedError #TODO: implemented weighed!
                translationcount += 1
                #write phrasetable entries
                targetpattern_s = targetpattern.tostring(targetdecoder)
                ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([ str(x) for x in scorevector]) + "\n")
                if freordering:
                    reordering_scores = None
                    try:
                        for t, sv in rtable[sourcepattern_s]:
                            if t == targetpattern_s:
                                reordering_scores = sv
                    except KeyError:
                        if args.ignoreerrors:
                            print("******** ERROR ******* Source pattern not found in reordering table: " + sourcepattern_s,file=sys.stderr)
                            continue
                        else:
                            raise Exception("Source pattern not found in reordering table: " + sourcepattern_s)
                    if reordering_scores:
                        freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                    else:
                        if args.ignoreerrors:
                            print("******* ERROR ****** Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")",file=sys.stderr)
                            continue
                        else:
                            raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")
            print("\t\t" + str(translationcount) + " translation options written",file=sys.stderr)
        prevpattern = None
        ftable.close()
        if freordering:
            freordering.close()
        if not args.tweight:
            # 'append' score handling yields a 5th (classifier) translation score.
            if args.scorehandling == "append":
                lentweights = 5
            else:
                lentweights = 4
            tweights = " ".join([str(1/(lentweights+1))]*lentweights)
        else:
            tweights = " ".join([ str(x) for x in args.tweight])
            lentweights = len(args.tweight)
        print("Writing " + decodedir + "/moses.ini",file=sys.stderr)
        if args.reordering:
            reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
            if not args.reorderingweight:
                reorderingweight = "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
            else:
                reorderingweight = "LexicalReordering0= " + " ".join([str(x) for x in args.reorderingweight])
        else:
            reorderingfeature = ""
            reorderingweight = ""
        #write moses.ini
        f = open(decodedir + '/moses.ini','w',encoding='utf-8')
        f.write("""
#Moses INI, produced by contextmoses.py

[input-factors]
0

[mapping]
0 T 0

[distortion-limit]
6

[feature]
UnknownWordPenalty
WordPenalty
PhrasePenalty
PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
{reorderingfeature}
Distortion
SRILM name=LM0 factor=0 path={lm} order={lmorder}

[weight]
UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
{reorderingweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight = args.lmweight, dweight = args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight = args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))
        f.close()
        if not args.skipdecoder:
            if args.mert:
                # mert-moses.pl requires an absolute path to the reference corpus.
                if args.ref[0] == '/':
                    ref = args.ref
                else:
                    ref = os.getcwd() + '/' + args.ref
                for mertrun in range(1,args.mert+1):
                    if os.path.exists(decodedir+"/mert-work-" + str(mertrun) +"/moses.ini"):
                        print("Mert run #" + str(mertrun) + " already ran, skipping...",file=sys.stderr)
                    else:
                        #invoke mert
                        cmd = args.mosesdir + "/scripts/training/mert-moses.pl --working-dir=" + decodedir + "/mert-work-" + str(mertrun) + " --mertdir=" + args.mosesdir + '/mert/' + ' --decoder-flags="-threads ' + str(args.threads) + '" ' + decodedir + "/test.txt " + ref + " `which moses` " + decodedir + "/moses.ini --threads=" + str(args.threads)
                        print("Contextmoses calling mert #" + str(mertrun) + ": " + cmd,file=sys.stderr)
                        r = subprocess.call(cmd, shell=True)
                        if r != 0:
                            print("Contextmoses called mert #" + str(mertrun) + " but failed!", file=sys.stderr)
                            sys.exit(1)
                        print("DONE: Contextmoses calling mert #" + str(mertrun)+": " + cmd,file=sys.stderr)
            else:
                #invoke moses
                cmd = EXEC_MOSES + " -threads " + str(args.threads) + " -f " + decodedir + "/moses.ini < " + decodedir + "/test.txt > " + decodedir + "/output.txt"
                print("Contextmoses calling moses: " + cmd,file=sys.stderr)
                r = subprocess.call(cmd, shell=True)
                if r != 0:
                    print("Contextmoses called moses but failed!", file=sys.stderr)
                    sys.exit(1)
                print("DONE: Contextmoses calling moses: " + cmd,file=sys.stderr)
        else:
            print("Contextmoses skipping decoder",file=sys.stderr)
def main():
    """Command-line entry point: wrap the Moses decoder, adding support for
    context features through (Timbl) classifiers.

    Phases, driven by flags:
      1. Parse arguments; resolve work/classifier/decode directories.
      2. Load the pickled classifier configuration (``classifier.conf``);
         when ``-f`` is given, extend the class encoders with the test (and
         optional dev) corpus, encode them, and build pattern models
         constrained by the alignment model.
      3. ``--train``: train a monolithic classifier or per-pattern experts.
         Otherwise (test mode): write an intermediate phrase-table (and
         optional reordering-table) combining classifier output with the
         alignment model.
      4. Write ``moses.ini`` and invoke Moses or MERT, unless
         ``--skipdecoder``.

    NOTE(review): the original source was whitespace-mangled and one span is
    redacted ("http://*****:*****@..."); the indentation reconstructed here
    around that span is a best-effort guess — see the inline notes below.
    """
    # ---- 1. Argument parsing -------------------------------------------
    parser = argparse.ArgumentParser(description="Wrapper around the Moses Decoder that adds support for context features through classifiers.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f','--inputfile', type=str,help="Input text file; the test corpus (plain text, tokenised, one sentence per line), may be specified multiple times for each factor", action='append',required=False)
    parser.add_argument('-d','--devinputfile', type=str,help="Extra input text file to consider when training classifiers; the development corpus (plain text, tokenised, one sentence per line)", action='store',required=False)
    parser.add_argument('-S','--sourceclassfile', type=str, help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile', type=str, help="Target class file", action='store',required=True)
    parser.add_argument('-a','--alignmodelfile', type=str,help="Colibri alignment model (made from phrase translation table)", action='store',default="",required=False)
    parser.add_argument('-w','--workdir', type=str,help="Working directory, should contain classifier training files", action='store',default="",required=True)
    parser.add_argument('--train', help="Train classifiers", action="store_true", default=False)
    #parser.add_argument('-O','--timbloptions', type=str, help="Options for the Timbl classifier", action="store", default="-a 0 -k 1")
    # Individual Timbl hyperparameters (combined later by gettimbloptions):
    parser.add_argument('--ta', type=str, help="Timbl algorithm", action="store", default="0")
    parser.add_argument('--tk', type=str, help="Timbl k value", action="store", default="1")
    parser.add_argument('--tw', type=str, help="Timbl weighting", action="store", default="gr")
    parser.add_argument('--tm', type=str, help="Timbl feature metrics", action="store", default="O")
    parser.add_argument('--td', type=str, help="Timbl distance metric", action="store", default="Z")
    parser.add_argument('-I','--ignoreclassifier', help="Ignore classifier (for testing bypass method)", action="store_true", default=False)
    parser.add_argument('-H','--scorehandling', type=str, help="Score handling, can be 'append' (default), 'replace', or 'weighed'", action="store", default="append")
    parser.add_argument('--mosesinclusive',help="Pass full sentences through through Moses server using XML input (will start a moses server, requires --moseslm). Classifier output competes with normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true',default=False)
    parser.add_argument('--mosesexclusive',help="Pass full sentences through through Moses server using XML input (will start a moses server, requires --moseslm). Classifier does NOT compete with normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true',default=False)
    parser.add_argument('--mosesdir', type=str,help='Path to Moses directory (required for MERT)', default="")
    parser.add_argument('--mert', type=int,help="Do MERT parameter tuning, set to number of MERT runs to perform", required=False, default=0)
    parser.add_argument('--threads', type=int, default=1, help="Number of threads to use for Moses or Mert")
    parser.add_argument('--reordering', type=str,action="store",help="Reordering type (use with --reorderingtable)", required=False)
    parser.add_argument('--reorderingtable', type=str,action="store",help="Use reordering table (use with --reordering)", required=False)
    parser.add_argument('--ref', type=str,action="store",help="Reference corpus (target corpus, plain text)", required=False)
    parser.add_argument('--lm', type=str, help="Language Model", action="store", default="", required=False)
    parser.add_argument('--lmorder', type=int, help="Language Model order", action="store", default=3, required=False)
    parser.add_argument('--lmweight', type=float, help="Language Model weight", action="store", default=0.5, required=False)
    parser.add_argument('--dweight', type=float, help="Distortion Model weight", action="store", default=0.3, required=False)
    parser.add_argument('--wweight', type=float, help="Word penalty weight", action="store", default=-1, required=False)
    parser.add_argument('--tweight', type=float, help="Translation Model weight (may be specified multiple times for each score making up the translation model)", action="append", required=False)
    parser.add_argument('--reorderingweight', type=float, help="Reordering Model weight (may be specified multiple times for each score making up the reordering model)", action="append", required=False)
    parser.add_argument('--pweight', type=float, help="Phrase penalty", default=0.2, action="store", required=False)
    parser.add_argument('--classifierdir', type=str,help="Trained classifiers, intermediate phrase-table and test file will be written here (only specify if you want a different location than the work directory)", action='store',default="",required=False)
    parser.add_argument('--decodedir', type=str,help="Moses output will be written here (only specify if you want a different location than the work directory)", action='store',default="",required=False)
    parser.add_argument('--skipdecoder',action="store_true",default=False)
    parser.add_argument('--ignoreerrors',action="store_true",help="Attempt to ignore errors",default=False)
    parser.add_argument('--mosesport',type=int, help="Port for Moses server (will be started for you), if -Z is enabled",action='store',default=8080)
    args = parser.parse_args()
    #args.storeconst, args.dataset, args.num, args.bar

    # ---- Directory resolution (work dir must already hold classifier.conf)
    if not os.path.isdir(args.workdir) or not os.path.exists(args.workdir + '/classifier.conf'):
        print("Work directory " + args.workdir + " or classifier configuration therein does not exist. Did you extract features and create classifier training files using colibri-extractfeatures?" ,file=sys.stderr)
        sys.exit(2)
    if args.classifierdir:
        classifierdir = args.classifierdir
    else:
        classifierdir = args.workdir
    # Make classifierdir absolute (relative paths are anchored at cwd).
    if not classifierdir:
        classifierdir = os.getcwd()
    elif classifierdir and classifierdir[0] != '/':
        classifierdir = os.getcwd() + '/' + classifierdir
    if args.mert and not args.mosesdir:
        print("--mert requires --mosesdir to be set",file=sys.stderr)
        sys.exit(2)
    if args.mert and not args.ref:
        print("--mert requires --ref to be set",file=sys.stderr)
        sys.exit(2)
    if args.decodedir:
        decodedir = args.decodedir
    else:
        decodedir = args.workdir
    # Make decodedir absolute as well.
    if not decodedir:
        decodedir = os.getcwd()
    elif decodedir and decodedir[0] != '/':
        decodedir = os.getcwd() + '/' + decodedir

    # ---- 2. Load classifier configuration ------------------------------
    # NOTE(review): pickle.load on a file from the work dir — only safe if
    # the work dir is trusted (it is produced by colibri-extractfeatures).
    print("Loading configuration (training corpora and class decoders)",file=sys.stderr)
    f = open(args.workdir + '/classifier.conf','rb')
    classifierconf = pickle.load(f)
    f.close()
    print("Configuration: ", classifierconf,file=sys.stderr)
    if args.inputfile:
        if len(classifierconf['featureconf']) > len(args.inputfile):
            raise Exception("Number of input files (" + str(len(args.inputfile)) + ") is less than the number of factor-features in configuration (" + str(len(classifierconf['featureconf'])) + "), you need to specify all")
    #one for each factor
    sourceencoders = []
    if args.inputfile:
        l = []
        for i, (inputfile, conf) in enumerate(zip(args.inputfile, classifierconf['featureconf'])):
            trainclassfile = conf['classdecoder']
            print("Processing factor #" + str(i),file=sys.stderr)
            #process inputfile
            corpusfile = os.path.basename(inputfile).replace('.txt','') + '.colibri.dat'
            classfile = os.path.basename(inputfile).replace('.txt','') + '.colibri.cls'
            #if os.path.exists(corpusfiles[i]) and os.path.exists(classfiles[i]):
            #    print("Notice: Re-using previously generated corpusfile and classfile",file=sys.stderr)
            #    print("Loading source class encoder and decoder",file=sys.stderr)
            #    sourceencoders.append( ClassEncoder(classfiles[i]) )
            #    sourcedecoders.append( ClassDecoder(classfiles[i]) )
            #else:
            # Extend the training-time class encoding with the test (and dev)
            # vocabulary, then encode the corpora with the extended classes.
            print("Loading and extending source class encoder, from " + trainclassfile + " to " + classfile,file=sys.stderr)
            sourceencoders.append( ClassEncoder(trainclassfile) )
            sourceencoders[i].processcorpus(inputfile)
            if i == 0 and args.devinputfile:
                print("(including development corpus in extended class encoder)",file=sys.stderr)
                sourceencoders[i].processcorpus(args.devinputfile)
            sourceencoders[i].buildclasses()
            sourceencoders[i].save(classfile)
            print("Encoding test corpus, from " + inputfile + " to " + corpusfile,file=sys.stderr)
            sourceencoders[i].encodefile(inputfile, corpusfile)
            if i == 0 and args.devinputfile:
                print("Encoding development corpus, from " + args.devinputfile + " to " + args.devinputfile + '.colibri.dat',file=sys.stderr)
                sourceencoders[i].encodefile(args.devinputfile, args.devinputfile + '.colibri.dat')
            print("Loading source class decoder " + classfile,file=sys.stderr)
            sourcedecoder = ClassDecoder(classfile)
            print("Loading test corpus " + corpusfile,file=sys.stderr)
            l.append( Configuration( IndexedCorpus(corpusfile), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext']) )
        classifierconf['featureconf'] = l
    else:
        print("Loading source class decoders",file=sys.stderr)
        l = []
        for conf in classifierconf['featureconf']:
            sourcedecoder = ClassDecoder(conf['classdecoder'])
            # NOTE(review): unlike the branch above, this 'l' is never written
            # back into classifierconf['featureconf'] — confirm intent.
            l.append( Configuration( IndexedCorpus(), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext'] ) )

    # ---- Alignment model and pattern models over test/dev corpora ------
    if args.inputfile and args.alignmodelfile:
        print("Loading target encoder " + args.targetclassfile,file=sys.stderr)
        targetencoder = ClassEncoder(args.targetclassfile)
        print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
        targetdecoder = ClassDecoder(args.targetclassfile)
        print("Loading alignment model " + args.alignmodelfile ,file=sys.stderr)
        alignmodel = AlignmentModel(args.alignmodelfile)
        print("\tAlignment model has " + str(len(alignmodel)) + " source patterns",file=sys.stderr)
        print("Building patternmodel on test corpus " + classifierconf['featureconf'][0].corpus.filename() ,file=sys.stderr)
        options = PatternModelOptions(mintokens=1, maxlength=12, debug=True)
        # Constrain the test pattern model by the alignment model (third arg).
        testmodel = IndexedPatternModel(reverseindex=classifierconf['featureconf'][0].corpus)
        testmodel.train( "", options, alignmodel)
        print("\tTest model has " + str(len(testmodel)) + " source patterns",file=sys.stderr)
        #saving just so we can inspect it for debug purposes:
        testmodel.write( decodedir + '/test.colibri.indexedpatternmodel' )
        if args.devinputfile:
            print("Building patternmodel on development corpus " + args.devinputfile + ".colibri.dat" ,file=sys.stderr)
            devcorpus = IndexedCorpus(args.devinputfile + ".colibri.dat")
            print("Development corpus has " + str(devcorpus.sentences()) + " sentences")
            devmodel = IndexedPatternModel(reverseindex=devcorpus)
            devmodel.train( "", options, alignmodel)
            # NOTE(review): this prints len(testmodel), not len(devmodel) —
            # probably a copy/paste slip in the log message; confirm.
            print("\tDevelopment model has " + str(len(testmodel)) + " source patterns",file=sys.stderr)
            #saving just so we can inspect it for debug purposes:
            devmodel.write( decodedir + '/dev.colibri.indexedpatternmodel' )
        else:
            # Empty dict doubles as "pattern not in devmodel" always-False.
            devmodel = {}
        if args.reorderingtable:
            print("Loading reordering model (may take a while)",file=sys.stderr)
            rtable = PhraseTable(args.reorderingtable) #TODO: convert to colibri alignmodel
    elif args.train and args.inputfile:
        if not args.alignmodelfile:
            print("No alignment model specified (-a)",file=sys.stderr)
            sys.exit(2)
    elif not args.train:
        # Test mode needs both an input file and an alignment model.
        if not args.inputfile:
            print("No input file specified (-f)",file=sys.stderr)
        if not args.alignmodelfile:
            print("No alignment model specified (-a)",file=sys.stderr)
        sys.exit(2)

    if args.train:
        # ---- 3a. TRAINING mode ------------------------------------------
        #training mode
        if args.inputfile:
            print("Training classifiers (constrained by test data)",file=sys.stderr)
        else:
            print("Training all classifiers (you may want to constrain by test data using -f)",file=sys.stderr)
        if 'monolithic' in classifierconf and classifierconf['monolithic']:
            #monolithic
            trainfile = args.workdir + "/train"
            #build a classifier
            print("Training monolithic classifier " + trainfile,file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            if args.classifierdir:
                #ugly hack since we want ibases in a different location
                trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                shutil.copyfile(trainfile+".train", trainfilecopy+".train")
                trainfile = trainfilecopy
            classifier = timbl.TimblClassifier(trainfile, timbloptions)
            classifier.train()
            classifier.save()
            if args.classifierdir:
                #remove copy
                os.unlink(trainfile+".train")
            trained = 1
        else:
            #experts
            trained = 0
            for trainfile in itertools.chain(glob.glob(args.workdir + "/*.train"), glob.glob(args.workdir + "/.*.train")): #explicitly add 'dotfiles', will be skipped by default
                if args.inputfile:
                    # Train only experts whose source pattern occurs in the
                    # test or dev pattern model (filename is URL-quoted pattern).
                    sourcepattern_s = unquote_plus(os.path.basename(trainfile.replace('.train','')))
                    sourcepattern = sourceencoders[0].buildpattern(sourcepattern_s)
                    if not sourcepattern in testmodel and not sourcepattern in devmodel:
                        print("Skipping " + trainfile + " (\"" + sourcepattern_s + "\" not in test/dev model)",file=sys.stderr)
                        continue
                #build a classifier
                print("Training " + trainfile,file=sys.stderr)
                trained += 1
                timbloptions = gettimbloptions(args, classifierconf)
                if args.classifierdir:
                    #ugly hack since we want ibases in a different location
                    trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                    shutil.copyfile(trainfile, trainfilecopy)
                    trainfile = trainfilecopy
                classifier = timbl.TimblClassifier(trainfile.replace('.train',''), timbloptions)
                classifier.train()
                classifier.save()
                if args.classifierdir:
                    #remove copy
                    os.unlink(trainfile)
                if not os.path.exists(trainfile.replace(".train",".ibase")):
                    raise Exception("Resulting instance base " + trainfile.replace(".train",".ibase") + " not found!")
        # NOTE(review): uses args.classifierdir (may be ""), not the resolved
        # 'classifierdir' variable — would write to '/trained'; confirm.
        with open(args.classifierdir + '/trained','w',encoding='utf-8') as f:
            f.write(str(trained)+"\n")
    else:
        # ---- 3b. TEST mode ----------------------------------------------
        #TEST
        if not args.inputfile:
            print("Specify an input file (-f)",file=sys.stderr)
            sys.exit(2)
        if not args.mosesinclusive and not args.mosesexclusive:
            print("Writing intermediate test data to " + decodedir + "/test.txt",file=sys.stderr)
            #write intermediate test data (consisting only of indices AND unknown words) and
            f = open(decodedir + "/test.txt",'w',encoding='utf-8')
            for sentencenum, line in enumerate(classifierconf['featureconf'][0].corpus.sentences()):
                sentenceindex = sentencenum + 1
                print("@" + str(sentenceindex),file=sys.stderr)
                tokens = [] #actual string representations
                for tokenindex,pattern in enumerate(line): #will yield only unigrams
                    #is this an uncovered word that does not appear in the phrasetable? check using alignment model and keep the word untranslated if so
                    print("DEBUG: Processing pattern " + str(sentenceindex) + ":" + str(tokenindex) + ": "+ pattern.tostring(classifierconf['featureconf'][0].classdecoder),file=sys.stderr)
                    if not pattern in alignmodel:
                        print(" Found OOV at @" + str(sentenceindex) + ":" + str(tokenindex) + ": " + pattern.tostring(classifierconf['featureconf'][0].classdecoder), file=sys.stderr)
                        tokens.append(pattern.tostring(classifierconf['featureconf'][0].classdecoder))
                    else:
                        # Covered token: emit "sentence_token" index placeholder.
                        tokens.append(str(sentenceindex) + "_" + str(tokenindex))
                f.write(" ".join(tokens) + "\n")
            f.close()
        classifierindex = set()
        # NOTE(review): direct key access here vs the guarded
        # "'monolithic' in classifierconf and ..." used in training — KeyError
        # risk if the key is absent; confirm against full file.
        if classifierconf['monolithic']:
            print("Loading classifier index for monolithic classifier",file=sys.stderr)
            with open(args.workdir + "/sourcepatterns.list",'r',encoding='utf-8') as f:
                for line in f:
                    classifierindex.add(line.strip())
            print("Loading monolithic classifier " + classifierdir + "/train.train",file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            classifier = timbl.TimblClassifier(classifierdir + "/train", timbloptions)
        else:
            classifier = None
        if args.reorderingtable:
            print("Creating intermediate phrase-table and reordering-table",file=sys.stderr)
            freordering = open(decodedir + "/reordering-table", 'w',encoding='utf-8')
        else:
            print("Creating intermediate phrase-table",file=sys.stderr)
            freordering = None
        if args.mosesinclusive or args.mosesexclusive:
            #Use mosesserver with XML input method
            #write moses.ini
            if not args.tweight:
                # NOTE(review): always 4 here, whereas the later moses.ini
                # writer uses 5 when scorehandling == 'append' — confirm.
                lentweights = 4
                tweights = " ".join([str(1/(lentweights+1))]*lentweights)
            else:
                tweights = " ".join([ str(x) for x in args.tweight])
                lentweights = len(args.tweight)
            if os.path.exists(decodedir + "/moses.ini"):
                os.unlink(decodedir+"/moses.ini")
            print("Writing " + decodedir + "/moses.ini",file=sys.stderr)
            if args.reordering:
                reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
                reorderingweight = "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
            else:
                reorderingfeature = ""
                reorderingweight = ""
            #write moses.ini
            # NOTE(review): template line breaks reconstructed from a
            # whitespace-mangled source; layout follows Moses INI convention.
            f = open(decodedir + '/moses.ini','w',encoding='utf-8')
            f.write("""
#Moses INI, produced by contextmoses.py

[input-factors]
0

[mapping]
0 T 0

[distortion-limit]
6

[feature]
UnknownWordPenalty
WordPenalty
PhrasePenalty
PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
{reorderingfeature}
Distortion
SRILM name=LM0 factor=0 path={lm} order={lmorder}

[weight]
UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
{reorderingweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight = args.lmweight, dweight = args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight = args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))
            f.close()
            print("Starting Moses Server",file=sys.stderr)
            if args.mosesdir:
                cmd = args.mosesdir + '/bin/mosesserver'
            else:
                cmd = 'mosesserver'
            # NOTE(review): 'args.moses' is not defined by this parser —
            # would raise AttributeError when this path is reached; likely a
            # bug (perhaps meant args.mosesinclusive or args.mosesexclusive).
            if args.moses:
                if args.mosesinclusive:
                    cmd += " -xml-input inclusive" #compete with phrase-table
                elif args.mosesexclusive:
                    cmd += " -xml-input exclusive" #only used for passing verbatim L2 (tested whether it makes a difference with inclusive baseline on en-es data, it doesn't)
            cmd += ' -f ' + decodedir + '/moses.ini'
            print("Calling mosesserver: " + cmd,file=sys.stderr)
            p = subprocess.Popen(cmd,shell=True)
            mosesserverpid = p.pid
            # Poll until the server accepts connections on the chosen port.
            while True:
                time.sleep(5)
                try:
                    s = socket.socket()
                    s.connect( ("localhost", args.mosesport) )
                    break
                except Exception as e:
                    print("Waiting for Moses server....", e, file=sys.stderr)
            print("Connecting to Moses Server",file=sys.stderr)
            # NOTE(review): the span below is corrupted/redacted in the
            # available source ("http://*****:*****@"): a ServerProxy URL and
            # a later per-pattern progress print() have been fused, and the
            # loop scaffolding that defined ftable/tokenspan/sourcepattern/
            # featurevector/translationcount/sourcepatterncount/i/prevpattern
            # is missing. Tokens are preserved as found; this statement is
            # NOT functional as written — reconstruct from upstream history.
            mosesclient = xmlrpc.client.ServerProxy("http://*****:*****@" + str(i+1) + "/" + str(sourcepatterncount) + " -- Processing " + str(sentenceindex) + ":" + str(tokenindex) + " " + sourcepattern_s + " -- Features: " + str(repr(featurevector)),file=sys.stderr)
        # NOTE(review): in the full source the block below sits inside a loop
        # over source-pattern occurrences (lost in the redacted span above);
        # indentation here is a best-effort reconstruction.
        if classifier and not args.ignoreclassifier:
            if not classifierconf['monolithic'] or (classifierconf['monolithic'] and sourcepattern_s in classifierindex):
                print("\tClassifying",file=sys.stderr)
                #call classifier
                classlabel, distribution, distance = classifier.classify(featurevector)
                #process classifier result
                for targetpattern_s, score in distribution.items():
                    targetpattern = targetencoder.buildpattern(targetpattern_s)
                    if (sourcepattern, targetpattern) in alignmodel:
                        scorevector = [ x for x in alignmodel[(sourcepattern,targetpattern)] if isinstance(x,int) or isinstance(x,float) ] #make a copy
                    else:
                        continue
                    # Merge the classifier score into the phrase-table scores.
                    if args.scorehandling == 'append':
                        scorevector.append(score)
                    elif args.scorehandling == 'replace':
                        scorevector[2] = score
                    else:
                        raise NotImplementedError #TODO: implemented weighed!
                    translationcount += 1
                    #write phrasetable entries
                    ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in scorevector]) + "\n")
                    if freordering:
                        reordering_scores = None
                        try:
                            for t, sv in rtable[sourcepattern_s]:
                                if t == targetpattern_s:
                                    reordering_scores = sv
                        except KeyError:
                            if args.ignoreerrors:
                                print("******* ERROR ********* Source pattern notfound in reordering table: " + sourcepattern_s,file=sys.stderr)
                                continue
                            else:
                                raise Exception("Source pattern notfound in reordering table: " + sourcepattern_s)
                        if reordering_scores:
                            freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                        else:
                            if args.ignoreerrors:
                                print("******** ERROR ********* Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")",file=sys.stderr)
                                continue
                            else:
                                raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")
                if translationcount == 0:
                    print("\tNo overlap between classifier translations (" + str(len(distribution)) + ") and phrase table. Falling back to statistical baseline.",file=sys.stderr)
                    statistical = True
                else:
                    print("\t\t" + str(translationcount) + " translation options written",file=sys.stderr)
                    statistical = False
            else:
                print("\tNot in classifier. Falling back to statistical baseline.",file=sys.stderr)
                statistical = True
        else:
            statistical = True
        if statistical:
            print("\tPhrasetable lookup",file=sys.stderr)
            #ignore classifier or no classifier present for this item
            for targetpattern in alignmodel.targetpatterns(sourcepattern):
                scorevector = [ x for x in alignmodel[(sourcepattern,targetpattern)] if isinstance(x,int) or isinstance(x,float) ] #make a copy
                if args.scorehandling == 'append':
                    # Baseline fallback: duplicate p(t|s) as the appended score.
                    scorevector.append(scorevector[2])
                elif args.scorehandling == 'replace':
                    pass #nothing to do, scorevector is okay as it is
                elif args.scorehandling == 'weighed':
                    raise NotImplementedError #TODO: implemented weighed!
                translationcount += 1
                #write phrasetable entries
                targetpattern_s = targetpattern.tostring(targetdecoder)
                ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([ str(x) for x in scorevector]) + "\n")
                if freordering:
                    reordering_scores = None
                    try:
                        for t, sv in rtable[sourcepattern_s]:
                            if t == targetpattern_s:
                                reordering_scores = sv
                    except KeyError:
                        if args.ignoreerrors:
                            print("******** ERROR ******* Source pattern not found in reordering table: " + sourcepattern_s,file=sys.stderr)
                            continue
                        else:
                            raise Exception("Source pattern not found in reordering table: " + sourcepattern_s)
                    if reordering_scores:
                        freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                    else:
                        if args.ignoreerrors:
                            print("******* ERROR ****** Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")",file=sys.stderr)
                            continue
                        else:
                            raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")
            print("\t\t" + str(translationcount) + " translation options written",file=sys.stderr)
        prevpattern = None
        ftable.close()
        if freordering:
            freordering.close()

        # ---- 4. Write moses.ini and run the decoder / MERT --------------
        if not args.tweight:
            # Default: uniform translation weights; one extra score column
            # exists when the classifier score was appended.
            if args.scorehandling == "append":
                lentweights = 5
            else:
                lentweights = 4
            tweights = " ".join([str(1/(lentweights+1))]*lentweights)
        else:
            tweights = " ".join([ str(x) for x in args.tweight])
            lentweights = len(args.tweight)
        print("Writing " + decodedir + "/moses.ini",file=sys.stderr)
        if args.reordering:
            reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
            if not args.reorderingweight:
                reorderingweight = "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
            else:
                reorderingweight = "LexicalReordering0= " + " ".join([str(x) for x in args.reorderingweight])
        else:
            reorderingfeature = ""
            reorderingweight = ""
        #write moses.ini
        # NOTE(review): template line breaks reconstructed from a
        # whitespace-mangled source; layout follows Moses INI convention.
        f = open(decodedir + '/moses.ini','w',encoding='utf-8')
        f.write("""
#Moses INI, produced by contextmoses.py

[input-factors]
0

[mapping]
0 T 0

[distortion-limit]
6

[feature]
UnknownWordPenalty
WordPenalty
PhrasePenalty
PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
{reorderingfeature}
Distortion
SRILM name=LM0 factor=0 path={lm} order={lmorder}

[weight]
UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
{reorderingweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight = args.lmweight, dweight = args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight = args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))
        f.close()
        if not args.skipdecoder:
            if args.mert:
                # MERT needs an absolute path to the reference corpus.
                if args.ref[0] == '/':
                    ref = args.ref
                else:
                    ref = os.getcwd() + '/' + args.ref
                for mertrun in range(1,args.mert+1):
                    if os.path.exists(decodedir+"/mert-work-" + str(mertrun) +"/moses.ini"):
                        print("Mert run #" + str(mertrun) + " already ran, skipping...",file=sys.stderr)
                    else:
                        #invoke mert
                        cmd = args.mosesdir + "/scripts/training/mert-moses.pl --working-dir=" + decodedir + "/mert-work-" + str(mertrun) + " --mertdir=" + args.mosesdir + '/mert/' + ' --decoder-flags="-threads ' + str(args.threads) + '" ' + decodedir + "/test.txt " + ref + " `which moses` " + decodedir + "/moses.ini --threads=" + str(args.threads)
                        print("Contextmoses calling mert #" + str(mertrun) + ": " + cmd,file=sys.stderr)
                        r = subprocess.call(cmd, shell=True)
                        if r != 0:
                            print("Contextmoses called mert #" + str(mertrun) + " but failed!", file=sys.stderr)
                            sys.exit(1)
                        print("DONE: Contextmoses calling mert #" + str(mertrun)+": " + cmd,file=sys.stderr)
            else:
                #invoke moses
                cmd = EXEC_MOSES + " -threads " + str(args.threads) + " -f " + decodedir + "/moses.ini < " + decodedir + "/test.txt > " + decodedir + "/output.txt"
                print("Contextmoses calling moses: " + cmd,file=sys.stderr)
                r = subprocess.call(cmd, shell=True)
                if r != 0:
                    print("Contextmoses called moses but failed!", file=sys.stderr)
                    sys.exit(1)
                print("DONE: Contextmoses calling moses: " + cmd,file=sys.stderr)
        else:
            print("Contextmoses skipping decoder",file=sys.stderr)