Exemplo n.º 1
def main():
    # Command-line entry point. Stages visible in this listing, in order:
    #   1. parse options and resolve the classifier/decode directories;
    #   2. unpickle <workdir>/classifier.conf and, when -f is given, encode
    #      each input file and wrap it in a Configuration per factor;
    #   3. with -f and -a: load the target classes and the alignment model and
    #      build pattern models on the test (and development) corpus;
    #   4. with --train: build Timbl classifiers from the *.train files;
    #   5. write test.txt, phrase-table entries (taken from the classifier
    #      distribution, or from the alignment model as statistical fallback),
    #      optional reordering-table entries and moses.ini;
    #   6. run MERT and/or Moses through the shell.
    # NOTE(review): this listing does not parse as-is. It appears to have lost
    # lines (several `else:`/`try:` headers, some block bodies and the opening
    # of both moses.ini template strings). The comments below only flag those
    # spots; the missing code is not guessed at.
    # The stretch between the two template `.format(...)` calls carries no
    # inline notes: with the template openings missing, their triple quotes
    # pair up with each other and enclose that code.
    parser = argparse.ArgumentParser(description="Wrapper around the Moses Decoder that adds support for context features through classifiers.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f','--inputfile', type=str,help="Input text file; the test corpus (plain text, tokenised, one sentence per line), may be specified multiple times for each factor", action='append',required=False)
    parser.add_argument('-d','--devinputfile', type=str,help="Extra input text file to consider when training classifiers; the development corpus (plain text, tokenised, one sentence per line)", action='store',required=False)
    parser.add_argument('-S','--sourceclassfile', type=str, help="Source class file", action='store',required=True)
    parser.add_argument('-T','--targetclassfile', type=str, help="Target class file", action='store',required=True)
    parser.add_argument('-a','--alignmodelfile', type=str,help="Colibri alignment model (made from phrase translation table)", action='store',default="",required=False)
    parser.add_argument('-w','--workdir', type=str,help="Working directory, should contain classifier training files", action='store',default="",required=True)
    parser.add_argument('--train', help="Train classifiers", action="store_true", default=False)
    #parser.add_argument('-O','--timbloptions', type=str, help="Options for the Timbl classifier", action="store", default="-a 0 -k 1")
    parser.add_argument('--ta', type=str, help="Timbl algorithm", action="store", default="0")
    parser.add_argument('--tk', type=str, help="Timbl k value", action="store", default="1")
    parser.add_argument('--tw', type=str, help="Timbl weighting", action="store", default="gr")
    parser.add_argument('--tm', type=str, help="Timbl feature metrics", action="store", default="O")
    parser.add_argument('--td', type=str, help="Timbl distance metric", action="store", default="Z")
    parser.add_argument('-I','--ignoreclassifier', help="Ignore classifier (for testing bypass method)", action="store_true", default=False)
    parser.add_argument('-H','--scorehandling', type=str, help="Score handling, can be 'append' (default), 'replace', or 'weighed'", action="store", default="append")
    parser.add_argument('--mosesinclusive',help="Pass full sentences through through Moses server using XML input (will start a moses server, requires --moseslm). Classifier output competes with normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true',default=False)
    parser.add_argument('--mosesexclusive',help="Pass full sentences through through Moses server using XML input (will start a moses server, requires --moseslm). Classifier does NOT compete with normal translation table. Score handling (-H) has no effect as only the classifier score will be passed.", action='store_true',default=False)
    parser.add_argument('--mosesdir', type=str,help='Path to Moses directory (required for MERT)', default="")
    parser.add_argument('--mert', type=int,help="Do MERT parameter tuning, set to number of MERT runs to perform", required=False, default=0)
    parser.add_argument('--threads', type=int, default=1, help="Number of threads to use for Moses or Mert")
    parser.add_argument('--reordering', type=str,action="store",help="Reordering type (use with --reorderingtable)", required=False)
    parser.add_argument('--reorderingtable', type=str,action="store",help="Use reordering table (use with --reordering)", required=False)
    parser.add_argument('--ref', type=str,action="store",help="Reference corpus (target corpus, plain text)", required=False)
    parser.add_argument('--lm', type=str, help="Language Model", action="store", default="", required=False)
    parser.add_argument('--lmorder', type=int, help="Language Model order", action="store", default=3, required=False)
    parser.add_argument('--lmweight', type=float, help="Language Model weight", action="store", default=0.5, required=False)
    parser.add_argument('--dweight', type=float, help="Distortion Model weight", action="store", default=0.3, required=False)
    parser.add_argument('--wweight', type=float, help="Word penalty weight", action="store", default=-1, required=False)
    parser.add_argument('--tweight', type=float, help="Translation Model weight (may be specified multiple times for each score making up the translation model)", action="append", required=False)
    parser.add_argument('--reorderingweight', type=float, help="Reordering Model weight (may be specified multiple times for each score making up the reordering model)", action="append", required=False)
    parser.add_argument('--pweight', type=float, help="Phrase penalty", default=0.2, action="store", required=False)
    parser.add_argument('--classifierdir', type=str,help="Trained classifiers, intermediate phrase-table and test file will be written here (only specify if you want a different location than the work directory)", action='store',default="",required=False)
    parser.add_argument('--decodedir', type=str,help="Moses output will be written here (only specify if you want a different location than the work directory)", action='store',default="",required=False)
    parser.add_argument('--ignoreerrors',action="store_true",help="Attempt to ignore errors",default=False)
    parser.add_argument('--mosesport',type=int, help="Port for Moses server (will be started for you), if -Z is enabled",action='store',default=8080)
    # NOTE(review): args.moses and args.skipdecoder are read further down, but
    # no add_argument call for them is visible above; likewise the help texts
    # mention --moseslm and -Z, which are not defined here (only --lm is).
    args = parser.parse_args()
    #args.storeconst, args.dataset, args.num, args.bar

    # Sanity check on the work directory. Only a message is printed here; no
    # exit or raise is visible, so execution continues regardless.
    if not os.path.isdir(args.workdir) or not os.path.exists(args.workdir + '/classifier.conf'):
        print("Work directory " + args.workdir + " or classifier configuration therein does not exist. Did you extract features and create classifier training files using colibri-extractfeatures?" ,file=sys.stderr)

    # NOTE(review): as listed, the work directory always overwrites
    # --classifierdir, and classifierdir stays unassigned when --classifierdir
    # is empty; an `else:` before the second assignment looks to be missing.
    if args.classifierdir:
        classifierdir = args.classifierdir
        classifierdir = args.workdir

    # Make classifierdir absolute: empty means the current directory, anything
    # not starting with '/' is taken relative to the current directory.
    if not classifierdir:
        classifierdir = os.getcwd()
    elif classifierdir and classifierdir[0] != '/':
        classifierdir = os.getcwd() + '/' + classifierdir

    # --mert prerequisites; again these are only reported on stderr.
    if args.mert and not args.mosesdir:
        print("--mert requires --mosesdir to be set",file=sys.stderr)
    if args.mert and not args.ref:
        print("--mert requires --ref to be set",file=sys.stderr)

    # Same resolution as for classifierdir, with the same apparently missing
    # `else:` before the second assignment.
    if args.decodedir:
        decodedir = args.decodedir
        decodedir = args.workdir

    if not decodedir:
        decodedir = os.getcwd()
    elif decodedir and decodedir[0] != '/':
        decodedir = os.getcwd() + '/' + decodedir

    print("Loading configuration (training corpora and class decoders)",file=sys.stderr)
    # The configuration is a pickle; the file handle is not closed in the
    # visible code.
    # NOTE(review): pickle.load can execute arbitrary code, so classifier.conf
    # must come from a trusted source.
    f = open(args.workdir + '/classifier.conf','rb')
    classifierconf = pickle.load(f)

    print("Configuration: ", classifierconf,file=sys.stderr)

    # Every factor-feature in the configuration needs its own -f input file.
    if args.inputfile:
        if len(classifierconf['featureconf']) > len(args.inputfile):
            raise Exception("Number of input files (" + str(len(args.inputfile)) + ") is less than the number of factor-features in configuration (" + str(len(classifierconf['featureconf'])) + "), you need to specify all")

    #one for each factor
    sourceencoders = []

    if args.inputfile:
        # Collects one Configuration per (input file, feature configuration) pair.
        l = []
        for i, (inputfile, conf) in enumerate(zip(args.inputfile, classifierconf['featureconf'])):
            trainclassfile = conf['classdecoder']
            print("Processing factor #" + str(i),file=sys.stderr)
            #process inputfile
            # Output names are built from the input's basename with every
            # '.txt' removed, so they carry no directory component.
            corpusfile =   os.path.basename(inputfile).replace('.txt','') + '.colibri.dat'
            classfile = os.path.basename(inputfile).replace('.txt','') + '.colibri.cls'

            #if os.path.exists(corpusfiles[i]) and os.path.exists(classfiles[i]):
            #    print("Notice: Re-using previously generated corpusfile and classfile",file=sys.stderr)
            #    print("Loading source class encoder and decoder",file=sys.stderr)
            #    sourceencoders.append( ClassEncoder(classfiles[i]) )
            #    sourcedecoders.append( ClassDecoder(classfiles[i]) )
            # NOTE(review): the messages speak of extending the encoder and of
            # including the development corpus, but no call doing so (nor one
            # writing classfile, which is loaded below) is visible here;
            # presumably those lines were lost.
            print("Loading and extending source class encoder, from " + trainclassfile + " to " + classfile,file=sys.stderr)
            sourceencoders.append( ClassEncoder(trainclassfile) )
            if i == 0 and args.devinputfile:
                print("(including development corpus in extended class encoder)",file=sys.stderr)
            print("Encoding test corpus, from " + inputfile + " to " + corpusfile,file=sys.stderr)
            sourceencoders[i].encodefile(inputfile, corpusfile)
            # The development corpus is encoded with the first factor's encoder
            # only, next to the original file (suffix '.colibri.dat').
            if i == 0 and args.devinputfile:
                print("Encoding development corpus, from " + args.devinputfile + " to " + args.devinputfile + '.colibri.dat',file=sys.stderr)
                sourceencoders[i].encodefile(args.devinputfile, args.devinputfile + '.colibri.dat')
            print("Loading source class decoder " + classfile,file=sys.stderr)
            sourcedecoder = ClassDecoder(classfile)

            print("Loading test corpus " + corpusfile,file=sys.stderr)

            # Configuration is not defined in this function; it is given the
            # corpus, the decoder and the context/focus settings from conf.
            l.append( Configuration( IndexedCorpus(corpusfile), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext']) )

        # From here on featureconf holds Configuration objects instead of the
        # entries that were unpickled.
        classifierconf['featureconf'] = l

        # NOTE(review): the loop below looks like the body of a missing `else:`
        # (no input file given). As listed it runs right after the replacement
        # above, subscripts the new Configuration objects with string keys, and
        # its result list is never stored.
        print("Loading source class decoders",file=sys.stderr)
        l = []
        for conf in classifierconf['featureconf']:
            sourcedecoder = ClassDecoder(conf['classdecoder'])
            l.append( Configuration( IndexedCorpus(), sourcedecoder, conf['leftcontext'], conf['focus'], conf['rightcontext'] ) )

    # Models are only loaded when both an input file and an alignment model
    # were given.
    if args.inputfile and args.alignmodelfile:

        print("Loading target encoder " + args.targetclassfile,file=sys.stderr)
        targetencoder = ClassEncoder(args.targetclassfile)
        print("Loading target decoder " + args.targetclassfile,file=sys.stderr)
        targetdecoder = ClassDecoder(args.targetclassfile)

        print("Loading alignment model " + args.alignmodelfile ,file=sys.stderr)
        alignmodel = AlignmentModel(args.alignmodelfile)
        print("\tAlignment model has " + str(len(alignmodel)) + " source patterns",file=sys.stderr)

        print("Building patternmodel on test corpus " + classifierconf['featureconf'][0].corpus.filename() ,file=sys.stderr)
        options = PatternModelOptions(mintokens=1, maxlength=12, debug=True)
        # The first factor's corpus serves as reverse index; train() gets an
        # empty filename plus the alignment model as third argument
        # (presumably as a constraint model - confirm against colibri-core).
        testmodel = IndexedPatternModel(reverseindex=classifierconf['featureconf'][0].corpus)
        testmodel.train( "", options, alignmodel)
        print("\tTest model has " + str(len(testmodel)) + " source patterns",file=sys.stderr)

        #saving just so we can inspect it for debug purposes:
        testmodel.write(  decodedir + '/test.colibri.indexedpatternmodel'  )

        if args.devinputfile:
            print("Building patternmodel on development corpus " + args.devinputfile + ".colibri.dat" ,file=sys.stderr)
            devcorpus = IndexedCorpus(args.devinputfile + ".colibri.dat")
            # This message goes to stdout, unlike the surrounding ones.
            print("Development corpus has " + str(devcorpus.sentences()) + " sentences")
            devmodel = IndexedPatternModel(reverseindex=devcorpus)
            devmodel.train( "", options, alignmodel)
            # NOTE(review): the count printed here is len(testmodel), not
            # len(devmodel).
            print("\tDevelopment model has " + str(len(testmodel)) + " source patterns",file=sys.stderr)

            #saving just so we can inspect it for debug purposes:
            devmodel.write(  decodedir + '/dev.colibri.indexedpatternmodel'  )
            # NOTE(review): as listed this discards the model built above; it
            # looks like the body of a missing `else:` that supplies an empty
            # stand-in for the membership test done during training.
            devmodel = {}

        if args.reorderingtable:
            print("Loading reordering model (may take a while)",file=sys.stderr)
            rtable = PhraseTable(args.reorderingtable) #TODO: convert to colibri alignmodel

    # Missing-option diagnostics; only messages are visible, no exit.
    elif args.train and args.inputfile:
        if not args.alignmodelfile:
            print("No alignment model specified (-a)",file=sys.stderr)
    elif not args.train:
        if not args.inputfile:
            print("No input file specified (-f)",file=sys.stderr)
        if not args.alignmodelfile:
            print("No alignment model specified (-a)",file=sys.stderr)

    if args.train:
        #training mode
        # NOTE(review): both messages are printed as listed; the second one
        # reads like the `else:` branch.
        if args.inputfile:
            print("Training classifiers (constrained by test data)",file=sys.stderr)
            print("Training all classifiers (you may want to constrain by test data using -f)",file=sys.stderr)
        # Monolithic setup: a single classifier on <workdir>/train.
        if 'monolithic' in classifierconf and classifierconf['monolithic']:
            trainfile = args.workdir + "/train"
            #build a classifier
            print("Training monolithic classifier " + trainfile,file=sys.stderr)
            # gettimbloptions is defined outside this function.
            timbloptions = gettimbloptions(args, classifierconf)
            if args.classifierdir:
                #ugly hack since we want ibases in a different location
                trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                shutil.copyfile(trainfile+".train", trainfilecopy+".train")
                trainfile = trainfilecopy
            # NOTE(review): only the constructor call is visible; any
            # train/save calls on the classifier appear to be missing.
            classifier = timbl.TimblClassifier(trainfile, timbloptions)
            # NOTE(review): this `if` has no body left (only the comment), and
            # the two assignments to `trained` suggest a missing `else:` that
            # introduced the per-pattern loop below.
            if args.classifierdir:
                #remove copy
            trained = 1
            trained = 0
            for trainfile in itertools.chain(glob.glob(args.workdir + "/*.train"), glob.glob(args.workdir + "/.*.train")): #explicitly add 'dotfiles', will be skipped by default
                if args.inputfile:
                    # The source pattern is recovered from the URL-quoted file
                    # name and encoded with the first factor's encoder.
                    sourcepattern_s = unquote_plus(os.path.basename(trainfile.replace('.train','')))
                    sourcepattern = sourceencoders[0].buildpattern(sourcepattern_s)
                    # NOTE(review): the message says "Skipping", but no
                    # `continue` is visible, so training proceeds as listed.
                    if not sourcepattern in testmodel and not sourcepattern in devmodel:
                        print("Skipping " + trainfile + " (\"" + sourcepattern_s + "\" not in test/dev model)",file=sys.stderr)

                #build a classifier
                print("Training " + trainfile,file=sys.stderr)
                trained += 1
                timbloptions = gettimbloptions(args, classifierconf)
                if args.classifierdir:
                    #ugly hack since we want ibases in a different location
                    trainfilecopy = trainfile.replace(args.workdir, args.classifierdir)
                    shutil.copyfile(trainfile, trainfilecopy)
                    trainfile = trainfilecopy
                classifier = timbl.TimblClassifier(trainfile.replace('.train',''), timbloptions)
                # NOTE(review): body missing here as well (only the comment is left).
                if args.classifierdir:
                    #remove copy
                # A run counts as successful only if the .ibase file exists.
                if not os.path.exists(trainfile.replace(".train",".ibase")):
                    raise Exception("Resulting instance base " + trainfile.replace(".train",".ibase") + " not found!")

        # NOTE(review): the body of this `with` is missing. The path is built
        # from args.classifierdir rather than the resolved classifierdir, so it
        # becomes '/trained' when --classifierdir is left at its default.
        with open(args.classifierdir + '/trained','w',encoding='utf-8') as f:

        if not args.inputfile:
            print("Specify an input file (-f)",file=sys.stderr)

        # Without a Moses server, the decoder input is written to test.txt.
        if not args.mosesinclusive and not args.mosesexclusive:
            print("Writing intermediate test data to " + decodedir + "/test.txt",file=sys.stderr)
            #write intermediate test data (consisting only of indices AND unknown words) and
            # The handle is not closed in the visible code.
            f = open(decodedir + "/test.txt",'w',encoding='utf-8')
            for sentencenum, line in enumerate(classifierconf['featureconf'][0].corpus.sentences()):
                # Sentence numbers are 1-based, token indices 0-based.
                sentenceindex = sentencenum + 1
                print("@" + str(sentenceindex),file=sys.stderr)
                tokens = [] #actual string representations
                for tokenindex,pattern in enumerate(line): #will yield only unigrams
                    #is this an uncovered word that does not appear in the phrasetable? check using alignment model and keep the word untranslated if so
                    # NOTE(review): as listed, the "<sentence>_<token>" index is
                    # appended only for out-of-vocabulary words and nothing is
                    # appended otherwise; that contradicts the comment above,
                    # so an `else:` and one append look to be missing.
                    if not pattern in alignmodel:
                        print("     Found OOV at @" + str(sentenceindex) + ":" + str(tokenindex) + ": " + pattern.tostring(classifierconf['featureconf'][0].classdecoder), file=sys.stderr)
                        tokens.append(str(sentenceindex) + "_" + str(tokenindex))
                f.write(" ".join(tokens) + "\n")

        classifierindex = set()
        # Unlike in the training branch, 'monolithic' is subscripted directly
        # here, without checking that the key exists.
        if classifierconf['monolithic']:
            print("Loading classifier index for monolithic classifier",file=sys.stderr)

            # NOTE(review): the loop body is missing; presumably it filled
            # classifierindex from sourcepatterns.list.
            with open(args.workdir + "/sourcepatterns.list",'r',encoding='utf-8') as f:
                for line in f:

            print("Loading monolithic classifier " + classifierdir + "/train.train",file=sys.stderr)
            timbloptions = gettimbloptions(args, classifierconf)
            classifier = timbl.TimblClassifier(classifierdir + "/train", timbloptions)
            # NOTE(review): as listed the classifier is dropped again right
            # away; this assignment looks like the missing `else:` branch.
            classifier = None

        # NOTE(review): same pattern - the reordering-table handle is opened
        # and then replaced by None; an `else:` before the second message looks
        # to be missing.
        if args.reorderingtable:
            print("Creating intermediate phrase-table and reordering-table",file=sys.stderr)
            freordering = open(decodedir + "/reordering-table", 'w',encoding='utf-8')
            print("Creating intermediate phrase-table",file=sys.stderr)
            freordering = None

        if args.mosesinclusive or args.mosesexclusive:
            #Use mosesserver with XML input method

            #write mos
            # Default: four translation weights of 1/(4+1) each.
            # NOTE(review): the two lines after that read like the `else:`
            # branch; as listed they also run when --tweight was not given, in
            # which case args.tweight is None.
            if not args.tweight:
                lentweights = 4
                tweights = " ".join([str(1/(lentweights+1))]*lentweights)
                tweights = " ".join([ str(x) for x in args.tweight])
                lentweights = len(args.tweight)

            # NOTE(review): the body of this `if` is missing.
            if os.path.exists(decodedir + "/moses.ini"):

            print("Writing " + decodedir + "/moses.ini",file=sys.stderr)

            # NOTE(review): as listed both values end up empty; the last two
            # assignments look like the missing `else:` branch.
            if args.reordering:
                reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
                reorderingweight =  "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
                reorderingfeature = ""
                reorderingweight = ""

            #write moses.ini
            # NOTE(review): the statement that opened the template string
            # (presumably an f.write call) is missing. The lines from here down
            # to the `.format(...)` call are moses.ini template text, not
            # Python. The format call supplies reorderingfeature and
            # reorderingweight, for which no placeholder is left in the visible
            # template.
            f = open(decodedir + '/moses.ini','w',encoding='utf-8')
#Moses INI, produced by contextmoses.py

0 T 0


PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
SRILM name=LM0 factor=0 path={lm} order={lmorder}

UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight = args.lmweight, dweight = args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight = args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))

            print("Starting Moses Server",file=sys.stderr)
            if args.mosesdir:
                cmd = args.mosesdir + '/bin/mosesserver'
                cmd = 'mosesserver'
            if args.moses:
                if args.mosesinclusive:
                    cmd += " -xml-input inclusive" #compete with phrase-table
                elif args.mosesexclusive:
                    cmd += " -xml-input exclusive" #only used for passing verbatim L2 (tested whether it makes a difference with inclusive baseline on en-es data, it doesn't)
            cmd += ' -f ' + decodedir + '/moses.ini'
            print("Calling mosesserver: " + cmd,file=sys.stderr)

            p = subprocess.Popen(cmd,shell=True)
            mosesserverpid = p.pid

            while True:
                    s = socket.socket()
                    s.connect( ("localhost", args.mosesport) )
                except Exception as e:
                    print("Waiting for Moses server....", e, file=sys.stderr)

            print("Connecting to Moses Server",file=sys.stderr)
            mosesclient = xmlrpc.client.ServerProxy("http://*****:*****@" + str(i+1) + "/" + str(sourcepatterncount)  + " -- Processing " + str(sentenceindex) + ":" + str(tokenindex) + " " + sourcepattern_s + " -- Features: " + str(repr(featurevector)),file=sys.stderr)

                    if classifier and not args.ignoreclassifier:
                        if not classifierconf['monolithic'] or (classifierconf['monolithic'] and sourcepattern_s in classifierindex):

                            #call classifier
                            classlabel, distribution, distance = classifier.classify(featurevector)

                            #process classifier result
                            for targetpattern_s, score in distribution.items():
                                targetpattern = targetencoder.buildpattern(targetpattern_s)
                                if (sourcepattern, targetpattern) in alignmodel:
                                    scorevector = [ x for x in alignmodel[(sourcepattern,targetpattern)] if isinstance(x,int) or isinstance(x,float) ] #make a copy

                                if args.scorehandling == 'append':
                                elif args.scorehandling == 'replace':
                                    scorevector[2] = score
                                    raise NotImplementedError #TODO: implemented weighed!

                                translationcount += 1

                                #write phrasetable entries
                                ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in scorevector]) + "\n")
                                if freordering:
                                    reordering_scores = None
                                        for t, sv in rtable[sourcepattern_s]:
                                            if t == targetpattern_s:
                                                reordering_scores = sv
                                    except KeyError:
                                        if args.ignoreerrors:
                                            print("******* ERROR ********* Source pattern notfound in reordering table: " + sourcepattern_s,file=sys.stderr)
                                            raise Exception("Source pattern notfound in reordering table: " + sourcepattern_s)

                                    if reordering_scores:
                                        freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                                        if args.ignoreerrors:
                                            print("******** ERROR ********* Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")",file=sys.stderr)
                                            raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")

                            if translationcount == 0:
                                print("\tNo overlap between classifier translations (" + str(len(distribution)) + ") and phrase table. Falling back to statistical baseline.",file=sys.stderr)
                                statistical = True
                                print("\t\t" + str(translationcount) + " translation options written",file=sys.stderr)
                                statistical = False
                            print("\tNot in classifier. Falling back to statistical baseline.",file=sys.stderr)
                            statistical = True

                        statistical = True

                    if statistical:
                        print("\tPhrasetable lookup",file=sys.stderr)
                        #ignore classifier or no classifier present for this item
                        for targetpattern in alignmodel.targetpatterns(sourcepattern):
                            scorevector = [ x for x in alignmodel[(sourcepattern,targetpattern)] if isinstance(x,int) or isinstance(x,float) ] #make a copy

                            if args.scorehandling == 'append':
                            elif args.scorehandling == 'replace':
                                pass #nothing to do, scorevector is okay as it is
                            elif args.scorehandling == 'weighed':
                                raise NotImplementedError #TODO: implemented weighed!

                            translationcount += 1

                            #write phrasetable entries
                            targetpattern_s = targetpattern.tostring(targetdecoder)
                            ftable.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([ str(x) for x in scorevector]) + "\n")
                            if freordering:
                                reordering_scores = None
                                    for t, sv in rtable[sourcepattern_s]:
                                        if t == targetpattern_s:
                                            reordering_scores = sv
                                except KeyError:
                                    if args.ignoreerrors:
                                        print("******** ERROR ******* Source pattern not found in reordering table: " + sourcepattern_s,file=sys.stderr)
                                        raise Exception("Source pattern not found in reordering table: " + sourcepattern_s)

                                if reordering_scores:
                                    freordering.write(tokenspan + " ||| " + targetpattern_s + " ||| " + " ".join([str(x) for x in reordering_scores]) + "\n")
                                    if args.ignoreerrors:
                                            print("******* ERROR ****** Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")",file=sys.stderr)
                                        raise Exception("Target pattern not found in reordering table: " + targetpattern_s + " (for source " + sourcepattern_s + ")")

                        print("\t\t" + str(translationcount) + " translation options written",file=sys.stderr)

                prevpattern = None

            if freordering:

            if not args.tweight:
                if args.scorehandling == "append":
                    lentweights = 5
                    lentweights = 4
                tweights = " ".join([str(1/(lentweights+1))]*lentweights)
                tweights = " ".join([ str(x) for x in args.tweight])
                lentweights = len(args.tweight)

            print("Writing " + decodedir + "/moses.ini",file=sys.stderr)

            if args.reordering:
                reorderingfeature = "LexicalReordering name=LexicalReordering0 num-features=6 type=" + args.reordering + " input-factor=0 output-factor=0 path=" + decodedir + "/reordering-table"
                if not args.reorderingweight:
                    reorderingweight =  "LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3"
                    reorderingweight =  "LexicalReordering0= " + " ".join([str(x) for x in args.reorderingweight])
                reorderingfeature = ""
                reorderingweight = ""

            #write moses.ini
            f = open(decodedir + '/moses.ini','w',encoding='utf-8')
#Moses INI, produced by contextmoses.py

0 T 0


PhraseDictionaryMemory name=TranslationModel0 num-features={lentweights} path={phrasetable} input-factor=0 output-factor=0 table-limit=20
SRILM name=LM0 factor=0 path={lm} order={lmorder}

UnknownWordPenalty0= 1
WordPenalty0= {wweight}
PhrasePenalty0= {pweight}
LM0= {lmweight}
TranslationModel0= {tweights}
Distortion0= {dweight}
""".format(phrasetable=decodedir + "/phrase-table", lm=args.lm, lmorder=args.lmorder, lmweight = args.lmweight, dweight = args.dweight, tweights=tweights, lentweights=lentweights, wweight=args.wweight, pweight = args.pweight, reorderingfeature=reorderingfeature, reorderingweight=reorderingweight))


            # NOTE(review): no --skipdecoder option is defined by the parser at
            # the top of this function.
            if not args.skipdecoder:
                if args.mert:
                    # Resolve the reference corpus to an absolute path.
                    # NOTE(review): as listed the second assignment always
                    # wins; an `else:` looks to be missing.
                    if args.ref[0] == '/':
                        ref = args.ref
                        ref = os.getcwd() + '/' + args.ref

                    # Runs are numbered from 1; a run whose
                    # mert-work-<n>/moses.ini already exists is reported as done.
                    # NOTE(review): an `else:` before the invocation seems to
                    # be missing, so as listed the run is started anyway.
                    for mertrun in range(1,args.mert+1):
                        if os.path.exists(decodedir+"/mert-work-" + str(mertrun) +"/moses.ini"):
                            print("Mert run #" + str(mertrun) + " already ran, skipping...",file=sys.stderr)
                            #invoke mert
                            # The command is one concatenated string executed
                            # with shell=True, so paths containing spaces or
                            # shell metacharacters are not safe here.
                            cmd = args.mosesdir + "/scripts/training/mert-moses.pl --working-dir=" + decodedir + "/mert-work-" + str(mertrun) + " --mertdir=" + args.mosesdir + '/mert/' + ' --decoder-flags="-threads ' + str(args.threads) + '" ' + decodedir + "/test.txt " + ref + " `which moses` " + decodedir + "/moses.ini --threads=" + str(args.threads)
                            print("Contextmoses calling mert #" + str(mertrun) + ": " + cmd,file=sys.stderr)
                            r = subprocess.call(cmd, shell=True)
                            # A non-zero exit status is only reported; the
                            # "DONE" message follows in either case.
                            if r != 0:
                                print("Contextmoses called mert #" + str(mertrun) + " but failed!", file=sys.stderr)
                            print("DONE: Contextmoses calling mert #" + str(mertrun)+": " + cmd,file=sys.stderr)
                    #invoke moses
                    # EXEC_MOSES is not defined in this function (presumably a
                    # module-level constant). Input is read from test.txt and
                    # output redirected to output.txt by the shell.
                    # NOTE(review): as listed this sits inside `if args.mert:`;
                    # it reads like the missing `else:` branch.
                    cmd = EXEC_MOSES + " -threads " + str(args.threads) + " -f " + decodedir + "/moses.ini < " + decodedir + "/test.txt > " + decodedir + "/output.txt"
                    print("Contextmoses calling moses: " + cmd,file=sys.stderr)
                    r = subprocess.call(cmd, shell=True)
                    if r != 0:
                        print("Contextmoses called moses but failed!", file=sys.stderr)
                    print("DONE: Contextmoses calling moses: " + cmd,file=sys.stderr)

                # NOTE(review): printed inside the "not skipping" branch as
                # listed; an `else:` looks to be missing.
                print("Contextmoses skipping decoder",file=sys.stderr)
Exemplo n.º 2
def extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
    """Yield aligned source/target pattern pairs, sentence by sentence.

    The two GIZA models are walked in lockstep. For every sentence pair whose
    alignments have a non-empty intersection, each source pattern that the
    source pattern model records for that sentence is looked up in the
    translation table. A translation option is kept when it also occurs in the
    target pattern model for the same sentence, its joined score
    (``scores[0] * scores[2]``) is not below
    ``bestscore * divergencefrombestthreshold``, and every aligned source
    position inside the source span maps to a position inside the target span.

    Parameters:
        ttablefile: phrase-table location.
            NOTE(review): never read in this function; see the note at the
            ``AlignmentModel()`` call below.
        gizamodelfile_s2t: passed to ``GizaModel`` (source-to-target).
        gizamodelfile_t2s: passed to ``GizaModel`` (target-to-source).
        patternmodelfile_source: passed to ``IndexedPatternModel``.
        patternmodelfile_target: passed to ``IndexedPatternModel``.
        classfile_source: passed to ``ClassDecoder`` and ``ClassEncoder``.
        classfile_target: passed to ``ClassDecoder`` and ``ClassEncoder``.
        joinedprobabilitythreshold: NOTE(review): not used by the current
            code; only a previous, commented-out ``PhraseTable`` loader
            filtered on ``x[0] * x[2] > joinedprobabilitythreshold``.
        divergencefrombestthreshold: factor applied to ``bestscore``; options
            whose joined score falls below the product are skipped.
        DEBUG: when true, progress and diagnostics are printed to stderr.

    Yields:
        Tuples ``(sourcepattern_s, targetpattern_s, sourceoffset,
        targetoffset, tuple(s2t.source), tuple(s2t.target), sentence)``.
    """
    if DEBUG: print("Loading phrase-table", file=sys.stderr)
    # NOTE(review): the model is constructed empty and `ttablefile` is never
    # handed to it in this function, so unless AlignmentModel() populates
    # itself some other way, `sourcepattern_s in ttable` below can never be
    # true. Confirm whether a load step is missing here.
    ttable = AlignmentModel()

    if DEBUG: print("Loading GIZA model (s->t)", file=sys.stderr)
    gizamodel_s2t = GizaModel(gizamodelfile_s2t)
    if DEBUG: print("Loading GIZA model (t->s)", file=sys.stderr)
    gizamodel_t2s = GizaModel(gizamodelfile_t2s)

    if DEBUG: print("Loading decoders", file=sys.stderr)
    classdecoder_source = ClassDecoder(classfile_source)
    # classencoder_source is not referenced again in this function; the
    # construction is kept so that call order and any loading it performs
    # stay exactly as before.
    classencoder_source = ClassEncoder(classfile_source)
    classdecoder_target = ClassDecoder(classfile_target)
    classencoder_target = ClassEncoder(classfile_target)

    if DEBUG: print("Loading source pattern model " + patternmodelfile_source, file=sys.stderr)
    options = PatternModelOptions()
    patternmodel_source = IndexedPatternModel(patternmodelfile_source, options)
    if DEBUG: print("Loading target pattern model " + patternmodelfile_target, file=sys.stderr)
    patternmodel_target = IndexedPatternModel(patternmodelfile_target, options)

    iter_s2t = iter(gizamodel_s2t)
    iter_t2s = iter(gizamodel_t2s)

    if DEBUG: print("Iterating over all sentence pairs", file=sys.stderr)
    # Iterate over all sentences in the parallel corpus (the GIZA alignments
    # drive the loop).
    while True:
        # StopIteration must be caught here: inside a generator an escaping
        # StopIteration would be turned into RuntimeError (PEP 479).
        try:
            s2t = next(iter_s2t)
            t2s = next(iter_t2s)
        except StopIteration:
            print("WARNING: No more GIZA alignments, breaking",file=sys.stderr)
            break

        sentence = s2t.index
        assert t2s.index == s2t.index

        if DEBUG:
            print("(extractpatterns) s2t.source=", s2t.source , file=sys.stderr)
            print("(extractpatterns) t2s.target=", t2s.target , file=sys.stderr)
            print("(extractpatterns) t2s.source=", t2s.source , file=sys.stderr)
            print("(extractpatterns) s2t.target=", s2t.target , file=sys.stderr)
        intersection = s2t.intersect(t2s)
        if not intersection:
            # Nothing to validate offsets against; move on to the next pair.
            continue

        # Gather all source and target patterns found in this sentence.
        sourcepatterns = list(patternmodel_source.reverseindex_bysentence(sentence))
        targetpatterns = [ targetpattern.tostring(classdecoder_target) for targetpattern in patternmodel_target.reverseindex_bysentence(sentence) ]
        # Built once per sentence so the membership test in the inner loop
        # does not rescan the list for every translation option.
        targetpatternset = set(targetpatterns)

        if DEBUG: print("(extractpatterns) processing sentence " + str(sentence) + ", collected " + str(len(sourcepatterns)) + " source patterns and " + str(len(targetpatterns)) + " target patterns", file=sys.stderr)

        if DEBUG:
            for targetpattern in targetpatterns:
                print("(extractpatterns) -- identified target pattern " + str(targetpattern) , file=sys.stderr)

        # Iterate over all source patterns found in this sentence.
        for sourcepattern in sourcepatterns:
            sourcepattern_s = sourcepattern.tostring(classdecoder_source)
            if any(noword(x) for x in sourcepattern_s.split()):
                continue

            sourceindices = [ (x,y) for x,y in patternmodel_source[sourcepattern] if x == sentence ]
            source_n = sourcepattern_s.count(" ") + 1
            assert bool(sourceindices)
            if sourcepattern_s in ttable:
                if DEBUG: print("(extractpatterns) -- source pattern candidate " + str(sourcepattern_s) + " (occuring " + str(len(sourceindices)) + " time(s)), has " + str(len(ttable[sourcepattern_s])) + " translation options in phrase-table" , file=sys.stderr)
                sourcesentence = s2t.source
                targetsentence = s2t.target

                # NOTE(review): the sort is ascending on the score vector, so
                # targetoptions[0] is the option whose scores compare LOWEST.
                # If higher scores are better, `bestscore` is not the best
                # joined score - confirm the intended ordering.
                targetoptions = sorted( ( (targetpattern_s, scores) for targetpattern_s, scores in ttable[sourcepattern_s] ) , key=lambda x: x[1] )
                bestscore = targetoptions[0][1][0] * targetoptions[0][1][2]

                # Iterate over the target patterns in the phrase table.
                for targetpattern_s, scores in ttable[sourcepattern_s]:
                    if DEBUG: print("(extractpatterns) -- considering target pattern from phrase-table: " + str(targetpattern_s) , file=sys.stderr)
                    if targetpattern_s in targetpatternset:
                        if any(noword(x) for x in targetpattern_s.split()):
                            continue
                        joinedprob = scores[0] * scores[2]
                        if joinedprob < bestscore * divergencefrombestthreshold:
                            continue

                        # We have a pair occurring in both pattern models and
                        # in the phrase table.
                        target_n = targetpattern_s.count(" ") + 1

                        # Obtain positional offsets of the target in this sentence.
                        targetindices = [ (x,y) for x,y in patternmodel_target[classencoder_target.buildpattern(targetpattern_s)] if x == sentence]
                        assert bool(targetindices)

                        if DEBUG: print("(extractpatterns) --- found target pattern candidate " + str(targetpattern_s) + " (occuring " + str(len(targetindices)) + " time(s))" , file=sys.stderr)

                        # Yield the pair and full context for every
                        # source/target offset combination that is
                        # consistent with the word alignment.
                        for _, sourceoffset in sourceindices:
                            for _, targetoffset in targetindices:
                                valid = True
                                for i in range(sourceoffset, sourceoffset + source_n):
                                    _target, foundindex = intersection.getalignedtarget(i)
                                    if isinstance(foundindex, tuple):
                                        # Only the first element is compared
                                        # against the target span.
                                        foundindex = foundindex[0]
                                    if foundindex < targetoffset or foundindex >= targetoffset + target_n:
                                        valid = False
                                        if DEBUG: print("(extractpatterns) --- violates word alignment", file=sys.stderr)
                                        # One violation settles it; no need
                                        # to inspect the remaining positions.
                                        break
                                if valid:
                                    if DEBUG: print("(extractpatterns) --- ok", file=sys.stderr)
                                    yield sourcepattern_s, targetpattern_s, sourceoffset, targetoffset, tuple(sourcesentence), tuple(targetsentence), sentence
