# Common imports for the scripts in this listing. Reader, Writer, SentencePair,
# Fragment and Alternative are assumed to come from this project's dataset
# format module (taken here to be colibrita.format), and ARPALanguageModel
# from pynlpl (pynlpl.lm.lm); adjust the paths to the actual project layout.
import argparse
import datetime
import os
import random
import subprocess
import sys
from collections import defaultdict
from copy import copy

from colibrita.format import Reader, Writer, SentencePair, Fragment, Alternative
from pynlpl.lm.lm import ARPALanguageModel


def main():
    # Strip reference, source and category information from a set
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:
        print("Syntax: inputset outputset", file=sys.stderr)
        sys.exit(2)
    reader = Reader(inputset)
    writer = Writer(outputset)
    for sentencepair in reader:
        sentencepair.ref = None
        sentencepair.source = None
        sentencepair.category = None
        writer.write(sentencepair)
    writer.close()
    reader.close()
def main():
    # Merge multiple sets into one, renumbering the sentence pairs sequentially.
    # Note: the original guarded this with try/except around sys.argv[2:], which
    # never raises, so the usage message was unreachable; an explicit length
    # check is used instead.
    if len(sys.argv) < 3:
        print("Syntax: outputset inputset inputset2...", file=sys.stderr)
        sys.exit(2)
    outputset = sys.argv[1]
    inputsets = sys.argv[2:]
    pairid = 0
    writer = Writer(outputset)
    for inputset in inputsets:
        reader = Reader(inputset)
        for sentencepair in reader:
            pairid += 1
            sentencepair.id = pairid
            writer.write(sentencepair)
        reader.close()
    writer.close()
def main():
    # Shuffle a set into random order, renumbering the pairs
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:
        print("Syntax: inputset outputset", file=sys.stderr)
        sys.exit(2)
    reader = Reader(inputset)
    sentencepairs = []
    for sentencepair in reader:
        sentencepairs.append(sentencepair)
    reader.close()
    writer = Writer(outputset)
    random.shuffle(sentencepairs)
    for i, sentencepair in enumerate(sentencepairs):
        sentencepair.id = i + 1
        writer.write(sentencepair)
    writer.close()
def main():
    # Sample sentence pairs from a set in buffered batches via processbuffer()
    # (defined elsewhere), skipping pairs that are too long or already present
    # in the output set; an existing output set is resumed rather than redone
    if len(sys.argv) < 3 or len(sys.argv) > 5:
        print("Syntax: inputset outputset [offset] [maxwords]", file=sys.stderr)
        sys.exit(2)
    inputset = sys.argv[1]
    outputset = sys.argv[2]
    offset = int(sys.argv[3]) if len(sys.argv) >= 4 else 1
    maxwords = int(sys.argv[4]) if len(sys.argv) == 5 else 99
    buffer = []
    BUFFERSIZE = 10
    tmpfile = False
    inputs = set()
    if os.path.exists(outputset):
        # Resume an earlier session: copy what is already there to a temporary
        # file and remember which inputs were covered
        writer = Writer(outputset + '.tmp')
        reader = Reader(outputset)
        for sentencepair in reader:
            inputs.add(hash(sentencepair.input))
            writer.write(sentencepair)
        reader.close()
        tmpfile = True
    else:
        writer = Writer(outputset)
    num = 0
    reader = Reader(inputset)
    quit = False
    for sentencepair in reader:
        if len(sentencepair.input) <= maxwords:
            num += 1
            if hash(sentencepair.input) not in inputs:
                if num >= offset:
                    buffer.append(sentencepair)
                    if len(buffer) == BUFFERSIZE:
                        buffer, quit = processbuffer(buffer, reader, writer, inputs, num - BUFFERSIZE)
                    if quit:
                        break
    if buffer and not quit:
        processbuffer(buffer, reader, writer, inputs, num)
    reader.close()
    writer.close()
    if tmpfile:
        os.rename(outputset + '.tmp', outputset)
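# processbuffer() is defined elsewhere in the project; from the call sites
# above it receives a batch of candidate pairs and must return a
# (new buffer, quit flag) tuple. The sketch below is a hypothetical stand-in
# that asks the user to keep, skip or quit, and writes the kept pairs; the
# real implementation may behave differently.
def processbuffer(buffer, reader, writer, inputs, num):
    # reader is unused in this sketch; the real helper may read ahead
    quit = False
    for sentencepair in buffer:
        num += 1
        print("#" + str(num) + ": " + " ".join(str(w) for w in sentencepair.input), file=sys.stderr)
        answer = input("Keep? [y/n/q] ").strip().lower()
        if answer == 'q':
            quit = True
            break
        if answer == 'y':
            inputs.add(hash(sentencepair.input))
            writer.write(sentencepair)
    return [], quit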
# Module-level tallies used by the interactive set editor below; assumed to be
# simple counters keyed by source/category name
sources = defaultdict(int)
categories = defaultdict(int)


def main():
    # Interactive editor for building and revising a set
    global sources, categories
    if len(sys.argv) < 4:
        print("Syntax: set L1 L2", file=sys.stderr)
        sys.exit(2)
    setfile = sys.argv[1]
    l1 = sys.argv[2]
    l2 = sys.argv[3]
    sentencepairs = []
    if os.path.exists(setfile):
        print("Loading existing file: ", setfile)
        reader = Reader(setfile)
        for sentencepair in reader:
            sentencepairs.append(sentencepair)
            if sentencepair.source:
                sources[sentencepair.source] += 1
            if sentencepair.category:
                categories[sentencepair.category] += 1
        print(str(len(sentencepairs)) + " sentences loaded")
    else:
        print("New file: ", setfile, file=sys.stderr)
    print("Type h for help")
    cursor = None
    quit = False
    while not quit:
        cmd = input("> ")
        if cmd.lower() == 'q':
            writer = Writer(setfile, l1, l2)
            for sentencepair in sentencepairs:
                writer.write(sentencepair)
            writer.close()
            quit = True
        elif cmd.lower() == 'h':
            print("q\tSave and quit", file=sys.stderr)
            print("n\tNew sentence pair", file=sys.stderr)
            #print("d\tDelete sentence pair", file=sys.stderr)
            print("a\tAdd alternative", file=sys.stderr)
            print(">\tNext sentence pair", file=sys.stderr)
            print("<\tPrevious sentence pair", file=sys.stderr)
            print("12\tGo to sentence pair #12", file=sys.stderr)
            print("p\tPrint current sentence pair", file=sys.stderr)
            print("w\tWrite changes to disk", file=sys.stderr)
        elif cmd.lower() == "<":
            if cursor is None:
                cursor = len(sentencepairs) - 1
            else:
                cursor -= 1
                if cursor < 0:
                    cursor = len(sentencepairs) - 1
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower() == ">":
            if cursor is None:
                cursor = 0
            else:
                cursor += 1
                if cursor >= len(sentencepairs):
                    cursor = 0
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower().isdigit():
            cursor = int(cmd.lower()) - 1
            if cursor < 0:
                cursor = 0
            if cursor >= len(sentencepairs):
                cursor = len(sentencepairs) - 1
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower() == 'n':
            cursor = newsentencepair(sentencepairs)
        elif cmd.lower() == 'w':
            writer = Writer(setfile, l1, l2)
            for sentencepair in sentencepairs:
                writer.write(sentencepair)
            writer.close()
        elif cmd.lower() == 'p':
            if cursor is None:
                cursor = 0
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower() == 'a':
            if cursor is None:
                cursor = 0
            addalternative(sentencepairs[cursor])
        else:
            print("No such command, type h for help", file=sys.stderr)
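# The editor relies on showsentencepair(), newsentencepair() and
# addalternative(), which are not part of this listing. The sketches below are
# hypothetical stand-ins illustrating their assumed contracts only; in
# particular, the SentencePair constructor signature in newsentencepair() is
# a guess, not the project's actual API.
def showsentencepair(sentencepairs, cursor):
    # Display the pair under the cursor (1-indexed for the user)
    sentencepair = sentencepairs[cursor]
    print("#" + str(cursor + 1) + "\tinput: " + " ".join(str(w) for w in sentencepair.input))
    if sentencepair.ref:
        print("\tref:   " + " ".join(str(w) for w in sentencepair.ref))

def newsentencepair(sentencepairs):
    # Prompt for a new pair, append it, and return its cursor position
    inputline = input("Input sentence: ")
    refline = input("Reference sentence: ")
    sentencepairs.append(SentencePair(len(sentencepairs) + 1, tuple(inputline.split()), None, tuple(refline.split())))
    return len(sentencepairs) - 1

def addalternative(sentencepair):
    # Attach an alternative translation to the (first) fragment of the reference
    alt = input("Alternative translation: ")
    for _, fragment, _ in sentencepair.fragments(sentencepair.ref):
        fragment.alternatives.append(Alternative(tuple(alt.split())))
        break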
def main():
    # Tokenise the input, output and reference sides of a set while preserving
    # the fragment markup (assumes one fragment per sentence)
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
        l1 = sys.argv[3]
        l2 = sys.argv[4]
    except IndexError:
        print("Syntax: inputset outputset l1 l2", file=sys.stderr)
        sys.exit(2)
    writer = Writer(outputset)
    reader = Reader(inputset)
    for sentencepair in reader:
        if sentencepair.ref:
            for left, fragment, right in sentencepair.fragments(sentencepair.ref):
                print("Tokenising reference: L=", left, file=sys.stderr)
                print("                      F=", fragment.value, file=sys.stderr)
                print("                      R=", right, file=sys.stderr)
                if left.strip():
                    left = tok(left, l2)
                else:
                    left = ""
                alts = fragment.alternatives
                fragment = Fragment(tok(fragment.value, l2), id=fragment.id)
                for alt in alts:
                    fragment.alternatives.append(Alternative(tok(alt.value, l2)))
                if right.strip():
                    right = tok(right, l2)
                else:
                    right = ""
                if left and right:
                    ref = left + (fragment,) + right
                elif left:
                    ref = left + (fragment,)
                elif right:
                    ref = (fragment,) + right
                else:
                    ref = (fragment,)
                sentencepair.ref = ref
        if sentencepair.output:
            for left, fragment, right in sentencepair.fragments(sentencepair.output):
                print("Tokenising output: L=", left, file=sys.stderr)
                print("                   F=", fragment.value, file=sys.stderr)
                print("                   R=", right, file=sys.stderr)
                if left.strip():
                    left = tok(left, l2)
                else:
                    left = ""
                alts = fragment.alternatives
                fragment = Fragment(tok(fragment.value, l2), id=fragment.id)
                for alt in alts:
                    fragment.alternatives.append(Alternative(tok(alt.value, l2)))
                if right.strip():
                    right = tok(right, l2)
                else:
                    right = ""
                if left and right:
                    out = left + (fragment,) + right
                elif left:
                    out = left + (fragment,)
                elif right:
                    out = (fragment,) + right
                else:
                    out = (fragment,)
                sentencepair.output = out
        if sentencepair.input:
            for left, fragment, right in sentencepair.fragments(sentencepair.input):
                print("Tokenising input: L=", left, file=sys.stderr)
                print("                  F=", fragment.value, file=sys.stderr)
                print("                  R=", right, file=sys.stderr)
                if left.strip():
                    left = tok(left, l2)
                else:
                    left = ""
                # The fragment itself is in the source language (l1); the
                # surrounding context is in the target language (l2)
                fragment = Fragment(tok(fragment.value, l1), id=fragment.id)
                if right.strip():
                    right = tok(right, l2)
                else:
                    right = ""
                if left and right:
                    inp = left + (fragment,) + right
                elif left:
                    inp = left + (fragment,)
                elif right:
                    inp = (fragment,) + right
                else:
                    inp = (fragment,)
                sentencepair.input = inp
        writer.write(sentencepair)
    reader.close()
    writer.close()
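# tok() is not defined in this listing; it is assumed to tokenise text for the
# given language and return a tuple of tokens (its results are concatenated
# with fragment tuples above). A minimal, hypothetical whitespace/punctuation
# sketch follows; the real implementation presumably calls a proper tokeniser
# and would actually use the language argument.
import re

def tok(text, lang):
    if not isinstance(text, str):  # Fragment.value may be a tuple of words
        text = " ".join(text)
    text = re.sub(r'([.,!?;:"()])', r' \1 ', text)  # split off punctuation
    return tuple(text.split())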
def generate(testoutput, ttablefile, gizamodelfile_s2t, gizamodelfile_t2s,
             patternmodelfile_source, patternmodelfile_target,
             classfile_source, classfile_target, size=0,
             joinedprobabilitythreshold=0.01, divergencefrombestthreshold=0.8,
             DEBUG=False):
    # Extract translation-pair instances; when a sample size is given, write to
    # a temporary file first and sample from it afterwards
    if size > 0:
        print("Extracting instances, writing to " + testoutput + '.tmp', file=sys.stderr)
        writer = Writer(testoutput + '.tmp')
    else:
        print("Extracting instances, writing to " + testoutput, file=sys.stderr)
        writer = Writer(testoutput)
    prevsentence = -1
    id = 0
    for sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence, sentence in extractpairs(
            ttablefile, gizamodelfile_s2t, gizamodelfile_t2s,
            patternmodelfile_source, patternmodelfile_target,
            classfile_source, classfile_target,
            joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
        id += 1
        if sentence != prevsentence:
            print(datetime.datetime.now().strftime('%H:%M:%S'), "Input sentence #" + str(sentence) + ", output sentence #" + str(id), file=sys.stderr)
            prevsentence = sentence
        valid, sentencepair = makesentencepair(id, sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence)
        if valid:
            writer.write(sentencepair)
    writer.close()
    if size > 0:
        print("Sampling " + str(size), file=sys.stderr)
        selected_ids = set(random.sample(range(1, id + 1), size))
        writer = Writer(testoutput)
        reader = Reader(testoutput + '.tmp')
        newid = 0
        for sentencepair in reader:
            if int(sentencepair.id) in selected_ids:
                newid += 1
                sentencepair.id = newid
                writer.write(sentencepair)
        reader.close()
        writer.close()
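# A hypothetical invocation of generate(); extractpairs() and
# makesentencepair() are defined elsewhere in the project. Every file path
# below is an illustrative placeholder, not a file shipped with the project.
if __name__ == '__main__':
    generate('trainset.xml',
             'model/phrase-table.gz',    # phrase translation table
             'model/s2t.A3.final',       # GIZA source-to-target model
             'model/t2s.A3.final',       # GIZA target-to-source model
             'model/source.patternmodel',
             'model/target.patternmodel',
             'model/source.cls',
             'model/target.cls',
             size=1000)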
def main():
    parser = argparse.ArgumentParser(description="Colibrita - Translation Assistance using Moses", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--dataset', type=str, help="Dataset file", action='store', default="", required=True)
    parser.add_argument('--debug', '-d', help="Debug", action='store_true', default=False)
    parser.add_argument('-o', '--output', type=str, help="Output prefix", required=True)
    parser.add_argument('-T', '--ttable', type=str, help="Phrase translation table (file) to use when testing with --lm and without classifier training", action='store', required=True)
    parser.add_argument('--lm', type=str, help="Language model (file in ARPA format, as produced by for instance SRILM)", action='store', required=True)
    parser.add_argument('--lmweight', type=float, help="Language model weight for Moses", action='store', default=0.5)
    parser.add_argument('--lmorder', type=int, help="Language model order", action='store', default=3)
    parser.add_argument('--dweight', type=float, help="Distortion weight for Moses", action='store', default=0.6)
    parser.add_argument('--tmweights', type=str, help="Translation model weights for Moses (comma separated)", action='store', default="0.20,0.20,0.20,0.20,0.20")
    parser.add_argument('--lmweightrr', type=float, help="Language model weight in reranking", action='store', default=1)
    parser.add_argument('--tweightrr', type=float, help="Translation model weight in reranking", action='store', default=1)
    parser.add_argument('-n', '--n', type=int, help="Number of output hypotheses per sentence", default=25)
    parser.add_argument('-a', '--a', type=int, help="Add alternative translations, up to the specified number", default=0)
    args = parser.parse_args()
    #if os.path.exists(args.output):
    #    print("Output " + args.output + " already exists, doing nothing..", file=sys.stderr)
    #    sys.exit(2)
    #else:
    #    os.mkdir(args.output)
    if not os.path.exists(args.ttable):
        print("Translation table " + args.ttable + " does not exist", file=sys.stderr)
        sys.exit(2)
    if not os.path.exists(args.lm):
        print("Language model " + args.lm + " does not exist", file=sys.stderr)
        sys.exit(2)
    data = Reader(args.dataset)
    # Write a minimal Moses configuration for decoding the input fragments
    f = open(args.output + '.moses.ini', 'w', encoding='utf-8')
    f.write("[input-factors]\n0\n\n")
    f.write("[mapping]\n0 T 0\n\n")
    f.write("[ttable-file]\n0 0 0 5 " + args.ttable + "\n\n")
    f.write("[lmodel-file]\n0 0 " + str(args.lmorder) + " " + args.lm + "\n\n")
    f.write("[ttable-limit]\n20\n\n")
    f.write("[weight-d]\n" + str(args.dweight) + "\n\n")
    f.write("[weight-l]\n" + str(args.lmweight) + "\n\n")
    f.write("[weight-t]\n" + "\n".join(args.tmweights.split(',')) + "\n\n")
    f.write("[weight-w]\n-1\n")
    f.write("[distortion-limit]\n6\n")
    f.close()
    if not os.path.exists(args.output + ".nbestlist"):
        cmd = 'moses -f ' + args.output + '.moses.ini -n-best-list ' + args.output + '.nbestlist ' + str(args.n)
        print("Calling moses: " + cmd, file=sys.stderr)
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
        # Moses translates only the source fragments, one per line
        for sentencepair in data:
            for left, sourcefragment, right in sentencepair.inputfragments():
                p.stdin.write((str(sourcefragment) + "\n").encode('utf-8'))
        p.stdin.close()  # signal EOF before waiting for moses to finish
        p.communicate()
        data.reset()
    else:
        print("Moses output already exists, not overwriting. Delete " + args.output + ".nbestlist if you want a fresh run.", file=sys.stderr)
    print("Loading language model", file=sys.stderr)
    lm = ARPALanguageModel(args.lm)
    print("Processing Moses output...", file=sys.stderr)
    previndex = -1
    sentenceoutput = []
    hypotheses = []
    with open(args.output + '.nbestlist', 'r', encoding='utf-8') as f:
        for line in f:
            fields = [x.strip() for x in line.strip().split("|||")]
            if args.debug:
                print(fields, file=sys.stderr)
            index = int(fields[0])
            if index != previndex:
                if hypotheses:
                    sentenceoutput.append(hypotheses)
                hypotheses = []
                previndex = index
            solution = fields[1]
            rawscores = fields[2].split(' ')
            if args.debug:
                print(rawscores, file=sys.stderr)
            # The translation model score is assumed to sit at a fixed position
            # in the score vector produced by this Moses configuration
            tscore = float(rawscores[9])
            hypotheses.append((solution, tscore))
    sentenceoutput.append(hypotheses)  # don't forget the last one
    writer = Writer(args.output + '.output.xml')
    for i, sentencepair in enumerate(data):
        sentencepair.output = copy(sentencepair.input)
        hypotheses = sentenceoutput[i]
        for left, inputfragment, right in sentencepair.inputfragments():
            candidatesentences = []
            bestlmscore = -999999999
            besttscore = -999999999
            for hypothesis, tscore in hypotheses:
                # Compute a new language model score for the full sentence with
                # this hypothesis substituted for the input fragment
                outputfragment = Fragment(tuple(hypothesis.split(' ')), inputfragment.id)
                candidatesentence = sentencepair.replacefragment(inputfragment, outputfragment, sentencepair.output)
                lminput = " ".join(sentencepair._str(candidatesentence)).split(" ")  # joining and splitting deliberately to ensure each word is one item
                lmscore = lm.score(lminput)
                assert lmscore <= 0
                if lmscore > bestlmscore:
                    bestlmscore = lmscore
                if tscore > besttscore:
                    besttscore = tscore
                candidatesentences.append((candidatesentence, hypothesis, tscore, lmscore))
            # Rerank: combine the normalised translation and language model scores
            solutions = []
            for candidatesentence, targetpattern, tscore, lmscore in candidatesentences:
                tscore = args.tweightrr * (tscore - besttscore)
                lmscore = args.lmweightrr * (lmscore - bestlmscore)
                score = tscore + lmscore
                print(targetpattern + " --- tscore=" + str(tscore) + ", lmscore=" + str(lmscore), file=sys.stderr)
                solutions.append((score, targetpattern))
            solutions = sorted(solutions, key=lambda x: -1 * x[0])
            translation = tuple(solutions[0][1].split())
            outputfragment = Fragment(translation, inputfragment.id)
            print("\t" + str(inputfragment) + " -> " + str(outputfragment), file=sys.stderr)
            if args.a:
                for score, solution in solutions[1:1 + args.a]:
                    outputfragment.alternatives.append(Alternative(tuple(solution.split()), confidence=score))
            sentencepair.output = sentencepair.replacefragment(inputfragment, outputfragment, sentencepair.output)
            writer.write(sentencepair)
            break  # only support one iteration for now, one fragment per sentence
    writer.close()
    print("All done.", file=sys.stderr)