at=AlignmentTemplate() at.parse(line) ruleList.add(at) gfile.close() #load sentences sentences=list() if args.sentences: if args.sentences.lower().endswith('.gz'): gfile=gzip.open(args.sentences) else: gfile=open(args.sentences) for line in gfile: line=line.strip().decode('utf-8') parallelSentence=ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True,forRBPE=args.rbpe) if not args.tt1_beam: parallelSentence.add_explicit_empty_tags() sentences.append(parallelSentence) gfile.close() boxesDic=dict() if args.final_boxes_index: for line in open(args.final_boxes_index): parts=line.split("\t") boxesDic[parts[1].strip()]=int(parts[0]) #read best rule application for each sentence bestHypothesisForEachSentence=list() emptyIndexes=set() numLine=0
'''
Created on 12/02/2014

@author: vitaka

Reads sentences from standard input and loads each one into a
ParallelSentence, echoing progress messages to standard output.
There is no exception handling here, so any error raised while parsing a
sentence propagates and stops the script.
'''
# stdinput: sentences to be translated
from beamSearchLib import RuleList, ParallelSentence, RuleApplicationHypothesis
from ruleLearningLib import AlignmentTemplate
import ruleLearningLib
import argparse
import gzip
import sys


def main(argv):
    '''
    Parse the command line options in argv, initialize the lexical tags
    processor and try to parse every line read from standard input.

    argv: list of command line arguments, without the program name.
    '''
    # Both options are mandatory; their values are passed straight to
    # AT_LexicalTagsProcessor.initialize().
    arg_parser = argparse.ArgumentParser(description='')
    for option_name in ('--tag_groups_file_name', '--tag_sequences_file_name'):
        arg_parser.add_argument(option_name, required=True)
    options = arg_parser.parse_args(argv)

    ruleLearningLib.AT_LexicalTagsProcessor.initialize(
        options.tag_groups_file_name,
        options.tag_sequences_file_name)

    for raw_line in sys.stdin:
        # Only the trailing newline is removed; the text is handled as
        # unicode internally and re-encoded as UTF-8 when echoed.
        sentence_text = raw_line.rstrip('\n').decode('utf-8')

        # A parenthesised single argument keeps the Python 2 print statement
        # semantics unchanged.
        print("Parsing ...")
        print(sentence_text.encode('utf-8'))

        sentence = ParallelSentence()
        sentence.parse(sentence_text, parseTlLemmasFromDic=True)
        sentence.add_explicit_empty_tags()

        # NOTE(review): the original indentation was lost; this message is
        # assumed to be printed once per sentence - confirm.
        print("Everything OK")


if __name__ == "__main__":
    main(sys.argv[1:])
'''
Created on 12/02/2014

@author: vitaka

Checks, line by line, that the sentences received through standard input
can be parsed as ParallelSentence objects. Progress is reported on standard
output; an error raised by the parser is not caught.
'''
# stdinput: sentences to be translated
from beamSearchLib import RuleList, ParallelSentence, RuleApplicationHypothesis
from ruleLearningLib import AlignmentTemplate
import ruleLearningLib
import argparse
import gzip
import sys

if __name__ == "__main__":
    # Command line: two required options naming the files given to the
    # lexical tags processor.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--tag_groups_file_name', required=True)
    parser.add_argument('--tag_sequences_file_name', required=True)
    args = parser.parse_args(sys.argv[1:])

    # The tags processor is initialized before any sentence is parsed.
    initialize_tags = ruleLearningLib.AT_LexicalTagsProcessor.initialize
    tag_groups_path = args.tag_groups_file_name
    tag_sequences_path = args.tag_sequences_file_name
    initialize_tags(tag_groups_path, tag_sequences_path)

    for stdin_line in sys.stdin:
        # Drop the line terminator only, then decode the bytes as UTF-8.
        without_newline = stdin_line.rstrip('\n')
        decoded_line = without_newline.decode('utf-8')

        # print with one parenthesised argument behaves exactly like the
        # bare Python 2 print statement.
        print("Parsing ...")
        echoed_bytes = decoded_line.encode('utf-8')
        print(echoed_bytes)

        candidate = ParallelSentence()
        candidate.parse(decoded_line, parseTlLemmasFromDic=True)
        candidate.add_explicit_empty_tags()

        # NOTE(review): source formatting was collapsed; this print is
        # assumed to belong to the loop body (one message per sentence) -
        # confirm.
        print("Everything OK")
gfile.close() print >> sys.stderr, "... done" print >> sys.stderr, "Loading sentences ..." #load sentences sentences = list() if args.sentences: if args.sentences.lower().endswith(".gz"): gfile = gzip.open(args.sentences) else: gfile = open(args.sentences) for line in gfile: line = line.strip().decode('utf-8') parallelSentence = ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True, forRBPE=args.rbpe) parallelSentence.add_explicit_empty_tags() sentences.append(parallelSentence) gfile.close() print >> sys.stderr, "... done" boxesInvDic = dict() boxesDic = dict() if args.final_boxes_index: for line in open(args.final_boxes_index): parts = line.split("\t") boxesInvDic[int(parts[0])] = parts[1].strip() boxesDic[parts[1].strip()] = int(parts[0]) nfirst = None