def pgf_parse(args):
    """Parse every input sentence and print the single best parse.

    Output per sentence: "<idx>\t<time>\t<score>\t<tree>" (the score/tree
    part is empty when no parse was found).
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    tokenize = lexer()
    sentences = translation_pipeline.web_lexer(
        grammar, args.srclang, imap(tokenize, args.inputstream))
    # Render one (score, tree) pair; kept as a lambda rather than
    # operator.itemgetter(1) so the score is printed alongside the tree.
    render = lambda scored: "%f\t%s" % (scored[0], str(scored[1]))
    callbacks = [
        ('PN', translation_pipeline.parseNames(grammar, args.srclang)),
        ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang)),
    ]
    parser = getKBestParses(grammar, args.srclang, 1, callbacks)
    line_no = 0
    for elapsed, parses in imap(parser, sentences):
        line_no += 1
        best = str(render(parses[0])) if len(parses) else ''
        print >>args.outputstream, "%d\t%f\t%s" % (line_no, elapsed, best)
    return
def pgf_kparse(args):
    """Parse every input sentence and print its K-best parses.

    Parses are rendered via printJohnsonRerankerFormat; blocks that render
    to a lone newline (no parses) are suppressed entirely.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    tokenize = lexer()
    sentences = translation_pipeline.web_lexer(
        grammar, args.srclang, imap(tokenize, args.inputstream))
    render = printJohnsonRerankerFormat
    callbacks = [
        ('PN', translation_pipeline.parseNames(grammar, args.srclang)),
        ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang)),
    ]
    parser = getKBestParses(grammar, args.srclang, args.K, callbacks=callbacks)
    line_no = 0
    for elapsed, parses in imap(parser, sentences):
        line_no += 1
        rendered = str(render(parses))
        if not (rendered == '\n'):
            print >>args.outputstream, rendered
    return
# NOTE(review): this redefines pgf_kparse — it shadows the earlier definition
# at import time; one of the two copies should eventually be removed.
def pgf_kparse(args):
    """Print the K-best parses of each input sentence in reranker format.

    Empty parse blocks (rendered as a single newline) are skipped.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    preprocess = lexer()
    stream = imap(preprocess, args.inputstream)
    lexed = translation_pipeline.web_lexer(grammar, args.srclang, stream)
    name_cb = ('PN', translation_pipeline.parseNames(grammar, args.srclang))
    unk_cb = ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang))
    parser = getKBestParses(grammar, args.srclang, args.K,
                            callbacks=[name_cb, unk_cb])
    idx = 0
    for elapsed, block in imap(parser, lexed):
        idx += 1
        text = str(printJohnsonRerankerFormat(block))
        if text == '\n':
            continue  # nothing parsed for this sentence
        print >>args.outputstream, text
    return
# NOTE(review): this redefines pgf_parse — it shadows the earlier definition
# at import time; one of the two copies should eventually be removed.
def pgf_parse(args):
    """Print "<idx>\t<time>\t<best parse>" for every input sentence.

    The best parse is rendered as "<score>\t<tree>"; sentences with no
    parse get an empty trailing field.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    preprocess = lexer()
    stream = imap(preprocess, args.inputstream)
    lexed = translation_pipeline.web_lexer(grammar, args.srclang, stream)

    def fmt(entry):
        # entry is a (score, tree) pair; operator.itemgetter(1) would drop
        # the score, so format both fields explicitly.
        return "%f\t%s" % (entry[0], str(entry[1]))

    name_cb = ('PN', translation_pipeline.parseNames(grammar, args.srclang))
    unk_cb = ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang))
    parser = getKBestParses(grammar, args.srclang, 1, [name_cb, unk_cb])
    idx = 0
    for elapsed, block in imap(parser, lexed):
        idx += 1
        best = str(fmt(block[0])) if len(block) else ''
        print >>args.outputstream, "%d\t%f\t%s" % (idx, elapsed, best)
    return
def worker(sentence): sentence = sentence.strip(); curid = sentid.next(); tstart = time.time(); kBestParses = []; parseScores = {}; if len(sentence.split()) > max_length: tend, err = time.time(), "Sentence too long (%d tokens). Might potentially run out of memory" %(len(sentence.split())); print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err); return tend-tstart, kBestParses; # temporary hack to make sure parser does not get killed for very long sentences; try: callbacks = [('PN', translation_pipeline.parseNames(grammar, args.srclang, sentence)), ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang, sentence))] for parseidx, parse in enumerate( parser(sentence, heuristics=0, callbacks=callbacks) ): parseScores[parse[0]] = True; kBestParses.append( (parse[0], str(parse[1]) if serializable else parse[1]) ); if parseidx == K-1: break; #if len(parseScores) >= K: break; tend = time.time(); print >>sys.stderr, '%d\t%.4f' %(curid, tend-tstart); return tend-tstart, kBestParses; except pgf.ParseError, err: tend = time.time(); print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err); return tend-tstart, kBestParses;