from tnparser.pipeline import read_pipelines, Pipeline


def load_parser(gpu=True):
    import types
    extra_args = types.SimpleNamespace()
    if gpu:
        extra_args.__dict__["udify_mod.device"] = "0"  # simulates someone giving a --device 0 parameter to Udify
        extra_args.__dict__["lemmatizer_mod.device"] = "0"
    available_pipelines = read_pipelines("models_fi_tdt_v2.7/pipelines.yaml")  # {pipeline_name -> its steps}
    turku_parser = Pipeline(available_pipelines["parse_plaintext"], extra_args)  # launch the pipeline from the steps
    return turku_parser
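# A minimal usage sketch for load_parser() above; the example sentence and the
# gpu=False choice are illustrative assumptions, and it relies only on the
# Pipeline.parse() call used elsewhere in these snippets.
if __name__ == "__main__":
    turku_parser = load_parser(gpu=False)  # set gpu=True to put Udify and the lemmatizer on GPU 0
    conllu = turku_parser.parse("Minulla on koira. Se on musta.")  # plain text in, CoNLL-U string out
    print(conllu)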
pipeline = pipelines[args.action]
if pipeline[0].startswith("extraoptions"):
    extraoptions = pipeline[0].split()[1:]
    pipeline.pop(0)
    newoptions = extraoptions + sys.argv[1:]
    print("Got extra arguments from the pipeline, now running with", newoptions, file=sys.stderr, flush=True)
    args = argparser.parse_args(newoptions)
    # args.__dict__["lemmatizer_mod.device"] = -1  # force the lemmatizer onto CPU
pipeline.append("output_mod")

p = Pipeline(steps=pipeline, extra_args=args)

print("Waiting for input", file=sys.stderr, flush=True)
comment_regex = re.compile(r"^####?\s?C:")
line_buffer = []
for line in sys.stdin:
    line_buffer.append(line)
    if (not comment_regex.match(line)
            and (line.strip() == "" or not args.empty_line_batching)
            and len(line_buffer) > args.batch_lines
            and batch_endswith_text(line_buffer)):
        if not p.is_alive():  # gotta end if something dies
            print("Something crashed. Exiting.", file=sys.stderr, flush=True)
            sys.exit(-1)
"/home/jmnybl/git_checkout/Turku-neural-parser-pipeline-modularize") from tnparser.pipeline import read_pipelines, Pipeline ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) # GPU import types extra_args = types.SimpleNamespace() extra_args.__dict__[ "udify_mod.device"] = "0" #simulates someone giving a --device 0 parameter to Udify extra_args.__dict__["lemmatizer_mod.device"] = "0" available_pipelines = read_pipelines( "models_fi_tdt_v2.7/pipelines.yaml") # {pipeline_name -> its steps} p = Pipeline(available_pipelines["parse_plaintext"] ) # launch the pipeline from the steps def parse(txt): txt_parsed = p.parse(txt) # txt be a paragraph sents = [] tokens = [] lemmas = [] txt_parsed = txt_parsed.split("\n\n") for sent_parsed in txt_parsed: lemma_sent = [] for line in sent_parsed.split("\n"): line = line.strip() if not line: continue
general_group.add_argument('--host', default="localhost", help="Host on which to bind. Default %(default)s")
general_group.add_argument('--max-char', default=0, type=int, help='Number of chars maximum in a job batch. Cuts longer. Zero for no limit. Default %(default)d')
lemmatizer_group = argparser.add_argument_group(title='lemmatizer_mod', description='Lemmatizer arguments')
lemmatizer_group.add_argument('--gpu', dest='lemmatizer_mod.gpu', type=int, default=0, help='GPU device id for the lemmatizer, if -1 use CPU')
lemmatizer_group.add_argument('--batch_size', dest='lemmatizer_mod.batch_size', type=int, default=100, help='Lemmatizer batch size')

args = argparser.parse_args()

pipelines = read_pipelines(args.conf_yaml)
if args.action == "list":
    print(sorted(pipelines.keys()), file=sys.stderr, flush=True)
    sys.exit(0)
else:
    pipeline = pipelines[args.action]

if pipeline[0].startswith("extraoptions"):
    extraoptions = pipeline[0].split()[1:]
    pipeline.pop(0)
    newoptions = extraoptions + sys.argv[1:]
    print("Got extra arguments from the pipeline, now running with", newoptions, file=sys.stderr, flush=True)
    args = argparser.parse_args(newoptions)

p = Pipeline(steps=pipeline, extra_args=args)
app.run(host=args.host, port=args.port, threaded=True, processes=1, use_reloader=False)
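# Hedged side illustration of the dotted-dest trick used above: argparse stores
# an option under a dest such as "lemmatizer_mod.batch_size" without complaint,
# but since the attribute name contains a dot it must be read back with getattr()
# or via __dict__, which is how the per-module settings travel to Pipeline.
# This demo parser is separate from the real argparser above.
import argparse

_demo = argparse.ArgumentParser()
_demo.add_argument('--batch_size', dest='lemmatizer_mod.batch_size', type=int, default=100)
_ns = _demo.parse_args([])
print(getattr(_ns, 'lemmatizer_mod.batch_size'))   # -> 100
print(_ns.__dict__['lemmatizer_mod.batch_size'])   # same value, via the namespace dict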
#!/usr/bin/env python

import yaml
import os
import flask
import sys

from tnparser.pipeline import Pipeline, read_pipelines

app = flask.Flask(__name__)

model = os.environ.get("TNPP_MODEL", "models_fi_tdt/pipelines.yaml")
pipeline = os.environ.get("TNPP_PIPELINE", "parse_plaintext")
max_char = int(os.environ.get("TNPP_MAX_CHARS", 15000))

available_pipelines = read_pipelines(model)
p = Pipeline(available_pipelines[pipeline])


@app.route("/", methods=["GET"])
def parse_get():
    global p
    txt = flask.request.args.get("text")
    if not txt:
        return "You need to specify ?text=sometext", 400
    res = p.parse(txt)
    return flask.Response(res, mimetype="text/plain; charset=utf-8")


@app.route("/", methods=["POST"])
def parse_post():
    global p, max_char
    txt = flask.request.get_data(as_text=True)
    if max_char > 0:
        txt = txt[:max_char]
from tnparser.pipeline import read_pipelines, Pipeline
import json
import tqdm
import argparse
import sys

ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)

# GPU
import types
extra_args = types.SimpleNamespace()
extra_args.__dict__["udify_mod.device"] = "0"  # simulates someone giving a --device 0 parameter to Udify
extra_args.__dict__["lemmatizer_mod.device"] = "0"

available_pipelines = read_pipelines("models_fi_tdt_v2.7/pipelines.yaml")  # {pipeline_name -> its steps}
turku_segmenter = Pipeline(available_pipelines["tokenize"])  # launch the pipeline from the steps

conllu_pipeline = available_pipelines["parse_conllu"]
if conllu_pipeline[0].startswith("extraoptions"):
    extraoptions = conllu_pipeline[0].split()[1:]  # ['--empty-line-batching']
    conllu_pipeline.pop(0)
    extra_args.__dict__["empty_line_batching"] = True
turku_parser = Pipeline(conllu_pipeline, extra_args)


def read_conllu(txt):
    sent = []
    comment = []
    for line in txt.split("\n"):
        line = line.strip()
from tnparser.pipeline import read_pipelines, Pipeline

text1 = "I have a dog! Let's see what I can do with Silo.ai. :) Can I tokenize it? I think so! Heading: This is the heading And here continues a new sentence and there's no dot."
text2 = "Some other text, to see we can tokenize more stuff without reloading the model... :)"

# What do we have for English in models_en_ewt?
available_pipelines = read_pipelines("models_en_ewt/pipelines.yaml")  # {pipeline_name -> its steps}
p = Pipeline(available_pipelines["tokenize"])  # launch the pipeline from the steps

for _ in range(1000):
    print(p.parse(text1))
    print(p.parse(text2))
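# Hedged extra: read_pipelines() returns a plain {pipeline_name -> step list}
# dict (per the comments above), so listing what a model package offers is just
# iterating over it; the model path repeats the one already used above.
for name, steps in sorted(read_pipelines("models_en_ewt/pipelines.yaml").items()):
    print(name, "->", steps)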