import os
import sqlite3

from tqdm import tqdm
from ucca import layer0
from ucca.ioutil import write_passage
from ucca.textutil import annotate_all

# The following helpers are project-local; their import paths are not shown
# in the original file, so the lines below are assumptions about where they
# live rather than verified imports:
# from semstr.convert import FROM_FORMAT, FROM_FORMAT_NO_PLACEHOLDERS
# from semstr.scripts.util import read_specs, annotate_udpipe, copy_annotation, get_annotation


def main(args):
    """Find terminals matching the given words, edge categories and/or
    dependency relations, and write the matching units to a text file."""
    words = args.word or []
    categories = list(args.category or ())
    dependencies = list(args.dependency or ())
    if args.case_insensitive:
        words = list(map(str.lower, words))
    for spec in read_specs(args, converters=FROM_FORMAT):
        if args.dependency:  # dependency filtering requires syntactic annotation
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe) if spec.udpipe else \
                annotate_all(spec.passages, as_array=True, replace=not spec.udpipe, lang=spec.lang)
        t = tqdm(spec.passages, unit=" passages", desc="Finding")
        # Build the postfix in a single call: tqdm's set_postfix() replaces
        # the whole postfix, so successive calls would overwrite each other
        postfix = {}
        if words:
            postfix["words"] = ",".join(words)
        if categories:
            postfix["categories"] = ",".join(categories)
        if dependencies:
            postfix["dependencies"] = ",".join(dependencies)
        if postfix:
            t.set_postfix(**postfix)
        found = 0
        filename = os.path.join(spec.out_dir, "_".join(words + categories + dependencies) + ".txt")
        with open(filename, "w", encoding="utf-8") as f:
            for passage in t:
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    word = terminal.text
                    if args.case_insensitive:
                        word = word.lower()
                    # A terminal matches if it passes every filter that was given
                    if (not words or word in words) and (
                            not categories or parent.ftag in categories) and (
                            not dependencies or get_annotation(terminal, spec.udpipe) in dependencies):
                        print(passage.ID, parent.fparent, file=f)
                        found += 1
                        t.set_postfix(found=found, **postfix)
        print("Wrote '%s'" % filename)
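
# A minimal sketch of the argument parser the main() above expects. The
# original file does not show its CLI wiring, so the flag names below are
# inferred from the attributes read above (args.word, args.category,
# args.dependency, args.case_insensitive) and are assumptions, not the
# verified interface; read_specs() likely requires further arguments.
def build_argparser_sketch():
    import argparse
    argparser = argparse.ArgumentParser(description="Find terminals by word, category and/or dependency")
    argparser.add_argument("--word", action="append", help="terminal text to match (repeatable)")
    argparser.add_argument("--category", action="append", help="parent edge tag to match (repeatable)")
    argparser.add_argument("--dependency", action="append", help="dependency relation to match (repeatable)")
    argparser.add_argument("--case-insensitive", action="store_true", help="lowercase words before matching")
    return argparser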
def main(args):
    """Annotate passages with syntactic information, taken from CoNLL-U
    files, a UDPipe model or a StanfordNLP model if given, and from spaCy
    otherwise, then write the annotated passages out."""
    if not args.as_array and not args.as_extra:
        args.as_extra = True  # default to storing annotation in the "extra" dict
    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        kwargs = dict(as_array=args.as_array, as_extra=args.as_extra, verbose=args.verbose, lang=spec.lang)
        passages = spec.passages
        # Annotation source precedence: CoNLL-U files, then UDPipe, then StanfordNLP
        if spec.conllu:
            passages = copy_annotation(passages, spec.conllu, by_id=args.by_id, **kwargs)
        elif spec.udpipe:
            passages = annotate_udpipe(passages, spec.udpipe, **kwargs)
        elif spec.stanfordnlp:
            passages = annotate_stanfordnlp(passages, spec.stanfordnlp, **kwargs)
        for passage in annotate_all(passages if args.verbose else tqdm(
                passages, unit=" passages", desc="Annotating " + spec.out_dir),
                replace=spec.conllu or not (spec.udpipe or spec.stanfordnlp), **kwargs):
            if passage.extra.get("format") == "amr" and args.as_array:
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
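
# The source-selection logic above, restated as a standalone helper purely
# for readability. This function is hypothetical (not part of the original
# module): CoNLL-U files take precedence over a UDPipe model, which takes
# precedence over a StanfordNLP model; with no external source, the
# passages pass through unchanged and annotate_all() parses from scratch.
def select_annotation_source(spec, passages, by_id=False, **kwargs):
    if spec.conllu:
        return copy_annotation(passages, spec.conllu, by_id=by_id, **kwargs)
    if spec.udpipe:
        return annotate_udpipe(passages, spec.udpipe, **kwargs)
    if spec.stanfordnlp:
        return annotate_stanfordnlp(passages, spec.stanfordnlp, **kwargs)
    return passages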
def main(args):
    """Create an SQLite database of all terminals, indexed by passage ID,
    text, edge tag and dependency relation, to support fast lookup."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        spec.passages = annotate_udpipe(spec.passages, spec.udpipe) if spec.udpipe else \
            annotate_all(spec.passages, as_array=True, replace=not spec.udpipe, lang=spec.lang)
        filename = os.path.join(spec.out_dir, "find.db")
        with sqlite3.connect(filename) as conn:
            c = conn.cursor()
            # IF EXISTS avoids an error on the first run, before the table exists
            c.execute("DROP TABLE IF EXISTS terminals")
            c.execute("CREATE TABLE terminals (pid, tid, text, ftag, fparent, dep)")
            c.execute("CREATE INDEX idx_terminals_pid ON terminals (pid)")
            c.execute("CREATE INDEX idx_terminals_text ON terminals (text)")
            c.execute("CREATE INDEX idx_terminals_ftag ON terminals (ftag)")
            c.execute("CREATE INDEX idx_terminals_dep ON terminals (dep)")
            for passage in tqdm(spec.passages, unit=" passages", desc="Creating " + filename):
                rows = []
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    rows.append((passage.ID, terminal.ID, terminal.text, parent.ftag,
                                 str(parent.fparent), get_annotation(terminal, spec.udpipe)))
                c.executemany("INSERT INTO terminals VALUES (?,?,?,?,?,?)", rows)
            conn.commit()
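
# Example of reading the database created above. The table and column names
# come from the CREATE TABLE statement; the helper itself, the sample path
# and the sample word are illustrative additions, not part of the original
# script.
def find_terminals_by_text(db_path, text):
    with sqlite3.connect(db_path) as conn:
        c = conn.cursor()
        # Parameterized query, served by the idx_terminals_text index
        c.execute("SELECT pid, tid, ftag, dep FROM terminals WHERE text = ?", (text,))
        return c.fetchall()

# e.g. find_terminals_by_text("out/find.db", "dog")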
def main(args):
    """Annotate passages with UDPipe or CoNLL-U data if given, falling back
    to spaCy, and write the annotated passages out."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe,
                                            as_array=args.as_array, verbose=args.verbose)
        elif spec.conllu:
            spec.passages = copy_annotation(spec.passages, spec.conllu,
                                            as_array=args.as_array, verbose=args.verbose)
        for passage in annotate_all(spec.passages if args.verbose else tqdm(
                spec.passages, unit=" passages", desc="Annotating " + spec.out_dir),
                as_array=args.as_array, replace=not spec.udpipe, lang=spec.lang, verbose=args.verbose):
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
def annotate_stanfordnlp(passages, model_name, as_array=True, as_extra=True, verbose=False, lang=None):
    """Annotate passages using a StanfordNLP pipeline, reusing the UDPipe
    annotation machinery with a custom parser callback."""
    def _parser(conllu, *args, **kwargs):
        del args, kwargs
        import stanfordnlp
        # Reconstruct pretokenized text from the CoNLL-U FORM column (the
        # second whitespace-separated field), skipping comment lines
        text = "\n".join(" ".join(line.split()[1] if line.strip() else line
                                  for line in lines if line and not line.startswith("#"))
                         for lines in conllu if lines)
        nlp = stanfordnlp.Pipeline(lang=lang, tokenize_pretokenized=True)
        return nlp(text).conll_file.conll_as_string().splitlines()

    yield from annotate_udpipe(passages, model_name, as_array=as_array, as_extra=as_extra,
                               verbose=verbose, lang=lang, parser=_parser)
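
# Usage sketch for annotate_stanfordnlp(). It is a generator, so it must be
# iterated to drive the annotation; here each annotated passage is written
# out as it is produced. The helper, its output directory and the default
# language are placeholder assumptions; model_name is forwarded to
# annotate_udpipe unchanged.
def annotate_and_write(passages, model_name, out_dir="annotated", lang="en"):
    for passage in annotate_stanfordnlp(passages, model_name, lang=lang):
        write_passage(passage, outdir=out_dir)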