Example #1
import os

from tqdm import tqdm
from ucca import layer0

# read_specs, FROM_FORMAT, annotate_udpipe, annotate_all and get_annotation
# are helpers from the surrounding project; their imports are omitted here.

def main(args):
    words = args.word or []
    categories = list(args.category or ())
    dependencies = list(args.dependency or ())
    if args.case_insensitive:
        words = list(map(str.lower, words))
    for spec in read_specs(args, converters=FROM_FORMAT):
        if args.dependency:
            # Annotate with UDPipe if a model was given; otherwise fall back
            # to the default annotation pipeline.
            if spec.udpipe:
                spec.passages = annotate_udpipe(spec.passages, spec.udpipe)
            else:
                spec.passages = annotate_all(spec.passages, as_array=True,
                                             replace=True, lang=spec.lang)
        t = tqdm(spec.passages, unit=" passages", desc="Finding")
        # Collect all filters into a single postfix dict: tqdm's set_postfix
        # replaces the previous postfix, so separate calls would overwrite
        # each other.
        postfix = {}
        if words:
            postfix["words"] = ",".join(words)
        if categories:
            postfix["categories"] = ",".join(categories)
        if dependencies:
            postfix["dependencies"] = ",".join(dependencies)
        if postfix:
            t.set_postfix(**postfix)
        found = 0
        filename = os.path.join(
            spec.out_dir, "_".join(words + categories + dependencies) + ".txt")
        with open(filename, "w", encoding="utf-8") as f:
            for passage in t:
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    word = terminal.text
                    if args.case_insensitive:
                        word = word.lower()
                    if (not words or word in words) and (
                            not categories or parent.ftag in categories) and (
                                not dependencies or get_annotation(
                                    terminal, spec.udpipe) in dependencies):
                        print(passage.ID, parent.fparent, file=f)
                        found += 1
                        t.set_postfix(found=found, **postfix)
        print("Wrote '%s'" % filename)
Example #2
from tqdm import tqdm
from ucca.ioutil import write_passage

# read_specs, FROM_FORMAT_NO_PLACEHOLDERS, copy_annotation, annotate_udpipe,
# annotate_stanfordnlp and annotate_all are helpers from the surrounding
# project; their imports are omitted here.

def main(args):
    # Default to storing annotations as "extra" features when neither output
    # form was requested.
    if not args.as_array and not args.as_extra:
        args.as_extra = True
    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        kwargs = dict(as_array=args.as_array,
                      as_extra=args.as_extra,
                      verbose=args.verbose,
                      lang=spec.lang)
        passages = spec.passages
        if spec.conllu:
            passages = copy_annotation(passages,
                                       spec.conllu,
                                       by_id=args.by_id,
                                       **kwargs)
        elif spec.udpipe:
            passages = annotate_udpipe(passages, spec.udpipe, **kwargs)
        elif spec.stanfordnlp:
            passages = annotate_stanfordnlp(passages, spec.stanfordnlp,
                                            **kwargs)
        passages = passages if args.verbose else tqdm(
            passages, unit=" passages", desc="Annotating " + spec.out_dir)
        replace = spec.conllu or not (spec.udpipe or spec.stanfordnlp)
        for passage in annotate_all(passages, replace=replace, **kwargs):
            if passage.extra.get("format") == "amr" and args.as_array:
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage,
                          outdir=spec.out_dir,
                          verbose=args.verbose,
                          binary=args.binary)
Example #3
import os
import sqlite3

from tqdm import tqdm
from ucca import layer0

# read_specs, FROM_FORMAT, annotate_udpipe, annotate_all and get_annotation
# are helpers from the surrounding project; their imports are omitted here.

def main(args):
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe)
        else:
            spec.passages = annotate_all(spec.passages, as_array=True,
                                         replace=True, lang=spec.lang)
        filename = os.path.join(spec.out_dir, "find.db")
        with sqlite3.connect(filename) as conn:
            c = conn.cursor()
            c.execute("DROP TABLE terminals")
            c.execute(
                "CREATE TABLE terminals (pid, tid, text, ftag, fparent, dep)")
            c.execute("CREATE INDEX idx_terminals_pid ON terminals (pid)")
            c.execute("CREATE INDEX idx_terminals_text ON terminals (text)")
            c.execute("CREATE INDEX idx_terminals_ftag ON terminals (ftag)")
            c.execute("CREATE INDEX idx_terminals_dep ON terminals (dep)")
            for passage in tqdm(spec.passages,
                                unit=" passages",
                                desc="Creating " + filename):
                rows = []
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    rows.append(
                        (passage.ID, terminal.ID, terminal.text, parent.ftag,
                         str(parent.fparent),
                         get_annotation(terminal, spec.udpipe)))
                c.executemany("INSERT INTO terminals VALUES (?,?,?,?,?,?)",
                              rows)
                conn.commit()
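
The terminals table built above is indexed on pid, text, ftag and dep, so simple lookups are cheap. A hypothetical follow-up query, not part of the original script (the word "run" is just an example):

import sqlite3

with sqlite3.connect("find.db") as conn:
    # Find every passage containing a terminal whose text is "run".
    for pid, tid, ftag, dep in conn.execute(
            "SELECT pid, tid, ftag, dep FROM terminals WHERE text = ?", ("run",)):
        print(pid, tid, ftag, dep)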
Example #4
from tqdm import tqdm
from ucca.ioutil import write_passage

# read_specs, FROM_FORMAT, annotate_udpipe, copy_annotation and annotate_all
# are helpers from the surrounding project; their imports are omitted here.

def main(args):
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe, as_array=args.as_array, verbose=args.verbose)
        elif spec.conllu:
            spec.passages = copy_annotation(spec.passages, spec.conllu, as_array=args.as_array, verbose=args.verbose)
        for passage in annotate_all(spec.passages if args.verbose else
                                    tqdm(spec.passages, unit=" passages", desc="Annotating " + spec.out_dir),
                                    as_array=args.as_array, replace=not spec.udpipe, lang=spec.lang,
                                    verbose=args.verbose):
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
Example #5
# annotate_udpipe is a helper from the surrounding project; its import is
# omitted here.

def annotate_stanfordnlp(passages, model_name, as_array=True, as_extra=True, verbose=False, lang=None):
    def _parser(conllu, *args, **kwargs):
        del args, kwargs
        import stanfordnlp
        # Rebuild pretokenized text from the CoNLL-U input: one sentence per
        # line, tokens separated by spaces, keeping only the FORM column (the
        # second whitespace-separated field) and skipping comment lines.
        text = "\n".join(" ".join(line.split()[1] if line.strip() else line
                                  for line in lines if line and not line.startswith("#"))
                         for lines in conllu if lines)
        nlp = stanfordnlp.Pipeline(lang=lang, tokenize_pretokenized=True)
        return nlp(text).conll_file.conll_as_string().splitlines()

    # Reuse the UDPipe annotation machinery, swapping in the StanfordNLP
    # pipeline as the parser.
    yield from annotate_udpipe(passages, model_name, as_array=as_array, as_extra=as_extra, verbose=verbose, lang=lang,
                               parser=_parser)
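
A hypothetical input/output pair illustrating the text construction in _parser (the CoNLL-U lines are invented for the example):

# Two CoNLL-U sentences in, pretokenized text out.
conllu = [["# sent_id = 1", "1\tJohn\tJohn\tPROPN", "2\tsleeps\tsleep\tVERB"],
          ["1\tGood\tgood\tADJ", "2\tnight\tnight\tNOUN"]]
print("\n".join(" ".join(line.split()[1] for line in lines
                         if line and not line.startswith("#"))
                for lines in conllu if lines))
# John sleeps
# Good night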