예제 #1
0
def main(args):
    """Annotate the passages of every input spec and write them out.

    For each spec the passages are first passed through at most one
    annotation source, chosen in this priority order: ``spec.conllu``
    (via ``copy_annotation``), ``spec.udpipe`` (via ``annotate_udpipe``),
    ``spec.stanfordnlp`` (via ``annotate_stanfordnlp``).  The result is then
    always run through ``annotate_all`` and each passage is written with
    ``write_passage``.

    :param args: parsed command-line arguments; this function reads
        ``as_array``, ``as_extra``, ``verbose``, ``by_id`` and ``binary``.
        ``as_extra`` is set to True in place when neither ``as_array`` nor
        ``as_extra`` was given.
    """
    # Fall back to "extra" storage when no storage mode was requested.
    if not (args.as_array or args.as_extra):
        args.as_extra = True

    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        # Keyword arguments shared by every annotation call for this spec.
        common = {
            "as_array": args.as_array,
            "as_extra": args.as_extra,
            "verbose": args.verbose,
            "lang": spec.lang,
        }

        annotated = spec.passages
        if spec.conllu:
            annotated = copy_annotation(
                annotated, spec.conllu, by_id=args.by_id, **common)
        elif spec.udpipe:
            annotated = annotate_udpipe(annotated, spec.udpipe, **common)
        elif spec.stanfordnlp:
            annotated = annotate_stanfordnlp(
                annotated, spec.stanfordnlp, **common)

        # A progress bar is shown only when verbose output is off.
        if args.verbose:
            source = annotated
        else:
            source = tqdm(annotated,
                          unit=" passages",
                          desc="Annotating " + spec.out_dir)

        # Replace when annotation came from CoNLL-U, or when no external
        # annotator was configured at all.
        replace = spec.conllu or not (spec.udpipe or spec.stanfordnlp)

        for passage in annotate_all(source, replace=replace, **common):
            if passage.extra.get("format") == "amr" and args.as_array:
                # Imported lazily, only for AMR passages stored as arrays.
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage,
                          outdir=spec.out_dir,
                          verbose=args.verbose,
                          binary=args.binary)
예제 #2
0
def main(args):
    """Find terminals matching the requested words, categories and dependencies.

    For each spec, every terminal in layer 0 is tested against the filters
    that were supplied; a filter that was not supplied matches everything.
    Each hit is written as ``<passage ID> <parent.fparent>`` to a text file
    in ``spec.out_dir`` whose name is the ``_``-joined search criteria.

    :param args: parsed command-line arguments; this function reads
        ``word``, ``category``, ``dependency`` and ``case_insensitive``.
    """
    # Normalise all three filters to lists so they can be concatenated for
    # the file name regardless of the sequence type argparse produced.
    words = list(args.word or ())
    categories = list(args.category or ())
    dependencies = list(args.dependency or ())
    if args.case_insensitive:
        words = [w.lower() for w in words]

    # tqdm's set_postfix() replaces the whole postfix on every call, so the
    # criteria are collected once and re-sent together with the hit count;
    # otherwise only the last call's keys would remain visible.
    criteria = {}
    if words:
        criteria["words"] = ",".join(words)
    if categories:
        criteria["categories"] = ",".join(categories)
    if dependencies:
        criteria["dependencies"] = ",".join(dependencies)

    for spec in read_specs(args, converters=FROM_FORMAT):
        if args.dependency:
            # Dependency filtering needs annotated passages.
            if spec.udpipe:
                spec.passages = annotate_udpipe(spec.passages, spec.udpipe)
            else:
                spec.passages = annotate_all(spec.passages, as_array=True,
                                             replace=True, lang=spec.lang)
        t = tqdm(spec.passages, unit=" passages", desc="Finding",
                 postfix=criteria)
        found = 0
        filename = os.path.join(
            spec.out_dir, "_".join(words + categories + dependencies) + ".txt")
        with open(filename, "w", encoding="utf-8") as f:
            for passage in t:
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    word = terminal.text
                    if args.case_insensitive:
                        word = word.lower()
                    if words and word not in words:
                        continue
                    if categories and parent.ftag not in categories:
                        continue
                    # The annotation lookup runs only when a dependency
                    # filter was actually requested.
                    if dependencies and get_annotation(
                            terminal, spec.udpipe) not in dependencies:
                        continue
                    print(passage.ID, parent.fparent, file=f)
                    found += 1
                    t.set_postfix(found=found, **criteria)
        print("Wrote '%s'" % filename)
예제 #3
0
def main(args):
    """Run UDPipe over every spec's sentences; optionally write and score.

    Each input item is turned into a pair: the reference sentence and the
    text handed to ``udpipe``.  The two streams are consumed in lockstep so
    that every parsed sentence is matched with its reference.

    :param args: parsed command-line arguments; this function reads
        ``verbose``, ``write`` and ``evaluate``.
    """
    def reference_and_input(p):
        # Passages are converted on the fly; anything else is used as the
        # reference as-is, with strip_enhanced() applied for the parser input.
        if isinstance(p, core.Passage):
            return (to_conllu_native(p),
                    to_conllu_native(p, test=True, enhanced=False))
        return p, strip_enhanced(p)

    for spec in read_specs(args, converters=CONVERTERS):
        results = []
        # tee() lets the reference side and the parser-input side be read
        # independently from the same lazily produced pairs.
        ref_side, input_side = tee(
            reference_and_input(p) for p in spec.passages)
        references = (ref for ref, _ in ref_side)
        parser_input = (raw for _, raw in input_side)
        predictions = split_by_empty_lines(
            udpipe(parser_input, spec.udpipe, args.verbose))
        progress = tqdm(zip(references, predictions), unit=" sentences")
        for sentence, parsed in progress:
            sentence = list(sentence)
            if args.write:
                sent_id = next(find_ids(sentence))
                progress.set_postfix(id=sent_id)
                with open_out_file(spec, sent_id) as out:
                    # NOTE(review): if `parsed` is a one-shot iterator this
                    # loop exhausts it before evaluate() below - confirm.
                    for line in parsed:
                        print(line, file=out)
            if args.evaluate:
                result = evaluate(parsed, sentence, verbose=args.verbose > 1)
                results.append(result)
        if results:
            Scores(results).print()
예제 #4
0
파일: parse_ud.py 프로젝트: zoharai/semstr
def main(args):
    """Parse every spec's passages, then optionally write and evaluate them.

    Each parsed passage has its labels mapped with ``args.label_map`` and is
    normalized before it is written and/or scored.  Scores are printed once
    per spec, and only if at least one was collected.

    :param args: parsed command-line arguments; this function reads
        ``verbose``, ``label_map``, ``write``, ``evaluate`` and
        ``output_format``.
    """
    for spec in read_specs(args, converters=FROM_FORMAT):
        collected = []
        if not args.verbose:
            # Label the progress bar with the output directory, or with the
            # language when the output directory is just ".".
            label = spec.lang if spec.out_dir == "." else spec.out_dir
            spec.passages = tqdm(spec.passages,
                                 unit=" passages",
                                 desc="Parsing " + label)
        pairs = parse(spec.passages, spec.lang, spec.udpipe, args.verbose)
        for passage, parsed in pairs:
            map_labels(parsed, args.label_map)
            normalize(parsed, extra=True)
            if args.write:
                write_passage(parsed, args)
            if not args.evaluate:
                continue
            evaluator = EVALUATORS.get(args.output_format)
            converter = TO_FORMAT.get(args.output_format)
            if converter is not None:
                # Convert reference and prediction to the output format
                # before they are compared.
                passage = converter(passage)
                parsed = converter(parsed)
            if evaluator is not None:
                score = evaluator.evaluate(parsed,
                                           passage,
                                           verbose=args.verbose > 1)
                collected.append(score)
        if collected:
            Scores(collected).print()
예제 #5
0
def main(args):
    """Build a SQLite database of terminals for every input spec.

    For each spec the passages are annotated (with ``annotate_udpipe`` when
    ``spec.udpipe`` is set, otherwise with ``annotate_all``) and a table
    ``terminals (pid, tid, text, ftag, fparent, dep)`` is (re)created in
    ``<spec.out_dir>/find.db`` with one row per layer-0 terminal.

    :param args: parsed command-line arguments, passed on to ``read_specs``.
    """
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe)
        else:
            spec.passages = annotate_all(spec.passages, as_array=True,
                                         replace=True, lang=spec.lang)
        filename = os.path.join(spec.out_dir, "find.db")
        conn = sqlite3.connect(filename)
        try:
            # The connection's context manager only commits or rolls back;
            # it does not close, hence the explicit close() in `finally`.
            with conn:
                c = conn.cursor()
                # IF EXISTS: a freshly created database has no such table,
                # and a plain DROP TABLE would raise OperationalError.
                # Dropping the table also drops its indexes.
                c.execute("DROP TABLE IF EXISTS terminals")
                c.execute(
                    "CREATE TABLE terminals "
                    "(pid, tid, text, ftag, fparent, dep)")
                c.execute("CREATE INDEX idx_terminals_pid ON terminals (pid)")
                c.execute(
                    "CREATE INDEX idx_terminals_text ON terminals (text)")
                c.execute(
                    "CREATE INDEX idx_terminals_ftag ON terminals (ftag)")
                c.execute("CREATE INDEX idx_terminals_dep ON terminals (dep)")
                for passage in tqdm(spec.passages,
                                    unit=" passages",
                                    desc="Creating " + filename):
                    rows = []
                    for terminal in passage.layer(layer0.LAYER_ID).all:
                        parent = terminal.parents[0]
                        rows.append(
                            (passage.ID, terminal.ID, terminal.text,
                             parent.ftag, str(parent.fparent),
                             get_annotation(terminal, spec.udpipe)))
                    c.executemany(
                        "INSERT INTO terminals VALUES (?,?,?,?,?,?)", rows)
                    # Commit per passage so finished passages are persisted
                    # even if a later one fails.
                    conn.commit()
        finally:
            conn.close()
예제 #6
0
def main(args):
    """Annotate the passages of every input spec and write them out.

    Passages are first annotated with ``annotate_udpipe`` when
    ``spec.udpipe`` is set, else with ``copy_annotation`` when
    ``spec.conllu`` is set; the result is then always passed through
    ``annotate_all`` and written with ``write_passage``.

    :param args: parsed command-line arguments; this function reads
        ``as_array``, ``verbose`` and ``binary``.
    """
    for spec in read_specs(args, converters=FROM_FORMAT):
        # Options shared by both external annotation sources.
        options = {"as_array": args.as_array, "verbose": args.verbose}
        if spec.udpipe:
            spec.passages = annotate_udpipe(
                spec.passages, spec.udpipe, **options)
        elif spec.conllu:
            spec.passages = copy_annotation(
                spec.passages, spec.conllu, **options)

        # A progress bar is shown only when verbose output is off.
        if args.verbose:
            source = spec.passages
        else:
            source = tqdm(spec.passages,
                          unit=" passages",
                          desc="Annotating " + spec.out_dir)

        annotated = annotate_all(source,
                                 as_array=args.as_array,
                                 replace=not spec.udpipe,
                                 lang=spec.lang,
                                 verbose=args.verbose)
        for passage in annotated:
            write_passage(passage,
                          outdir=spec.out_dir,
                          verbose=args.verbose,
                          binary=args.binary)
예제 #7
0
def main(args):
    """List layer-1 nodes of the given categories that span several terminals.

    For each spec, every node in layer 1 whose ``ftag`` occurs in the
    concatenated category string and which has more than one terminal
    (remotes included) is written as ``<passage ID> <node ID> <node>`` to
    ``multi_token_<categories>.txt`` in ``spec.out_dir``.

    :param args: parsed command-line arguments; this function reads
        ``category``.
    """
    # Categories are concatenated into one string, so the membership test
    # below is a substring test.
    categories = "".join(args.category)
    for spec in read_specs(args, converters=FROM_FORMAT):
        progress = tqdm(spec.passages,
                        unit=" passages",
                        desc="Finding multi-token units",
                        postfix={"categories": categories})
        hits = 0
        out_path = os.path.join(spec.out_dir,
                                "multi_token_" + categories + ".txt")
        with open(out_path, "w", encoding="utf-8") as out:
            for passage in progress:
                for node in passage.layer(layer1.LAYER_ID).all:
                    try:
                        if node.ftag not in categories:
                            continue
                        if len(node.get_terminals(remotes=True)) <= 1:
                            continue
                        print(passage.ID, node.ID, node, file=out)
                        hits += 1
                        progress.set_postfix(found=hits)
                    except (AttributeError, KeyError, ValueError, TypeError):
                        # Best effort: nodes that cannot be inspected or
                        # printed are skipped silently.
                        continue
        print("Wrote '%s'" % out_path)
예제 #8
0
def main(args):
    """Run UDPipe over every spec's sentences; optionally write and score.

    The input sentences are duplicated with ``tee`` so that one copy serves
    as the reference while the other is fed to ``udpipe``; both are then
    consumed in lockstep.

    :param args: parsed command-line arguments; this function reads
        ``verbose``, ``write`` and ``evaluate``.
    """
    for spec in read_specs(args, converters=CONVERTERS):
        results = []
        references, parser_input = tee(spec.passages)
        predictions = split_by_empty_lines(
            udpipe(parser_input, spec.udpipe, args.verbose))
        progress = tqdm(zip(references, predictions), unit=" sentences")
        for sentence, parsed in progress:
            sentence = list(sentence)
            if args.write:
                sent_id = next(find_ids(sentence))
                progress.set_postfix(id=sent_id)
                out_path = os.path.join(spec.out_dir, sent_id + ".conllu")
                with open(out_path, "w", encoding="utf-8") as out:
                    # NOTE(review): if `parsed` is a one-shot iterator this
                    # loop exhausts it before evaluate() below - confirm.
                    for line in parsed:
                        print(line, file=out)
            if args.evaluate:
                result = evaluate(parsed, sentence, verbose=args.verbose > 1)
                results.append(result)
        if results:
            Scores(results).print()
예제 #9
0
def main(args):
    """Set the ``lang`` attribute on every passage and write it back out.

    :param args: parsed command-line arguments; this function reads
        ``binary``.
    """
    for spec in read_specs(args, converters=FROM_FORMAT):
        progress = tqdm(
            spec.passages,
            unit=" passages",
            desc="Setting language in " + spec.out_dir,
            postfix={"lang": spec.lang},
        )
        for passage in progress:
            # Stamp the spec's language on the passage before it is saved.
            passage.attrib["lang"] = spec.lang
            write_passage(
                passage,
                outdir=spec.out_dir,
                verbose=False,
                binary=args.binary,
            )