Пример #1
0
def main(args):
    """Visualize passages as TikZ text, standoff annotations, or plots.

    Output goes to args.out_dir when given; otherwise text formats are
    printed to stdout and plots are shown interactively.
    """
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
        if not args.tikz:
            # Select a non-interactive backend before pyplot is imported below,
            # so figures can be saved without a display.
            import matplotlib
            matplotlib.use('Agg')
    # When text output goes to stdout, skip the progress bar to keep the
    # stream clean.
    to_stdout = (args.tikz or args.standoff) and not args.out_dir
    t = args.passages
    t = get_passages(t) if to_stdout else get_passages_with_progress_bar(
        t, desc="Visualizing")
    if args.sentences:
        # Lazily expand each passage into its sentence-level passages.
        t = (sentence for passage in t
             for sentence in split2sentences(passage))
    for passage in t:
        if args.tikz:
            print_text(args, visualization.tikz(passage),
                       passage.ID + ".tikz.txt")
        elif args.standoff:
            print_text(args, visualization.standoff(passage),
                       passage.ID + ".ann")
        else:
            import matplotlib.pyplot as plt
            # Figure width scales with the number of terminals; 19:10 aspect.
            width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27
            plt.figure(passage.ID, figsize=(width, width * 10 / 19))
            visualization.draw(passage, node_ids=args.node_ids)
            if args.out_dir:
                plt.savefig(
                    os.path.join(args.out_dir, passage.ID + "." + args.format))
                plt.close()
            else:
                plt.show()
Пример #2
0
def copy_annotation(passages,
                    conllu,
                    by_id=False,
                    as_array=True,
                    as_extra=True,
                    verbose=False,
                    lang=None):
    """Copy annotation from CoNLL-U passages onto *passages*, yielding each.

    :param passages: iterable of passages to annotate in place
    :param conllu: source (files/patterns) of the annotated passages
    :param by_id: if True, match annotations by passage ID; otherwise both
        iterables are assumed to be in the same order
    :param as_array: copy token docs into the layer-0 representation
    :param as_extra: copy per-token attributes onto each terminal
    :param verbose: print the ID of each annotation source passage
    :param lang: language code passed through to copy_tok_to_extra
    :raise ValueError: if an annotation is missing for some passage
    """
    # Either an ID-keyed dict (by_id) or a generator consumed in lockstep.
    conllu_sentences = {annotated.ID: annotated for annotated in
                        get_passages_with_progress_bar(conllu, converters=CONVERTERS, desc="Reading '%s'" % conllu)} \
        if by_id else get_passages(conllu, converters=CONVERTERS)
    for passage in passages:
        try:
            annotated = conllu_sentences[passage.ID] if by_id else next(
                conllu_sentences)
        except (KeyError, StopIteration) as e:
            raise ValueError(
                "Missing annotation for passage ID '%s', by_id=%s" %
                (passage.ID, by_id)) from e
        if verbose:
            with external_write_mode():
                print("Reading annotation from '%s'" % annotated.ID)
        if as_array:
            # Slice-assign so existing references to the docs list stay valid.
            passage.layer(layer0.LAYER_ID).docs()[:] = annotated.layer(
                layer0.LAYER_ID).docs()
        if as_extra:
            # Terminals are assumed to align one-to-one; zip stops at the shorter.
            for terminal, annotated_terminal in zip(
                    passage.layer(layer0.LAYER_ID).all,
                    annotated.layer(layer0.LAYER_ID).all):
                copy_tok_to_extra(annotated_terminal, terminal, lang=lang)
        yield passage
Пример #3
0
def main(args):
    """Convert passages to plain text: one joined file, or one file each."""
    os.makedirs(args.outdir, exist_ok=True)
    if args.join:
        # All passages concatenated into a single text file.
        joined_path = os.path.join(args.outdir, args.join)
        with open(joined_path, "w", encoding="utf-8") as out:
            ordered = sorted(args.filenames, key=numeric)
            for passage in get_passages_with_progress_bar(ordered,
                                                          desc="Converting"):
                write_text(passage,
                           out,
                           sentences=args.sentences,
                           lang=args.lang,
                           prepend_id=args.prepend_id)
        print("Wrote '%s'." % joined_path)
    else:  # one file per passage
        for pattern in args.filenames:
            matched = glob(pattern) or [pattern]
            for filename in tqdm(matched, desc="Converting",
                                 unit=" passages"):
                passage = file2passage(filename)
                stem = os.path.splitext(os.path.basename(filename))[0]
                txt_path = os.path.join(args.outdir, stem + ".txt")
                with open(txt_path, "w", encoding="utf-8") as out:
                    write_text(passage,
                               out,
                               sentences=args.sentences,
                               lang=args.lang,
                               prepend_id=args.prepend_id)
Пример #4
0
def main(args):
    """Index candidate lines from args.text and match passages against them.

    Matches are written to args.out when given, otherwise to stdout.
    """
    matchers = [CandidateMatcher(line) for line in tqdm(list(gen_lines(args.text)),
                                                        desc="Indexing " + args.text, unit=" lines")]
    out = open(args.out, "w", encoding="utf-8") if args.out else sys.stdout
    try:
        for p in get_passages_with_progress_bar(args.filenames, desc="Matching", converters={}):
            match_passage_text(p, matchers, out)
    finally:
        # Only close a handle we opened: the original unconditionally called
        # out.close(), which closed sys.stdout when no output file was given.
        if out is not sys.stdout:
            out.close()
Пример #5
0
def main(args):
    """Split passages into sentences, writing each sentence to its own file.

    Uses a Splitter built from args.sentences when available, otherwise
    splits heuristically; reports unmatched split-file sentences at the end.
    """
    splitter = Splitter.read_file(args.sentences,
                                  enum=args.enumerate,
                                  suffix_format=args.suffix_format,
                                  suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # running sentence count, used to enumerate sentence IDs
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(
                passage) if splitter else split2sentences(
                    passage,
                    remarks=args.remarks,
                    lang=args.lang,
                    ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(
                args.outdir, args.prefix + sentence.ID +
                (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print("Writing passage file for sentence '%s'..." %
                          outfile,
                          file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    # Report sentences from the split file that matched no passage.
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("Unmatched sentences:",
              *[
                  s for i, s in enumerate(splitter.sentences)
                  if i not in splitter.matched_indices
              ],
              sep="\n")
Пример #6
0
def main(args):
    """Aggregate corpus statistics per directory and print/save a table.

    Counts sentences, tokens, non-terminal nodes and edge categories per
    directory, then converts several counts to percentages before printing
    (or saving to args.outfile as an ampersand-separated LaTeX-style table).
    """
    df = pd.DataFrame(index=args.directories, columns=["sentences", "tokens", "nodes", "discontinuous", "reentrant",
                                                       "implicit", "edges", "primary", "remote"])
    df.fillna(0, inplace=True)
    # The enumerate() index in the original was never used.
    for directory in args.directories:
        # NOTE(review): mutating this row assumes df.loc returns a view into
        # df here -- verify with the pandas version in use.
        row = df.loc[directory]
        for passage in get_passages_with_progress_bar(directory, desc=directory):
            l1 = passage.layer(layer1.LAYER_ID)
            # Non-trivial non-terminals: exclude heads and single-terminal nodes.
            non_terminals = [n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1]
            edges = {e for n in non_terminals for e in n}
            remote_counter = Counter(e.attrib.get("remote", False) for e in edges)
            row["sentences"] += 1
            row["tokens"] += len(passage.layer(layer0.LAYER_ID).all)
            row["nodes"] += len(non_terminals)
            row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous)
            row["reentrant"] += sum(1 for n in non_terminals if any(e.attrib.get("remote") for e in n.incoming))
            row["edges"] += len(edges)
            row["primary"] += remote_counter[False]
            row["remote"] += remote_counter[True]
            row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit"))

    # Change to percentages
    df["discontinuous"] *= 100. / df["nodes"]
    df["reentrant"] *= 100. / df["nodes"]
    df["implicit"] *= 100. / df["nodes"]
    df["primary"] *= 100. / df["edges"]
    df["remote"] *= 100. / df["edges"]

    # Print
    if args.outfile:
        df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n")
        print("Saved to " + args.outfile)
    else:
        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            print(df.T)
Пример #7
0
def main(args):
    """Index alternative spellings of candidate lines and match passages.

    Matches are written to args.out when given, otherwise to stdout.
    """
    matchers = [CandidateMatcher(spelling) for line in tqdm(list(gen_lines(args.text)),
                                                            desc="Indexing " + args.text, unit=" lines")
                for spelling in alternative_spellings(line)]
    out = open(args.out, "w", encoding="utf-8") if args.out else sys.stdout
    try:
        for p in get_passages_with_progress_bar(args.filenames, desc="Matching", converters={}):
            match_passage_text(p, matchers, out)
    finally:
        # Only close a handle we opened: the original unconditionally called
        # out.close(), which closed sys.stdout when no output file was given.
        if out is not sys.stdout:
            out.close()
Пример #8
0
def main(filename, input_filenames, outdir):
    """Rename passages using a mapping file of "<new_id> <old_id>" lines."""
    os.makedirs(outdir, exist_ok=True)
    id_map = {}
    with open(filename, encoding="utf-8") as mapping_file:
        for line in mapping_file:
            new_id, old_id = line.strip().split()
            id_map[old_id] = new_id
    for passage in get_passages_with_progress_bar(input_filenames, desc="Renaming"):
        passage._ID = id_map[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)
Пример #9
0
def main(args):
    """Convert each passage to site XML and write it as <ID>.xml."""
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".xml")
        site_xml = tostring(convert.to_site(passage)).decode()
        with open(site_filename, "w", encoding="utf-8") as handle:
            print(site_xml, file=handle)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % site_filename)
Пример #10
0
def main(args):
    """Write every passage in site XML form to args.outdir."""
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        target = os.path.join(args.outdir, passage.ID + ".xml")
        with open(target, "w", encoding="utf-8") as out:
            content = tostring(convert.to_site(passage)).decode()
            print(content, file=out)
        if not args.verbose:
            continue
        with external_write_mode():
            print("Wrote '%s'" % target)
Пример #11
0
def main(filename, input_filenames, outdir):
    """Rename passages according to a "<new_id> <old_id>" mapping file."""
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as f:
        mapping = dict(line.strip().split() for line in f)
    # mapping is {new_id: old_id}; invert it for lookup by current ID.
    old_to_new_id = {old: new for new, old in mapping.items()}
    for passage in get_passages_with_progress_bar(input_filenames,
                                                  desc="Renaming"):
        passage._ID = old_to_new_id[passage.ID]
        write_passage(passage, outdir=outdir, verbose=False)
Пример #12
0
def main(args):
    """Annotate passages and write them out, verifying annotation succeeded.

    :raise ValueError: if a passage comes back unannotated. (Previously an
        ``assert``, which is silently stripped under ``python -O``.)
    """
    for passage in annotate_all(get_passages_with_progress_bar(
            args.filenames, desc="Annotating"),
                                replace=True,
                                as_array=args.as_array,
                                verbose=args.verbose):
        if not is_annotated(passage, args.as_array):
            raise ValueError("Passage %s is not annotated" % passage.ID)
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
Пример #13
0
def main(args):
    """Convert passages, logging applied rules per edge to a CSV report."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        report_writer = csv.writer(report)
        report_writer.writerow(("rule", "passage", "edge", "before", "after"))
        passages = get_passages_with_progress_bar(args.passages, desc="Converting")
        for passage in passages:
            convert_passage(passage, report_writer=report_writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix,
                          verbose=args.verbose)
            report.flush()
    print("Wrote '%s'" % args.outfile)
def main(args):
    """Convert passages (language-aware), logging terminal changes to CSV."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        report_writer = csv.writer(report)
        report_writer.writerow(("rule", "passage", "terminal", "before", "after"))
        for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
            # Prefer a language set on the passage itself over the CLI default.
            passage_lang = passage.attrib.get("lang", args.lang)
            convert_passage(passage, lang=passage_lang, report_writer=report_writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix,
                          verbose=args.verbose)
            report.flush()
    print("Wrote '%s'" % args.outfile)
Пример #15
0
def read(fp, text=None, prefix=None):
    """Yield graphs converted from the passage paths listed in *fp*.

    Paths (one per line) are resolved relative to the directory containing
    *fp*. Passages that fail to convert are reported and skipped.

    Cleanup: removed trailing semicolons and normalized the signature
    spacing to PEP 8 (no behavior change).
    """
    parent = Path(fp.name).parent
    paths = {parent / file.strip() for file in fp}
    for passage in get_passages_with_progress_bar(map(str, paths), desc="Analyzing"):
        try:
            graph = passage2graph(passage, text, prefix)
        except Exception as exception:
            # Best-effort conversion: report the failure and keep going.
            print(exception)
            continue
        yield graph
Пример #16
0
def main(args):
    """Print per-passage graph statistics as CSV, optionally saving/summing.

    Each row counts paragraphs, sentences, terminal/non-terminal nodes,
    implicit and linkage nodes, discontiguous units, and edge/parent/child
    statistics; with --summary, only the column sums are printed.
    """
    print(
        "id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
        "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for passage in get_passages_with_progress_bar(args.filenames):
        terminals = passage.layer(layer0.LAYER_ID).all
        # All layer-1 nodes except the root ("1.1").
        non_terminals = [
            n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"
        ]
        non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
        linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
        edges = {e for n in non_terminals for e in n}
        remote = [e for e in edges if e.attrib.get("remote")]
        linkage_edges = [e for n in linkage_nodes for e in n]
        fields = (
            int(passage.ID),
            1,  # each input counts as one passage
            len({t.paragraph
                 for t in terminals}),
            len(break2sentences(passage)),
            len(terminals) + len(non_terminals),
            len(terminals),
            len(non_terminals),
            len([n for n in non_linkage if n.attrib.get("implicit")]),
            len(linkage_nodes),
            len([
                n for n in non_linkage
                if n.tag == NodeTags.Foundational and n.discontiguous
            ]),
            len(edges),
            # Primary edges = all edges minus remote minus linkage.
            len(edges) - len(remote) - len(linkage_edges),
            len(remote),
            len(linkage_edges),
            # Parent/child counts ignore the root as a parent.
            sum(
                len([p for p in n.parents if p.ID != "1.1"])
                for n in non_linkage),
            sum(len(n.children) for n in non_linkage),
            len([
                n for n in non_linkage
                if len([p for p in n.parents if p.ID != "1.1"]) > 1
            ]),
        )
        if not args.summary:
            with tqdm.external_write_mode():
                print(",".join("%d" % f for f in fields))
        data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        # Rows are saved sorted by passage ID.
        np.savetxt(args.outfile,
                   data[data[:, 0].argsort()],
                   fmt="%i",
                   delimiter="\t")
    if args.summary:
        print(",".join("%d" % f for f in data.sum(axis=0)))
Пример #17
0
def main(args):
    """Fix passage tokenization against a word list, logging changes to CSV."""
    os.makedirs(args.outdir, exist_ok=True)
    words_set = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as log_file:
        log_writer = csv.writer(log_file)
        passages = get_passages_with_progress_bar(args.filenames, "Fixing tokenization")
        for passage in passages:
            fixed = fix_tokenization(passage, words_set, lang=args.lang, cw=log_writer)
            if fixed is None:
                continue
            log_file.flush()
            normalize(fixed)
            write_passage(fixed, outdir=args.outdir, binary=args.binary,
                          prefix=args.prefix, verbose=args.verbose)
Пример #18
0
def from_xml(xml_files):
    """Draw each passage in *xml_files* and save it as a PNG.

    NOTE(review): reads a module-level ``args`` (``args.out_dir``) that is
    not a parameter of this function -- confirm it exists at call time.
    """

    passages = get_passages_with_progress_bar(xml_files, desc="Visualizing")

    for passage in passages:

        # Figure width scales with the number of terminals; 19:10 aspect.
        width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27
        plt.figure(figsize=(width, width * 10 / 19))
        draw_from_native(passage)

        plt.savefig(os.path.join(args.out_dir, passage.ID + ".png"))
        plt.close()
Пример #19
0
def main(args):
    """Normalize passages and write them back out."""
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    passages = get_passages_with_progress_bar(args.filenames,
                                              desc="Normalizing",
                                              converters={})
    for passage in passages:
        normalize(passage, extra=args.extra)
        write_passage(passage, outdir=args.outdir, prefix=args.prefix,
                      binary=args.binary, verbose=False)
Пример #20
0
def main(args):
    """Annotate and convert passages, logging rule applications to CSV."""
    # Presumably forces per-passage annotation batches -- confirm in textutil.
    textutil.BATCH_SIZE = 1
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        report_writer = csv.writer(report)
        report_writer.writerow(("rule", "passage", "terminal", "pos", "before", "after"))
        annotated = annotate_all(
            get_passages_with_progress_bar(args.passages, desc="Converting"),
            verbose=args.verbose)
        for passage in annotated:
            convert_passage(passage, report_writer=report_writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix,
                          verbose=args.verbose)
            report.flush()
    print("Wrote '%s'" % args.outfile)
Пример #21
0
def main(args):
    """Validate passages; print errors grouped by passage ID, exit 1 on any."""
    errors = {}
    for passage in get_passages_with_progress_bar(args.filenames,
                                                  desc="Validating"):
        found = list(validate(passage))
        if found:
            errors[passage.ID] = found
    if not errors:
        print("No errors found.")
        return
    id_len = max(map(len, errors))
    for passage_id, messages in sorted(errors.items()):
        for i, message in enumerate(messages):
            # Print the ID only on the first line of each passage's errors.
            print("%-*s|%s" % (id_len, "" if i else passage_id, message))
    sys.exit(1)
Пример #22
0
def main(args):
    """Validate passages in parallel; print errors unless strict, exit 1 on any."""
    validator = Validator(args.normalize, args.extra, linkage=args.linkage, strict=args.strict)
    with Pool(10) as pool:
        results = pool.map(validator.validate_passage,
                           get_passages_with_progress_bar(args.filenames, desc="Validating", converters={}))
    errors = {passage_id: found for passage_id, found in results if found}
    if not errors:
        print("No errors found.")
        return
    if not args.strict:
        id_len = max(map(len, errors))
        for passage_id, found in sorted(errors.items()):
            print_errors(passage_id, found, id_len)
    sys.exit(1)
Пример #23
0
def main(args):
    """Aggregate corpus statistics per directory and print/save a table.

    Counts sentences, tokens, non-terminal nodes and edge categories per
    directory; several columns are converted to percentages at the end.
    NOTE(review): mutating ``row`` assumes ``df.loc[directory]`` is a view
    into ``df`` -- verify with the pandas version in use.
    """
    df = pd.DataFrame(index=args.directories,
                      columns=[
                          "sentences", "tokens", "nodes", "discontinuous",
                          "reentrant", "implicit", "edges", "primary", "remote"
                      ])
    df.fillna(0, inplace=True)
    for i, directory in enumerate(args.directories):
        row = df.loc[directory]
        for passage in get_passages_with_progress_bar(directory,
                                                      desc=directory):
            l1 = passage.layer(layer1.LAYER_ID)
            # Non-trivial non-terminals: exclude heads and single-terminal nodes.
            non_terminals = [
                n for n in l1.all
                if n not in l1.heads and len(n.get_terminals()) > 1
            ]
            edges = {e for n in non_terminals for e in n}
            remote_counter = Counter(
                e.attrib.get("remote", False) for e in edges)
            row["sentences"] += 1
            row["tokens"] += len(passage.layer(layer0.LAYER_ID).all)
            row["nodes"] += len(non_terminals)
            row["discontinuous"] += sum(1 for n in non_terminals
                                        if n.discontiguous)
            row["reentrant"] += sum(1 for n in non_terminals if any(
                e.attrib.get("remote") for e in n.incoming))
            row["edges"] += len(edges)
            row["primary"] += remote_counter[False]
            row["remote"] += remote_counter[True]
            row["implicit"] += sum(1 for n in l1.all
                                   if n.attrib.get("implicit"))

    # Change to percentages
    df["discontinuous"] *= 100. / df["nodes"]
    df["reentrant"] *= 100. / df["nodes"]
    df["implicit"] *= 100. / df["nodes"]
    df["primary"] *= 100. / df["edges"]
    df["remote"] *= 100. / df["edges"]

    # Print
    if args.outfile:
        # Ampersand separator and trailing backslashes: LaTeX table rows.
        df.T.to_csv(args.outfile,
                    float_format="%.2f",
                    sep="&",
                    line_terminator=" \\\\\n")
        print("Saved to " + args.outfile)
    else:
        with pd.option_context("display.max_rows", None, "display.max_columns",
                               None):
            print(df.T)
Пример #24
0
def main(args):
    """Split passages into sentences, writing each sentence to its own file."""
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # running sentence counter for enumerated IDs
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None)
        for sentence in sentences:
            i += 1
            suffix = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + suffix)
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
Пример #25
0
def main(args):
    """Report edge roles that never repeat on a single node's edges."""
    out = args.direction == "out"
    # All string-valued edge tags declared on layer1.EdgeTags.
    roles = {tag for name, tag in layer1.EdgeTags.__dict__.items()
             if isinstance(tag, str) and not name.startswith('__')}
    for passage in get_passages_with_progress_bar([args.directory]):
        for node in passage.layer(layer1.LAYER_ID).all:
            edges = node if out else node.incoming
            tag_counts = Counter(edge.tag for edge in edges)
            repeated = {tag for tag, n in tag_counts.items() if n > 1}
            roles -= repeated

    lines = "\n".join(sorted(roles))
    print(lines)
    if args.outfile:
        with open(args.outfile, "w", encoding="utf-8") as f:
            print(lines, file=f)
Пример #26
0
def main(args):
    """Print the edges extracted for each requested construction."""
    for passage in get_passages_with_progress_bar(args.passages):
        extracted = constructions.extract_edges(
            passage, constructions=args.constructions, verbose=args.verbose)
        if not any(extracted.values()):
            continue
        with tqdm.external_write_mode():
            if not args.verbose:
                print("%s:" % passage.ID)
            for construction, edges in extracted.items():
                if not edges:
                    continue
                print("  %s:" % construction.description)
                for edge in edges:
                    print("    %s [%s %s]" %
                          (edge, edge.tag, edge.child))
            print()
Пример #27
0
def main(args):
    """Print candidate edges per construction for each passage."""
    for passage in get_passages_with_progress_bar(args.passages):
        candidates_by_construction = extract_candidates(
            passage, constructions=args.constructions, verbose=args.verbose)
        c2es = OrderedDict()
        for construction, candidates in candidates_by_construction.items():
            if candidates:
                c2es[construction] = [candidate.edge for candidate in candidates]
        if not any(c2es.values()):
            continue
        with external_write_mode():
            if not args.verbose:
                print("%s:" % passage.ID)
            for construction, edges in c2es.items():
                if edges:
                    print("  %s:" % construction.description)
                    for edge in edges:
                        print("    %s [%s %s]" % (edge, edge.tag, edge.child))
            print()
Пример #28
0
def main(args):
    """Split passages into sentences, optionally following an order file."""
    order = None
    if args.sentences:
        with open(args.sentences, encoding="utf-8") as f:
            # Map each stripped sentence to its line index.
            order = {line.strip(): index for index, line in enumerate(f)}
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if order:
            sentences = split(passage, order)
        else:
            sentences = split2sentences(passage, remarks=args.remarks,
                                        lang=args.lang)
        for sentence in sentences:
            suffix = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + suffix)
            with tqdm.external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile,
                      file=sys.stderr)
            passage2file(sentence, outfile, args.binary)
Пример #29
0
def main(args):
    """Convert passages to plain text: one joined file, or one per passage."""
    os.makedirs(args.outdir, exist_ok=True)
    text_kwargs = dict(sentences=args.sentences, lang=args.lang,
                       prepend_id=args.prepend_id)
    if args.join:
        out_file = os.path.join(args.outdir, args.join)
        with open(out_file, "w", encoding="utf-8") as f:
            ordered = sorted(args.filenames, key=numeric)
            for passage in get_passages_with_progress_bar(ordered, desc="Converting"):
                write_text(passage, f, **text_kwargs)
        print("Wrote '%s'." % out_file)
    else:  # one file per passage
        for pattern in args.filenames:
            matched = glob(pattern) or [pattern]
            for filename in tqdm(matched, desc="Converting", unit=" passages"):
                passage = file2passage(filename)
                stem = os.path.splitext(os.path.basename(filename))[0]
                txt_path = os.path.join(args.outdir, stem + ".txt")
                with open(txt_path, "w", encoding="utf-8") as f:
                    write_text(passage, f, **text_kwargs)
Пример #30
0
def main(args):
    """Validate passages in parallel with a configurable Validator."""
    validator = Validator(args.normalize, args.extra, linkage=args.linkage,
                          multigraph=args.multigraph, strict=args.strict)
    with Pool(10) as pool:
        results = pool.map(validator.validate_passage,
                           get_passages_with_progress_bar(args.filenames, desc="Validating", converters={}))
    errors = {pid: messages for pid, messages in results if messages}
    if not errors:
        print("No errors found.")
        return
    if not args.strict:
        id_len = max(map(len, errors))
        for passage_id, messages in sorted(errors.items()):
            print_errors(passage_id, messages, id_len)
    sys.exit(1)
Пример #31
0
def main(args):
    """Fix passage tokenization against a word list, logging changes to CSV."""
    os.makedirs(args.outdir, exist_ok=True)
    words_set = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as outfile:
        cw = csv.writer(outfile)
        passages = get_passages_with_progress_bar(args.filenames,
                                                  "Fixing tokenization")
        for passage in passages:
            fixed = fix_tokenization(passage, words_set, lang=args.lang, cw=cw)
            if fixed is None:
                continue
            outfile.flush()
            normalize(fixed)
            write_passage(fixed, outdir=args.outdir, binary=args.binary,
                          prefix=args.prefix, verbose=args.verbose)
Пример #32
0
def main(args):
    """For each sentence of every passage, print the text and, for each
    terminal (or its 'C'-tagged parent, printed once per parent), the path
    found by find_path from that node upward."""

    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        i = 0  # sentence index within the passage
        for sen in t:
            #print('sentence %d\n\n%s\n%s' %(i,convert.to_text(sen), convert.to_sequence(sen)))
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            i += 1
            # IDs of 'C'-tagged parents already handled (name sic in original).
            compunds = []
            for node in sen.nodes:
                if (sen.nodes[node].layer.ID == '0'):  # terminal layer only
                    find_id = ''
                    l = sen.nodes[node]
                    # Terminals whose first parent has ftag 'C' are grouped:
                    # the parent's children are joined into one unit.
                    # NOTE(review): 'C' presumably marks compounds/centers --
                    # confirm the tag semantics against the scheme in use.
                    if (l.parents[0].ftag == 'C'):
                        if (l.parents[0].ID not in compunds):
                            compunds.append(l.parents[0].ID)
                            tmp_c = []
                            for n in l.parents[0].children:
                                tmp_c.append(n.text)
                            #print('Word: %s\nWord ID: %s' %(tmp_c,l.parents[0].ID))
                            find_id = l.parents[0].ID
                            path = []
                            path.append(' '.join(tmp_c))
                            path = find_path(sen.nodes[find_id], path)
                            print(' '.join(path))
                            '''
                            for j in path:
                                print(j)
                            '''
                            print('-------')

                    else:
                        #print('Word: %s\nWord ID: %s' % (l.text, l.ID))
                        find_id = l.ID
                        path = []
                        path = find_path(sen.nodes[find_id], path)
                        print(' '.join(path))
                        '''
                        for j in path:
                            print(j)
                        '''
                        print('-------')
            print(
                '------------------------------------------------------------------'
            )
Пример #33
0
def main(args):
    """Upload passages to an annotation-site database, recording IDs.

    Reads extra passage paths (one per line) from args.filenames if given,
    and writes "<passage ID> <returned id>" lines to args.out.
    """
    filenames = list(args.passages)
    if args.filenames:
        with open(args.filenames, encoding="utf-8") as f:
            # Skip blank lines in the filename list.
            filenames += list(filter(None, map(str.strip, f)))
    with open(args.out, "w", encoding="utf-8") as f:
        for passage in get_passages_with_progress_bar(filenames):
            out = upload_passage(convert.to_site(passage), verbose=args.verbose,
                                 site_filename=passage.ID + "_site_upload.xml" if args.write_site else None,
                                 db_name=args.db_name, host_name=args.host_name,
                                 new_pid=passage.ID, new_prid=args.project_id, username=args.username)
            print(passage.ID, out, file=f)
            if args.verbose:
                print("Uploaded passage %s with xid=%s" % (passage.ID, out))
    # CONNECTION is a module-level DB handle -- presumably opened by
    # upload_passage; commit once after all uploads. TODO confirm.
    if CONNECTION is not None:
        CONNECTION.commit()
    print("Wrote '%s'" % args.out)
Пример #34
0
def main(args):
    """Split passages into sentences, writing one file per sentence.

    Uses a Splitter built from args.sentences when available, otherwise
    splits heuristically; reports unmatched split-file sentences at the end.
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0  # running sentence counter for enumerated IDs
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang, ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    # Report sentences from the split file that matched no passage.
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:", *[s for i, s in enumerate(splitter.sentences)
                                            if i not in splitter.matched_indices], sep="\n")
Пример #35
0
def main(args):
    """Collect in-/out-degree histograms over all layer-1 nodes and report them.

    For every layer-1 node except the root ("1.1"), counts the number of
    incoming edges (clipped at 3) and outgoing edges (clipped at 7).  Each
    histogram is written as tab-separated "value<TAB>count" lines to
    ``args.outfile + label + ".txt"`` (or stdout if no outfile), then plotted
    on a best-effort basis.
    """
    histograms = defaultdict(Counter)
    for passage in get_passages_with_progress_bar(args.filenames):
        for node in passage.layer(layer1.LAYER_ID).all:
            if node.ID != "1.1":  # Exclude the root node
                histograms["parents"][clip(node.incoming, 3)] += 1
                histograms["children"][clip(node.outgoing, 7)] += 1

    for label, counter in histograms.items():
        lines = ["%s\t%d\n" % (num, cnt) for num, cnt in counter.items()]
        if args.outfile:
            # BUG FIX: the handle was previously opened without a context
            # manager and leaked if writelines() raised.
            with open(args.outfile + label + ".txt", "w", encoding="utf-8") as handle:
                handle.writelines(lines)
        else:
            sys.stdout.writelines(lines)
        try:
            plot_histogram(counter, label, plot=args.plot)
            plot_pie(counter, label, plot=args.plot)
        except Exception:
            # Plotting is best-effort (e.g. no display backend); was a bare
            # `except:` which also swallowed KeyboardInterrupt/SystemExit.
            pass
Пример #36
0
 def upload_tasks(self, filenames, log=None, **kwargs):
     """Upload all passages matching the given glob patterns as tasks (generator).

     :param filenames: iterable of glob patterns; each must match at least one file
     :param log: optional path of a log file to pass through to upload_task
     :raises IOError: if a pattern matches no files
     :raises ValueError: on HTTP errors, carrying the server's detail message
     """
     del kwargs  # accepted for interface compatibility; intentionally unused
     log_h = open(log, "w", encoding="utf-8") if log else None
     try:
         for pattern in filenames:
             matched = sorted(glob(pattern))  # don't shadow the `filenames` parameter
             if not matched:
                 raise IOError("Not found: " + pattern)
             for passage in get_passages_with_progress_bar(matched, desc="Uploading"):
                 logging.debug("Uploading passage %s" % passage.ID)
                 task = self.upload_task(passage, log=log_h)
                 logging.debug("Submitted task %d" % task["id"])
                 yield task
     except HTTPError as e:
         # Surface the server-provided error detail when the response is JSON
         try:
             raise ValueError(e.response.json()["detail"]) from e
         except JSONDecodeError:
             raise ValueError(e.response.text) from e
     finally:
         # BUG FIX: the log handle was previously closed only on HTTPError
         # (the finally was attached to the inner re-raise try); close it on
         # every exit path, including normal exhaustion and generator close.
         if log_h is not None:
             log_h.close()
Пример #37
0
 def upload_tasks(self, filenames, log=None, **kwargs):
     """Upload all passages matching the given glob patterns as tasks (generator).

     :param filenames: iterable of glob patterns; each must match at least one file
     :param log: optional path of a log file to pass through to upload_task
     :raises IOError: if a pattern matches no files
     :raises ValueError: on HTTP errors, carrying the server's detail message
     """
     del kwargs  # accepted for interface compatibility; intentionally unused
     log_h = open(log, "w", encoding="utf-8") if log else None
     try:
         for pattern in filenames:
             matched = sorted(glob(pattern))  # don't shadow the `filenames` parameter
             if not matched:
                 raise IOError("Not found: " + pattern)
             for passage in get_passages_with_progress_bar(
                     matched, desc="Uploading"):
                 logging.debug("Uploading passage %s" % passage.ID)
                 task = self.upload_task(passage, log=log_h)
                 logging.debug("Submitted task %d" % task["id"])
                 yield task
     except HTTPError as e:
         # Surface the server-provided error detail when the response is JSON
         try:
             raise ValueError(e.response.json()["detail"]) from e
         except JSONDecodeError:
             raise ValueError(e.response.text) from e
     finally:
         # BUG FIX: the log handle was previously closed only on HTTPError
         # (the finally was attached to the inner re-raise try); close it on
         # every exit path, including normal exhaustion and generator close.
         if log_h is not None:
             log_h.close()
Пример #38
0
 def upload_tasks(self,
                  filenames,
                  log=None,
                  submit=True,
                  existing_ids=None,
                  **kwargs):
     """Upload all passages matching the given glob patterns as tasks (generator).

     :param filenames: iterable of glob patterns; each must match at least one file
     :param log: optional path of a log file to pass through to upload_task
     :param submit: whether to submit each uploaded task
     :param existing_ids: optional path of a whitespace-separated mapping file,
         each line "old_passage_id passage_id tok_id ann_id", used to reuse ids
     :raises IOError: if a pattern matches no files
     :raises ValueError: on HTTP errors, carrying the server's detail message
     """
     del kwargs  # accepted for interface compatibility; intentionally unused
     log_h = open(log, "w", encoding="utf-8") if log else None
     if existing_ids:
         with open(existing_ids, "r", encoding="utf-8") as ids_h:
             ids = {
                 old_passage_id: (passage_id, tok_id, ann_id)
                 for (old_passage_id, passage_id, tok_id,
                      ann_id) in map(str.split, ids_h)
             }
     else:
         ids = None
     try:
         for pattern in filenames:
             matched = sorted(glob(pattern))  # don't shadow the `filenames` parameter
             if not matched:
                 raise IOError("Not found: " + pattern)
             for passage in get_passages_with_progress_bar(
                     matched, desc="Uploading"):
                 logging.debug("Uploading passage %s" % passage.ID)
                 task = self.upload_task(passage,
                                         log=log_h,
                                         submit=submit,
                                         ids=ids)
                 logging.debug("Submitted task %d" % task["id"])
                 yield task
     except HTTPError as e:
         # Surface the server-provided error detail when the response is JSON
         try:
             raise ValueError(e.response.json()["detail"]) from e
         except JSONDecodeError:
             raise ValueError(e.response.text) from e
     finally:
         # BUG FIX: the log handle was previously closed only on HTTPError
         # (the finally was attached to the inner re-raise try); close it on
         # every exit path, including normal exhaustion and generator close.
         if log_h is not None:
             log_h.close()
Пример #39
0
def main(args):
    """Split every input passage into paragraphs and write one file apiece."""
    os.makedirs(args.outdir, exist_ok=True)
    extension = ".pickle" if args.binary else ".xml"
    index = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        paragraph_ids = map(str, count(index)) if args.enumerate else None
        for paragraph in split2paragraphs(passage,
                                          remarks=args.remarks,
                                          lang=args.lang,
                                          ids=paragraph_ids):
            index += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + paragraph.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." %
                          outfile,
                          file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
Пример #40
0
def main(args):
    """Annotate all input passages in place and write each one out.

    Verifies that every passage was actually annotated before writing it.
    """
    annotated = annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"),
                             replace=True, as_array=args.as_array, verbose=args.verbose)
    for passage in annotated:
        # Validate explicitly rather than with `assert`, which is stripped
        # under `python -O` and would silently skip this check.
        if not is_annotated(passage, args.as_array):
            raise ValueError("Passage %s is not annotated" % passage.ID)
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
Пример #41
-1
def main(args):
    """Interactively print the path from the passage root to a typed word.

    For every input passage: converts it to CoNLL, prints its sentences,
    then loops forever prompting for a word, finds the first node whose text
    matches it (case-insensitive, whole word), and prints the path to it.

    NOTE(review): this loop never terminates except via EOF/KeyboardInterrupt
    at input() — preserved from the original behavior.
    """
    for passage in get_passages_with_progress_bar(args.passages):
        xmltoconll(passage)
        for sentence_number, sentence in enumerate(split2sentences(passage)):
            print('sentence %d\n\n%s\n' % (sentence_number, convert.to_text(sentence)))

        while 1:
            word = input('\nType the word below\n\n')
            # BUG FIX: `path` was previously only assigned inside the match
            # branch, so an unmatched word raised NameError on the first
            # query (and printed a stale path on later ones).
            path = []
            for node_id in passage.nodes:
                node = passage.nodes[node_id]
                # NOTE(review): assumes every node has a .text attribute —
                # confirm non-terminal nodes do not raise AttributeError here.
                if re.match(rf'\b{word}\b', node.text, re.IGNORECASE):
                    path = find_path(passage.nodes[node.ID], path)
                    break
            print(' '.join(path))
Пример #42
-1
def main(args):
    """Print each sentence of every passage as an ASCII tree of labeled edges.

    For every sentence, walks the children of the root node ("1.1") and
    renders each root-to-leaf path as a chain of `|-->(label)-->node_id`
    segments, aligning sibling branches under their parent using the
    recorded column widths in `tab_len`.  Edge labels are expanded via the
    `descr` mapping when available; remote edges are marked `Remote(...)`.
    """
    for passage in get_passages_with_progress_bar(args.passages):
        t = split2sentences(passage)
        sen_no = 0
        for sen in t:
            #print('sentence %d\n\n%s\n%s' %(i,convert.to_text(sen), convert.to_sequence(sen)))
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))

            root = sen.nodes['1.1']
            first = 1  # first branch is prefixed by the root ID itself
            # tab_len[depth] = printed-line length up to the node at `depth`;
            # used to indent subsequent sibling branches to the same column.
            tab_len = {}
            tab_len[0] = len('1.1')
            for i in root.children:
                print('\n')
                path = []
                level = 1
                # Each path entry is (edge_tag, node_ID, depth, is_remote);
                # find_children extends the path and (per the loop below)
                # inserts the sentinel string 'End' at the end of each
                # root-to-leaf branch.
                path.append((i.ftag, i.ID, level, False))
                path = find_children(i, path, level)
                end = 0  # set after an 'End' sentinel: next node starts a new line
                if (first):
                    pstr = root.ID
                    first = 0
                else:
                    # Indent subsequent top-level branches past the root ID column
                    for k in range(0, tab_len[0]):
                        pstr = pstr + ' '
                for j in path:
                    if (j == 'End'):
                        # Branch complete: flush the assembled line
                        print(pstr)
                        pstr = ''
                        end = 1
                        continue
                    rel = j[0]
                    nd = j[1]
                    tab = int(j[2])
                    remote = j[3]
                    if (end):
                        # Starting a fresh line after a flushed branch: pad up
                        # to the parent's column, dropping '.' guide marks at
                        # each recorded ancestor column.
                        q_mark = 0
                        for k in range(0, tab_len[tab - 1]):
                            if (k == tab_len[q_mark]):
                                pstr = pstr + '.'
                                q_mark += 1
                            else:
                                pstr = pstr + ' '
                            end = 0
                    # Expand the edge label with its description when known
                    if (rel in descr):
                        rel_desc = rel + ':' + descr[rel]
                    else:
                        rel_desc = rel
                    if (remote):
                        pstr = pstr + '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr = pstr + '|-->(' + rel_desc + ')-->' + nd
                    # Remember how wide the line is at this depth for alignment
                    tab_len[tab] = len(pstr)

            print('-----------------------------------\n')
            sen_no += 1