Example #1
File: USim.py Project: huji-nlp/USim
def parsed_sentence2xml(sentence, parse_dir, sent_id=None, normalize_sentence=normalize_sentence):
    if sent_id is None:
        location = get_parsed_subdir(sentence, parse_dir)
        filename = parse_location(location, "", get_sentence_id(
            sentence, location, False, normalize_sentence))
        # print("reading parse from ", filename)
        # with open(filename) as fl:
        #     print("sentence:", sentence)
        #     print("xml first lines:", fl.readlines()[:30])
        return file2passage(filename)
    else:
        return file2passage(parse_location(parse_dir, sentence, sent_id))
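The examples on this page all revolve around ucca.ioutil.file2passage, which loads a single passage from an XML or pickle file. As a point of reference, a minimal stand-alone sketch of that call (the path is hypothetical):

from ucca.ioutil import file2passage

# Hypothetical path; the same call also reads .pickle files such as those written by passage2file below.
passage = file2passage("passages/504.xml")
print(passage.ID)  # the loaded ucca.core.Passage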
Example #2
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+",
                           help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true",
                           help="remove multiple parents to get a tree")
    args = argparser.parse_args()

    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            ref = file2passage(filename)
            try:
                guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
                scores.append(evaluate(guessed, ref, fscore=True, verbose=False,
                                       units=False, errors=False))
            except Exception as e:
                raise ValueError("Error evaluating conversion of %s" % filename, e)
    if len(scores) > 1:
        print("Aggregated scores:")
    Scores.aggregate(scores).print()

    sys.exit(0)
Example #3
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    if args.join:
        out_file = os.path.join(args.outdir, args.join)
        with open(out_file, "w", encoding="utf-8") as f:
            for passage in get_passages_with_progress_bar(sorted(
                    args.filenames, key=numeric),
                                                          desc="Converting"):
                write_text(passage,
                           f,
                           sentences=args.sentences,
                           lang=args.lang,
                           prepend_id=args.prepend_id)
        print("Wrote '%s'." % out_file)
    else:  # one file per passage
        for pattern in args.filenames:
            for filename in tqdm(glob(pattern) or [pattern],
                                 desc="Converting",
                                 unit=" passages"):
                passage = file2passage(filename)
                basename = os.path.splitext(os.path.basename(filename))[0]
                with open(os.path.join(args.outdir, basename + ".txt"),
                          "w",
                          encoding="utf-8") as f:
                    write_text(passage,
                               f,
                               sentences=args.sentences,
                               lang=args.lang,
                               prepend_id=args.prepend_id)
Example #4
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="file names to analyze")
    argparser.add_argument('-o', '--outfile', default="data/counts_",
                        help="output file prefix for histogram")
    argparser.add_argument('-p', '--plot', default="data/plot_",
                        help="output file prefix for plot image file")
    args = argparser.parse_args()

    histograms = defaultdict(Counter)
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            passage = file2passage(filename)
            for node in passage.layer("1").all:
                if node.ID != "1.1":  # Exclude the root node
                    histograms["parents"][clip(node.incoming, 3)] += 1
                    histograms["children"][clip(node.outgoing, 7)] += 1

    for label, counter in histograms.items():
        handle = open(args.outfile + label + ".txt", 'w') if args.outfile else sys.stdout
        handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
        if handle is not sys.stdout:
            handle.close()
        try:
            plot_histogram(counter, label, plot=args.plot)
            plot_pie(counter, label, plot=args.plot)
        except:
            pass

    sys.exit(0)
Example #5
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames',
                           nargs='+',
                           help="passage file names to convert")
    argparser.add_argument('-o',
                           '--outdir',
                           default='.',
                           help="output directory")
    argparser.add_argument('-p',
                           '--prefix',
                           default='',
                           help="output filename prefix")
    argparser.add_argument('-r',
                           '--remarks',
                           action='store_true',
                           help="annotate original IDs")
    argparser.add_argument("-b",
                           "--binary",
                           action="store_true",
                           help="write in pickle binary format (.pickle)")
    args = argparser.parse_args()

    for filename in args.filenames:
        passage = file2passage(filename)
        sentences = ucca.convert.split2sentences(passage, remarks=args.remarks)
        for i, sentence in enumerate(sentences):
            outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID,
                                    "pickle" if args.binary else "xml")
            sys.stderr.write("Writing passage file for sentence '%s'...\n" %
                             outfile)
            passage2file(sentence, outfile, args.binary)

    sys.exit(0)
Example #6
File: convert.py Project: StefPac/tupa
def main(args):
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            no_ext, ext = os.path.splitext(filename)
            if ext in UCCA_EXT:  # UCCA input
                write_passage(ioutil.file2passage(filename), args)
            else:
                basename = os.path.basename(no_ext)
                try:
                    passage_id = re.search(r"\d+", basename).group(0)
                except AttributeError:
                    passage_id = basename
                converter = CONVERTERS.get(args.input_format
                                           or ext.lstrip("."))
                if converter is None:
                    raise IOError(
                        "Unknown extension '%s'. Specify format using -f" %
                        ext)
                converter = converter[0]
                with open(filename, encoding="utf-8") as f:
                    for passage in converter(f,
                                             passage_id,
                                             split=args.split,
                                             mark_aux=args.mark_aux):
                        write_passage(passage, args)
Example #7
def read_files(files, verbose=0, force_basename=False, **kw):
    try:
        files = sorted(files,
                       key=lambda x: tuple(map(int, re.findall(r"\d+", x))) or
                       (x, ))
    except TypeError as e:
        print("Cannot sort filenames: %s" % e, file=sys.stderr)
    for filename in files:
        basename, converted_format = passage_format(filename)
        if converted_format == "txt":
            converted_format = kw["format"]
        in_converter, out_converter = CONVERTERS.get(converted_format,
                                                     CONVERTERS[kw["format"]])
        kwargs = dict(converted_format=converted_format,
                      in_converter=in_converter,
                      out_converter=out_converter)
        if in_converter:
            with open(filename, encoding="utf-8") as f:
                for converted, passage, passage_id in in_converter(
                        f, passage_id=basename, return_original=True, **kw):
                    if verbose:
                        with ioutil.external_write_mode():
                            print("Converting %s from %s" %
                                  (filename, converted_format))
                    yield ConvertedPassage(
                        converted, passage,
                        basename if force_basename else passage_id, **kwargs)
        else:
            passage_id = basename if force_basename else None
            yield ConvertedPassage(ioutil.file2passage(filename),
                                   passage_id=passage_id,
                                   **kwargs)
Example #8
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process")
    argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data")
    argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)")
    args = argparser.parse_args()

    out = args.direction == "out"
    if not os.path.isdir(args.directory):
        raise Exception("Not a directory: " + args.directory)
    roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items()
                if isinstance(tag, str) and not name.startswith('__'))
    for filename in os.listdir(args.directory):
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(args.directory + os.path.sep + filename)
        for node in passage.layer(layer1.LAYER_ID).all:
            counts = Counter(edge.tag for edge in (node if out else node.incoming))
            roles.difference_update(tag for tag, count in counts.items() if count > 1)

    lines = "\n".join(sorted(roles))
    print(lines)
    if args.outfile:
        with open(args.outfile, "w", encoding="utf-8") as f:
            print(lines, file=f)

    sys.exit(0)
Example #9
def read_words_and_punctuations(args):
    words = set()
    punctuations = set()
    passages = glob.glob(args.directory + "/*.xml")
    words_file_name = os.path.join(args.directory, "words.txt")
    punctuations_file_name = os.path.join(args.directory, "punctuations.txt")
    if passages:
        for filename in passages:
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            w, p = [[
                terminal.attrib.get("text") for terminal in terminals
                if terminal.tag == tag
            ] for tag in (layer0.NodeTags.Word, layer0.NodeTags.Punct)]
            words.update(w)
            punctuations.update(p)
        words = sorted(words)
        punctuations = sorted(punctuations)
        with open(words_file_name, "w") as words_file:
            words_file.writelines(word + "\n" for word in words)
        with open(punctuations_file_name, "w") as punctuations_file:
            punctuations_file.writelines(punctuation + "\n"
                                         for punctuation in punctuations)
    else:
        with open(words_file_name) as words_file:
            words = [word.rstrip() for word in words_file.readlines()]
        with open(punctuations_file_name) as punctuations_file:
            punctuations = [
                punctuation.rstrip()
                for punctuation in punctuations_file.readlines()
            ]
    return punctuations, words
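A hedged usage sketch for read_words_and_punctuations above; the directory name is hypothetical, args only needs a directory attribute, and the module-level imports the function relies on (glob, os, sys, ucca.layer0, file2passage) are assumed to be in place:

from argparse import Namespace

# If passages/*.xml exist, they are scanned and words.txt / punctuations.txt are written into the directory;
# if the directory contains no *.xml passages, the two text files are read back instead.
punctuations, words = read_words_and_punctuations(Namespace(directory="passages"))
print(len(words), "word types,", len(punctuations), "punctuation types")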
Example #10
def main(args):
    os.makedirs(args.out_dir, exist_ok=True)
    for filename in tqdm(list(iter_files(args.filenames)),
                         unit="file",
                         desc="Converting"):
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            write_passage(ioutil.file2passage(filename), args)
        else:
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+(\.\d+)*", basename).group(0)
            except AttributeError:
                passage_id = basename
            converter = CONVERTERS.get(args.input_format or ext.lstrip("."))
            if converter is None:
                raise IOError(
                    "Unknown extension '%s'. Specify format using -f" % ext)
            converter = converter[0]
            with open(filename, encoding="utf-8") as f:
                for passage in converter(f,
                                         args.prefix + passage_id,
                                         split=args.split,
                                         mark_aux=args.mark_aux):
                    write_passage(passage, args)
Example #11
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to analyze")
    argparser.add_argument("-o", "--outfile", default="data/counts_",
                           help="output file prefix for histogram")
    argparser.add_argument("-p", "--plot", default="data/plot_",
                           help="output file prefix for plot image file")
    args = argparser.parse_args()

    histograms = defaultdict(Counter)
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            passage = file2passage(filename)
            for node in passage.layer(layer1.LAYER_ID).all:
                if node.ID != "1.1":  # Exclude the root node
                    histograms["parents"][clip(node.incoming, 3)] += 1
                    histograms["children"][clip(node.outgoing, 7)] += 1

    for label, counter in histograms.items():
        handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout
        handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
        if handle is not sys.stdout:
            handle.close()
        # noinspection PyBroadException
        try:
            plot_histogram(counter, label, plot=args.plot)
            plot_pie(counter, label, plot=args.plot)
        except:
            pass

    sys.exit(0)
Example #12
def read_words_and_punctuations(args):
    words = set()
    punctuations = set()
    passages = glob.glob(args.directory + "/*.xml")
    words_file_name = os.path.join(args.directory, "words.txt")
    punctuations_file_name = os.path.join(args.directory, "punctuations.txt")
    if passages:
        for filename in passages:
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            w, p = [[terminal.attrib.get("text") for terminal in terminals if terminal.tag == tag]
                    for tag in (layer0.NodeTags.Word, layer0.NodeTags.Punct)]
            words.update(w)
            punctuations.update(p)
        words = sorted(words)
        punctuations = sorted(punctuations)
        with open(words_file_name, "w") as words_file:
            words_file.writelines(word + "\n" for word in words)
        with open(punctuations_file_name, "w") as punctuations_file:
            punctuations_file.writelines(punctuation + "\n" for punctuation in punctuations)
    else:
        with open(words_file_name) as words_file:
            words = [word.rstrip() for word in words_file.readlines()]
        with open(punctuations_file_name) as punctuations_file:
            punctuations = [punctuation.rstrip() for punctuation in punctuations_file.readlines()]
    return punctuations, words
Example #13
def iter_passages(patterns,
                  desc=None,
                  input_format=None,
                  prefix="",
                  split=False,
                  mark_aux=False,
                  annotate=False):
    t = tqdm(list(iter_files(patterns)), unit="file", desc=desc)
    for filename in t:
        t.set_postfix(file=filename)
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            yield ioutil.file2passage(filename)
        else:
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+(\.\d+)*", basename).group(0)
            except AttributeError:
                passage_id = basename
            converter, _ = CONVERTERS.get(input_format or ext.lstrip("."),
                                          (from_text, ))
            with open(filename, encoding="utf-8") as f:
                yield from converter(f,
                                     prefix + passage_id,
                                     split=split,
                                     mark_aux=mark_aux,
                                     annotate=annotate)
Example #14
File: unique_roles.py Project: borgr/ucca
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process")
    argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data")
    argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)")
    args = argparser.parse_args()

    out = args.direction == "out"
    if not os.path.isdir(args.directory):
        raise Exception("Not a directory: " + args.directory)
    roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items()
                if isinstance(tag, str) and not name.startswith('__'))
    for filename in os.listdir(args.directory):
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(args.directory + os.path.sep + filename)
        for node in passage.layer(layer1.LAYER_ID).all:
            counts = Counter(edge.tag for edge in (node if out else node.incoming))
            roles.difference_update(tag for tag, count in counts.items() if count > 1)

    lines = "\n".join(sorted(roles))
    print(lines)
    if args.outfile:
        with open(args.outfile, "w") as f:
            print(lines, file=f)

    sys.exit(0)
Example #15
def main(args):
    for filename in args.filenames:
        print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
Example #16
def main(args):
    for filename in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".xml"
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile)
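Examples #15, #16, #20 and #23 are all variations on the same round trip: file2passage to load a passage, passage2file to write it back, with the binary flag switching the output from XML to pickle. A minimal sketch with hypothetical file names:

from ucca.ioutil import file2passage, passage2file

passage = file2passage("passages/504.xml")        # hypothetical input file
passage2file(passage, "504.pickle", binary=True)  # pickle output
passage2file(passage, "504_copy.xml")             # XML output (the default)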
Example #17
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="file names to convert and evaluate")
    argparser.add_argument("-f",
                           "--format",
                           required=True,
                           choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-T",
                           "--tree",
                           action="store_true",
                           help="remove multiple parents to get a tree")
    argparser.add_argument(
        "-s",
        "--strict",
        action="store_true",
        help="stop immediately if failed to convert or evaluate a file")
    argparser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="print evaluation results for each file separately")
    args = argparser.parse_args()

    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            sys.stdout.write("\rConverting %s" % filename)
            sys.stdout.flush()
            ref = file2passage(filename)
            try:
                guessed = next(
                    converter2(converter1(ref, tree=args.tree), ref.ID))
                scores.append(evaluate(guessed, ref, verbose=args.verbose))
            except Exception as e:
                if args.strict:
                    raise ValueError("Error evaluating conversion of %s" %
                                     filename) from e
                else:
                    print("Error evaluating conversion of %s: %s" %
                          (filename, e),
                          file=sys.stderr)
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
    Scores.aggregate(scores).print()

    sys.exit(0)
Example #18
def convert_passage(filename, converter, args):
    """Opens a passage file and returns a string after conversion
    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    passages = convert.split2sentences(passage) if args.sentences else [passage]
    output = "\n".join(line for p in passages for line in
                       converter(p, args.test, args.tree, args.markaux))
    return output, passage.ID
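A hedged usage sketch for convert_passage above: the file name is hypothetical, args only needs the four attributes the function reads, and "conll" is assumed to be one of the keys of convert.TO_FORMAT (any key listed in convert.CONVERTERS would do):

from argparse import Namespace
from ucca import convert

# Hypothetical invocation; the converter is taken from convert.TO_FORMAT exactly as in examples #2 and #17.
args = Namespace(sentences=False, test=False, tree=True, markaux=False)
output, passage_id = convert_passage("passages/504.xml", convert.TO_FORMAT["conll"], args)
print(passage_id)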
Example #19
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="passage file names to join")
    argparser.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory")
    argparser.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
    argparser.add_argument("-r",
                           "--remarks",
                           action="store_true",
                           help="annotate original IDs")
    argparser.add_argument("-b",
                           "--binary",
                           action="store_true",
                           help="write in pickle binary format (.pickle)")
    argparser.add_argument(
        "-j",
        "--join-by-prefix",
        action="store_true",
        help=
        "join each set of passages whose IDs share all but the last 3 characters"
    )
    args = argparser.parse_args()

    passages = [
        file2passage(filename) for pattern in args.filenames
        for filename in sorted(glob.glob(pattern))
    ]
    if args.join_by_prefix:
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    for passage_id, subset in sorted(subsets.items()):
        sys.stderr.write("Joining passages " +
                         ", ".join(passage.ID for passage in subset) + "\n")
        joined = ucca.convert.join_passages(subset,
                                            passage_id=passage_id,
                                            remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID,
                                "pickle" if args.binary else "xml")
        sys.stderr.write("Writing joined passage file '%s'...\n" % outfile)
        passage2file(joined, outfile, args.binary)

    sys.exit(0)
Example #20
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(args.filenames, desc="Converting", unit=" passages"):
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
Example #21
def convert_passage(filename, converter, args):
    """Opens a passage file and returns a string after conversion
    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    passages = convert.split2sentences(passage) if args.sentences else [
        passage
    ]
    output = "\n".join(
        line for p in passages
        for line in converter(p, args.test, args.tree, args.markaux))
    return output, passage.ID
Example #22
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    if args.join:
        out_file = os.path.join(args.outdir, args.join)
        with open(out_file, "w", encoding="utf-8") as f:
            for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting"):
                write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
        print("Wrote '%s'." % out_file)
    else:  # one file per passage
        for pattern in args.filenames:
            for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"):
                passage = file2passage(filename)
                basename = os.path.splitext(os.path.basename(filename))[0]
                with open(os.path.join(args.outdir, basename + ".txt"), "w", encoding="utf-8") as f:
                    write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
Example #23
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="XML file names to convert")
    argparser.add_argument('-o', '--outdir', default='.', help="output directory")
    args = argparser.parse_args()

    for filename in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile, binary=True)

    sys.exit(0)
Example #24
File: evaluate.py Project: zoharai/semstr
def read_files(files, default_format=None, verbose=0, force_basename=False):
    for filename in sorted(files, key=lambda x: tuple(map(int, re.findall(r"\d+", x))) or x):
        basename, converted_format = passage_format(filename)
        in_converter, out_converter = CONVERTERS.get(converted_format, CONVERTERS[default_format])
        kwargs = dict(converted_format=converted_format, in_converter=in_converter, out_converter=out_converter)
        if in_converter:
            with open(filename, encoding="utf-8") as f:
                for converted, passage, passage_id in in_converter(f, passage_id=basename, return_original=True):
                    if verbose:
                        with tqdm.external_write_mode():
                            print("Converting %s from %s" % (filename, converted_format))
                    yield ConvertedPassage(converted, passage, basename if force_basename else passage_id, **kwargs)
        else:
            passage_id = basename if force_basename else None
            yield ConvertedPassage(ioutil.file2passage(filename), passage_id=passage_id, **kwargs)
Example #25
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('directory', help="directory containing XML files to process")
    args = argparser.parse_args()

    passages = glob.glob(args.directory + "/*.xml")
    for filename in passages:
        sys.stderr.write("Fixing passage '%s'...\n" % filename)
        passage = file2passage(filename)
        terminals = passage.layer(layer0.LAYER_ID).all
        for terminal in terminals:
            terminal.tag = layer0.NodeTags.Punct if is_punctuation(
                terminal.attrib.get("text")) else layer0.NodeTags.Word
        passage2file(passage, filename, indent=False)

    sys.exit(0)
Example #26
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('directory',
                           help="directory containing XML files to process")
    args = argparser.parse_args()

    passages = glob.glob(args.directory + "/*.xml")
    for filename in passages:
        sys.stderr.write("Fixing passage '%s'...\n" % filename)
        passage = file2passage(filename)
        terminals = passage.layer(layer0.LAYER_ID).all
        for terminal in terminals:
            terminal.tag = layer0.NodeTags.Punct if is_punctuation(
                terminal.attrib.get("text")) else layer0.NodeTags.Word
        passage2file(passage, filename, indent=False)

    sys.exit(0)
Example #27
    def parse_sentences(self, sentences):

        parsed_passages = []

        # tempfile.mkdtemp() leaves the directory in place after use;
        # consider replacing it with 'with tempfile.TemporaryDirectory() as dir_name',
        # which will create the directory and then delete it, with all its contents,
        # at the end of the 'with' block (see the sketch after this example)

        dir_name = tempfile.mkdtemp()
        print(
            "using directory {} for input to and output from 'python -m tupa command'"
            .format(dir_name),
            file=sys.stderr)

        for count, sentence in enumerate(sentences):
            input_path = '{}/file_{}'.format(dir_name, count)
            with open(input_path, 'w') as input:
                input.write(sentence)

        command = 'cd {}; python -m tupa {} -m {} -p parsed_ -o {}'.format(
            self._tupa_utility_path, dir_name, self._model_prefix, dir_name)
        result = subprocess.run([command],
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)

        if result.returncode != 0:
            print("command '{}' failed".format(command), file=sys.stderr)
            print("its output was:\n{}".format(result.stdout),
                  file=sys.stderr)

            # return empty list of parsed outputs
            return []

        for count, _ in enumerate(sentences):
            output_file = '{}/parsed_file_{}_0.xml'.format(dir_name, count)
            internal_parsed_passage = file2passage(output_file)
            parsed_passage = TupaParser2.__get_ucca_parsed_passage_from_passage(
                internal_parsed_passage)

            parsed_passages.append(parsed_passage)

        return parsed_passages
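As the comment inside parse_sentences suggests, tempfile.TemporaryDirectory can take care of the cleanup that mkdtemp leaves to the caller; a small stand-alone sketch of that pattern:

import os
import tempfile

# The directory and everything written into it are deleted when the with-block exits,
# so any parsed_file_*_0.xml outputs must be read (e.g. with file2passage) before leaving the block.
with tempfile.TemporaryDirectory() as dir_name:
    input_path = os.path.join(dir_name, "file_0")
    with open(input_path, "w") as f:
        f.write("An example sentence.")
    # ... run the tupa command here and collect its outputs ...
print("temporary directory removed:", not os.path.exists(dir_name))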
Example #28
def iter_passages(patterns, desc=None, input_format=None, prefix="", label_map=None, output_format=None, **kwargs):
    t = tqdm(list(iter_files(patterns)), unit="file", desc=desc)
    for filename in t:
        t.set_postfix(file=os.path.basename(filename))
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            yield ioutil.file2passage(filename)
        else:
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+(\.\d+)*", basename).group(0)
            except AttributeError:
                passage_id = basename
            converter = FROM_FORMAT.get(input_format or ext.lstrip("."), (from_text,))
            with open(filename, encoding="utf-8") as f:
                yield from converter(f, prefix + passage_id, format=output_format if label_map else None, **kwargs)
Example #29
File: statistics.py Project: huji-nlp/ucca
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()

    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")

    sys.exit(0)
Example #30
File: statistics.py Project: viksit/ucca
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()

    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")

    sys.exit(0)
Example #31
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="passage file names to convert")
    argparser.add_argument('-o', '--outdir', default='.', help="output directory")
    argparser.add_argument('-p', '--prefix', default='', help="output filename prefix")
    argparser.add_argument('-r', '--remarks', action='store_true', help="annotate original IDs")
    argparser.add_argument("-b", "--binary", action="store_true",
                           help="write in pickle binary format (.pickle)")
    args = argparser.parse_args()

    for filename in args.filenames:
        passage = file2passage(filename)
        sentences = ucca.convert.split2sentences(passage, remarks=args.remarks)
        for i, sentence in enumerate(sentences):
            outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID,
                                    "pickle" if args.binary else "xml")
            sys.stderr.write("Writing passage file for sentence '%s'...\n" % outfile)
            passage2file(sentence, outfile, args.binary)

    sys.exit(0)
Example #32
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+",
                           help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true",
                           help="remove multiple parents to get a tree")
    argparser.add_argument("-s", "--strict", action="store_true",
                           help="stop immediately if failed to convert or evaluate a file")
    argparser.add_argument("-v", "--verbose", action="store_true",
                           help="print evaluation results for each file separately")
    args = argparser.parse_args()

    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            sys.stdout.write("\rConverting %s" % filename)
            sys.stdout.flush()
            ref = file2passage(filename)
            try:
                guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
                scores.append(evaluate(guessed, ref, verbose=args.verbose))
            except Exception as e:
                if args.strict:
                    raise ValueError("Error evaluating conversion of %s" % filename) from e
                else:
                    print("Error evaluating conversion of %s: %s" % (filename, e), file=sys.stderr)
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
    Scores.aggregate(scores).print()

    sys.exit(0)
Example #33
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="file names to convert and evaluate")
    argparser.add_argument("-f",
                           "--format",
                           required=True,
                           choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-T",
                           "--tree",
                           action="store_true",
                           help="remove multiple parents to get a tree")
    args = argparser.parse_args()

    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            ref = file2passage(filename)
            guessed = next(converter2(converter1(ref), ref.ID))
            scores.append(
                evaluate(guessed,
                         ref,
                         fscore=True,
                         verbose=True,
                         units=False,
                         errors=False))
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()

    sys.exit(0)
Example #34
File: util.py Project: viksit/ucca
def read_passages(files):
    """
    :param files: iterable of files or Passage objects
    :return: generator of passages from all files given
    """
    for file in files:
        if isinstance(file, core.Passage):  # Not really a file, but a Passage
            passage = file
        elif os.path.exists(file):  # A file
            try:
                passage = ioutil.file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                base, ext = os.path.splitext(os.path.basename(file))
                converter = convert.FROM_FORMAT.get(ext.lstrip("."), convert.from_text)
                with open(file) as f:
                    yield from converter(f, passage_id=base, split=Config().split)
                continue
        else:
            raise IOError("File not found: %s" % file)
        if Config().split:
            yield from convert.split2segments(passage, is_sentences=Config().sentences)
        else:
            yield passage
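A hedged usage sketch for read_passages above; it assumes the surrounding module (with its core, ioutil, convert and Config imports) is importable, and the file names are hypothetical:

# Mixed inputs are fine: XML/pickle passage files, files in other formats handled via convert.FROM_FORMAT,
# or Passage objects that have already been constructed.
for passage in read_passages(["passages/504.xml", "passages/505.pickle"]):
    print(passage.ID)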
Example #35
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="passage file names to annotate")
    argparser.add_argument("-v",
                           "--verbose",
                           action="store_true",
                           help="print tagged text for each passage")
    args = argparser.parse_args()

    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            passage = file2passage(filename)
            annotate(passage, verbose=args.verbose, replace=True)
            sys.stderr.write("Writing '%s'...\n" % filename)
            passage2file(passage,
                         filename,
                         binary=not filename.endswith("xml"))

    sys.exit(0)
Example #36
from argparse import ArgumentParser

from ucca.evaluation import evaluate
from ucca.ioutil import file2passage


################
# MAIN         #
################

if __name__ == "__main__":
    argparser = ArgumentParser(description="Compare two UCCA passages.")
    argparser.add_argument("guessed", help="xml/pickle file name for the guessed annotation")
    argparser.add_argument("ref", help="xml/pickle file name for the reference annotation")
    argparser.add_argument("--units", "-u", dest="units", action="store_true",
                           help="the units the annotations have in common, and those each has separately")
    argparser.add_argument("--fscore", "-f", dest="fscore", action="store_true",
                           help="outputs the traditional P,R,F instead of the scene structure evaluation")
    argparser.add_argument("--errors", "-e", dest="errors", action="store_true",
                           help="prints the error distribution according to its frequency")
    args = argparser.parse_args()

    if not (args.units or args.fscore or args.errors):
        argparser.error("At least one of -u, -f or -e is required.")

    guessed, ref = [file2passage(x) for x in (args.guessed, args.ref)]

    if args.units or args.fscore or args.errors:
        evaluate(guessed, ref,
                 units=args.units, fscore=args.fscore, errors=args.errors, verbose=True)
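The same comparison can also be run programmatically; a minimal sketch with hypothetical file names, assuming evaluate returns a Scores object exposing the print() method used elsewhere on this page:

from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

guessed = file2passage("guessed/504.xml")    # hypothetical paths
ref = file2passage("reference/504.xml")
scores = evaluate(guessed, ref, fscore=True, verbose=False, units=False, errors=False)
scores.print()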
Example #37
def main():
    print(
        align.align("what has is by the meaning of the word is",
                    "what is the men for the wk is are be"))

    # read xml files
    print("reading db xmls")
    p = []
    for filename in filenames:
        with open(add_path(filename), "rb") as fl:
            p += pickle.load(fl)[0]
        print(
            "read ", filename, " it starts with ",
            tuple(term.text for term in textutil.extract_terminals(
                convert.from_site(p[-1]))[:6]))
    # convert xml to passages
    p = list(map(convert.from_site, p))

    print("reading passage xmls")
    # read passage files
    for filename in passage_filenames:
        print("reading " + filename)
        if os.path.isfile(add_path(os.path.splitext(filename)[0] + ".pkl")):
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"),
                      "rb") as fl:
                p.append(pickle.load(fl))
        else:
            p.append(file2passage(add_path(filename)))
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"),
                      "wb") as fl:
                pickle.dump(p[-1], fl)
                print("dumping",
                      add_path(os.path.splitext(filename)[0] + ".pkl"))

    all_filenames = filenames + passage_filenames
    print("read ", all_filenames)
    word2word = align.align_yields(p[0], p[1])
    assert align.reverse_mapping(word2word) == align.align_yields(
        p[1], p[0]), "align_yields asymmetrical"

    # create symmilarity matrix
    sources = []
    goals = []
    names = []
    i = 0
    while i < len(p):
        names.append(all_filenames[i])
        sources.append(p[i])
        i += 1
        goals.append(p[i])
        i += 1
    chunksize = 1
    if (len(goals) > 100):
        chunksize = int(len(goals) / POOL_SIZE / 10)
    print("multithreading with chunksize", chunksize)
    pool = Pool(POOL_SIZE)
    if r2s:
        results = pool.starmap(distances, zip(goals, sources, names),
                               chunksize)
    else:
        results = pool.starmap(distances, zip(sources, goals, names),
                               chunksize)
    print(results)
    pool.close()
    pool.join()
    sym_mat = []
    keys = []
    for row, key in results:
        keys.append(key)
        sym_mat.append(row)
    print("functions and matrix")
    print(funcs + keys)
    for item in sym_mat:
        print(item)
    print("overall token analysis")
    print(align.token_level_analysis(p))
    output_path = trial_name + "output.csv"
    with open(output_path, "w") as f:
        print("writing output to " + output_path)
        writer = csv.writer(f)
        writer.writerows(sym_mat)
    send_mail("*****@*****.**", "finished",
              os.path.abspath(output_path))
    return
Example #38
from argparse import ArgumentParser

from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

if __name__ == "__main__":
    argparser = ArgumentParser(description="Compare two UCCA passages.")
    argparser.add_argument("guessed",
                           help="xml/pickle file name for the guessed annotation")
    argparser.add_argument("ref",
                           help="xml/pickle file name for the reference annotation")
    argparser.add_argument(
        "--units",
        "-u",
        dest="units",
        action="store_true",
        help=
        "the units the annotations have in common, and those each has separately"
    )
    argparser.add_argument(
        "--fscore",
        "-f",
        dest="fscore",
        action="store_true",
        help=
        "outputs the traditional P,R,F instead of the scene structure evaluation"
    )
    argparser.add_argument(
        "--errors",
        "-e",
        dest="errors",
        action="store_true",
        help="prints the error distribution according to its frequency")
    args = argparser.parse_args()

    if not (args.units or args.fscore or args.errors):
        argparser.error("At least one of -u, -f or -e is required.")

    guessed, ref = [file2passage(x) for x in (args.guessed, args.ref)]

    if args.units or args.fscore or args.errors:
        evaluate(guessed,
                 ref,
                 units=args.units,
                 fscore=args.fscore,
                 errors=args.errors,
                 verbose=True)