Пример #1
0
def main(args):
    """Convert OntoNotes splits into per-task JSON files.

    For every requested (split, task) pair this reads
    ``<ontonotes>/data/<split>``, converts the records with
    ``process_task_split`` and writes them to
    ``<output_dir>/<task>/<split>.json``, then logs dataset statistics.

    Args:
        args: Command-line argument strings (without the program name),
            passed straight to ``ArgumentParser.parse_args``.

    Raises:
        SystemExit: If a required argument (``--ontonotes`` or ``--tasks``)
            is missing or malformed (raised by ``argparse``).
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    # Required: without it args.tasks would be None and the task loop below
    # would fail with an opaque TypeError instead of a usage message.
    parser.add_argument("--tasks",
                        type=str,
                        nargs="+",
                        required=True,
                        help="Tasks, one or more of {const, coref, ner, srl}.")
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help=
        "Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument("-o",
                        dest="output_dir",
                        type=str,
                        default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    # makedirs also creates missing parents and tolerates an existing
    # directory, avoiding the check-then-create race of isdir() + mkdir().
    os.makedirs(args.output_dir, exist_ok=True)

    import pandas as pd

    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        # The source location depends only on the split, not on the task.
        source_path = os.path.join(args.ontonotes, "data", split)
        for task in args.tasks:
            print('########### Reading ontonotes split from', source_path)
            # A fresh iterator is requested for every task.
            ontonotes_reader = ontonotes.dataset_iterator(
                file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            os.makedirs(task_dir, exist_ok=True)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader),
                                                   task, ontonotes_stats)

            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
Пример #2
0
def main(args):
    """Convert SPR1 JSON split files and write them to an output directory.

    Each input file is loaded with ``utils.load_json_data``, every record is
    converted with ``convert_record``, and the result is written under
    ``output_dir`` using the input file's base name. Dataset statistics are
    logged after each file.

    Args:
        args: Command-line argument strings (without the program name),
            passed straight to ``ArgumentParser.parse_args``.

    Raises:
        SystemExit: If a required argument (``-i`` or ``-o``) is missing or
            malformed (raised by ``argparse``).
    """
    parser = argparse.ArgumentParser()
    # Required: without it args.inputs would be None and the loop below would
    # fail with an opaque TypeError instead of a usage message.
    parser.add_argument('-i',
                        dest='inputs',
                        type=str,
                        nargs="+",
                        required=True,
                        help="Input files (JSON) for SPR1 splits.")
    parser.add_argument('-o',
                        dest='output_dir',
                        type=str,
                        required=True,
                        help="Output directory.")
    args = parser.parse_args(args)

    # makedirs also creates missing parents and tolerates an existing
    # directory, avoiding the check-then-create race of isdir() + mkdir().
    os.makedirs(args.output_dir, exist_ok=True)

    pd.options.display.float_format = '{:.2f}'.format
    for fname in args.inputs:
        log.info("Converting %s", fname)
        # Materialized up front so the progress bar receives a sized input.
        source_records = list(utils.load_json_data(fname))
        converted_records = (convert_record(r) for r in tqdm(source_records))
        stats = utils.EdgeProbingDatasetStats()
        converted_records = stats.passthrough(converted_records)
        target_fname = os.path.join(args.output_dir, os.path.basename(fname))
        utils.write_json_data(target_fname, converted_records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())
Пример #3
0
def convert_with_stats(source_records, target_fname, convert_fn):
    """Convert records, write them as JSON, and log dataset statistics.

    Every item of ``source_records`` (wrapped in a ``tqdm`` progress bar) is
    passed through ``convert_fn``. The converted stream is routed through
    ``utils.EdgeProbingDatasetStats.passthrough`` and handed to
    ``utils.write_json_data``; the formatted statistics are logged afterwards.

    Args:
        source_records: Iterable of input records.
        target_fname: Path of the JSON file to write.
        convert_fn: Callable applied to each input record.
    """
    progress = tqdm(source_records)
    # Lazy conversion: records are only converted as the writer pulls them.
    converted = map(convert_fn, progress)
    dataset_stats = utils.EdgeProbingDatasetStats()
    utils.write_json_data(target_fname, dataset_stats.passthrough(converted))
    log.info("Wrote examples to %s", target_fname)
    log.info(dataset_stats.format())
Пример #4
0
def main(args):
    """Convert UD ``.conllu`` files to JSON files in an output directory.

    Each input file is parsed with ``convert_ud_file`` and written to
    ``output_dir`` under the input's base name with ``.conllu`` replaced by
    ``.json``. Dataset statistics are logged after each file.

    Args:
        args: Command-line argument strings (without the program name),
            passed straight to ``ArgumentParser.parse_args``.

    Raises:
        SystemExit: If a required argument (``-i`` or ``-o``) is missing or
            malformed (raised by ``argparse``).
    """
    parser = argparse.ArgumentParser()
    # Required: without it args.input_files would be None and the loop below
    # would fail with an opaque TypeError instead of a usage message.
    parser.add_argument("-i",
                        dest="input_files",
                        type=str,
                        nargs="+",
                        required=True,
                        help="Input file(s), e.g. en_ewt-ud-*.conllu")
    parser.add_argument(
        "-o",
        dest="output_dir",
        type=str,
        required=True,
        help="Output directory, e.g. /path/to/edges/data/ud_ewt",
    )
    args = parser.parse_args(args)

    # makedirs also creates missing parents and tolerates an existing
    # directory, avoiding the check-then-create race of isdir() + mkdir().
    os.makedirs(args.output_dir, exist_ok=True)

    for filename in args.input_files:
        target_basename = os.path.basename(filename).replace(
            ".conllu", ".json")
        target_fname = os.path.join(args.output_dir, target_basename)
        stats = utils.EdgeProbingDatasetStats()
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(filename, encoding="utf-8") as fd:
            records = stats.passthrough(convert_ud_file(fd))
            # Written while the file is still open, so this also works if
            # convert_ud_file produces its records lazily from fd.
            utils.write_json_data(target_fname, records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())

    log.info("Done!")