Example #1
def convert(data_home):
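    # Resplit the original PreCo release, convert each of train/dev/test to
    # the processed format, and also write a singleton-free copy of every
    # split under a parallel preco_mult directory.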
    preco_directory = os.path.join(data_home, "original", "PreCo_1.0")
    resplit_directory = os.path.join(data_home, "processed", PRECO, "resplit")
    convert_lib.create_processed_data_dir(resplit_directory)
    output_directory = os.path.join(data_home, "processed", PRECO)

    resplit(preco_directory, resplit_directory)

    convert_lib.create_processed_data_dir(output_directory)
    preco_datasets = {}
    for split in [
            convert_lib.DatasetSplit.train, convert_lib.DatasetSplit.dev,
            convert_lib.DatasetSplit.test
    ]:
        input_filename = os.path.join(
            resplit_directory, split + "." + convert_lib.FormatName.jsonl)
        converted_dataset = create_dataset(input_filename)
        convert_lib.write_converted(converted_dataset,
                                    output_directory + "/" + split)
        preco_datasets[split] = converted_dataset

    mult_directory = output_directory.replace(PRECO, "preco_mult")
    convert_lib.create_processed_data_dir(mult_directory)
    for split, dataset in preco_datasets.items():
        dataset.remove_singletons()
        convert_lib.write_converted(dataset, mult_directory + "/" + split)
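Each of these convert() entry points only needs a data_home root. A minimal driver for the PreCo conversion above, assuming <data_home>/original/PreCo_1.0 is already in place, might look like this:

if __name__ == "__main__":
    import sys

    # data_home is the first CLI argument; processed output lands under
    # <data_home>/processed/ via convert_lib.create_processed_data_dir.
    convert(sys.argv[1])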
Example #2
def convert(data_home):
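    # Convert the WikiCoref evaluation data (OntoNotes-scheme keys). WikiCoref
    # only ships an evaluation set, so everything goes into a single test dir.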
    output_directory = os.path.join(data_home, "processed", "wikicoref",
                                    "test")
    convert_lib.create_processed_data_dir(output_directory)
    test_set = os.path.join(data_home, "original", "WikiCoref", "Evaluation",
                            "key-OntoNotesScheme")
    converted_dataset = create_dataset(test_set)
    convert_lib.write_converted(converted_dataset, output_directory)
Example #3
def convert(data_home):
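    # Convert the flattened CoNLL-2012 (OntoNotes) train and dev splits,
    # remapping fields with ONTONOTES_FIELD_MAP.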
    ontonotes_directory = os.path.join(data_home, "original", "CoNLL12/flat/")
    output_directory = os.path.join(data_home, "processed", CONLL12)
    convert_lib.create_processed_data_dir(output_directory)
    ontonotes_datasets = {}
    for split in [
            convert_lib.DatasetSplit.train, convert_lib.DatasetSplit.dev
    ]:
        input_filename = ''.join(
            [ontonotes_directory, split, ".", convert_lib.FormatName.txt])
        converted_dataset = create_dataset(input_filename, ONTONOTES_FIELD_MAP)
        convert_lib.write_converted(converted_dataset,
                                    output_directory + "/" + split)
Example #4
def convert(data_home):
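    # Same as the WikiCoref conversion above, but additionally writes a copy
    # with singleton clusters removed under wikicoref_mult.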
    output_directory = os.path.join(data_home, "processed", "wikicoref",
                                    "test")
    convert_lib.create_processed_data_dir(output_directory)
    test_set = os.path.join(data_home, "original", "WikiCoref", "Evaluation",
                            "key-OntoNotesScheme")
    converted_dataset = create_dataset(test_set)
    convert_lib.write_converted(converted_dataset, output_directory)

    mult_directory = output_directory.replace(
        convert_lib.DatasetName.wikicoref, "wikicoref_mult")
    convert_lib.create_processed_data_dir(mult_directory)
    # Unlike PreCo, there is only a single (test) dataset here, so no
    # per-split loop is needed before dropping singletons.
    converted_dataset.remove_singletons()
    convert_lib.write_converted(converted_dataset, mult_directory)
Example #5
def convert_format(data_home):
    """Convert preco json format into spanbert json format."""
    input_directory = os.path.join(data_home, "original", "preco")
    output_directory = os.path.join(data_home, "processed", "preco/all_info")
    convert_lib.create_dir(output_directory)
    for split in [
            convert_lib.DatasetSplit.train, convert_lib.DatasetSplit.dev,
            convert_lib.DatasetSplit.test
    ]:
        input_filename = os.path.join(input_directory, split + ".jsonl")
        converted_dataset = create_dataset(input_filename)
        for drop_singletons in [True, False]:
            converted_dataset.dump_to_conll(
                input_filename.replace(".jsonl", ".conll"), drop_singletons)
        convert_lib.write_converted(converted_dataset,
                                    output_directory + "/" + split)
Example #6
def convert(data_home):
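    # Convert GAP's development/validation/test tsv files, writing each one
    # out under its corresponding DatasetSplit name.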
    gap_directory = os.path.join(data_home, "original", "GAP",
                                 "gap-coreference")
    output_directory = os.path.join(data_home, "processed", "gap")
    convert_lib.create_processed_data_dir(output_directory)

    gap_datasets = {}

    for split, split_name in zip([
            convert_lib.DatasetSplit.dev, convert_lib.DatasetSplit.valid,
            convert_lib.DatasetSplit.test
    ], ["development", "validation", "test"]):
        input_filename = os.path.join(gap_directory,
                                      "gap-" + split_name + ".tsv")
        print(input_filename)
        converted_dataset = create_dataset(input_filename)
        convert_lib.write_converted(converted_dataset,
                                    output_directory + "/" + split)