def load_and_align_data():
    training_data = dataset_loader.read_data("training", "r2")
    dev_data = dataset_loader.read_data("dev", "r2")
    test_data = dataset_loader.read_data("test", "r2")

    data = training_data + dev_data + test_data

    # Drop duplicates, keeping the first occurrence of each AMR id,
    # then sort so the entries line up with the original graphs below.
    processed_amr_ids = set()
    filtered_data = []
    for entry in data:
        if entry.amr_id not in processed_amr_ids:
            filtered_data.append(entry)
            processed_amr_ids.add(entry.amr_id)
    data = sorted(filtered_data, key=lambda d: d.amr_id)

    sequences, _, dependencies, _, amr_ids, _, _ = feature_vector_generator.extract_data_components(data)

    training_data_orig = dataset_loader.read_original_graphs("training", "r2")
    dev_data_orig = dataset_loader.read_original_graphs("dev", "r2")
    test_data_orig = dataset_loader.read_original_graphs("test", "r2")

    data_orig = training_data_orig + dev_data_orig + test_data_orig

    # Keep only original graphs whose AMR id survived the dedup above,
    # again dropping duplicates and sorting by AMR id for alignment.
    amr_id_set = set(amr_ids)
    processed_orig_amr_ids = set()
    filtered_data_orig = []
    for entry in data_orig:
        if entry[0] in amr_id_set and entry[0] not in processed_orig_amr_ids:
            filtered_data_orig.append(entry)
            processed_orig_amr_ids.add(entry[0])
    data_orig = sorted(filtered_data_orig, key=lambda d: d[0])

    sentences = [d[1] for d in data_orig]
    amrs = [d[2] for d in data_orig]

    return sequences, dependencies, amrs, sentences
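
A minimal usage sketch (hypothetical): the assertion spells out the invariant the function exists to provide, namely that all four returned lists are index-aligned by AMR id, assuming every deduplicated AMR id also appears among the original graphs.

sequences, dependencies, amrs, sentences = load_and_align_data()
assert len(sequences) == len(dependencies) == len(amrs) == len(sentences)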
Example #2
from data_extraction import dataset_loader


def extract_amr_relations_from_dataset(file_path):
    test_data_action_sequences = [
        d.action_sequence for d in dataset_loader.read_data("test", cache=True)
    ]
    train_data_action_sequences = [
        d.action_sequence
        for d in dataset_loader.read_data("training", cache=True)
    ]
    dev_data_action_sequences = [
        d.action_sequence for d in dataset_loader.read_data("dev", cache=True)
    ]

    action_sequences = test_data_action_sequences + train_data_action_sequences + dev_data_action_sequences

    amr_relations_set = set()
    for action_sequence in action_sequences:
        for action in action_sequence:
            if action.action in ("RL", "RR"):
                amr_relations_set.add(action.label)

    amr_relations_list = sorted(amr_relations_set)

    with open(file_path, "w") as f:
        for rel in amr_relations_list:
            f.write("%s\n" % rel)
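
A hypothetical invocation; the output file name here is made up, and reading it back shows the one-relation-per-line format the function writes:

extract_amr_relations_from_dataset("amr_relations.txt")
with open("amr_relations.txt") as f:
    amr_relations = [line.strip() for line in f]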
Example #3
def train_file(model_name, train_case_name, train_data_path, test_data_path,
               parser_parameters):
    train_data = dataset_loader.read_data("training",
                                          train_data_path,
                                          parser_parameters=parser_parameters,
                                          cache=True)
    test_data = dataset_loader.read_data("dev",
                                         test_data_path,
                                         parser_parameters=parser_parameters,
                                         cache=True)

    train(model_name, train_case_name, train_data, test_data,
          parser_parameters)
Example #4
def test_file(model_name, test_case_name, test_data_path, parser_parameters):
    test_data = dataset_loader.read_data("test",
                                         test_data_path,
                                         parser_parameters=parser_parameters,
                                         cache=True)

    return test(model_name, test_case_name, test_data, parser_parameters)
Example #5
import pickle

from keras.preprocessing.text import Tokenizer  # assumed import path for the Keras tokenizer

from data_extraction import dataset_loader


def generate_tokenizer():
    print("Generating tokenizer at %s." % TOKENIZER_PATH)
    train_data_sentences = [
        d.sentence for d in dataset_loader.read_data("training", cache=True)
    ]
    dev_data_sentences = [
        d.sentence for d in dataset_loader.read_data("dev", cache=True)
    ]
    test_data_sentences = [
        d.sentence for d in dataset_loader.read_data("test", cache=True)
    ]

    sentences = train_data_sentences + dev_data_sentences + test_data_sentences

    tokenizer = Tokenizer(filters="", lower=True, split=" ")
    tokenizer.fit_on_texts(sentences)

    with open(TOKENIZER_PATH, "wb") as f:
        pickle.dump(tokenizer, f)

    return tokenizer
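
A sketch of consuming the cached tokenizer later; texts_to_sequences is the standard Keras Tokenizer method, and the sample sentence is made up:

with open(TOKENIZER_PATH, "rb") as f:
    tokenizer = pickle.load(f)
print(tokenizer.texts_to_sequences(["the boy wants to go"]))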
Example #6
    for dataset_version in dataset_versions:
        directory_content = listdir(partition_path)
        directory_content_filtered = [
            x for x in directory_content if dataset_version in x
        ]
        directory_content_filtered = sorted(directory_content_filtered)
        for file_name in directory_content_filtered:
            file_path = partition_path + "/" + file_name
            dataset_name = file_name.split('-')[6].split('.')[0]
            # Number of records in the raw input file.
            file_length = len(
                input_file_parser.extract_data_records(file_path))
            # Number of records whose original AMR graph could be parsed.
            parsed_AMR = len(
                dataset_loader.read_original_graphs(dataset_partition,
                                                    file_name))
            # Number of records for which an action sequence was generated.
            act_seq_gen = len(
                dataset_loader.read_data(dataset_partition, file_name))

            if dataset_name not in dataset_stats:
                dataset_stats[dataset_name] = {}
            if dataset_version not in dataset_stats[dataset_name]:
                dataset_stats[dataset_name][dataset_version] = {}

            dataset_stats[dataset_name][dataset_version][dataset_partition] = [
                file_length, parsed_AMR, act_seq_gen
            ]

# small column width
scw = 7
# large column width
lcw = 12
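
The two widths suggest a fixed-width statistics report; a hedged sketch of one way they might be used, with column headers assumed from the three values collected above:

print("%-*s%*s%*s%*s" % (lcw, "dataset", scw, "total", scw, "parsed", scw, "actions"))
for name, versions in dataset_stats.items():
    for version, partitions in versions.items():
        for partition, stats in partitions.items():
            label = "%s-%s-%s" % (name, version, partition)
            print("%-*s%*d%*d%*d" % (lcw, label, scw, stats[0], scw, stats[1], scw, stats[2]))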
Example #7
from constants import __DEP_AMR_REL_TABLE, __AMR_RELATIONS
from data_extraction import dataset_loader

training_data = dataset_loader.read_data("training")
dev_data = dataset_loader.read_data("dev")
test_data = dataset_loader.read_data("test")

data = training_data + dev_data + test_data

dependencies = set()
for d in data:
    for dep in d.dependencies.values():
        dependencies.add(dep[1])

dependencies = sorted({d.split("_")[0] for d in dependencies})

print("-----------------------------------------")
print("Dependencies from dataset: %d" % len(dependencies))
print(dependencies)

mapped_dependencies = sorted(__DEP_AMR_REL_TABLE)

mapped_relations = sorted(set(__DEP_AMR_REL_TABLE.values()))

print("-----------------------------------------")
print("Dependencies from mapping file: %d" % len(mapped_dependencies))
print(mapped_dependencies)

print("-----------------------------------------")
print("Unmapped dependency relations:")