def load_and_align_data():
    training_data = dataset_loader.read_data("training", "r2")
    dev_data = dataset_loader.read_data("dev", "r2")
    test_data = dataset_loader.read_data("test", "r2")
    data = training_data + dev_data + test_data

    # Deduplicate by AMR id, keeping the first occurrence, then sort by id.
    processed_amr_ids = []
    filtered_data = []
    for i in range(len(data)):
        if data[i].amr_id not in processed_amr_ids:
            filtered_data.append(data[i])
            processed_amr_ids.append(data[i].amr_id)
    data = sorted(filtered_data, key=lambda d: d.amr_id)

    sequences, _, dependencies, _, amr_ids, _, _ = \
        feature_vector_generator.extract_data_components(data)

    training_data_orig = dataset_loader.read_original_graphs("training", "r2")
    dev_data_orig = dataset_loader.read_original_graphs("dev", "r2")
    test_data_orig = dataset_loader.read_original_graphs("test", "r2")
    data_orig = training_data_orig + dev_data_orig + test_data_orig

    # Keep only original graphs whose AMR id survived the filtering above,
    # again deduplicating by id and sorting so the lists stay aligned.
    processed_orig_amr_ids = []
    filtered_data_orig = []
    for i in range(len(data_orig)):
        if data_orig[i][0] in amr_ids and data_orig[i][0] not in processed_orig_amr_ids:
            filtered_data_orig.append(data_orig[i])
            processed_orig_amr_ids.append(data_orig[i][0])
    data_orig = sorted(filtered_data_orig, key=lambda d: d[0])

    sentences = [d[1] for d in data_orig]
    amrs = [d[2] for d in data_orig]

    return sequences, dependencies, amrs, sentences
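# Usage sketch (illustrative, not part of the original module). Both `data`
# and `data_orig` are deduplicated and sorted by AMR id, and `data_orig` is
# restricted to ids present in `data`, so -- assuming every AMR id in the
# action-sequence data also appears among the original graphs -- the four
# returned lists are index-aligned:
#
#   sequences, dependencies, amrs, sentences = load_and_align_data()
#   assert len(sequences) == len(amrs) == len(sentences)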
def extract_amr_relations_from_dataset(file_path):
    test_data_action_sequences = [
        d.action_sequence for d in dataset_loader.read_data("test", cache=True)
    ]
    train_data_action_sequences = [
        d.action_sequence for d in dataset_loader.read_data("training", cache=True)
    ]
    dev_data_action_sequences = [
        d.action_sequence for d in dataset_loader.read_data("dev", cache=True)
    ]
    action_sequences = (test_data_action_sequences +
                        train_data_action_sequences +
                        dev_data_action_sequences)

    # Collect the labels of all left/right arc ("RL"/"RR") actions.
    amr_relations_set = set()
    for action_sequence in action_sequences:
        for action in action_sequence:
            if action.action == "RL" or action.action == "RR":
                amr_relations_set.add(action.label)

    # Write the relation labels, one per line, in sorted order.
    amr_relations_list = sorted(amr_relations_set)
    with open(file_path, "w") as f:
        for rel in amr_relations_list:
            f.write("%s\n" % rel)
def train_file(model_name, train_case_name, train_data_path, test_data_path, parser_parameters):
    train_data = dataset_loader.read_data("training", train_data_path,
                                          parser_parameters=parser_parameters, cache=True)
    test_data = dataset_loader.read_data("dev", test_data_path,
                                         parser_parameters=parser_parameters, cache=True)
    train(model_name, train_case_name, train_data, test_data, parser_parameters)
def test_file(model_name, test_case_name, test_data_path, parser_parameters):
    test_data = dataset_loader.read_data("test", test_data_path,
                                         parser_parameters=parser_parameters, cache=True)
    return test(model_name, test_case_name, test_data, parser_parameters)
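# Hypothetical usage sketch -- the model name, case name, file names and the
# parser_parameters object below are placeholders, not values from this
# repository:
#
#   params = ...  # however parser parameters are constructed in this project
#   train_file("my_model", "case_1", "training_file.txt", "dev_file.txt", params)
#   results = test_file("my_model", "case_1", "test_file.txt", params)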
def generate_tokenizer():
    print("Generating tokenizer at %s." % TOKENIZER_PATH)
    train_data_sentences = [
        d.sentence for d in dataset_loader.read_data("training", cache=True)
    ]
    dev_data_sentences = [
        d.sentence for d in dataset_loader.read_data("dev", cache=True)
    ]
    test_data_sentences = [
        d.sentence for d in dataset_loader.read_data("test", cache=True)
    ]
    sentences = train_data_sentences + dev_data_sentences + test_data_sentences

    # Fit a word-level tokenizer on all sentences and persist it for reuse.
    tokenizer = Tokenizer(filters="", lower=True, split=" ")
    tokenizer.fit_on_texts(sentences)
    with open(TOKENIZER_PATH, "wb") as f:
        pickle.dump(tokenizer, f)
    return tokenizer
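# Convenience wrapper (a sketch, not part of the original module): reuse the
# pickled tokenizer when it already exists instead of refitting it. Assumes
# `os` is imported; TOKENIZER_PATH is the same constant used above.
def get_tokenizer():
    if os.path.isfile(TOKENIZER_PATH):
        with open(TOKENIZER_PATH, "rb") as f:
            return pickle.load(f)
    return generate_tokenizer()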
for dataset_version in dataset_versions:
    directory_content = listdir(partition_path)
    directory_content_filtered = [
        x for x in directory_content if dataset_version in x
    ]
    directory_content_filtered = sorted(directory_content_filtered)
    for file_name in directory_content_filtered:
        file_path = partition_path + "/" + file_name
        dataset_name = file_name.split('-')[6].split('.')[0]
        # Per-file counts: raw records extracted from the file, original AMR
        # graphs successfully parsed, and action sequences generated.
        file_length = len(
            input_file_parser.extract_data_records(file_path))
        parsed_AMR = len(
            dataset_loader.read_original_graphs(dataset_partition, file_name))
        act_seq_gen = len(
            dataset_loader.read_data(dataset_partition, file_name))
        if dataset_name not in dataset_stats:
            dataset_stats[dataset_name] = {}
        if dataset_version not in dataset_stats[dataset_name]:
            dataset_stats[dataset_name][dataset_version] = {}
        dataset_stats[dataset_name][dataset_version][dataset_partition] = [
            file_length, parsed_AMR, act_seq_gen
        ]

# small column width
scw = 7
# large column width
lcw = 12
from constants import __DEP_AMR_REL_TABLE, __AMR_RELATIONS
from data_extraction import dataset_loader

training_data = dataset_loader.read_data("training")
dev_data = dataset_loader.read_data("dev")
test_data = dataset_loader.read_data("test")
data = training_data + dev_data + test_data

# Collect the dependency relation types occurring in the dataset, keeping
# only the part of each label before the first underscore.
dependencies = set()
for d in data:
    for dep in list(d.dependencies.values()):
        dependencies.add(dep[1])
dependencies = set([d.split("_")[0] for d in dependencies])
dependencies = sorted(list(dependencies))

print("-----------------------------------------")
print("Dependencies from dataset: %d" % len(dependencies))
print(dependencies)

mapped_dependencies = sorted(list(set(__DEP_AMR_REL_TABLE.keys())))
mapped_relations = sorted(list(set(__DEP_AMR_REL_TABLE.values())))

print("-----------------------------------------")
print("Dependencies from mapping file: %d" % len(mapped_dependencies))
print(mapped_dependencies)

print("-----------------------------------------")
print("Unmapped dependency relations:")
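# The listing that follows the header above is not shown in this excerpt.
# A minimal sketch, assuming "unmapped" means dependency labels observed in
# the dataset but absent from __DEP_AMR_REL_TABLE:
#
#   unmapped = [d for d in dependencies if d not in mapped_dependencies]
#   print(unmapped)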