def main():
    align_dir = 'data-release/alignments'
    reader = AMR_Reader()
    dev_amrs = reader.load('data-release/amrs/leamr_dev.txt')
    test_amrs = reader.load('data-release/amrs/leamr_test.txt')
    all_amrs = reader.load('data-release/amrs/ldc+little_prince.txt')
    amr_map = {'leamr_dev': dev_amrs,
               'leamr_test': test_amrs,
               'ldc+little_prince': all_amrs}

    for filename in os.listdir(align_dir):
        file = os.path.join(align_dir, filename)
        if file.endswith('alignments.json') or file.endswith('alignments.gold.json'):
            for k in amr_map:
                if filename.startswith(k):
                    amrs = amr_map[k]
                    aligns = load_from_json(file, amrs, unanonymize=True)
                    # run quick test: every aligned node must exist in its AMR
                    for amr in amrs:
                        for align in aligns[amr.id]:
                            for n in align.nodes:
                                if n not in amr.nodes:
                                    raise Exception(f'Failed to match alignments to AMR data. '
                                                    f'AMR "{amr.id}" has no node named "{n}".')
                    # write output
                    write_to_json(file, aligns, amrs=amrs, anonymize=False)
                    break
def main():
    import argparse
    from amr_utils.amr_readers import AMR_Reader

    parser = argparse.ArgumentParser(description='Style AMRs as HTML or Latex')
    parser.add_argument('-f', '--files', type=str, nargs=2, required=True,
                        help='input and output files (AMRs in JAMR format)')
    parser.add_argument('--latex', action='store_true', help='style as latex')
    parser.add_argument('--html', action='store_true', help='style as html')
    args = parser.parse_args()

    file = args.files[0]
    outfile = args.files[1]

    cr = AMR_Reader(style='letters')
    amrs = cr.load(file, remove_wiki=True)

    if args.html:
        output = HTML_AMR.style(amrs)
    else:
        output = Latex_AMR.style(amrs)
    with open(outfile, 'w+', encoding='utf8') as f:
        f.write(output)
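# A minimal usage sketch for the styling script above (the module filename
# here is hypothetical; flags are taken from the argparse setup):
#
#   python style.py -f amrs.txt amrs.html --html
#   python style.py -f amrs.txt amrs.tex --latex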
def main():
    top_dir = sys.argv[1]
    parse_dir = sys.argv[2]
    dependency_dir = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load('../data/split/train.txt', remove_wiki=True)
    amrs2 = reader.load('../data/split/dev.txt', remove_wiki=True)
    amrs3 = reader.load('../data/split/test.txt', remove_wiki=True)
    amr_ids = {'train': {' '.join(amr.tokens): amr.id for amr in amrs},
               'dev': {' '.join(amr.tokens): amr.id for amr in amrs2},
               'test': {' '.join(amr.tokens): amr.id for amr in amrs3}}

    # map each CCGbank sentence (joined tokens) to its CCGbank id
    ids = {}
    words = []
    for subdir_name in os.listdir(parse_dir):
        subdir = os.path.join(parse_dir, subdir_name)
        for file_name in os.listdir(subdir):
            file = os.path.join(subdir, file_name)
            idx, ccg_words, ccg_trees = load_ccgbank(file)
            for id, ws in zip(idx, ccg_words):
                ids[' '.join(w[1] for w in ws)] = id
            words.extend(ccg_words)

    # write CCGbank id <-> AMR id maps for the sentences the corpora share
    with open('ids_map_train.tsv', 'w+', encoding='utf8') as f:
        for k in ['train']:
            common_sents = [(ids[i], amr_ids[k][i], i) for i in ids if i in amr_ids[k]]
            print(k, len(common_sents))
            for id1, id2, sent in common_sents:
                f.write(f'{id1}\t{id2}\t{sent}\n')
    with open('ids_map_test.tsv', 'w+', encoding='utf8') as f:
        for k in ['dev', 'test']:
            common_sents = [(ids[i], amr_ids[k][i], i) for i in ids if i in amr_ids[k]]
            print(k, len(common_sents))
            for id1, id2, sent in common_sents:
                f.write(f'{id1}\t{id2}\t{sent}\n')

    # concatenate the gold CCGbank parses and dependencies
    output_file = os.path.join(top_dir, 'ccgbank_parses.gold.txt')
    with open(output_file, 'w+', encoding='utf8') as fw:
        for subdir_name in os.listdir(parse_dir):
            subdir = os.path.join(parse_dir, subdir_name)
            for file_name in os.listdir(subdir):
                file = os.path.join(subdir, file_name)
                with open(file, 'r', encoding='utf8') as fr:
                    fw.write(fr.read())

    output_file = os.path.join(top_dir, 'ccgbank_dependencies.gold.txt')
    with open(output_file, 'w+', encoding='utf8') as fw:
        for subdir_name in os.listdir(dependency_dir):
            subdir = os.path.join(dependency_dir, subdir_name)
            for file_name in os.listdir(subdir):
                file = os.path.join(subdir, file_name)
                with open(file, 'r', encoding='utf8') as fr:
                    fw.write(fr.read())
def main():
    amr_file = sys.argv[1]
    alignment_file = sys.argv[2]
    relation_alignment_file = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    subgraph_alignments = reader.load_alignments_from_json(alignment_file, amrs)
    relation_alignments = reader.load_alignments_from_json(relation_alignment_file, amrs)
    amrs = [amr for amr in amrs if amr.id in subgraph_alignments]
    for amr in amrs:
        amr.alignments = subgraph_alignments[amr.id]

    # random.shuffle(amrs)
    # amrs = amrs[:100]
    # print('Sampling AMRs:')
    # for amr in amrs:
    #     print(amr.id)

    output_file = amr_file.replace('.txt', '.gold.txt')
    with open(output_file, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write(amr.jamr_string())

    output_file2 = output_file.replace('.txt', '.html')
    ID_Display.style(amrs, output_file2)

    output_file3 = output_file.replace('.gold.txt', '.gold_alignments.tsv')
    with open(output_file3, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write('\t'.join(['amr', str(amr.id)]) + '\n')
            reentrancies = []
            for n in amr.nodes:
                parents = [(s, r, t) for s, r, t in amr.edges if t == n]
                if len(parents) > 1:
                    reentrancies.extend(parents)
            node_labels = get_node_labels(amr)
            edge_labels = get_edge_labels(amr)
            f.write('\t'.join(['tokens'] + [f'{i}={token}'
                                            for i, token in enumerate(amr.tokens)]) + '\n')
            for n in amr.nodes:
                nalign = amr.get_alignment(subgraph_alignments, node_id=n)
                token_ids = ','.join(str(t) for t in nalign.tokens) if nalign else ''
                f.write('\t'.join(['node', node_labels[n], amr.nodes[n], token_ids]) + '\n')
            for s, r, t in amr.edges:
                ealign = amr.get_alignment(relation_alignments, edge=(s, r, t))
                token_ids = ','.join(str(t) for t in ealign.tokens) if ealign else ''
                f.write('\t'.join(['edge', edge_labels[(s, r, t)],
                                   f'{amr.nodes[s]} {r} {amr.nodes[t]}', token_ids]) + '\n')
            for s, r, t in reentrancies:
                f.write('\t'.join(['reentrancy', edge_labels[(s, r, t)],
                                   f'{amr.nodes[s]} {r} {amr.nodes[t]}']) + '\n')
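# The emitted .gold_alignments.tsv interleaves tab-separated row types per
# AMR; a sketch of the layout (illustrative values, not real data):
#
#   amr         <amr id>
#   tokens      0=The   1=boy   2=wants   ...
#   node        <node label>   <concept>                <token ids, e.g. 1,2>
#   edge        <edge label>   <src concept :rel tgt>   <token ids>
#   reentrancy  <edge label>   <src concept :rel tgt>
#
# The hand-alignment reader further below consumes this same format.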
def main():
    file = sys.argv[1]
    align_file = sys.argv[2]
    outfile = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)
    alignments = reader.load_alignments_from_json(align_file, amrs)
    style(amrs[:5000], alignments, outfile)
def main():
    amr_file = args.train

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    eval_amr_file, eval_amrs, gold_eval_alignments = None, None, None
    if args.test:
        eval_amr_file, eval_align_file = args.test
        eval_amrs = reader.load(eval_amr_file, remove_wiki=True)
        add_nlp_data(eval_amrs, eval_amr_file)
        gold_eval_alignments = load_from_json(eval_align_file, eval_amrs, unanonymize=True)
        eval_amr_ids = {amr.id for amr in eval_amrs}
        amrs = [amr for amr in amrs if amr.id not in eval_amr_ids]
    # amrs = amrs[:1000]

    if args.load_model:
        print('Loading model from:', args.load_model)
        align_model = Subgraph_Model.load_model(args.load_model)
    else:
        align_model = Subgraph_Model(amrs, align_duplicates=True)

    iters = args.iter
    alignments = None
    for i in range(iters):
        print(f'Epoch {i}: Training data')
        alignments = align_model.align_all(amrs)
        align_model.update_parameters(amrs, alignments)
        perplexity(align_model, amrs, alignments)
        report_progress(amr_file, alignments, reader, epoch=i)
        print()

        if eval_amrs:
            print(f'Epoch {i}: Evaluation data')
            eval_alignments = align_model.align_all(eval_amrs)
            perplexity(align_model, eval_amrs, eval_alignments)
            evaluate(eval_amrs, eval_alignments, gold_eval_alignments)
            evaluate_duplicates(eval_amrs, eval_alignments, gold_eval_alignments)
            report_progress(eval_amr_file, eval_alignments, reader, epoch=i)
            print()

    report_progress(amr_file, alignments, reader)

    if args.save_model:
        align_model.save_model(args.save_model)
        print('Saving model to:', args.save_model)
def main():
    file = '../data/szubert/szubert_amrs.isi_alignments.txt'
    ids_file = '../data/szubert/szubert_ids.isi.txt'
    output = '../data/szubert/szubert_amrs.isi.txt'
    amr_file1 = '../data/ldc_train.txt'
    amr_file2 = '../data/szubert/szubert_amrs.txt'

    reader = AMR_Reader()
    amrs = reader.load(amr_file1, remove_wiki=True)
    szubert_amrs = reader.load(amr_file2, remove_wiki=True)
    szubert_amr_ids = [amr.id for amr in szubert_amrs]
    amrs += szubert_amrs
    amrs = {amr.id: amr for amr in amrs}

    amr_ids = []
    with open(ids_file, encoding='utf8') as f:
        for line in f:
            if line:
                amr_ids.append(line.strip())

    isi_amrs, isi_alignments = reader.load(file, output_alignments=True)

    subgraph_alignments = {}
    relation_alignments = {}
    for isi_amr in isi_amrs:
        if isi_amr.id not in szubert_amr_ids:
            continue
        amr = amrs[isi_amr.id]
        if len(amr.tokens) != len(isi_amr.tokens):
            raise Exception('Inconsistent Tokenization:', amr.id)
        node_labels = node_map(isi_amr, amr)
        edge_labels = edge_map(isi_amr, amr)
        isi_aligns = isi_alignments[amr.id]
        subgraph_alignments[amr.id] = []
        relation_alignments[amr.id] = []
        for i, tok in enumerate(amr.tokens):
            aligns = [align for align in isi_aligns if i in align.tokens]
            nodes = [node_labels[n] for align in aligns for n in align.nodes]
            edges = [edge_labels[e] for align in aligns for e in align.edges]
            subgraph_alignments[amr.id].append(
                AMR_Alignment(type='subgraph', tokens=[i], nodes=nodes))
            relation_alignments[amr.id].append(
                AMR_Alignment(type='relation', tokens=[i], edges=edges))

    reader.save_alignments_to_json(output.replace('.txt', '.subgraph_alignments.json'),
                                   subgraph_alignments)
    reader.save_alignments_to_json(output.replace('.txt', '.relation_alignments.json'),
                                   relation_alignments)

    for amr in szubert_amrs:
        if amr.id not in subgraph_alignments:
            raise Exception('Missing AMR:', amr.id)
def main():
    file = 'data/szubert/szubert_amrs.txt'
    output = 'data/szubert/szubert_amrs.jamr.txt'

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)

    with open(output, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write('# ::id ' + amr.id + '\n')
            # unwrap placeholder tokens like '@-@' to their bare character
            tokens = [t for t in amr.tokens]
            for i, t in enumerate(tokens):
                if t[0] == '@' and t[-1] == '@' and len(t) == 3:
                    tokens[i] = t[1]
            f.write('# ::snt ' + ' '.join(tokens) + '\n')
            graph_string = amr.graph_string().replace('/', ' / ')
            f.write(graph_string)
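# Each AMR is written with JAMR-style metadata comments ahead of the graph,
# e.g. (illustrative):
#
#   # ::id nw.wsj_0003.1
#   # ::snt The boy wants to go .
#   (w / want-01
#       :ARG0 (b / boy)
#       :ARG1 (g / go-01 :ARG0 b))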
def main():
    file = '../data/split/train.txt'
    file2 = '../data/train.sents.txt'

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)
    add_nlp_data(amrs, file)

    # amrs2 = reader.load('../data/split/test.txt', remove_wiki=True)
    # add_nlp_data(amrs2, '../data/split/test.txt')
    # amrs = amrs + amrs2

    with open(file2, 'w+', encoding='utf8') as f:
        for amr in amrs:
            for token, pos in zip(amr.tokens, amr.pos):
                f.write(f'{token}|{pos} ')
            f.write('\n')
def load_szubert_data(amr_file):
    # Szubert data
    reader = AMR_Reader()
    amrs1 = reader.load(amr_file, remove_wiki=True)
    amr_ids = [amr.id for amr in amrs1]
    for amr_id in amr_ids:
        if amr_ids.count(amr_id) > 1:
            print('Repeated:', amr_id)
    # LDC data
    amrs2 = []
    amrs2 += reader.load('data/ldc_train.txt', remove_wiki=True)
    amrs2 += reader.load('data/ldc_dev.txt', remove_wiki=True)
    amrs2 += reader.load('data/ldc_test.txt', remove_wiki=True)
    amrs = [amr for amr in amrs2 if amr.id in amr_ids]
    ldc_ids = [amr.id for amr in amrs]
    # Little Prince data
    amrs3 = reader.load('data/little_prince.txt', remove_wiki=True)
    little_prince_ids = [amr.id for amr in amrs3 if amr.id in amr_ids]
    amrs += [amr for amr in amrs3 if amr.id in little_prince_ids]
    # other data
    other_ids = [amr_id for amr_id in amr_ids
                 if amr_id not in ldc_ids and amr_id not in little_prince_ids]
    amrs += [amr for amr in amrs1 if amr.id in other_ids]
    print('Missing:', ' '.join(other_ids))
    print(len(amrs), '/', len(amrs1), 'AMRs printed')
    return amrs
def main():
    amr_file = sys.argv[1]
    align_file = sys.argv[2]
    gold_file = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    alignments = reader.load_alignments_from_json(align_file, amrs)
    gold_alignments = reader.load_alignments_from_json(gold_file, amrs)
    pred_subgraph_alignments = reader.load_alignments_from_json(
        align_file.replace('relation_', 'subgraph_'), amrs)
    gold_subgraph_alignments = reader.load_alignments_from_json(
        gold_file.replace('relation_', 'subgraph_'), amrs)

    # Display.style([amr for amr in amrs if amr.id in gold_alignments],
    #               gold_file.replace('.json', '') + '.html',
    #               gold_alignments)

    if len(amrs) != len(alignments):
        amrs = [amr for amr in amrs
                if amr.id in alignments and amr.id in gold_alignments]

    evaluate(amrs, alignments, gold_alignments, mode='edges')
def load_data4():
    amr_file1 = '../data/split/dev.txt'
    amr_file2 = '../data/split/test.txt'
    ccg_dependency_file = '../data/ccg/ccgbank_dependencies.gold.txt'
    ccgbank_file = '../data/ccg/ccgbank_parses.gold.txt'
    ids_file = '../data/ccg/ids_map_test.tsv'

    reader = AMR_Reader()
    amrs = reader.load(amr_file1, remove_wiki=True)
    add_nlp_data(amrs, amr_file1)
    amrs2 = reader.load(amr_file2, remove_wiki=True)
    add_nlp_data(amrs2, amr_file2)
    amrs += amrs2

    # gold data
    align_file = amr_file1.replace('.txt', '') + '.subgraph_alignments.gold.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt', '') + '.relation_alignments.gold.json'
    relation_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file2.replace('.txt', '') + '.subgraph_alignments.gold.json'
    subgraph_alignments.update(reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt', '') + '.relation_alignments.gold.json'
    relation_alignments.update(reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments.update(reader.load_alignments_from_json(align_file, amrs))

    ids, dependencies, ccg_lex, ccg_trees = load_gold_ccgs(
        ids_file, ccg_dependency_file, ccgbank_file)
    amr_dict = {amr.id: amr for amr in amrs}
    amrs = [amr_dict[id] for id in ids]

    return amrs, subgraph_alignments, relation_alignments, reentrancy_alignments, \
        dependencies, ccg_lex, ccg_trees
def load_data2():
    amr_file1 = '../data/split/dev.txt'
    amr_file2 = '../data/split/test.txt'
    ccg_dependency_file = '../data/test.ccg_dependencies.tsv'
    ccgbank_file = '../data/test.ccg_parse.txt'

    reader = AMR_Reader()
    amrs = reader.load(amr_file1, remove_wiki=True)
    add_nlp_data(amrs, amr_file1)
    amrs2 = reader.load(amr_file2, remove_wiki=True)
    add_nlp_data(amrs2, amr_file2)
    amrs += amrs2

    # gold data
    align_file = amr_file1.replace('.txt', '') + '.subgraph_alignments.gold.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt', '') + '.relation_alignments.gold.json'
    relation_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file2.replace('.txt', '') + '.subgraph_alignments.gold.json'
    subgraph_alignments.update(reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt', '') + '.relation_alignments.gold.json'
    relation_alignments.update(reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments.update(reader.load_alignments_from_json(align_file, amrs))

    sentences = [amr.tokens for amr in amrs]
    _, dependencies = align_dependencies_to_sentences(
        load_dependencies(ccg_dependency_file, flavor='easysrl'), sentences)
    _, ccg_lex, ccg_trees = align_ccgbank_to_sentences(
        load_ccgbank(ccgbank_file), sentences)

    return amrs, subgraph_alignments, relation_alignments, reentrancy_alignments, \
        dependencies, ccg_lex, ccg_trees
def main():
    file = 'data/szubert/szubert_amrs.txt'
    file2 = 'data/ldc_train.txt'
    output1 = 'data/szubert/szubert_sents.isi.txt'
    output2 = 'data/szubert/szubert_amrs.isi.txt'
    output3 = 'data/szubert/szubert_ids.isi.txt'

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)
    amrs += reader.load(file2, remove_wiki=True)

    # deduplicate by AMR id, keeping the first occurrence
    unique_ids = set()
    amrs2 = []
    for amr in amrs:
        if amr.id not in unique_ids:
            unique_ids.add(amr.id)
            amrs2.append(amr)
    amrs = amrs2

    with open(output1, 'w+', encoding='utf8') as f:
        for amr in amrs:
            tokens = [t for t in amr.tokens]
            for i, t in enumerate(tokens):
                if t[0] == '@' and t[-1] == '@' and len(t) == 3:
                    tokens[i] = t[1]
            f.write(' '.join(tokens) + '\n')

    with open(output2, 'w+', encoding='utf8') as f:
        for amr in amrs:
            graph_string = (amr.graph_string()
                            .replace('/', ' / ')
                            .replace('\n', '')
                            .replace('\r', '')
                            .replace('\t:', ' :')
                            .replace('\t', ''))
            f.write(graph_string + '\n')

    with open(output3, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write(amr.id + '\n')
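# output1 holds one detokenized sentence per line and output2 the matching
# single-line graph strings, presumably the parallel-file input expected by
# the ISI aligner. An illustrative line pair:
#
#   The boy wants to go .
#   (w / want-01 :ARG0 (b / boy) :ARG1 (g / go-01 :ARG0 b))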
def main():
    amr_file = 'data/szubert/szubert_amrs.jamr_alignments.txt'
    # amr_file2 = 'data/szubert/szubert_amrs.txt'

    reader = AMR_Reader()
    amrs, alignments = reader.load(amr_file, remove_wiki=True, output_alignments=True)

    for amr in amrs:
        spans = set()
        taken = set()
        for align in alignments[amr.id]:
            align.type = 'subgraph'
            align.amr = amr
            spans.add(tuple(align.tokens))
            taken.update(align.tokens)
        # give every unaligned token its own single-token span
        for t in range(len(amr.tokens)):
            if t not in taken:
                spans.add((t,))
        spans = [list(span) for span in sorted(spans, key=lambda x: x[0])]
        clean_alignments(amr, alignments, spans)

    reader.save_alignments_to_json('data/szubert/szubert_amrs.jamr_alignments.json',
                                   alignments)
def main():
    unaligned_amr_file = args.test

    reader = AMR_Reader()
    eval_amrs = reader.load(unaligned_amr_file, remove_wiki=True)
    add_nlp_data(eval_amrs, unaligned_amr_file)

    # subgraphs
    print(f'Loading model: {args.subgraph_model}')
    subgraph_model = Subgraph_Model.load_model(args.subgraph_model)
    sub_alignments = subgraph_model.align_all(eval_amrs)
    align_file = unaligned_amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    print(f'Writing subgraph alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, sub_alignments)

    # relations
    print(f'Loading model: {args.relation_model}')
    rel_model = Relation_Model.load_model(args.relation_model)
    rel_model.subgraph_alignments = sub_alignments
    rel_alignments = rel_model.align_all(eval_amrs)
    align_file = unaligned_amr_file.replace('.txt', '') + '.relation_alignments.json'
    print(f'Writing relation alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, rel_alignments)

    # reentrancies
    print(f'Loading model: {args.reentrancy_model}')
    reent_model = Reentrancy_Model.load_model(args.reentrancy_model)
    reent_model.subgraph_alignments = sub_alignments
    reent_model.relation_alignments = rel_alignments
    reent_alignments = reent_model.align_all(eval_amrs)
    align_file = unaligned_amr_file.replace('.txt', '') + '.reentrancy_alignments.json'
    print(f'Writing reentrancy alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, reent_alignments)
def main():
    amr_file = sys.argv[1]
    output_file = sys.argv[2]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    # subgraphs
    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    sub_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs:
        sub_aligns[amr.id] = [a for a in sub_aligns[amr.id] if a.nodes]
    align_file = output_file.replace('.txt', '') + '.subgraph_alignments.json'
    print('Writing subgraph alignments to:', align_file)
    write_to_json(align_file, sub_aligns, anonymize=True, amrs=amrs)

    # relations
    align_file = amr_file.replace('.txt', '') + '.relation_alignments.json'
    rel_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs:
        rel_aligns[amr.id] = [a for a in rel_aligns[amr.id] if a.edges]
    align_file = output_file.replace('.txt', '') + '.relation_alignments.json'
    print('Writing relation alignments to:', align_file)
    write_to_json(align_file, rel_aligns, anonymize=True, amrs=amrs)

    # reentrancies
    align_file = amr_file.replace('.txt', '') + '.reentrancy_alignments.json'
    reent_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs:
        reent_aligns[amr.id] = [a for a in reent_aligns[amr.id] if a.edges]
    align_file = output_file.replace('.txt', '') + '.reentrancy_alignments.json'
    print('Writing reentrancy alignments to:', align_file)
    write_to_json(align_file, reent_aligns, anonymize=True, amrs=amrs)

    # sanity check: each node and edge must be covered by exactly one alignment
    for amr in amrs:
        for n in amr.nodes:
            n_aligned = [a for a in sub_aligns[amr.id] if n in a.nodes]
            if len(n_aligned) != 1:
                raise Exception('Bad node alignment', amr.id, n)
        for e in amr.edges:
            e_aligned = [a for a in sub_aligns[amr.id] if e in a.edges] + \
                        [a for a in rel_aligns[amr.id] if e in a.edges]
            if len(e_aligned) != 1:
                raise Exception('Bad edge alignment', amr.id, e)
def load_data1():
    amr_file = '../data/split/train.txt'
    ccg_dependency_file = '../data/train.ccg_dependencies.tsv'
    ccgbank_file = '../data/train.ccg_parse.txt'

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    # predicted data
    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file.replace('.txt', '') + '.relation_alignments.json'
    relation_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file.replace('.txt', '') + '.reentrancy_alignments.json'
    reentrancy_alignments = reader.load_alignments_from_json(align_file, amrs)

    sentences = [amr.tokens for amr in amrs]
    _, dependencies = align_dependencies_to_sentences(
        load_dependencies(ccg_dependency_file, flavor='easysrl'), sentences)
    _, ccg_lex, ccg_trees = align_ccgbank_to_sentences(
        load_ccgbank(ccgbank_file), sentences)

    return amrs, subgraph_alignments, relation_alignments, reentrancy_alignments, \
        dependencies, ccg_lex, ccg_trees
def main():
    amr_file = args.train

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    # amrs = amrs[:1000]

    eval_amr_file, eval_amrs, gold_eval_alignments = None, None, None
    if args.test:
        eval_amr_file, eval_align_file = args.test
        eval_amrs = reader.load(eval_amr_file, remove_wiki=True)
        add_nlp_data(eval_amrs, eval_amr_file)
        gold_eval_alignments = reader.load_alignments_from_json(eval_align_file, eval_amrs)
        eval_amr_ids = {amr.id for amr in eval_amrs}
        amrs = [amr for amr in amrs if amr.id not in eval_amr_ids]

        align_file = eval_amr_file.replace('.txt', '') + '.subgraph_alignments.gold.json'
        gold_subgraph_alignments = reader.load_alignments_from_json(align_file, eval_amrs)
        align_file = eval_amr_file.replace('.txt', '') + '.subgraph_alignments.json'
        pred_subgraph_alignments = reader.load_alignments_from_json(align_file, eval_amrs)
        if USE_GOLD_SUBGRAPHS:
            pred_subgraph_alignments = gold_subgraph_alignments
        for amr_id in pred_subgraph_alignments:
            subgraph_alignments[amr_id] = pred_subgraph_alignments[amr_id]
        for amr in eval_amrs:
            spans = [align.tokens for align in pred_subgraph_alignments[amr.id]
                     if align.type == 'subgraph']
            amr.spans = spans

    if args.load_model:
        print('Loading model from:', args.load_model)
        align_model = Relation_Model.load_model(args.load_model)
    else:
        align_model = Relation_Model(amrs, subgraph_alignments)

    iters = args.iter
    alignments = None
    for i in range(iters):
        print(f'Epoch {i}: Training data')
        alignments = align_model.align_all(amrs)
        align_model.update_parameters(amrs, alignments)
        report_progress(amr_file, alignments, reader, epoch=i)
        perplexity(align_model, amrs, alignments)
        print()

        if eval_amrs:
            print(f'Epoch {i}: Evaluation data')
            eval_alignments = align_model.align_all(eval_amrs)
            perplexity(align_model, eval_amrs, eval_alignments)
            evaluate_relations(eval_amrs, eval_alignments, gold_eval_alignments,
                               pred_subgraph_alignments, gold_subgraph_alignments)
            report_progress(eval_amr_file, eval_alignments, reader, epoch=i)
            print()

    report_progress(amr_file, alignments, reader)

    if args.save_model:
        align_model.save_model(args.save_model)
        print('Saving model to:', args.save_model)
def main():
    amr_file_old = sys.argv[1]
    amr_file_new = sys.argv[2]
    output_file = sys.argv[3]

    reader = AMR_Reader()
    amrs_old = reader.load(amr_file_old)
    amrs_new = reader.load(amr_file_new, remove_wiki=True)

    # map node ids in the old AMR release onto node ids in the new release
    bad_node_map = {}
    for amr1 in amrs_new:
        amr2 = next(a for a in amrs_old if a.id == amr1.id)
        for n in amr1.nodes:
            amr1.nodes[n] = amr1.nodes[n].replace('"', '')
        for n in amr2.nodes:
            amr2.nodes[n] = amr2.nodes[n].replace('"', '')
        bad_nodes = []
        for n in amr1.nodes:
            if n not in amr2.nodes or amr1.nodes[n] != amr2.nodes[n]:
                bad_nodes.append(n)
                continue
            neighborhood = {f'{amr1.nodes[e[0]]} {e[1]} {amr1.nodes[e[2]]}'
                            for e in amr1.edges if n in e}
            neighborhood2 = {f'{amr2.nodes[e[0]]} {e[1]} {amr2.nodes[e[2]]}'
                             for e in amr2.edges if n in e}
            if neighborhood != neighborhood2:
                bad_nodes.append(n)
        if bad_nodes:
            bad_node_map[amr1.id] = {}
            for n in amr1.nodes:
                if n in bad_nodes:
                    new_n = [n2 for n2 in amr2.nodes
                             if amr2.nodes[n2] == amr1.nodes[n]
                             and not (n2 in amr1.nodes and amr1.nodes[n2] == amr1.nodes[n])]
                    neighborhood = {f'{amr1.nodes[e[0]]} {e[1]} {amr1.nodes[e[2]]}'
                                    for e in amr1.edges if n in e}
                    if len(new_n) > 1:
                        # disambiguate by comparing edge neighborhoods
                        neighborhood2 = {n2: [f'{amr2.nodes[e[0]]} {e[1]} {amr2.nodes[e[2]]}'
                                              for e in amr2.edges if n2 in e]
                                         for n2 in new_n}
                        new_n = [n2 for n2 in new_n
                                 if neighborhood == set(neighborhood2[n2])]
                    if len(new_n) == 1:
                        bad_node_map[amr1.id][new_n[0]] = n
                    else:
                        raise Exception('Bad node match', amr1.id, n)
                else:
                    bad_node_map[amr1.id][n] = n

    # verify the node and edge maps
    for amr1 in amrs_new:
        amr2 = next(a for a in amrs_old if a.id == amr1.id)
        for n2 in amr2.nodes:
            n = bad_node_map[amr1.id][n2] if amr1.id in bad_node_map else n2
            if amr1.nodes[n] != amr2.nodes[n2]:
                raise Exception('Bad node match', amr1.id, n)
        for e in amr2.edges:
            s2, r, t2 = e
            s = bad_node_map[amr1.id][s2] if amr1.id in bad_node_map else s2
            t = bad_node_map[amr1.id][t2] if amr1.id in bad_node_map else t2
            if (s, r, t) not in amr1.edges:
                raise Exception('Bad edge match', amr1.id, e,
                                amr1.nodes[s], r, amr1.nodes[t])
    print('Node id fixes:', len(bad_node_map), ' '.join(bad_node_map.keys()))

    # subgraphs
    align_file = amr_file_old.replace('.txt', '') + '.subgraph_alignments.gold.json'
    sub_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs_new:
        for align in sub_aligns[amr.id]:
            if amr.id in bad_node_map:
                align.nodes = [bad_node_map[amr.id][n] for n in align.nodes]
        for align in sub_aligns[amr.id]:
            if len(align.nodes) > 1:
                for s, r, t in amr.edges:
                    if s in align.nodes and t in align.nodes:
                        align.edges.append((s, r, t))
        sub_aligns[amr.id] = [a for a in sub_aligns[amr.id] if a.nodes]
    align_file = output_file.replace('.txt', '') + '.subgraph_alignments.gold.json'
    print('Writing subgraph alignments to:', align_file)
    write_to_json(align_file, sub_aligns, anonymize=True, amrs=amrs_new)

    # relations
    align_file = amr_file_old.replace('.txt', '') + '.relation_alignments.gold.json'
    rel_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs_new:
        for align in rel_aligns[amr.id]:
            if amr.id in bad_node_map:
                align.edges = [(bad_node_map[amr.id][s], r, bad_node_map[amr.id][t])
                               for s, r, t in align.edges]
        for align in rel_aligns[amr.id]:
            sub_align = amr.get_alignment(sub_aligns, token_id=align.tokens[0])
            if sub_align.nodes:
                align.edges = [e for e in align.edges
                               if not (e[0] in sub_align.nodes and e[-1] in sub_align.nodes)]
        rel_aligns[amr.id] = [a for a in rel_aligns[amr.id] if a.edges]
    align_file = output_file.replace('.txt', '') + '.relation_alignments.gold.json'
    print('Writing relation alignments to:', align_file)
    write_to_json(align_file, rel_aligns, anonymize=True, amrs=amrs_new)

    # reentrancies
    align_file = amr_file_old.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    reent_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs_new:
        reent_aligns[amr.id] = [a for a in reent_aligns[amr.id] if a.edges]
        for align in reent_aligns[amr.id]:
            if amr.id in bad_node_map:
                align.edges = [(bad_node_map[amr.id][s], r, bad_node_map[amr.id][t])
                               for s, r, t in align.edges]
    align_file = output_file.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    print('Writing reentrancy alignments to:', align_file)
    write_to_json(align_file, reent_aligns, anonymize=True, amrs=amrs_new)

    # sanity check: each node and edge must be covered by exactly one alignment
    for amr in amrs_new:
        for n in amr.nodes:
            n_aligned = [a for a in sub_aligns[amr.id] if n in a.nodes]
            if len(n_aligned) != 1:
                raise Exception('Bad node alignment', amr.id, n)
        for e in amr.edges:
            e_aligned = [a for a in sub_aligns[amr.id] if e in a.edges] + \
                        [a for a in rel_aligns[amr.id] if e in a.edges]
            if len(e_aligned) != 1:
                raise Exception('Bad edge alignment', amr.id, e)
def main():
    amr_file = sys.argv[1]

    # stanza.download('en')
    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    lemmas_json = {}
    pos_json = {}

    for amr_idx, amr in tqdm(list(enumerate(amrs))):
        tokens = amr.tokens.copy()
        for i, tok in enumerate(tokens):
            if tok.startswith('@') and tok.endswith('@') and len(tok) == 3:
                tokens[i] = tok[1]
        doc = nlp(' '.join(tokens))

        # char offsets of each AMR token within the joined string
        start_idx = {}
        end_idx = {}
        i = 0
        for j, tok in enumerate(tokens):
            start_idx[j] = i
            end_idx[j] = i + len(tok)
            i += len(tok) + 1

        # map each stanza token (keyed by start_char) to an AMR token index
        convert_ids = {}
        stanza_lemmas = {}
        stanza_pos = {}
        for s in doc.sentences:
            for token in s.tokens:
                start = token.start_char
                end = token.end_char
                idx = [k for k in start_idx if start >= start_idx[k] and end <= end_idx[k]]
                if len(idx) == 0:
                    idx = [k for k in start_idx if start <= start_idx[k] <= end]
                idx = idx[0]
                convert_ids[start] = idx
                for word in token.words:
                    if start not in stanza_lemmas:
                        stanza_lemmas[start] = ''
                    stanza_lemmas[start] += word.lemma
                    stanza_pos[start] = word.xpos

        lemmas = ['' for _ in amr.tokens]
        pos = ['' for _ in amr.tokens]
        for i in stanza_lemmas:
            lemmas[convert_ids[i]] += stanza_lemmas[i]
            pos[convert_ids[i]] = stanza_pos[i]
        # tokens stanza merged into a neighbor inherit the previous lemma/POS
        for i, l in enumerate(lemmas):
            if not l and i > 0:
                lemmas[i] = lemmas[i - 1]
                pos[i] = pos[i - 1]
        lemmas_json[amr.id] = lemmas
        pos_json[amr.id] = pos

    filename = amr_file.replace('.txt', '')
    with open(filename + '.lemmas.json', 'w+', encoding='utf8') as f:
        json.dump(lemmas_json, f)
    with open(filename + '.pos.json', 'w+', encoding='utf8') as f:
        json.dump(pos_json, f)
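# Worked example of the offset bookkeeping above (illustrative values):
#   tokens = ['The', 'cat'] joins to 'The cat', giving
#   start_idx = {0: 0, 1: 4} and end_idx = {0: 3, 1: 7};
#   a stanza token with start_char=4, end_char=7 falls inside token 1's
#   window, so convert_ids[4] = 1 and its lemma/POS attach to AMR token 1.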
def main():
    split_file = Path('../data/split/train_ids.txt')
    if split_file.is_file():
        raise Exception('Cannot create Train, Dev, Test split because split already exists:',
                        str(split_file.resolve()))

    reader = AMR_Reader()
    ldc_train = reader.load('../data/ldc_train.txt', remove_wiki=True)
    ldc_dev = reader.load('../data/ldc_dev.txt', remove_wiki=True)
    ldc_test = reader.load('../data/ldc_test.txt', remove_wiki=True)
    little_prince = reader.load('../data/little_prince.txt', remove_wiki=True)
    szubert = reader.load('../data/szubert/szubert_amrs.txt', remove_wiki=True)
    gold_dev = reader.load('../data/gold_dev/ldc_dev.gold.txt', remove_wiki=True)

    szubert_ids = [amr.id for amr in szubert]
    train_ids = [amr.id for amr in ldc_train if amr.id not in szubert_ids]
    little_prince_ids = [amr.id for amr in little_prince if amr.id not in szubert_ids]
    gold_dev_ids = [amr.id for amr in gold_dev if amr.id not in szubert_ids]

    # sample 50 Little Prince AMRs for dev; the rest go to train
    shuffle(little_prince_ids)
    sample = little_prince_ids[:50]
    new_train_ids = train_ids + [n for n in little_prince_ids if n not in sample]
    new_dev_ids = gold_dev_ids + sample
    new_test_ids = szubert_ids

    little_prince1 = [amr.id for amr in little_prince if amr.id in new_train_ids]
    little_prince2 = [amr.id for amr in little_prince if amr.id in new_dev_ids]
    little_prince3 = [amr.id for amr in little_prince if amr.id in new_test_ids]
    print('Split up little prince:', len(little_prince1), len(little_prince2),
          len(little_prince3))

    with open('../data/split/train_ids.txt', 'w+') as f:
        f.write(f'# {len(new_train_ids)} train AMRs\n')
        for n in sorted(new_train_ids):
            f.write(n + '\n')
    with open('../data/split/dev_ids.txt', 'w+') as f:
        f.write(f'# {len(new_dev_ids)} dev AMRs\n')
        for n in sorted(new_dev_ids):
            f.write(n + '\n')
    with open('../data/split/test_ids.txt', 'w+') as f:
        f.write(f'# {len(new_test_ids)} test AMRs\n')
        for n in sorted(new_test_ids):
            f.write(n + '\n')

    train_amrs = {amr.id: amr for amr in ldc_train + little_prince}
    reader.write_to_file('../data/split/train.txt',
                         [train_amrs[n] for n in sorted(new_train_ids)])
    dev_amrs = {amr.id: amr for amr in ldc_dev + little_prince}
    reader.write_to_file('../data/split/dev.txt',
                         [dev_amrs[n] for n in sorted(new_dev_ids)])
    test_amrs = {amr.id: amr for amr in szubert}
    reader.write_to_file('../data/split/test.txt',
                         [test_amrs[n] for n in sorted(new_test_ids)])
def main():
    amr_file = sys.argv[1]

    # stanza.download('en')
    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    lemmas_json = {}
    pos_json = {}
    ner_spans = {}
    mwe_spans = {}
    multi_word_spans = {}
    coreferences = {}
    mwe_types = get_mwe_types_by_first_token()

    coref_parser = None
    try:
        coref_parser = get_coref_parser()
    except Exception:
        print('Warning: Failed to parse coreference. '
              'Please install neuralcoref from source: '
              'https://github.com/huggingface/neuralcoref#install-neuralcoref-from-source',
              file=sys.stderr)

    for amr_idx, amr in tqdm(list(enumerate(amrs))):
        tokens = amr.tokens.copy()
        for i, tok in enumerate(tokens):
            if tok.startswith('@') and tok.endswith('@') and len(tok) == 3:
                tokens[i] = tok[1]
        doc = nlp(' '.join(tokens))

        # char offsets of each AMR token within the joined string
        start_idx = {}
        end_idx = {}
        i = 0
        for j, tok in enumerate(tokens):
            start_idx[j] = i
            end_idx[j] = i + len(tok)
            i += len(tok) + 1

        # map each stanza token (keyed by start_char) to an AMR token index
        convert_ids = {}
        stanza_lemmas = {}
        stanza_entity_type = []
        stanza_entity_spans = []
        stanza_pos = {}
        for s in doc.sentences:
            for token in s.tokens:
                start = token.start_char
                end = token.end_char
                idx = [k for k in start_idx if start >= start_idx[k] and end <= end_idx[k]]
                if len(idx) == 0:
                    idx = [k for k in start_idx if start <= start_idx[k] <= end]
                idx = idx[0]
                convert_ids[start] = idx
                for word in token.words:
                    if start not in stanza_lemmas:
                        stanza_lemmas[start] = ''
                    stanza_lemmas[start] += word.lemma
                    stanza_pos[start] = word.xpos
            for e in s.entities:
                stanza_entity_type.append(e.type)
                span = [t.start_char for t in e.tokens]
                pos = [stanza_pos[t] for t in span]
                # trim leading determiners, adverbs, adjectives, etc.
                leading = ['DT', 'PDT', 'PRP$', 'RB', 'RP', 'JJ', 'JJR', 'JJS', 'IN']
                while pos and pos[0] in leading:
                    pos = pos[1:]
                    span = span[1:]
                if len(span) == 0:
                    stanza_entity_type.pop()
                    continue
                # trim trailing possessives and adverbs
                if pos and pos[-1] in ['POS', 'RB', 'RBR', 'RBS']:
                    span = span[:-1]
                    if len(span) == 0:
                        stanza_entity_type.pop()
                        continue
                # only keep multi-token entities
                if len(span) == 1:
                    stanza_entity_type.pop()
                    continue
                stanza_entity_spans.append(span)

        lemmas = ['' for _ in amr.tokens]
        pos = ['' for _ in amr.tokens]
        for i in stanza_lemmas:
            lemmas[convert_ids[i]] += stanza_lemmas[i]
            pos[convert_ids[i]] = stanza_pos[i]
        for i, l in enumerate(lemmas):
            if not l and i > 0:
                lemmas[i] = lemmas[i - 1]
                pos[i] = pos[i - 1]

        # convert entity spans from char offsets to token ranges
        entities = []
        for span in stanza_entity_spans:
            span = [convert_ids[i] for i in span]
            entities.append((min(span), max(span) + 1))

        lemmas_json[amr.id] = lemmas
        pos_json[amr.id] = pos
        ner_spans[amr.id] = entities

        # get MWE spans
        mwe_spans[amr.id] = []
        taken = []
        for i, token in enumerate(amr.tokens):
            if i in taken:
                continue
            found = False
            token = token.lower()
            lemma = lemmas[i].lower()
            if token in mwe_types:
                for mwe in mwe_types[token]:
                    size = len(mwe)
                    if i + size - 1 >= len(amr.tokens):
                        continue
                    if all(amr.tokens[i + idx].lower().replace('@', '') == mwe[idx]
                           for idx in range(size)):
                        span = (i, i + size)
                        mwe_spans[amr.id].append(span)
                        for t in range(span[0], span[-1]):
                            taken.append(t)
                        found = True
                        break
            if found:
                continue
            if lemma in mwe_types:
                for mwe in mwe_types[lemma]:
                    size = len(mwe)
                    if i + size - 1 >= len(amr.tokens):
                        continue
                    if all(lemmas[i + idx].lower().replace('@', '') == mwe[idx]
                           for idx in range(size)):
                        span = (i, i + size)
                        mwe_spans[amr.id].append(span)
                        for t in range(span[0], span[-1]):
                            taken.append(t)
                        break
            taken.append(i)

        # look for names matching the gold AMR
        name_spans = []
        if SEE_GOLD_AMR:
            for n in amr.nodes:
                if amr.nodes[n] == 'name':
                    parts = [(int(r[3:]), t) for s, r, t in amr.edges
                             if s == n and r.startswith(':op')]
                    parts = [t for r, t in sorted(parts, key=lambda x: x[0])]
                    label = ' '.join(amr.nodes[t].replace('"', '') for t in parts)
                    name_type = [s for s, r, t in amr.edges if t == n and r == ':name']
                    name_type = amr.nodes[name_type[0]] if name_type else None
                    if parts:
                        for start in range(len(amr.tokens)):
                            span = [t for t in range(start, start + len(parts))]
                            if span[-1] >= len(amr.tokens):
                                break
                            tokens = [amr.tokens[t] for t in span]
                            token_label = ' '.join(tok for tok in tokens if tok != '"')
                            if token_label.lower() == label.lower():
                                next_tok = span[-1] + 1
                                if next_tok < len(amr.tokens) \
                                        and amr.tokens[next_tok] == name_type:
                                    span += [next_tok]
                                if len(parts) > 1:
                                    name_spans.append((span[0], span[-1] + 1))
                                    start = span[0]
                                    end = span[-1] + 1
                                    # drop any NER span this name span refines
                                    for span2 in ner_spans[amr.id][:]:
                                        if span2[0] <= start < span2[1] \
                                                and span2[0] < end <= span2[1] \
                                                and (start, end) != span2:
                                            ner_spans[amr.id].remove(span2)
                                            break
                                break
            # hyphenated compounds like 'well @-@ known' matching a node label
            for t in range(len(amr.tokens)):
                if t + 2 < len(amr.tokens) and amr.tokens[t + 1] == '@-@':
                    label1 = f'{lemmas[t]}{lemmas[t + 2]}'.lower()[:len(lemmas[t]) + 4]
                    label2 = f'{lemmas[t]}-{lemmas[t + 2]}'.lower()[:len(lemmas[t]) + 5]
                    if any(amr.nodes[n].startswith(label1) or amr.nodes[n].startswith(label2)
                           for n in amr.nodes):
                        name_spans.append((t, t + 3))

        # times like '10 : 30 am'
        taken = set()
        for t in range(len(amr.tokens)):
            if t in taken:
                continue
            start = t
            if amr.tokens[t].isdigit() and len(amr.tokens[t]) <= 2 and t + 2 < len(amr.tokens):
                if amr.tokens[t + 1] in ['@:@', ':'] and amr.tokens[t + 2].isdigit() \
                        and len(amr.tokens[t + 2]) == 2:
                    end = t + 2
                    while end + 1 < len(amr.tokens) \
                            and (amr.tokens[end + 1] in ['am', 'pm', 'a.m.', 'p.m.', '@:@', ':',
                                                         'UTC', 'GMT', 'EST',
                                                         'Sunday', 'Monday', 'Tuesday',
                                                         'Wednesday', 'Thursday', 'Friday',
                                                         'Saturday']
                                 or (amr.tokens[end] in ['@:@', ':']
                                     and amr.tokens[end + 1].isdigit()
                                     and len(amr.tokens[end + 1]) == 2)):
                        end += 1
                    end += 1
                    name_spans.append((start, end))
                    # merge with any overlapping NER span
                    for span in ner_spans[amr.id]:
                        if start < span[1] < end and span[0] < start:
                            name_spans[-1] = (span[0], end)
                            break
                        elif start < span[0] < end and span[1] > end:
                            name_spans[-1] = (start, span[1])
                            break
                        elif span[0] <= start < span[1] and span[0] < end <= span[1]:
                            name_spans[-1] = span
                            break
                    start, end = name_spans[-1]
                    for i in range(start, end):
                        taken.add(i)

        # merge name, NER, and MWE spans into a single span segmentation
        multi_word_spans[amr.id] = []
        taken = set()
        for i, tok in enumerate(amr.tokens):
            if i in taken:
                continue
            if any(i == span[0] for span in name_spans):
                span = [s for s in name_spans if s[0] <= i < s[1]][0]
                span = [i for i in range(span[0], span[1])]
                multi_word_spans[amr.id].append(span)
                taken.update(span)
            elif any(i == span[0] for span in ner_spans[amr.id]):
                span = [s for s in ner_spans[amr.id] if s[0] <= i < s[1]][0]
                span = [i for i in range(span[0], span[1])]
                multi_word_spans[amr.id].append(span)
                taken.update(span)
            elif any(i == span[0] for span in mwe_spans[amr.id]):
                span = [s for s in mwe_spans[amr.id] if s[0] <= i < s[1]][0]
                span = [i for i in range(span[0], span[1])]
                multi_word_spans[amr.id].append(span)
                taken.update(span)
            else:
                multi_word_spans[amr.id].append([i])
                taken.add(i)

        if coref_parser is not None:
            corefs = get_corefs(amr, coref_parser)
            coreferences[amr.id] = corefs

    filename = amr_file.replace('.txt', '')
    with open(filename + '.lemmas.json', 'w+', encoding='utf8') as f:
        json.dump(lemmas_json, f)
    with open(filename + '.pos.json', 'w+', encoding='utf8') as f:
        json.dump(pos_json, f)
    with open(filename + '.spans.json', 'w+', encoding='utf8') as f:
        json.dump(multi_word_spans, f)
    if coreferences:
        with open(filename + '.coref.json', 'w+', encoding='utf8') as f:
            json.dump(coreferences, f)
def main():
    amr_file = sys.argv[1]
    hand_alignments_file = sys.argv[2]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    amrs = {amr.id: amr for amr in amrs}

    subgraph_alignments = {}
    relation_alignments = {}
    reentrancy_alignments = {}
    all_spans = {amr_id: set() for amr_id in amrs}

    amr = None
    node_labels = {}
    with open(hand_alignments_file) as f:
        hand_alignments = csv.reader(f, delimiter='\t')
        for row in hand_alignments:
            if row[0] == 'amr':
                amr_id = row[1]
                subgraph_alignments[amr_id] = []
                relation_alignments[amr_id] = []
                reentrancy_alignments[amr_id] = []
                amr = amrs[amr_id]
                taken = set()
                node_labels = get_node_labels(amr)
                node_labels = {v: k for k, v in node_labels.items()}
                edge_labels = get_edge_labels(amr)
                edge_labels = {v: k for k, v in edge_labels.items()}
            elif row[0] == 'node':
                type = 'subgraph'
                if row[3].startswith('*'):
                    type = 'dupl-subgraph'
                    row[3] = row[3].replace('*', '')
                if not row[3]:
                    raise Exception('Missing Annotation:', amr_id)
                node_id = row[1]
                if node_id not in node_labels:
                    raise Exception('Failed to parse node labels:', amr.id, node_id)
                n = node_labels[node_id]
                token_ids = [int(t) for t in row[3].split(',')]
                if any(t >= len(amr.tokens) for t in token_ids):
                    raise Exception('Bad Annotation:', amr_id)
                if tuple(token_ids) not in all_spans[amr_id] \
                        and any(t in taken for t in token_ids):
                    raise Exception('Bad Span Annotation', amr_id)
                all_spans[amr_id].add(tuple(token_ids))
                taken.update(token_ids)
                align = amr.get_alignment(subgraph_alignments, token_id=token_ids[0])
                if align and align.type == type:
                    align.nodes.append(n)
                else:
                    new_align = AMR_Alignment(type=type, tokens=token_ids, nodes=[n], amr=amr)
                    subgraph_alignments[amr.id].append(new_align)
            elif row[0] == 'edge':
                type = 'relation'
                if row[3].startswith('*'):
                    row[3] = row[3].replace('*', '')
                if not row[3]:
                    raise Exception('Missing Annotation:', amr_id)
                edge_id = row[1]
                if edge_id not in edge_labels:
                    raise Exception('Failed to parse edge labels:', amr.id, edge_id)
                e = edge_labels[edge_id]
                token_ids = [int(t) for t in row[3].split(',')]
                if any(t >= len(amr.tokens) for t in token_ids):
                    raise Exception('Bad Annotation:', amr_id)
                if tuple(token_ids) not in all_spans[amr_id] \
                        and any(t in taken for t in token_ids):
                    raise Exception('Bad Span Annotation', amr_id, token_ids)
                all_spans[amr_id].add(tuple(token_ids))
                taken.update(token_ids)
                align = amr.get_alignment(relation_alignments, token_id=token_ids[0])
                if align and align.type == type:
                    align.edges.append(e)
                else:
                    new_align = AMR_Alignment(type=type, tokens=token_ids, edges=[e], amr=amr)
                    relation_alignments[amr.id].append(new_align)
            elif row[0] == 'reentrancy':
                if not row[3]:
                    raise Exception('Missing Annotation:', amr_id)
                edge_id = row[1]
                e = edge_labels[edge_id]
                if row[3].startswith('*'):
                    row[3] = row[3].replace('*', '')
                if row[3] == '_':
                    # '_' inherits the span of the relation alignment for this edge
                    token_ids = amr.get_alignment(relation_alignments, edge=e).tokens
                else:
                    token_ids = [int(t) for t in row[3].split(',')]
                tag = row[4]
                if row[3] == '_':
                    tag = 'primary'
                if not tag:
                    raise Exception('Missing reentrancy tag:', amr.id)
                type = f'reentrancy:{tag}'
                if any(t >= len(amr.tokens) for t in token_ids):
                    raise Exception('Bad Annotation:', amr_id)
                if tuple(token_ids) not in all_spans[amr_id] \
                        and any(t in taken for t in token_ids):
                    raise Exception('Bad Span Annotation', amr_id, token_ids)
                all_spans[amr_id].add(tuple(token_ids))
                taken.update(token_ids)
                new_align = AMR_Alignment(type=type, tokens=token_ids, edges=[e], amr=amr)
                reentrancy_alignments[amr.id].append(new_align)

    for amr_id in subgraph_alignments:
        amr = amrs[amr_id]
        for t in range(len(amr.tokens)):
            if not any(t in span for span in all_spans[amr_id]):
                all_spans[amr_id].add((t,))
        spans = [list(span) for span in sorted(all_spans[amr_id], key=lambda x: x[0])]
        for align in subgraph_alignments[amr_id]:
            if align.nodes and not is_subgraph(amr, align.nodes):
                print('Possible Bad align:', amr.id, align.tokens,
                      ' '.join(amr.tokens[t] for t in align.tokens), file=sys.stderr)
        for align in relation_alignments[amr_id]:
            subgraph_aligns = [a for a in subgraph_alignments[amr.id]
                               if a.tokens == align.tokens]
            for s, r, t in align.edges:
                if subgraph_aligns and not any(s in a.nodes or t in a.nodes or not a.nodes
                                               for a in subgraph_aligns):
                    if r == ':manner' and amr.tokens[align.tokens[0]] == 'without':
                        continue
                    raise Exception('Bad Relation align:', amr.id, align.tokens, s, r, t)
        dupl_sub_aligns = [align for align in subgraph_alignments[amr_id]
                           if align.type.startswith('dupl')]
        subgraph_alignments[amr_id] = [align for align in subgraph_alignments[amr_id]
                                       if not align.type.startswith('dupl')]
        clean_alignments(amr, subgraph_alignments, dupl_sub_aligns, spans)
        clean_alignments(amr, relation_alignments, [], spans, mode='relations')
        for t, _ in enumerate(amr.tokens):
            count = [span for span in spans if t in span]
            if len(count) != 1:
                raise Exception('Bad Span:', amr.id, count)

    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.gold.json'
    print(f'Writing subgraph alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, subgraph_alignments)
    align_file = amr_file.replace('.txt', '') + '.relation_alignments.gold.json'
    print(f'Writing relation alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, relation_alignments)
    align_file = amr_file.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    print(f'Writing reentrancy alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, reentrancy_alignments)
def main():
    tamr_dir = '../data/tamr'
    szubert_amrs = '../data/szubert/szubert_amrs.txt'
    output = '../data/szubert/szubert_amrs.tamr.subgraph_alignments.json'
    file2 = '../data/tamr/ldc_train_2017.txt'

    reader = AMR_Reader()
    amrs = reader.load(szubert_amrs, remove_wiki=True)
    amrs2 = reader.load(file2, remove_wiki=True)

    # parse TAMR '# ::alignments' lines into subgraph alignments
    alignments = {}
    for filename in os.listdir(tamr_dir):
        if filename.endswith('.tamr_alignment'):
            file = os.path.join(tamr_dir, filename)
            amr_id = ''
            with open(file) as f:
                for line in f:
                    if line.startswith('# ::alignments'):
                        aligns = line[len('# ::alignments '):].split()
                        aligns = [s.split('|') for s in aligns if '|' in s]
                        aligns = [(a[0], a[1].split('+')) for a in aligns]
                        for span, nodes in aligns:
                            start = int(span.split('-')[0])
                            end = int(span.split('-')[1])
                            span = [t for t in range(start, end)]
                            align = AMR_Alignment(type='subgraph', tokens=span, nodes=nodes)
                            alignments[amr_id].append(align)
                    elif line.strip():
                        amr_id = line.strip()
                        alignments[amr_id] = []

    amrs2 = {amr.id: amr for amr in amrs2}
    amrs = [amr for amr in amrs if amr.id in alignments and amr.id in amrs2]

    # collect the AMRs whose graphs match the TAMR release
    amrs3 = []
    for amr in amrs[:]:
        amr2 = amrs2[amr.id]
        nodes = {amr.nodes[n] for n in amr.nodes}
        nodes2 = {amr2.nodes[n] for n in amr2.nodes}
        edges = {(amr.nodes[s], r, amr.nodes[t]) for s, r, t in amr.edges}
        edges2 = {(amr2.nodes[s], r, amr2.nodes[t]) for s, r, t in amr2.edges}
        if nodes == nodes2 and edges == edges2:
            amrs3.append(amr)

    amr_ids = [amr.id for amr in amrs]
    alignments = {amr_id: alignments[amr_id] for amr_id in alignments
                  if amr_id in amr_ids}

    # remap TAMR's 0-based dotted node ids ('0.1.2') to this repo's 1-based ids
    for amr in amrs:
        node_map = {}
        nodes = [n for align in alignments[amr.id] for n in align.nodes]
        nodes = [n for n in sorted(nodes, key=lambda x: (len(x), x))]
        for n in nodes:
            prefix = '.'.join(i for i in n.split('.')[:-1])
            last = int(n.split('.')[-1])
            if prefix:
                if prefix not in node_map:
                    new_prefix = '.'.join(str(int(i) + 1) for i in n.split('.')[:-1])
                    if new_prefix not in amr.nodes:
                        continue
                    node_map[prefix] = new_prefix
                new_n = node_map[prefix] + '.' + str(last + 1)
            else:
                new_n = str(last + 1)
            if new_n in amr.nodes:
                node_map[n] = new_n
        # resolve remaining nodes by following edges from the mapped parent
        nodes = [n for align in alignments[amr.id] for n in align.nodes
                 if n not in node_map]
        nodes = [n for n in sorted(nodes, key=lambda x: (len(x), x))]
        for n in nodes:
            prefix = '.'.join(i for i in n.split('.')[:-1])
            if prefix not in node_map:
                new_prefix = '.'.join(str(int(i) + 1) for i in n.split('.')[:-1])
                if new_prefix in amr.nodes:
                    node_map[prefix] = new_prefix
                else:
                    del alignments[amr.id]
                    break
            candidates = [t for s, r, t in amr.edges if s == node_map[prefix]]
            candidates = [t for t in candidates if t not in node_map.values()]
            candidates = [t for t in sorted(candidates)]
            if not candidates:
                del alignments[amr.id]
                break
            new_n = candidates[0]
            node_map[n] = new_n
        if amr.id in alignments:
            for align in alignments[amr.id]:
                align.nodes = [node_map[n] for n in align.nodes]
                align.amr = amr
            # give unaligned tokens empty subgraph alignments
            for t, tok in enumerate(amr.tokens):
                align = amr.get_alignment(alignments, token_id=t)
                if not align:
                    align = AMR_Alignment(type='subgraph', tokens=[t], nodes=[], amr=amr)
                    alignments[amr.id].append(align)
            alignments[amr.id] = [align for align in
                                  sorted(alignments[amr.id], key=lambda a: a.tokens[0])]

    reader.save_alignments_to_json(output, alignments)
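# Each .tamr_alignment file is parsed above as blocks of the form
# (illustrative values):
#
#   <amr id>
#   # ::alignments 0-1|0.0 1-2|0.1+0.1.0 ...
#
# where '0-1' is an end-exclusive token span and '0.0+0.1.0' lists the
# aligned node ids joined by '+'.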
def main():
    ldc_amrs_dir = sys.argv[1]
    lpp_amrs_file = 'data-release/amrs/little_prince.txt'
    additional_amrs_file = 'data-release/amrs/additional_amrs.txt'

    with open('data-release/leamr_dev_ids.txt') as f:
        dev_ids = [line.strip() for line in f if not line.strip().startswith('#')]
    with open('data-release/leamr_test_ids.txt') as f:
        test_ids = [line.strip() for line in f if not line.strip().startswith('#')]

    ldc_amrs_dir_train = os.path.join(ldc_amrs_dir, 'data/alignments/split', 'training')
    ldc_amrs_dir_dev = os.path.join(ldc_amrs_dir, 'data/alignments/split', 'dev')
    ldc_amrs_dir_test = os.path.join(ldc_amrs_dir, 'data/alignments/split', 'test')

    reader = AMR_Reader()
    ldc_amrs_train = reader.load_from_dir(ldc_amrs_dir_train)
    ldc_amrs_dev = reader.load_from_dir(ldc_amrs_dir_dev)
    ldc_amrs_test = reader.load_from_dir(ldc_amrs_dir_test)
    lpp_amrs = reader.load(lpp_amrs_file)
    add_amrs = reader.load(additional_amrs_file)

    all_amrs = ldc_amrs_train + ldc_amrs_dev + ldc_amrs_test + lpp_amrs + add_amrs
    all_amrs = {amr.id: amr for amr in all_amrs}
    dev_amrs = [all_amrs[amr_id] for amr_id in dev_ids]
    test_amrs = [all_amrs[amr_id] for amr_id in test_ids]
    print()

    output_file = 'data-release/amrs/ldc_train.txt'
    print('Writing LDC training AMRs to:', output_file)
    reader.write_to_file(output_file, ldc_amrs_train)

    output_file = 'data-release/amrs/ldc_dev.txt'
    print('Writing LDC development AMRs to:', output_file)
    reader.write_to_file(output_file, ldc_amrs_dev)

    output_file = 'data-release/amrs/ldc_test.txt'
    print('Writing LDC test AMRs to:', output_file)
    reader.write_to_file(output_file, ldc_amrs_test)

    output_file = 'data-release/amrs/leamr_dev.txt'
    print('Writing LEAMR development data to:', output_file)
    reader.write_to_file(output_file, dev_amrs)

    output_file = 'data-release/amrs/leamr_test.txt'
    print('Writing LEAMR test data to:', output_file)
    reader.write_to_file(output_file, test_amrs)

    output_file = 'data-release/amrs/ldc+little_prince.txt'
    print('Writing LDC + Little Prince data to:', output_file)
    all_amrs = ldc_amrs_train + ldc_amrs_dev + ldc_amrs_test + lpp_amrs
    reader.write_to_file(output_file, all_amrs)