# NOTE(review): this chunk is a whole round-trip test script collapsed onto one line
# (its newlines/indentation were lost) and it is TRUNCATED mid-loop — the body of the
# final "if i in clip_index_set:" is missing — so it cannot be safely reconstructed as
# runnable code from this excerpt alone. Visible flow: configure logging; build
# input/output filenames under amrlib/data; load reference AMR graphs from
# LDC2020T02/test.txt (get_graph_only strips metadata); re-serialize the same file via
# load_and_serialize to simulate model-generated graphs; then begin the same
# post-processing loop used after generation, with clip_index_set deliberately empty so
# the loop is a faithful copy of the generation post-process. Presumably the missing
# tail deserializes each serialized graph through penman and collects failures in
# bad_graphs — TODO confirm against the full script. ("throught" in the embedded
# comment is a typo for "through".)
# Ideally the process would be lossless, giving a SMATCH score of 1.0 if __name__ == '__main__': setup_logging(logfname='logs/serial_deserial.log', level=WARN) corpus_dir = 'amrlib/data/LDC2020T02' in_fn = 'test.txt' out_dir = 'amrlib/data/test_parse_t5' ref_out_fn = in_fn + '.roundtrip_ref' gen_out_fn = in_fn + '.roundtrip_gen' # Make the out directory os.makedirs(out_dir, exist_ok=True) # Load the reference graphs fname = os.path.join(corpus_dir, in_fn) print('Loading', fname) ref_amr_entries = load_amr_entries(fname) ref_in_graphs = [get_graph_only(e) for e in ref_amr_entries] print('Loaded %d reference graphs' % len(ref_in_graphs)) # Simulate the generated graphs by running the original references throught the serializer print('Reloading and serializing', fname) gen_in_graphs = load_and_serialize(fname) gen_in_graphs = gen_in_graphs['serials'] clip_index_set = set() # empty - just so code below is a copy of post-process for generation # process the generated graphs through penman gen_out_graphs = [] bad_graphs = set() for i, graph in enumerate(gen_in_graphs): # skip anything on the clips if i in clip_index_set:
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
from amrlib.graph_processing.amr_plot import AMRPlot
from amrlib.graph_processing.amr_loading import load_amr_entries


# Small utility: load the LDC2020T02 test corpus, pick one AMR entry and
# render it with graphviz (AMRPlot.view opens the generated plot).
if __name__ == '__main__':
    input_file = 'amrlib/data/LDC2020T02/test.txt'
    snum = 4    # id numbers start at 1 so they are 1 more than snum

    # Read every AMR entry out of the corpus file
    amr_entries = load_amr_entries(input_file)
    print('Found %d entries' % len(amr_entries))
    print()

    # Build the plot for the selected entry and display it
    selected_entry = amr_entries[snum]
    plotter = AMRPlot()
    plotter.build_from_graph(selected_entry, debug=False)
    plotter.view()
# NOTE(review): this chunk is the front half of an RBW-aligner comparison script,
# collapsed onto one line. Its import header lies before this excerpt (logging, WARN,
# setup_logging, silence_penman, load_amr_entries, load_spacy, add_lemmas, partial,
# Pool are all defined elsewhere) and it is TRUNCATED after the annotation loop, so it
# is not reconstructable as runnable code from here. Visible flow: configure logging
# and silence penman warnings; load the AMR 1.0 test-consensus entries (the commented
# in_fname/out_fname pair switches to the dev split); load spaCy once in the parent
# process so worker processes don't each re-load it; then annotate entries in parallel
# with multiprocessing Pool.imap, keeping only graphs where add_lemmas succeeded
# (returns non-None). "concensus" in the embedded comment is a typo for "consensus";
# the actual path strings are spelled correctly and must not be changed.
logger = logging.getLogger(__name__) # Run the aligner on the LDC files with existing alignments for comparison # The ISI hand alignments are for LDC2014T12 (AMR1) the test-concensus.txt and dev-concensus.txt files if __name__ == '__main__': setup_logging(level=WARN, logfname='logs/rbw_aligner.log') silence_penman() in_fname = 'amrlib/data/amr_annotation_1.0/data/split/test/amr-release-1.0-test-consensus.txt' out_fname = 'amrlib/data/alignments/test-aligned.txt' # in_fname = 'amrlib/data/amr_annotation_1.0/data/split/dev/amr-release-1.0-dev-consensus.txt' # out_fname = 'amrlib/data/alignments/dev-aligned.txt' # Load and convert to a penman graph print('Loading', in_fname) entries = load_amr_entries(in_fname) print('Loaded %d entries' % len(entries)) # Convert to penman and add lemmas print('Annotating') load_spacy( ) # do this in the main process to prevent doing it multiple times graphs = [] annotate = partial(add_lemmas, snt_key='snt', verify_tok_key=None) # no existing tok key with Pool() as pool: for graph in pool.imap(annotate, entries): if graph is not None: graphs.append(graph) print('%d graphs left with the same tokenization length' % len(graphs))
# NOTE(review): this chunk is the middle of a sentence-generation evaluation script,
# collapsed onto one line and TRUNCATED at both ends: corpus_dir, test_fn, model_dir
# and device are all defined before this excerpt, and the clip-filtering / BLEU step
# the trailing comment announces lies after it — so it cannot be reconstructed as
# runnable code from here. Visible flow: set the run parameters (batch_size=4,
# num_beams=16, use_tense/rm_clips flags); derive output filenames that encode those
# parameters (e.g. ".tagged.clipped.beam16"); load the test graphs and their reference
# sentences; build the Inference wrapper; generate sentences, receiving both the
# answers and the indexes of graphs that were clipped (truncated) during generation.
# Presumably clipped graphs are then dropped from both answers and references before
# scoring — TODO confirm against the full script.
# (num_beams=16, batch_size=4) run-time = 29min batch_size = 4 num_beams = 16 use_tense = True rm_clips = True # Create the filenames based on above parameters extension = '.tagged' if use_tense else '.nowiki' extension += '.clipped' if rm_clips else '.noclip' extension += '.beam' + str(num_beams) gen_fn = 'test.txt.generated' + extension ref_fn = 'test.txt.ref_sents' + extension fpath = os.path.join(corpus_dir, test_fn) print('Loading test data from', fpath) graphs = load_amr_entries(fpath) sents = [get_sentence(g) for g in graphs] print('Loading model, tokenizer and data') inference = Inference(model_dir, batch_size=batch_size, num_beams=num_beams, device=device) print('Generating') answers, clips = inference.generate(graphs, disable_progress=False, use_tense=use_tense) # Filter out any clipped graphs as invalid tests # This will raise the BLEU score
# NOTE(review): this chunk is a training-data builder for the t5wtense generation
# model, collapsed onto one line and TRUNCATED right after "print('Saving to', outfn)"
# — the actual write of the .tdata file happens beyond this excerpt, so it is not
# reconstructable as runnable code from here. Its imports (os, shuffle, tqdm,
# setup_logging, WARN, silence_penman, load_amr_entries, ModelInputHelper) are also
# defined before the excerpt. Visible flow: for each of dev/test/train
# "*.features.nowiki" files, load the AMR entries, produce a POS-tagged copy of each
# entry via ModelInputHelper(...).get_tagged_with_meta(), then (per the commented-out
# alternative that saved tagged data alone to a ".tagged" file) concatenate original
# and tagged entries, shuffle them together, and target a combined ".tdata" output
# file — matching the xx.nowiki / xx.nowiki.tagged / xx.nowiki.tdata naming scheme
# described at the top of the embedded comments.
# xx.nowiki # standard AMR # xx.nowiki.tagged # pos tags added # xx.nowiki.tdata # the above 2 combined # Take graphs that are annotated (tokens, pos, ...) and align them then tag the graphs. # Save files with the tagged and untagged data together in a single training file if __name__ == '__main__': setup_logging(level=WARN, logfname='logs/create_td_gen_t5wtense.log') silence_penman() data_dir = 'amrlib/data/tdata_generate_t5wtense' base_fns = ('dev.txt', 'test.txt', 'train.txt') # Loop through the files for base_fn in base_fns: infn = os.path.join(data_dir, base_fn + '.features.nowiki') print('Loading and processing', infn) entries = load_amr_entries(infn) tagged_entries = [] for entry in tqdm(entries, ncols=100): tagged_entry = ModelInputHelper(entry).get_tagged_with_meta() tagged_entries.append(tagged_entry) # Save tagged data only to a new file # outfn = infn + '.tagged' # print('Saving to', outfn) # with open(outfn, 'w') as f: # for entry in tagged_entries: # f.write(entry + '\n\n') # Save the tagged and untagged entries into a single file, shuffled together all_entries = entries + tagged_entries shuffle(all_entries) outfn = infn + '.tdata' print('Saving to', outfn)