# =====================================================================
# Example #1
# =====================================================================
#!/usr/bin/python3
import os
import penman
# Imports reconstructed; module paths assumed from the amrlib codebase
from amrlib.utils.logging import setup_logging, WARN
from amrlib.graph_processing.amr_loading import load_amr_entries, get_graph_only
from amrlib.models.parse_t5.penman_serializer import load_and_serialize

# Round-trip the reference graphs through the serializer and deserializer.
# Ideally the process would be lossless, giving a SMATCH score of 1.0
if __name__ == '__main__':
    setup_logging(logfname='logs/serial_deserial.log', level=WARN)
    corpus_dir = 'amrlib/data/LDC2020T02'
    in_fn      = 'test.txt'
    out_dir    = 'amrlib/data/test_parse_t5'
    ref_out_fn = in_fn + '.roundtrip_ref'
    gen_out_fn = in_fn + '.roundtrip_gen'

    # Make the out directory
    os.makedirs(out_dir, exist_ok=True)

    # Load the reference graphs
    fname = os.path.join(corpus_dir, in_fn)
    print('Loading', fname)
    ref_amr_entries = load_amr_entries(fname)
    ref_in_graphs   = [get_graph_only(e) for e in ref_amr_entries]
    print('Loaded %d reference graphs' % len(ref_in_graphs))

    # Simulate the generated graphs by running the original references through the serializer
    print('Reloading and serializing', fname)
    gen_in_graphs  = load_and_serialize(fname)
    gen_in_graphs  = gen_in_graphs['serials']
    clip_index_set = set()  # empty here; kept so the code below mirrors the post-processing used for generation

    # process the generated graphs through penman
    gen_out_graphs = []
    bad_graphs     = set()
    for i, graph in enumerate(gen_in_graphs):
        # skip anything on the clips
        if i in clip_index_set:
            continue
        # Decode the serialized string back into a penman graph; anything
        # that fails to decode is tracked in bad_graphs (reconstructed
        # handling; the original may differ in detail)
        try:
            gen_out_graphs.append(penman.decode(graph))
        except Exception:
            bad_graphs.add(i)
    print('%d graphs failed to decode' % len(bad_graphs))
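
    # A minimal sketch of scoring the round trip with SMATCH, assuming
    # amrlib's evaluate.smatch_enhanced.compute_smatch (an assumption, not
    # part of this script). References whose graph failed to decode are
    # dropped so the two lists stay aligned.
    from amrlib.evaluate.smatch_enhanced import compute_smatch
    gold = [g for i, g in enumerate(ref_in_graphs) if i not in bad_graphs]
    test = [penman.encode(g) for g in gen_out_graphs]
    precision, recall, f_score = compute_smatch(test, gold)
    print('SMATCH: P=%.3f  R=%.3f  F1=%.3f' % (precision, recall, f_score))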

# =====================================================================
# Example #2
# =====================================================================
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
from amrlib.graph_processing.amr_plot import AMRPlot
from amrlib.graph_processing.amr_loading import load_amr_entries

if __name__ == '__main__':
    input_file = 'amrlib/data/LDC2020T02/test.txt'
    snum = 4  # 0-based index; id numbers start at 1, so the entry's id is snum + 1

    # Load the file
    entries = load_amr_entries(input_file)
    print('Found %d entries' % len(entries))
    print()

    # Plot the selected AMR entry
    entry = entries[snum]
    plot = AMRPlot()
    plot.build_from_graph(entry, debug=False)
    plot.view()
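
    # Illustrative addition (not part of the original script): print the
    # entry's '::id' metadata to confirm the index-to-id offset noted above
    for line in entry.splitlines():
        if line.startswith('# ::id'):
            print('Plotted', line[len('# ::id'):].strip())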

# =====================================================================
# Example #3
# =====================================================================
import logging
from functools import partial
from multiprocessing import Pool
# Imports reconstructed; module paths assumed from the amrlib codebase
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.graph_processing.annotator import add_lemmas, load_spacy

logger = logging.getLogger(__name__)

# Run the aligner on the LDC files with existing alignments for comparison
# The ISI hand alignments are for LDC2014T12 (AMR1): the test-consensus.txt and dev-consensus.txt files
if __name__ == '__main__':
    setup_logging(level=WARN, logfname='logs/rbw_aligner.log')
    silence_penman()

    in_fname = 'amrlib/data/amr_annotation_1.0/data/split/test/amr-release-1.0-test-consensus.txt'
    out_fname = 'amrlib/data/alignments/test-aligned.txt'
    # in_fname  = 'amrlib/data/amr_annotation_1.0/data/split/dev/amr-release-1.0-dev-consensus.txt'
    # out_fname = 'amrlib/data/alignments/dev-aligned.txt'

    # Load and convert to a penman graph
    print('Loading', in_fname)
    entries = load_amr_entries(in_fname)
    print('Loaded %d entries' % len(entries))

    # Convert to penman and add lemmas
    print('Annotating')
    load_spacy()  # do this in the main process to prevent doing it multiple times
    graphs = []
    annotate = partial(add_lemmas, snt_key='snt', verify_tok_key=None)  # no existing tok key
    with Pool() as pool:
        for graph in pool.imap(annotate, entries):
            if graph is not None:
                graphs.append(graph)
    print('%d graphs left with the same tokenization length' % len(graphs))
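
    # A minimal sketch of the alignment step that follows, assuming amrlib's
    # documented RBWAligner API (from_penman_w_json / get_graph_string); the
    # original script's handling may differ
    from amrlib.alignments.rbw_aligner import RBWAligner
    print('Aligning')
    with open(out_fname, 'w') as f:
        for graph in graphs:
            aligner = RBWAligner.from_penman_w_json(graph)
            f.write(aligner.get_graph_string() + '\n\n')
    print('Saved to', out_fname)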

# =====================================================================
# Example #4
# =====================================================================
import os
# Imports reconstructed; module paths assumed from the amrlib codebase
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.models.generate_t5wtense.inference import Inference

def get_sentence(entry):
    # Assumed helper (not shown in this excerpt): pull the '# ::snt'
    # metadata line out of an AMR entry
    for line in entry.splitlines():
        if line.startswith('# ::snt'):
            return line[len('# ::snt'):].strip()
    return None

if __name__ == '__main__':
    # The values below are illustrative assumptions; the excerpt starts
    # mid-script and does not show the real paths or device
    corpus_dir = 'amrlib/data/tdata_generate_t5wtense'
    test_fn    = 'test.txt.features.nowiki'
    model_dir  = 'amrlib/data/model_generate_t5wtense'
    device     = 'cuda:0'
    # Timing note: (num_beams=16, batch_size=4) run-time = 29min
    batch_size = 4
    num_beams  = 16
    use_tense  = True
    rm_clips   = True

    # Create the filenames based on above parameters
    extension = '.tagged' if use_tense else '.nowiki'
    extension += '.clipped' if rm_clips else '.noclip'
    extension += '.beam' + str(num_beams)
    gen_fn = 'test.txt.generated' + extension
    ref_fn = 'test.txt.ref_sents' + extension
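
    # With the settings above these evaluate to:
    #   gen_fn = 'test.txt.generated.tagged.clipped.beam16'
    #   ref_fn = 'test.txt.ref_sents.tagged.clipped.beam16'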

    fpath = os.path.join(corpus_dir, test_fn)
    print('Loading test data from', fpath)
    graphs = load_amr_entries(fpath)
    sents = [get_sentence(g) for g in graphs]

    print('Loading model, tokenizer and data')
    inference = Inference(model_dir,
                          batch_size=batch_size,
                          num_beams=num_beams,
                          device=device)

    print('Generating')
    answers, clips = inference.generate(graphs,
                                        disable_progress=False,
                                        use_tense=use_tense)

    # Filter out any clipped graphs as invalid tests
    # (removing them raises the measured BLEU score)
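
    # A hedged sketch of the filtering, assuming `clips` holds the indices of
    # entries whose input graph was clipped (the return format of
    # inference.generate is not shown in this excerpt)
    if rm_clips:
        answers = [a for i, a in enumerate(answers) if i not in clips]
        sents   = [s for i, s in enumerate(sents)   if i not in clips]
        print('%d entries remain after clip filtering' % len(answers))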

# =====================================================================
# Example #5
# =====================================================================
import os
from random import shuffle
from tqdm import tqdm
# Imports reconstructed; module paths assumed from the amrlib codebase
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.models.generate_t5wtense.model_input_helper import ModelInputHelper

# File extensions used below:
# xx.nowiki           # standard AMR
# xx.nowiki.tagged    # pos tags added
# xx.nowiki.tdata     # the above 2 combined
# Take graphs that are annotated (tokens, pos, ...), align them, then tag the
# graphs. Save the tagged and untagged data together in a single training file.
if __name__ == '__main__':
    setup_logging(level=WARN, logfname='logs/create_td_gen_t5wtense.log')
    silence_penman()
    data_dir = 'amrlib/data/tdata_generate_t5wtense'
    base_fns = ('dev.txt', 'test.txt', 'train.txt')

    # Loop through the files
    for base_fn in base_fns:
        infn = os.path.join(data_dir, base_fn + '.features.nowiki')
        print('Loading and processing', infn)
        entries = load_amr_entries(infn)
        tagged_entries = []
        for entry in tqdm(entries, ncols=100):
            tagged_entry = ModelInputHelper(entry).get_tagged_with_meta()
            tagged_entries.append(tagged_entry)
        # Save tagged data only to a new file
        # outfn = infn + '.tagged'
        # print('Saving to', outfn)
        # with open(outfn, 'w') as f:
        #     for entry in tagged_entries:
        #         f.write(entry + '\n\n')
        # Save the tagged and untagged entries into a single file, shuffled together
        all_entries = entries + tagged_entries
        shuffle(all_entries)
        outfn = infn + '.tdata'
        print('Saving to', outfn)
        # Write the combined entries, mirroring the commented-out block above
        with open(outfn, 'w') as f:
            for entry in all_entries:
                f.write(entry + '\n\n')
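
        # Illustrative sanity check (not in the original script): the file
        # should load back with both the tagged and untagged entries
        reloaded = load_amr_entries(outfn)
        assert len(reloaded) == len(all_entries)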