def load_and_resave_amr_files(glob_patterns, out_fpath):
    """Load AMR graphs from files matched by one or more glob patterns and
    re-save them, collated, to a single output file.

    :param glob_patterns: a single glob string or a list of glob strings
    :param out_fpath: path of the collated output file to write
    :return: the list of loaded graph strings
    :raises FileNotFoundError: if any pattern matches no files
    """
    if isinstance(glob_patterns, str):
        glob_patterns = [glob_patterns]
    # Find all the files
    fpaths = []
    for pattern in glob_patterns:
        glob_fpaths = glob(pattern)
        # An empty match almost always means an invalid path in the list; fail
        # loudly instead of silently producing a short output file.  A raise is
        # not stripped under "python -O" the way the original assert would be.
        if not glob_fpaths:
            raise FileNotFoundError('No files match pattern %s' % pattern)
        fpaths += glob_fpaths
    graphs = []
    # Load graphs sorted by filename for consistency
    for fpath in sorted(fpaths, key=lambda x: os.path.basename(x)):
        print('Loading', fpath)
        graphs.extend(load_raw_amr(fpath))
    print('Loaded {:,} graphs'.format(len(graphs)))
    # Save the collated data, one blank line between graphs
    print('Saving data to', out_fpath)
    with open(out_fpath, 'w') as f:
        for graph in graphs:
            f.write('%s\n\n' % graph)
    print()
    return graphs
#!/usr/bin/python3
# Collate the raw LDC2020T02 (AMR 3.0) split directories into single
# train.txt / dev.txt / test.txt files.
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import os
from amrlib.graph_processing.amr_loading_raw import load_raw_amr

if __name__ == '__main__':
    base_dir = 'amrlib/data/amr_annotation_3.0/data/amrs/split'
    out_dir  = 'amrlib/data/LDC2020T02'
    os.makedirs(out_dir, exist_ok=True)
    # Loop through the directories
    for dirname in ('dev', 'test', 'training'):
        entries = []
        dn = os.path.join(base_dir, dirname)
        print('Loading data from', dn)
        # Sort the filenames so the collated output ordering is deterministic.
        # os.listdir() returns entries in arbitrary, platform-dependent order,
        # which would make the output files differ from run to run / machine to
        # machine; the sibling collation scripts sort their file lists too.
        fpaths = [os.path.join(dn, fn) for fn in sorted(os.listdir(dn))]
        for fpath in fpaths:
            entries += load_raw_amr(fpath)
        print('Loaded {:,} entries'.format(len(entries)))
        # Save the collated data, one blank line between entries
        fn = 'train.txt' if dirname == 'training' else dirname + '.txt'
        out_path = os.path.join(out_dir, fn)
        print('Saving data to', out_path)
        with open(out_path, 'w') as f:
            for entry in entries:
                f.write('%s\n\n' % entry)
        print()
os.makedirs(out_dir, exist_ok=True)
# Get all the amr files and put dev-consensus.txt on top, followed by test-consensus.txt
# to make scoring easier
fpaths = [y for x in os.walk(amr_dir) for y in glob(os.path.join(x[0], '*.txt'))]
fpaths = sorted([fp for fp in fpaths if fp not in (dev_fp, test_fp)])
fpaths = [dev_fp, test_fp] + fpaths
# Load all the entries
print('Loading data')
sents, gstrings = [], []
hit_max = False     # set when the max_entries cap is reached, to stop the outer file loop
for fpath in fpaths:
    amr_strings = load_raw_amr(fpath)
    entries = get_graph_sent(amr_strings)
    # Append the data
    # Filter "(a / amr-empty)" in amr-release-1.0-proxy.txt that might be causing issues
    # So long as this is above the dev/test data (ends at index 200) it won't mess-up scoring
    for sent, graph in zip(entries['sent'], entries['graph']):
        if sent == '.':
            print('Removed empty entry at index %d from %s' % (len(sents), fpath))
            assert len(sents) > 200     # this will mess-up scoring
            continue
        sents.append(sent)
        gstrings.append(graph)
        if max_entries and len(gstrings) >= max_entries:
            # Bug fix: a bare "break" here only exited this inner loop, so the
            # outer loop kept loading files and appended one extra entry per
            # remaining file, exceeding the max_entries cap.
            hit_max = True
            break
    if hit_max:
        break
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import os
from glob import glob
from amrlib.graph_processing.amr_loading_raw import load_raw_amr

# Collect all the amr graphs from multiple files and create a gold test file.
# This simply concatenates files and cleans a few bad characters out. The glob pattern
# needs to be exactly the same as what's in generate so the output graph ordering is the same.
if __name__ == '__main__':
    glob_pattern = 'amrlib/data/amr_annotation_3.0/data/amrs/split/test/*.txt'
    out_fpath = 'amrlib/data/model_parse_spring/test-gold.txt.wiki'
    # Gather the graphs from every matched file, in sorted path order so the
    # result lines up with the generated output
    print('Loading data from', glob_pattern)
    graphs = []
    for path in sorted(glob(glob_pattern)):
        graphs += load_raw_amr(path)
    print('Loaded {:,} graphs'.format(len(graphs)))
    # Write the collated graphs, separated by blank lines
    print('Saving data to', out_fpath)
    with open(out_fpath, 'w') as outfile:
        outfile.writelines('%s\n\n' % g for g in graphs)
    print()