def test_for_decode_encode_issue(gold):
    graph = penman.decode(gold, model=NoOpModel())
    test  = penman.encode(graph, indent=6, compact=True, model=NoOpModel())
    gold  = to_graph_line(gold)
    test  = to_graph_line(test)
    is_good = test == gold
    return graph, is_good
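# Usage sketch (not from the source): a hypothetical gold AMR string, assuming the
# to_graph_line helper used above is in scope.
example_gold = '(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))'
graph, is_good = test_for_decode_encode_issue(example_gold)
if not is_good:
    print('Gold graph does not survive the decode/encode round trip')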
def load_amrs_cached(amr_fpath):
    global pgraph_cache
    pgraphs = pgraph_cache.get(amr_fpath, None)
    if pgraphs is None:
        pgraphs = penman.load(amr_fpath, model=NoOpModel())
        pgraph_cache[amr_fpath] = pgraphs
    return pgraphs
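# Note: the function above assumes a module-level cache dict.  A minimal sketch of the
# expected declaration and a hypothetical call (the file path is illustrative only):
pgraph_cache = {}
graphs = load_amrs_cached('data/example_amrs.txt')   # parsed once; later calls hit the cache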
def __init__(self, gstring):
    self.graph = penman.decode(gstring, model=NoOpModel())
    # Run the serialization
    self.elements = []      # clear elements list
    self.nodes    = set()   # nodes visited (to prevent recursion)
    self.serialize(self.graph.top)
    self.tokens = self.elements_to_tokens(self.elements)
def add_lemmas(entry, snt_key, verify_tok_key=None):
    global spacy_nlp
    load_spacy()
    graph = penman.decode(entry, model=NoOpModel())     # do not de-invert graphs
    doc = spacy_nlp(graph.metadata[snt_key])
    nlp_tokens = [t.text for t in doc]
    graph.metadata['tokens'] = json.dumps(nlp_tokens)
    # Create lemmas
    # SpaCy's lemmatizer returns -PRON- for pronouns so strip these.
    # Don't try to lemmatize any named-entities or proper nouns.  Lower-case any other words.
    lemmas = []
    for t in doc:
        if t.lemma_ == '-PRON-':
            lemma = t.text.lower()
        elif t.tag_.startswith('NNP') or t.ent_type_ not in ('', 'O'):
            lemma = t.text
        else:
            lemma = t.lemma_.lower()
        lemmas.append(lemma)
    graph.metadata['lemmas'] = json.dumps(lemmas)
    # If verify_tok_key is not None, verify that the new tokenization is the same as the existing
    # one, and only return the graph if the tokenized lengths match.
    if verify_tok_key is not None:
        isi_tokens = graph.metadata[verify_tok_key].split()
        if len(isi_tokens) == len(lemmas) == len(nlp_tokens):
            return graph
        else:
            return None
    else:
        return graph
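# Usage sketch (hypothetical sentence; assumes spaCy is installed and load_spacy() has set
# the module-level spacy_nlp):
entry = '# ::snt The boy wants to go.\n(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))'
annotated = add_lemmas(entry, snt_key='snt')
print(annotated.metadata['tokens'], annotated.metadata['lemmas'])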
def from_string_w_json(cls, graph, token_key='tokens', lemma_key='lemmas', **kwargs):
    assert isinstance(graph, str)
    graph = penman.decode(graph, model=NoOpModel())
    return cls.from_penman_w_json(graph, token_key, lemma_key, **kwargs)
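# Usage sketch (an assumption, not confirmed by the source): treating this as the RBWAligner
# classmethod referenced elsewhere in these files.  gstring_with_json is a hypothetical AMR
# string whose metadata already carries JSON 'tokens' and 'lemmas' (e.g. from add_lemmas above).
aligner = RBWAligner.from_string_w_json(gstring_with_json)
aligned_graph = aligner.get_penman_graph()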
def __init__(self, graph, force_annotate=False):
    # Convert or copy the input graph to penman format
    if isinstance(graph, str):
        pgraph = penman.decode(graph, model=NoOpModel())
    elif isinstance(graph, penman.graph.Graph):
        pgraph = deepcopy(graph)
    else:
        raise ValueError('Code requires either a string or a penman graph')
    # Annotate if needed (aligner/tagging require annotation)
    is_annotated = all(key in pgraph.metadata for key in ('tokens', 'lemmas', 'pos_tags'))
    if not is_annotated or force_annotate:
        sentence = pgraph.metadata['snt']   # Sanity check for the required tag. Throws KeyError if missing.
        pgraph = annotate_penman(pgraph)
        self.annotation_performed = True    # for unit-testing and debug
    else:
        self.annotation_performed = False
    # Align the graph.  For simplicity, always do this.
    # If there are existing alignments they need to be removed.
    # See https://penman.readthedocs.io/en/latest/api/penman.surface.html
    if penman.surface.alignments(pgraph) or penman.surface.role_alignments(pgraph):
        for key, items in pgraph.epidata.items():
            pgraph.epidata[key] = [x for x in items if not isinstance(x, penman.surface.AlignmentMarker)]
    pgraph = RBWAligner.from_penman_w_json(pgraph).get_penman_graph()
    # Get the graph string and pos tags for the tagger
    self.metadata = pgraph.metadata.copy()
    pos_tags = json.loads(self.metadata['pos_tags'])
    pgraph.metadata = {}
    gstring = penman.encode(pgraph, model=NoOpModel(), indent=6)
    # Tag the graph string
    self.gstring_tagged = self.tag(gstring, pos_tags)
def build_from_graph(self, entry, debug=False, allow_deinvert=False):
    # Parse the AMR text
    if allow_deinvert:
        penman_graph = penman.decode(entry)
    else:
        model = NoOpModel()     # does not de-invert edges
        penman_graph = penman.decode(entry, model=model)
    # Build g.instances() => concept relations (these are nodes)
    for t in penman_graph.instances():
        self._add_instance(t)
        if debug: print(t)
    # Build g.edges() => relations between nodes
    for t in penman_graph.edges():
        self._add_edge(t)
        if debug: print(t)
    # Build g.attributes() => relations between nodes and a constant
    for t in penman_graph.attributes():
        self._add_attribute(t)
        if debug: print(t)
def gather_test_graphs():
    # These are for amr_annotation_3.0/data/multisentence/ms-amr-split/test/msamr_dfa_007.xml
    fn = 'data/amr_annotation_3.0/data/amrs/unsplit/amr-release-3.0-amrs-dfa.txt'
    gids = ["DF-200-192400-625_7046.1",  "DF-200-192400-625_7046.2",  "DF-200-192400-625_7046.3",
            "DF-200-192400-625_7046.4",  "DF-200-192400-625_7046.5",  "DF-200-192400-625_7046.6",
            "DF-200-192400-625_7046.7",  "DF-200-192400-625_7046.8",  "DF-200-192400-625_7046.9",
            "DF-200-192400-625_7046.10", "DF-200-192400-625_7046.11", "DF-200-192400-625_7046.12",
            "DF-200-192400-625_7046.13", "DF-200-192400-625_7046.14", "DF-200-192400-625_7046.15",
            "DF-200-192400-625_7046.16", "DF-200-192400-625_7046.17", "DF-200-192400-625_7046.18"]
    # Load the AMR file with penman, then extract the specific ids and put them in order
    pgraphs = penman.load(fn, model=NoOpModel())
    ordered_pgraphs = [None] * len(gids)
    for pgraph in pgraphs:
        gid = pgraph.metadata['id']
        doc_idx = gids.index(gid) if gid in gids else None
        if doc_idx is not None:
            ordered_pgraphs[doc_idx] = pgraph
    assert None not in ordered_pgraphs
    return ordered_pgraphs
def get_addresses(cls, graph):
    results = []
    def add_result(addr, name, type):
        results.append(SimpleNamespace(addr=addr, name=name, type=type))
    tree = penman.configure(graph, model=NoOpModel())
    for path, branch in cls.walk_tree(tree.node, (1,)):
        # Get the node and attribute addresses
        if penman.tree.is_atomic(branch[1]):    # ==> is None or isinstance(x, (str, int, float))
            address = '.'.join(map(str, path))
            concept = branch[1]
            if concept.startswith('"'):         # Attribute
                add_result(address, concept, 'attrib')
            else:
                add_result(address, concept, 'node')
        # Get the edge addresses
        if penman.tree.is_atomic(branch[0]) and branch[0] != '/':
            address = '.'.join(map(str, path))
            edge_name = branch[0]
            add_result(address, edge_name, 'role')
    return results
print('%d generated graphs do not deserialize out of %d = %.1f%%' % (len(bad_graphs), num_non_clipped, pct))
print()
# Save the reference, omitting any clipped or bad graphs
ref_fpath = os.path.join(out_dir, ref_out_fn)
print('Saving', ref_fpath)
skipped = 0
with open(ref_fpath, 'w') as f:
    for i, graph in enumerate(ref_in_graphs):
        if i in bad_graphs or i in clip_index_set:
            skipped += 1
            continue
        f.write(graph + '\n\n')
print('Skipped writing %d as either bad or clipped' % skipped)
print('Wrote a total of %d reference AMR graphs' % (len(ref_in_graphs) - skipped))
print()
# Save the generated graphs
gen_fpath = os.path.join(out_dir, gen_out_fn)
print('Saving', gen_fpath)
penman.dump(gen_out_graphs, gen_fpath, indent=6, model=NoOpModel())
print('Wrote a total of %d generated AMR graphs' % len(gen_out_graphs))
print()
# Score the resultant files
print('Scoring the above files with SMATCH')
gold_entries = get_entries(ref_fpath)
test_entries = get_entries(gen_fpath)
precision, recall, f_score = compute_smatch(test_entries, gold_entries)
print('SMATCH -> P: %.3f, R: %.3f, F: %.3f' % (precision, recall, f_score))
# Convert to penman and add lemmas
print('Annotating')
load_spacy()    # do this in the main process to prevent doing it multiple times
graphs = []
annotate = partial(add_lemmas, snt_key='snt', verify_tok_key=None)  # no existing tok key
with Pool() as pool:
    for graph in pool.imap(annotate, entries):
        if graph is not None:
            graphs.append(graph)
print('%d graphs left with the same tokenization length' % len(graphs))
# Run the aligner
print('Aligning Graphs')
new_graphs = []
keep_keys = ('id', 'snt', 'tokens', 'lemmas', 'rbw_alignments')
for graph in graphs:
    aligner = RBWAligner.from_penman_w_json(graph, align_str_name='rbw_alignments')
    pgraph = aligner.get_penman_graph()
    pgraph.metadata = {k: v for k, v in pgraph.metadata.items() if k in keep_keys}
    new_graphs.append(pgraph)
# Save the graphs
print('Saving to', out_fname)
penman.dump(new_graphs, out_fname, model=NoOpModel(), indent=6)
from typing import List
from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr
import penman
import logging

op_model   = Model()
noop_model = NoOpModel()
amr_model  = amr.model
DEFAULT    = op_model

# Mute loggers
penman.layout.logger.setLevel(logging.CRITICAL)
penman._parse.logger.setLevel(logging.CRITICAL)

def _get_model(dereify):
    if dereify is None:
        return DEFAULT
    elif dereify:
        return op_model
    else:
        return noop_model

def _remove_wiki(graph):
def get_graph_string(self):
    return penman.encode(self.graph, model=NoOpModel(), indent=6)
def deserialize(self, gstring):
    node_stack = []     # list of previously instantiated nodes
    node_depth = 0      # number of left parens encountered and not destacked
    triple = []
    # Tokenize and some logging (the system has a lot of unbalanced parentheses)
    tokens = self.graph_tokenize(gstring)
    # left_parens  = tokens.count('(')
    # right_parens = tokens.count(')')
    # if left_parens != right_parens:
    #     logger.warning('gid=%s has %d left parens and %d right parens' % (self.gid, left_parens, right_parens))
    # Loop through all tokens and parse the string
    for tnum, token in enumerate(tokens):
        #### Big case statement to classify parts of the graph string ####
        ttype = self.token_type(token)      # Mostly ignored but can be used for error checking
        if token == '(':
            node_depth += 1
        # Find the source for the triple
        elif len(triple) == 0 and ttype == TType.concept:
            # This path should only happen for a new graph.  Make a somewhat arbitrary choice to
            # either stop parsing or to clear out the existing triples to prevent disconnected graphs.
            if len(self.triples) > 0:
                logger.error('gid=%s Initial node constructed when triples not empty.' % (self.gid))
                if len(self.triples) > len(tokens)/4:   # if > half done (on average ~2 tokens per triple)
                    break
                else:
                    self.triples = []
            variable, concept, is_new_node = self.get_var_concept(token)
            triple.append(variable)
            if is_new_node:
                node_stack.append(variable)
            # Some error logging
            if is_new_node and tokens[tnum-1] != '(':
                logger.warning('gid=%s Missing starting paren for node %s/%s' % (self.gid, variable, concept))
            if not is_new_node and tokens[tnum-1] == '(':
                logger.warning('gid=%s Start paren present but %s is not a new concept' % (self.gid, concept))
        elif len(triple) == 0 and ttype == TType.role:
            variable = node_stack[-1]
            triple.append(variable)
            triple.append(token)
        # Look for the role (aka edge)
        elif len(triple) == 1 and ttype == TType.role:
            triple.append(token)
        # Look for the target
        elif len(triple) == 2 and ttype == TType.attrib:
            triple.append(token)
        elif len(triple) == 2 and ttype == TType.concept:
            variable, concept, is_new_node = self.get_var_concept(token)
            if is_new_node:
                node_stack.append(variable)
            # Some error logging
            if is_new_node and tokens[tnum-1] != '(':
                logger.warning('gid=%s Missing starting paren for node %s/%s' % (self.gid, variable, concept))
            if not is_new_node and tokens[tnum-1] == '(':
                logger.warning('gid=%s Start paren present but %s is not a new concept' % (self.gid, concept))
            triple.append(variable)
        # De-stack the root nodes based on closing parens, but don't destack past the top var.
        # Log an error if we're trying to empty the stack and it's not the very last token.
        elif token == ')':
            if len(node_stack) > 1:
                node_stack.pop()
                node_depth -= 1
            elif tnum < len(self.triples)-1:
                logger.warning('gid=%s Trying to destack past top node' % self.gid)
        # Unknown situation (should never get here)
        else:
            logger.warning('gid=%s Unhandled token %s' % (self.gid, token))
        #### Save the triple if complete ####
        if len(triple) == 3:
            self.triples.append(tuple(triple))
            triple = []
    # Do a little post-processing check on the triples and fix attribs if needed.
    # I haven't found instances that require this but it could be useful.
    for i, triple in enumerate(self.triples):
        if triple[1] == self.INSTANCE:
            continue
        target = triple[2]
        # Check if this is a variable
        if self.re_var.fullmatch(target) or self.re_ii.fullmatch(target):
            continue
        # If it's an attrib, enforce attribute syntax
        else:
            if (target.startswith('"') and target.endswith('"')) or self.is_num(target) or \
                    (target in set(['-', '+', 'interrogative', 'imperative', 'expressive'])):
                continue
            else:
                new_target = '"' + target.replace('"', '') + '"'
                self.triples[i] = tuple([triple[0], triple[1], new_target])
                logger.warning('gid=%s Replacing attrib %s with %s' % (self.gid, target, new_target))
    # Now convert to a penman graph and then back to a string
    pgraph = Graph(self.triples)
    # Catch malformed graphs, including disconnected ones, incorrectly quoted attribs, etc.
    try:
        self.gstring = penman.encode(pgraph, indent=6, model=NoOpModel())
        self.pgraph  = penman.decode(self.gstring, model=NoOpModel())
    except Exception:
        self.gstring = None
        self.pgraph  = None
if __name__ == '__main__':
    if 1:   # dev dataset
        gold_alignments_fn = 'amrlib/alignments/isi_hand_alignments/dev-gold.txt'
        test_amr_fn        = 'amrlib/data/alignments/dev-aligned.txt'
    else:   # test dataset
        gold_alignments_fn = 'amrlib/alignments/isi_hand_alignments/test-gold.txt'
        test_amr_fn        = 'amrlib/data/alignments/test-aligned.txt'
    # Load the gold alignments
    print('Loading alignments from', gold_alignments_fn)
    gold_alignments, gold_ids = load_gold_alignments(gold_alignments_fn)
    # Load the aligned corpus and extract the data
    print('Loading corpus data from', test_amr_fn)
    pgraphs = penman.load(test_amr_fn, model=NoOpModel())
    test_alignments = [g.metadata['rbw_alignments'].strip().split() for g in pgraphs]
    test_alignments = [a for a in test_alignments if a]
    test_ids = [g.metadata['id'] for g in pgraphs]
    # Sanity check that things match up
    assert len(gold_alignments) == len(test_alignments), '%s != %s' % (len(gold_alignments), len(test_alignments))
    assert len(gold_alignments) == 100, len(gold_alignments)
    for gold_id, test_id in zip(gold_ids, test_ids):
        assert gold_id == test_id, '%s != %s' % (gold_id, test_id)
    print('Gold and test alignment files match')
    # Score against ISI automated alignments
if not matched:
    bad_indexes.append(i)
print('These indexes will be skipped: ', bad_indexes)
print()
# Test to see if old and new alignments match
print('Testing match of new and original alignments')
print('%d entries do not re-encode properly and will be skipped (counted as good).' % len(bad_indexes))
num_bad = 0
bad_indexes = set(bad_indexes)
for i, entry in enumerate(entries):
    if i in bad_indexes:
        continue
    graph = penman.decode(entry, model=NoOpModel())
    graph.metadata['old_aligns'] = graph.metadata['isi_alignments']
    del graph.metadata['isi_alignments']
    RBWAligner.add_alignment_string(graph, 'new_aligns')
    # compare as a set because when there are nodes with x.y and x.y.r the ordering may be different
    old_aligns = set(graph.metadata['old_aligns'].split())
    new_aligns = set(graph.metadata['new_aligns'].split())
    if old_aligns != new_aligns:
        num_bad += 1
        print('index', i)
        print('# ::id', graph.metadata['id'])
        print('# ::old_aligns', graph.metadata['old_aligns'])
        print('# ::new_aligns', graph.metadata['new_aligns'])
        print()
print('There are %d bad alignments out of %d total' % (num_bad, len(entries)))