Example #1
def test_for_decode_encode_issue(gold):
    graph = penman.decode(gold, model=NoOpModel())
    test = penman.encode(graph, indent=6, compact=True, model=NoOpModel())
    gold = to_graph_line(gold)
    test = to_graph_line(test)
    is_good = test == gold
    return graph, is_good
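
A minimal, self-contained sketch of the same round-trip check. The helper to_graph_line() is not shown in the example; here it is assumed to simply collapse whitespace so the re-encoded string can be compared to the original:

import penman
from penman.models.noop import NoOpModel

def to_graph_line(gstring):
    # Assumption: normalization is just whitespace collapsing
    return ' '.join(gstring.split())

gold  = '(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))'
graph = penman.decode(gold, model=NoOpModel())
test  = penman.encode(graph, indent=6, compact=True, model=NoOpModel())
print(to_graph_line(test) == to_graph_line(gold))   # True when nothing is normalized away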
Example #2
def load_amrs_cached(amr_fpath):
    global pgraph_cache
    pgraphs = pgraph_cache.get(amr_fpath, None)
    if pgraphs is None:
        pgraphs = penman.load(amr_fpath, model=NoOpModel())
        pgraph_cache[amr_fpath] = pgraphs
    return pgraphs
Example #3
 def __init__(self, gstring):
     self.graph    = penman.decode(gstring, model=NoOpModel())
     # Run the serialization
     self.elements = []              # clear elements list
     self.nodes    = set()           # nodes visited (to prevent recursion)
     self.serialize(self.graph.top)
     self.tokens   = self.elements_to_tokens(self.elements)
Example #4
def add_lemmas(entry, snt_key, verify_tok_key=None):
    global spacy_nlp
    load_spacy()
    graph  = penman.decode(entry, model=NoOpModel())    # do not de-invert graphs
    doc        = spacy_nlp(graph.metadata[snt_key])
    nlp_tokens = [t.text for t in doc]
    graph.metadata['tokens'] = json.dumps(nlp_tokens)
    # Create lemmas
    # SpaCy's lemmatizer returns -PRON- for pronouns so strip these
    # Don't try to lemmatize any named-entities or proper nouns.  Lower-case any other words.
    lemmas = []
    for t in doc:
        if t.lemma_ == '-PRON-':
            lemma = t.text.lower()
        elif t.tag_.startswith('NNP') or t.ent_type_ not in ('', 'O'):
            lemma = t.text
        else:
            lemma = t.lemma_.lower()
        lemmas.append(lemma)
    graph.metadata['lemmas'] = json.dumps(lemmas)
    # If verify_tok_key is not None, verify that the new tokenization is the same as the existing
    # and only return the graph if the tokenized length is the same
    if verify_tok_key is not None:
        isi_tokens = graph.metadata[verify_tok_key].split()
        if len(isi_tokens) == len(lemmas) == len(nlp_tokens):
            return graph
        else:
            return None
    else:
        return graph
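
For context, a minimal sketch (independent of the code above) of the metadata handling that add_lemmas relies on: penman.decode() parses '# ::key value' lines into graph.metadata, and keys written back (such as 'tokens' and 'lemmas') are emitted again by penman.encode():

import penman
from penman.models.noop import NoOpModel

entry = '''# ::id toy.1
# ::snt The boy wants to go.
(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))'''

graph = penman.decode(entry, model=NoOpModel())
print(graph.metadata['snt'])                     # -> The boy wants to go.
graph.metadata['tokens'] = '["The", "boy", "wants", "to", "go", "."]'
print(penman.encode(graph, model=NoOpModel()))   # metadata comes back as '# ::' lines above the graph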
Example #5
 def from_string_w_json(cls,
                        graph,
                        token_key='tokens',
                        lemma_key='lemmas',
                        **kwargs):
     assert isinstance(graph, str)
     graph = penman.decode(graph, model=NoOpModel())
     return cls.from_penman_w_json(graph, token_key, lemma_key, **kwargs)
Example #6
 def __init__(self, graph, force_annotate=False):
     # Convert or copy the input graph to penman format
     if isinstance(graph, str):
         pgraph = penman.decode(graph, model=NoOpModel())
     elif isinstance(graph, penman.graph.Graph):
         pgraph = deepcopy(graph)
     else:
         raise ValueError('Code requires either a string or a penman Graph')
     # Annotate if needed (aligner/tagging require annotation)
     is_annotated = all([
         key in pgraph.metadata for key in ('tokens', 'lemmas', 'pos_tags')
     ])
     if not is_annotated or force_annotate:
         sentence = pgraph.metadata['snt']  # Sanity check required tag.  Throws KeyError if missing
         pgraph = annotate_penman(pgraph)
         self.annotation_performed = True  # for unit-testing and debug
     else:
         self.annotation_performed = False
     # Align the graph.  For simplicity, always do this.
     # If there are existing alignments they need to be removed.
     # See https://penman.readthedocs.io/en/latest/api/penman.surface.html
     if penman.surface.alignments(pgraph) or penman.surface.role_alignments(pgraph):
         for key, items in pgraph.epidata.items():
             pgraph.epidata[key] = [
                 x for x in items
                 if not isinstance(x, penman.surface.AlignmentMarker)
             ]
     pgraph = RBWAligner.from_penman_w_json(pgraph).get_penman_graph()
     # get the graph string and pos tags for the tagger
     self.metadata = pgraph.metadata.copy()
     pos_tags = json.loads(self.metadata['pos_tags'])
     pgraph.metadata = {}
     gstring = penman.encode(pgraph, model=NoOpModel(), indent=6)
     # Tag the graph string
     self.gstring_tagged = self.tag(gstring, pos_tags)
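
A small sketch of the alignment handling referenced above, using a toy graph string (an assumption, not taken from the example): ISI-style '~e.N' markers are parsed into penman.surface.Alignment objects stored in graph.epidata, which is exactly what the loop above strips out before re-aligning:

import penman
from penman import surface
from penman.models.noop import NoOpModel

g = penman.decode('(w / want-01~e.1 :ARG0 (b / boy~e.0))', model=NoOpModel())
print(surface.alignments(g))
# {('w', ':instance', 'want-01'): Alignment((1,), prefix='e.'),
#  ('b', ':instance', 'boy'): Alignment((0,), prefix='e.')}

# Remove the markers, as in the __init__ above
for key, items in g.epidata.items():
    g.epidata[key] = [x for x in items if not isinstance(x, surface.AlignmentMarker)]
print(surface.alignments(g))   # now {}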
Example #7
 def build_from_graph(self, entry, debug=False, allow_deinvert=False):
     # Parse the AMR text
     if allow_deinvert:
         penman_graph = penman.decode(entry)
     else:
         model = NoOpModel()  # does not de-invert edges
         penman_graph = penman.decode(entry, model=model)
     # Build g.instances() => concept relations  (these are nodes)
     for t in penman_graph.instances():
         self._add_instance(t)
         if debug: print(t)
     # Build g.edges() => relations between nodes
     for t in penman_graph.edges():
         self._add_edge(t)
         if debug: print(t)
     # Build g.attributes  => relations between nodes and a constant
     for t in penman_graph.attributes():
         self._add_attribute(t)
         if debug: print(t)
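
A brief sketch of what the NoOpModel choice above actually changes, on a toy graph string (illustration only): with the default model an inverted role such as ':domain-of' is de-inverted during decoding, while NoOpModel keeps the triples exactly as written:

import penman
from penman.models.noop import NoOpModel

gstring = '(c / chapter :domain-of (p / paragraph))'

default_graph = penman.decode(gstring)                     # default model de-inverts :domain-of
noop_graph    = penman.decode(gstring, model=NoOpModel())  # keeps edges as written

print(default_graph.edges())   # contains Edge(source='p', role=':domain', target='c')
print(noop_graph.edges())      # contains Edge(source='c', role=':domain-of', target='p')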
Example #8
def gather_test_graphs():
    # These are for amr_annotation_3.0/data/multisentence/ms-amr-split/test/msamr_dfa_007.xml
    fn = 'data/amr_annotation_3.0/data/amrs/unsplit/amr-release-3.0-amrs-dfa.txt'
    gids = [
        "DF-200-192400-625_7046.1", "DF-200-192400-625_7046.2",
        "DF-200-192400-625_7046.3", "DF-200-192400-625_7046.4",
        "DF-200-192400-625_7046.5", "DF-200-192400-625_7046.6",
        "DF-200-192400-625_7046.7", "DF-200-192400-625_7046.8",
        "DF-200-192400-625_7046.9", "DF-200-192400-625_7046.10",
        "DF-200-192400-625_7046.11", "DF-200-192400-625_7046.12",
        "DF-200-192400-625_7046.13", "DF-200-192400-625_7046.14",
        "DF-200-192400-625_7046.15", "DF-200-192400-625_7046.16",
        "DF-200-192400-625_7046.17", "DF-200-192400-625_7046.18"
    ]
    # Load the AMR file with penman and then extract the specific ids and put them in order
    pgraphs = penman.load(fn, model=NoOpModel())
    ordered_pgraphs = [None] * len(gids)
    for pgraph in pgraphs:
        gid = pgraph.metadata['id']
        doc_idx = gids.index(gid) if gid in gids else None
        if doc_idx is not None:
            ordered_pgraphs[doc_idx] = pgraph
    assert None not in ordered_pgraphs
    return ordered_pgraphs
Example #9
    def get_addresses(cls, graph):
        results = []

        def add_result(addr, name, type):
            results.append(SimpleNamespace(addr=addr, name=name, type=type))

        tree = penman.configure(graph, model=NoOpModel())
        for path, branch in cls.walk_tree(tree.node, (1, )):
            # Get the node and attribute addresses
            if penman.tree.is_atomic(branch[1]):  # ==> is None or isinstance(x, (str, int, float))
                address = '.'.join(map(str, path))
                concept = branch[1]
                if concept.startswith('"'):  # Attribute
                    add_result(address, concept, 'attrib')
                else:
                    add_result(address, concept, 'node')
            # Get the edge addresses
            if penman.tree.is_atomic(branch[0]) and branch[0] != '/':
                address = '.'.join(map(str, path))
                edge_name = branch[0]
                add_result(address, edge_name, 'role')
        return results
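
For reference, a minimal sketch (toy graph, not from the example) of the tree structure that walk_tree() above traverses: penman.configure() turns a Graph back into a Tree whose nodes are (variable, [(role, target), ...]) pairs, with '/' as the instance role that the final branch[0] != '/' check excludes:

import penman
from penman.models.noop import NoOpModel

g = penman.decode('(w / want-01 :ARG0 (b / boy) :polarity -)', model=NoOpModel())
tree = penman.configure(g, model=NoOpModel())
print(tree.node)
# ('w', [('/', 'want-01'), (':ARG0', ('b', [('/', 'boy')])), (':polarity', '-')])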
Example #10
    print('%d generated graphs do not deserialize out of %d = %.1f%%' % (len(bad_graphs), num_non_clipped, pct))
    print()

    # Save the reference, omitting any clipped or bad
    ref_fpath = os.path.join(out_dir, ref_out_fn)
    print('Saving', ref_fpath)
    skipped = 0
    with open(ref_fpath, 'w') as f:
        for i, graph in enumerate(ref_in_graphs):
            if i in bad_graphs or i in clip_index_set:
                skipped += 1
                continue
            f.write(graph + '\n\n')
    print('Skipped writing %d as either bad or clipped' % skipped)
    print('Wrote a total of %d reference AMR graphs' % (len(ref_in_graphs) - skipped))
    print()

    # Save the generated
    gen_fpath = os.path.join(out_dir, gen_out_fn)
    print('Saving', gen_fpath)
    penman.dump(gen_out_graphs, gen_fpath, indent=6, model=NoOpModel())
    print('Wrote a total of %d generated AMR graphs' % len(gen_out_graphs))
    print()

    # Score the resultant files
    print('Scoring the above files with SMATCH')
    gold_entries = get_entries(ref_fpath)
    test_entries = get_entries(gen_fpath)
    precision, recall, f_score = compute_smatch(test_entries, gold_entries)
    print('SMATCH -> P: %.3f,  R: %.3f,  F: %.3f' % (precision, recall, f_score))
Example #11
    # Convert to penman and add lemmas
    print('Annotating')
    load_spacy()    # do this in the main process to prevent doing it multiple times
    graphs = []
    annotate = partial(add_lemmas, snt_key='snt',
                       verify_tok_key=None)  # no existing tok key
    with Pool() as pool:
        for graph in pool.imap(annotate, entries):
            if graph is not None:
                graphs.append(graph)
    print('%d graphs left with the same tokenization length' % len(graphs))

    # Run the aligner
    print('Aligning Graphs')
    new_graphs = []
    keep_keys = ('id', 'snt', 'tokens', 'lemmas', 'rbw_alignments')
    for graph in graphs:
        aligner = RBWAligner.from_penman_w_json(
            graph, align_str_name='rbw_alignments')
        pgraph = aligner.get_penman_graph()
        pgraph.metadata = {
            k: v
            for k, v in pgraph.metadata.items() if k in keep_keys
        }
        new_graphs.append(pgraph)

    # Save the graphs
    print('Saving to', out_fname)
    penman.dump(new_graphs, out_fname, model=NoOpModel(), indent=6)
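
A minimal sketch of the final penman.dump() call, using a hypothetical output file name: each graph's metadata is written as '# ::' lines above its serialized form, and graphs are separated by blank lines:

import penman
from penman.models.noop import NoOpModel

pgraph = penman.decode('# ::id toy.1\n# ::snt The boy wants to go.\n'
                       '(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))',
                       model=NoOpModel())
penman.dump([pgraph], 'aligned_toy.txt', model=NoOpModel(), indent=6)   # 'aligned_toy.txt' is hypothetical
print(open('aligned_toy.txt').read())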
Example #12
from typing import List

from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr
import penman
import logging

op_model = Model()
noop_model = NoOpModel()
amr_model = amr.model
DEFAULT = op_model

# Mute loggers
penman.layout.logger.setLevel(logging.CRITICAL)
penman._parse.logger.setLevel(logging.CRITICAL)


def _get_model(dereify):
    if dereify is None:
        return DEFAULT
    elif dereify:
        return op_model
    else:
        return noop_model


def _remove_wiki(graph):
Example #13
 def get_graph_string(self):
     return penman.encode(self.graph, model=NoOpModel(), indent=6)
Example #14
 def deserialize(self, gstring):
     node_stack = []     # list of previously instantiated nodes
     node_depth = 0      # number of left parens encountered and not destacked
     triple     = []
     # Tokenize and some logging (the system has a lot of unbalanced parentheses)
     tokens = self.graph_tokenize(gstring)
     # left_parens  = tokens.count('(')
     # right_parens = tokens.count(')')
     # if left_parens != right_parens:
     #     logger.warning('gid=%s has %d left parens and %d right parens' % (self.gid, left_parens, right_parens))
     # Loop through all tokens and parse the string
     for tnum, token in enumerate(tokens):
         #### Big case statement to classify parts of the graph string ####
         ttype = self.token_type(token)
         # Mostly ignored but can be used for error checking
         if token == '(':
             node_depth += 1
         # Find the source for the triple
         elif len(triple) == 0 and ttype == TType.concept:
             # This path should only happen for a new graph. Make a somewhat arbitrary choice to
             # either stop parsing or to clear out the existing triples to prevent disconnected graphs.
             if len(self.triples) > 0:
                 logger.error('gid=%s Initial node constructed when triples not empty.' % (self.gid))
                 if len(self.triples) > len(tokens)/4:    # if > half done (on average ~2 tokens per triple)
                     break
                 else:
                     self.triples = []
             variable, concept, is_new_node = self.get_var_concept(token)
             triple.append(variable)
             if is_new_node:
                 node_stack.append( variable )
             # Some error logging
             if is_new_node and tokens[tnum-1] != '(':
                 logger.warning('gid=%s Missing starting paren for node %s/%s' % (self.gid, variable, concept))
             if not is_new_node and tokens[tnum-1] == '(':
                 logger.warning('gid=%s Start paren present but %s is not a new concept' % (self.gid, concept))
         elif len(triple) == 0 and ttype == TType.role:
             variable = node_stack[-1]
             triple.append(variable)
             triple.append(token)
         # Look for the role (aka edge)
         elif len(triple) == 1 and ttype == TType.role:
             triple.append(token)
         # Look for the target
         elif len(triple) == 2 and ttype == TType.attrib:
             triple.append(token)
         elif len(triple) == 2 and ttype == TType.concept:
             variable, concept, is_new_node = self.get_var_concept(token)
             if is_new_node:
                 node_stack.append( variable )
             # Some error logging
             if is_new_node and tokens[tnum-1] != '(':
                 logger.warning('gid=%s Missing starting paren for node %s/%s' % (self.gid, variable, concept))
             if not is_new_node and tokens[tnum-1] == '(':
                 logger.warning('gid=%s Start paren present but %s is not a new concept' % (self.gid, concept))
             triple.append(variable)
         # De-stack the root nodes based on closing parens, but don't destack past the top var
         # Log an error if we're trying to empty the stack and it's not the very last token
         elif token == ')':
             if len(node_stack) > 1:
                 node_stack.pop()
                 node_depth -= 1
             elif tnum < len(tokens)-1:
                 logger.warning('gid=%s Trying to destack past top node' % self.gid)
         # Unknown situation (should never get here)
         else:
             logger.warning('gid=%s Unhandled token %s' % (self.gid, token))
         #### Save the triple if complete ####
         if len(triple) == 3:
             self.triples.append( tuple(triple) )
             triple = []
     # Do a little post-processing check on the triples and fix attribs if needed
     # I haven't found instances that require this, but it could be useful
     for i, triple in enumerate(self.triples):
         if triple[1] == self.INSTANCE:
             continue
         target = triple[2]
         # Check if this is a variable
         if self.re_var.fullmatch(target) or self.re_ii.fullmatch(target):
             continue
         # If it's an attrib enforce attribute syntax
         else:
             if (target.startswith('"') and target.endswith('"')) or self.is_num(target)  or \
                (target in set(['-', '+', 'interrogative', 'imperative', 'expressive'])):
                 continue
             else:
                 new_target = '"' + target.replace('"', '') + '"'
             self.triples[i] = tuple([triple[0], triple[1], new_target])
             logger.warning('gid=%s Replacing attrib %s with %s' % (self.gid, target, new_target))
     # Now convert to a penman graph and then back to a string
     pgraph = Graph(self.triples)
     # Catch malformed graphs, including disconnected ones, incorrectly quoted attribs, etc.
     try:
         self.gstring = penman.encode(pgraph, indent=6, model=NoOpModel())
         self.pgraph  = penman.decode(self.gstring, model=NoOpModel())
     except Exception:
         self.gstring = None
         self.pgraph  = None
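
A short sketch of the failure mode that the try/except above guards against: penman.encode() raises a PenmanError (a LayoutError in this case) when the collected triples do not form a single connected graph, so malformed deserializations come back as None:

import penman
from penman import Graph
from penman.models.noop import NoOpModel
from penman.exceptions import PenmanError

triples = [('a', ':instance', 'alpha'), ('b', ':instance', 'beta')]   # no edge connects a and b
try:
    penman.encode(Graph(triples), indent=6, model=NoOpModel())
except PenmanError as e:
    print('could not encode:', e)   # e.g. "possibly disconnected graph"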
Example #15
if __name__ == '__main__':
    if 1:  # dev dataset
        gold_alignments_fn = 'amrlib/alignments/isi_hand_alignments/dev-gold.txt'
        test_amr_fn = 'amrlib/data/alignments/dev-aligned.txt'
    else:  # test dataset
        gold_alignments_fn = 'amrlib/alignments/isi_hand_alignments/test-gold.txt'
        test_amr_fn = 'amrlib/data/alignments/test-aligned.txt'

    # Load the gold alignments
    print('Loading alignments from', gold_alignments_fn)
    gold_alignments, gold_ids = load_gold_alignments(gold_alignments_fn)

    # Load the aligned corpus and extract the data
    print('Loading corpus data from', test_amr_fn)
    pgraphs = penman.load(test_amr_fn, model=NoOpModel())
    test_alignments = [
        g.metadata['rbw_alignments'].strip().split() for g in pgraphs
    ]
    test_alignments = [a for a in test_alignments if a]
    test_ids = [g.metadata['id'] for g in pgraphs]

    # Sanity check that things match up
    assert len(gold_alignments) == len(test_alignments), '%s != %s' % (
        len(gold_alignments), len(test_alignments))
    assert len(gold_alignments) == 100, len(gold_alignments)
    for gold_id, test_id in zip(gold_ids, test_ids):
        assert gold_id == test_id, '%s != %s' % (gold_id, test_id)
    print('Gold and Test alignment files match')

    # Score against isi automated alignments
            if not matched:
                bad_indexes.append(i)
        print('These indexes will be skipped: ', bad_indexes)
        print()

        # Test to see if old and new alignments match
        print('Testing match of new and original alignments')
        print(
            '%d entries do not re-encode properly and will be skipped (counted as good).'
            % len(bad_indexes))
        num_bad = 0
        bad_indexes = set(bad_indexes)
        for i, entry in enumerate(entries):
            if i in bad_indexes:
                continue
            graph = penman.decode(entry, model=NoOpModel())
            graph.metadata['old_aligns'] = graph.metadata['isi_alignments']
            del graph.metadata['isi_alignments']
            RBWAligner.add_alignment_string(graph, 'new_aligns')
            # compare as a set because when there are nodes with x.y and x.y.r the ordering may be different
            old_aligns = set(graph.metadata['old_aligns'].split())
            new_aligns = set(graph.metadata['new_aligns'].split())
            if old_aligns != new_aligns:
                num_bad += 1
                print('index', i)
                print('# ::id', graph.metadata['id'])
                print('# ::old_aligns', graph.metadata['old_aligns'])
                print('# ::new_aligns', graph.metadata['new_aligns'])
                print()
        print('There are %d bad alignments out of %d total' %
              (num_bad, len(entries)))