示例#1
0
def main():
    """Collect aligned multi-node entities from a corpus and write entity rules.

    Loads the AMR file named by ``sys.argv[1]`` with ``JAMR_CorpusReader``.
    For every aligned node that is the root of a multi-node aligned subgraph
    (with at least one edge, and not just a lone ``:polarity``/``:mode``
    edge) it records a tuple
    ``(amr, entity_type, tokens, root, nodes, edges)``.
    The collected entities are handed to the ``create_*_rules`` helpers and
    ``entity_rules_json`` is dumped to ``../entity_rules.json``.

    NOTE(review): ``create_*_rules`` and ``entity_rules_json`` are defined
    outside this block; presumably the helpers fill ``entity_rules_json``.
    """
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    all_entities = []
    for amr in cr.amrs:
        for node_id in amr.alignments:

            # get entity info
            token_ids = amr.alignments[node_id]
            if not token_ids:
                continue
            nodes = amr.alignmentsToken2Node(token_ids[0])
            if len(nodes) <= 1:
                continue
            entity_sg = amr.findSubGraph(nodes)
            root = entity_sg.root
            # only handle the entity when visiting its subgraph root, so each
            # entity is recorded once
            if node_id != root:
                continue
            edges = entity_sg.edges
            if not edges:
                continue
            if len(edges) == 1 and edges[0][1] in (':polarity', ':mode'):
                continue

            # Token ids are used as 1-based positions (tokens[t - 1]).
            # The lower bound must be strict: t == 0 would index tokens[-1]
            # and silently return the last token of the sentence.
            tokens = [
                amr.tokens[t - 1] for t in token_ids
                if 0 < t <= len(amr.tokens)
            ]

            # The entity type is made of the labels of the nodes that appear
            # as the first element of some subgraph edge (presumably the edge
            # source); nodes with no such edge are excluded.
            edge_sources = {e[0] for e in edges}
            entity_type = ','.join(
                amr.nodes[n] for n in nodes if n in edge_sources)

            nodes = {n: amr.nodes[n] for n in nodes}
            all_entities.append((amr, entity_type, tokens, root, nodes, edges))

    create_fixed_rules(all_entities)
    create_var_rules(all_entities)
    create_name_rules(all_entities)
    create_date_entity_rules(all_entities)
    create_normalization_rules()

    print('[entity rules] Writing rules')
    with open('../entity_rules.json', 'w+', encoding='utf8') as f:
        JSON.dump(entity_rules_json, f, sort_keys=True)
    print('[entity rules] Fixed:', len(entity_rules_json['fixed']))
    print('[entity rules] Variable:', len(entity_rules_json['var']))
    print('[entity rules] Date-entity:', len(entity_rules_json['date-entity']))
    print('[entity rules] Named entity:', len(entity_rules_json['names']))
    print('[entity rules] Normalize:',
          sum(len(x) for x in entity_rules_json['normalize'].values()))
    print('[entity rules] Done')
示例#2
0
def main():
    """Print how often each node label occurs without any token alignment.

    Loads the AMR file named by ``sys.argv[1]`` and counts the label of every
    node that is missing from ``amr.alignments`` or has an empty alignment.
    Labels are printed with their counts, most frequent first.
    """
    reader = JAMR_CorpusReader()
    reader.load_amrs(sys.argv[1], verbose=False)

    # Feed the labels to the Counter in corpus order so ties keep the
    # first-seen ordering when sorted by count.
    special_alignments = Counter(
        amr.nodes[node_id]
        for amr in reader.amrs
        for node_id in amr.nodes
        if node_id not in amr.alignments or not amr.alignments[node_id]
    )

    for label, occurrences in special_alignments.most_common():
        print(label.strip(), occurrences)
def main():
    """Print the most common multi-node subgraphs aligned to the same tokens.

    Loads the AMR file named by ``sys.argv[1]``. For each aligned node, it
    takes the nodes aligned to the node's first token, drops nodes whose
    label contains a double quote, and, when more than one node remains,
    builds their subgraph. Named entities, numbers and quantities are
    normalized to placeholders so similar subgraphs are grouped together.
    For each distinct subgraph string the aligned token strings are counted;
    subgraphs are printed by descending total count together with their ten
    most common token strings.
    """
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    special_alignments = {}

    for amr in cr.amrs:
        for node_id in amr.alignments:
            aligned_token_ids = amr.alignments[node_id]
            # A node may carry an empty alignment list; indexing [0] on it
            # would raise IndexError, so skip it.
            if not aligned_token_ids:
                continue
            aligned_node_ids = amr.alignmentsToken2Node(aligned_token_ids[0])
            aligned_node_ids = [
                n for n in aligned_node_ids if '"' not in amr.nodes[n]
            ]
            if len(aligned_node_ids) <= 1:
                continue

            subgraph = amr.findSubGraph(aligned_node_ids)
            # normalize named entities
            if len(subgraph.edges) == 1 and subgraph.edges[0][1] == ':name':
                subgraph.nodes[subgraph.root] = '[entity]'
            # normalize numbers
            for n in subgraph.nodes:
                # re.match anchors at the start only, so any label that
                # begins with a digit is treated as a number
                if re.match(r'[0-9]+', subgraph.nodes[n]):
                    subgraph.nodes[n] = '[NUM]'
                if subgraph.nodes[n].endswith('quantity'):
                    subgraph.nodes[n] = '[quantity]'
            aligned_subgraph = str(subgraph)

            # NOTE(review): token ids are used here as 0-based indices; other
            # scripts index tokens with ``t - 1`` -- confirm which convention
            # applies to this corpus reader.
            aligned_tokens = ' '.join(amr.tokens[x] for x in aligned_token_ids
                                      if x < len(amr.tokens))

            token_counts = special_alignments.setdefault(aligned_subgraph,
                                                         Counter())
            token_counts[aligned_tokens] += 1

    for special in sorted(special_alignments,
                          key=lambda x: sum(special_alignments[x].values()),
                          reverse=True):
        print(special, sum(special_alignments[special].values()))
        print(special_alignments[special].most_common(10))
        print('\n')
示例#4
0
from collections import Counter
from amr import JAMR_CorpusReader


def get_token(gold_amr, t):
    """Return the token at 1-based position ``t`` of ``gold_amr.tokens``.

    Returns the string ``'NA'`` when ``t - 1`` is not a valid non-negative
    index into the token list.
    """
    tokens = gold_amr.tokens
    index = t - 1
    return tokens[index] if 0 <= index < len(tokens) else 'NA'


if __name__ == '__main__':

    file = sys.argv[1]

    cr = JAMR_CorpusReader()
    cr.load_amrs(file)
    gold_amrs = cr.amrs

    count = 0
    sentences = set()
    rels = Counter()
    for sent_idx, gold_amr in enumerate(gold_amrs):
        for i, tok in enumerate(gold_amr.tokens):
            align = gold_amr.alignmentsToken2Node(i + 1)
            # merge alignments
            root = gold_amr.findSubGraph(align).root
            for n in gold_amr.nodes:
                if n in align:
                    continue
                edges = [(s, r, t) for s, r, t in gold_amr.edges
示例#5
0
            if isHead:
                # rearrange latent if necessary
                transitions.latent.append(transitions.latent.pop(idx))
                return True
            idx -= 1
        return False


if __name__ == '__main__':

    # Positional arguments: input AMR file, then optional output paths for
    # the oracle graphs and the oracle actions.
    input_file = sys.argv[1]
    gfile = 'oracle_amrs.txt' if len(sys.argv) <= 2 else sys.argv[2]
    afile = 'oracle_actions.txt' if len(sys.argv) <= 3 else sys.argv[3]

    cr = JAMR_CorpusReader()
    cr.load_amrs(input_file)

    oracle = AMR_Oracle(verbose=True)
    print_log("amr", "Processing oracle")
    oracle.runOracle(cr.amrs,
                     action_file=afile,
                     graph_file=gfile,
                     add_unaligned=0)

    # Log the 100 most common entries of every statistic the oracle collected.
    for stat in oracle.stats:
        print_log("amr", stat)
        print_log("amr", oracle.stats[stat].most_common(100))
        print_log("amr", "")

    if use_addnode_rules:
        rule_hits = transitions.entity_rule_stats
        rule_totals = transitions.entity_rule_totals
        # Per-rule ratio of the stats counter to the totals counter.
        for x in rule_totals:
            perc = rule_hits[x] / rule_totals[x]
            print(x,  rule_hits[x], '/', rule_totals[x], '=', f'{perc:.2f}')
        # Overall ratio across all rules.
        perc = sum(rule_hits.values()) / sum(rule_totals.values())
from amr import JAMR_CorpusReader

# Strip every ':wiki' edge (and the node it points to) from the training
# corpus and write the result to a new file.
amr_file = '../data/train.txt'
new_amr_file = '../data/train.no_wiki.txt'

cr = JAMR_CorpusReader()
cr.load_amrs(amr_file, verbose=False)
amrs = cr.amrs

sent_idx = 0
for amr in amrs:
    # Collect first, then mutate, so amr.edges is not modified while it is
    # being iterated.
    wiki_edges = [(s, r, t) for s, r, t in amr.edges if r == ':wiki']
    wiki_nodes = [target for _, _, target in wiki_edges]

    for e in wiki_edges:
        amr.edges.remove(e)

    for n in wiki_nodes:
        del amr.nodes[n]
        if n in amr.alignments:
            del amr.alignments[n]
        print('deleting wiki:', sent_idx)

    sent_idx += 1

with open(new_amr_file, 'w+', encoding='utf8') as f:
    f.writelines(amr.toJAMRString() for amr in amrs)
示例#7
0
def _print_counter_stats(counter):
    """Print a '(N types, M items)' line for ``counter`` and return both numbers as a dict."""
    stats = {'types': len(counter), 'items': sum(counter.values())}
    print(f"({stats['types']} types, {stats['items']} items)")
    return stats


def _counter_entries(counter, example_fields):
    """Build ``{key: {'count': n, <field>: [], ...}}`` ordered by descending count."""
    return {
        key: {'count': count, **{field: [] for field in example_fields}}
        for key, count in counter.most_common()
    }


def _add_example(entry, tokens, graph, limit=100):
    """Add an unseen token string to ``entry`` (up to ``limit``), keeping at most one example graph."""
    if tokens in entry['tokens'] or len(entry['tokens']) >= limit:
        return
    entry['tokens'].append(tokens)
    if not entry['graphs']:
        entry['graphs'].append(graph)


def main():
    """Analyse node-to-token alignments of a corpus and write a JSON report.

    Loads the AMR file named by ``sys.argv[1]``, runs ``fix_alignments`` on
    every AMR, and then gathers:

    * unaligned node labels (no alignment entry, or an empty one);
    * multi-node aligned entities, grouped by size (number of non-numeric,
      non-quoted node labels);
    * "unconnected" entities (more than one label but no subgraph edges);
    * "unrooted" entities (an edge outside the entity subgraph touches a
      non-root entity node whose other endpoint is aligned);
    * "repeats" (an entity type containing the same label more than once).

    A summary of each category is printed, and the counts together with up
    to 100 example token strings and one example graph per type are written
    to ``alignment_analysis.json``.
    """
    cr = JAMR_CorpusReader()
    cr.load_amrs(sys.argv[1], verbose=False)

    # Top-level key order is part of the output file layout.
    report = {
        'size': {},
        'unaligned': {},
        'unconnected': {},
        'unrooted': {},
        'repeats': {},
        'stats': {}
    }

    all_entities = []
    unaligned_nodes = []
    unrooted_entities = []
    changes = 0
    amrs_changed = 0
    for amr in cr.amrs:
        change = fix_alignments(amr)
        changes += change
        if change > 0:
            amrs_changed += 1
        for node_id in amr.nodes:
            # get entity info
            if node_id not in amr.alignments or not amr.alignments[node_id]:
                unaligned_nodes.append(amr.nodes[node_id])
                continue
            token_ids = amr.alignments[node_id]
            node_ids = amr.alignmentsToken2Node(token_ids[0])
            if len(node_ids) <= 1:
                continue
            entity_sg = amr.findSubGraph(node_ids)
            root = entity_sg.root
            # record each entity once, when visiting its subgraph root
            if node_id != root:
                continue
            edges = entity_sg.edges

            # Token ids are used as 1-based positions (tokens[t - 1]).
            # The lower bound must be strict: t == 0 would index tokens[-1]
            # and silently return the last token of the sentence.
            tokens = [
                amr.tokens[t - 1] for t in token_ids
                if 0 < t <= len(amr.tokens)
            ]

            # Numbers and quoted strings are left out of the entity type.
            entity_type = ','.join(sorted(
                amr.nodes[n] for n in node_ids
                if not (amr.nodes[n].isdigit()
                        or amr.nodes[n].startswith('"'))))

            entity_nodes = {n: amr.nodes[n] for n in node_ids}
            graph = str(amr)
            all_entities.append(
                (amr, entity_type, tokens, root, entity_nodes, edges, graph))

            # Entities without internal edges are never reported as unrooted.
            if len(edges) == 0:
                continue
            for s, r, t in amr.edges:
                if (s, r, t) in edges:
                    continue
                if s in entity_nodes and s != root:
                    if t not in amr.alignments or not amr.alignments[t]:
                        continue
                    label = f'{amr.nodes[root]} {amr.nodes[s]}'
                    unrooted_entities.append(
                        (entity_type, tokens, label, graph))
                if t in entity_nodes and t != root:
                    if s not in amr.alignments or not amr.alignments[s]:
                        continue
                    label = f'{amr.nodes[root]} {amr.nodes[t]}'
                    unrooted_entities.append(
                        (entity_type, tokens, label, graph))

    # ---- counting -------------------------------------------------------
    unaligned_counter = Counter(unaligned_nodes)
    unrooted_counter = Counter()
    attachment_counter = Counter()
    for entity_type, _tokens, label, _graph in unrooted_entities:
        unrooted_counter[entity_type] += 1
        attachment_counter[label] += 1
    report['stats']['unrooted-attachments'] = dict(
        attachment_counter.most_common())

    size_counters = {}
    unconnected_counter = Counter()
    repeated_counter = Counter()
    for _amr, entity_type, _tokens, _root, _nodes, edges, _graph in all_entities:
        labels = entity_type.split(',')
        size_counters.setdefault(str(len(labels)),
                                 Counter())[entity_type] += 1
        if len(labels) > 1 and len(edges) == 0:
            unconnected_counter[entity_type] += 1
        if any(labels.count(n) > 1 for n in labels):
            repeated_counter[entity_type] += 1

    # ---- summaries ------------------------------------------------------
    print('Changes:', changes, 'AMRs changed:', amrs_changed)
    for label in sorted(size_counters, key=int):
        counter = size_counters[label]
        print('size', label)
        report['stats']['size ' + label] = _print_counter_stats(counter)
        print(counter)
        report['size'][label] = _counter_entries(counter,
                                                 ('tokens', 'graphs'))

    print('unconnected')
    report['stats']['unconnected'] = _print_counter_stats(unconnected_counter)
    print(unconnected_counter)
    report['unconnected'] = _counter_entries(unconnected_counter,
                                             ('tokens', 'graphs'))

    print('unaligned')
    report['stats']['unaligned'] = _print_counter_stats(unaligned_counter)
    print(unaligned_counter)
    # Purely numeric labels get a '<NUM>' prefix as their report key.
    report['unaligned'] = {
        ('<NUM>' + node if node.isdigit() else node): {'count': count}
        for node, count in unaligned_counter.most_common()
    }

    print('unrooted')
    report['stats']['unrooted'] = _print_counter_stats(unrooted_counter)
    print(unrooted_counter)
    report['unrooted'] = _counter_entries(
        unrooted_counter, ('tokens', 'graphs', 'attachments'))

    print('repeats')
    report['stats']['repeats'] = _print_counter_stats(repeated_counter)
    print(repeated_counter)
    report['repeats'] = _counter_entries(repeated_counter,
                                         ('tokens', 'graphs'))
    print()

    # ---- examples -------------------------------------------------------
    for entity_type, tokens, label, graph in unrooted_entities:
        entry = report['unrooted'][entity_type]
        _add_example(entry, ' '.join(tokens), graph)
        if label not in entry['attachments']:
            entry['attachments'].append(label)

    for _amr, entity_type, tokens, _root, _nodes, edges, graph in all_entities:
        tokens = ' '.join(tokens)
        labels = entity_type.split(',')
        _add_example(report['size'][str(len(labels))][entity_type], tokens,
                     graph)
        if len(labels) > 1 and len(edges) == 0:
            _add_example(report['unconnected'][entity_type], tokens, graph)
        if any(labels.count(n) > 1 for n in labels):
            _add_example(report['repeats'][entity_type], tokens, graph)

    with open('alignment_analysis.json', 'w+', encoding='utf8') as f:
        J.dump(report, f)