Example #1
import codecs

# AmrGraph, NodeSource, EdgeSource, and AmrEdge are assumed to be provided by
# the surrounding project's AMR utilities; they are not standard-library names.


def loadFile(input_filename):
    """
    load AMR parsed file, re-index AMR nodes and edges.
    return corpus of nodes and edges
    """
    graph_str = ''  # AMR parse
    info_dict = {}  # AMR meta info

    doc_filename = ''
    corpus = {}  # filename -> (nodes, root_nodes, edges, exp_edges)

    doc_nodes = {}  # (concept,) -> AmrNode
    doc_root_nodes = {}  # (concept,) -> AmrNode
    doc_edges = {}  # (concept1, concept2) -> AmrEdge
    doc_exp_edges = {}  # (concept1, concept2) -> AmrEdge

    with codecs.open(input_filename, 'r', 'utf-8') as infile:
        for line in infile:
            line = line.rstrip()

            if line == '':
                # no AMR graph for current sentence
                if graph_str == '':
                    info_dict = {}
                    continue

                # get nodes and edges (linked)
                g = AmrGraph()
                nodes, edges = g.getCollapsedNodesAndEdges(graph_str.split())

                # index nodes by graph_idx (setdefault keeps the first node
                # seen for each index)
                node_indices = {}
                for node in nodes:
                    node_indices.setdefault(node.graph_idx, node)

                # (1) use gold AMR annotation as input
                if 'alignments' not in info_dict:

                    # get sentence info
                    sentence = info_dict['snt']  # tokenized sentence
                    # ids have the form "filename.linenum"
                    filename, line_num = info_dict['id'].split('.')
                    line_num = int(line_num)

                    # add source info to nodes
                    for node in nodes:
                        node_source = NodeSource(node.graph_idx, 0, 0, '',
                                                 filename, line_num, sentence)
                        node.sources.append(node_source)

                    # add source info to edges
                    for edge in edges:
                        edge_source = EdgeSource(edge.relation, filename,
                                                 line_num, sentence)
                        edge.sources.append(edge_source)

                else:  # (2) use alignment file as input

                    # get sentence info
                    sentence = info_dict['tok']  # tokenized sentence
                    tokens = sentence.split()
                    filename, line_num = info_dict['id'].split('.')
                    line_num = int(line_num)

                    # add source info to edges
                    for edge in edges:
                        edge_source = EdgeSource(edge.relation, filename,
                                                 line_num, sentence)
                        edge.sources.append(edge_source)

                    # add alignment and other source info to nodes
                    alignments_str = info_dict['alignments']

                    for alignment in alignments_str.split():
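                        # each alignment looks like "start-end|idx1+idx2+...":
                        # a token span paired with '+'-joined graph indices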
                        word_part, graph_part = alignment.split('|')
                        start_idx, end_idx = map(int, word_part.split('-'))
                        graph_indices = graph_part.split('+')

                        for graph_idx in graph_indices:
                            curr_node = node_indices.get(graph_idx, None)
                            if curr_node is None:
                                continue

                            # add node source info
                            new_start_idx = start_idx
                            new_end_idx = end_idx
                            # change existing start_idx/end_idx to broadest coverage
                            if curr_node.sources:
                                curr_node_source = curr_node.sources.pop()
                                if new_start_idx > curr_node_source.start_idx:
                                    new_start_idx = curr_node_source.start_idx
                                if new_end_idx < curr_node_source.end_idx:
                                    new_end_idx = curr_node_source.end_idx
                            # update new node source
                            new_node_source = NodeSource(
                                curr_node.graph_idx, new_start_idx,
                                new_end_idx,
                                ' '.join(tokens[new_start_idx:new_end_idx]),
                                filename, line_num, sentence)
                            curr_node.sources.append(new_node_source)

                    # add source info to [unaligned] nodes
                    for node in nodes:
                        if node.sources:
                            continue
                        node_source = NodeSource(node.graph_idx, 0, 0, '',
                                                 filename, line_num, sentence)
                        node.sources.append(node_source)

                # start of new file
                if filename != doc_filename:
                    if doc_filename != '':
                        corpus[doc_filename] = (doc_nodes, doc_root_nodes,
                                                doc_edges, doc_exp_edges)
                    doc_filename = filename
                    doc_nodes = {}
                    doc_root_nodes = {}
                    doc_edges = {}
                    doc_exp_edges = {}

                # redirect_dict maps each sentence-level node to the
                # document-level node it was merged into
                redirect_dict = {}

                # merge nodes
                first_node = True
                for node in nodes:
                    curr_anchor = (node.concept,)  # one-element tuple key
                    if curr_anchor in doc_nodes:
                        old_node = doc_nodes[curr_anchor]
                        old_node.sources.extend(node.sources)
                        redirect_dict[node] = old_node
                    else:
                        doc_nodes[curr_anchor] = node
                    # root node of sentence
                    if first_node:
                        doc_root_nodes[curr_anchor] = doc_nodes[curr_anchor]
                        first_node = False

                # merge edges
                edge_indices = {}  # index edge by concepts
                for edge in edges:
                    # update node linkage
                    if edge.node1 in redirect_dict:
                        edge.node1 = redirect_dict[edge.node1]
                    if edge.node2 in redirect_dict:
                        edge.node2 = redirect_dict[edge.node2]

                    curr_anchor = (edge.node1.concept,
                                   edge.node2.concept)  # ignore relation
                    edge_indices[curr_anchor] = edge

                    if curr_anchor in doc_edges:
                        old_edge = doc_edges[curr_anchor]
                        old_edge.sources.extend(edge.sources)
                    else:
                        doc_edges[curr_anchor] = edge

                # expand edges: treat the nodes of each sentence as fully
                # connected, filling missing pairs with NULL edges
                for node1 in nodes:
                    for node2 in nodes:
                        curr_anchor = (node1.concept, node2.concept)
                        redirect_node1 = doc_nodes[(node1.concept, )]
                        redirect_node2 = doc_nodes[(node2.concept, )]

                        # expanded edge exists
                        if curr_anchor in doc_exp_edges:
                            # update node linkage
                            old_edge = doc_exp_edges[curr_anchor]
                            old_edge.node1 = redirect_node1
                            old_edge.node2 = redirect_node2
                            # update edge sources
                            if curr_anchor in edge_indices:  # true edge
                                edge = edge_indices[curr_anchor]
                                old_edge.sources.extend(edge.sources)
                            else:  # NULL edge
                                edge_source = EdgeSource(
                                    'NULL', filename, line_num, sentence)
                                old_edge.sources.append(edge_source)

                        else:  # expanded edge does not exist, build a new edge
                            if curr_anchor in edge_indices:  # true edge
                                edge = edge_indices[curr_anchor]
                                new_edge = AmrEdge(node1=redirect_node1,
                                                   node2=redirect_node2,
                                                   relation=edge.relation)
                                new_edge.sources.extend(edge.sources)
                            else:  # NULL edge
                                new_edge = AmrEdge(node1=redirect_node1,
                                                   node2=redirect_node2,
                                                   relation='NULL')
                                edge_source = EdgeSource(
                                    'NULL', filename, line_num, sentence)
                                new_edge.sources.append(edge_source)
                            doc_exp_edges[curr_anchor] = new_edge

                # clear cache
                graph_str = ''
                info_dict = {}
                continue

            if line.startswith('#'):
                # parse '::name value' metadata fields into info_dict
                fields = line.split('::')
                for field in fields[1:]:
                    field_tokens = field.split()
                    info_name = field_tokens[0]
                    info_body = ' '.join(field_tokens[1:])
                    info_dict[info_name] = info_body
                continue

            graph_str += ' ' + line  # keep a token boundary between lines

    # add nodes and edges of the last file (if the corpus was non-empty)
    if doc_filename != '':
        corpus[doc_filename] = (doc_nodes, doc_root_nodes, doc_edges,
                                doc_exp_edges)
    return corpus
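
A minimal usage sketch (the input path is hypothetical): loadFile returns a
dict mapping each document filename to its (nodes, root_nodes, edges,
exp_edges) tuple, with nodes keyed by (concept,) and edges by
(concept1, concept2).

corpus = loadFile('amr-release.parsed')
for doc_name, (nodes, root_nodes, edges, exp_edges) in corpus.items():
    # count merged nodes, true edges, and expanded (fully-connected) edges
    print(doc_name, len(nodes), len(edges), len(exp_edges))
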
Example #2
import codecs
from collections import defaultdict, Counter

# AmrGraph and Triple are assumed to be provided by the surrounding project's
# AMR utilities; they are not standard-library names.


def getTriples(input_file):
    """
    get triples from gold-standard AMR annotations
    """
    triples = defaultdict(list)
    extended_triples = defaultdict(list)
    concepts = defaultdict(Counter)  # filename -> Counter of concepts

    graph_string = ''  # AMR graph
    tokens = ''  # raw '# ::snt' line of the current sentence
    line_num = -1  # line number within the file
    filename = ''  # current filename

    with codecs.open(input_file, 'r', 'utf-8') as infile:
        for line in infile:
            line = line.rstrip()

            # process each graph
            if line == '' and graph_string:
                g = AmrGraph()
                graph_tokens = graph_string.split()

                nodes, edges = g.getCollapsedNodesAndEdges(graph_tokens)

                # get triples
                for edge in edges:
                    node1 = edge.node1
                    node2 = edge.node2
                    relation = edge.relation

                    t = Triple(node1.concept, node2.concept, relation,
                               filename, line_num, tokens)
                    triples[filename].append(t)

                # get extended triples (ignore relation, but keep direction)
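                # note: all ordered pairs are emitted, including self-pairs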
                for node1 in nodes:
                    for node2 in nodes:
                        t = Triple(node1.concept, node2.concept, '', filename,
                                   line_num, tokens)
                        extended_triples[filename].append(t)

                # get concepts
                for node in nodes:
                    concepts[filename][node.concept] += 1

                # clear cache
                graph_string = ''
                filename = ''
                line_num = -1
                tokens = ''
                continue

            if line.startswith('# ::id'):
                id_fields = line.split()
                # ids have the form "filename.linenum"
                filename, line_num = id_fields[2].split('.')
                line_num = int(line_num)
                continue

            # capture the raw '# ::snt' line as sentence info for the triples
            if line.startswith('# ::snt'):
                tokens = line
                continue

            # get AMR graph string
            if line and not line.startswith('#'):
                graph_string += ' ' + line  # keep a token boundary between lines
                continue

    return triples, extended_triples, concepts
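
A short usage sketch (the input path is hypothetical): each returned mapping
is keyed by the filename part of the '# ::id' field, and concepts[fname] is a
Counter over that document's AMR concepts.

triples, extended_triples, concepts = getTriples('amr-gold.txt')
for fname in triples:
    print(fname, len(triples[fname]), concepts[fname].most_common(3))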