import codecs

# NOTE: AmrGraph, AmrEdge, NodeSource, EdgeSource, and Triple are assumed to
# be provided by the project's AMR utility modules; they are not defined here.


def loadFile(input_filename):
    """Load an AMR-parsed file and re-index AMR nodes and edges.

    Returns a corpus mapping each document filename to a tuple of
    (nodes, root_nodes, edges, expanded_edges).
    """
    graph_str = ''       # AMR parse
    info_dict = {}       # AMR meta info
    doc_filename = ''
    corpus = {}          # filename -> (nodes, root_nodes, edges, exp_edges)
    doc_nodes = {}       # (concept,) -> AmrNode
    doc_root_nodes = {}  # (concept,) -> AmrNode
    doc_edges = {}       # (concept1, concept2) -> AmrEdge
    doc_exp_edges = {}   # (concept1, concept2) -> AmrEdge

    with codecs.open(input_filename, 'r', 'utf-8') as infile:
        for line in infile:
            line = line.rstrip()

            if line == '':
                # no AMR graph for current sentence
                if graph_str == '':
                    info_dict = {}
                    continue

                # get nodes and edges (linked)
                g = AmrGraph()
                nodes, edges = g.getCollapsedNodesAndEdges(graph_str.split())

                # index nodes by graph_idx
                node_indices = {}
                for node in nodes:
                    node_indices.setdefault(node.graph_idx, node)

                if 'alignments' not in info_dict:
                    # (1) use gold AMR annotation as input
                    sentence = info_dict['snt']  # sentence text
                    filename, line_num = info_dict['id'].split('.')
                    line_num = int(line_num)

                    # add source info to nodes
                    for node in nodes:
                        node_source = NodeSource(node.graph_idx, 0, 0, '',
                                                 filename, line_num, sentence)
                        node.sources.append(node_source)

                    # add source info to edges
                    for edge in edges:
                        edge_source = EdgeSource(edge.relation, filename,
                                                 line_num, sentence)
                        edge.sources.append(edge_source)

                else:
                    # (2) use alignment file as input
                    sentence = info_dict['tok']  # tokenized sentence
                    tokens = sentence.split()
                    filename, line_num = info_dict['id'].split('.')
                    line_num = int(line_num)

                    # add source info to edges
                    for edge in edges:
                        edge_source = EdgeSource(edge.relation, filename,
                                                 line_num, sentence)
                        edge.sources.append(edge_source)

                    # add alignment and other source info to nodes
                    alignments_str = info_dict['alignments']
                    for alignment in alignments_str.split():
                        word_part, graph_part = alignment.split('|')
                        start_idx, end_idx = map(int, word_part.split('-'))
                        for graph_idx in graph_part.split('+'):
                            curr_node = node_indices.get(graph_idx)
                            if curr_node is None:
                                continue
                            # widen existing start_idx/end_idx to the
                            # broadest token coverage
                            new_start_idx = start_idx
                            new_end_idx = end_idx
                            if curr_node.sources:
                                curr_node_source = curr_node.sources.pop()
                                if new_start_idx > curr_node_source.start_idx:
                                    new_start_idx = curr_node_source.start_idx
                                if new_end_idx < curr_node_source.end_idx:
                                    new_end_idx = curr_node_source.end_idx
                            # update new node source
                            new_node_source = NodeSource(
                                curr_node.graph_idx, new_start_idx, new_end_idx,
                                ' '.join(tokens[new_start_idx:new_end_idx]),
                                filename, line_num, sentence)
                            curr_node.sources.append(new_node_source)

                    # add source info to unaligned nodes
                    for node in nodes:
                        if node.sources:
                            continue
                        node_source = NodeSource(node.graph_idx, 0, 0, '',
                                                 filename, line_num, sentence)
                        node.sources.append(node_source)

                # start of new file
                if filename != doc_filename:
                    if doc_filename != '':
                        corpus[doc_filename] = (doc_nodes, doc_root_nodes,
                                                doc_edges, doc_exp_edges)
                    doc_filename = filename
                    doc_nodes = {}
                    doc_root_nodes = {}
                    doc_edges = {}
                    doc_exp_edges = {}

                # keep track of redirected nodes
                redirect_dict = {}

                # merge nodes
                first_node = True
                for node in nodes:
                    curr_anchor = (node.concept,)  # one-element tuple key
                    if curr_anchor in doc_nodes:
                        old_node = doc_nodes[curr_anchor]
                        old_node.sources.extend(node.sources)
                        redirect_dict[node] = old_node
                    else:
                        doc_nodes[curr_anchor] = node
                    # root node of sentence
                    if first_node:
                        doc_root_nodes[curr_anchor] = doc_nodes[curr_anchor]
                        first_node = False

                # merge edges
                edge_indices = {}  # index edges by concept pair
                for edge in edges:
                    # update node linkage
                    if edge.node1 in redirect_dict:
                        edge.node1 = redirect_dict[edge.node1]
                    if edge.node2 in redirect_dict:
                        edge.node2 = redirect_dict[edge.node2]
                    # key on concepts only, ignoring the relation
                    curr_anchor = (edge.node1.concept, edge.node2.concept)
                    edge_indices[curr_anchor] = edge
                    if curr_anchor in doc_edges:
                        old_edge = doc_edges[curr_anchor]
                        old_edge.sources.extend(edge.sources)
                    else:
                        doc_edges[curr_anchor] = edge

                # expand edges: nodes in each sentence are fully connected
                for node1 in nodes:
                    for node2 in nodes:
                        curr_anchor = (node1.concept, node2.concept)
                        redirect_node1 = doc_nodes[(node1.concept,)]
                        redirect_node2 = doc_nodes[(node2.concept,)]
                        if curr_anchor in doc_exp_edges:
                            # expanded edge exists: update node linkage
                            old_edge = doc_exp_edges[curr_anchor]
                            old_edge.node1 = redirect_node1
                            old_edge.node2 = redirect_node2
                            # update edge sources
                            if curr_anchor in edge_indices:  # true edge
                                edge = edge_indices[curr_anchor]
                                old_edge.sources.extend(edge.sources)
                            else:  # NULL edge
                                edge_source = EdgeSource('NULL', filename,
                                                         line_num, sentence)
                                old_edge.sources.append(edge_source)
                        else:
                            # expanded edge does not exist, build a new edge
                            if curr_anchor in edge_indices:  # true edge
                                edge = edge_indices[curr_anchor]
                                new_edge = AmrEdge(node1=redirect_node1,
                                                   node2=redirect_node2,
                                                   relation=edge.relation)
                                new_edge.sources.extend(edge.sources)
                            else:  # NULL edge
                                new_edge = AmrEdge(node1=redirect_node1,
                                                   node2=redirect_node2,
                                                   relation='NULL')
                                edge_source = EdgeSource('NULL', filename,
                                                         line_num, sentence)
                                new_edge.sources.append(edge_source)
                            doc_exp_edges[curr_anchor] = new_edge

                # clear cache
                graph_str = ''
                info_dict = {}
                continue

            if line.startswith('#'):
                # metadata line, e.g. "# ::id docname.lineno ::snt ..."
                fields = line.split('::')
                for field in fields[1:]:
                    tokens = field.split()
                    info_dict[tokens[0]] = ' '.join(tokens[1:])
                continue

            graph_str += line

    # add nodes and edges of the last file
    corpus[doc_filename] = (doc_nodes, doc_root_nodes, doc_edges,
                            doc_exp_edges)
    return corpus
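

# --- Usage sketch (illustrative; not part of the original module) ---
# Expected input: a JAMR-style file of blocks, each with metadata lines such
# as "# ::id docname.lineno ::snt ..." (gold annotations) or "# ::tok ..."
# plus "# ::alignments 0-1|0.0+0.0.0 ..." (aligned parses), followed by the
# AMR graph and a blank line. The path below is hypothetical.
def _demo_load(path='data/aligned.amr'):  # hypothetical path
    corpus = loadFile(path)
    for doc_filename, (nodes, root_nodes, edges, exp_edges) in corpus.items():
        print('%s: %d nodes (%d roots), %d edges, %d expanded edges' %
              (doc_filename, len(nodes), len(root_nodes), len(edges),
               len(exp_edges)))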


from collections import Counter, defaultdict


def getTriples(input_file):
    """Get triples from gold-standard AMR annotations."""
    triples = defaultdict(list)
    extended_triples = defaultdict(list)
    concepts = defaultdict(Counter)  # filename -> Counter of concepts

    graph_string = ''  # AMR graph
    tokens = ''        # current sentence line
    line_num = -1      # line number in file
    filename = ''      # current filename

    with codecs.open(input_file, 'r', 'utf-8') as infile:
        for line in infile:
            line = line.rstrip()

            # process each graph
            if line == '' and graph_string:
                g = AmrGraph()
                graph_tokens = graph_string.split()
                nodes, edges = g.getCollapsedNodesAndEdges(graph_tokens)

                # get triples
                for edge in edges:
                    t = Triple(edge.node1.concept, edge.node2.concept,
                               edge.relation, filename, line_num, tokens)
                    triples[filename].append(t)

                # get extended triples (ignore relation, but keep direction)
                for node1 in nodes:
                    for node2 in nodes:
                        t = Triple(node1.concept, node2.concept, '',
                                   filename, line_num, tokens)
                        extended_triples[filename].append(t)

                # get concepts
                for node in nodes:
                    concepts[filename][node.concept] += 1

                # clear cache
                graph_string = ''
                filename = ''
                line_num = -1
                tokens = ''
                continue

            if line.startswith('# ::id'):
                fields = line.split()
                filename, line_num = fields[2].split('.')
                line_num = int(line_num)
                continue

            # keep the sentence line; the prefix check also matches
            # "# ::snt-type" (summary, body, etc.)
            if line.startswith('# ::snt'):
                tokens = line
                continue

            # accumulate the AMR graph string
            if line and not line.startswith('#'):
                graph_string += line
                continue

    return triples, extended_triples, concepts
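

# --- Usage sketch (illustrative; not part of the original module) ---
# getTriples keys everything by document filename; a minimal example of
# inspecting the extracted triples and the most frequent concepts per
# document. The path below is hypothetical.
def _demo_triples(path='data/gold.amr'):  # hypothetical path
    triples, extended_triples, concepts = getTriples(path)
    for fname in triples:
        print('%s: %d triples, %d extended triples' %
              (fname, len(triples[fname]), len(extended_triples[fname])))
        print('top concepts:', concepts[fname].most_common(5))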