def get_subordinate_clauses(tiger_docgraph):
    """
    Collect the node IDs of subordinate clause constituents in a TIGER
    syntax tree document graph.

    An edge qualifies if its ``tiger:label`` is one of ``MO``, ``RC`` or
    ``SB``, its source node has the category ``S`` (``tiger:cat``) and its
    target node is not a token. The *target* node ID of each qualifying
    edge is returned.

    Parameters
    ----------
    tiger_docgraph : DiscourseDocumentGraph or TigerDocumentGraph
        document graph from which subordinate clauses will be extracted

    Returns
    -------
    subord_clause_nodes : list(str)
        target node IDs of all qualifying edges, in the order in which
        the edges were selected
    """
    candidate_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label', value=['MO', 'RC', 'SB'])

    return [
        child_id
        for parent_id, child_id in candidate_edges
        if tiger_docgraph.node[parent_id].get('tiger:cat') == 'S'
        and not dg.istoken(tiger_docgraph, child_id)
    ]
def __repair_unconnected_nodes(self):
    """
    Attach every previously unconnected node to the sentence root with a
    ``dominance_relation`` edge.

    According to the original notes, such nodes are token nodes that
    either represent a punctuation mark or belong to a headline
    'sentence' without full syntax structure annotation.

    If the current root is itself a token, a virtual root node
    (``<ns>:VROOT``) is created first and becomes the new ``self.root``.
    """
    # Must be determined before a virtual root is (possibly) added.
    orphan_ids = get_unconnected_nodes(self)

    root_is_token = dg.istoken(self, self.root)
    if root_is_token:
        # The sentence has no hierarchical structure: its root is a
        # terminal / token node. A virtual root compensates for this.
        self.root = self.ns + ':VROOT'
        vroot_layers = {'tiger', 'tiger:syntax', 'tiger:sentence',
                        'tiger:sentence:root'}
        self.add_node(self.root, layers=vroot_layers)

    for orphan_id in orphan_ids:
        # A fresh layer set per edge, exactly as in the original call.
        edge_layers = {self.ns,
                       self.ns + ':sentence',
                       self.ns + ':unconnected'}
        self.add_edge(self.root, orphan_id,
                      layers=edge_layers,
                      edge_type=EdgeTypes.dominance_relation)
def __gen_struct_anno_files(self, top_level_layer):
    """
    Build the struct annotation file for one top level layer.

    Such a file holds the attributes of non-token (struct) nodes, e.g.
    the syntactic category (NP, VP etc.) of a node. One ``multiFeat``
    element is created per non-token node of the layer; every node
    attribute that is not listed in ``IGNORED_NODE_ATTRIBS`` becomes a
    ``feat`` child.

    The resulting tree is registered in ``self.files`` and
    ``self.file2dtd`` under the generated PAULA ID, which is returned.

    See also: __gen_hierarchy_file()
    """
    paula_id = '{0}.{1}.{2}_{3}_struct'.format(
        top_level_layer, self.corpus_name, self.name, top_level_layer)
    E, tree = gen_paula_etree(paula_id)

    # The features refer to nodes declared in the layer's hierarchy file.
    hierarchy_paula_id = self.paulamap['hierarchy'][top_level_layer]
    feat_list = E('multiFeatList', {XMLBASE: hierarchy_paula_id+'.xml'})

    for node_id in select_nodes_by_layer(self.dg, top_level_layer):
        if istoken(self.dg, node_id):
            continue  # tokens are not annotated in this file

        multi_feat = E('multiFeat', {XLINKHREF: '#{0}'.format(node_id)})
        node_attrs = self.dg.node[node_id]
        for attr_name in node_attrs:
            if attr_name in IGNORED_NODE_ATTRIBS:
                continue
            multi_feat.append(
                E('feat', {'name': attr_name,
                           'value': node_attrs[attr_name]}))

        if self.human_readable:
            # adds node label as a <!--comment-->
            multi_feat.append(Comment(node_attrs.get('label')))
        feat_list.append(multi_feat)

    tree.append(feat_list)
    self.files[paula_id] = tree
    self.file2dtd[paula_id] = PaulaDTDs.multifeat
    return paula_id
def __gen_node_href(self, layer, node_id):
    """
    Build the complete xlink:href for an arbitrary docgraph node (token
    node, structure node etc.).

    Token nodes are resolved against the tokenization file, all other
    nodes against the hierarchy file of the given layer. Both file IDs
    are read from ``self.paulamap``, so this only works AFTER the
    corresponding PAULA files have been created and registered there.
    """
    target_paula_id = (self.paulamap['tokenization']
                       if istoken(self.dg, node_id)
                       else self.paulamap['hierarchy'][layer])
    return '{0}.xml#{1}'.format(target_paula_id, node_id)
def node2bracket(docgraph, node_id, child_str=''):
    """
    Render a docgraph node as a PTB-style bracket string.

    Token nodes become ``(POS token child)``, all other nodes become
    ``(label child)``. Separating spaces are only emitted between
    non-empty parts.
    """
    attrs = docgraph.node[node_id]

    if not istoken(docgraph, node_id):
        label = attrs.get('label', '')
        # only separate label and children if both are present
        gap = ' ' if (label and child_str) else ''
        return u"({label}{space}{child})".format(
            label=label, space=gap, child=child_str)

    pos = attrs.get(docgraph.ns+':pos', '')
    token = attrs[docgraph.ns+':token']
    after_pos = ' ' if pos else ''
    after_token = ' ' if child_str else ''
    return u"({pos}{space1}{token}{space2}{child})".format(
        pos=pos, space1=after_pos, token=token, space2=after_token,
        child=child_str)
def node2freqt(docgraph, node_id, child_str='', include_pos=False,
               escape_func=FREQT_ESCAPE_FUNC):
    """
    Render a docgraph node as a FREQT string.

    Non-token nodes are rendered as ``(label child)`` without any
    separator, falling back to the node ID if the node has no label.
    Token nodes are rendered as ``(token child)`` or, if ``include_pos``
    is set, as ``(POS(token) child)``. Every label, token and POS tag is
    passed through ``escape_func`` first.
    """
    attrs = docgraph.node[node_id]

    if not istoken(docgraph, node_id):
        label = escape_func(attrs.get('label', node_id))
        return u"({label}{child})".format(label=label, child=child_str)

    token = escape_func(attrs[docgraph.ns+':token'])
    if not include_pos:
        return u"({token}{child})".format(token=token, child=child_str)

    pos = escape_func(attrs.get(docgraph.ns+':pos', ''))
    return u"({pos}({token}){child})".format(
        pos=pos, token=token, child=child_str)
def node2bracket(docgraph, node_id, child_str=''):
    """
    Convert a docgraph node into a PTB-style string.

    A token node yields ``(POS token child)``; any other node yields
    ``(label child)``. A separating space is only inserted where the
    parts around it are non-empty.
    """
    node_data = docgraph.node[node_id]

    if istoken(docgraph, node_id):
        tag = node_data.get(docgraph.ns + ':pos', '')
        word = node_data[docgraph.ns + ':token']
        tag_sep = ' ' if tag else ''
        child_sep = ' ' if child_str else ''
        return u"({pos}{space1}{token}{space2}{child})".format(
            pos=tag, space1=tag_sep, token=word, space2=child_sep,
            child=child_str)

    # node is not a token
    category = node_data.get('label', '')
    if category and child_str:
        separator = ' '
    else:
        separator = ''
    return u"({label}{space}{child})".format(
        label=category, space=separator, child=child_str)
def traverse_dependencies_up(docgraph, node_id, node_attr=None):
    """
    Walk upwards along ingoing edges, starting at the given node, and
    yield the requested attribute of every node visited on the way.

    For each step, the source of the first ingoing edge is visited. Its
    attribute value is yielded if it is truthy. The walk continues as
    long as the visited node is a token.

    If ``node_attr`` is not given (or falsy), ``docgraph.lemma_attr`` is
    used as the attribute name.
    """
    attr_name = node_attr
    current_id = node_id

    while True:
        # there's only one, but we're in a multidigraph
        head_id, _dependent_id = docgraph.in_edges(current_id)[0]

        if not attr_name:
            attr_name = docgraph.lemma_attr

        value = docgraph.node[head_id].get(attr_name)
        if value:
            yield value

        if istoken(docgraph, head_id) is not True:
            return
        current_id = head_id
def node2freqt(docgraph, node_id, child_str='', include_pos=False,
               escape_func=FREQT_ESCAPE_FUNC):
    """
    Convert a docgraph node into a FREQT string.

    Tokens are emitted as ``(token child)`` or, with ``include_pos``,
    as ``(POS(token) child)``. Other nodes are emitted as
    ``(label child)``, using the node ID when no label is present. All
    emitted labels, tokens and POS tags are escaped with
    ``escape_func``.
    """
    node_data = docgraph.node[node_id]
    is_token = istoken(docgraph, node_id)

    if is_token and include_pos:
        word = escape_func(node_data[docgraph.ns + ':token'])
        tag = escape_func(node_data.get(docgraph.ns + ':pos', ''))
        return u"({pos}({token}){child})".format(
            pos=tag, token=word, child=child_str)

    if is_token:
        word = escape_func(node_data[docgraph.ns + ':token'])
        return u"({token}{child})".format(token=word, child=child_str)

    # node is not a token
    category = escape_func(node_data.get('label', node_id))
    return u"({label}{child})".format(label=category, child=child_str)
def __repair_unconnected_nodes(self):
    """
    Connect all previously unconnected nodes to the sentence root node
    via a ``dominance_relation`` edge.

    The original notes describe these nodes as token nodes that either
    represent a punctuation mark or are part of a headline 'sentence'
    that has no full syntax structure annotation.

    When the sentence root is a token itself, ``self.root`` is replaced
    by a newly added virtual root node named ``<ns>:VROOT``.
    """
    # Collected up front, i.e. before a virtual root might be inserted.
    dangling_ids = get_unconnected_nodes(self)

    if dg.istoken(self, self.root):
        # No hierarchical structure here: the root node is also a
        # terminal / token node, so a virtual root is put on top of it.
        self.root = self.ns+':VROOT'
        self.add_node(
            self.root,
            layers={'tiger', 'tiger:syntax', 'tiger:sentence',
                    'tiger:sentence:root'})

    for dangling_id in dangling_ids:
        self.add_edge(
            self.root, dangling_id,
            layers={self.ns, self.ns+':sentence', self.ns+':unconnected'},
            edge_type=EdgeTypes.dominance_relation)
def get_rst_relations(docgraph):
    """
    returns a dictionary with RST relation root node IDs (str, e.g. 'rst:23')
    as keys and dictionaries describing these RST relations as values.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains RST annotations

    Returns
    -------
    rst_relations : defaultdict(defaultdict(str))
        maps from an RST relation root node ID (str, e.g. 'rst:23') to a
        dictionary describing this RST relation.
        possible keys: 'tokens', 'nucleus', 'satellites', 'multinuc'

        The key 'tokens' maps to the naturally sorted list of token node
        IDs that are direct neighbors of the relation root node. It is
        only set if there is at least one such token.

        The key 'nucleus' maps to a (node ID (str), span) pair, where
        span is the result of ``get_span()`` for that node. It is set for
        a neighbor connected via a spanning relation edge; if there are
        several, the last one processed wins.

        The key 'satellites' maps to a list of (node ID (str), RST
        relation name (str), span) triples; each one describes a
        satellite (an edge whose 'rst:rel_type' is 'rst').

        The key 'multinuc' maps to a list of (node ID (str), RST relation
        name (str), span) triples; each one describes a nucleus of a
        multinuclear relation (an edge whose 'rst:rel_type' is
        'multinuc').

        As the inner dictionaries are ``defaultdict(str)``, looking up a
        key that was never set yields an empty string.

    Raises
    ------
    NotImplementedError
        if a non-spanning edge has an 'rst:rel_type' other than 'rst' or
        'multinuc'
    """
    rst_relations = defaultdict(lambda: defaultdict(str))

    # Only the root node ID is needed here; the relation name and tokens
    # delivered alongside it are not used.
    for dom_node, _relname, _toks in get_rst_relation_root_nodes(docgraph):
        neighbors = list(
            select_neighbors_by_layer(docgraph, dom_node,
                                      layer={'rst:segment', 'rst:group'}))

        directly_dominated_tokens = sorted(
            [node for node in docgraph.neighbors(dom_node)
             if istoken(docgraph, node)],
            key=natural_sort_key)
        if directly_dominated_tokens:
            rst_relations[dom_node]['tokens'] = directly_dominated_tokens

        for neighbor in neighbors:
            # multidigraph: there may be several parallel edges
            for edge_attrs in docgraph[dom_node][neighbor].values():
                if edge_attrs['edge_type'] == EdgeTypes.spanning_relation:
                    # a span always signifies the nucleus of a relation
                    # there can be only one
                    rst_relations[dom_node]['nucleus'] = \
                        (neighbor, get_span(docgraph, neighbor))
                elif edge_attrs['rst:rel_type'] == 'rst':
                    # a segment/group nucleus can dominate multiple
                    # satellites (in different RST relations)
                    satellite = (neighbor, edge_attrs['rst:rel_name'],
                                 get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'satellites', []).append(satellite)
                elif edge_attrs['rst:rel_type'] == 'multinuc':
                    nucleus = (neighbor, edge_attrs['rst:rel_name'],
                               get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'multinuc', []).append(nucleus)
                else:
                    raise NotImplementedError(
                        "unknown type of RST segment domination")
    return rst_relations
def __gen_hierarchy_file(self, layer):
    """
    Build the hierarchy file (a <structList> element) for one top level
    layer.

    Hierarchical structures are used to create hierarchically nested
    annotation graphs (e.g. to express consists-of relationships or
    dominance edges in syntax trees, RST). A <struct> element is
    created for each hierarchical node (e.g. an NP), with one <rel>
    element per dominated element (e.g. tokens, other <struct>
    elements). Dominance edges starting at ``<layer>:root_node`` are
    left out; spanning edges are included if their target is a token.

    NOTE: The types/labels of these newly created hierarchical nodes and
    edges aren't stored in this file, but in feat/multiFeat files
    referencing this one! See: __gen_struct_anno_files() and
    __gen_rel_anno_file().

    There will be one hierarchy file for each top level layer. The
    generated PAULA ID is registered in ``self.paulamap``, ``self.files``
    and ``self.file2dtd`` and then returned.

    TODO: check, if we can omit hierarchy files for layers that don't
    contain dominance edges
    """
    paula_id = '{0}.{1}.{2}_{3}'.format(
        layer, self.corpus_name, self.name, layer)
    self.paulamap['hierarchy'][layer] = paula_id
    E, tree = gen_paula_etree(paula_id)

    dominance_edges = select_edges_by(
        self.dg, layer=layer, edge_type=EdgeTypes.dominance_relation,
        data=True)
    span_edges = select_edges_by(
        self.dg, layer=layer, edge_type=EdgeTypes.spanning_relation,
        data=True)

    # parent node ID -> child node ID -> attributes of the connecting edge
    children_of = defaultdict(lambda: defaultdict(str))
    for parent_id, child_id, attrs in dominance_edges:
        if parent_id == layer+':root_node':
            continue
        children_of[parent_id][child_id] = attrs

    # in PAULA XML, token spans are also part of the hierarchy
    for parent_id, child_id, attrs in span_edges:
        if not istoken(self.dg, child_id):
            continue
        children_of[parent_id][child_id] = attrs

    # NOTE: we don't add a base file here, because the nodes could be
    # tokens or structural nodes
    struct_list = E('structList', {'type': layer})
    for parent_id, children in children_of.items():
        struct = E('struct', {'id': str(parent_id)})
        if self.human_readable:
            struct.append(Comment(self.dg.node[parent_id].get('label')))

        for child_id, attrs in children.items():
            # tokens live in the tokenization file, everything else is
            # referenced within this very file
            href = ('{0}.xml#{1}'.format(self.paulamap['tokenization'],
                                         child_id)
                    if istoken(self.dg, child_id)
                    else '#{0}'.format(child_id))

            rel_attribs = {
                'id': 'rel_{0}_{1}'.format(parent_id, child_id),
                'type': attrs['edge_type'],
                XLINKHREF: href,
            }
            struct.append(E('rel', rel_attribs))
            if self.human_readable:
                struct.append(
                    Comment(self.dg.node[child_id].get('label')))
        struct_list.append(struct)

    tree.append(struct_list)
    self.files[paula_id] = tree
    self.file2dtd[paula_id] = PaulaDTDs.struct
    return paula_id