Exemplo n.º 1
0
        ID of the PP is returned.
    """
    potential_markables = []

    for node_id, nattr in dg.select_nodes_by_layer(docgraph,
                                                   'tiger:syntax',
                                                   data=True):
        if nattr['tiger:cat'] == 'NP':
            # if an NP is embedded into a PP, only print the PP
            pp_parent = False
            for source, target in docgraph.in_edges(node_id):
                parent_node = docgraph.node[source]
                if 'tiger:cat' in parent_node and parent_node[
                        'tiger:cat'] == 'PP':
                    potential_markables.append(source)  # add parent PP phrase
                    pp_parent = True
            if not pp_parent:
                potential_markables.append(node_id)  # add NP phrase

        elif nattr['tiger:cat'] == 'PP':
            potential_markables.append(node_id)  # add PP phrase
    return potential_markables


# Pseudo-function: ``read_mmax2`` is an alias for the MMAXDocumentGraph class,
# so calling ``read_mmax2(...)`` instantiates an MMAX document graph.
read_mmax2 = MMAXDocumentGraph

# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == "__main__":
    generic_converter_cli(MMAXDocumentGraph,
                          '*.mmax file (MMAX2 annotation file)')
Exemplo n.º 2
0
    node IDs of nodes representing subordinate clause constituents.

    Parameters
    ----------
    tiger_docgraph : DiscourseDocumentGraph or TigerDocumentGraph
        document graph from which subordinate clauses will be extracted

    Returns
    -------
    subord_clause_nodes : list(str)
        list of node IDs of nodes directly dominating subordinate clauses
    """
    subord_clause_rels = \
        dg.select_edges_by_attribute(
            tiger_docgraph, attribute='tiger:label',
            value=['MO', 'RC', 'SB'])

    subord_clause_nodes = []
    for src_id, target_id in subord_clause_rels:
        src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
        if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
            subord_clause_nodes.append(target_id)
    return subord_clause_nodes


# Pseudo-function to create a document graph from a Tiger XML file:
# ``read_tiger`` is an alias for the TigerDocumentGraph class.
read_tiger = TigerDocumentGraph

# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == '__main__':
    generic_converter_cli(TigerDocumentGraph, 'TigerXML (syntax)')
Exemplo n.º 3
0
                    nucleus_index = child_types['Nucleus'][0]
                    satellite_index = child_types['Satellite'][0]

                    nucleus_node_id = get_node_id(children[nucleus_index],
                                                  self.ns)
                    satellite_node_id = get_node_id(children[satellite_index],
                                                    self.ns)
                    self.add_edge(node_id,
                                  nucleus_node_id,
                                  attr_dict={self.ns + ':rel_type': 'span'},
                                  edge_type=EdgeTypes.spanning_relation)
                    self.add_edge(
                        nucleus_node_id,
                        satellite_node_id,
                        attr_dict={self.ns + ':rel_type': relation_type},
                        edge_type=EdgeTypes.dominance_relation)
                else:
                    raise ValueError(
                        "Unexpected child combinations: {}\n".format(
                            child_types))

                for child in children:
                    self.parse_dis_tree(child, indent=indent + 1)


# Pseudo-function to create a document graph from a RST (.dis) file:
# ``read_dis`` is an alias for the RSTLispDocumentGraph class.
read_dis = RSTLispDocumentGraph

# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == '__main__':
    generic_converter_cli(RSTLispDocumentGraph, 'RST (rhetorical structure)')
Exemplo n.º 4
0
        if 'satellites' in rst_relations[dom_node]:
            # find the nucleus
            if 'nucleus' in rst_relations[dom_node]:
                nuc_id, nuc_toks = rst_relations[dom_node]['nucleus']
                nuc_start, nuc_end = get_segment_token_offsets(nuc_toks, token_map)
            elif 'multinuc' in rst_relations[dom_node]:
                nuc_id = dom_node # multinuc as a whole is the nucleus
                nuc_start, nuc_end = multinuc_start, multinuc_end
            elif 'tokens' in rst_relations[dom_node]:
                nuc_id = dom_node # dominating segment node directly dominates these tokens
                nuc_start, nuc_end = get_segment_token_offsets(rst_relations[dom_node]['tokens'], token_map)
            else:
                raise ValueError("Can't find a nucleus for these satellites: {}".format(rst_relations[dom_node]['satellites']))

            sat_spans = rst_relations[dom_node]['satellites']
            for satellite, relname, sat_toks in sat_spans:
                sat_start, sat_end = get_segment_token_offsets(sat_toks, token_map)
                nucleus_span =  ("{0}-{1}".format(nuc_id, satellite), 'N', relname, nuc_start, nuc_end)
                all_spans.append(nucleus_span)
                satellite_span = ("{0}-{1}".format(nuc_id, satellite), 'S', relname, sat_start, sat_end)
                all_spans.append(satellite_span)
    return all_spans


# Pseudo-function(s) to create a document graph from a RST (.rs3) file:
# both ``read_rst`` and ``read_rs3`` are aliases for the RSTGraph class.
read_rst = read_rs3 = RSTGraph


# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == '__main__':
    generic_converter_cli(RSTGraph, 'RST (rhetorical structure)')
Exemplo n.º 5
0
    assert anaphora in ('das', 'es')
    ret_str = u''
    annotated_token_ids = [tok_id for tok_id in dg.select_nodes_by_layer(docgraph, docgraph.ns+':annotated')
                           if docgraph.get_token(tok_id).lower() == anaphora]
    for token_id in docgraph.tokens:
        if token_id in annotated_token_ids:
            certainty_str = '' if docgraph.ns+':certainty' == '1.0' else '?'
            ret_str += u'{0}/{1}{2} '.format(
                docgraph.get_token(token_id),
                ANNOTATIONS[docgraph.node[token_id][docgraph.ns+':annotation']],
                certainty_str)
        else:
            ret_str += u'{} '.format(docgraph.get_token(token_id))
    return ret_str


def write_anaphoricity(docgraph, output_path, anaphora='das'):
    """
    Write the anaphoricity string of a document graph to a UTF-8 file.

    The text is produced by
    ``gen_anaphoricity_str(docgraph, anaphora=anaphora)``. If ``output_path``
    has a directory part, that directory is passed to
    ``dg.util.create_dir`` before the file is opened for writing.

    Parameters
    ----------
    docgraph : document graph
        the graph that is handed to ``gen_anaphoricity_str``
    output_path : str
        path of the file to be written; an existing file is overwritten
        (it is opened in ``'w'`` mode)
    anaphora : str
        passed through to ``gen_anaphoricity_str`` (default: ``'das'``)
    """
    outpath = os.path.dirname(output_path)
    # A bare file name (e.g. 'out.txt') has an empty directory part; there is
    # no directory to create then, so don't pass '' to create_dir.
    if outpath:
        dg.util.create_dir(outpath)
    with codecs.open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write(gen_anaphoricity_str(docgraph, anaphora=anaphora))


# Pseudo-function to create a document graph from an anaphoricity file:
# ``read_anaphoricity`` is an alias for the AnaphoraDocumentGraph class.
read_anaphoricity = AnaphoraDocumentGraph


# When executed as a script, hand the graph class and a short file
# descriptor string to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == '__main__':
    generic_converter_cli(AnaphoraDocumentGraph,
                          file_descriptor='anaphoricity')
Exemplo n.º 6
0
    Yields
    ------
    relations : str or (str, str, list of str) tuples
        If data=False, this will just yield node IDs of the nodes that
        directly dominate an RST relation. If data=True, this yields
        tuples of the form: (node ID, relation name, list of tokens that this
        relation spans).
    """
    for unit_id in sorted(select_nodes_by_layer(docgraph,
                                                conano_namespace + ':unit'),
                          key=natural_sort_key):
        yield (unit_id, get_span(docgraph, unit_id)) if data else (unit_id)


def get_connective(docgraph, unit_id):
    """
    Return the lowercased connective string of the given Conano unit.

    Parameters
    ----------
    docgraph : document graph
        graph that provides ``get_token`` and is passed to ``get_span``
    unit_id : str
        unit node ID consisting of exactly two parts separated by ``':'``
        (anything else raises a ``ValueError`` when it is unpacked)

    Returns
    -------
    connective : str
        the lowercased tokens spanned by the node ``<first part>:connective``,
        joined by single spaces
    """
    index_part, _type_part = unit_id.split(':')
    span_token_ids = get_span(docgraph, index_part + ':connective')

    words = []
    for token_id in span_token_ids:
        words.append(docgraph.get_token(token_id).lower())
    return ' '.join(words)


# Pseudo-function to create a document graph from a ConanoXML file:
# ``read_conano`` is an alias for the ConanoDocumentGraph class.
read_conano = ConanoDocumentGraph

# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == "__main__":
    generic_converter_cli(ConanoDocumentGraph, 'ConanoXML (connectives)')
Exemplo n.º 7
0
        in the input document. If an NP is embedded in a PP, only the node
        ID of the PP is returned.
    """
    potential_markables = []

    for node_id, nattr in dg.select_nodes_by_layer(docgraph, 'tiger:syntax', data=True):
        if nattr['tiger:cat'] == 'NP':
            # if an NP is embedded into a PP, only print the PP
            pp_parent = False
            for source, target in docgraph.in_edges(node_id):
                parent_node = docgraph.node[source]
                if 'tiger:cat' in parent_node and parent_node['tiger:cat'] == 'PP':
                    potential_markables.append(source) # add parent PP phrase
                    pp_parent = True
            if not pp_parent:
                potential_markables.append(node_id) # add NP phrase

        elif nattr['tiger:cat'] == 'PP':
            potential_markables.append(node_id) # add PP phrase
    return potential_markables



# Pseudo-function: ``read_mmax2`` is an alias for the MMAXDocumentGraph class,
# so calling ``read_mmax2(...)`` instantiates an MMAX document graph.
read_mmax2 = MMAXDocumentGraph


# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == "__main__":
    generic_converter_cli(MMAXDocumentGraph,
                          '*.mmax file (MMAX2 annotation file)')
Exemplo n.º 8
0
    Yields
    ------
    relations : str or (str, str, list of str) tuples
        If data=False, this will just yield node IDs of the nodes that
        directly dominate an RST relation. If data=True, this yields
        tuples of the form: (node ID, relation name, list of tokens that this
        relation spans).
    """
    for unit_id in sorted(select_nodes_by_layer(docgraph, conano_namespace+':unit'),
                          key=natural_sort_key):
        yield (unit_id, get_span(docgraph, unit_id)) if data else (unit_id)


def get_connective(docgraph, unit_id):
    """
    Return the connective of a Conano unit as one lowercased string.

    ``unit_id`` is split at ``':'`` into exactly two parts (a ``ValueError``
    is raised otherwise). The first part, extended by ``':connective'``, is
    the ID of the node whose span is looked up with ``get_span``. The tokens
    of that span are lowercased and joined by single spaces.
    """
    prefix, _suffix = unit_id.split(':')
    token_ids = get_span(docgraph, prefix + ':connective')
    lowered = [docgraph.get_token(token_id).lower() for token_id in token_ids]
    return ' '.join(lowered)


# Pseudo-function to create a document graph from a ConanoXML file:
# ``read_conano`` is an alias for the ConanoDocumentGraph class.
read_conano = ConanoDocumentGraph


# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == "__main__":
    generic_converter_cli(ConanoDocumentGraph, 'ConanoXML (connectives)')
Exemplo n.º 9
0
    Parameters
    ----------
    tiger_docgraph : DiscourseDocumentGraph or TigerDocumentGraph
        document graph from which subordinate clauses will be extracted

    Returns
    -------
    subord_clause_nodes : list(str)
        list of node IDs of nodes directly dominating subordinate clauses
    """
    subord_clause_rels = \
        dg.select_edges_by_attribute(
            tiger_docgraph, attribute='tiger:label',
            value=['MO', 'RC', 'SB'])

    subord_clause_nodes = []
    for src_id, target_id in subord_clause_rels:
        src_cat = tiger_docgraph.node[src_id].get('tiger:cat')
        if src_cat == 'S' and not dg.istoken(tiger_docgraph, target_id):
            subord_clause_nodes.append(target_id)
    return subord_clause_nodes


# Pseudo-function to create a document graph from a Tiger XML file:
# ``read_tiger`` is an alias for the TigerDocumentGraph class.
read_tiger = TigerDocumentGraph


# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == '__main__':
    generic_converter_cli(TigerDocumentGraph, 'TigerXML (syntax)')
Exemplo n.º 10
0
                            edge_type=EdgeTypes.dominance_relation)
                    else:
                        assert tree_type == 'Satellite'
                        raise NotImplementedError("I don't know how to combine two satellites")

                elif len(child_types['Satellite']) == 1 and len(child_types['Nucleus']) == 1:
                    # standard RST relation, where one satellite is dominated by one nucleus
                    nucleus_index = child_types['Nucleus'][0]
                    satellite_index = child_types['Satellite'][0]

                    nucleus_node_id = get_node_id(children[nucleus_index], self.ns)
                    satellite_node_id = get_node_id(children[satellite_index], self.ns)
                    self.add_edge(node_id, nucleus_node_id, attr_dict={self.ns+':rel_type': 'span'},
                                  edge_type=EdgeTypes.spanning_relation)
                    self.add_edge(nucleus_node_id, satellite_node_id,
                                  attr_dict={self.ns+':rel_type': relation_type},
                                  edge_type=EdgeTypes.dominance_relation)
                else:
                    raise ValueError("Unexpected child combinations: {}\n".format(child_types))

                for child in children:
                    self.parse_dis_tree(child, indent=indent+1)


# Pseudo-function to create a document graph from a RST (.dis) file:
# ``read_dis`` is an alias for the RSTLispDocumentGraph class.
read_dis = RSTLispDocumentGraph


# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == '__main__':
    generic_converter_cli(RSTLispDocumentGraph, 'RST (rhetorical structure)')
Exemplo n.º 11
0
            if span_element.tag == 'act':  # doc can have 0+ acts
                self._add_spanning_relation('act_{}'.format(self.act_count),
                                            token_id)
            else:  # <intro> or <conclu>
                self._add_spanning_relation(span_element.tag, token_id)
        if span_element.tag == 'act':
            self.act_count += 1

    def _add_dominance_relation(self, source, target):
        """
        Add a dominance relation from ``source`` to ``target``.

        ``target`` is first added as a node in the layers ``self.ns`` and
        ``self.ns + ':unit'``. Then an edge from ``source`` to ``target`` is
        added in the layers ``self.ns`` and ``self.ns + ':discourse'`` with
        the edge type ``EdgeTypes.dominance_relation``.
        """
        # TODO: fix #39, so we don't need to add nodes by hand
        node_layers = {self.ns, self.ns + ':unit'}
        self.add_node(target, layers=node_layers)

        edge_layers = {self.ns, self.ns + ':discourse'}
        self.add_edge(source, target, layers=edge_layers,
                      edge_type=EdgeTypes.dominance_relation)

    def _add_spanning_relation(self, source, target):
        """
        Add a spanning relation from ``source`` to ``target``.

        The edge is added in the layers ``self.ns`` and
        ``self.ns + ':unit'`` with the edge type
        ``EdgeTypes.spanning_relation``.
        """
        unit_layers = {self.ns, self.ns + ':unit'}
        self.add_edge(source, target, layers=unit_layers,
                      edge_type=EdgeTypes.spanning_relation)


# Pseudo-function to create a document graph from a DeCour XML file:
# ``read_decour`` is an alias for the DecourDocumentGraph class.
read_decour = DecourDocumentGraph

# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == "__main__":
    generic_converter_cli(DecourDocumentGraph, 'DeCour (court transcripts)')
Exemplo n.º 12
0
        for token in span_element.text.split():
            token_id = self._add_token_to_document(token)
            if span_element.tag == 'act':  # doc can have 0+ acts
                self._add_spanning_relation('act_{}'.format(self.act_count),
                                            token_id)
            else:  # <intro> or <conclu>
                self._add_spanning_relation(span_element.tag, token_id)
        if span_element.tag == 'act':
            self.act_count += 1

    def _add_dominance_relation(self, source, target):
        """
        Make ``source`` dominate ``target`` in this docgraph.

        Two steps are performed: ``target`` is added as a node (layers:
        ``self.ns`` and ``self.ns + ':unit'``), then a
        ``EdgeTypes.dominance_relation`` edge from ``source`` to ``target``
        is added (layers: ``self.ns`` and ``self.ns + ':discourse'``).
        """
        # TODO: fix #39, so we don't need to add nodes by hand
        unit_layers = {self.ns, self.ns + ':unit'}
        self.add_node(target, layers=unit_layers)

        discourse_layers = {self.ns, self.ns + ':discourse'}
        self.add_edge(
            source,
            target,
            layers=discourse_layers,
            edge_type=EdgeTypes.dominance_relation)

    def _add_spanning_relation(self, source, target):
        """
        Make ``source`` span ``target`` in this docgraph.

        Adds a ``EdgeTypes.spanning_relation`` edge from ``source`` to
        ``target`` in the layers ``self.ns`` and ``self.ns + ':unit'``.
        """
        span_layers = {self.ns, self.ns + ':unit'}
        self.add_edge(
            source,
            target,
            layers=span_layers,
            edge_type=EdgeTypes.spanning_relation)


# Pseudo-function to create a document graph from a DeCour XML file:
# ``read_decour`` is an alias for the DecourDocumentGraph class.
read_decour = DecourDocumentGraph


# When executed as a script, hand the graph class and a short description of
# the file format to generic_converter_cli.
# NOTE(review): generic_converter_cli is defined elsewhere; presumably it
# provides the command-line entry point -- confirm.
if __name__ == "__main__":
    generic_converter_cli(DecourDocumentGraph, 'DeCour (court transcripts)')