Пример #1
0
    def root2tree(self, start_node=None):
        """Build one tree from the root node(s) registered below ``start_node``.

        The children of ``start_node`` are looked up in ``self.child_dict``:

        - exactly one root: the tree built by ``self.dt`` for it is returned
        - several roots: each root's tree is passed through ``n_wrap``, the
          results are ordered with ``self.sort_subtrees`` and attached to a
          node labelled ``VIRTUAL_ROOT``
        - no root: the empty tree ``t('')`` is returned
        """
        root_ids = self.child_dict[start_node]
        root_count = len(root_ids)

        if root_count == 0:
            return t('')
        if root_count == 1:
            return self.dt(start_node=root_ids[0])

        # An undesired, but common case (at least in the PCC corpus).
        # This happens if there's one EDU not connected to the rest of the
        # tree (e.g. a headline). All 'root' nodes simply become part of a
        # multinuc relation called VIRTUAL_ROOT.
        message = "File '{}' has {} roots!".format(
            os.path.basename(self.filepath), root_count)
        logging.log(logging.INFO, message)

        wrapped_roots = []
        for root_id in root_ids:
            subtree = self.dt(start_node=root_id)
            wrapped_roots.append(
                n_wrap(subtree, debug=self.debug, root_id=root_id))
        ordered_roots = self.sort_subtrees(*wrapped_roots)

        # The virtual root takes over the root_id of the highest subtree
        # (on equal height, the tuple comparison falls back to the root_id).
        _, virtual_root_id = max(
            (subtree.height(), subtree.root_id) for subtree in ordered_roots)

        return t(VIRTUAL_ROOT,
                 ordered_roots,
                 debug=self.debug,
                 root_id=virtual_root_id)
Пример #2
0
    def convert_schema(self, nuc_tuple, inner_sat_tuples, outer_sat_tuples):
        """Fold a nucleus and its satellites into one nested relation tree.

        All subtrees are given as ``(tree, linear tree position)`` tuples.
        The satellites are attached one after another (inner ones first);
        each satellite is placed before or after the tree built so far,
        depending on whether its position is smaller than the nucleus'.

        Every intermediate result is wrapped in an 'N' node; only the last
        satellite yields a bare relation node, which is returned.
        If there are no satellites, the nucleus tree is returned unchanged.
        """
        current_tree, nuc_pos = nuc_tuple
        satellites = inner_sat_tuples + outer_sat_tuples
        final_index = len(satellites) - 1

        for index, (sat_tree, sat_pos) in enumerate(satellites):
            relname = self.get_relname(sat_tree.root_id)
            ordered_trees = ([sat_tree, current_tree] if sat_pos < nuc_pos
                             else [current_tree, sat_tree])

            if index == final_index:
                # outermost relation: it becomes the root of the result
                label, children = relname, ordered_trees
            else:
                # more satellites follow: this relation acts as their nucleus
                label, children = 'N', [(relname, ordered_trees)]

            current_tree = t(label,
                             children,
                             debug=self.debug,
                             root_id=current_tree.root_id)
        return current_tree
Пример #3
0
def make_span(parented_tree):
    """Describe which leaves a subtree covers, as a 'span' or 'leaf' tree.

    Used for dis/lisp/RST-DT-formatted trees. Leaf numbers are 1-based
    indices into the result of ``all_leaf_positions``.

    Examples:
           span     (a subtree that covers the leaves 1 to 7)
         ___|____
        1        7

        leaf        (a subtree that only covers leaf 7)
         |
         7

    Raises:
        NotImplementedError: if a non-root subtree has no leaves.
    """
    all_leaves = all_leaf_positions(parented_tree)
    if is_root(parented_tree):
        # the root always spans everything, from the first to the last leaf
        return t('span', ['1', str(len(all_leaves))])

    subtree_leaves = subtree_leaf_positions(parented_tree)
    leaf_count = len(subtree_leaves)
    if leaf_count < 1:
        raise NotImplementedError('Subtree has no leaves')

    def edu_number(leaf_position):
        """Return the 1-based number of the given leaf as a string."""
        return str(all_leaves.index(leaf_position) + 1)

    first_edu = edu_number(subtree_leaves[0])
    if leaf_count == 1:
        return t('leaf', [first_edu])
    return t('span', [first_edu, edu_number(subtree_leaves[-1])])
Пример #4
0
def test_rs3filewriter_nucsat():
    """A DGParentedTree with one nuc-sat relation is correctly converted into an RS3 file and back.

    Both linear orders are covered: satellite before nucleus and
    nucleus before satellite.
    """
    test_cases = [
        # satellite ('foo') precedes its nucleus ('bar')
        (t("circumstance", [
            ("S", ["foo"]),
            ("N", ["bar"])]),
         "foo-bar-circ-foo-to-bar.rs3"),
        # nucleus ('foo') precedes its satellite ('bar')
        (t("circumstance", [
            ("N", ["foo"]),
            ("S", ["bar"])]),
         "foo-bar-circ-bar-to-foo.rs3"),
    ]

    for input_tree, example_filename in test_cases:
        expected_output_tree = example2tree(example_filename)

        # The context manager closes the temporary file deterministically,
        # even if the writer, the parser or an assertion fails.
        with NamedTemporaryFile() as temp_file:
            RS3FileWriter(input_tree, output_filepath=temp_file.name)
            produced_output_tree = RSTTree(temp_file.name)

            assert produced_output_tree.edu_strings == produced_output_tree.tree.leaves() == ['foo', 'bar']
            assert input_tree == expected_output_tree.tree == produced_output_tree.tree
Пример #5
0
    def root2tree(self, start_node=None):
        """Build one tree from the root node(s) registered below ``start_node``.

        The children of ``start_node`` are looked up in ``self.child_dict``:

        - exactly one root: the tree built by ``self.dt`` for it is returned
        - several roots: each root's tree is passed through ``n_wrap``, the
          results are ordered with ``self.sort_subtrees`` and attached to a
          node labelled ``VIRTUAL_ROOT``
        - no root: the empty tree ``t('')`` is returned
        """
        root_ids = self.child_dict[start_node]
        root_count = len(root_ids)

        if root_count == 0:
            return t('')
        if root_count == 1:
            return self.dt(start_node=root_ids[0])

        # An undesired, but common case (at least in the PCC corpus).
        # This happens if there's one EDU not connected to the rest of the
        # tree (e.g. a headline). All 'root' nodes simply become part of a
        # multinuc relation called VIRTUAL_ROOT.
        message = "File '{}' has {} roots!".format(
            os.path.basename(self.filepath), root_count)
        logging.log(logging.INFO, message)

        wrapped_roots = []
        for root_id in root_ids:
            subtree = self.dt(start_node=root_id)
            wrapped_roots.append(
                n_wrap(subtree, debug=self.debug, root_id=root_id))
        ordered_roots = self.sort_subtrees(*wrapped_roots)

        # The virtual root takes over the root_id of the highest subtree
        # (on equal height, the tuple comparison falls back to the root_id).
        _, virtual_root_id = max(
            (subtree.height(), subtree.root_id) for subtree in ordered_roots)

        return t(VIRTUAL_ROOT, ordered_roots, debug=self.debug,
                 root_id=virtual_root_id)
Пример #6
0
def gen_numbered_nucsat(first_element, number):
    """Generate a numbered nucleus-satellite tree for use in tests.

    ``first_element`` ('N' or 'S') decides which element comes first.
    The relation is named 'nuc-sat-<number>' when the nucleus comes first
    and 'sat-nuc-<number>' otherwise; the satellite's only leaf is
    'sat-<number>', the nucleus' only leaf is always 'nuc'.
    """
    assert first_element in ('N', 'S')

    nucleus = ('N', ['nuc'])
    satellite = ('S', ['sat-{}'.format(number)])

    nucleus_first = first_element == 'N'
    if nucleus_first:
        relname, children = 'nuc-sat-{}'.format(number), [nucleus, satellite]
    else:
        relname, children = 'sat-nuc-{}'.format(number), [satellite, nucleus]
    return t(relname, children)
Пример #7
0
 def sorted_nucsat_tree(self, nuc_tree, sat_tree):
     """Combine a nucleus and a satellite tree under their relation node.

     The two subtrees are ordered with ``self.sort_subtrees``. The relation
     name is looked up via the satellite's ``root_id``, while the new node
     takes over the ``root_id`` of the nucleus.
     """
     ordered_children = self.sort_subtrees(nuc_tree, sat_tree)
     relation_label = self.get_relname(sat_tree.root_id)
     node_options = {'debug': self.debug, 'root_id': nuc_tree.root_id}
     return t(relation_label, ordered_children, **node_options)
Пример #8
0
def test_nucsat():
    """A single nucleus-satellite relation is converted into rst.sty format.

    Checked for both linear orders: satellite first and nucleus first.
    """
    # satellite precedes nucleus
    sat_before_nuc = t('circumstance', [
        ('S', ['sat first']),
        ('N', ['nuc second']),
    ])
    expected_sat_first = u'\\dirrel\n\t{circumstance}{\\rstsegment{sat first}}\n\t{}{\\rstsegment{nuc second}}'
    sat_first_result = dg.write_rstlatex(sat_before_nuc)
    assert sat_first_result.rstlatextree == expected_sat_first

    # nucleus precedes satellite
    nuc_before_sat = t('circumstance', [
        ('N', ['nuc first']),
        ('S', ['sat second']),
    ])
    expected_nuc_first = u'\\dirrel\n\t{}{\\rstsegment{nuc first}}\n\t{circumstance}{\\rstsegment{sat second}}'
    nuc_first_result = dg.write_rstlatex(nuc_before_sat)
    assert nuc_first_result.rstlatextree == expected_nuc_first
Пример #9
0
def convert(parented_tree):
    """Recursively convert ``parented_tree`` into a tree built with ``t``.

    - root: a 'Root' node holding the span description followed by the
      converted nucleus/satellite subtrees
    - leaf (that is not the root): the result of ``make_edu``
    - anything else: a node labelled ``convert_label(<original label>)``
      holding the span description, the rel2par description and the
      converted nucleus/satellite subtrees

    All children are passed through ``orphanize`` before they are attached.
    """
    at_root = is_root(parented_tree)
    if not at_root and is_leaf(parented_tree):
        return make_edu(parented_tree)

    children = [make_span(parented_tree)]
    if not at_root:
        # only non-root nodes stand in a relation to a parent
        children.append(make_rel2par(parented_tree))
    children.extend(
        convert(subtree) for subtree in get_nucsat_subtrees(parented_tree))

    label = 'Root' if at_root else convert_label(parented_tree.label())
    return t(label, [orphanize(child) for child in children])
Пример #10
0
def test_rs3filewriter_emptytree():
    """An empty DGParentedTree is converted into an empty RS3 file and back."""
    input_tree = t("", [])
    expected_output_tree = example2tree("empty.rs3")

    # The context manager closes the temporary file deterministically,
    # even if the writer, the parser or an assertion fails.
    with NamedTemporaryFile() as temp_file:
        RS3FileWriter(input_tree, output_filepath=temp_file.name)
        produced_output_tree = RSTTree(temp_file.name)

        assert produced_output_tree.edu_strings == produced_output_tree.tree.leaves() == []
        assert input_tree == expected_output_tree.tree == produced_output_tree.tree
Пример #11
0
def test_rs3filewriter_onesegmenttree():
    """A DGParentedTree with only one segment is correctly converted into an RS3 file and back."""
    input_tree = t("N", ["foo"])
    expected_output_tree = example2tree('only-one-segment.rs3')

    # The context manager closes the temporary file deterministically,
    # even if the writer, the parser or an assertion fails.
    with NamedTemporaryFile() as temp_file:
        RS3FileWriter(input_tree, output_filepath=temp_file.name)
        produced_output_tree = RSTTree(temp_file.name)

        assert produced_output_tree.edu_strings == produced_output_tree.tree.leaves() == ['foo']
        assert input_tree == expected_output_tree.tree == produced_output_tree.tree
Пример #12
0
def test_multinuc():
    """A multinuclear relation is converted into rst.sty format.

    Checked for a relation with two and with three nuclei.
    """
    # two nuclei
    contrast = t('contrast', [
        ('N', ['nuc-1']),
        ('N', ['nuc-2']),
    ])
    expected_contrast = u'\\multirel{contrast}\n\t{\\rstsegment{nuc-1}}\n\t{\\rstsegment{nuc-2}}'
    contrast_result = dg.write_rstlatex(contrast)
    assert contrast_result.rstlatextree == expected_contrast

    # three nuclei
    joint = t('joint', [
        ('N', ['nuc-1']),
        ('N', ['nuc-2']),
        ('N', ['nuc-3']),
    ])
    expected_joint = u'\\multirel{joint}\n\t{\\rstsegment{nuc-1}}\n\t{\\rstsegment{nuc-2}}\n\t{\\rstsegment{nuc-3}}'
    joint_result = dg.write_rstlatex(joint)
    assert joint_result.rstlatextree == expected_joint
Пример #13
0
    def convert_schema(self, nuc_tuple, inner_sat_tuples, outer_sat_tuples):
        """Fold a nucleus and its satellites into one nested relation tree.

        All subtrees are given as ``(tree, linear tree position)`` tuples.
        The satellites are attached one after another (inner ones first);
        each satellite is placed before or after the tree built so far,
        depending on whether its position is smaller than the nucleus'.

        Every intermediate result is wrapped in an 'N' node; only the last
        satellite yields a bare relation node, which is returned.
        If there are no satellites, the nucleus tree is returned unchanged.
        """
        current_tree, nuc_pos = nuc_tuple
        satellites = inner_sat_tuples + outer_sat_tuples
        final_index = len(satellites) - 1

        for index, (sat_tree, sat_pos) in enumerate(satellites):
            relname = self.get_relname(sat_tree.root_id)
            ordered_trees = ([sat_tree, current_tree] if sat_pos < nuc_pos
                             else [current_tree, sat_tree])

            if index == final_index:
                # outermost relation: it becomes the root of the result
                label, children = relname, ordered_trees
            else:
                # more satellites follow: this relation acts as their nucleus
                label, children = 'N', [(relname, ordered_trees)]

            current_tree = t(label, children, debug=self.debug,
                             root_id=current_tree.root_id)
        return current_tree
Пример #14
0
def make_rel2par(nuc_or_sat_subtree):
    """Create the 'rel2par' subtree for a nucleus ('N') or satellite ('S').

    - satellite: rel2par holds the label of the parent node
    - nucleus whose only sibling is a satellite: rel2par is 'span'
    - nucleus whose siblings are all nuclei: rel2par holds the label of
      the parent node

    Raises:
        ValueError: if the subtree is the root, if its label is neither
            'N' nor 'S', or if a nucleus has any other mix of siblings.
    """
    if is_root(nuc_or_sat_subtree):
        raise ValueError("Root node can't have a relation.")

    nuclearity = nuc_or_sat_subtree.label()
    parent_label = nuc_or_sat_subtree.parent().label()

    if nuclearity == 'S':
        return t('rel2par', [parent_label])
    if nuclearity != 'N':
        raise ValueError(
            "Unknown nuclearity. Expected 'N' or 'S', got: {}".format(nuclearity))

    # a nucleus: its relation depends on what its siblings are
    siblings = get_siblings(nuc_or_sat_subtree)
    tree_root = nuc_or_sat_subtree.root()
    sibling_labels = [tree_root[sibling].label() for sibling in siblings]

    if len(siblings) == 1 and sibling_labels[0] == 'S':
        return t('rel2par', ['span'])
    if all(label == 'N' for label in sibling_labels):
        return t('rel2par', [parent_label])
    raise ValueError(
        "Can't mix sibling types. Expected 'N' or 'S', got: {}".format(sibling_labels))
Пример #15
0
def dis2tree(dis_tree, wrap_tree=False):
    """Recursively convert ``dis_tree`` into a relation tree built with ``t``.

    Leaf nodes are handed to ``leaf2tree``. For every other node, the
    children (everything after the first element of a root node, or after
    the second element of any other node) are grouped with
    ``get_child_types``:

    - only nuclei: a multinuc relation; its name is read from the first child
    - one nucleus and one satellite: a nucleus-satellite relation; its name
      is read from the satellite

    The result is passed through ``get_wrapped_tree`` together with
    ``wrap_tree`` (always True for the recursive calls).
    """
    tree_type = get_tree_type(dis_tree)
    assert tree_type in SUBTREE_TYPES, "tree_type: {}".format(tree_type)

    if get_node_type(dis_tree) == 'leaf':
        return leaf2tree(dis_tree)

    children = dis_tree[1:] if is_root(dis_tree) else dis_tree[2:]
    child_types = get_child_types(children)

    if len(child_types) == 1:
        # multinuc relation: all children are nuclei
        assert NUC in child_types, "child_types: {}".format(child_types)
        nuc_child_ids = child_types[NUC]
        assert len(nuc_child_ids) > 1, "len: {}".format(len(nuc_child_ids))

        subtrees = []
        for child_id in nuc_child_ids:
            subtrees.append(dis2tree(children[child_id], wrap_tree=True))

        # all subtrees of a multinuc have the same relation,
        # so it is simply read from the first one
        reltype = get_relation_type(children[0])

    else:
        # nucleus-satellite relation: exactly one child of each kind
        assert len(child_types) == 2, "child_types: {}".format(child_types)
        assert NUC in child_types and SAT in child_types, \
            "child_types: {}".format(child_types)
        assert len(child_types[NUC]) == 1 and len(child_types[SAT]) == 1, \
            "child_types: {}".format(child_types)

        nuc_child_id = child_types[NUC][0]
        nuc_subtree = dis2tree(children[nuc_child_id], wrap_tree=True)

        sat_child_id = child_types[SAT][0]
        sat_child = children[sat_child_id]
        sat_subtree = dis2tree(sat_child, wrap_tree=True)

        # keep the subtrees in the order of their child positions
        subtrees = ([nuc_subtree, sat_subtree] if nuc_child_id < sat_child_id
                    else [sat_subtree, nuc_subtree])

        # the relation type is only stored in the satellite
        reltype = get_relation_type(sat_child)

    return get_wrapped_tree(dis_tree, t(reltype, subtrees), wrap_tree=wrap_tree)
Пример #16
0
def test_t():
    """``t`` builds the same trees as calling ``DGParentedTree`` directly."""
    # omitting the children is the same as passing an empty list
    for label in ("", "foo"):
        assert t(label, []) == DGParentedTree(label, [])
        assert t(label) == DGParentedTree(label, [])

    # string children are passed on unchanged
    one_child = t("foo", ["bar"])
    assert one_child == DGParentedTree("foo", ["bar"])
    two_children = t("foo", ["bar", "baz"])
    assert two_children == DGParentedTree("foo", ["bar", "baz"])
Пример #17
0
def test_writetofile():
    """A single nucleus-satellite relation is converted into rst.sty format
    and written to a file.
    """
    sat_before_nuc = t('circumstance', [
        ('S', ['sat first']),
        ('N', ['nuc second']),
    ])

    # The context manager closes the temporary file deterministically,
    # even if writing, reading or the assertion fails.
    with NamedTemporaryFile() as temp_file:
        dg.write_rstlatex(sat_before_nuc, temp_file.name)

        with open(temp_file.name, 'r') as rstlatex_file:
            assert rstlatex_file.read() == u'\\dirrel\n\t{circumstance}{\\rstsegment{sat first}}\n\t{}{\\rstsegment{nuc second}}\n'
Пример #18
0
def test_rs3filewriter_onesegmenttree_umlauts():
    """A DGParentedTree with only one segment with umlauts is correctly
    converted into an RS3 file and back.
    """
    edu_string = u"Über sein östliches Äußeres"
    input_tree = t("N", [edu_string])
    expected_output_tree = example2tree('only-one-segment-with-umlauts.rs3')

    # The context manager closes the temporary file deterministically,
    # even if the writer, the parser or an assertion fails.
    with NamedTemporaryFile() as temp_file:
        RS3FileWriter(input_tree, output_filepath=temp_file.name)
        produced_output_tree = RSTTree(temp_file.name)

        assert expected_output_tree.edu_strings == \
            produced_output_tree.edu_strings == \
            produced_output_tree.tree.leaves() == [edu_string]
        assert input_tree == expected_output_tree.tree == produced_output_tree.tree
Пример #19
0
def test_rs3filewriter_nested():
    """A DGParentedTree with a multinuc relation nested in a nuc-sat relation
    is correctly converted into an RS3 file and back."""
    input_tree = t('elaboration', [
        ('N', ['eins']),
        ('S', [
            ('joint', [
                ('N', ['zwei']),
                ('N', ['drei'])])])])
    expected_output_tree = example2tree('eins-zwei-drei-(elab-eins-from-(joint-zwei-and-drei).rs3')

    # The context manager closes the temporary file deterministically,
    # even if the writer, the parser or an assertion fails.
    with NamedTemporaryFile() as temp_file:
        RS3FileWriter(input_tree, output_filepath=temp_file.name)
        produced_output_tree = RSTTree(temp_file.name)

        assert produced_output_tree.edu_strings == produced_output_tree.tree.leaves() == ['eins', 'zwei', 'drei']
        assert input_tree == expected_output_tree.tree == produced_output_tree.tree
Пример #20
0
    def segment2tree(self, elem_id, elem, elem_type, start_node=None):
        """Convert the segment ``elem_id`` into a tree.

        The segment itself becomes an 'S' node if its reltype is 'rst' and an
        'N' node otherwise, with the segment's text as its only leaf.

        - segment not in ``self.child_dict``: the bare segment tree is returned
        - one child: the segment is the nucleus of a nucleus-satellite relation
        - two or more children: the segment is the nucleus of a schema; all
          children must be satellites

        If the segment is registered in ``self.child_dict`` with an empty
        child list, nothing is returned (i.e. None).
        ``elem_type`` and ``start_node`` are not used here.
        """
        # a reltype of 'rst' marks this elem as the S in an N-S relation
        root_label = 'S' if elem['reltype'] == 'rst' else 'N'
        tree = t(root_label, [elem['text']], debug=self.debug, root_id=elem_id)

        if elem_id not in self.child_dict:
            # this might be a root segment without any children
            # (e.g. a headline in PCC) or the only segment in a span
            # (which makes no sense in RST)
            lacks_relation = elem.get('reltype') in ('span', '', None)
            if lacks_relation and elem['nuclearity'] != 'root':
                logging.log(
                    logging.INFO,
                    "Segment '{}' in file '{}' is a non-root nucleus without children"
                    .format(elem_id, os.path.basename(self.filepath)))

                if elem.get('relname') == 'span':
                    # take over the relation name of the parent, if there is one
                    parent_elem = self.elem_dict.get(elem.get('parent'))
                    if parent_elem:
                        elem['relname'] = parent_elem.get('relname')

            return tree

        child_ids = self.child_dict[elem_id]
        child_count = len(child_ids)

        if child_count == 1:
            # this segment is (also) the N in an N-S relation
            sat_subtree = self.dt(start_node=child_ids[0])
            return self.sorted_nucsat_tree(tree, sat_subtree)

        elif child_count >= 2:
            # this segment is (also) the N in an RST schema,
            # as such it must only have satellites as children
            assert all(
                self.elem_dict[child_id]['nuclearity'] == 'satellite'
                for child_id in child_ids)

            sat_subtrees = []
            for child_id in child_ids:
                sat_subtrees.append(self.dt(start_node=child_id))
            return self.order_schema(tree, sat_subtrees)
Пример #21
0
def dis2tree(dis_tree, wrap_tree=False):
    """Recursively convert ``dis_tree`` into a relation tree built with ``t``.

    Leaf nodes are handed to ``leaf2tree``. For every other node, the
    children (everything after the first element of a root node, or after
    the second element of any other node) are grouped with
    ``get_child_types``:

    - only nuclei: a multinuc relation; its name is read from the first child
    - one nucleus and one satellite: a nucleus-satellite relation; its name
      is read from the satellite

    The result is passed through ``get_wrapped_tree`` together with
    ``wrap_tree`` (always True for the recursive calls).
    """
    tree_type = get_tree_type(dis_tree)
    assert tree_type in SUBTREE_TYPES, "tree_type: {}".format(tree_type)

    if get_node_type(dis_tree) == 'leaf':
        return leaf2tree(dis_tree)

    children = dis_tree[1:] if is_root(dis_tree) else dis_tree[2:]
    child_types = get_child_types(children)

    if len(child_types) == 1:
        # multinuc relation: all children are nuclei
        assert NUC in child_types, "child_types: {}".format(child_types)
        nuc_child_ids = child_types[NUC]
        assert len(nuc_child_ids) > 1, "len: {}".format(len(nuc_child_ids))

        subtrees = []
        for child_id in nuc_child_ids:
            subtrees.append(dis2tree(children[child_id], wrap_tree=True))

        # all subtrees of a multinuc have the same relation,
        # so it is simply read from the first one
        reltype = get_relation_type(children[0])

    else:
        # nucleus-satellite relation: exactly one child of each kind
        assert len(child_types) == 2, "child_types: {}".format(child_types)
        assert NUC in child_types and SAT in child_types, \
            "child_types: {}".format(child_types)
        assert len(child_types[NUC]) == 1 and len(child_types[SAT]) == 1, \
            "child_types: {}".format(child_types)

        nuc_child_id = child_types[NUC][0]
        nuc_subtree = dis2tree(children[nuc_child_id], wrap_tree=True)

        sat_child_id = child_types[SAT][0]
        sat_child = children[sat_child_id]
        sat_subtree = dis2tree(sat_child, wrap_tree=True)

        # keep the subtrees in the order of their child positions
        subtrees = ([nuc_subtree, sat_subtree] if nuc_child_id < sat_child_id
                    else [sat_subtree, nuc_subtree])

        # the relation type is only stored in the satellite
        reltype = get_relation_type(sat_child)

    return get_wrapped_tree(dis_tree, t(reltype, subtrees), wrap_tree=wrap_tree)
Пример #22
0
    def segment2tree(self, elem_id, elem, elem_type, start_node=None):
        """Convert the segment ``elem_id`` into a tree.

        The segment itself becomes an 'S' node if its reltype is 'rst' and an
        'N' node otherwise, with the segment's text as its only leaf.

        - segment not in ``self.child_dict``: the bare segment tree is returned
        - one child: the segment is the nucleus of a nucleus-satellite relation
        - two or more children: the segment is the nucleus of a schema; all
          children must be satellites

        If the segment is registered in ``self.child_dict`` with an empty
        child list, nothing is returned (i.e. None).
        ``elem_type`` and ``start_node`` are not used here.
        """
        # a reltype of 'rst' marks this elem as the S in an N-S relation
        root_label = 'S' if elem['reltype'] == 'rst' else 'N'
        tree = t(root_label, [elem['text']], debug=self.debug, root_id=elem_id)

        if elem_id not in self.child_dict:
            # this might be a root segment without any children
            # (e.g. a headline in PCC) or the only segment in a span
            # (which makes no sense in RST)
            lacks_relation = elem.get('reltype') in ('span', '', None)
            if lacks_relation and elem['nuclearity'] != 'root':
                logging.log(
                    logging.INFO,
                    "Segment '{}' in file '{}' is a non-root nucleus without children".format(
                        elem_id, os.path.basename(self.filepath)))

                if elem.get('relname') == 'span':
                    # take over the relation name of the parent, if there is one
                    parent_elem = self.elem_dict.get(elem.get('parent'))
                    if parent_elem:
                        elem['relname'] = parent_elem.get('relname')

            return tree

        child_ids = self.child_dict[elem_id]
        child_count = len(child_ids)

        if child_count == 1:
            # this segment is (also) the N in an N-S relation
            sat_subtree = self.dt(start_node=child_ids[0])
            return self.sorted_nucsat_tree(tree, sat_subtree)

        elif child_count >= 2:
            # this segment is (also) the N in an RST schema,
            # as such it must only have satellites as children
            assert all(
                self.elem_dict[child_id]['nuclearity'] == 'satellite'
                for child_id in child_ids)

            sat_subtrees = []
            for child_id in child_ids:
                sat_subtrees.append(self.dt(start_node=child_id))
            return self.order_schema(tree, sat_subtrees)
Пример #23
0
def s_wrap(tree, debug=False, root_id=None):
    """Ensure the given tree has a satellite as its root.

    - root is already a satellite: the tree is returned as is
    - root is a nucleus: its label is replaced (in place) with the
      satellite label and the tree is returned
    - anything else (e.g. a relation): a new 'S' node with the given
      ``root_id`` is placed on top of the tree

    The labels to compare against are generated by ``debug_root_label``
    from ``debug`` and the tree's own ``root_id``.
    """
    current_label = tree.label()

    nucleus_label = debug_root_label('N', debug, tree.root_id)
    satellite_label = debug_root_label('S', debug, tree.root_id)

    if current_label == satellite_label:
        return tree

    if current_label == nucleus_label:
        # relabel the existing root instead of adding another layer
        tree.set_label(satellite_label)
        return tree

    return t('S', [tree], debug=debug, root_id=root_id)
Пример #24
0
def s_wrap(tree, debug=False, root_id=None):
    """Ensure the given tree has a satellite as its root.

    - root is already a satellite: the tree is returned as is
    - root is a nucleus: its label is replaced (in place) with the
      satellite label and the tree is returned
    - anything else (e.g. a relation): a new 'S' node with the given
      ``root_id`` is placed on top of the tree

    The labels to compare against are generated by ``debug_root_label``
    from ``debug`` and the tree's own ``root_id``.
    """
    current_label = tree.label()

    nucleus_label = debug_root_label('N', debug, tree.root_id)
    satellite_label = debug_root_label('S', debug, tree.root_id)

    if current_label == satellite_label:
        return tree

    if current_label == nucleus_label:
        # relabel the existing root instead of adding another layer
        tree.set_label(satellite_label)
        return tree

    return t('S', [tree], debug=debug, root_id=root_id)
Пример #25
0
def test_rs3filewriter_pcc_10575():
    """PCC rs3 file 10575 can be converted rs3 -> dgtree -> rs3' -> dgtree',
    without information loss between dgtree and dgtree'.
    """
    input_tree = t('interpretation', [
        ('N', [
            ('circumstance', [
                ('S', ['eins']),
                ('N', [
                    ('contrast', [
                        ('N', ['zwei']),
                        ('N', [
                            ('cause', [
                                ('N', ['drei']),
                                ('S', ['vier'])])])])])])]),
        ('S', ['fuenf'])])
    expected_output_tree = example2tree('maz-10575-excerpt.rs3')

    # The context manager closes the temporary file deterministically,
    # even if the writer, the parser or an assertion fails.
    with NamedTemporaryFile() as temp_file:
        RS3FileWriter(input_tree, output_filepath=temp_file.name)
        produced_output_tree = RSTTree(temp_file.name)

        assert produced_output_tree.edu_strings == produced_output_tree.tree.leaves() == ['eins', 'zwei', 'drei', 'vier', 'fuenf']
        assert input_tree == expected_output_tree.tree == produced_output_tree.tree
Пример #26
0
 def sorted_nucsat_tree(self, nuc_tree, sat_tree):
     """Combine a nucleus and a satellite tree under their relation node.

     The two subtrees are ordered with ``self.sort_subtrees``. The relation
     name is looked up via the satellite's ``root_id``, while the new node
     takes over the ``root_id`` of the nucleus.
     """
     ordered_children = self.sort_subtrees(nuc_tree, sat_tree)
     relation_label = self.get_relname(sat_tree.root_id)
     node_options = {'debug': self.debug, 'root_id': nuc_tree.root_id}
     return t(relation_label, ordered_children, **node_options)
Пример #27
0
def make_edu(edu_string):
    """Turn an EDU string into a 'text' tree with one leaf per token.

    The string is split on whitespace; the first token is prefixed and the
    last token is suffixed with the delimiter '_!'.

    Raises:
        IndexError: if ``edu_string`` contains no tokens.
    """
    delimiter = u'_!'
    words = edu_string.split()
    # for a single-token EDU, both delimiters end up on the same token
    words[0] = delimiter + words[0]
    words[-1] += delimiter
    return t('text', words)
Пример #28
0
    def group2tree(self, elem_id, elem, elem_type, start_node=None):
        """Convert the group element ``elem_id`` into a tree.

        The conversion is driven by ``elem``'s ``'reltype'`` and, for groups
        whose reltype is neither 'rst' nor 'multinuc', by its ``'group_type'``
        and the number of children registered in ``self.child_dict``.

        Args:
            elem_id: ID of the group; used as key into ``self.child_dict`` and
                as ``root_id`` of most nodes created here.
            elem: dict-like description of the group; the keys read here are
                ``'reltype'`` and ``'group_type'``.
            elem_type: only used in the assertion message for an unexpected
                reltype.
            start_node: not used in this method.

        Returns:
            The tree built for this group (via ``t``, ``s_wrap``/``n_wrap``,
            ``self.dt``, ``self.order_schema`` or ``self.sorted_nucsat_tree``,
            depending on the branch taken).

        Raises:
            TooFewChildrenError: if a group that is neither part of an
                'rst'/'multinuc' relation nor a multinuc group has no children.
            AssertionError: on an unexpected reltype, on non-satellite
                children where only satellites are allowed, or if a group
                with more than two children doesn't have exactly one nucleus.
        """
        reltype = elem.get('reltype')
        # the finished subtree gets a satellite root for 'rst' elements
        # and a nucleus root otherwise
        root_wrap = s_wrap if reltype == 'rst' else n_wrap

        # rst: this elem is the S in an N-S relation
        # multinuc: this elem is one of several Ns in a multinuc relation
        if reltype in ('rst', 'multinuc'):
            if len(self.child_dict[elem_id]) == 1:
                # this group is the root of another N-S relation
                subtree_id = self.child_dict[elem_id][0]
                subtree = self.dt(start_node=subtree_id)

            else:
                # several children: wrap each child's tree, sort them and
                # put them under a relation named after the first child
                subtrees = [
                    self.elem_wrap(self.dt(start_node=c),
                                   debug=self.debug,
                                   root_id=c) for c in self.child_dict[elem_id]
                ]
                sorted_subtrees = self.sort_subtrees(*subtrees)
                # NOTE(review): this raises an IndexError if the group has
                # no children at all -- confirm that this can't happen here
                first_child_id = self.child_dict[elem_id][0]
                subtrees_relname = self.get_relname(first_child_id)
                subtree = t(subtrees_relname,
                            sorted_subtrees,
                            debug=self.debug,
                            root_id=elem_id)
            return root_wrap(subtree, debug=self.debug, root_id=elem_id)

        else:
            assert reltype in ('', None, 'span'), \
                "Unexpected combination: elem_type '%s' and reltype '%s'" \
                    % (elem_type, elem['reltype'])

            # this elem is the N in an N-S relation
            if elem['group_type'] == 'multinuc':
                # this elem is also the 'root node' of a multinuc relation
                child_ids = self.child_dict[elem_id]
                # only the children with a 'multinuc' reltype are part of the
                # multinuc relation itself
                multinuc_child_ids = [
                    c for c in child_ids
                    if self.elem_dict[c]['reltype'] == 'multinuc'
                ]
                multinuc_relname = self.get_relname(multinuc_child_ids[0])

                multinuc_elements = [
                    self.dt(start_node=mc) for mc in multinuc_child_ids
                ]
                sorted_subtrees = self.sort_subtrees(*multinuc_elements)

                # NOTE(review): sorted_subtrees is wrapped in another list
                # here, whereas the 'rst'/'multinuc' branch above passes it
                # to t() directly -- confirm that t() handles both forms
                multinuc_subtree = t(multinuc_relname, [sorted_subtrees],
                                     debug=self.debug,
                                     root_id=elem_id)

                # all remaining children are expected to be satellites
                # (checked by the assertion below)
                other_child_ids = [
                    c for c in child_ids if c not in multinuc_child_ids
                ]

                if other_child_ids:
                    # this element is the N in an S-N-S schema
                    # NOTE(review): a tree (not a list of children) is passed
                    # to t() -- presumably t() accepts that; confirm
                    nuc_tree = t('N',
                                 multinuc_subtree,
                                 debug=self.debug,
                                 root_id=elem_id)

                    assert all([
                        self.elem_dict[child_id]['nuclearity'] == 'satellite'
                        for child_id in other_child_ids
                    ])

                    sat_subtrees = [
                        self.dt(start_node=child_id)
                        for child_id in other_child_ids
                    ]
                    return self.order_schema(nuc_tree, sat_subtrees)

                else:
                    # this elem is only the head of a multinuc relation
                    # TODO: does this make sense / is this ever reached?
                    return multinuc_subtree

            else:
                #~ assert elem['group_type'] == 'span', \
                #~ "Unexpected group_type '%s'" % elem['group_type']
                if len(self.child_dict[elem_id]) == 1:
                    # this span at the top of a tree was only added for visual purposes
                    child_id = self.child_dict[elem_id][0]
                    return self.dt(start_node=child_id)

                elif len(self.child_dict[elem_id]) == 2:
                    # this elem is the N of an N-S relation (child: S), but is also
                    # a span over another relation (child: N)
                    # map nuclearity -> child ID; if both children had the
                    # same nuclearity, the later would overwrite the earlier
                    # and one of the lookups below would raise a KeyError
                    children = {}
                    for child_id in self.child_dict[elem_id]:
                        children[self.elem_dict[child_id]
                                 ['nuclearity']] = child_id

                    sat_id = children['satellite']
                    sat_subtree = self.dt(start_node=sat_id)

                    nuc_subtree = self.dt(start_node=children['nucleus'])
                    nuc_tree = n_wrap(nuc_subtree,
                                      debug=self.debug,
                                      root_id=elem_id)

                    return self.sorted_nucsat_tree(nuc_tree, sat_subtree)

                elif len(self.child_dict[elem_id]) > 2:
                    # a schema: one nucleus with several satellites;
                    # group the child IDs by their nuclearity
                    children = defaultdict(list)
                    for child_id in self.child_dict[elem_id]:
                        children[self.elem_dict[child_id]
                                 ['nuclearity']].append(child_id)

                    assert len(children['nucleus']) == 1

                    nuc_subtree = self.dt(start_node=children['nucleus'][0])
                    # NOTE(review): a tree (not a list of children) is passed
                    # to t(), while the two-children branch above uses
                    # n_wrap() instead -- confirm that this is intended
                    nuc_tree = t('N',
                                 nuc_subtree,
                                 debug=self.debug,
                                 root_id=elem_id)

                    sat_subtrees = [
                        self.dt(start_node=sat_child_id)
                        for sat_child_id in children['satellite']
                    ]

                    return self.order_schema(nuc_tree, sat_subtrees)

                else:  #len(child_dict[elem_id]) == 0
                    raise TooFewChildrenError(
                        "A span group ('%s)' should have at least 1 child: %s" \
                            % (elem_id, self.child_dict[elem_id]))
Пример #29
0
def test_multisat():
    """Check that relations sharing one nucleus are rendered in rst.sty format.

    Every case lists the arguments handed to ``gen_numbered_nucsat`` for
    each child of the multi-satellite tree, together with the LaTeX string
    that ``dg.write_rstlatex`` must produce for that tree.  The comment
    above each case gives the expected linear order of satellites (S) and
    the shared nucleus (N).
    """
    cases = [
        # S-N-S
        ([('S', 1), ('N', 1)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'),
        # S-S-N
        ([('S', 1), ('S', 2)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{sat-nuc-2}{\\rstsegment{sat-2}}'
         u'\n\t{}{\\rstsegment{nuc}}'),
        # N-S-S
        ([('N', 1), ('N', 2)],
         u'\\dirrel'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'
         u'\n\t{nuc-sat-2}{\\rstsegment{sat-2}}'),
        # S-N-S-S
        ([('S', 1), ('N', 1), ('N', 2)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'
         u'\n\t{nuc-sat-2}{\\rstsegment{sat-2}}'),
        # S-S-N-S
        ([('S', 1), ('S', 2), ('N', 1)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{sat-nuc-2}{\\rstsegment{sat-2}}'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'),
        # S-S-S-N-S
        ([('S', 1), ('S', 2), ('S', 3), ('N', 1)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{sat-nuc-2}{\\rstsegment{sat-2}}'
         u'\n\t{sat-nuc-3}{\\rstsegment{sat-3}}'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'),
        # S-N-S-S-S
        ([('S', 1), ('N', 1), ('N', 2), ('N', 3)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'
         u'\n\t{nuc-sat-2}{\\rstsegment{sat-2}}'
         u'\n\t{nuc-sat-3}{\\rstsegment{sat-3}}'),
        # S-S-S-N-S-S-S
        ([('S', 1), ('S', 2), ('S', 3), ('N', 1), ('N', 2), ('N', 3)],
         u'\\dirrel'
         u'\n\t{sat-nuc-1}{\\rstsegment{sat-1}}'
         u'\n\t{sat-nuc-2}{\\rstsegment{sat-2}}'
         u'\n\t{sat-nuc-3}{\\rstsegment{sat-3}}'
         u'\n\t{}{\\rstsegment{nuc}}'
         u'\n\t{nuc-sat-1}{\\rstsegment{sat-1}}'
         u'\n\t{nuc-sat-2}{\\rstsegment{sat-2}}'
         u'\n\t{nuc-sat-3}{\\rstsegment{sat-3}}'),
    ]

    # Trees are built one case at a time (not up front), so each case is
    # constructed, converted and checked before the next one is touched.
    for nucsat_args, expected_latex in cases:
        multisat_tree = t(MULTISAT_RELNAME, [
            ('S', gen_numbered_nucsat(label, number))
            for label, number in nucsat_args
        ])

        result = dg.write_rstlatex(multisat_tree)
        assert result.rstlatextree == expected_latex
Пример #30
0
def n_wrap(tree):
    """Return a new 'N' node (built with ``t``) whose only child is ``tree``."""
    children = [tree]
    return t('N', children)
Пример #31
0
def s_wrap(tree):
    """Return a new 'S' node (built with ``t``) whose only child is ``tree``."""
    children = [tree]
    return t('S', children)
Пример #32
0
def s_wrap(tree):
    """Wrap ``tree`` as the single child of a freshly built 'S' node."""
    wrapped_children = [tree]
    return t('S', wrapped_children)
Пример #33
0
def n_wrap(tree):
    """Wrap ``tree`` as the single child of a freshly built 'N' node."""
    wrapped_children = [tree]
    return t('N', wrapped_children)
Пример #34
0
    def group2tree(self, elem_id, elem, elem_type, start_node=None):
        """Convert the group element ``elem_id`` into a tree.

        The conversion depends on the group's ``reltype``:

        - ``'rst'`` / ``'multinuc'``: the group's children are converted
          with ``self.dt``.  Several children are each wrapped with
          ``self.elem_wrap``, sorted, and combined under a node labelled
          with the relation name of the first child.  The result is
          wrapped with ``s_wrap`` (``'rst'``) or ``n_wrap``
          (``'multinuc'``).
        - ``''``, ``None`` or ``'span'``: the group is converted according
          to its ``group_type`` and its number of children (see the inline
          comments below).

        ``elem`` is the element's attribute mapping; this method reads its
        ``'reltype'`` and ``'group_type'`` entries.  ``elem_type`` is only
        used in an assertion message and ``start_node`` is not used by
        this method.

        Raises TooFewChildrenError if a non-multinuc group with reltype
        ``''``, ``None`` or ``'span'`` has no children.
        """
        reltype = elem.get('reltype')

        # rst: this elem is the S in an N-S relation
        # multinuc: this elem is one of several Ns in a multinuc relation
        if reltype in ('rst', 'multinuc'):
            root_wrap = s_wrap if reltype == 'rst' else n_wrap
            child_ids = self.child_dict[elem_id]

            if len(child_ids) == 1:
                # this group is the root of another N-S relation
                subtree = self.dt(start_node=child_ids[0])
            else:
                subtrees = [
                    self.elem_wrap(self.dt(start_node=child_id),
                                   debug=self.debug, root_id=child_id)
                    for child_id in child_ids
                ]
                sorted_subtrees = self.sort_subtrees(*subtrees)
                # the combined node is named after the first child's relation
                subtrees_relname = self.get_relname(child_ids[0])
                subtree = t(subtrees_relname, sorted_subtrees,
                            debug=self.debug, root_id=elem_id)

            return root_wrap(subtree, debug=self.debug, root_id=elem_id)

        assert reltype in ('', None, 'span'), \
            "Unexpected combination: elem_type '%s' and reltype '%s'" \
                % (elem_type, reltype)

        # this elem is the N in an N-S relation
        if elem['group_type'] == 'multinuc':
            # this elem is also the 'root node' of a multinuc relation
            child_ids = self.child_dict[elem_id]

            # split the children in one pass: the nuclei of the multinuc
            # relation vs. everything else
            multinuc_child_ids = []
            other_child_ids = []
            for child_id in child_ids:
                if self.elem_dict[child_id]['reltype'] == 'multinuc':
                    multinuc_child_ids.append(child_id)
                else:
                    other_child_ids.append(child_id)

            multinuc_relname = self.get_relname(multinuc_child_ids[0])

            multinuc_elements = [self.dt(start_node=multinuc_id)
                                 for multinuc_id in multinuc_child_ids]
            sorted_subtrees = self.sort_subtrees(*multinuc_elements)

            multinuc_subtree = t(
                multinuc_relname, [sorted_subtrees], debug=self.debug,
                root_id=elem_id)

            if not other_child_ids:
                # this elem is only the head of a multinuc relation
                # TODO: does this make sense / is this ever reached?
                return multinuc_subtree

            # this element is the N in an S-N-S schema
            nuc_tree = t('N', multinuc_subtree, debug=self.debug,
                         root_id=elem_id)

            assert all(
                self.elem_dict[child_id]['nuclearity'] == 'satellite'
                for child_id in other_child_ids), \
                "All non-multinuc children of group '%s' must be satellites" \
                    % (elem_id,)

            sat_subtrees = [self.dt(start_node=child_id)
                            for child_id in other_child_ids]
            return self.order_schema(nuc_tree, sat_subtrees)

        #~ assert elem['group_type'] == 'span', \
            #~ "Unexpected group_type '%s'" % elem['group_type']
        child_ids = self.child_dict[elem_id]
        num_children = len(child_ids)

        if num_children == 0:
            raise TooFewChildrenError(
                "A span group ('%s') should have at least 1 child: %s" \
                    % (elem_id, child_ids))

        if num_children == 1:
            # this span at the top of a tree was only added for visual purposes
            return self.dt(start_node=child_ids[0])

        if num_children == 2:
            # this elem is the N of an N-S relation (child: S), but is also
            # a span over another relation (child: N)
            children = {}
            for child_id in child_ids:
                children[self.elem_dict[child_id]['nuclearity']] = child_id

            sat_subtree = self.dt(start_node=children['satellite'])

            nuc_subtree = self.dt(start_node=children['nucleus'])
            nuc_tree = n_wrap(nuc_subtree, debug=self.debug, root_id=elem_id)

            return self.sorted_nucsat_tree(nuc_tree, sat_subtree)

        # more than two children: one nucleus shared by several satellites
        children = defaultdict(list)
        for child_id in child_ids:
            children[self.elem_dict[child_id]['nuclearity']].append(child_id)

        assert len(children['nucleus']) == 1, \
            "Span group '%s' must have exactly one nucleus child" \
                % (elem_id,)

        nuc_subtree = self.dt(start_node=children['nucleus'][0])
        nuc_tree = t('N', nuc_subtree, debug=self.debug, root_id=elem_id)

        sat_subtrees = [self.dt(start_node=sat_child_id)
                        for sat_child_id in children['satellite']]

        return self.order_schema(nuc_tree, sat_subtrees)