def test_depth_type_order(self):
        extra = rules.depth_type_order([markers.ints, markers.lower])
        results = derive_depths(['1', 'a'], [extra])
        self.assertEqual(1, len(results))
        results = derive_depths(['i', 'a'], [extra])
        self.assertEqual(0, len(results))

        extra = rules.depth_type_order([(markers.ints, markers.roman),
                                        markers.lower])
        results = derive_depths(['1', 'a'], [extra])
        self.assertEqual(1, len(results))
        results = derive_depths(['i', 'a'], [extra])
        self.assertEqual(1, len(results))
def add_nodes_to_stack(nodes, inner_stack):
    """Calculate most likely depth assignments to each node; add to the
    provided stack"""
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths([node.label[0] for node in nodes], [
        rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                (mtypes.roman, mtypes.upper), mtypes.upper,
                                mtypes.em_ints, mtypes.em_roman])
    ])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
Пример #3
0
def add_nodes_to_stack(nodes, inner_stack):
    """Calculate most likely depth assignments to each node; add to the
    provided stack"""
    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
 def test_depth_type_order_multiple(self):
     """Constrain depths to be in a list of types."""
     extra = rules.depth_type_order([(markers.ints, markers.roman),
                                     markers.lower])
     self.assert_depth_match_extra(['1', 'a'], [extra], [0, 1])
     self.assert_depth_match_extra(['i', 'a'], [extra], [0, 1])
 def test_depth_type_order_single(self):
     """Constrain depths to have certain types."""
     extra = rules.depth_type_order([markers.ints, markers.lower])
     self.assert_depth_match_extra(['1', 'a'], [extra], [0, 1])
     self.assert_depth_match_extra(['i', 'a'], [extra])
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
Пример #7
0
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search('[0-9]+\.[0-9]+', xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'

        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        pass

    children = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")

            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)

        elif not first_marker and not manual_hierarchy:
            logging.warning(
                "Couldn't determine interp marker. Appending to "
                "previous paragraph: %s", node_text)

            if nodes:
                previous = nodes[-1]
            else:
                previous = parent

            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags

        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]],
                     label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end],
                         label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths([node.label[0] for node in nodes], [
            rules.depth_type_order(
                [(mtypes.ints, mtypes.em_ints),
                 (mtypes.lower, mtypes.roman, mtypes.upper), mtypes.upper,
                 mtypes.em_ints, mtypes.em_roman])
        ])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')

    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
 def test_depth_type_order_multiple(self):
     """Constrain depths to be in a list of types."""
     extra = rules.depth_type_order([(markers.ints, markers.roman),
                                     markers.lower])
     self.assert_depth_match_extra(['1', 'a'], [extra], [0, 1])
     self.assert_depth_match_extra(['i', 'a'], [extra], [0, 1])
 def test_depth_type_order_single(self):
     """Constrain depths to have certain types."""
     extra = rules.depth_type_order([markers.ints, markers.lower])
     self.assert_depth_match_extra(['1', 'a'], [extra], [0, 1])
     self.assert_depth_match_extra(['i', 'a'], [extra])
Пример #11
0
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
Пример #12
0
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy_flag = False
    if reg_part in PARAGRAPH_HIERARCHY and section_no_without_marker in PARAGRAPH_HIERARCHY[
            reg_part]:
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [
        ch for ch in section_xml.getchildren() if ch.tag in ['P', 'STARS']
    ]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers next to them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section')):
                    #TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([
                            word[0].upper() + word[1:] for word in def_marker
                        ])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    #nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))

        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)

            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy_flag:
        depths = derive_depths([n.label[0] for n in nodes], [
            rules.depth_type_order([
                mtypes.lower, mtypes.ints, mtypes.roman, mtypes.upper,
                mtypes.em_ints, mtypes.em_roman
            ])
        ])

    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]

        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label
                ]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(len(nodes),
                                                       len(depths)))

    elif nodes and not manual_hierarchy_flag:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label
            ]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text,
                         label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
Пример #13
0
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()"""
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search('[0-9]+\.[0-9]+', xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'

        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        pass

    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(filter(lambda c: c.tag in ('P', 'STARS'),
                                        children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")

            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)

        elif not first_marker and not manual_hierarchy:
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)

            if nodes:
                previous = nodes[-1]
            else:
                previous = parent

            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags

        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([
                (mtypes.ints, mtypes.em_ints),
                (mtypes.lower, mtypes.roman, mtypes.upper),
                mtypes.upper, mtypes.em_ints, mtypes.em_roman])])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')

    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
Пример #14
0
def build_from_section(reg_part, section_xml):
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy = []
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        manual_hierarchy = PARAGRAPH_HIERARCHY[reg_part][
            section_no_without_marker]

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        # If the child has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if ch.get("depth") is not None:
            manual_hierarchy.append(int(ch.get("depth")))

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list and manual_hierarchy:
            # is this a bunch of definitions that don't have numbers next to
            # them?
            if len(nodes) > 0:
                if (subject_text.find('Definitions.') > -1
                        or nodes[-1].text.find(
                            'For the purposes of this section')):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))

        elif not markers_list and not manual_hierarchy:
            # No manual heirarchy specified, append to the section.
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)

            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth assignments
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                     mtypes.upper, mtypes.em_ints,
                                     mtypes.em_roman])])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]

        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, spec in zip(nodes, depths):
                if isinstance(spec, int):
                    depth = spec
                elif isinstance(spec, tuple):
                    depth, marker = spec
                    node.marker = marker
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error('Manual hierarchy length does not match node '
                          'list length! ({0} nodes but {1} provided, '
                          '{2})'.format(
                              len(nodes),
                              len(depths),
                              [x.label[0] for x in nodes]))

    elif nodes and not manual_hierarchy:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [node.label[0] for node in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes