Пример #1
0
def nodes_from_interp_p(xml_node):
    """Yield the interpretation nodes contained in one XML paragraph.

    The node for the paragraph itself comes first, followed by one node per
    collapsed marker found in its text. Whenever a yielded node's text ends
    with '* * *' an inline-stars node is yielded right after it."""
    plain = tree_utils.get_node_text(xml_node, add_spaces=True)
    tagged = tree_utils.get_node_text_tags_preserved(xml_node)
    leading_marker = get_first_interp_marker(tagged)
    matches = collapsed_markers_matches(plain, tagged)

    #   Every sub-paragraph begins two characters before the end of its
    #   match (the marker character + period); the final boundary is the end
    #   of the text
    cuts = [found.end() - 2 for found in matches] + [len(plain)]

    #   The paragraph itself: everything before the first collapsed marker
    current = Node(plain[:cuts[0]], label=[leading_marker],
                   node_type=Node.INTERP, tagged_text=tagged)
    yield current
    if current.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    #   One child per collapsed marker, spanning boundary to boundary
    for found, begin, finish in zip(matches, cuts, cuts[1:]):
        label = found.group(1)
        if label == '1':
            label = '<E T="03">1</E>'
        current = Node(plain[begin:finish], label=[label],
                       node_type=Node.INTERP)
        yield current
        if current.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
Пример #2
0
def process_inner_children(inner_stack, xml_node):
    """Turn the siblings that follow this interpretation's element (up to
    the next title) into nodes and pass them to add_nodes_to_stack. This is
    very similar to reg_text.py:build_from_section()"""
    def wanted(sibling):
        return sibling.tag in ('P', 'STARS')

    siblings = itertools.takewhile(
        lambda sib: not is_title(sib), xml_node.itersiblings())
    collected = []
    for child in filter(wanted, siblings):
        plain = tree_utils.get_node_text(child, add_spaces=True)
        tagged = tree_utils.get_node_text_tags_preserved(child)
        marker = get_first_interp_marker(tagged)

        if child.tag == 'STARS':
            collected.append(Node(label=[mtypes.STARS_TAG]))
            continue

        if marker or not collected:
            collected.extend(nodes_from_interp_p(child))
            continue

        # No marker, but there is an earlier node to absorb the text
        logger.warning("Couldn't determine interp marker. Appending to "
                       "previous paragraph: %s", plain)
        previous = collected[-1]
        previous.text += "\n\n" + plain
        if previous.tagged_text:
            previous.tagged_text += "\n\n" + tagged
        else:
            previous.tagged_text = tagged

    # Trailing stars don't matter; slightly more efficient to ignore them
    while collected and collected[-1].label[0] in mtypes.stars:
        collected.pop()

    add_nodes_to_stack(collected, inner_stack)
    def test_get_node_text_tags(self):
        """The extracted text keeps the <E> emphasis markup while the
        PRTPAGE element is left out."""
        text = '<P>(a)<E T="03">Fruit.</E>Apples,<PRTPAGE P="102"/> and '
        text += 'Pineapples</P>'
        doc = etree.fromstring(text)
        result = tree_utils.get_node_text_tags_preserved(doc)

        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12)
        self.assertEqual(
            '(a)<E T="03">Fruit.</E>Apples, and Pineapples', result)
Пример #4
0
    def process(self, appendix, part):
        """Walk the direct children of the appendix element, hand each one
        to the matching handler, then unwind the node stack. Returns the
        node stored at the bottom of the stack when there is one, otherwise
        None."""
        self.m_stack = tree_utils.NodeStack()

        self.part = part
        self.paragraph_count = 0
        self.header_count = 0
        self.depth = None
        self.appendix_letter = None
        # nodes wait here until their depth has been determined
        self.nodes = []

        self.set_letter(appendix)
        remove_toc(appendix, self.appendix_letter)

        paragraph_tags = ('P', 'FP')

        def is_subhead(tag, text):
            initial = initial_marker(text)
            if tag == 'HD' and (not initial or '.' in initial[1]):
                return True
            return (tag in paragraph_tags
                    and title_label_pair(text, self.appendix_letter,
                                         self.part))

        for child in appendix.getchildren():
            tag = child.tag
            text = tree_utils.get_node_text(child, add_spaces=True).strip()

            is_main_header = (
                tag == 'RESERVED'
                or (tag == 'HD' and child.attrib['SOURCE'] == 'HED'))

            if is_main_header:
                self.end_group()
                self.hed(part, text)
            elif is_subhead(tag, text):
                self.end_group()
                self.subheader(child, text)
            elif initial_marker(text) and tag in paragraph_tags + ('HD',):
                dashed = self.insert_dashes(child, text)
                tagged = tree_utils.get_node_text_tags_preserved(child)
                self.paragraph_with_marker(dashed, tagged)
            elif tag == 'SEQUENCE':
                # closing the group must not disturb the current depth
                saved_depth = self.depth
                self.end_group()
                self.depth = saved_depth
                self.process_sequence(child)
            elif tag in paragraph_tags:
                self.paragraph_no_marker(self.insert_dashes(child, text))
            elif tag == 'GPH':
                self.graphic(child)
            elif tag == 'GPOTABLE':
                self.table(child)
            elif tag in ('NOTE', 'NOTES'):
                self.fence(child, 'note')
            elif tag == 'CODE':
                self.fence(child, child.get('LANGUAGE', 'code'))

        self.end_group()
        while self.m_stack.size() > 1:
            self.m_stack.unwind()

        bottom = self.m_stack.m_stack[0]
        if bottom:
            return bottom[0][1]
        return None
Пример #5
0
    def process_sequence(self, root):
        """Feed every child of `root` through paragraph_with_marker, then
        close the group with the depth temporarily raised by one."""
        for element in root.getchildren():
            plain = tree_utils.get_node_text(element, add_spaces=True).strip()
            dashed = self.insert_dashes(element, plain)
            tagged = tree_utils.get_node_text_tags_preserved(element)
            self.paragraph_with_marker(dashed, tagged)

        # end_group() sees depth + 1; the previous depth is put back after
        outer_depth = self.depth
        self.depth = outer_depth + 1
        self.end_group()
        self.depth = outer_depth
Пример #6
0
 def derive_nodes(self, xml, processor=None):
     """Create one Node per marker found in the element. When the last
     piece of text ends with '* * *' an inline-stars Node is appended."""
     tagged_source = tree_utils.get_node_text_tags_preserved(xml).strip()
     markers = get_markers(tagged_source, self.next_marker(xml))

     derived = []
     last_plain = ''
     for marker, (plain, tagged) in get_markers_and_text(xml, markers):
         piece = Node(text=plain.strip(), label=[marker], source_xml=xml)
         piece.tagged_text = unicode(tagged.strip())
         derived.append(piece)
         # remembered unstripped for the stars check below
         last_plain = plain

     if last_plain.endswith('* * *'):
         derived.append(Node(label=[mtypes.INLINE_STARS]))
     return derived
    def derive_nodes(self, xml, processor=None):
        """Return a single-element list holding a Node for this element.
        The label is the marker matched at the start of the text, or
        MARKERLESS when neither pattern matches."""
        plain = tree_utils.get_node_text(xml).strip()
        result = Node(text=plain, source_xml=xml)
        tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
        result.tagged_text = six.text_type(tagged)

        # Text opening with a parenthesis uses the paren pattern, anything
        # else the period pattern
        if plain.startswith('('):
            pattern = self._PAREN_REGEX
        else:
            pattern = self._PERIOD_REGEX

        found = pattern.match(plain)
        marker = found.group('marker') if found else mtypes.MARKERLESS
        result.label = [marker]

        return [result]
Пример #8
0
def get_markers_and_text(node, markers_list):
    """Pair each marker with the text that belongs to it.

    :param node: XML element whose text is being split.
    :param markers_list: markers found in the element; may carry
        '<E T="03">' emphasis markup.
    :return: zip of `markers_list` with (plain text, tagged text) tuples.
        With several markers the text is split on the parenthesized
        markers; with a single marker the whole text is paired with it;
        with no markers the result is empty.
    """
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        # The plain text has no emphasis markup, so strip it from the
        # markers used to split that text
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        node_text_list = [(node_text, text_with_tags)]
    else:
        # Without this branch the name was never bound and the return
        # statement raised UnboundLocalError for an empty markers_list
        node_text_list = []
    return zip(markers_list, node_text_list)
Пример #9
0
 def derive_nodes(self, xml, processor=None):
     """Split the element's text on its parenthesized paragraph markers
     and return one Node per marker."""
     plain = tree_utils.get_node_text(xml).strip()
     tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
     markers = self.paragraph_markers(plain)
     parenthesized = ['({})'.format(marker) for marker in markers]
     plain_pieces = tree_utils.split_text(plain, parenthesized)
     tagged_pieces = tree_utils.split_text(tagged, parenthesized)

     derived = []
     for marker, plain_piece, tagged_piece in zip(
             markers, plain_pieces, tagged_pieces):
         piece = Node(text=plain_piece.strip(), label=[marker],
                      source_xml=xml)
         piece.tagged_text = six.text_type(tagged_piece.strip())
         derived.append(piece)
     return derived
Пример #10
0
def get_markers_and_text(node, markers_list):
    """Pair each marker with the text that belongs to it.

    :param node: XML element whose text is being split.
    :param markers_list: markers found in the element; may carry
        '<E T="03">' emphasis markup. NOTE: this list is modified in place
        (MARKERLESS is inserted at the front) when the split produces one
        more piece of text than there are markers.
    :return: zip of the markers with (plain text, tagged text) tuples.
    """
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    actual_markers = ['(%s)' % m for m in markers_list]
    # The plain text has no emphasis markup, so strip it from the markers
    # used to split that text
    plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                     for m in actual_markers]
    node_texts = tree_utils.split_text(node_text, plain_markers)
    tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
    # Materialize the pairs: on Python 3 zip() returns an iterator, which
    # has no len() and would make the comparison below raise TypeError
    node_text_list = list(zip(node_texts, tagged_texts))

    if len(node_text_list) > len(markers_list):     # diff can only be 1
        markers_list.insert(0, mtypes.MARKERLESS)
    return zip(markers_list, node_text_list)
Пример #11
0
def split_by_markers(xml):
    """Break an xml node into its subparagraphs, returning a list of
        (marker, plain-text following, text-with-tags following)
    triplets, one per subparagraph"""
    plain = tree_utils.get_node_text(xml, add_spaces=True).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers = get_markers(tagged, next_marker(xml))

    # The plain text is split on de-emphasized markers, the tagged text on
    # the markers exactly as found
    plain_delimiters = [
        '({})'.format(mtypes.deemphasize(marker)) for marker in markers]
    plain_pieces = tree_utils.split_text(plain, plain_delimiters)
    tagged_delimiters = ['({})'.format(marker) for marker in markers]
    tagged_pieces = tree_utils.split_text(tagged, tagged_delimiters)

    # An extra leading piece means text precedes the first marker
    if len(markers) < len(plain_pieces):
        markers.insert(0, mtypes.MARKERLESS)
    return list(zip(markers, plain_pieces, tagged_pieces))
Пример #12
0
def next_marker(xml):
    """Look at the siblings after this xml node for the first 'P', 'FP' or
    stars element. Returns the stars tag for a stars element, otherwise the
    first marker of that paragraph. May return None"""
    accepted = ('P', 'FP', mtypes.STARS_TAG)

    sibling = xml.getnext()
    while not (sibling is None or sibling.tag in accepted):
        sibling = sibling.getnext()

    if sibling is None:
        return None
    if sibling.tag == mtypes.STARS_TAG:
        return mtypes.STARS_TAG

    tagged = tree_utils.get_node_text_tags_preserved(sibling)
    found = get_markers(tagged.strip())
    return found[0] if found else None
Пример #13
0
def interp_inner_child(child_node, stack):
    """Build an inner child node (basically a node that's after -Interp- in
    the tree). A paragraph with no recognizable marker is appended to the
    first node of the stack's lineage instead, when that lineage is not
    empty"""
    plain = tree_utils.get_node_text(child_node, add_spaces=True)
    tagged = tree_utils.get_node_text_tags_preserved(child_node)
    marker = get_first_interp_marker(tagged)

    if marker or not stack.lineage():
        child_with_marker(child_node, stack)
        return

    logging.warning("Couldn't determine interp marker. Appending to "
                    "previous paragraph: %s", plain)
    separator = "\n\n"
    previous = stack.lineage()[0]
    previous.text += separator + plain
    if not hasattr(previous, 'tagged_text'):
        previous.tagged_text = tagged
    else:
        previous.tagged_text += separator + tagged
Пример #14
0
def child_with_marker(child_node, stack):
    """Create the node for an interp's inner child and place it, together
    with a node for each collapsed marker in its text, on the stack.
    Assumes the paragraph begins with a paragraph marker."""
    plain = tree_utils.get_node_text(child_node, add_spaces=True)
    tagged = tree_utils.get_node_text_tags_preserved(child_node)
    leading_marker = get_first_interp_marker(tagged)

    matches = collapsed_markers_matches(plain)

    #   Every sub-paragraph begins two characters before the end of its
    #   match (the marker character + period); the final boundary is the end
    #   of the text
    cuts = [found.end() - 2 for found in matches] + [len(plain)]

    #   The paragraph itself: everything before the first collapsed marker
    paragraph = Node(plain[:cuts[0]], label=[leading_marker],
                     node_type=Node.INTERP)
    paragraph.tagged_text = tagged
    top = stack.peek()

    if len(top) == 0:
        stack.push_last((interpretation_level(leading_marker), paragraph))
    else:
        level = interpretation_level(leading_marker, top[0][0])
        if level is None:
            logging.warning("Couldn't determine node_level for this "
                            + "interpretation paragraph: " + paragraph.text)
            # fall back to one level below the top of the stack
            level = top[0][0] + 1
        stack.add(level, paragraph)

    #   One child per collapsed marker, spanning boundary to boundary
    for found, begin, finish in zip(matches, cuts, cuts[1:]):
        marker = found.group(1)
        child = Node(plain[begin:finish], label=[marker],
                     node_type=Node.INTERP)
        level = interpretation_level(marker)
        if len(stack.peek()) == 0:
            stack.push_last((level, child))
        else:
            stack.add(level, child)
Пример #15
0
def build_from_section(reg_part, section_xml):
    """Build the node(s) for a section XML element and return them in a
    list.

    Each 'P' child is scanned for paragraph markers. Paragraphs without
    markers are collected as the section's own text; paragraphs with
    markers become Nodes that are placed on a NodeStack at the level chosen
    by determine_level(). One section Node is then created for every
    section number parsed out of the SECTNO text, added to the bottom of
    the stack, and the stack is unwound to produce the returned node."""
    # Level of the most recently stacked paragraph; fed back into
    # determine_level() for the next one
    p_level = 1
    m_stack = tree_utils.NodeStack()
    # (plain text, tagged text) pairs for paragraphs that have no marker
    section_texts = []
    for ch in (ch for ch in section_xml.getchildren() if ch.tag == 'P'):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if not markers_list:
            section_texts.append((text, tagged_text))
        else:
            markers_and_text = get_markers_and_text(ch, markers_list)

            #   Easier to reason if we view the list as a stack
            markers_and_text = list(reversed(markers_and_text))
            while markers_and_text:
                m, node_text = markers_and_text.pop()
                # The label uses the marker without its emphasis markup
                m_sans_markup = m.replace('<E T="03">', '').replace('</E>', '')
                n = Node(node_text[0], [], [str(m_sans_markup)],
                         source_xml=ch)
                # NOTE(review): `unicode` is the Python 2 builtin; unless it
                # is defined elsewhere in the file this fails on Python 3
                n.tagged_text = unicode(node_text[1])

                # The markers still waiting to be processed are passed along
                # so the upcoming marker can inform the level
                new_p_level = determine_level(
                    m, p_level, next_marker(ch, markers_and_text))

                last = m_stack.peek()
                if len(last) == 0:
                    m_stack.push_last((new_p_level, n))
                else:
                    m_stack.add(new_p_level, n)
                p_level = new_p_level

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    # Fall back to a RESERVED element when there is no SUBJECT
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    # Every "<reg_part>.<digits>" occurrence in SECTNO is a section number
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        # Unpacking requires exactly two numbers to have been matched
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        # The section text is the element's leading text followed by every
        # marker-less paragraph
        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        # Collapse the stack down to a single level before popping it
        while m_stack.size() > 1:
            m_stack.unwind()

        # NOTE(review): the same m_stack is reused for every section number
        # in a span, after having been popped here - confirm what later
        # iterations are expected to contain
        nodes.append(m_stack.pop()[0][1])

    return nodes
Пример #16
0
def build_from_section(reg_part, section_xml):
    """Build the node(s) for a section XML element and return them in a
    list.

    'P' and 'STARS' children are turned into a flat list of nodes
    (paragraphs without markers are instead collected as the section's own
    text). derive_depths() proposes depth assignments for the node labels;
    the highest-weighted one decides where each node goes on a NodeStack.
    One section Node is then created for every section number parsed out of
    the SECTNO text, added to the bottom of the stack, and the stack is
    unwound to produce the returned node."""
    # (plain text, tagged text) pairs for paragraphs that have no marker
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                # NOTE(review): `unicode` is the Python 2 builtin; unless it
                # is defined elsewhere in the file this fails on Python 3
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # Uses the loop variable after the loop, i.e. only the last
            # piece of text of this paragraph is checked for inline stars.
            # NOTE(review): relies on the loop having run at least once
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            # Entries typed as stars are not placed on the stack
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                # Labels are stored without their emphasis markup
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    # Fall back to a RESERVED element when there is no SUBJECT
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    # From here on `nodes` holds the section nodes that get returned, no
    # longer the paragraph nodes collected above
    nodes = []
    section_nums = []
    # Every "<reg_part>.<digits>" occurrence in SECTNO is a section number
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        # Unpacking requires exactly two numbers to have been matched
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        # The section text is the element's leading text followed by every
        # marker-less paragraph
        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        # Collapse the stack down to a single level before popping it
        while m_stack.size() > 1:
            m_stack.unwind()

        # NOTE(review): the same m_stack is reused for every section number
        # in a span, after having been popped here - confirm what later
        # iterations are expected to contain
        nodes.append(m_stack.pop()[0][1])

    return nodes
Пример #17
0
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: node stack which receives the derived nodes.
    :param xml_node: element whose text is searched for a "part.section"
        number (used to look up a configured manual hierarchy) and whose
        following siblings, up to the next title, are processed.
    :param parent: optional node that absorbs marker-less text appearing
        before any paragraph node has been created.
    """
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        part_and_section = re.search(
            r'[0-9]+\.[0-9]+', xml_node.text).group(0)
        part = part_and_section.split('.')[0]
        part_and_section += '-Interp'

        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            # Copy the configured list: depths read from the XML are
            # appended below and must not leak into the shared configuration
            manual_hierarchy = list(
                PARAGRAPH_HIERARCHY[part][part_and_section])
    except Exception:
        # Best effort only: no usable section number (or configuration)
        # simply means no manual hierarchy
        pass

    def strip_emphasis(label):
        """Remove the emphasis markup from every part of a label."""
        return [l.replace('<E T="03">', '').replace('</E>', '')
                for l in label]

    def add_to_stack(level, node):
        """Clean the node's label and place the node on inner_stack."""
        node.label = strip_emphasis(node.label)
        if len(inner_stack.peek()) == 0:
            inner_stack.push_last((level, node))
        else:
            inner_stack.add(level, node)

    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for i, child in enumerate(filter(lambda c: c.tag in ('P', 'STARS'),
                                     children)):
        node_text = tree_utils.get_node_text(child, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(child)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a depth attribute (read as lowercase "depth"),
        # we're in manual hierarchy mode, just constructed from the XML
        # instead of specified in configuration.
        # This presumes that every child in the section has the attribute
        # specified, if not, things will break in and around
        # derive_depths below.
        if child.get("depth") is not None:
            manual_hierarchy.append(int(child.get("depth")))

        # Marker-less text can only be appended when something exists to
        # hold it: the previous node, or else the supplied parent
        append_target = nodes[-1] if nodes else parent

        if child.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")

            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)

        elif (not first_marker and not manual_hierarchy
                and append_target is not None):
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)

            append_target.text += "\n\n" + node_text
            if hasattr(append_target, 'tagged_text'):
                append_target.tagged_text += "\n\n" + text_with_tags
            else:
                append_target.tagged_text = text_with_tags

        else:
            # Also reached for marker-less text with nothing to append to
            # (no nodes yet and no parent), which used to raise
            # AttributeError on None
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            #   -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            #   Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            #   Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([
                (mtypes.ints, mtypes.em_ints),
                (mtypes.lower, mtypes.roman, mtypes.upper),
                mtypes.upper, mtypes.em_ints, mtypes.em_roman])])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            # Entries typed as stars are not placed on the stack
            if par.typ != mtypes.stars:
                add_to_stack(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                add_to_stack(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')

    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            add_to_stack(3, node)
 def derive_nodes(self, xml, processor=None):
     """Wrap the element's stripped text in a single MARKERLESS Node and
     return it in a one-element list."""
     tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
     plain_text = tree_utils.get_node_text(xml).strip()
     node = Node(text=plain_text, tagged_text=tagged_text,
                 label=[mtypes.MARKERLESS])
     return [node]
 def test_no_tags(self):
     """A paragraph without inline tags yields its text unchanged."""
     expected = '(a) Fruit. Apples, and Pineapples'
     paragraph = etree.fromstring(
         '<P>(a) Fruit. Apples, and Pineapples</P>')
     extracted = tree_utils.get_node_text_tags_preserved(paragraph)
     self.assertEqual(expected, extracted)
def process_inner_children(inner_stack, xml_node):
    """Turn the siblings that follow this interpretation's element (up to
    the next title) into nodes, derive their depths and place them on
    inner_stack. This is very similar to reg_text.py:build_from_section()"""
    def wanted(sibling):
        return sibling.tag in ('P', 'STARS')

    siblings = itertools.takewhile(
        lambda sib: not is_title(sib), xml_node.itersiblings())
    collected = []
    for child in filter(wanted, siblings):
        plain = tree_utils.get_node_text(child, add_spaces=True)
        tagged = tree_utils.get_node_text_tags_preserved(child)
        leading_marker = get_first_interp_marker(tagged)

        if child.tag == 'STARS':
            collected.append(Node(label=[mtypes.STARS_TAG]))
            continue

        if not leading_marker and collected:
            # No marker, but there is an earlier node to absorb the text
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", plain)
            previous = collected[-1]
            previous.text += "\n\n" + plain
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + tagged
            else:
                previous.tagged_text = tagged
            continue

        matches = collapsed_markers_matches(plain, tagged)

        #   Every sub-paragraph begins two characters before the end of its
        #   match (the marker character + period); the final boundary is
        #   the end of the text
        cuts = [found.end() - 2 for found in matches] + [len(plain)]

        #   The paragraph itself: everything before the first collapsed
        #   marker
        paragraph = Node(plain[:cuts[0]], label=[leading_marker],
                         node_type=Node.INTERP)
        paragraph.tagged_text = tagged
        collected.append(paragraph)
        if paragraph.text.endswith('* * *'):
            collected.append(Node(label=[mtypes.INLINE_STARS]))

        #   One child per collapsed marker, spanning boundary to boundary
        for found, begin, finish in zip(matches, cuts, cuts[1:]):
            marker = found.group(1)
            if marker == '1':
                marker = '<E T="03">1</E>'
            sub_paragraph = Node(plain[begin:finish], label=[marker],
                                 node_type=Node.INTERP)
            collected.append(sub_paragraph)
            if sub_paragraph.text.endswith('* * *'):
                collected.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while collected and collected[-1].label[0] in mtypes.stars:
        collected.pop()

    # Use constraint programming to figure out possible depth assignments
    labels = [node.label[0] for node in collected]
    type_order = rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                         (mtypes.roman, mtypes.upper),
                                         mtypes.upper, mtypes.em_ints,
                                         mtypes.em_roman])
    depths = derive_depths(labels, [type_order])
    if not depths:
        return

    # Find the assignment which violates the least of our heuristics
    ranked = sorted(heuristics.prefer_multiple_children(depths, 0.5),
                    key=lambda d: d.weight, reverse=True)
    best = ranked[0]
    for node, par in zip(collected, best):
        if par.typ == mtypes.stars:
            continue
        top = inner_stack.peek()
        node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                      for l in node.label]
        if len(top) == 0:
            inner_stack.push_last((3 + par.depth, node))
        else:
            inner_stack.add(3 + par.depth, node)
Пример #21
0
def build_from_section(reg_part, section_xml):
    """Turn one section XML element into a list of section nodes.

    The function walks the ``P`` and ``STARS`` children of ``section_xml``,
    creates a ``Node`` for every marked paragraph (and for unnumbered
    definitions when a manual hierarchy is in effect), assigns each node a
    depth and finally hangs them under a freshly built section node.

    Depths come from one of three places, tried in this order:

    * a manual hierarchy, taken from ``PARAGRAPH_HIERARCHY`` and/or from
      ``depth`` attributes on the child elements;
    * ``derive_depths`` plus the ``prefer_multiple_children`` heuristic;
    * a flat fallback (every node at the same level) with a warning.

    :param reg_part: the regulation part; concatenated into the section
        title and label and interpolated into a regex, so it is expected
        to be a string.
    :param section_xml: element exposing ``xpath``, ``getchildren`` and
        ``text`` (presumably an lxml element -- confirm with callers).
    :returns: a list with one entry per section number found in the
        ``SECTNO`` text (a "§§ a-b" span is expanded to every number in the
        range).  Each entry is taken from the bottom of the node stack
        after unwinding.
    :raises IndexError: when ``SECTNO`` or both ``SUBJECT`` and
        ``RESERVED`` are missing.
    :raises AttributeError: when the ``SECTNO`` text contains no
        ``<digits>.<digits>`` sequence.
    """
    section_texts = []
    nodes = []

    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search(r'[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    manual_hierarchy = []
    if (reg_part in PARAGRAPH_HIERARCHY
            and section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]):
        # Work on a copy: depths read from the XML are appended below and
        # must not leak into the shared configuration.
        manual_hierarchy = list(
            PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker])

    # Collect paragraph markers and section text (intro text for the
    # section)
    def_count = 0  # counter used to name definitions without a usable term
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ('P', 'STARS')]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        # If the child has a 'depth' attribute, we're in manual hierarchy
        # mode, just constructed from the XML instead of specified in
        # configuration.  This presumes that every child in the section
        # has a depth specified; if not, the length check further down
        # reports the mismatch.
        if ch.get("depth") is not None:
            manual_hierarchy.append(int(ch.get("depth")))

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list and manual_hierarchy:
            # Is this a bunch of definitions that don't have numbers next
            # to them?
            if nodes:
                # ``in`` rather than a bare ``str.find``: find() returns -1
                # (truthy) on a miss and 0 (falsy) on a hit at position 0.
                # The subject text may be None, hence the ``or ''``.
                looks_like_definition = (
                    'Definitions.' in (subject_text or '')
                    or 'For the purposes of this section' in nodes[-1].text)
                if looks_like_definition:
                    # TODO: create a grammar for definitions
                    if 'means' in text:
                        words = text.split('means')[0].strip().split()
                        def_marker = ''.join(word[0].upper() + word[1:]
                                             for word in words)
                    elif 'shall have the same meaning' in text:
                        words = text.split('shall')[0].strip().split()
                        def_marker = ''.join(word[0].upper() + word[1:]
                                             for word in words)
                    else:
                        def_marker = 'def{0}'.format(def_count)
                        def_count += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            elif len(children) > 1:
                def_marker = 'def{0}'.format(def_count)
                n = Node(text, [], [def_marker], source_xml=ch)
                n.tagged_text = tagged_text
                def_count += 1
                nodes.append(n)
            else:
                # this is the only node around
                section_texts.append((text, tagged_text))

        elif not markers_list and not manual_hierarchy:
            # No manual hierarchy specified, append to the section.
            section_texts.append((text, tagged_text))
        else:
            # Remember the text of the last paragraph produced for *this*
            # child so the stars check never reads a value left over from
            # a previous child (or an unbound name).
            last_text = None
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                last_text = node_text[0]

            if last_text is not None and last_text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes.pop()

    m_stack = tree_utils.NodeStack()

    def place(depth, node):
        """Strip emphasis tags from the label and put node on the stack."""
        node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                      for l in node.label]
        if len(m_stack.peek()) == 0:
            m_stack.push_last((depth, node))
        else:
            m_stack.add(depth, node)

    # Use constraint programming to figure out possible depth assignments
    depths = None
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                     mtypes.upper, mtypes.em_ints,
                                     mtypes.em_roman])])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]

        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                place(1 + par.depth, node)

    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, spec in zip(nodes, depths):
                # A spec is either a bare depth or a (depth, marker) pair.
                if isinstance(spec, int):
                    depth = spec
                elif isinstance(spec, tuple):
                    depth, marker = spec
                    node.marker = marker
                place(1 + depth, node)
        else:
            logging.error('Manual hierarchy length does not match node '
                          'list length! ({0} nodes but {1} provided, '
                          '{2})'.format(
                              len(nodes),
                              len(depths),
                              [x.label[0] for x in nodes]))

    elif nodes and not manual_hierarchy:
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [node.label[0] for node in nodes]))
        # No depth information at all: keep every node on one level.
        for node in nodes:
            place(3, node)

    nodes = []
    section_nums = [int(match.group(1))
                    for match in re.finditer(r'%s\.(\d+)' % reg_part,
                                             section_no)]

    #  Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = list(range(first, last + 1))

    # These do not depend on the section number, so build them once.
    plain_sect_texts = [s[0] for s in section_texts]
    tagged_sect_texts = [s[1] for s in section_texts]
    # The element's own text may be None; str.join would reject that.
    lead_text = section_xml.text or ''
    section_text = ' '.join([lead_text] + plain_sect_texts)
    tagged_section_text = ' '.join([lead_text] + tagged_sect_texts)

    for section_number in section_nums:
        section_number = str(section_number)

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(section_text, label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes