def test_text_to_labels(self):
        # Each case pairs a heading with the interpretation labels that
        # text_to_labels is expected to produce for it when parsed against
        # a comment Label for part 1111.
        cases = [
            (u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and "
             u"(b)(2)",
             [['1111', '9', 'c', '2', 'iii', 'Interp']]),
            ("Paragraphs 4(b)(7) and (b)(8).",
             [['1111', '4', 'b', '7', 'Interp'],
              ['1111', '4', 'b', '8', 'Interp']]),
            ("Appendices G and H-Something",
             [['1111', 'G', 'Interp'], ['1111', 'H', 'Interp']]),
            ("Paragraph 38(l)(7)(i)(A)(2).",
             [['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']]),
        ]

        for text, expected in cases:
            # A fresh Label per case, mirroring one Label per call.
            context = Label(part='1111', comment=True)
            self.assertEqual(
                expected, interpretation.text_to_labels(text, context))
def _p_with_label_in_child(xml_node):
    """E.g. <P><E>22(a)</E>.</P>

    Checks for a ``P`` element that has no text of its own and exactly one
    child, where nothing but whitespace/periods follows that child and the
    child's text yields labels via ``text_to_labels``.

    Returns a falsy value as soon as one of the checks fails; otherwise
    returns whatever ``text_to_labels`` produced for the child's text.
    """
    # list(element) replaces the deprecated Element.getchildren()
    children = list(xml_node)
    return (
        xml_node.tag.upper() == 'P'
        and not (xml_node.text or '').strip()
        and len(children) == 1
        # Trailing punctuation such as the "." in the example is tolerated
        and not (children[0].tail or '').strip(" \n\t.")
        and text_to_labels(children[0].text, Label(), warn=False)
    )
def is_title(xml_node):
    """Report whether ``xml_node`` acts as a title.

    Not all titles are created equal. Sometimes a title appears as a
    paragraph tag, mostly to add confusion. Three shapes count as a title:

    * an ``HD`` element whose ``SOURCE`` attribute is not ``'HED'``;
    * a ``P`` with no text of its own and exactly one child, where only
      whitespace/periods follow the child and the child's text yields
      labels, e.g. ``<P><E>22(a)</E>.</P>``;
    * a childless ``P`` with text for which ``get_first_interp_marker``
      finds nothing and which yields labels when parsed with
      ``force_start=True``.

    The ``SOURCE`` attribute is read with a plain subscript, so an ``HD``
    lacking it is not handled specially. Always returns a ``bool``.
    """
    tag = xml_node.tag.upper()
    if tag == 'HD':
        return xml_node.attrib['SOURCE'] != 'HED'
    if tag != 'P':
        return False

    # list(element) replaces the deprecated Element.getchildren(); take the
    # children once rather than re-querying the node for every check.
    children = list(xml_node)
    text = xml_node.text

    if len(children) == 1:
        only_child = children[0]
        return bool(
            not (text or '').strip()
            and not (only_child.tail or '').strip(" \n\t.")
            and text_to_labels(only_child.text, Label(), warn=False))

    if children:
        # Two or more children never form a title paragraph
        return False

    return bool(
        text and not get_first_interp_marker(text)
        and text_to_labels(text, Label(), warn=False, force_start=True))
def _non_interp_p_with_label(xml_node):
    """E.g. <P>22(a)</P> but not <P>ii. 22(a)</P>

    Checks for a childless ``P`` element with text for which
    ``get_first_interp_marker`` finds nothing and which yields labels when
    parsed from its start (``force_start=True``).

    Returns a falsy value as soon as one of the checks fails; otherwise
    returns whatever ``text_to_labels`` produced for the node's text.
    """
    text = xml_node.text
    return (
        xml_node.tag.upper() == 'P'
        # len(element) counts children; avoids the deprecated getchildren()
        and len(xml_node) == 0
        and text and not get_first_interp_marker(text)
        and text_to_labels(text, Label(), warn=False, force_start=True)
    )
def is_title(xml_node):
    """Report whether ``xml_node`` acts as a title.

    Not all titles are created equal. Sometimes a title appears as a
    paragraph tag, mostly to add confusion. A node is a title when it is:

    * an ``HD`` element whose ``SOURCE`` attribute is not ``'HED'``; or
    * a ``P`` without text of its own holding exactly one child, with only
      whitespace/periods after that child, whose text yields labels; or
    * a childless ``P`` with text for which ``get_first_interp_marker``
      finds nothing and which yields labels with ``force_start=True``.

    Always returns a ``bool``.
    """
    tag = xml_node.tag.upper()
    text = xml_node.text
    # list(element) replaces the deprecated Element.getchildren(); the
    # children are fetched once and shared by both paragraph checks.
    children = list(xml_node)
    return bool(
        (tag == 'HD' and xml_node.attrib['SOURCE'] != 'HED')
        or (tag == 'P'
            and not (text or '').strip()
            and len(children) == 1
            and not (children[0].tail or '').strip(" \n\t.")
            and text_to_labels(children[0].text, Label(), warn=False))
        or (tag == 'P'
            and not children
            and text and not get_first_interp_marker(text)
            and text_to_labels(text, Label(), warn=False,
                               force_start=True)))
def _p_with_label_in_child(xml_node):
    """E.g. <P><E>22(a)</E>.</P>

    True-ish when ``xml_node`` is a ``P`` with no text of its own and a
    single child, nothing but whitespace/periods trails that child, and the
    child's text yields labels. The value returned is the first falsy check
    result, or else the result of ``text_to_labels``.
    """
    # Iterating the element directly replaces the deprecated getchildren()
    children = list(xml_node)
    own_text = (xml_node.text or '').strip()
    return (
        xml_node.tag.upper() == 'P' and
        not own_text and
        len(children) == 1 and
        not (children[0].tail or '').strip(" \n\t.") and
        text_to_labels(children[0].text, Label(), warn=False)
    )
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing.

    root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations. Only nodes accepted by is_title (and not
    mentioning "subpart") start a new interpretation node; everything
    collected is passed to treeify and the first resulting tree is returned.
    """
    collected = [root]
    previous_label = root.label
    unlabeled_headers = 0

    for xml_node in xml_nodes:
        # The label context is always derived from the last title seen
        context = Label.from_node(
            Node(label=previous_label, node_type=Node.INTERP))
        heading = tree_utils.get_node_text(xml_node, add_spaces=True)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        if not is_title(xml_node) or 'subpart' in heading.lower():
            continue

        found = text_to_labels(heading, context)
        if found:
            label = merge_labels(found)
        else:
            # Header without a label, like an Introduction, etc.
            unlabeled_headers += 1
            label = root.label[:2] + ['h%d' % unlabeled_headers]

        collected.extend(missing_levels(previous_label, label))
        previous_label = label

        title_node = Node(node_type=Node.INTERP, label=label,
                          title=heading.strip())
        stack = tree_utils.NodeStack()
        stack.add(2, title_node)
        process_inner_children(stack, xml_node, parent=title_node)

        # Collapse the stack until a single level remains
        while stack.size() > 1:
            stack.unwind()
        collected.append(stack.m_stack[0][0][1])

    forest = treeify(collected)

    def strip_emphasis(tree_node):
        """Drop stray emphasis tags from this node's label and below."""
        tree_node.label = [part.replace('<E T="03">', '')
                           for part in tree_node.label]
        for descendant in tree_node.children:
            strip_emphasis(descendant)

    for top_level in forest:
        strip_emphasis(top_level)

    return forest[0]
예제 #8
0
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing.

    root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations. Returns the first tree produced by treeify over
    the root, any missing intermediate levels and one node per title.
    """
    nodes = [root]
    prior_label = root.label
    n_plain_headers = 0

    for ch in xml_nodes:
        label_obj = Label.from_node(
            Node(label=prior_label, node_type=Node.INTERP))
        text = tree_utils.get_node_text(ch, add_spaces=True)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if not labels:
                # Header without a label, like an Introduction, etc.
                n_plain_headers += 1
            label = (merge_labels(labels) if labels
                     else root.label[:2] + ['h%d' % n_plain_headers])

            nodes.extend(missing_levels(prior_label, label))
            prior_label = label

            header = Node(node_type=Node.INTERP, label=label,
                          title=text.strip())
            inner = tree_utils.NodeStack()
            inner.add(2, header)
            process_inner_children(inner, ch, parent=header)

            # Unwind until only the bottom level of the stack is left
            while inner.size() > 1:
                inner.unwind()
            nodes.append(inner.m_stack[0][0][1])

    supplement_tree = treeify(nodes)

    # Remove leftover emphasis tags from every label in every tree; an
    # explicit work list visits each node together with its descendants.
    pending = list(supplement_tree)
    while pending:
        current = pending.pop()
        current.label = [piece.replace('<E T="03">', '')
                         for piece in current.label]
        pending.extend(current.children)

    return supplement_tree[0]
예제 #9
0
    def test_text_to_labels(self):
        # Every heading is parsed against a fresh comment Label for part
        # 1111; the helper keeps that boilerplate in one place.
        def parsed(heading):
            return interpretation.text_to_labels(
                heading, Label(part='1111', comment=True))

        # Citation later in the heading must not add labels
        heading = u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and "
        heading += "(b)(2)"
        self.assertEqual(
            [['1111', '9', 'c', '2', 'iii', 'Interp']],
            parsed(heading))

        # "and (b)(8)" inherits the section from the preceding paragraph
        self.assertEqual(
            [['1111', '4', 'b', '7', 'Interp'],
             ['1111', '4', 'b', '8', 'Interp']],
            parsed("Paragraphs 4(b)(7) and (b)(8)."))

        # Multiple appendices in a single heading
        self.assertEqual(
            [['1111', 'G', 'Interp'], ['1111', 'H', 'Interp']],
            parsed("Appendices G and H-Something"))

        # Deeply nested paragraph
        self.assertEqual(
            [['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']],
            parsed("Paragraph 38(l)(7)(i)(A)(2)."))
        def per_node(node):
            """Index an interpretation node under its own label and under
            every label found in its title."""
            is_interp = node.node_type == struct.Node.INTERP
            if not is_interp or node.label[-1] != struct.Node.INTERP_MARK:
                return

            #   Always add a connection based on the interp's label
            own_key = tuple(node.label[:-1])
            self.lookup_table[own_key].append(node)

            #   Also add connections based on the title
            title_labels = text_to_labels(
                node.title or '', Label.from_node(node), warn=False)
            for title_label in title_labels:
                key = tuple(title_label[:-1])  # Remove Interp marker
                bucket = self.lookup_table[key]
                if node not in bucket:
                    bucket.append(node)
예제 #11
0
        def per_node(node):
            """Register ``node`` in the lookup table when it is an
            interpretation whose label ends with the interp marker."""
            if node.node_type != struct.Node.INTERP:
                return
            if node.label[-1] != struct.Node.INTERP_MARK:
                return

            #   Always add a connection based on the interp's label
            self.lookup_table[tuple(node.label[:-1])].append(node)

            #   Also add connections based on the title
            context = Label.from_node(node)
            for found in text_to_labels(node.title or '', context,
                                        warn=False):
                target = tuple(found[:-1])   # Remove Interp marker
                linked = self.lookup_table[target]
                if node not in linked:
                    linked.append(node)
def _non_interp_p_with_label(xml_node):
    """E.g. <P>22(a)</P> but not <P>ii. 22(a)</P>

    True-ish for a ``P`` element without children whose text has no leading
    interpretation marker (per ``get_first_interp_marker``) and yields
    labels when parsed with ``force_start=True``. The value returned is the
    first falsy check result, or else the result of ``text_to_labels``.
    """
    # len(element) is the child count; replaces the deprecated getchildren()
    childless = len(xml_node) == 0
    return (xml_node.tag.upper() == 'P' and childless
            and xml_node.text and not get_first_interp_marker(xml_node.text)
            and text_to_labels(
                xml_node.text, Label(), warn=False, force_start=True))