Exemplo n.º 1
0
def table_xml_to_data(xml_node):
    """Convert a table's XML into a plain dict describing the table.

    The result always has a 'header' key (a list of header rows, each a list
    of dicts with 'text', 'colspan' and 'rowspan') and a 'rows' key (a list
    of rows, each a list of stripped cell strings). A 'caption' key is added
    only when the table has a TTITLE child. This shape is used instead of the
    native XML because the XML encodes too much logic."""
    root = build_header(xml_node.xpath('./BOXHD/CHED'))
    levels = [[] for _ in range(root.height())]

    def collect(header_node):
        cell = {
            'text': header_node.text,
            'colspan': header_node.colspan,
            'rowspan': header_node.rowspan
        }
        levels[header_node.level].append(cell)

    struct.walk(root, collect)

    body = [
        [tree_utils.get_node_text(ent, add_spaces=True).strip()
         for ent in row.xpath('./ENT')]
        for row in xml_node.xpath('./ROW')
    ]

    # levels[0] belongs to the root of the header tree, so it is skipped
    table_data = {'header': levels[1:], 'rows': body}

    captions = xml_node.xpath('./TTITLE')
    if captions:
        table_data["caption"] = tree_utils.get_node_text(captions[0]).strip()

    return table_data
Exemplo n.º 2
0
def build_header(xml_nodes):
    """Assemble the table header tree from a sequence of header XML nodes.

    Every XML node becomes a TableHeaderNode added at the level given by its
    'H' attribute, after an empty root added at level 0. Each visited node
    gets its colspan set to its width, and the root is then passed through
    build_header_rowspans, whose result is returned."""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # empty root

    for ched in xml_nodes:
        depth = int(ched.attrib['H'])
        label = tree_utils.get_node_text(ched, add_spaces=True).strip()
        stack.add(depth, TableHeaderNode(label, depth))

    # Keep unwinding until the stack reports a size of one
    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]

    # Height is read before any spans are assigned
    tree_height = root.height()

    def assign_colspan(header_node):
        header_node.colspan = header_node.width()

    struct.walk(root, assign_colspan)

    return build_header_rowspans(root, tree_height)
Exemplo n.º 3
0
def create_xml_changes(amended_labels,
                       section,
                       notice_changes,
                       subpart_label=None):
    """For PUT/POST, match the amendments to the section nodes that got
    parsed, and actually create the notice changes.

    Every node of ``section`` first gets a ``child_labels`` attribute listing
    its children's label ids. The amendments matched to each label are then
    handled by action:

    * POST/PUT -- nodes from ``changes.create_field_amendment`` (when the
      amendment has a 'field' key) or ``changes.create_add_amendment`` are
      passed to ``notice_changes.update``. A POST whose label has exactly two
      dash-separated parts is tagged with ``subpart_label`` (when given)
      under the amendment's 'extras' key.
    * RESERVE -- the result of ``changes.create_reserve_amendment`` is passed
      to ``notice_changes.update``.
    * DELETE/MOVE -- ignored here.
    * anything else -- logged as not handled.
    """
    def per_node(node):
        node.child_labels = [c.label_id() for c in node.children]

    struct.walk(section, per_node)

    amend_map = changes.match_labels_and_changes(amended_labels, section)

    # .items() exists on Python 2 and 3 alike; .iteritems() is Python 2 only
    for label, amendments in amend_map.items():
        for amendment in amendments:
            action = amendment['action']
            if action in ('POST', 'PUT'):
                if (subpart_label and action == 'POST'
                        and len(label.split('-')) == 2):
                    amendment['extras'] = {'subpart': subpart_label}
                if 'field' in amendment:
                    nodes = changes.create_field_amendment(label, amendment)
                else:
                    nodes = changes.create_add_amendment(amendment)
                for n in nodes:
                    notice_changes.update(n)
            elif action == 'RESERVE':
                change = changes.create_reserve_amendment(amendment)
                notice_changes.update(change)
            elif action not in ('DELETE', 'MOVE'):
                # Let logging do the formatting, only if the record is emitted
                logging.info('NOT HANDLED: %s', action)
    def pre_process(self):
        """Preprocessing pass: walk the whole tree and record each visited
        node's label, as a tuple, in self.known_citations."""
        def remember_label(tree_node):
            self.known_citations.add(tuple(tree_node.label))

        walk(self.tree, remember_label)
Exemplo n.º 5
0
    def replace_using(self, tree):
        """Reset the known labels to an empty set, then refill it with the
        label id of every node visited while walking the given tree."""
        labels = self._known_labels = set()

        def remember(tree_node):
            labels.add(tree_node.label_id())
        struct.walk(tree, remember)
Exemplo n.º 6
0
    def _serialize(self, tag, obj):
        """Pickle obj to the file named by self._filename(tag).

        A struct.Node is deep-copied first and the copy is walked with
        _serialize_xml_fields, so the conversion is applied to the copy
        rather than to the caller's object."""
        to_store = obj
        if isinstance(to_store, struct.Node):
            to_store = copy.deepcopy(to_store)
            struct.walk(to_store, _serialize_xml_fields)

        destination = self._filename(tag)
        with open(destination, 'wb') as out_file:
            pickle.dump(to_store, out_file)
Exemplo n.º 7
0
def hash_nodes(reg_tree):
    """Build a dict mapping label id to node for every node visited while
    walking reg_tree."""
    by_label = {}

    def index_node(tree_node):
        by_label[tree_node.label_id()] = tree_node

    struct.walk(reg_tree, index_node)
    return by_label
    def pre_process(self):
        """Mark the nodes that are part of a model forms section.

        Only appendix nodes are considered. A model form has its label id
        appended to self.model_forms_sections and flagged in
        self.model_forms_nodes; a model form child is only flagged."""
        def mark(tree_node):
            if not self.is_appendix(tree_node):
                return

            if self.is_model_form(tree_node):
                self.model_forms_sections.append(tree_node.label_id())
                self.model_forms_nodes[tree_node.label_id()] = True
                return

            if self.is_model_form_child(tree_node):
                self.model_forms_nodes[tree_node.label_id()] = True

        struct.walk(self.tree, mark)
Exemplo n.º 9
0
    def _deserialize(self, tag):
        """Load the pickled object stored for tag, if there is one.

        Returns None when the file does not exist or cannot be unpickled.
        A loaded struct.Node is walked with _deserialize_xml_fields before
        being returned."""
        path = self._filename(tag)
        if not os.path.exists(path):
            return None

        with open(path, 'rb') as source:
            try:
                loaded = pickle.load(source)
            except Exception:   # unpickling failed; treat as nothing stored
                loaded = None

        if isinstance(loaded, struct.Node):
            struct.walk(loaded, _deserialize_xml_fields)
        return loaded
Exemplo n.º 10
0
def find_candidate(root, label_last, amended_labels):
    """
        Collect the nodes in the tree whose label ends with the paragraph
        marker we're looking for; one of them may be a mis-parsed node.
        Because notices only contain partial sections, paragraph markers
        can't always be disambiguated, so several nodes may match.

        With more than one match, the matches are narrowed by three checks,
        in order: mal-formed labels, labels that can't exist (that part of
        the reg isn't being amended), and nodes with no children. The first
        check that leaves exactly one node wins. Otherwise every match is
        returned. The result is always a list.
    """
    def ends_with_marker(node):
        """ Keep the node only if the last part of its label matches."""
        return node if node.label[-1] == label_last else None

    candidates = struct.walk(root, ends_with_marker)
    if len(candidates) <= 1:
        return candidates

    malformed = [n for n in candidates if bad_label(n)]
    impossible = [
        n for n in candidates if impossible_label(n, amended_labels)
    ]
    childless = [n for n in candidates if n.children == []]

    # A category holding a single option settles the question.
    for narrowed in (malformed, impossible, childless):
        if len(narrowed) == 1:
            return narrowed
    return candidates
Exemplo n.º 11
0
    def add_subparts(self):
        """Record in self.subpart_map which sections fall under which
        subpart, keyed by the subpart seen most recently during the walk
        (None outside a subpart or after an empty part)."""

        # Mutable holder so the nested function can update the value
        state = {'subpart': None}

        def track(node):
            kind = node.node_type
            if kind == struct.Node.SUBPART:
                state['subpart'] = node.label[2]
            elif kind == struct.Node.EMPTYPART:
                state['subpart'] = None

            is_section = (
                kind in (struct.Node.REGTEXT, struct.Node.APPENDIX)
                and len(node.label) == 2)
            if is_section:
                self.subpart_map[state['subpart']].append(node.label[-1])

        struct.walk(self.tree, track)
Exemplo n.º 12
0
    def pre_process(self):
        """Fill self.lookup_table so interpretations can be found by the
        label they interpret."""
        def index_interp(node):
            is_interp_root = (
                node.node_type == struct.Node.INTERP and
                node.label[-1] == struct.Node.INTERP_MARK)
            if not is_interp_root:
                return

            #   The interp's own label, minus the marker, always maps to it
            self.lookup_table[tuple(node.label[:-1])].append(node)

            #   Labels found in the title map to it as well
            title_labels = text_to_labels(node.title or '',
                                          Label.from_node(node),
                                          warn=False)
            for found in title_labels:
                key = tuple(found[:-1])   # Remove Interp marker
                bucket = self.lookup_table[key]
                if node not in bucket:
                    bucket.append(node)
        struct.walk(self.tree, index_interp)
Exemplo n.º 13
0
def changes_between(lhs, rhs):
    """Main entry point for this library. Recursively build the list of
    changes needed to go from lhs to rhs; both should be FrozenNodes. An
    empty list is returned when the two compare equal. Reordering of nodes
    is *not* detected, but limited moves are (e.g. when renaming
    subparts)."""
    if lhs == rhs:
        return []

    result = list(_local_changes(lhs, rhs))

    # Children only on the left were removed. Note params reversed
    dropped = _new_in_rhs(rhs.children, lhs.children)
    result.extend(_data_for_delete(child) for child in dropped)

    # Grandchildren under a removed child may have moved, not vanished
    maybe_moved = {}
    for child in dropped:
        for grandchild in child.children:
            maybe_moved[grandchild.label_id] = grandchild

    # Children only on the right are additions; check their children
    for added in _new_in_rhs(lhs.children, rhs.children):
        result.append(_data_for_add(added))
        for grandchild in added.children:
            key = grandchild.label_id
            if key in maybe_moved:  # it *was* moved
                previous = maybe_moved.pop(key)
                result.extend(changes_between(previous, grandchild))
            else:  # Not moved; add it and everything beneath it
                result.extend(struct.walk(grandchild, _data_for_add))

    # Whatever is left wasn't moved; it was *re*moved
    for leftover in maybe_moved.values():
        result.extend(struct.walk(leftover, _data_for_delete))

    # Recurse on children present on both sides. No reordering tracked
    for left in lhs.children:
        same_label = [right for right in rhs.children
                      if left.label_id == right.label_id]
        for right in same_label:
            result.extend(changes_between(left, right))
    return result
Exemplo n.º 14
0
    def pre_process(self):
        """Collect definitions from every node of the tree.

        Subparts are recorded first via add_subparts(). Each visited node is
        then added to a ParentStack. For REGTEXT, SUBPART and EMPTYPART nodes
        the included definitions from node_definitions() are stored in
        self.scoped_terms under every scope from determine_scope(), and the
        excluded ones under 'EXCLUDED'. Finally every defined term is
        written to self.layer['referenced'], keeping the entry whose
        position starts earliest."""
        self.add_subparts()
        stack = ParentStack()
        subpart_types = (struct.Node.SUBPART, struct.Node.EMPTYPART)
        defining_types = (struct.Node.REGTEXT,) + subpart_types

        def visit(node):
            depth = len(node.label)
            if depth > 1 and node.node_type == struct.Node.REGTEXT:
                level = depth + 1   # Add one for the subpart level
            elif node.node_type in subpart_types:
                level = 2           # Subparts all on the same level
            else:
                level = depth
            stack.add(level, node)

            if node.node_type not in defining_types:
                return

            included, excluded = self.node_definitions(node, stack)
            if included:
                for scope in self.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            self.scoped_terms['EXCLUDED'].extend(excluded)

        struct.walk(self.tree, visit)

        referenced = self.layer['referenced']
        for scope in self.scoped_terms:
            for ref in self.scoped_terms[scope]:
                key = ref.term + ":" + ref.label
                # Skip a known term unless this one is earlier in the
                # paragraph
                if (key in referenced and
                        not ref.position[0] < referenced[key]['position'][0]):
                    continue
                referenced[key] = {
                    'term': ref.term,
                    'reference': ref.label,
                    'position': ref.position
                }
Exemplo n.º 15
0
def create_xml_changes(amended_labels, section, notice_changes):
    """Turn the amendments matched against the parsed section nodes into
    notice changes. POST, PUT and INSERT create field or add amendments,
    RESERVE creates a reserve amendment, and any other action is logged as
    unknown."""

    def record_child_labels(node):
        node.child_labels = [child.label_id() for child in node.children]
    walk(section, record_child_labels)

    amend_map = changes.match_labels_and_changes(amended_labels, section)

    for label, amendments in amend_map.items():
        for amendment in amendments:
            action = amendment['action']

            if action == 'RESERVE':
                reserved = changes.create_reserve_amendment(amendment)
                notice_changes.add_changes(amendment['amdpar_xml'], reserved)
                continue

            if action not in ('POST', 'PUT', 'INSERT'):
                logger.warning("Unknown action: %s", action)
                continue

            if 'field' in amendment:
                created = changes.create_field_amendment(label, amendment)
            else:
                created = changes.create_add_amendment(amendment)
            for new_node in created:
                notice_changes.add_changes(amendment['amdpar_xml'], new_node)
    def test_walk(self):
        # Tree under test: root -> (left -> leaf), right
        root, left, right, leaf = [
            struct.Node(text) for text in ("1", "2", "3", "4")]

        root.children = [left, right]
        left.children = [leaf]

        visited = []

        def visit(node):
            visited.append(node)
            # The visitor yields nothing for `left`, so walk's result
            # should leave it out
            if not node == left:
                return node.text
        collected = struct.walk(root, visit)
        # Parent before children, children in order
        self.assertEqual([root, left, leaf, right], visited)
        self.assertEqual(["1", "4", "3"], collected)
Exemplo n.º 17
0
            # include the period
            next_char = node_text[first_p + 1:first_p + 2]
            if next_char in (')', u'”'):
                first_sentence = node_text[:first_p + 2]
            else:
                first_sentence = node_text[:first_p + 1]
        else:
            first_sentence = node_text

        # Key terms can't be the entire text of a leaf node
        if first_sentence == node_text and not node.children:
            return

        words = first_sentence.split()
        if (not words[-1] == part_end and not first_sentence.startswith('![')):
            num_words = len(words)

            # key terms are short
            if num_words <= 15:
                layer_element = {"key_term": first_sentence, "locations": [0]}
                layer[label_id] = [layer_element]


if __name__ == "__main__":
    # Script entry point: walk a stubbed regulation tree with
    # generate_keyterm, then print the resulting layer.
    # Use the plain text based JSON for the regulation.
    tree = api_stub.get_regulation_as_json(
        '/vagrant/data/stub-server/regulation/1005/2013-10604-eregs')
    struct.walk(tree, generate_keyterm)

    # Parenthesized so this parses on both Python 2 (print statement with a
    # single parenthesized expression) and Python 3 (print function).
    print(struct.NodeEncoder().encode(layer))
Exemplo n.º 18
0
 def compare(self):
     """ Run the comparison that builds the diff data structure: walk
     the older tree with deleted_and_modified as the visitor, then call
     added(). """
     older_tree = self.older
     struct.walk(older_tree, self.deleted_and_modified)
     self.added()
Exemplo n.º 19
0
 def add_subparts(self, root):
     """Document the relationship between sections and subparts.

     Resets the current-subpart state to None, then walks the tree under
     ``root`` with ``self._subpart_per_node`` as the visitor."""
     # Need a reference for maintaining state across the walk.
     # NOTE(review): a double leading underscore is name-mangled when the
     # method is defined in a class body; presumably _subpart_per_node is
     # defined in the same class and so sees the same attribute -- confirm.
     self.__current_subpart = None
     struct.walk(root, self._subpart_per_node)
Exemplo n.º 20
0
 def add_subparts(self, root):
     """Record how sections relate to subparts: reset the current subpart
     to None, then walk the tree under root with _subpart_per_node as the
     visitor."""
     self._current_subpart = None
     visitor = self._subpart_per_node
     struct.walk(root, visitor)