def table_xml_to_data(xml_node):
    """Convert a table's XML into a plain dictionary of header and row data.

    The native XML encodes too much logic, so a simpler structure is
    produced instead. It can be used to generate semi-complex tables which
    the markdown representation cannot express.

    The returned dict holds:
      * 'header': one list per header level (the root level is dropped);
        each cell is a dict with 'text', 'colspan' and 'rowspan'
      * 'rows': for every ./ROW, the stripped text of each ./ENT
      * 'caption': stripped text of the first ./TTITLE, only when one exists
    """
    header_tree = build_header(xml_node.xpath('./BOXHD/CHED'))
    levels = [[] for _ in range(header_tree.height())]

    def record_cell(cell):
        # Group the header cells by the level they sit on
        levels[cell.level].append({
            'text': cell.text,
            'colspan': cell.colspan,
            'rowspan': cell.rowspan,
        })

    struct.walk(header_tree, record_cell)

    body = [
        [tree_utils.get_node_text(entry, add_spaces=True).strip()
         for entry in table_row.xpath('./ENT')]
        for table_row in xml_node.xpath('./ROW')
    ]

    # Index 0 belongs to the root of the header tree; skip it
    result = {'header': levels[1:], 'rows': body}

    titles = xml_node.xpath('./TTITLE')
    if titles:
        result["caption"] = tree_utils.get_node_text(titles[0]).strip()
    return result
def build_header(xml_nodes):
    """Assemble a TableHeaderNode tree, hung off an empty root, from the
    given header XML nodes. Every node in the returned tree carries its
    colspan and rowspan."""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # Root

    for ched in xml_nodes:
        # The 'H' attribute gives the level the header cell is added at
        depth = int(ched.attrib['H'])
        label = tree_utils.get_node_text(ched, add_spaces=True).strip()
        stack.add(depth, TableHeaderNode(label, depth))

    # Collapse the stack until only the root entry is left
    while stack.size() > 1:
        stack.unwind()
    header_root = stack.m_stack[0][0][1]

    full_height = header_root.height()

    def assign_colspan(header_node):
        header_node.colspan = header_node.width()

    struct.walk(header_root, assign_colspan)

    return build_header_rowspans(header_root, full_height)
def create_xml_changes(amended_labels, section, notice_changes,
                       subpart_label=None):
    """For PUT/POST, match the amendments to the section nodes that got
    parsed, and actually create the notice changes.

    :param amended_labels: the amendments to match against `section`
    :param section: root node of the parsed section; every node under it
        gains a `child_labels` attribute listing its children's label ids
    :param notice_changes: collector; receives each created change through
        its `update` method
    :param subpart_label: when given, POST amendments whose label has
        exactly two dash-separated parts are tagged with this subpart in
        their 'extras'

    RESERVE amendments produce a reserve change. DELETE and MOVE are
    skipped silently; any other action is logged as not handled.
    """
    def per_node(node):
        node.child_labels = [c.label_id() for c in node.children]
    struct.walk(section, per_node)

    amend_map = changes.match_labels_and_changes(amended_labels, section)

    # .items() works on both Python 2 and 3, unlike the 2-only .iteritems()
    for label, amendments in amend_map.items():
        for amendment in amendments:
            action = amendment['action']
            if action in ('POST', 'PUT'):
                if (subpart_label and action == 'POST'
                        and len(label.split('-')) == 2):
                    amendment['extras'] = {'subpart': subpart_label}
                if 'field' in amendment:
                    nodes = changes.create_field_amendment(label, amendment)
                else:
                    nodes = changes.create_add_amendment(amendment)
                for n in nodes:
                    notice_changes.update(n)
            elif action == 'RESERVE':
                change = changes.create_reserve_amendment(amendment)
                notice_changes.update(change)
            elif action not in ('DELETE', 'MOVE'):
                # Pass the argument separately so logging formats it lazily
                logging.info('NOT HANDLED: %s', action)
def pre_process(self):
    """Preprocessing pass: walk the entire tree and record every node's
    label, as a tuple, in self.known_citations."""
    def remember_label(tree_node):
        self.known_citations.add(tuple(tree_node.label))

    walk(self.tree, remember_label)
def replace_using(self, tree):
    """Discard the currently known labels and rebuild them from the
    label ids found in the provided node tree."""
    self._known_labels = set()

    def remember(tree_node):
        self._known_labels.add(tree_node.label_id())

    struct.walk(tree, remember)
def _serialize(self, tag, obj):
    """Pickle `obj` to the file associated with `tag`, applying
    class-specific conversions first.

    Node trees are deep-copied before conversion so that the caller's
    object is left untouched."""
    payload = obj
    if isinstance(payload, struct.Node):
        payload = copy.deepcopy(payload)
        struct.walk(payload, _serialize_xml_fields)

    destination = self._filename(tag)
    with open(destination, 'wb') as handle:
        pickle.dump(payload, handle)
def hash_nodes(reg_tree):
    """Build a dictionary that maps each node's label id to the node
    itself, covering the whole regulation tree."""
    by_label = {}

    def index_node(tree_node):
        by_label[tree_node.label_id()] = tree_node

    struct.walk(reg_tree, index_node)
    return by_label
def pre_process(self):
    """Mark the nodes that are part of a model forms section.

    Model form nodes are listed in self.model_forms_sections and flagged
    in self.model_forms_nodes; their children are only flagged."""
    def mark(tree_node):
        # Only appendix nodes can belong to a model forms section
        if not self.is_appendix(tree_node):
            return

        if self.is_model_form(tree_node):
            self.model_forms_sections.append(tree_node.label_id())
            self.model_forms_nodes[tree_node.label_id()] = True
        elif self.is_model_form_child(tree_node):
            self.model_forms_nodes[tree_node.label_id()] = True

    struct.walk(self.tree, mark)
def _deserialize(self, tag): """Attempts to read the object from disk. Performs class-specific conversions when deserializing""" name = self._filename(tag) if os.path.exists(name): with open(name, 'rb') as to_read: try: obj = pickle.load(to_read) except Exception: # something bad happened during unpickling obj = None if isinstance(obj, struct.Node): struct.walk(obj, _deserialize_xml_fields) return obj
def find_candidate(root, label_last, amended_labels):
    """Search the tree for nodes whose paragraph marker (the last label
    component) equals `label_last`; such a node might be mis-parsed.

    Notices only contain partial sections, so paragraph markers can't
    always be disambiguated. When several nodes match, three shortlists
    are tried in order -- malformed labels, labels that cannot exist given
    `amended_labels`, and nodes without children -- and the first shortlist
    holding exactly one node is returned. Otherwise every match is
    returned. The result is always a list."""
    def matches_marker(node):
        """Yield the node when its last label part is the one sought."""
        return node if node.label[-1] == label_last else None

    matches = struct.walk(root, matches_marker)
    if len(matches) <= 1:
        return matches

    malformed = [n for n in matches if bad_label(n)]
    unreachable = [n for n in matches if impossible_label(n, amended_labels)]
    childless = [n for n in matches if n.children == []]

    # A shortlist with a single entry settles the ambiguity
    for shortlist in (malformed, unreachable, childless):
        if len(shortlist) == 1:
            return shortlist
    return matches
def add_subparts(self):
    """Record which sections belong to which subpart.

    While walking self.tree, the label of the most recent subpart is
    tracked (None after an empty part) and the last label component of
    every two-part regtext/appendix node is appended to
    self.subpart_map under that subpart."""
    # Mutable holder so the nested function can update the value
    state = {'subpart': None}

    def track(node):
        kind = node.node_type
        if kind == struct.Node.SUBPART:
            state['subpart'] = node.label[2]
        elif kind == struct.Node.EMPTYPART:
            state['subpart'] = None

        is_section = (kind in (struct.Node.REGTEXT, struct.Node.APPENDIX)
                      and len(node.label) == 2)
        if is_section:
            self.subpart_map[state['subpart']].append(node.label[-1])

    struct.walk(self.tree, track)
def pre_process(self):
    """Populate self.lookup_table with the interpretations that apply to
    each label."""
    def register(node):
        is_interp_root = (node.node_type == struct.Node.INTERP
                          and node.label[-1] == struct.Node.INTERP_MARK)
        if not is_interp_root:
            return

        # The interp's own label (minus the marker) always gets an entry
        self.lookup_table[tuple(node.label[:-1])].append(node)

        # Labels mentioned in the title get an entry as well
        title_labels = text_to_labels(node.title or '',
                                      Label.from_node(node), warn=False)
        for title_label in title_labels:
            key = tuple(title_label[:-1])   # Remove Interp marker
            interps = self.lookup_table[key]
            if node not in interps:
                interps.append(node)

    struct.walk(self.tree, register)
def changes_between(lhs, rhs):
    """Main entry point for this library. Recursively return a list of
    changes between the lhs and rhs. lhs and rhs should be FrozenNodes.

    Note that this *does not* account for reordering nodes, though it does
    account for limited moves (e.g. when renaming subparts).

    :param lhs: the "before" node
    :param rhs: the "after" node
    :returns: list of change records; empty when lhs == rhs
    """
    changes = []
    if lhs == rhs:
        return changes
    changes.extend(_local_changes(lhs, rhs))

    # Removed children. Note params reversed
    removed_children = _new_in_rhs(rhs.children, lhs.children)
    changes.extend(map(_data_for_delete, removed_children))
    # grandchildren which appear to be deleted, but may just have been moved
    possibly_moved = {}
    for child in removed_children:
        for grandchild in child.children:
            possibly_moved[grandchild.label_id] = grandchild

    # New children. Determine if they are added or moved
    for added in _new_in_rhs(lhs.children, rhs.children):
        changes.append(_data_for_add(added))
        for grandchild in added.children:
            if grandchild.label_id in possibly_moved:    # it *was* moved
                previous = possibly_moved.pop(grandchild.label_id)
                changes.extend(changes_between(previous, grandchild))
            else:   # Not moved; recursively add all of its children
                changes.extend(struct.walk(grandchild, _data_for_add))

    # Remaining nodes weren't moved; they were *re*moved
    for removed in possibly_moved.values():
        changes.extend(struct.walk(removed, _data_for_delete))

    # Recurse on modified children. Again, this does *not* track reordering.
    # Index the rhs children by label once instead of rescanning them for
    # every lhs child; lists keep every match, in rhs order.
    rhs_by_label = {}
    for rhs_child in rhs.children:
        rhs_by_label.setdefault(rhs_child.label_id, []).append(rhs_child)
    for lhs_child in lhs.children:
        for rhs_child in rhs_by_label.get(lhs_child.label_id, ()):
            changes.extend(changes_between(lhs_child, rhs_child))
    return changes
def pre_process(self):
    """Step through every node in the tree, finding definitions. Add
    these definitions to self.scoped_terms. Also keep track of which
    subpart we are in. Finally, document all defined terms.

    Side effects: calls self.add_subparts(), extends the lists held in
    self.scoped_terms (including the 'EXCLUDED' entry) and fills
    self.layer['referenced'] with one entry per "term:label" key.
    """
    self.add_subparts()
    stack = ParentStack()

    def per_node(node):
        # Push the node onto the parent stack at a depth derived from its
        # label length and node type
        if len(node.label) > 1 and node.node_type == struct.Node.REGTEXT:
            # Add one for the subpart level
            stack.add(len(node.label) + 1, node)
        elif node.node_type in (struct.Node.SUBPART, struct.Node.EMPTYPART):
            # Subparts all on the same level
            stack.add(2, node)
        else:
            stack.add(len(node.label), node)

        # Only these node types are searched for definitions
        if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                              struct.Node.EMPTYPART):
            included, excluded = self.node_definitions(node, stack)
            if included:
                # The included terms apply to every scope determined from
                # the current stack
                for scope in self.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source -- confirm exclusions are recorded even when nothing
            # was included
            self.scoped_terms['EXCLUDED'].extend(excluded)

    struct.walk(self.tree, per_node)

    referenced = self.layer['referenced']
    for scope in self.scoped_terms:
        for ref in self.scoped_terms[scope]:
            key = ref.term + ":" + ref.label
            if (key not in referenced     # New term
                    # Or this term is earlier in the paragraph
                    or ref.position[0] < referenced[key]['position'][0]):
                referenced[key] = {
                    'term': ref.term,
                    'reference': ref.label,
                    'position': ref.position
                }
def create_xml_changes(amended_labels, section, notice_changes):
    """Pair the PUT/POST/INSERT amendments with the parsed section nodes
    and register the resulting changes on `notice_changes`.

    Every node under `section` first gains a `child_labels` attribute with
    its children's label ids. RESERVE amendments register a reserve
    change; any other action is logged as unknown."""
    def attach_child_labels(tree_node):
        tree_node.child_labels = [kid.label_id()
                                  for kid in tree_node.children]

    walk(section, attach_child_labels)

    matched = changes.match_labels_and_changes(amended_labels, section)

    for label, amendment_list in matched.items():
        for amendment in amendment_list:
            action = amendment['action']

            if action == 'RESERVE':
                reserved = changes.create_reserve_amendment(amendment)
                notice_changes.add_changes(amendment['amdpar_xml'], reserved)
                continue

            if action not in ('POST', 'PUT', 'INSERT'):
                logger.warning("Unknown action: %s", amendment['action'])
                continue

            if 'field' in amendment:
                created = changes.create_field_amendment(label, amendment)
            else:
                created = changes.create_add_amendment(amendment)
            for change in created:
                notice_changes.add_changes(amendment['amdpar_xml'], change)
def test_walk(self):
    """walk visits a parent before its children, and its return value
    leaves out the nodes for which the callback returned None."""
    n1, n2, n3, n4 = [struct.Node(text) for text in ("1", "2", "3", "4")]
    n1.children = [n2, n3]
    n2.children = [n4]

    visited = []

    def visit(node):
        visited.append(node)
        # n2 yields nothing, so it must be absent from the results
        if node == n2:
            return None
        return node.text

    returned = struct.walk(n1, visit)

    self.assertEqual([n1, n2, n4, n3], visited)
    self.assertEqual(["1", "4", "3"], returned)
# include the period next_char = node_text[first_p + 1:first_p + 2] if next_char in (')', u'”'): first_sentence = node_text[:first_p + 2] else: first_sentence = node_text[:first_p + 1] else: first_sentence = node_text # Key terms can't be the entire text of a leaf node if first_sentence == node_text and not node.children: return words = first_sentence.split() if (not words[-1] == part_end and not first_sentence.startswith('![')): num_words = len(words) # key terms are short if num_words <= 15: layer_element = {"key_term": first_sentence, "locations": [0]} layer[label_id] = [layer_element] if __name__ == "__main__": # Use the plain text based JSON for the regulation. tree = api_stub.get_regulation_as_json( '/vagrant/data/stub-server/regulation/1005/2013-10604-eregs') struct.walk(tree, generate_keyterm) print struct.NodeEncoder().encode(layer)
def compare(self):
    """Run the comparison and build the data structure that represents
    the diff.

    The older tree is walked with self.deleted_and_modified as the
    callback; self.added() is invoked afterwards."""
    older_tree = self.older
    struct.walk(older_tree, self.deleted_and_modified)
    self.added()
def add_subparts(self, root): """Document the relationship between sections and subparts""" # Need a reference for maintaining state self.__current_subpart = None struct.walk(root, self._subpart_per_node)
def add_subparts(self, root):
    """Record the relationship between sections and subparts.

    Clears self._current_subpart, then walks the tree rooted at `root`
    with self._subpart_per_node as the callback."""
    self._current_subpart = None
    callback = self._subpart_per_node
    struct.walk(root, callback)