def build_hierarchy(self, root, nodes, depths):
    """Assemble a node tree around ``root`` from a flat run of nodes.

    ``nodes`` and ``depths`` are consumed pairwise. ``root`` is placed on
    a fresh node stack at level 0; every other node has its label
    de-emphasized, is handed to ``replace_markerless`` and
    ``carry_label_to_children``, and is then pushed one level below its
    derived depth -- unless its depth entry is of the "stars" type, in
    which case it is not added to the stack. The collapsed stack is
    returned."""
    node_stack = tree_utils.NodeStack()
    node_stack.add(0, root)

    for child, assignment in zip(nodes, depths):
        child.label = list(map(mtypes.deemphasize, child.label))
        # Level 0 is taken by the root, so derived depths shift down by one
        level = assignment.depth + 1
        self.replace_markerless(node_stack, child, level)
        self.carry_label_to_children(child)
        if assignment.typ != mtypes.stars:
            node_stack.add(level, child)

    return node_stack.collapse()
def split_by_markers(xml):
    """Break an xml node into its subparagraphs, returning a list of
        (marker, plain-text following, text-with-tags following)
    triplets, one per subparagraph found"""
    plain = tree_utils.get_node_text(xml, add_spaces=True).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers = get_markers(tagged, next_marker(xml))

    # Markers are matched in their parenthesized form, e.g. "(a)"
    parenthesize = '({})'.format
    plain_pieces = tree_utils.split_text(
        plain, [parenthesize(mtypes.deemphasize(marker))
                for marker in markers])
    tagged_pieces = tree_utils.split_text(
        tagged, [parenthesize(marker) for marker in markers])

    if len(plain_pieces) > len(markers):
        # One more chunk of text than markers: due to initial MARKERLESS
        markers.insert(0, mtypes.MARKERLESS)
    return list(zip(markers, plain_pieces, tagged_pieces))
def process(self, xml, root):
    """Parse the children of ``xml`` and hang them off ``root``.

    The nodes produced by ``parse_nodes`` are first split into an optional
    intro node and the remaining nodes. The intro's text (and tagged text,
    when either node carries any) is folded into ``root``. If no nodes
    remain, ``root`` is returned as is. Otherwise paragraph depths are
    derived from each node's first label component -- retrying once with
    de-emphasized markers and the relaxed constraints when the first
    attempt yields nothing, and logging an error if that fails too -- and
    the result of ``build_hierarchy`` for the selected depths is
    returned."""
    intro_node, nodes = self.separate_intro(self.parse_nodes(xml))

    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        tagged_parts = [
            tagged
            for tagged in (getattr(root, 'tagged_text', None),
                           getattr(intro_node, 'tagged_text', None))
            if tagged
        ]
        if tagged_parts:
            root.tagged_text = ' '.join(tagged_parts)

    if not nodes:
        return root

    markers = [node.label[0] for node in nodes]
    constraints = self.additional_constraints()
    depths = derive_depths(markers, constraints)

    if not depths:
        logging.warning(
            "Could not derive paragraph depths. "
            "Retrying with relaxed constraints.")
        plain_markers = [deemphasize(marker) for marker in markers]
        constraints = self.relaxed_constraints()
        depths = derive_depths(plain_markers, constraints)

        if not depths:
            # Report the last workable prefix and the marker that broke it
            fails_at = debug_idx(markers, constraints)
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                xml.tag,
                root.label_id(),
                derive_depths(markers[:fails_at],
                              constraints)[0].pretty_str(),
                markers[fails_at],
                markers[fails_at + 1:])

    chosen = self.select_depth(depths)
    return self.build_hierarchy(root, nodes, chosen)