def transform_char(self, element: Element, /):
    """Swap ``element`` for a ``<span class="character">`` and parse its text.

    A new ``span`` element is created via ``element.makeelement``, receives
    the tail of ``element`` and takes its place in the parent.  The text of
    the original element is then matched against the placeholder pattern
    below; when the match contains a character name, the name is resolved
    with ``unicodedata2.lookup`` and its code point is collected.

    :param element: The element to replace; it must have a parent.
    :raises KeyError: If the text names a character that
        ``unicodedata2.lookup`` does not know.
    """
    # The pattern is compiled with re.VERBOSE: every ``#`` comment runs to the
    # end of its line, so the line breaks inside the literal are required.
    # Written on one line, the first comment would swallow the rest of the
    # pattern and compilation would fail on the unbalanced parenthesis.
    char_placeholder_pattern = re.compile(
        r"""
        (
            (?P<u_plus> U\+ )?  # optional U+ prefix
            (
                (?P<code_point_placeholder> X{4} )  # XXXX as code point placeholder
                |
                (?P<code_point> 1?[0-9A-F]?[0-9A-F]{4} )  # or actual code point
            )
        )?
        (
            \s
            (?P<glyph_placeholder> \[X\] )  # [X] as glyph placeholder
        )?
        (
            \s
            (?P<name> [A-Z0-9 -]+ )  # actual name
        )?
        """,
        flags=re.VERBOSE,
    )
    transformed: Element = element.makeelement("span", {"class": "character"}, None)
    transformed.tail = element.tail  # type: ignore
    # An element without text reports ``None``; treat that as empty text so
    # that ``fullmatch`` does not raise a TypeError.
    text: str = element.text or ""
    element.getparent().replace(element, transformed)
    if match := char_placeholder_pattern.fullmatch(text):
        # NOTE(review): ``cps`` is filled but not read again in this block.
        cps = set[int]()
        if name := match.group("name"):
            cps.add(ord(unicodedata2.lookup(name)))
def remove_element(element: etree._Element, keep_children=False) -> None:
    """
    Detaches ``element`` from the tree it is part of.

    When ``keep_children`` is ``True`` each child is first re-attached as a
    preceding sibling of ``element`` and therefore stays in the tree;
    otherwise the children are removed together with the element.
    """
    if keep_children:
        for kid in element:
            element.addprevious(kid)
    owner = element.getparent()
    owner.remove(element)
def _clear_context(elem: etree._Element) -> None:
    """Clear ``elem`` and drop the siblings that precede it.

    ``elem`` itself is emptied with ``clear()``.  Afterwards the first child
    of its parent is deleted repeatedly until ``elem`` has no previous
    sibling left.

    An element without a parent (the root) is only cleared: it may still
    report previous siblings (top-level comments or processing
    instructions), but there is no parent to delete them from.
    """
    elem.clear()
    parent = elem.getparent()
    if parent is None:
        # Root element: indexing into a missing parent would raise TypeError.
        return None
    while elem.getprevious() is not None:
        del parent[0]
    return None
def get_item_details(section_number: str, department_name: str, node: et._Element) -> Dict:
    """Extract a BOE diary entry's details.

    The title and the PDF/XML/HTM URL nodes are looked up below ``node``
    (first hit of each search is used).  The epigraph name is read from the
    parent of ``node`` when that parent's tag is ``epigrafe`` (compared
    case-insensitively); otherwise it is the empty string.

    Returns a dict with the keys ``id``, ``epigraph``, ``section``,
    ``department``, ``title``, ``pdf_url``, ``xml_url`` and ``htm_url``.
    """
    find = helpers.use_tree_for_search(node)
    title = find(boe.SummaryXpath.item_title)[0]
    pdf_url = find(boe.SummaryXpath.item_pdf_url)[0]
    xml_url = find(boe.SummaryXpath.item_xml_url)[0]
    htm_url = find(boe.SummaryXpath.item_htm_url)[0]

    container = node.getparent()
    if container.tag.lower() == 'epigrafe':
        epigraph = container.get(boe.SummaryAttribute.epigraph_name)
    else:
        epigraph = ''

    return {
        'id': node.get(boe.SummaryAttribute.item_id),
        'epigraph': epigraph,
        'section': section_number,
        'department': department_name,
        'title': title.text,
        'pdf_url': pdf_url.text,
        'xml_url': xml_url.text,
        'htm_url': htm_url.text,
    }
def _handle_text(cls, node: etree._Element, do_handle_tail_instead=False):
    """Turn the text (or the tail) of ``node`` into word nodes.

    Nothing happens when the chosen string is missing or whitespace-only.
    Otherwise the string is emptied on ``node``, converted with
    ``cls._str_2_word_nodes`` and the resulting nodes are inserted:

    * text: as the first children of ``node`` (the text is stripped first);
    * tail: into the parent of ``node``, directly after ``node`` (the tail
      is passed on unstripped).
    """
    if do_handle_tail_instead:
        tail = node.tail
        if not tail or not tail.strip():
            return
        text = tail
        node.tail = ''
        target = node.getparent()
        offset = target.index(node) + 1
    else:
        content = node.text
        if not content or not content.strip():
            return
        text = content.strip()
        node.text = ''
        target = node
        offset = 0

    words = cls._str_2_word_nodes(text)

    # Every word node records the class of the node it is inserted into;
    # that's used later in postproc.
    inherited_class = target.attrib.get('class', '')
    attrib_name = cls.PARENT_CLASS_ATTRIB_NAME
    for word in words:
        word.attrib[attrib_name] = inherited_class

    # Insert in order, starting at the computed offset, so the words keep
    # their sequence inside the target.
    for position, word in enumerate(words, start=offset):
        target.insert(position, word)
def get_parent(xml_obj: _Element):
    """
    Looks up the element that directly contains the given xml object

    Args:
        xml_obj (Element): The xml element whose parent is wanted

    Returns:
        The result of ``xml_obj.getparent()``
    """
    parent = xml_obj.getparent()
    return parent
def remove_one_element(element: _Element) -> None:
    """
    Detach the specified element from its parent; do nothing if it has none.

    element -- element to be removed
    """
    owner = element.getparent()
    if owner is None:
        return
    owner.remove(element)
def remove_preserving_whitespace(element: Element) -> None:
    """Remove ``element`` from its parent while keeping its tail text.

    A non-empty tail is appended to the tail of the previous sibling or,
    when ``element`` has no previous sibling, to the text of the parent.
    """
    owner = element.getparent()
    trailing = element.tail
    if trailing:
        sibling = element.getprevious()
        if sibling is None:
            owner.text = (owner.text or "") + trailing
        else:
            sibling.tail = (sibling.tail or "") + trailing
    owner.remove(element)
def transform_element(self, element: Element, /):
    """Dispatch the transformation of ``element`` by its tag.

    * ``char`` elements are handed to ``self.transform_char``.
    * ``h1``, ``figcaption`` and ``a`` elements that carry a ``numbering``
      attribute get their placeholder expanded; an ``a`` element is
      additionally wrapped in a new ``cite`` element that takes over its
      tail and its position in the parent.

    Any other element is left untouched.
    """
    tag = element.tag
    if tag == "char":
        self.transform_char(element)
    elif tag in ("h1", "figcaption", "a") and "numbering" in element.keys():
        self.expand_placeholder_in_element(element, "numbering")
        if tag == "a":
            cite = element.makeelement("cite", {}, None)
            # The tail moves to the wrapper so that it stays behind the
            # wrapped element in document order.
            cite.tail = element.tail  # type: ignore
            element.tail = None  # type: ignore
            element.getparent().replace(element, cite)
            cite.append(element)
def get_path_to_root(e: etree._Element, preserve_ns: bool = False) -> List[AnyStr]:
    """Return the tags of all ancestors of ``e``, outermost ancestor first.

    ``e`` itself is not part of the result.  Each tag is stripped of
    surrounding whitespace; unless ``preserve_ns`` is true, a ``{...}``
    part of the tag is removed as well.
    """
    tags = []
    node = e.getparent()
    while node is not None:
        tag = node.tag.strip()
        if not preserve_ns:
            tag = re.sub(r'\{.*\}(.*)', r'\1', tag)
        tags.append(tag)
        node = node.getparent()
    # Collected from the nearest ancestor outwards; flip to start at the top.
    tags.reverse()
    return tags
def get_parent_resource(resource_el: _Element) -> Optional[_Element]:
    """
    Return the parent element of a specified resource if that parent is a
    wrapper resource, None otherwise (including when there is no parent).

    Example: for a resource in group which is in clone, this function will
    return group element.

    resource_el -- resource element of which parent resource should be returned
    """
    candidate = resource_el.getparent()
    if candidate is None or not is_wrapper_resource(candidate):
        return None
    return candidate
def fix_tail(self, item: etree._Element) -> None:
    """Fix self-closing elements.

    Designed only to work with self closing elements after item has just
    been inserted/appended

    Reassigns text around ``item`` depending on its index among the children
    of its parent:

    * index 0: ``item.tail`` is set to ``parent.text``;
    * otherwise: ``item.tail`` is set to the tail of the previous sibling,
      and if ``item`` is the last child, the previous sibling's tail is then
      set to ``parent.text``.

    ``item`` must have a parent; ``parent.text`` itself is never modified.
    """
    parent = item.getparent()
    idx = parent.index(item)
    if idx == 0:
        # item is the first child element, move the text to after item
        # NOTE(review): parent.text is copied, not cleared, so the text is
        # now referenced both before and after item - confirm this is wanted.
        item.tail = parent.text
    else:
        # There are other elements, possibly also text, before this child
        # element.
        # Move this element's tail to the previous element (note: .text is
        # only the text after the last child element, text before that and
        # surrounding elements are attributes of the elements)
        # NOTE(review): the statement below assigns the previous sibling's
        # tail to item, the opposite direction of what the comment above
        # says. In lxml, .text is the text before the first child, not after
        # the last one - verify the assumptions of this method.
        item.tail = parent[idx - 1].tail
        # If this is the last child element, it gets the remaining text.
        # NOTE(review): what is overwritten here is the previous sibling's
        # tail (with parent.text), not anything on item itself.
        if idx == len(parent) - 1:
            parent[idx - 1].tail = parent.text
def remove_node(self, node: ET._Element, hold_tail=False):
    """
    Remove the given node from its parent (no-op if it has no parent)

    @param {ET._Element} node - the node to remove
    @param {bool} hold_tail=False - whether to keep the node's tail text; if
        so, the tail is appended to the previous sibling's tail, or to the
        parent's text when there is no previous sibling
    """
    owner = node.getparent()
    if owner is None:
        return

    if hold_tail and node.tail is not None:
        # Hand the tail over before the node goes away
        kept = node.tail
        sibling = node.getprevious()
        if sibling is None:
            owner.text = (owner.text or '') + kept
        else:
            sibling.tail = (sibling.tail or '') + kept

    owner.remove(node)
def get_root(element: Element) -> Element:
    """Return the topmost ancestor of ``element``.

    Follows ``getparent()`` upwards until an element without a parent is
    reached; ``element`` itself is returned when it has no parent.
    """
    current = element
    while True:
        above = current.getparent()
        if above is None:
            return current
        current = above
def cleanup(elem: etree._Element):
    """Clear ``elem`` and delete the siblings that precede it.

    ``elem`` is emptied with ``clear()``; then the first child of its parent
    is deleted until ``elem`` has no previous sibling any more.

    For an element without a parent (the root) only the ``clear()`` step is
    performed: such an element can still report previous siblings (top-level
    comments or processing instructions), but there is no parent to delete
    them from.
    """
    elem.clear()
    parent = elem.getparent()
    if parent is None:
        # Root element: ``del None[0]`` would raise a TypeError.
        return
    while elem.getprevious() is not None:
        del parent[0]  # clean up preceding siblings
def _process_elem(self, parent_state: PTState, t_elem: etree._Element) -> None:
    """Process one template element and, recursively, its children.

    Comment nodes are ignored.  For any other element a ``PTState`` is
    derived from ``parent_state`` and the steps below run in order; several
    of them end the call early:

    1. If the state has secondary sources, the element subtree is duplicated
       once per source and each copy is processed with a single-source state.
    2. If the state requests a fetch, matching elements are looked up in the
       primary source.  More than one match is delegated to
       ``_process_multi_branch``; no match either raises ``PTFetchError``
       (when required) or removes the element; exactly one match copies
       attributes and text onto a childless element.
    3. Without a fetch, an integer ``multi`` greater than 1 is delegated to
       ``_process_multi_branch``.
    4. Remaining elements get their deferred requirement registered, their
       children processed or their fill handled or deferred, and finally
       their pt attributes removed via ``state.remove_elem_pt_attrs()``.

    :param parent_state: State of the enclosing element.
    :param t_elem: The element to process; it is modified in place and may
        be removed from or re-inserted into its parent.
    :raises PTFetchError: On a source element count that contradicts
        ``multi``, or when a required element is missing in the source.
    """
    if isinstance(t_elem, etree._Comment):
        return
    self._ext.set_elem_context(t_elem)
    qname = etree.QName(t_elem.tag)
    state = PTState(parent_state, t_elem)
    if state["reorder"]:
        self._reorder.append(state)
    # duplicate subtree for each source
    if len(state["sources"].secondary):
        # prevent triggering this processing branch on sibling passes
        del t_elem.attrib[self._pt_clark("sources")]
        # We temporarily detach the t_elem subtree and insert each elem subtree at
        # the original location of t_elem before populating, which ensures that
        # resolved paths are always in the form /path/to/elem[1]/child, which will
        # match corresponding source elements (e.g. /path/to/elem/child) in the
        # multi source fetch scenario. Caveat: downstream deferred pt:fill or
        # pt:required will be evaluated in the context of their element's final
        # path (e.g. /path/to/elem[3]/child).
        #
        # Inserting and populating the subtrees in reverse order ensures that their
        # final document order for multi source fetches is aligned with the order of
        # the source_map sources.
        parent = t_elem.getparent()
        idx = parent.index(t_elem)
        parent.remove(t_elem)
        for source in reversed(
            (state["sources"].primary, *state["sources"].secondary)
        ):
            # the original subtree is reused for the primary source, every
            # secondary source gets its own deep copy
            elem = (
                t_elem if source is state["sources"].primary else deepcopy(t_elem)
            )
            # narrow the state to this one source before recursing; the same
            # state object is passed on as the parent state of the copy
            state["sources"] = SourceGroup(source)
            parent.insert(idx, elem)
            self._process_elem(state, elem)
        return
    if state["fetch"]:
        path = self.label.getelementpath(t_elem)
        s_elems = state["sources"].primary.findall(path)
        if len(s_elems) > 1:
            # multi is either True (any count accepted) or the expected count
            if state["multi"] is not True and len(s_elems) != state["multi"]:
                raise PTFetchError(
                    f"{len(s_elems)} source elements found but pt:multi is set to"
                    f" expect {int(state['multi'])}",
                    t_elem,
                )  # cast False to 0 for readability
            self._process_multi_branch(t_elem, parent_state, len(s_elems) - 1)
            return
        elif not len(s_elems):
            if state["required"]:
                url = state["sources"].primary.docinfo.URL
                source_file = (
                    Path(url).name if url is not None else "<unresolved filename>"
                )
                raise PTFetchError(
                    f"{qname.localname} could not be located at path {path} in"
                    f" source {state.exp['sources']} from {source_file}",
                    # FIXME: .exp is None in descendants where source is inherited...
                    t_elem,
                )
            # not required and absent from the source: drop the element
            t_elem.getparent().remove(t_elem)
            return
        elif not len(t_elem):  # len(s_elems) == 1:
            # childless element: take over attributes and text of the single
            # source element
            t_elem.attrib.update(s_elems[0].attrib)
            t_elem.text = s_elems[0].text
    else:
        if isinstance(state["multi"], int) and state["multi"] > 1:
            self._process_multi_branch(t_elem, parent_state, state["multi"] - 1)
            return
    # non-fetch required condition; should be evaluated at export
    if state.exp["required"] is not None:
        self._deferred_reqs.append(state)
    if len(t_elem):
        for child_elem in t_elem.getchildren():
            self._process_elem(state, child_elem)
    elif state.exp["fill"]:
        if state["defer"]:
            self._deferred_fills.append(state)
        else:
            self._handle_fill(state.t_elem, state.eval_deferred("fill"))
    state.remove_elem_pt_attrs()
def getparent(dom: etree._Element) -> etree._Element:
    """Return ``dom.getparent()``, typed as an element for static checkers.

    ``cast`` performs no runtime check, so whatever ``getparent()`` yields
    is passed through unchanged.
    """
    parent = dom.getparent()
    return cast(etree._Element, parent)