def preprocess4list(element: Element):
    """
    Preprocess an element in place before list extraction.

    Steps, in order:
      1. drop every tag named in CONTENT_USELESS_TAGS together with its content
      2. unwrap every tag named in CONTENT_STRIP_TAGS (the tag pair is removed,
         its text and children stay in the tree)
      3. call remove_children with CONTENT_NOISE_XPATHS
      4. walk the nodes yielded by children(element):
         - for p nodes, unwrap nested span / strong tags, then remove the p
           node if its own text is empty or whitespace-only
         - for div nodes without any sub node, retag them as p

    :param element: root element to clean up; it is modified in place
    :return: None
    """
    # remove tag and its content
    etree.strip_elements(element, *CONTENT_USELESS_TAGS)
    # only remove the tag pair, keep what is inside
    etree.strip_tags(element, *CONTENT_STRIP_TAGS)

    # presumably removes the nodes matched by the noise XPaths; see remove_children
    remove_children(element, CONTENT_NOISE_XPATHS)

    # NOTE(review): nodes are unwrapped / removed while children(element) is
    # being consumed; confirm the helper's iteration tolerates this.
    for child in children(element):
        tag = child.tag.lower()

        if tag == 'p':
            # merge text in span or strong to parent p tag
            etree.strip_tags(child, 'span', 'strong')

            # a p node whose own text is missing or blank is dropped
            if not (child.text and child.text.strip()):
                remove_element(child)

        # if a div tag does not contain any sub node, it could be converted to p node.
        # len() counts sub elements and replaces the deprecated getchildren().
        elif tag == 'div' and len(child) == 0:
            child.tag = 'p'
def children(self):
    """
    Return the children of this element, computing them only once.

    The result is stored on ``self._children``; later calls return that
    stored list as long as it is not None.

    :return: list built from the ``children`` helper of
             ``gerapy_auto_extractor.utils.element``
    """
    if self._children is None:
        # imported at call time rather than at module level
        # (presumably to avoid a circular import -- TODO confirm)
        from gerapy_auto_extractor.utils.element import children as collect_children
        self._children = list(collect_children(self))
    return self._children