def number_of_p_children(element: Element):
    """
    Count the ``p`` elements that are direct children of ``element``.

    :param element: node to inspect; may be None
    :return: number of matches for the XPath ``./p``, or 0 when
        ``element`` is None
    """
    return 0 if element is None else len(element.xpath('./p'))
def number_of_a_descendants(element: Element):
    """
    Count the ``a`` elements found anywhere below ``element``.

    :param element: node to inspect; may be None
    :return: number of matches for the XPath ``.//a``, or 0 when
        ``element`` is None
    """
    if element is None:
        return 0
    anchors = element.xpath('.//a')
    return len(anchors)
def number_of_p_descendants(element: Element):
    """
    Count the ``p`` elements found anywhere below ``element``.

    :param element: node to inspect; may be None
    :return: number of matches for the XPath ``.//p``, or 0 when
        ``element`` is None
    """
    if element is not None:
        return len(element.xpath('.//p'))
    return 0
def siblings(element: Element, including=False):
    """
    Lazily iterate over the siblings of ``element``.

    Preceding siblings come first, in whatever order
    ``itersiblings(preceding=True)`` produces them, followed by the
    following siblings. Every sibling that is an ``HtmlElement`` is
    re-classed to ``Element`` in place before it is yielded.

    :param element: node whose siblings are wanted; may be None
    :param including: when true, ``element`` itself is yielded first
    :return: generator of nodes; yields nothing when ``element`` is None
    """
    if element is None:
        # Inside a generator this simply ends the iteration.
        return []
    if including:
        yield element
    for look_backwards in (True, False):
        for node in element.itersiblings(preceding=look_backwards):
            if isinstance(node, HtmlElement):
                node.__class__ = Element
            yield node
def number_of_a_char(element: Element):
    """
    Count the non-whitespace characters that sit inside ``a`` tags.

    For example ``<a href="#">hello</a>world`` gives 5, because only
    ``hello`` is linked text.

    :param element: node to inspect; may be None
    :return: length of the linked text after all whitespace is removed,
        or 0 when ``element`` is None
    """
    if element is None:
        return 0
    linked_parts = element.xpath('.//a//text()')
    compact = re.sub(r'\s*', '', ''.join(linked_parts), flags=re.S)
    return len(compact)
def remove_element(element: Element):
    """
    Detach ``element`` from its parent node.

    Nothing happens when ``element`` is None or when ``getparent()``
    reports no parent.

    :param element: node to detach; may be None
    :return: None
    """
    if element is not None:
        owner = element.getparent()
        if owner is not None:
            owner.remove(element)
def text(element: Element):
    """
    Return the text content of ``element`` with all whitespace removed.

    Every text node below ``element`` (XPath ``.//text()``) is joined
    and then stripped of whitespace characters.

    :param element: node to read; may be None
    :return: the whitespace-free text as a ``str``; an empty string when
        ``element`` is None
    """
    if element is None:
        # Return an empty string rather than 0 so that callers always
        # receive a str, matching the non-None path.
        return ''
    joined = ''.join(element.xpath('.//text()'))
    return re.sub(r'\s+', '', joined)
def _has_datetime_mata(self, element: Element):
    """
    Tell whether ``element`` matches any of the datetime meta XPaths.

    Each expression in ``DATETIME_METAS`` is evaluated against
    ``element`` in order; evaluation stops at the first truthy result.

    :param element: node the XPath expressions are evaluated on
    :return: True if some expression gives a truthy result, else False
    """
    return any(element.xpath(expression) for expression in DATETIME_METAS)
def number_of_punctuation(element: Element):
    """
    Count the punctuation characters in the text of ``element``.

    All text nodes below ``element`` are joined, whitespace is removed,
    and each remaining character found in ``PUNCTUATION`` is counted.

    :param element: node to inspect; may be None
    :return: number of punctuation characters, or 0 when ``element`` is
        None
    """
    if element is None:
        return 0
    raw = ''.join(element.xpath('.//text()'))
    compact = re.sub(r'\s*', '', raw, flags=re.S)
    return sum(1 for char in compact if char in PUNCTUATION)
def parent(element: Element):
    """
    Return the parent node of ``element``.

    :param element: node whose parent is wanted; may be None
    :return: the result of ``element.getparent()``, re-classed to
        ``Element`` when it is an ``HtmlElement``; None when ``element``
        is None or has no parent
    """
    if element is None:
        return None
    node = element.getparent()
    # Re-class an HtmlElement parent to Element in place, the same way
    # siblings(), children() and descendants() treat the nodes they
    # yield. (A ModuleType check here could never match an element.)
    if isinstance(node, HtmlElement):
        node.__class__ = Element
    return node
def a_descendants(element: Element):
    """
    Collect every ``a`` element found below ``element``.

    Each match of the XPath ``.//a`` is re-classed to ``Element`` in
    place before the list is returned.

    :param element: node to search; may be None
    :return: list of matched nodes; an empty list when ``element`` is
        None
    """
    if element is None:
        return []
    anchors = list(element.xpath('.//a'))
    for anchor in anchors:
        anchor.__class__ = Element
    return anchors
def descendants_of_body(element: Element):
    """
    List the body element together with everything below it.

    The first match of the XPath ``//body`` is re-classed to ``Element``
    and expanded with ``descendants(..., True)``, so the body node itself
    is the first item of the result.

    :param element: node the XPath is evaluated on; may be None
    :return: list of nodes; an empty list when ``element`` is None or no
        body element is found
    """
    if element is None:
        return []
    matches = element.xpath('//body')
    if not matches:
        return []
    matches[0].__class__ = Element
    return list(descendants(matches[0], True))
def children_of_head(element: Element):
    """
    List the head element together with the nodes below it.

    The first match of the XPath ``//head`` is re-classed to ``Element``
    and expanded with ``descendants(..., True)``. Note that, despite the
    function name, this walks every level below head (not only direct
    children) and puts the head node itself first.

    :param element: node the XPath is evaluated on; may be None
    :return: list of nodes; an empty list when ``element`` is None or no
        head element is found
    """
    if element is None:
        return []
    heads = element.xpath('//head')
    if not heads:
        return []
    # xpath() gives back a list of matches: work on the first match, not
    # on the list object itself (a list can neither be re-classed nor
    # traversed as an element).
    head = heads[0]
    head.__class__ = Element
    return list(descendants(head, True))
def path(element: Element):
    """
    Build the tag path of ``element`` with a positional suffix.

    The base path comes from ``path_raw(element)`` (for example
    ``html/body/div/div/ul/li``). When the element has preceding
    siblings, ``:nth-child(N)`` is appended, where N is the number of
    preceding siblings plus one.

    :param element: node to describe; may be None
    :return: the path string; an empty string when ``element`` is None
    """
    if element is None:
        return ''
    selector = path_raw(element)
    # position among siblings, counted from 1
    position = len(list(element.itersiblings(preceding=True))) + 1
    if position != 1:
        selector += f':nth-child({position})'
    return selector
def descendants(element: Element, including=False):
    """
    Lazily iterate over every node below ``element``.

    Nodes are produced in the order ``iterdescendants()`` yields them;
    each one that is an ``HtmlElement`` is re-classed to ``Element`` in
    place first.

    :param element: root of the traversal; may be None
    :param including: when true, ``element`` itself is yielded first
    :return: generator of nodes; yields nothing when ``element`` is None
    """
    if element is None:
        # Inside a generator this simply ends the iteration.
        return []
    if including:
        yield element
    for node in element.iterdescendants():
        if isinstance(node, HtmlElement):
            node.__class__ = Element
        yield node
def children(element: Element, including=False):
    """
    Lazily iterate over the direct children of ``element``.

    Children are produced in the order ``iterchildren()`` yields them;
    each one that is an ``HtmlElement`` is re-classed to ``Element`` in
    place first.

    :param element: parent node; may be None
    :param including: when true, ``element`` itself is yielded first
    :return: generator of nodes; yields nothing when ``element`` is None
    """
    if element is None:
        # Inside a generator this simply ends the iteration.
        return []
    if including:
        yield element
    for node in element.iterchildren():
        if isinstance(node, HtmlElement):
            node.__class__ = Element
        yield node
def remove_children(element: Element, xpaths):
    """
    Remove every node matched by the given XPaths from ``element``.

    Each expression in ``xpaths`` is evaluated against ``element`` and
    every match is detached with ``remove_element``.

    :param element: node to clean up; may be None
    :param xpaths: iterable of XPath expressions; None or empty means
        there is nothing to remove
    :return: ``element`` itself (also when ``xpaths`` is empty, so the
        result can always be used in place of the input); None when
        ``element`` is None
    """
    if element is None:
        return None
    # ``or ()`` lets a None/empty xpaths fall through to the final
    # return instead of handing back None for a perfectly valid element.
    for xpath in xpaths or ():
        for node in element.xpath(xpath):
            remove_element(node)
    return element
def alias(element: Element):
    """
    Build a selector-like alias for ``element`` from its tag and attributes.

    The alias is the tag name followed by one ``[name="value"]`` part
    per attribute (or ``[name]`` when the value is empty), with all
    whitespace removed from names and values. When the element has
    preceding siblings, ``:nth-child(N)`` is appended, where N is the
    number of preceding siblings plus one.

    :param element: node to describe; may be None
    :return: the alias string; an empty string when ``element`` is None
    """
    if element is None:
        return ''
    parts = [element.tag]
    for name, value in element.attrib.items():
        name = re.sub(r'\s+', '', name)
        value = re.sub(r'\s+', '', value)
        parts.append(f'[{name}="{value}"]' if value else f'[{name}]')
    # position among siblings, counted from 1
    position = sum(1 for _ in element.itersiblings(preceding=True)) + 1
    if position != 1:
        # nth-child is a pseudo-class, so it takes a single colon; this
        # also matches the suffix produced by path().
        parts.append(f':nth-child({position})')
    return ''.join(parts)