def get_node_text(node, smart=False, normalize_space=True):
    """
    Return the text of `node` together with the text of its descendants.

    In smart mode the non-blank text nodes are collected one by one and
    joined with single spaces, so text from neighbouring tags does not run
    together; text located directly inside <script> and <style> elements
    is left out. In non-smart mode the result of `text_content()` is used.

    If `normalize_space` is true the resulting string is passed through
    `normalize_space_func` before it is returned.
    """
    # An xpath query that selects an attribute value yields a plain string
    # rather than a node, so there is no tree to walk.
    if isinstance(node, basestring):
        return normalize_space_func(node) if normalize_space else node

    if smart:
        text_nodes = node.xpath(
            './descendant-or-self::*[name() != "script" and '
            'name() != "style"]/text()[normalize-space()]'
        )
        collected = ' '.join(text_nodes)
    else:
        # Nodes of a tree built with lxml.etree.fromstring do not have
        # a text_content() method, so fall back to joining the text nodes.
        try:
            collected = node.text_content()
        except AttributeError:
            collected = ''.join(node.xpath('.//text()'))

    if not normalize_space:
        return collected
    return normalize_space_func(collected)
def strip_tags(html, normalize_space=True, convert_br=False):
    """
    Replace every RE_TAG match in `html` with a single space.

    If `convert_br` is true, RE_BR matches are turned into newlines before
    the tags are replaced. If `normalize_space` is true, the result is
    passed through `normalize_space_func` before it is returned.
    """
    source = RE_BR.sub('\n', html) if convert_br else html
    stripped = RE_TAG.sub(' ', source)
    if not normalize_space:
        return stripped
    return normalize_space_func(stripped)
def text(self, default=NULL):
    """
    Return group 1 of the match given by `self.one()`, passed through
    `decode_entities` and then `normalize_space_func`.

    If any of those steps raises AttributeError or IndexError, `default`
    is returned instead; when `default` was left as NULL, DataNotFound
    is raised.
    """
    try:
        # Every step stays inside the try block so that the same set of
        # failures is translated into the "not found" outcome.
        match = self.one()
        decoded = decode_entities(match.group(1))
        return normalize_space_func(decoded)
    except (AttributeError, IndexError):
        if default is not NULL:
            return default
        raise DataNotFound
def strip_tags(html, normalize_space=True, convert_br=False):
    """
    Substitute a space for each RE_TAG match found in `html`.

    :param html: markup to process
    :param normalize_space: if true, pass the result through
        `normalize_space_func` before returning it
    :param convert_br: if true, replace RE_BR matches with newlines
        before the tags are substituted
    """
    if convert_br:
        html = RE_BR.sub("\n", html)
    without_tags = RE_TAG.sub(" ", html)
    return normalize_space_func(without_tags) if normalize_space else without_tags
def text(self, smart=False, normalize_space=True):
    """
    Return the text held by this selector.

    When `self.is_text_node()` is true, the value of `self.node()` is
    returned, passed through `normalize_space_func` if `normalize_space`
    is true. Otherwise the work is delegated to `get_node_text` with the
    same `smart` and `normalize_space` options.
    """
    if not self.is_text_node():
        return get_node_text(
            self.node(),
            smart=smart,
            normalize_space=normalize_space,
        )
    if not normalize_space:
        return self.node()
    return normalize_space_func(self.node())