예제 #1
0
def get_node_text(node, smart=False, normalize_space=True):
    """
    Return the text of `node` together with the text of its descendants.

    `node` may also be a plain string (an XPath query for an attribute
    value yields a string rather than a node); it is then returned as is,
    or passed through `normalize_space_func` when `normalize_space` is true.

    With `smart` enabled, the non-blank text nodes of every element except
    <script> and <style> are collected and joined with a single space, so
    text from adjacent tags does not run together.

    With `smart` disabled, the result of `node.text_content()` is used; if
    the node has no such method, all descendant text nodes are concatenated
    without a separator.

    In both modes the collected text is passed through
    `normalize_space_func` when `normalize_space` is true.
    """

    # Attribute lookups via XPath hand back strings, not nodes
    if isinstance(node, basestring):
        return normalize_space_func(node) if normalize_space else node

    if not smart:
        # Nodes of a tree built with lxml.etree.fromstring
        # do not provide text_content()
        try:
            text = node.text_content()
        except AttributeError:
            text = ''.join(node.xpath('.//text()'))
    else:
        query = ('./descendant-or-self::*[name() != "script" and '
                 'name() != "style"]/text()[normalize-space()]')
        text = ' '.join(node.xpath(query))

    return normalize_space_func(text) if normalize_space else text
예제 #2
0
파일: etree.py 프로젝트: abael/weblib
def get_node_text(node, smart=False, normalize_space=True):
    """
    Collect the text content of `node` and of everything nested inside it.

    A string `node` (what an XPath attribute query returns) is handed back
    directly, optionally passed through `normalize_space_func`.

    Smart mode: gather the non-blank text nodes of all elements other than
    <script> and <style> and join them with one space, which keeps the text
    of neighbouring tags separated.

    Non-smart mode: use `node.text_content()`, falling back to joining all
    descendant text nodes (no separator) when that method is missing.

    When `normalize_space` is true the result goes through
    `normalize_space_func` before being returned.
    """

    # An XPath attribute query produces a string instead of a node
    if isinstance(node, basestring):
        if not normalize_space:
            return node
        return normalize_space_func(node)

    if smart:
        smart_query = (
            './descendant-or-self::*[name() != "script" and '
            'name() != "style"]/text()[normalize-space()]'
        )
        collected = ' '.join(node.xpath(smart_query))
    else:
        # text_content() is absent on nodes of trees created
        # with lxml.etree.fromstring
        try:
            collected = node.text_content()
        except AttributeError:
            collected = ''.join(node.xpath('.//text()'))

    if not normalize_space:
        return collected
    return normalize_space_func(collected)
예제 #3
0
def strip_tags(html, normalize_space=True, convert_br=False):
    """
    Replace every RE_TAG match in `html` with a single space.

    When `convert_br` is true, RE_BR matches are first replaced with a
    newline (presumably <br> tags -- the pattern is defined elsewhere).
    When `normalize_space` is true, the stripped text is passed through
    `normalize_space_func` before being returned.
    """
    source = RE_BR.sub('\n', html) if convert_br else html
    stripped = RE_TAG.sub(' ', source)
    if not normalize_space:
        return stripped
    return normalize_space_func(stripped)
예제 #4
0
파일: base.py 프로젝트: khokhlov/hmedia
 def text(self, default=NULL):
     """
     Return group 1 of the object produced by `self.one()`, with entities
     decoded and the result passed through `normalize_space_func`.

     If an AttributeError or IndexError occurs while doing so, `default`
     is returned when one was supplied; otherwise DataNotFound is raised.
     """
     try:
         # presumably self.one() yields a regex match object -- verify
         captured = self.one().group(1)
         result = normalize_space_func(decode_entities(captured))
     except (AttributeError, IndexError):
         # NULL is the "no default given" sentinel
         if default is not NULL:
             return default
         raise DataNotFound
     return result
예제 #5
0
 def text(self, default=NULL):
     """
     Fetch group 1 from `self.one()`, decode its entities and pass the
     result through `normalize_space_func`.

     On AttributeError or IndexError the caller's `default` is returned;
     without a default (the NULL sentinel) DataNotFound is raised instead.
     """
     try:
         decoded = decode_entities(self.one().group(1))
         cleaned = normalize_space_func(decoded)
     except (AttributeError, IndexError):
         if default is NULL:
             raise DataNotFound
         return default
     else:
         return cleaned
예제 #6
0
def strip_tags(html, normalize_space=True, convert_br=False):
    """
    Substitute a space for each RE_TAG match found in `html`.

    `convert_br` -- replace RE_BR matches with a newline beforehand
    (presumably line-break tags; the pattern lives outside this function).
    `normalize_space` -- pass the resulting text through
    `normalize_space_func` before returning it.
    """
    markup = html
    if convert_br:
        markup = RE_BR.sub("\n", markup)
    plain = RE_TAG.sub(" ", markup)
    return normalize_space_func(plain) if normalize_space else plain
예제 #7
0
 def text(self, smart=False, normalize_space=True):
     """
     Return the text of the selected node.

     When `self.is_text_node()` is false, the work is delegated to
     `get_node_text` with the given `smart` and `normalize_space` flags.
     Otherwise `self.node()` is returned directly, passed through
     `normalize_space_func` first when `normalize_space` is true;
     `smart` is not used in that case.
     """
     if not self.is_text_node():
         return get_node_text(self.node(), smart=smart,
                              normalize_space=normalize_space)
     if not normalize_space:
         return self.node()
     return normalize_space_func(self.node())
예제 #8
0
 def text(self, smart=False, normalize_space=True):
     """
     Produce the textual value of the node held by this object.

     For a text node (`self.is_text_node()` is true) the value of
     `self.node()` is returned, passed through `normalize_space_func`
     if `normalize_space` is true. For any other node the call is
     forwarded to `get_node_text` along with `smart` and
     `normalize_space`.
     """
     if self.is_text_node():
         if not normalize_space:
             return self.node()
         return normalize_space_func(self.node())
     options = dict(smart=smart, normalize_space=normalize_space)
     return get_node_text(self.node(), **options)