def normalize_node(element: HtmlElement):
    """Normalize the tree in place: strip junk tags, flatten inline
    wrappers into their parent <p>, promote childless div/span nodes to
    <p>, and remove empty paragraphs and nodes with useless classes."""
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability: empty removable containers are noise.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        # merge text held in span or strong up into the parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # a childless div or span carries only text, so treat it as a paragraph
        if not node.getchildren() and node.tag.lower() in ('div', 'span'):
            node.tag = 'p'
        # a <p> without an image or any text contributes nothing
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        # drop nodes whose class attribute matches a known-useless marker
        class_name = node.get('class')
        if class_name:
            if any(attribute in class_name for attribute in USELESS_ATTR):
                remove_node(node)
def preprocess4list(element: Element):
    """
    Prepare *element* for list extraction, cleaning it in place:
    strip noisy tags and subtrees, fold span/strong text into parent
    <p> tags, drop empty paragraphs, promote childless <div> to <p>.

    :param element: tree to clean in place
    :return: None
    """
    # remove these tags together with their content
    etree.strip_elements(element, *CONTENT_USELESS_TAGS)
    # remove only the tag pair, keeping the inner text
    etree.strip_tags(element, *CONTENT_STRIP_TAGS)
    remove_children(element, CONTENT_NOISE_XPATHS)
    for child in children(element):
        tag = child.tag.lower()
        if tag == 'p':
            # merge inline span/strong text into the paragraph itself
            etree.strip_tags(child, 'span')
            etree.strip_tags(child, 'strong')
            if not (child.text and child.text.strip()):
                remove_element(child)
        elif tag == 'div' and not child.getchildren():
            # a div with no sub node is effectively a paragraph
            child.tag = 'p'
def normalize_node(element: HtmlElement):
    """Clean the element tree in place (inspired by readability):
    remove useless or empty nodes, merge inline text into <p>, convert
    childless div/span to <p>, and drop nodes with blacklisted classes."""
    # strip_elements removes the matched child nodes and their content
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        lowered = node.tag.lower()
        # inspired by readability: removable containers with nothing inside
        if lowered in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        if lowered == 'p':
            # strip the tag pair but merge its text into this node
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # if a div tag does not contain any sub node, convert it to p
        if lowered == 'div' and not node.getchildren():
            node.tag = 'p'
        if lowered == 'span' and not node.getchildren():
            node.tag = 'p'
        # remove empty p tag (re-read the tag: a div may have just become p)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        class_name = node.get('class')
        if class_name and any(attribute in class_name for attribute in USELESS_ATTR):
            remove_node(node)
def check_enc_fixed(url):
    # Debug helper (Python 2): fetch *url* and report how the page's
    # declared HTML encoding compares with what UnicodeDammit detected,
    # then print the first 200 characters of the extracted plain text.
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print ("Origignal encoding: {} vs declared_html_encoding: {}"
               "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}". format(chardet.detect(r.content))
    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            # chardet only echoes the low-confidence guess, so trust
            # the encoding the page itself declares instead
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)
    # collapse all whitespace runs into single spaces
    text = re.sub('\s+', ' ', text)
    print text[:200]
def check_enc_fixed(url):
    # Debug helper (Python 2): fetch *url*, compare the declared HTML
    # encoding against the one UnicodeDammit detected, pick the most
    # plausible encoding, and print the first 200 characters of text.
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print("Origignal encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}".format(chardet.detect(r.content))
    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            # chardet only echoes the low-confidence guess, so trust
            # the encoding the page itself declares instead
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)
    # collapse all whitespace runs into single spaces
    text = re.sub('\s+', ' ', text)
    print text[:200]
def _get_text(self, remove_newlines=True):
    r"""
    Retrieves html with provided url and parses it to fully remove
    all html tags, style declarations and scripts.

    Args:
        remove_newlines (bool): whether to collapse \n\r (and any other
            whitespace runs) into single spaces.

    Returns:
        unicode object of the whole text without html tags

    Raises:
        ParsingError: if the page content could not be fetched.
    """
    if not self.text:
        url = self.url
        try:
            self.log.debug("Try to get content from page {}".format(url))
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            # logger.warn is deprecated in favor of logger.warning
            self.log.warning("Unable to get page content of the url: {url}. "
                             "The reason: {exc!r}".format(url=url, exc=e))
            # str(e) instead of e.strerror: RequestException rarely sets
            # strerror, so the original raised ParsingError(None) and
            # lost the failure reason.
            raise ParsingError(str(e))
        ud = UnicodeDammit(r.content, is_html=True)
        # UnicodeDammit may fail to detect any encoding; fall back to
        # utf-8 instead of crashing on None.lower().
        enc = (ud.original_encoding or "utf-8").lower()
        declared_enc = ud.declared_html_encoding
        if declared_enc:
            declared_enc = declared_enc.lower()
        # possible misrecognition of an encoding
        if declared_enc and enc != declared_enc:
            detect_dict = chardet.detect(r.content)
            det_conf = detect_dict["confidence"]
            # chardet can also return None for the encoding
            det_enc = (detect_dict["encoding"] or "").lower()
            if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                # chardet merely echoes the low-confidence guess, so
                # trust the encoding the page itself declares
                enc = declared_enc
        # if page contains any characters that differ from the main
        # encoding we will ignore them
        content = r.content.decode(enc, "ignore").encode(enc)
        htmlparser = etree.HTMLParser(encoding=enc)
        root = etree.HTML(content, parser=htmlparser)
        etree.strip_elements(root, html.etree.Comment, "script", "style")
        text = html.tostring(root, method="text", encoding="unicode")
        if remove_newlines:
            self.log.debug(str(type(text)))
            # raw string: '\s' is an invalid escape in a plain literal
            text = re.sub(r'\s+', ' ', text)
        self.text = text
    return self.text
def ExtactProcess(self, htmlText, isIndex):
    """Parse *htmlText* and run either index-page or content-page extraction,
    storing the cleaned <title> text on self.titleText along the way."""
    # drop span/tr/td tag pairs and short HTML entities before parsing
    cleaned = re.sub(r'</?span.*?>|&.{1,6}?;|</?tr.*?>|</?td.*?>', '',
                     htmlText, flags=re.S)
    dom = etree.HTML(cleaned)
    title_parts = dom.xpath('//title/text()')
    self.titleText = self.ChangeNodeText("".join(title_parts))
    # remove useless tags with their content; keep tail text of removed nodes
    etree.strip_elements(dom, *EXTRACT_IFNO['USELESS_TAG'], with_tail=False)
    if not isIndex:
        self.ExtractPagePro(dom)
    else:
        self.ExtractIndexPro(dom)
        self.ExtractIndexDate("".join(dom.xpath('//text()')))
def normalize_node(element: HtmlElement):
    """Normalize the tree in place: strip useless tags, remove empty
    nodes, merge inline span/strong text into <p>, convert childless
    div/span to <p>, and drop nodes whose class list contains a
    useless class name (exact match against USELESS_ATTR)."""
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        # text inside span/strong under a <p> can be merged into the <p>
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'
        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'
        # remove empty p tag (keep it when it wraps an image)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        class_name = node.get('class')
        if class_name:
            # The class attribute may hold several whitespace-separated
            # names.  Split it once (the original recomputed the split
            # on every USELESS_ATTR iteration) and remove the node at
            # most once: the original's inner `break` only exited the
            # inner loop, so remove_node could be called repeatedly on
            # the same, already-removed node.  str.split() also handles
            # tab/newline separators that split(' ') missed.
            if set(class_name.split()) & set(USELESS_ATTR):
                remove_node(node)
def prune(self, tag: str):
    """Queue an operation that strips every *tag* element from a tree,
    returning the tree itself; returns self so calls can be chained."""
    def _strip(tree):
        # strip_elements always returns None, so hand the tree back
        etree.strip_elements(tree, tag)
        return tree
    self.opseq.append(_strip)
    return self