Exemplo n.º 1
0
def normalize_node(element: HtmlElement):
    """
    Normalize the subtree under ``element`` in place.

    Elements named in ``USELESS_TAG`` are deleted first. Every node yielded
    by ``iter_node(element)`` is then cleaned up:

    * tags listed in ``TAGS_CAN_BE_REMOVE_IF_EMPTY`` are removed when
      ``is_empty_element`` reports them empty;
    * ``span`` / ``strong`` wrappers inside a ``p`` are unwrapped so their
      text merges into the paragraph;
    * childless ``div`` / ``span`` nodes are renamed to ``p``;
    * ``p`` nodes with blank leading text and no ``img`` descendant are
      dropped with ``drop_tag``;
    * nodes whose ``class`` attribute contains any ``USELESS_ATTR`` entry
      (substring match) are removed.

    :param element: root of the tree to normalize; modified in place
    :return: None
    """
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower(
        ) in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
            # The node is no longer part of the tree: renaming it, dropping
            # it or removing it again below would only act on a detached
            # element, so move on to the next node.
            continue

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # a div or span without any sub node can be treated as a paragraph
        if node.tag.lower() in ('div', 'span') and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag (the tag check runs after the rename above, so
        # freshly converted div/span nodes are covered as well)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # NOTE(review): this is a substring test against the raw class
        # string, so e.g. an entry "share" also matches class "shared" -
        # confirm that this is intended.
        class_name = node.get('class')
        if class_name and any(
                attribute in class_name for attribute in USELESS_ATTR):
            remove_node(node)
Exemplo n.º 2
0
def preprocess4list(element: Element):
    """
    Prepare ``element`` for list extraction (the tree is modified in place).

    Steps, in order:

    1. delete every ``CONTENT_USELESS_TAGS`` element together with its content;
    2. unwrap every ``CONTENT_STRIP_TAGS`` element, keeping its content;
    3. call ``remove_children`` with ``CONTENT_NOISE_XPATHS``;
    4. for each node yielded by ``children(element)``: unwrap ``span`` and
       ``strong`` inside ``p`` tags, remove ``p`` tags whose leading text is
       blank, and rename childless ``div`` tags to ``p``.

    :param element: root element to preprocess
    :return: None
    """
    etree.strip_elements(element, *CONTENT_USELESS_TAGS)
    etree.strip_tags(element, *CONTENT_STRIP_TAGS)
    remove_children(element, CONTENT_NOISE_XPATHS)

    for descendant in children(element):
        tag_name = descendant.tag.lower()

        if tag_name == 'p':
            # fold inline wrappers into the paragraph text
            for inline_tag in ('span', 'strong'):
                etree.strip_tags(descendant, inline_tag)

            leading_text = descendant.text
            if not leading_text or not leading_text.strip():
                remove_element(descendant)

        elif tag_name == 'div' and len(descendant) == 0:
            # a div without any sub node is treated as a paragraph
            descendant.tag = 'p'
Exemplo n.º 3
0
def normalize_node(element: HtmlElement):
    """
    Normalize the subtree under ``element`` in place.

    ``USELESS_TAG`` elements are deleted up front; afterwards each node
    yielded by ``iter_node(element)`` goes through these steps in order:
    removal if it is an empty ``TAGS_CAN_BE_REMOVE_IF_EMPTY`` tag, unwrapping
    of ``span`` / ``strong`` inside ``p``, renaming of childless ``div`` /
    ``span`` to ``p``, dropping of ``p`` tags with blank leading text and no
    ``img`` descendant, and removal when the ``class`` attribute contains a
    ``USELESS_ATTR`` entry (substring match).

    :param element: root of the tree to normalize; modified in place
    :return: None
    """
    # delete the unwanted child elements from the tree
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        removable_when_empty = node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY
        if removable_when_empty and is_empty_element(node):
            remove_node(node)
            # Nothing below should touch a node that has just been removed
            # from the tree, so skip the remaining steps for it.
            continue

        # strip the listed tags out of the p element and merge their text
        # into it
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div (or span) tag does not contain any child node, it can be
        # converted to a p node
        if node.tag.lower() in ('div', 'span') and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag; evaluated after the rename above so converted
        # nodes are checked too
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # NOTE(review): substring test on the raw class string - an entry
        # such as "share" also matches class "shared"; confirm intent.
        class_name = node.get('class')
        if class_name and any(
                attribute in class_name for attribute in USELESS_ATTR):
            remove_node(node)
Exemplo n.º 4
0
def check_enc_fixed(url):
    """Download ``url``, report how its encoding is resolved, print a preview.

    The encoding guessed by ``UnicodeDammit`` is used unless the page
    declares a different one *and* chardet agrees with the guess only with a
    confidence below ``THRESHOLD_OF_CHARDETECT``; in that case the declared
    encoding wins. The page is then re-encoded, parsed with lxml, stripped
    of comments / ``script`` / ``style`` and the first 200 characters of its
    whitespace-collapsed text are printed.

    The single-argument ``print(...)`` form is used so the output is the
    same under the Python 2 print statement and the Python 3 function.

    Args:
        url: address of the page to inspect.
    """
    print("\n\n")
    print("That is url {}".format(url))
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)

    encodings_agree = ud.original_encoding == ud.declared_html_encoding
    print("\t\t\t\t\t\t{}".format(encodings_agree))

    detected = None
    if not encodings_agree:
        # chardet is slow on large pages; keep the result for reuse below.
        detected = chardet.detect(r.content)
        print("Origignal encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print("Detected encoding: {!r}".format(detected))

    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # original_encoding is None when UnicodeDammit cannot work out the
    # encoding; fall back to the declared one, then to utf-8, instead of
    # failing on None.lower().
    enc = (ud.original_encoding or declared_enc or "utf-8").lower()

    # possible misrecognition of an encoding
    if declared_enc and enc != declared_enc:
        if detected is None:
            detected = chardet.detect(r.content)
        det_conf = detected["confidence"]
        # chardet reports encoding=None when it has no guess at all.
        det_enc = (detected["encoding"] or "").lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            enc = declared_enc
    print("CHOOSED ENCODING: {}".format(enc))

    # if the page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    # with_tail=False: strip_elements removes an element's tail text by
    # default, which would delete visible text that merely follows a
    # comment, <script> or <style>.
    etree.strip_elements(root, html.etree.Comment, "script", "style",
                         with_tail=False)
    text = html.tostring(root, method="text", encoding=unicode)

    text = re.sub(r'\s+', ' ', text)
    print(text[:200])
Exemplo n.º 5
0
def check_enc_fixed(url):
    """Download ``url``, report how its encoding is resolved, print a preview.

    The encoding guessed by ``UnicodeDammit`` is used unless the page
    declares a different one *and* chardet agrees with the guess only with a
    confidence below ``THRESHOLD_OF_CHARDETECT``; in that case the declared
    encoding wins. The page is then re-encoded, parsed with lxml, stripped
    of comments / ``script`` / ``style`` and the first 200 characters of its
    whitespace-collapsed text are printed.

    The single-argument ``print(...)`` form is used so the output is the
    same under the Python 2 print statement and the Python 3 function.

    Args:
        url: address of the page to inspect.
    """
    print("\n\n")
    print("That is url {}".format(url))
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)

    encodings_agree = ud.original_encoding == ud.declared_html_encoding
    print("\t\t\t\t\t\t{}".format(encodings_agree))

    detected = None
    if not encodings_agree:
        # chardet is slow on large pages; keep the result for reuse below.
        detected = chardet.detect(r.content)
        print("Origignal encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print("Detected encoding: {!r}".format(detected))

    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # original_encoding is None when UnicodeDammit cannot work out the
    # encoding; fall back to the declared one, then to utf-8, instead of
    # failing on None.lower().
    enc = (ud.original_encoding or declared_enc or "utf-8").lower()

    # possible misrecognition of an encoding
    if declared_enc and enc != declared_enc:
        if detected is None:
            detected = chardet.detect(r.content)
        det_conf = detected["confidence"]
        # chardet reports encoding=None when it has no guess at all.
        det_enc = (detected["encoding"] or "").lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            enc = declared_enc
    print("CHOOSED ENCODING: {}".format(enc))

    # if the page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    # with_tail=False: strip_elements removes an element's tail text by
    # default, which would delete visible text that merely follows a
    # comment, <script> or <style>.
    etree.strip_elements(root, html.etree.Comment, "script", "style",
                         with_tail=False)
    text = html.tostring(root, method="text", encoding=unicode)

    text = re.sub(r'\s+', ' ', text)
    print(text[:200])
Exemplo n.º 6
0
    def _get_text(self, remove_newlines=True):
        r""" Retrieves html with provided url and parses it to fully remove
        all html tags, style declarations and scripts.

        The result is cached in ``self.text``; the page is only downloaded
        while ``self.text`` is still empty.

        Args:
            remove_newlines (bool): whether to collapse every run of
                whitespace (including \n\r sequences) into a single space.

        Returns:
            unicode object of the whole text without html tags

        Raises:
            ParsingError: if the page could not be downloaded.

        """
        if self.text:
            return self.text

        url = self.url
        try:
            self.log.debug("Try to get content from page {}".format(url))
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            self.log.warn("Unable to get page content of the url: {url}. "
                          "The reason: {exc!r}".format(url=url, exc=e))
            # strerror is normally None for requests exceptions; fall back
            # to the exception text so the reason is not lost.
            raise ParsingError(e.strerror or str(e))

        ud = UnicodeDammit(r.content, is_html=True)

        declared_enc = ud.declared_html_encoding
        if declared_enc:
            declared_enc = declared_enc.lower()
        # original_encoding is None when UnicodeDammit cannot work out the
        # encoding; fall back to the declared one, then to utf-8, instead
        # of failing on None.lower().
        enc = (ud.original_encoding or declared_enc or "utf-8").lower()

        # possible misrecognition of an encoding
        if declared_enc and enc != declared_enc:
            detect_dict = chardet.detect(r.content)
            det_conf = detect_dict["confidence"]
            # chardet reports encoding=None when it has no guess at all.
            det_enc = (detect_dict["encoding"] or "").lower()
            if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                enc = declared_enc

        # if page contains any characters that differ from the main
        # encoding we will ignore them
        content = r.content.decode(enc, "ignore").encode(enc)
        htmlparser = etree.HTMLParser(encoding=enc)
        root = etree.HTML(content, parser=htmlparser)
        # with_tail=False: strip_elements removes an element's tail text by
        # default, which would delete visible text that merely follows a
        # comment, <script> or <style>.
        etree.strip_elements(root, html.etree.Comment, "script", "style",
                             with_tail=False)
        text = html.tostring(root, method="text", encoding="unicode")

        if remove_newlines:
            self.log.debug(str(type(text)))
            text = re.sub(r'\s+', ' ', text)
        self.text = text

        return self.text
Exemplo n.º 7
0
    def _get_text(self, remove_newlines=True):
        r""" Retrieves html with provided url and parses it to fully remove
        all html tags, style declarations and scripts.

        The result is cached in ``self.text``; the page is only downloaded
        while ``self.text`` is still empty.

        Args:
            remove_newlines (bool): whether to collapse every run of
                whitespace (including \n\r sequences) into a single space.

        Returns:
            unicode object of the whole text without html tags

        Raises:
            ParsingError: if the page could not be downloaded.

        """
        if self.text:
            return self.text

        url = self.url
        try:
            self.log.debug("Try to get content from page {}".format(url))
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            self.log.warn("Unable to get page content of the url: {url}. "
                          "The reason: {exc!r}".format(url=url, exc=e))
            # strerror is normally None for requests exceptions; fall back
            # to the exception text so the reason is not lost.
            raise ParsingError(e.strerror or str(e))

        ud = UnicodeDammit(r.content, is_html=True)

        declared_enc = ud.declared_html_encoding
        if declared_enc:
            declared_enc = declared_enc.lower()
        # original_encoding is None when UnicodeDammit cannot work out the
        # encoding; fall back to the declared one, then to utf-8, instead
        # of failing on None.lower().
        enc = (ud.original_encoding or declared_enc or "utf-8").lower()

        # possible misrecognition of an encoding
        if declared_enc and enc != declared_enc:
            detect_dict = chardet.detect(r.content)
            det_conf = detect_dict["confidence"]
            # chardet reports encoding=None when it has no guess at all.
            det_enc = (detect_dict["encoding"] or "").lower()
            if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                enc = declared_enc

        # if page contains any characters that differ from the main
        # encoding we will ignore them
        content = r.content.decode(enc, "ignore").encode(enc)
        htmlparser = etree.HTMLParser(encoding=enc)
        root = etree.HTML(content, parser=htmlparser)
        # with_tail=False: strip_elements removes an element's tail text by
        # default, which would delete visible text that merely follows a
        # comment, <script> or <style>.
        etree.strip_elements(root, html.etree.Comment, "script", "style",
                             with_tail=False)
        text = html.tostring(root, method="text", encoding="unicode")

        if remove_newlines:
            self.log.debug(str(type(text)))
            text = re.sub(r'\s+', ' ', text)
        self.text = text

        return self.text
 def ExtactProcess(self, htmlText, isIndex):
     """Parse raw HTML and dispatch to index-page or content-page extraction.

     The markup is first scrubbed with a regex that deletes ``span``,
     ``tr`` and ``td`` tags plus short ``&...;`` sequences, then parsed with
     lxml. The ``<title>`` text is passed through ``self.ChangeNodeText``
     and stored in ``self.titleText``, after which the elements named in
     ``EXTRACT_IFNO['USELESS_TAG']`` are deleted while their tail text is
     kept.

     :param htmlText: raw HTML source of the page
     :param isIndex: truthy to run the index extraction steps, falsy to run
         the page extraction step
     """
     noise_pattern = r'</?span.*?>|&.{1,6}?;|</?tr.*?>|</?td.*?>'
     cleaned_html = re.sub(noise_pattern, '', htmlText, flags=re.S)
     root = etree.HTML(cleaned_html)

     title_parts = root.xpath('//title/text()')
     self.titleText = self.ChangeNodeText("".join(title_parts))

     useless_tags = EXTRACT_IFNO['USELESS_TAG']
     etree.strip_elements(root, *useless_tags, with_tail=False)

     if not isIndex:
         self.ExtractPagePro(root)
         return

     self.ExtractIndexPro(root)
     # the date extraction works on the text that is left after the index
     # extraction step has run
     remaining_text = "".join(root.xpath('//text()'))
     self.ExtractIndexDate(remaining_text)
Exemplo n.º 9
0
def normalize_node(element: HtmlElement):
    """
    Normalize the subtree under ``element`` in place.

    ``USELESS_TAG`` elements are deleted first. Each node yielded by
    ``iter_node(element)`` is then processed in this order:

    * removed if it is an empty ``TAGS_CAN_BE_REMOVE_IF_EMPTY`` tag;
    * ``span`` / ``strong`` inside a ``p`` are unwrapped into the paragraph;
    * childless ``div`` / ``span`` nodes are renamed to ``p``;
    * ``p`` nodes with blank leading text and no ``img`` descendant are
      dropped with ``drop_tag``;
    * removed if any of its whitespace-separated class names is equal to an
      entry of ``USELESS_ATTR``.

    :param element: root of the tree to normalize; modified in place
    :return: None
    """
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower(
        ) in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
            # The node has left the tree; skip the remaining steps instead
            # of mutating (or removing again) a detached element.
            continue

        # text inside span (and strong) tags under a p tag can be merged
        # into the p tag itself
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div or span tag does not contain any sub node, it could be
        # converted to p node.
        if node.tag.lower() in ('div', 'span') and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag (checked after the rename above, so converted
        # nodes are covered too)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        class_name = node.get('class')
        if class_name:
            # split() without an argument separates on any whitespace run
            # (spaces, tabs, newlines) and never yields empty names. The
            # names are compared as whole tokens, and the node is removed
            # at most once.
            class_tokens = set(class_name.split())
            if not class_tokens.isdisjoint(USELESS_ATTR):
                remove_node(node)
Exemplo n.º 10
0
Arquivo: web.py Projeto: qdbp/vnv
 def prune(self, tag: str):
     """Queue an operation that strips every ``tag`` element from a tree.

     The queued callable receives a tree, runs ``etree.strip_elements`` on
     it for ``tag`` and hands the same tree back.

     :param tag: name of the elements to strip
     :return: ``self``, so calls can be chained
     """
     def strip_matching(tree):
         # fall back to the tree itself when strip_elements returns a falsy
         # value
         outcome = etree.strip_elements(tree, tag)
         return outcome or tree

     self.opseq.append(strip_matching)
     return self