def normalize_node(element: HtmlElement):
    """Normalize the tree in place: strip junk tags, flatten inline
    wrappers into their parent <p>, promote childless div/span nodes to
    <p>, and remove empty paragraphs and nodes with useless classes."""
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability: empty removable containers are noise.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        # merge text held in span or strong up into the parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # a childless div or span carries only text, so treat it as a paragraph
        if not node.getchildren() and node.tag.lower() in ('div', 'span'):
            node.tag = 'p'
        # a <p> without an image or any text contributes nothing
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        # drop nodes whose class attribute matches a known-useless marker
        class_name = node.get('class')
        if class_name:
            if any(attribute in class_name for attribute in USELESS_ATTR):
                remove_node(node)
def preprocess4list(element: Element):
    """
    Prepare *element* for list extraction, cleaning it in place:
    strip noisy tags and subtrees, fold span/strong text into parent
    <p> tags, drop empty paragraphs, promote childless <div> to <p>.

    :param element: tree to clean in place
    :return: None
    """
    # remove these tags together with their content
    etree.strip_elements(element, *CONTENT_USELESS_TAGS)
    # remove only the tag pair, keeping the inner text
    etree.strip_tags(element, *CONTENT_STRIP_TAGS)
    remove_children(element, CONTENT_NOISE_XPATHS)
    for child in children(element):
        tag = child.tag.lower()
        if tag == 'p':
            # merge inline span/strong text into the paragraph itself
            etree.strip_tags(child, 'span')
            etree.strip_tags(child, 'strong')
            if not (child.text and child.text.strip()):
                remove_element(child)
        elif tag == 'div' and not child.getchildren():
            # a div with no sub node is effectively a paragraph
            child.tag = 'p'
def normalize_node(element: HtmlElement):
    """Clean the element tree in place (inspired by readability):
    remove useless or empty nodes, merge inline text into <p>, convert
    childless div/span to <p>, and drop nodes with blacklisted classes."""
    # strip_elements removes the matched child nodes and their content
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        lowered = node.tag.lower()
        # inspired by readability: removable containers with nothing inside
        if lowered in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        if lowered == 'p':
            # strip the tag pair but merge its text into this node
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # if a div tag does not contain any sub node, convert it to p
        if lowered == 'div' and not node.getchildren():
            node.tag = 'p'
        if lowered == 'span' and not node.getchildren():
            node.tag = 'p'
        # remove empty p tag (re-read the tag: a div may have just become p)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        class_name = node.get('class')
        if class_name and any(attribute in class_name for attribute in USELESS_ATTR):
            remove_node(node)
def check_enc_fixed(url):
    # Debug helper (Python 2): fetch *url* and report how the page's
    # declared HTML encoding compares with what UnicodeDammit detected,
    # then print the first 200 characters of the extracted plain text.
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print ("Origignal encoding: {} vs declared_html_encoding: {}"
               "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}". format(chardet.detect(r.content))
    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            # chardet only echoes the low-confidence guess, so trust
            # the encoding the page itself declares instead
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)
    # collapse all whitespace runs into single spaces
    text = re.sub('\s+', ' ', text)
    print text[:200]
def check_enc_fixed(url):
    # Debug helper (Python 2): fetch *url*, compare the declared HTML
    # encoding against the one UnicodeDammit detected, pick the most
    # plausible encoding, and print the first 200 characters of text.
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print("Origignal encoding: {} vs declared_html_encoding: {}"
              "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}".format(chardet.detect(r.content))
    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            # chardet only echoes the low-confidence guess, so trust
            # the encoding the page itself declares instead
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)
    # collapse all whitespace runs into single spaces
    text = re.sub('\s+', ' ', text)
    print text[:200]
def _get_text(self, remove_newlines=True):
    r"""
    Retrieves html with provided url and parses it to fully remove
    all html tags, style declarations and scripts.

    Args:
        remove_newlines (bool): whether to collapse \n\r (and any other
            whitespace runs) into single spaces.

    Returns:
        unicode object of the whole text without html tags

    Raises:
        ParsingError: if the page content could not be fetched.
    """
    if not self.text:
        url = self.url
        try:
            self.log.debug("Try to get content from page {}".format(url))
            r = requests.get(url)
        except requests.exceptions.RequestException as e:
            # logger.warn is deprecated in favor of logger.warning
            self.log.warning("Unable to get page content of the url: {url}. "
                             "The reason: {exc!r}".format(url=url, exc=e))
            # str(e) instead of e.strerror: RequestException rarely sets
            # strerror, so the original raised ParsingError(None) and
            # lost the failure reason.
            raise ParsingError(str(e))
        ud = UnicodeDammit(r.content, is_html=True)
        # UnicodeDammit may fail to detect any encoding; fall back to
        # utf-8 instead of crashing on None.lower().
        enc = (ud.original_encoding or "utf-8").lower()
        declared_enc = ud.declared_html_encoding
        if declared_enc:
            declared_enc = declared_enc.lower()
        # possible misrecognition of an encoding
        if declared_enc and enc != declared_enc:
            detect_dict = chardet.detect(r.content)
            det_conf = detect_dict["confidence"]
            # chardet can also return None for the encoding
            det_enc = (detect_dict["encoding"] or "").lower()
            if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                # chardet merely echoes the low-confidence guess, so
                # trust the encoding the page itself declares
                enc = declared_enc
        # if page contains any characters that differ from the main
        # encoding we will ignore them
        content = r.content.decode(enc, "ignore").encode(enc)
        htmlparser = etree.HTMLParser(encoding=enc)
        root = etree.HTML(content, parser=htmlparser)
        etree.strip_elements(root, html.etree.Comment, "script", "style")
        text = html.tostring(root, method="text", encoding="unicode")
        if remove_newlines:
            self.log.debug(str(type(text)))
            # raw string: '\s' is an invalid escape in a plain literal
            text = re.sub(r'\s+', ' ', text)
        self.text = text
    return self.text
def ExtactProcess(self, htmlText, isIndex):
    """Parse *htmlText* and run either index-page or content-page extraction,
    storing the cleaned <title> text on self.titleText along the way."""
    # drop span/tr/td tag pairs and short HTML entities before parsing
    cleaned = re.sub(r'</?span.*?>|&.{1,6}?;|</?tr.*?>|</?td.*?>', '',
                     htmlText, flags=re.S)
    dom = etree.HTML(cleaned)
    title_parts = dom.xpath('//title/text()')
    self.titleText = self.ChangeNodeText("".join(title_parts))
    # remove useless tags with their content; keep tail text of removed nodes
    etree.strip_elements(dom, *EXTRACT_IFNO['USELESS_TAG'], with_tail=False)
    if not isIndex:
        self.ExtractPagePro(dom)
    else:
        self.ExtractIndexPro(dom)
        self.ExtractIndexDate("".join(dom.xpath('//text()')))
def normalize_node(element: HtmlElement):
    """Normalize the tree in place: strip useless tags, remove empty
    nodes, merge inline span/strong text into <p>, convert childless
    div/span to <p>, and drop nodes whose class list contains a
    useless class name (exact match against USELESS_ATTR)."""
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        # text inside span/strong under a <p> can be merged into the <p>
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'
        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'
        # remove empty p tag (keep it when it wraps an image)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        class_name = node.get('class')
        if class_name:
            # The class attribute may hold several whitespace-separated
            # names.  Split it once (the original recomputed the split
            # on every USELESS_ATTR iteration) and remove the node at
            # most once: the original's inner `break` only exited the
            # inner loop, so remove_node could be called repeatedly on
            # the same, already-removed node.  str.split() also handles
            # tab/newline separators that split(' ') missed.
            if set(class_name.split()) & set(USELESS_ATTR):
                remove_node(node)
def prune(self, tag: str):
    """Queue an operation that strips every *tag* element from a tree,
    returning the tree itself; returns self so calls can be chained."""
    def _strip(tree):
        # strip_elements always returns None, so hand the tree back
        etree.strip_elements(tree, tag)
        return tree
    self.opseq.append(_strip)
    return self