def clean_html(self, html, kill_xpaths=None):
    """Drop every node matched by the kill XPaths, then clean the document.

    The expressions passed in ``kill_xpaths`` are combined (via ``to_list``)
    with the instance-level ``self.kill_xpaths``.  The input itself is never
    modified: a string is parsed with ``fromstring`` and anything else is
    deep-copied before any node is removed.

    :param html: markup as a string, or an already parsed document/element.
    :param kill_xpaths: extra XPath expression(s) whose matches are dropped.
    :returns: whatever ``_transform_result`` produces for the type of the
        original ``html`` argument and the cleaned document.
    """
    kill_xpaths = to_list(kill_xpaths) + to_list(self.kill_xpaths)
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)  # 'HtmlElement' object
    else:
        doc = deepcopy(html)

    doomed = []
    for expr in kill_xpaths:
        try:
            matched = doc.xpath(expr)
        except Exception as err:
            # Best effort: a broken expression is reported and skipped so the
            # remaining expressions still apply.
            print('没有找到kill_xpath: %s, 报错: %s' % (expr, err))
            continue
        # XPath may also evaluate to a bare string/number/bool (e.g.
        # ``count(...)``); only node lists can contain removable elements.
        if isinstance(matched, list):
            doomed.extend(matched)

    for node in doomed:
        if not hasattr(node, 'tag'):
            # text()/attribute matches are strings, not removable elements.
            continue
        if node.getparent() is None:
            # Already detached: matched by more than one expression (or it is
            # the root).  Report it instead of failing on a missing parent.
            print('tag: %s, attr: %s 的节点: 已被删除' % (node.tag, node.attrib))
            continue
        # drop_tree() removes the element and its children but keeps the tail
        # text, which ``parent.remove(node)`` would silently discard.
        # NOTE(review): assumes lxml.html elements (they provide drop_tree) -
        # confirm if plain etree documents are ever passed in.
        node.drop_tree()

    self(doc)
    return _transform_result(result_type, doc)
def autolink_html(html, *args, **kw):
    """Apply ``autolink`` to ``html`` without touching the caller's object.

    A string is parsed with ``fromstring``; any other input is deep-copied.
    Extra positional and keyword arguments are forwarded to ``autolink``
    unchanged.  The processed document is passed, together with the type of
    the original input, to ``_transform_result`` and that value is returned.
    """
    if isinstance(html, basestring):
        tree = fromstring(html)
    else:
        tree = copy.deepcopy(html)
    autolink(tree, *args, **kw)
    return _transform_result(type(html), tree)
def clean_html(self, html):
    """Run this object over a private copy of ``html`` and return the result.

    String input is parsed with ``fromstring``; everything else is
    deep-copied, so the argument itself is left unmodified.  The instance is
    then called on that working document, and ``_transform_result`` converts
    it based on the type of the original input.
    """
    source_type = type(html)
    working = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    self(working)
    return _transform_result(source_type, working)
def fill_form_html(html, values, form_id=None, form_index=None):
    """Call ``fill_form`` on a private copy of ``html`` and return the result.

    :param html: markup as a string (parsed with ``fromstring``) or an
        already parsed object (deep-copied).
    :param values: passed through to ``fill_form``.
    :param form_id: passed through to ``fill_form``.
    :param form_index: passed through to ``fill_form``.
    :returns: the value of ``_transform_result`` for the original input type
        and the processed document.
    """
    source_type = type(html)
    if isinstance(html, basestring):
        tree = fromstring(html)
    else:
        tree = copy.deepcopy(html)
    fill_form(tree, values, form_id=form_id, form_index=form_index)
    return _transform_result(source_type, tree)
def insert_errors_html(html, values, **kw):
    """Call ``insert_errors`` on a private copy of ``html`` and return the result.

    Strings are parsed with ``fromstring``; other inputs are deep-copied so
    the caller's object stays as it was.  ``values`` and any keyword
    arguments are handed to ``insert_errors`` unchanged, and the outcome is
    converted by ``_transform_result`` using the type of the original input.
    """
    parsed_from_text = isinstance(html, basestring)
    tree = fromstring(html) if parsed_from_text else copy.deepcopy(html)
    insert_errors(tree, values, **kw)
    return _transform_result(type(html), tree)
def word_break_html(html, *args, **kw):
    """Apply ``word_break`` to ``html`` and return it converted back.

    Accepts either markup as a string or an already parsed document/element.
    A string is parsed with ``fromstring``; anything else is deep-copied so
    the caller's object is not modified.  Previously every input was sent
    through ``fromstring``, which made parsed input unusable.

    Extra positional and keyword arguments are forwarded to ``word_break``.

    :returns: the value of ``_transform_result`` for the type of the original
        input and the processed document.
    """
    import copy  # function-scope import keeps this block self-contained

    result_type = type(html)
    # (bytes, type(u'')) is (str, unicode) on Python 2 and (bytes, str) on
    # Python 3, i.e. every text type on either version.
    if isinstance(html, (bytes, type(u''))):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
def sanitize(self, node, candidates):
    """Prune low-value elements from ``node`` and return the converted tree.

    The tree is modified in place, in this order:

    1. Headings (h1-h6) with a negative class weight or a link density
       above 0.33 are dropped.
    2. Every <form> and <textarea> is dropped.
    3. Each <table>, <ul> and <div> (visited via ``self.reverse_tags``) is
       dropped when its class weight plus content score is negative.
       Otherwise, if its text holds fewer than 10 commas, it is dropped when
       one of the heuristics below (images, list items, inputs, length,
       links, embeds) fires.

    :param node: root element of the tree to clean.
    :param candidates: mapping from element to a dict with a
        ``'content_score'`` entry.
    :returns: ``_transform_result(self._result_type, node)``.
    """
    min_len = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)

    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "form", "textarea"):
        elem.drop_tree()

    # Elements explicitly spared by the sibling check further down.
    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            self.debug("Cleaned %s with score %6.3f and weight %-3s" %
                       (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Offset so the "<li> vs <p>" rule only fires when there are
            # over 100 more list items than paragraphs.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = self.get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # From here on content_score holds the parent's score; it is
                # only used in the final debug message below.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0

            to_remove = False
            reason = ""

            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < min_len and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True
                # NOTE(review): the source this was recovered from had lost
                # its indentation, so it is not certain whether the sibling
                # check below belongs only to this <embed> branch (as written
                # here) or runs after the whole chain - confirm against the
                # original file.
                #
                # Find up to x non-empty following and preceding siblings.
                # The counters used to be written "i =+ 1" (assigning +1
                # instead of incrementing), which only worked because x == 1.
                x = 1
                siblings = []
                for preceding in (False, True):
                    found = 0
                    for sib in el.itersiblings(preceding=preceding):
                        sib_content_length = text_length(sib)
                        if sib_content_length:
                            found += 1
                            siblings.append(sib_content_length)
                            if found == x:
                                break
                # Substantial neighbouring text rescues the element and its
                # nested <table>/<ul>/<div> descendants.
                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    self.debug("Allowing %s" % describe(el))
                    for desnode in self.tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

            if to_remove:
                self.debug("Cleaned %6.3f %s with weight %s cause it has %s."
                           % (content_score, describe(el), weight, reason))
                el.drop_tree()

    return _transform_result(self._result_type, node)