def cleanpage(html):
    """Strip scripts, styles, comments, frames, embedded content and
    <img>/<script> tags from *html* and return the cleaned markup.

    Returns an empty unicode string when lxml rejects the input (a
    unicode string carrying an XML encoding declaration).
    """
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True  # NOTE(review): not a documented Cleaner option — confirm it has any effect
    cleaner.page_structure = False   # keep <html>/<head>/<body>
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True        # activate the javascript filter
    cleaner.style = True             # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b','img','h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # BUG FIX: was a bare `except:` that silently swallowed every error.
        # The known failure mode (per the original comment) is:
        # "ValueError: Unicode strings with encoding declaration are not
        # supported. Please use bytes input or XML fragments..."
        content = u""
    return content
def clearTag_old(self, text: str) -> str:
    """Strip scripts, styles, links, meta, forms, frames and embedded
    content from *text*, unwrap common formatting tags, drop <img>
    elements entirely, and return the cleaned HTML as a str.
    """
    import lxml.html
    from lxml.html.clean import Cleaner

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]         # removed together with their content
    cleaner.remove_tags = [             # unwrapped: tag dropped, text kept
        "strong", "div", "body", "br", "a", "p", "blockquote",
        "h3", "ol", "li", "font",
    ]
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(text))
    # BUG FIX: clean_html() returns an HtmlElement when given an element,
    # and elements have no .decode(); the original called .decode("utf-8")
    # directly on the element, which raises AttributeError. Serialize back
    # to bytes first.
    return lxml.html.tostring(cleaned).decode("utf-8")
def lxml_extractor(html, url):
    '''LXML PARSER: clean *html* with lxml's Cleaner, then delegate
    content extraction to soup_extractor; falls back to soup_extractor
    on the raw input when lxml cannot parse it.'''
    cleaner = Cleaner()
    cleaner.javascript = True   # activate the javascript filter
    cleaner.style = True        # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    # BUG FIX: was `cleaner.allow_tag` (no such option — silently ignored).
    # allow_tags requires remove_unknown_tags=False, otherwise lxml raises
    # ValueError at clean time.
    cleaner.allow_tags = POSITIVE_K
    cleaner.remove_unknown_tags = False
    cleaner.safe_attrs_only = True
    #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    #~ File ".../lxml/html/__init__.py", line 752, in document_fromstring
    #~ value = etree.fromstring(html, parser, **kw)
    try:
        # BUG FIX: base_url was the literal string "url", not the variable.
        # Use a separate name so the ValueError fallback always receives the
        # original html string, never a half-parsed element.
        root = lxml.html.fromstring(html, base_url=url)
        tree = cleaner.clean_html(root)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        # lxml refuses unicode input with an encoding declaration; fall
        # back to the BeautifulSoup-based extractor on the raw input.
        doc = soup_extractor(html, url)
    #~ (title, doc, article, text) = read_extractor(html, url)
    return doc
def clean_html(html_text, javascript=True, scripts=True, style=True, embedded=True, links=True, forms=True, frames=True, comments=True, annoying_tags=True, meta=True, safe_attrs_only=True, remove_unknown_tags=True, processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning
    the string with only the html content.

    Each keyword toggles the corresponding lxml Cleaner filter
    (True = remove that kind of content, False = keep it).
    """
    # Configure everything through the Cleaner constructor instead of
    # attribute-by-attribute assignment; the option names are identical.
    sanitizer = Cleaner(
        javascript=javascript,
        scripts=scripts,
        style=style,
        embedded=embedded,
        links=links,
        forms=forms,
        frames=frames,
        comments=comments,
        page_structure=False,  # always keep the page structure
        annoying_tags=annoying_tags,
        meta=meta,
        safe_attrs_only=safe_attrs_only,
        remove_unknown_tags=remove_unknown_tags,
        processing_instructions=processing_instructions,
    )
    cleaned_root = sanitizer.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(cleaned_root)
def cleaner_parameters():
    """Return an lxml Cleaner configured to kill non-content tags
    (scripts, tables, lists, math, ...) and unwrap basic text/structure
    tags, keeping only readable text containers.
    """
    # Killed outright, content included.
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    # Unwrapped: the tag is removed but its text is kept.
    # BUG FIX: the original had `'h6' 'span'` (missing comma), which Python
    # concatenates into the single bogus tag 'h6span', so neither 'h6' nor
    # 'span' was actually in the list.
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
def validate(self, data):
    """Sanitize data["name"] down to plain text and clamp data["qty"]
    to be non-negative; returns the mutated mapping."""
    sanitizer = Cleaner()
    sanitizer.javascript = True
    sanitizer.scripts = True
    sanitizer.frames = True
    sanitizer.remove_tags = ["p", "div", "a"]
    # Clean the markup first, then parse the result and keep only its text.
    cleaned_name = sanitizer.clean_html(data["name"])
    data["name"] = lxml.html.document_fromstring(cleaned_name).text_content()
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def get_cleaner():
    """Build and return a Cleaner that unwraps plain text containers and
    kills non-content elements (tables, images, scripts, lists, ...)."""
    # Unwrapped: tag removed, text kept.
    strip_to_text = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                     'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    # Killed outright, content included.
    drop_entirely = ['table', 'img', 'semantics', 'script', 'noscript',
                     'style', 'meta', 'label', 'li', 'ul', 'ol', 'sup',
                     'math', 'nav', 'dl', 'dd', 'sub']

    cleaner = Cleaner()
    for option in ('embedded', 'frames', 'style', 'remove_unknown_tags',
                   'processing_instructions', 'annoying_tags'):
        setattr(cleaner, option, True)
    cleaner.remove_tags = strip_to_text
    cleaner.kill_tags = drop_entirely
    return cleaner
def clean(self):
    """Strip HTML from every text field of this instance, replacing each
    field's markup with its plain-text content (mutates self in place).
    """
    cleaner = Cleaner(page_structure=False)
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.allow_tags = []
    cleaner.remove_tags = ['p', 'div', 'a']

    def _to_text(raw):
        # Clean first, then re-parse the result and keep only its text.
        return lxml.html.document_fromstring(cleaner.clean_html(raw)).text_content()

    # The original repeated the same statement for each field; iterate
    # over the field names instead so they can't drift out of sync.
    for field in ('name', 'price', 'discountcode', 'categorycode',
                  'orderdate', 'selldate', 'page'):
        setattr(self, field, _to_text(getattr(self, field)))
def validate(self, value):
    """Sanitize the username/storename/email entries of *value* down to
    plain text and return the mutated mapping.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    # BUG FIX: the original body read and returned an undefined name
    # `data` while the parameter is `value`, raising NameError at runtime.
    for field in ("username", "storename", "email"):
        cleaned = cleaner.clean_html(value[field])
        value[field] = lxml.html.document_fromstring(cleaned).text_content()
    return value
def validate(self, data):
    """Sanitize the text fields of *data* down to plain text, clamp
    data["qty"] to be non-negative, and return the mutated mapping.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.remove_tags = ["p", "div", "a"]
    # The original repeated the same statement per field; loop instead.
    for field in ("name", "price", "itemid", "discountcode",
                  "orderdate", "selldate", "page"):
        cleaned = cleaner.clean_html(data[field])
        data[field] = lxml.html.document_fromstring(cleaned).text_content()
    # BUG FIX: the original wrote data[qty] with an undefined bare name
    # `qty` (NameError); the key is the string "qty".
    if data["qty"] < 0:
        data["qty"] = 0
    return data
def clean(self):
    """Reduce self.name to plain text with javascript, scripts and
    frames stripped out (mutates self in place)."""
    sanitizer = Cleaner()
    sanitizer.javascript = True
    sanitizer.scripts = True
    sanitizer.frames = True
    stripped = sanitizer.clean_html(self.name)
    self.name = lxml.html.document_fromstring(stripped).text_content()
    #lxml.html.fromstring(self.name)
import lxml
from lxml import etree
from lxml.html.clean import autolink_html
from lxml.html.clean import Cleaner

# LXML PARSER: module-level Cleaner shared by the extraction helpers.
cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True       # This is True because we want to activate the styles & stylesheet filter
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = True
cleaner.frames = True
#cleaner.safe_attrs_only = True

import re
notalpha = re.compile('[^a-zA-Z]')  # matches any non-ASCII-letter character

# BS PARSER
from bs4 import BeautifulSoup as bs
from bs4 import Comment

def make_links_absolute(soup, url):
    """Return the absolute URLs of every <a href=...> in *soup*, resolved
    against *url*.

    NOTE(review): `urlparse` is not imported in this chunk — on Python 3
    this needs `from urllib import parse as urlparse` (or equivalent)
    somewhere above; confirm against the rest of the file.
    """
    return [urlparse.urljoin(url, tag['href']) for tag in soup.findAll('a', href=True)]

def clean_html(soup):
    """Strip script/iframe/form/embed/style elements from *soup* by
    extracting them and re-parsing the remainder.

    NOTE(review): each step re-parses the *extracted* nodes joined with
    spaces, and no return statement is visible in this chunk — the
    function may continue beyond this view; confirm before relying on it.
    """
    soup = bs(" ".join([s.extract() for s in soup('script')]))
    soup = bs(" ".join([s.extract() for s in soup('iframe')]))
    soup = bs(" ".join([s.extract() for s in soup('form')]))
    soup = bs(" ".join([s.extract() for s in soup('embed')]))
    soup = bs(" ".join([s.extract() for s in soup('style')]))
from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

# NOTE(review): `logging` and `Cleaner` are not imported in this chunk;
# presumably imported above this view — confirm.
LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
# Most filters are deliberately disabled (False); pruning is handled
# manually by tree_cleaning() below rather than by the Cleaner itself.
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED

def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
"//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]", "//*[contains(@class, 'postmeta') or contains(@class, 'post-meta') or contains(@class, 'entry-meta') or contains(@class, 'postMeta') or contains(@class, 'post_meta') or contains(@class, 'post__meta')]", "//*[@class='meta' or @class='meta-before' or @class='asset-meta' or contains(@id, 'article-metadata') or contains(@class, 'article-metadata') or contains(@class, 'byline') or contains(@class, 'subline')]", "//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]", "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]", "//footer", "//*[@class='post-footer' or @class='footer' or @id='footer']", "//small", "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]", ] CLEANER = Cleaner() CLEANER.comments = False CLEANER.embedded = True CLEANER.forms = False CLEANER.frames = True CLEANER.javascript = True CLEANER.links = False CLEANER.meta = False CLEANER.page_structure = True CLEANER.processing_instructions = True CLEANER.remove_unknown_tags = False CLEANER.safe_attrs_only = False CLEANER.scripts = False CLEANER.style = True CLEANER.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video' ] # 'embed', 'figure', 'img', 'table' ## REGEX cache
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process
import re
from lxml.html.clean import Cleaner

# Shared Cleaner: strip scripting, styling and page chrome aggressively.
cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    """Fetch *url* and return the raw response body, or the string
    "Couldn't load url" when the request fails for any reason."""
    try:
        with urllib.request.urlopen(url) as page:
            return page.read()
    except Exception:
        # Deliberate best-effort: any failure yields the sentinel string.
        return "Couldn't load url"


def index(request):
    """Trivial homepage view.

    NOTE(review): HttpResponse is not imported in this chunk — presumably
    `from django.http import HttpResponse` appears elsewhere; confirm.
    """
    return HttpResponse("Hello, world. You're at the homepage.")
# NOTE(review): the leading `]` closes a list whose opening bracket is
# above this view.
]

# XPath expressions for comment-section chrome to discard (reply forms,
# citations, akismet widgets, ...).
COMMENTS_DISCARD_XPATH = [
    './/*[(self::div or self::section)][starts-with(@id, "respond")]',
    './/cite',
    './/quote',
    './/*[starts-with(@id, "reply-") or starts-with(@class, "reply-title")]',
    './/*[contains(@id, "akismet") or contains(@class, "akismet")]',
]

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
# Unwrapped: tag removed, text content kept.
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font',
    'ins', 'meta', 'small', 'sub', 'sup', 'wbr'
]
# 'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
# Killed outright with their content.
# NOTE(review): this list is truncated at the end of this view and
# continues beyond it.
HTML_CLEANER.kill_tags = [
    'aside', 'audio', 'canvas', 'embed', 'figure', 'footer', 'form', 'head',