def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # ValueError: Unicode strings with encoding declaration are not
        # supported. Please use bytes input or XML fragments without declaration.
        content = u""
    return content
def clearTag_old(self, text: str) -> str:
    import lxml
    from lxml.html.clean import Cleaner

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong", "div", "body", "br", "a", "p",
        "blockquote", "h3", "ol", "li", "font",
    ]
    # clean_html() on an element returns an element, not bytes,
    # so serialize it with tostring() before decoding.
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(text))
    return lxml.html.tostring(cleaned).decode("utf-8")
def get_clean_html(self, html_text, text_only=True):
    try:
        etree = lxml.html.document_fromstring(html_text)
        self._is_etree(etree)
        # enable filters to remove Javascript and CSS from HTML document
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.html = True
        cleaner.page_structure = False
        cleaner.meta = False
        cleaner.safe_attrs_only = False
        cleaner.links = False
        html = cleaner.clean_html(etree)
        if text_only:
            return ' '.join(html.text_content().split())
        res = lxml.html.tostring(html)
    except Exception as e:
        logger.error(f"While parsing email in get_clean_html {e}")
        res = "junk"
    return res
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if len(parsed_text) > MINSIZE_CHARSDOC:
            return parsed_text.lower()
        return None
    except Exception:
        return None
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
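# A minimal usage sketch (not from the original source) for
# cleaner_parameters() above: parse a page, run the configured Cleaner over
# it, and serialize the result. The sample markup is illustrative only.
import lxml.html

def demo_cleaner_parameters():
    page = "<html><body><div>keep me</div><script>drop()</script></body></html>"
    cleaner = cleaner_parameters()
    # kill_tags drops <script> with its content; remove_tags unwraps the
    # structural tags while keeping their text
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(page))
    return lxml.html.tostring(cleaned, encoding='unicode')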
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML, returning a string
    with only the HTML content."""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # True activates the javascript filter
    cleaner.scripts = scripts  # True activates the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions

    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
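# A short, hypothetical usage sketch for the configurable clean_html() above.
# Note that lxml.html.tostring() returns bytes, so decode for a str result.
if __name__ == "__main__":
    sample = '<p onclick="x()">Hello</p><script>x()</script>'
    # safe_attrs_only strips the onclick attribute; javascript/scripts drop the script tag
    print(clean_html(sample).decode("utf-8"))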
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()
    return lxml.html.tostring(html)
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                           'img', 'area', 'map', 'noscript', 'td', 'tr',
                           'table', 'a', 'p', 'br', 'li', 'ul']
    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print(detail)
        return False
    return cleaner.clean_html(body).text_content().split()
def clean(self: T) -> str:
    cleaner = Cleaner()
    cleaner.style = self.__style
    cleaner.links = self.__links
    cleaner.page_structure = self.__page_structure
    cleaner.safe_attrs_only = self.__safe_attrs_only
    # allow_tags and remove_unknown_tags can't work together
    if self.__allow_tags is not None:
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = self.__allow_tags
    if self.__kill_tags is not None:
        cleaner.kill_tags = self.__kill_tags
    if self.__remove_tags is not None:
        cleaner.remove_tags = self.__remove_tags
    if self.__safe_attrs is not None:
        cleaner.safe_attrs = self.__safe_attrs
    self.__input = cleaner.clean_html(self.__input)
    return self.__input
def get_clean_text(filename):
    utf8_parser = html.HTMLParser(encoding='utf-8')
    htmltxt = lxml.html.parse(filename, parser=utf8_parser)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    htmltxt = cleaner.clean_html(htmltxt)
    txt = etree.tostring(htmltxt, encoding='unicode')
    txtresub = re.sub(r'<.+?>', ' ', txt)
    txtresub = re.sub(r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ', txtresub)
    return txtresub
def get_url(self):
    """Get the relevant part of a web page."""
    get_url = requests.get(self.data_path)
    page_data = get_url.text  # use str so the tag stripper can consume the result
    cleaner = Cleaner()
    cleaner.javascript = True  # Remove JavaScript code from HTML.
    cleaner.scripts = True  # Remove other code from HTML.
    cleaner.style = True  # Remove CSS and styles from HTML.
    cleaner.links = True  # Remove links from HTML.
    cleaner.kill_tags = ['a', 'img']  # Remove these tags entirely.
    # Store the cleaned-up HTML.
    page_html = cleaner.clean_html(page_data)
    # Strip tags from the final result.
    strip_tags = TagStripper()  # Instantiate the HTML tag stripper.
    strip_tags.feed(page_html)  # Strip all HTML tags.
    return strip_tags.get_html_data()
def get_url(self):
    """Get the HTML body of a web page."""
    # Create a file-like object in memory.
    outfile = io.StringIO()
    cleaner = Cleaner()
    cleaner.javascript = True  # Remove JavaScript code from HTML.
    cleaner.scripts = True  # Remove other code from HTML.
    cleaner.style = True  # Remove CSS and styles from HTML.
    cleaner.links = True  # Remove links from HTML.
    cleaner.kill_tags = ['a', 'img', 'li']  # Remove these tags entirely.
    # Store the cleaned-up HTML; tostring() returns bytes, so decode it.
    page_html = lxml.html.tostring(
        cleaner.clean_html(lxml.html.parse(self.data_path))
    ).decode('utf-8')
    outfile.write(page_html)  # Write the results to the in-memory file.
    return outfile
try:
    readline.read_history_file(histfile)
except IOError:
    pass

try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests

    resp = requests.get('http://en.wikipedia.org/')
    tree = document_fromstring(resp.text)
    raw = resp.text
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    html = cleaner.clean_html(tree)
    text_content = html.text_content()
except ImportError:
    pass

atexit.register(readline.write_history_file, histfile)
del os, histfile
def f_parse(args):
    def isAlphabet(word):
        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        for t in word:
            if t not in alphabet:
                return False
        return True

    loc = args[0]
    corpuses = args[1]
    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # True to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []
    for document in corpuses:
        # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)
                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()
                if len(parsed_text) > MINSIZE_CHARSDOC:
                    parsed_text = parsed_text.lower()
                    tokenizer = RegexpTokenizer(r'\w+')
                    # create stop-word lists per language
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')
                    # Create p_stemmer of class PorterStemmer
                    # p_stemmer = PorterStemmer()
                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)
                    # remove stop words from tokens
                    stopped_tokens = [
                        i for i in tokens
                        if i not in en_stop and i not in it_stop
                        and i not in sp_stop and i not in ge_stop
                        and i not in fr_stop
                    ]
                    for word in stopped_tokens:
                        if not any(char.isdigit() for char in word):
                            # check that the word uses only alphabet characters
                            if len(word) > 1 and isAlphabet(word):
                                ret.append(word)
            except Exception:
                print('Exception: empty or unparsable document')
    return [loc, ret]
def handle(self, **options):
    since = get_last_change()
    writer = get_writer()
    last_change = since
    while True:
        doc = {}
        changes = settings.db.changes(since=since)
        since = changes["last_seq"]
        if since != last_change:
            print("Detected {} new tasks".format(len(changes["results"])))
            print("=== changes ===")
            pprint(changes)
        for changeset in changes["results"]:
            try:
                doc = settings.db[changeset["id"]]
            except couchdb.http.ResourceNotFound:
                print("resource not found")
                continue
            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue
            print("indexing", doc["url"])

            # raw, html, text
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join(tree.xpath("//title/text()"))
            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False
            html = cleaner.clean_html(tree)
            text_content = html.text_content()
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))
            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )
            writer.commit()
            writer = get_writer()
        set_last_change(since)
        last_change = since
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
# HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
"//*[@class='meta' or @class='meta-before' or @class='asset-meta' or contains(@id, 'article-metadata') or contains(@class, 'article-metadata') or contains(@class, 'byline') or contains(@class, 'subline')]", "//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]", "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]", "//footer", "//*[@class='post-footer' or @class='footer' or @id='footer']", "//small", "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]", ] CLEANER = Cleaner() CLEANER.comments = False CLEANER.embedded = True CLEANER.forms = False CLEANER.frames = True CLEANER.javascript = True CLEANER.links = False CLEANER.meta = False CLEANER.page_structure = True CLEANER.processing_instructions = True CLEANER.remove_unknown_tags = False CLEANER.safe_attrs_only = False CLEANER.scripts = False CLEANER.style = True CLEANER.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video' ] # 'embed', 'figure', 'img', 'table' ## REGEX cache JSON_PATTERN = re.compile( r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})')
from testApp.models import *
from django.template.loader import get_template
from django.http import HttpResponse
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process
import re
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True  # True to activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception as e:
        return "Couldn't load url"
    return text
    the_page = json.loads(response.read())
    # print("cap response is " + str(the_page))
    if the_page['success'] != True:
        return jsonify(success=False)  # return empty object
except urllib2.URLError as e:
    return jsonify(success=False)  # return empty object

# the captcha is valid, so clean the user-supplied data
cleaner = Cleaner()
cleaner.javascript = True  # True to activate the javascript filter
cleaner.style = True  # True to activate the styles & stylesheet filter
cleaner.scripts = True
cleaner.links = True
cleaner.allow_tags = None
name = cleaner.clean_html(name)
phone = cleaner.clean_html(phone)
email = cleaner.clean_html(email)
message = cleaner.clean_html(message)

# build the email
newMess = mail.EmailMessage()
newMess.sender = "pizzaoptimization <*****@*****.**>"
newMess.subject = escape(strip_tags("Website Contact for tutoring: " + name))
newMess.to = "pizzaoptimization <*****@*****.**>"
newMess.body = escape(strip_tags(
    "Name: " + name + "\nemail: " + email +
    "\nphone: " + phone + "\nmessage: " + message))
PARSERCONFIG = {
    'PREFER_DAY_OF_MONTH': 'first',
    'PREFER_DATES_FROM': 'past',
    'DATE_ORDER': 'DMY'
}

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object',
    'picture', 'table', 'svg', 'video'
]  # 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):