def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.javascript = True   # activate the javascript filter
    cleaner.style = True        # activate the styles & stylesheet filter
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # lxml raises ValueError for unicode strings that carry an explicit
        # encoding declaration; such input must be passed in as bytes
        content = u""
    return content
def html2content(html,
                 allowed_tags=["a", "abbr", "article", "aside", "b", "base",
                               "blockquote", "body", "br", "caption", "cite",
                               "code", "col", "colgroup", "dd", "del", "dfn",
                               "dl", "dt", "em", "embed", "figcaption",
                               "figure", "footer", "h1", "h2", "h3", "h4",
                               "h5", "h6", "head", "header", "hgroup", "hr",
                               "html", "i", "img", "li", "map", "mark",
                               "math", "meta", "meter", "nav", "noscript",
                               "object", "ol", "optgroup", "option", "output",
                               "p", "param", "pre", "progress", "q", "rp",
                               "rt", "ruby", "s", "samp", "section", "small",
                               "source", "span", "strong", "sub", "sup",
                               "svg", "table", "tbody", "td", "th", "thead",
                               "tfoot", "time", "title", "tr", "track", "u",
                               "ul", "var", "video", "wbr"]):
    cleaner = Cleaner()
    cleaner.allow_tags = allowed_tags
    cleaner.remove_unknown_tags = False
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.style = True
    cleaner.embedded = False
    return cleaner.clean_html(html)
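# Illustrative usage of html2content above (not part of the original source);
# assumes Cleaner was imported from lxml.html.clean. clean_html() returns a
# string when given a string, and tags outside allowed_tags, such as <div>,
# are stripped while their text content is kept.
page = "<html><body><div><p>Hello <b>world</b></p></div></body></html>"
print(html2content(page))  # the <div> wrapper is dropped, <p> and <b> survive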
def get_clean_html(self, html_text, text_only=True):
    try:
        etree = lxml.html.document_fromstring(html_text)
        self._is_etree(etree)
        # enable filters to remove Javascript and CSS from the HTML document
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.html = True
        cleaner.page_structure = False
        cleaner.meta = False
        cleaner.safe_attrs_only = False
        cleaner.links = False
        html = cleaner.clean_html(etree)
        if text_only:
            return ' '.join(html.text_content().split())
        res = lxml.html.tostring(html)
    except Exception as e:
        logger.error(f"While parsing email in get_clean_html {e}")
        res = "junk"
    return res
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True
    # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if len(parsed_text) > MINSIZE_CHARSDOC:
            return parsed_text.lower()
        return None
    except Exception:
        return None
def clean_html(html_text, javascript=True, scripts=True, style=True,
               embedded=True, links=True, forms=True, frames=True,
               comments=True, annoying_tags=True, meta=True,
               safe_attrs_only=True, remove_unknown_tags=True,
               processing_instructions=True):
    """Strip javascript, styles, and the other flagged elements from the
    HTML, returning a string with only the remaining content.

    For every flag: True = remove, False = keep.
    """
    cleaner = Cleaner()
    cleaner.javascript = javascript
    cleaner.scripts = scripts
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep the page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
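# Hypothetical call of the clean_html helper above (not from the original
# source), assuming lxml is importable: keep anchors by disabling the links
# filter while the default flags strip the script.
raw = "<html><body><script>x()</script><a href='/a'>a link</a></body></html>"
print(clean_html(raw, links=False))
# roughly: b'<html><body><a href="/a">a link</a></body></html>'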
def convertHtmlToDicts(url, content):
    """
    given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers
    returns None, None on error
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s" % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)
    if "<html" not in content:
        return None, None
    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    # cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    # text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title
    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract
    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText,
                                            mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
def _get_cleaner(self, print_style, print_js, remove_tags):
    c = Cleaner()
    c.scripts = not print_js
    c.javascript = not print_js
    c.style = not print_style
    c.remove_tags = remove_tags
    c.page_structure = False
    return c
def remove_scripts_and_style(body):
    """
    All of the HTML files have fairly large blocks dedicated to style and
    script tags. This function cleans those large, useless blocks out of
    the files, typically shrinking them significantly.
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.page_structure = False
    return cleaner.clean_html(body)
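# A minimal sketch of calling remove_scripts_and_style (assumed usage, not in
# the original source). Cleaner.clean_html accepts either a string or a
# parsed tree and returns the same type it was given.
body = "<html><body><style>p {color: red}</style><p>kept text</p></body></html>"
print(remove_scripts_and_style(body))  # the <style> block is gone, <p> remains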
def remove_scripts(self):
    if not self.clean_js:
        logger.debug('Scripts will not be removed')
        self.parser_modified_content = False
        return
    cleaner = Cleaner()
    # don't modify the original page structure, e.g. <head>, <html>, <body>
    cleaner.page_structure = False
    # don't remove inline javascript
    cleaner.javascript = False
    # remove <script> tags
    cleaner.scripts = True
    self.modified_doc = cleaner.clean_html(self.doc)
    self.parser_modified_content = True
    logger.debug('Scripts were successfully removed')
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from the HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()
    return lxml.html.tostring(html)
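# Assumed usage (not part of the original; _is_etree presumably just
# validates its argument): this variant expects an already-parsed tree.
import lxml.html

tree = lxml.html.document_fromstring("<html><body><p>some text</p></body></html>")
print(get_clean_html(tree, text_only=True))  # -> 'some text'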
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                           'img', 'area', 'map', 'noscript', 'td', 'tr',
                           'table', 'a', 'p', 'br', 'li', 'ul']
    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print(detail)
        return False
    return cleaner.clean_html(body).text_content().split()
def clean(self: T) -> str:
    cleaner = Cleaner()
    cleaner.style = self.__style
    cleaner.links = self.__links
    cleaner.page_structure = self.__page_structure
    cleaner.safe_attrs_only = self.__safe_attrs_only
    # allow_tags and remove_unknown_tags can't be used together
    if self.__allow_tags is not None:
        cleaner.remove_unknown_tags = False
        cleaner.allow_tags = self.__allow_tags
    if self.__kill_tags is not None:
        cleaner.kill_tags = self.__kill_tags
    if self.__remove_tags is not None:
        cleaner.remove_tags = self.__remove_tags
    if self.__safe_attrs is not None:
        cleaner.safe_attrs = self.__safe_attrs
    self.__input = cleaner.clean_html(self.__input)
    return self.__input
def get_clean_text(filename):
    utf8_parser = html.HTMLParser(encoding='utf-8')
    htmltxt = lxml.html.parse(filename, parser=utf8_parser)
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    htmltxt = cleaner.clean_html(htmltxt)
    txt = etree.tostring(htmltxt, encoding='unicode')
    txtresub = re.sub(r'<.+?>', ' ', txt)
    txtresub = re.sub(r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ', txtresub)
    return txtresub
def get_content(self, pathName):
    try:
        with open(pathName, "r") as file:
            html_text = file.read()
    except OSError:
        print("Failed to open the file located in {}".format(pathName))
        return None
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.page_structure = False
        htmlData = cleaner.clean_html(html_text)
    except Exception:
        print("Could not remove style and js code from the file located in {}".format(pathName))
        return None
    soup = BeautifulSoup(htmlData, "lxml")
    return soup
try:
    readline.read_history_file(histfile)
except IOError:
    pass

try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests

    resp = requests.get('http://en.wikipedia.org/')
    tree = document_fromstring(resp.text)
    raw = resp.text
    # enable filters to remove Javascript and CSS from the HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    html = cleaner.clean_html(tree)
    text_content = html.text_content()
except ImportError:
    pass

atexit.register(readline.write_history_file, histfile)
del os, histfile
def f_parse(args):
    def isAlphabet(word):
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i',
                    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
                    'x', 'y', 'w', 'z', 'à', 'è', 'é', 'ì', 'í', 'ò', 'ó',
                    'ù', 'ú']
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)
                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()
                if len(parsed_text) > MINSIZE_CHARSDOC:
                    parsed_text = parsed_text.lower()
                    tokenizer = RegexpTokenizer(r'\w+')

                    # create the stop-word lists
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    # p_stemmer = PorterStemmer()

                    # clean and tokenize the document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if i not in en_stop]
                    stopped_tokens2 = [i for i in stopped_tokens1 if i not in it_stop]
                    stopped_tokens3 = [i for i in stopped_tokens2 if i not in sp_stop]
                    stopped_tokens4 = [i for i in stopped_tokens3 if i not in ge_stop]
                    stopped_tokens5 = [i for i in stopped_tokens4 if i not in fr_stop]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                # check that the word only has alphabet characters
                                if isAlphabet(word):
                                    ret.append(word)
            except Exception:
                print('Exception: document empty')
    return [loc, ret]
def handle(self, **options):
    since = get_last_change()
    writer = get_writer()
    last_change = since
    while True:
        doc = {}
        changes = settings.db.changes(since=since)
        since = changes["last_seq"]
        if since != last_change:
            print("Detected {} new tasks".format(len(changes)))
            print("=== changes ===")
            pprint(changes)
        for changeset in changes["results"]:
            try:
                doc = settings.db[changeset["id"]]
            except couchdb.http.ResourceNotFound:
                print("resource not found")
                continue
            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                last_change = since
                continue
            print("indexing", doc["url"])

            # raw, html, text
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join([title for title in tree.xpath("//title/text()")])

            # enable filters to remove Javascript and CSS from the HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False
            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            description = " ".join(tree.xpath("//meta[@name='description']/@content"))
            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )
            writer.commit()
            writer = get_writer()
        set_last_change(since)
        last_change = since
LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
# HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
"//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]", "//footer", "//*[@class='post-footer' or @class='footer' or @id='footer']", "//small", "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]", ] CLEANER = Cleaner() CLEANER.comments = False CLEANER.embedded = True CLEANER.forms = False CLEANER.frames = True CLEANER.javascript = True CLEANER.links = False CLEANER.meta = False CLEANER.page_structure = True CLEANER.processing_instructions = True CLEANER.remove_unknown_tags = False CLEANER.safe_attrs_only = False CLEANER.scripts = False CLEANER.style = True CLEANER.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video' ] # 'embed', 'figure', 'img', 'table' ## REGEX cache JSON_PATTERN = re.compile( r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})') # use of regex module for speed GERMAN_PATTERN = regex.compile(
from django.http import HttpResponse
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process
import re
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception:
        return "Couldn't load url"
    return text


def index(request):
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import lxml
from lxml.html.clean import Cleaner
# import json
# import codecs
from peewee import *
import datetime

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.comments = True
cleaner.allow_tags = False
cleaner.links = False
cleaner.page_structure = False

db = MySQLDatabase('scrap', user='******', passwd='')


class BaseModel(Model):
    """ Base peewee DB model """
    class Meta:
        database = db


class Texts(BaseModel):
    title = CharField()
    link = CharField()
def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True   # activate the javascript filter
    cleaner.style = True        # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    # invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except ValueError:
        # lxml raises ValueError for unicode strings that carry an explicit
        # encoding declaration; such input must be passed in as bytes
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)  # remove NL
    # page8 = re.sub(u'\s', '', page8, re.UNICODE)  # blanks -> space
    page8 = re.sub(u'\r', ' ', page8)  # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]', ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)  # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><', page8)  # drop runs under 10 chars between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)
    page8 = re.sub(u'\s+', ' ', page8)  # squash repeated blanks

    # XPaths
    xpath = '//*[((p) or (a) or (b) or (div) or (span))]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'

    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()

    content = u""
    if text:
        for s in text:
            # squash duplicate whitespace
            s = ' '.join(s.split())
            # drop short lines: empirically, sentences under 40 chars are
            # rarely relevant article text (repeated titles, authors, dates, etc.)
            if len(s) < 40:
                continue
            # remove leading whitespace
            # if s.endswith(" "): s = s[:-1]
            if s.startswith(" "):
                s = s[1:]
            content += s
            content += "\n"
    return content