def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list  # unwrap these tags but keep their text
    html_cleaner.kill_tags = reject_list    # drop these tags together with their content
    return html_cleaner
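# Usage sketch for the helper above (added for illustration, not from the original
# source); it assumes `from lxml.html.clean import Cleaner` is already in scope.
# Cleaner.clean_html() accepts either a markup string or a parsed element and
# returns the same kind of object it was given, so the configured cleaner can be
# applied directly to raw HTML text. In recent lxml releases the cleaner lives in
# the separate lxml_html_clean package (installable via the lxml[html_clean] extra).
fragment_cleaner = cleaner_parameters()
sample = '<div><script>alert(1)</script><p>Hello <b>world</b></p></div>'
print(fragment_cleaner.clean_html(sample))  # kill_tags dropped with content, remove_tags unwrapped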
def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.remove_unknown_tags = False  # required when an explicit allow_tags list is set
    cleaner.allow_tags = POSITIVE_K
    cleaner.safe_attrs_only = True
    try:
        html = lxml.html.fromstring(html, base_url=url)
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        # fromstring() raises ValueError for str input with an encoding declaration
        doc = soup_extractor(html, url)
    #~ (title, doc, article, text) = read_extractor(html, url)
    #~ doc = (self.doc).replace(unichr(160), " ")
    #~ doc = re.sub(spaces, "", self.doc)
    return doc
def clean_html(html_text,
               javascript=True, scripts=True, style=True, embedded=True,
               links=True, forms=True, frames=True, comments=True,
               annoying_tags=True, meta=True, safe_attrs_only=True,
               remove_unknown_tags=True, processing_instructions=True):
    """Clean all the javascript and styles from the HTML, returning the string
    with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # activate the javascript filter
    cleaner.scripts = scripts        # activate the <script> filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False   # keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions

    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
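# Usage sketch (illustration only, not part of the original snippet). Because the
# docstring's convention is "True = Remove | False = Keep", passing a keyword as
# False preserves that part of the document; the return value is the bytes output
# of lxml.html.tostring().
raw = '<html><head><style>p {color: red}</style></head><body><p onclick="x()">Hi</p></body></html>'
print(clean_html(raw))               # strips the <style> block and the unsafe onclick attribute
print(clean_html(raw, style=False))  # same call, but the <style> block is kept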
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if len(parsed_text) > MINSIZE_CHARSDOC:
            return parsed_text.lower()
        else:
            return None
    except:
        return None
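# Minimal usage sketch (not from the original source); it assumes the lxml.html and
# BeautifulSoup imports used by html2text() are in scope. MINSIZE_CHARSDOC is a
# module-level threshold assumed by the function above; the value below is only a
# placeholder for illustration.
MINSIZE_CHARSDOC = 100

text = html2text('<html><body><p>' + 'Lorem ipsum dolor sit amet. ' * 20 + '</p></body></html>')
print(text)  # lower-cased plain text, or None if the page is shorter than the threshold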
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except:
        # error: ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fr
        content = u""
    return content
def get_data(self):
    html_body = self._by_pass_get_html(self.url)
    cleaner = Cleaner()
    cleaner.comments = True
    doc = cleaner.clean_html(html_body)
    return doc
def cleaned_html(self):
    # Try to parse the provided HTML string using lxml
    # strip all unnecessary information to save space
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True
    cleaner.comments = True
    cleaner.style = True
    self.dom = cleaner.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
def cleaned_html(htmlString):
    # Try to parse the provided HTML string using lxml
    # strip all unnecessary information to save space
    cleaner = Cleaner()
    cleaner.comments = True
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.style = True
    htmlString = cleaner.clean_html(htmlString)
    return htmlString
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    stuff = lxml.html.tostring(cleaner.clean_html(data))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val,
                               map(lambda x: x.strip(), soup.findAll(text=True))))
    return all_text
def crawNews(self, url):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.comments = True

    tech_content = lxml.html.parse(url)
    tech_content = lxml.html.tostring(tech_content)

    re_title = re.compile(r'<h1.*>(.*)</h1', re.S)
    re_content = re.compile(
        r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->',
        re.S)
    re_published = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
    re_author = re.compile(
        r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')

    match_title = re.search(re_title, tech_content)
    match_content = re.search(re_content, tech_content)
    match_date = re.search(re_published, tech_content)
    match_author = re.search(re_author, tech_content)

    author_url = "http://techcrunch.com" + match_author.group(1)
    author_name = match_author.group(2)
    author_twitter = match_author.group(3)

    title = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_title.group(1)))
    title = re.sub(r'\s+', ' ', title)
    title = title.decode('utf-8').strip()

    content = re.sub(r'<[^>]*?>', '', cleaner.clean_html(match_content.group(1)))
    content = re.sub(r'\s+', ' ', content)
    content = content.decode('utf-8').strip()
    content = content.strip('\n')

    published_on = datetime.datetime.strptime(match_date.group(1), '%Y-%m-%d %H:%M:%S')
    news = self.save_news(url, title, content, published_on)

    author = self.findAuthorByUrl(author_url)
    if not isinstance(author, Author):
        author = self.save_author(author_url, author_name, author_twitter, '')
    self.newsAuthor(news, author)
def clean_html(self):
    """
    Cleaner removes HTML tags prior to processing.
    Note: cleaning removes the Title tags from HTML.
    Do not clean before grabbing titles!
    """
    if len(self.response.content):
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.scripts = True
        cleaner.style = True
        cleaner.comments = True
        try:
            return html.fromstring(cleaner.clean_html(self.response.content))
        except Exception as e:
            logging.error(e)
            return None
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)

    stuff = lxml.html.tostring(cleaner.clean_html(root))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(filter(lambda val: val,
                               map(lambda x: x.strip(), soup.findAll(text=True))))
    return all_text.encode('ascii', 'ignore')
def extract_content(bytehtml, doc):
    """
    extracts blog post content from html
    """
    lxmldoc = lxml.html.document_fromstring(bytehtml)
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.style = True
    #cleaner.page_structure = True
    cleaner.kill_tags = ['head', 'noscript']
    cleaner.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    cleaner(lxmldoc)

    content_el = find_content_element(lxmldoc)
    if content_el:
        debug(3, 'content quality {}'.format(content_el._quality))
        text = tidy_content(content_el.text_content())
        return text
    else:
        debug(2, 'no content found!')
        raise Exception('no content')
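# Side note (illustration, not from the original source): a Cleaner instance is
# callable, so cleaner(doc) cleans a parsed tree in place, as extract_content()
# does above, while cleaner.clean_html(doc) works on a copy (or on a raw string)
# and returns the cleaned result. A minimal sketch:
import lxml.html
from lxml.html.clean import Cleaner

doc = lxml.html.document_fromstring('<html><body><script>x()</script><p>kept</p></body></html>')
inplace_cleaner = Cleaner(scripts=True, comments=True)
inplace_cleaner(doc)            # mutates doc directly
print(lxml.html.tostring(doc))  # <script> is gone, <p>kept</p> remains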
def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    # invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except:
        # error: ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fr
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)            # remove NL
    # page8 = re.sub(u'\s', '', page8, re.UNICODE)  # blanks -> space
    page8 = re.sub(u'\r', ' ', page8)            # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)    # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]', ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)       # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><', page8)  # remove words under 10 chars between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)
    page8 = re.sub(u'\s+', ' ', page8)           # remove repeated blanks

    # XPATHs (the second expression is the one actually used)
    xpath = '//*[((p) or (a) or (b) or (div) or (span)) ]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'
    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()

    content = u""
    if text:
        for s in text:
            # squash duplicate whitespaces
            s = ' '.join(s.split())
            # remove short lines: on empirical analysis, no unfrequent sentence under 40 chars is a
            # relevant part of the article text, excluding repetition of title, authors, dates, etc.
            if len(s) < 40:
                continue
            # remove leading whitespace
            #if s.endswith(" "): s = s[:-1]
            if s.startswith(" "):
                s = s[1:]
            content += s
            content += "\n"
    return content
def f_parse(args):
    def isAlphabet(word):
        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)
                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()
                if len(parsed_text) > MINSIZE_CHARSDOC:
                    parsed_text = parsed_text.lower()
                    tokenizer = RegexpTokenizer(r'\w+')

                    # create stop words lists
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    #p_stemmer = PorterStemmer()

                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if not i in en_stop]
                    stopped_tokens2 = [i for i in stopped_tokens1 if not i in it_stop]
                    stopped_tokens3 = [i for i in stopped_tokens2 if not i in sp_stop]
                    stopped_tokens4 = [i for i in stopped_tokens3 if not i in ge_stop]
                    stopped_tokens5 = [i for i in stopped_tokens4 if not i in fr_stop]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                # check if the word has only alphabet characters
                                if isAlphabet(word):
                                    ret.append(word)
            except:
                print('Exception : Document empty')
    return [loc, ret]
import lxml
from lxml import etree
from lxml.html.clean import autolink_html
from lxml.html.clean import Cleaner

# LXML PARSER
cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True       # activate the styles & stylesheet filter
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = True
cleaner.frames = True
#cleaner.safe_attrs_only = True

import re
notalpha = re.compile('[^a-zA-Z]')

# BS PARSER
from bs4 import BeautifulSoup as bs
from bs4 import Comment


def make_links_absolute(soup, url):
    return [urlparse.urljoin(url, tag['href'])
            for tag in soup.findAll('a', href=True)]


def clean_html(soup):
    soup = bs(" ".join([s.extract() for s in soup('script')]))
    soup = bs(" ".join([s.extract() for s in soup('iframe')]))
    soup = bs(" ".join([s.extract() for s in soup('form')]))
    soup = bs(" ".join([s.extract() for s in soup('embed')]))
    soup = bs(" ".join([s.extract() for s in soup('style')]))
import logging

from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED
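# Usage sketch (illustration only): the module-level HTML_CLEANER above is meant
# to be applied to an already-parsed tree elsewhere in the package; a stand-alone
# call could look like this. The lxml.html import is an assumption here.
import lxml.html

tree = lxml.html.fromstring('<html><body><!-- ad --><p id="x">text</p></body></html>')
cleaned = HTML_CLEANER.clean_html(tree)  # only the filters set to True above (e.g. comments) fire
print(lxml.html.tostring(cleaned))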
DATE_EXPRESSIONS = [
    "//*[contains(@id, 'date') or contains(@id, 'Date') or contains(@id, 'datum') or contains(@id, 'Datum') or contains(@id, 'time') or contains(@class, 'post-meta-time')]",
    "//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]",
    "//*[contains(@class, 'postmeta') or contains(@class, 'post-meta') or contains(@class, 'entry-meta') or contains(@class, 'postMeta') or contains(@class, 'post_meta') or contains(@class, 'post__meta')]",
    "//*[@class='meta' or @class='meta-before' or @class='asset-meta' or contains(@id, 'article-metadata') or contains(@class, 'article-metadata') or contains(@class, 'byline') or contains(@class, 'subline')]",
    "//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]",
    "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]",
    "//footer",
    "//*[@class='post-footer' or @class='footer' or @id='footer']",
    "//small",
    "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]",
]

CLEANER = Cleaner()
CLEANER.comments = False
CLEANER.embedded = True
CLEANER.forms = False
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.links = False
CLEANER.meta = False
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.style = True
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
        E.BODY(
            E.H1(E.CLASS("heading"), title),
            lxml.html.fromstring(html)
        )
    )
    html_out.getroottree().write(file="summarized-roanoke.html", method="html")


if __name__ == "__main__":
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    cleaner.frames = True
    cleaner.meta = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.style = True
    cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"]

    url = "https://en.wikipedia.org/wiki/Roanoke_Colony"
    doc = urllib2.urlopen(url)
    tree = lxml.html.parse(doc)
    title = tree.find(".//title").text
    tree = cleaner.clean_html(tree)

    netloc = urlparse(url).netloc
    if netloc == "en.wikipedia.org":
        parse_wiki(tree, title)