def sanitize_payload(payload): "Sanitize HTML" if not payload: return '', '' styles = [] payload = clean_payload(payload) body_style, body_class = get_body_style(payload) if body_style: styles.append(body_style) safe_attrs = set(defs.safe_attrs) safe_attrs.add('style') cleaner = Cleaner(remove_tags=UNCLEANTAGS, safe_attrs_only=True, safe_attrs=safe_attrs) payload = HTMLTITLE_RE.sub('', payload) try: html = cleaner.clean_html(payload) except ValueError: payload = bytes(bytearray(payload, encoding='utf-8')) html = cleaner.clean_html(payload) except XMLSyntaxError: html = '' mainstyle = sanitize_css(get_style(html)) if mainstyle: styles.append(decode(mainstyle)) style = u'\n'.join(styles) html = clean_styles(CSS_COMMENT_RE.sub('', html)) html = set_body_class(html, body_class) return html.strip(), style.strip()
def _get_breakdowns(self): """ returns breakdowns from GWDG in given timewindow """ #load feed first, since not working with lxml directly r = requests.get(URL) #load url and parse it with html parser root = lxml.etree.fromstring(r.text.encode("utf-8")) #get items items = [] for x in root.findall("channel/item"): pubdate = datetime.datetime.fromtimestamp( email.utils.mktime_tz( email.utils.parsedate_tz( x.find("pubDate").text[:-6] ) ) ) if pubdate >= OLDEST_NEWS: cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False) title = cleaner.clean_html(x.find("title").text)[5:-6] content = cleaner.clean_html(x.find("description").text)[5:-6] item = { "title" : title, "pubdate" : str(pubdate), "content" : content, } items.append(item) return sorted(items, key=lambda x: x["pubdate"], reverse=True)
def merge_docx(docx_list=None, out_htmlpath=None): """ docx_list is a list of strings which contains the (absolute) path of DOC/DOCX files to be merged. MERGE_DOCX() will follow the index order of docx_list for appending. Returns the HTML file as string. If OUT_HTMLPATH is given, write the HTML file out as well. """ if docx_list is None: return None cleaner = Cleaner() parser = HTMLParser(encoding='utf-8') html_list = [] for path in docx_list: try: tmp_html = PyDocX.to_html(path) html_list.append(cleaner.clean_html(lxml.html.fromstring(tmp_html, parser=parser))) except: #'MalformedDocxException' try: # Pretend it is a html html_file = '{}.html'.format(path) with open(html_file, 'rb') as tmp: tmp_html = tmp.read() tmp_html = tmp_html.decode('utf-8') html_list.append(cleaner.clean_html(lxml.html.fromstring(tmp_html, parser=parser))) except: # Cannot convert continue #print html_list if len(html_list)>1: #Append element at the end of first body main_body = html_list[0].xpath('./body')[0] for tree in html_list[1:]: elem_list = tree.xpath('./body/*') for elem in elem_list: main_body.append(elem) elif len(html_list)==1: main_body = html_list[0].xpath('./body')[0] else: try: main_body = html_list[0].xpath('./body')[0] except IndexError: # no body content. Most likely just an image/appendix return None # Convert ElementTree back to string # in this way we will lose the 'style' info in html_list[0][0], which is usually in header, # but not sure if it will cause any differences to parser later on. Probably not. html_str = lxml.etree.tostring(main_body) if out_htmlpath is not None: with open(out_htmlpath, 'wb') as tmp: tmp.write(html_str.encode('utf-8')) return html_str
def readable(self, html, url=None): self.url = url html = self.smart_decode(html) cleaner = Cleaner(page_structure=False, add_nofollow=True, style=True, safe_attrs_only=True) html = cleaner.clean_html(html) tree = lxml.html.fromstring(html) body = tree.xpath("//body")[0] article = self.grab_article(body) return cleaner.clean_html(article)
def validate(self, value): cleaner = Cleaner() cleaner.javascript = True cleaner.scripts = True cleaner.frames = True cleaner.remove_tags = ["p", "div", "a"] data["username"] = (lxml.html.document_fromstring(cleaner.clean_html(data["username"]))).text_content() data["storename"] = (lxml.html.document_fromstring(cleaner.clean_html(data["storename"]))).text_content() data["email"] = (lxml.html.document_fromstring(cleaner.clean_html(data["email"]))).text_content() # data['username']= cleaner.clean_html(data['username']) # data['storename']= cleaner.clean_html(data['storename']) # data['email']= cleaner.clean_html(data['email']) return data
def create_plaintext_message(message): """ Create clean plain text version of email message Parse the html and remove style and javacript tags and then create a plain-text-message by parsing the html and attaching links as endnotes """ cleaner = Cleaner() cleaner.javascript = True cleaner.style = True cleaner.kill_tags = ['style'] doc = message.decode('utf-8', 'ignore') to_clean = lxml.html.fromstring(doc) cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean)) plain_text_maxcols = 72 textout = cStringIO.StringIO() formtext = formatter.AbstractFormatter(formatter.DumbWriter( textout, plain_text_maxcols)) parser = HTMLParser(formtext) parser.feed(cleaned_msg) parser.close() # append the anchorlist at the bottom of a message # to keep the message readable. counter = 0 anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n" for item in parser.anchorlist: counter += 1 if item.startswith('https://'): new_item = item.replace('https://', 'http://') else: new_item = item anchorlist += "[%d] %s\n" % (counter, new_item) text = textout.getvalue() + anchorlist del textout, formtext, parser, anchorlist return text
def sanitize_html(html, bad_tags=['body']): """Removes identified malicious HTML content from the given string.""" if html is None or html == '': return html cleaner = Cleaner(style=False, page_structure=True, remove_tags=bad_tags, safe_attrs_only=False) return cleaner.clean_html(html)
def sanitize(html): if not html: return html cleaner = Cleaner(allow_tags=_safe_tags, safe_attrs_only=True, safe_attrs=_safe_attrs, remove_unknown_tags=False) html = autolink_html(cleaner.clean_html(html)) parts = re.split('(<.*?>)', html) output = '' in_a_tag = False for part in parts: if not len(part): continue is_tag = part[0] == '<' if is_tag or in_a_tag: output += part if part[0:2].lower() == '<a': in_a_tag = True elif part[0:3].lower() == '</a': in_a_tag = False continue part = re.sub("([a-zA-Z0-9_\\-+\\.\']*[a-zA-Z0-9]@[0-9a-zA-Z\\-\\.]+\\.[a-zA-Z]{2,})", '<a href="mailto:\\1">\\1</a>', part) # After linking up emails, only look for twitter in the remaining parts sub_parts = re.split('(<.*?>)', part) part = '' for sub_part in sub_parts: part += re.sub("(?<![a-zA-Z0-9])@([0-9a-zA-Z_]{1,15})", '<a href="https://twitter.com/\\1">@\\1</a>', sub_part) output += part return output
def get_intro_text(text): """ Returns only the first <p> tag and preceding nodes """ #cut the text to the first paragraph index = text.lower().find('</p>', 1000) if index != -1: text = text[:index] +'</p>' cleaner = Cleaner( scripts=False, javascript=False, comments=False, style=False, links=False, meta=False, page_structure=False, processing_instructions=False, embedded=False, forms=False, remove_unknown_tags=True, ) text = cleaner.clean_html(text) return text
def lxml_extractor(html, url): '''LXML PARSER''' cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True # This is True because we want to activate the styles & stylesheet filter cleaner.comments = True cleaner.embedded = True cleaner.forms= True cleaner.frames = True cleaner.annoying_tags = True cleaner.kill_tags = NEGATIVE_K cleaner.allow_tag = POSITIVE_K cleaner.safe_attrs_only = True #~ oc = document_fromstring(html, parser=parser, base_url=base_url, **kw) #~ File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 752, in document_fromstring #~ value = etree.fromstring(html, parser, **kw) try: html = lxml.html.fromstring(html, base_url="url") tree = cleaner.clean_html(html) #tree.make_links_absolute(url) doc = lxml.html.tostring(tree) doc = soup_extractor(doc, url) except ValueError: doc = soup_extractor(html, url) #~ (title, doc, article, text) = read_extractor(html, url) #~ print title #~ doc = (self.doc).replace(unichr(160), " ") #~ doc = re.sub(spaces,"",self.doc) return doc
def strip_comments__lxml(html_string=""): if not html_string: return html_string params = { 'comments': True, 'scripts': False, 'javascript': False, 'style': False, 'links': False, 'meta': False, 'page_structure': False, 'processing_instructions': False, 'embedded': False, 'frames': False, 'forms': False, 'annoying_tags': False, 'remove_tags': None, 'allow_tags': None, 'remove_unknown_tags': True, 'safe_attrs_only': False, } try: cleaner = Cleaner(**params) html = lxml.html.fromstring(html_string) clean_html = cleaner.clean_html(html) return lxml.etree.tostring(clean_html) except (XMLSyntaxError, ParserError): return html_string
def __init__(self, input): self.title = input.get('post_title') self.content = input.get('post_content') self.category = input.get('post_category') self.is_public = input.get('post_is_public') if self.is_public: self.is_public = True else: self.is_public = False if self.category not in config.get('post_categories'): raise exceptions.CantValidateForm if self.title: # strip markup html_string = lxml.html.fromstring(self.title) self.title = unicode(html_string.text_content()) else: self.title = '' if self.content: # clean markup cleaner = Cleaner(**post_rules) self.content = cleaner.clean_html(self.content) # replace newlines self.content = self.content.replace('\r\n', '<br />') else: raise exceptions.CantValidateForm
def truncate(content, max_length=DEFAULT_TRUNCATE_LENGTH, allowed_tags=ALLOWED_TAGS, full_link=None): """ truncate a body of text to the expected 'max_length' and strip the body of text of all html tags that are not in 'allowed tags'. You can also specify a 'strip' value (True -> strip html tags, False -> escape html tags and leave them in text) """ if not content: return '' cleaner = Cleaner( page_structure=False, links=True, safe_attrs_only=True, remove_unknown_tags=False, allow_tags=allowed_tags ) content = defaultfilters.truncatechars_html(cleaner.clean_html(content), max_length) if full_link: try: insert_point = content.rindex('</p>') except ValueError: insert_point = content.rindex('<') ending = content[insert_point:] content = content[:insert_point] content += ' <a href="' + full_link + '">(Read More)</a>' + ending return content
def clean_html(html, safe_attrs=('src', 'href'), input_encoding='unicode', output_encoding='unicode', **kwargs): """ Fix HTML structure and remove non-allowed attributes from all tags. """ from lxml.html.clean import Cleaner # Convert HTML to Unicode html = render_html(parse_html(html, encoding=input_encoding), make_unicode=True) # Strip some shit with default lxml tools cleaner = Cleaner(page_structure=True, **kwargs) html = cleaner.clean_html(html) # Keep only allowed attributes tree = parse_html(html) for elem in tree.xpath('./descendant-or-self::*'): for key in elem.attrib.keys(): if safe_attrs: if key not in safe_attrs: del elem.attrib[key] return render_html(tree, encoding=output_encoding)
def parse(self, response): item = JournalItem() base_url = "http://journals.ametsoc.org" journalTitle = response.xpath('//*[@id="journalBlurbPanel"]/div[2]/h3/text()').extract_first() item['title'] = journalTitle journalIssue = response.xpath('//*[@id="articleToolsHeading"]/text()').extract_first().strip() # remove whitespace at start and end item['issue'] = journalIssue # setup html cleaner to strip html tags from string (journal titles often use sub/superscript and splits article title) html_cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False) journalDescription = response.xpath('//*[@id="journalBlurbPanel"]/div[4]').extract() journalDescription = "".join(journalDescription) journalDescription = html_cleaner.clean_html(journalDescription)[5:-6] # remove any html tags and then trim the <div> tags that the cleaner inserts journalDescription = removeNewlines(journalDescription) # remove any \n\r\t characters journalDescription = journalDescription.strip() item['description'] = journalDescription coverImage = response.xpath('//*[@id="smallIssueCover"]/img/@src').extract_first().strip() print(coverImage) item['coverURL'] = base_url + coverImage yield item
def clean_article_html(cls, node): article_cleaner = Cleaner() article_cleaner.javascript = True article_cleaner.style = True article_cleaner.allow_tags = ['a', 'span', 'p', 'br', 'strong', 'b', 'em'] article_cleaner.remove_unknown_tags = False return article_cleaner.clean_html(node)
def parse(self, content): """Clean and parse HTML content.""" cleaner = Cleaner(style=True, links=False, page_structure=False, meta=True, safe_attrs_only=False, remove_unknown_tags=False) clean_content = cleaner.clean_html(content) html = etree.iterparse(StringIO(clean_content), events=("start", "end")) level = -1 css = '' # We do not want to style these elements. ignore_tags = ['html', 'body', 'head', 'meta', 'title', 'script'] if self.options.delimiter == 'spaces': delimiter = ' ' else: delimiter = '\t' for action, elem in html: if (action == 'start'): identifier = self.identify_ruleset(elem) if elem.tag not in ignore_tags: level += 1 css += delimiter * level + identifier + ' {\n' if not self.options.clean_mode: css += delimiter + delimiter * level + '/* enter your CSS here... */\n' else: if elem.tag not in ignore_tags: css += delimiter * level + '}\n' level -= 1 return css.strip()
def get_content(self, site): sel = None if site.id_type == "css": # translates csspath into xpath s = CSSSelector(site.identifier) sel = s.path else: sel = site.identifier try: page = requests.get(site.url) parser = le.HTMLParser() tree = le.parse(StringIO(page.text), parser) xp = tree.xpath(sel) if len(xp) < 1: return None html = lxml.html.tostring(xp[0]) cleaner = Cleaner(style=True, links=False, page_structure=False, embedded=False, frames=False, forms=False) cleaned_html = cleaner.clean_html(html) self._print("Cleaning html: " + str(len(html)) + " -> " + str(len(cleaned_html))) return cleaned_html except Exception as e: self._print("EXCEPTION! " + str(e.message)) return None
def gettextonly(self, tree): cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False) try: v = tostring(tree,method='text',encoding=unicode) except: v = None if v == None: c = lxml.html.tostring(tree) print 'v== null' # resulttext = '' # for t in c: # subtext = self.gettextonly(t) # resulttext += subtext + '\n' # return resulttext return c else: # Clean up the javascript and comment. try: v = cleaner.clean_html(v) except: # Ignore clean error pass return v.strip()
def getFormatHtml(htmlContent): try: dom = soupparser.fromstring(htmlContent) except Exception, e: cleaner = Cleaner() htmlContent = cleaner.clean_html(htmlContent) doc = soupparser.fromstring(htmlContent)
def handle_item(path): # url="http://news.39.net/"+path.split("/root/39_data/news.39.net/")[1] flag,title,text=False,"","" try: # request=requests.get(url,proxies=get_proxy(),timeout=5) # if request.status_code!=200: raise with open(path,"r") as file: content=file.read() html=lxml.html.fromstring(content.decode("gbk")) try: if re.search("utf",html.xpath("//meta/@charset")[0]): html=lxml.html.fromstring(r.content.decode("utf-8")) except: pass try: if len(html.xpath("//div[@class='art_box']/h1/text()"))>0: title=html.xpath("//div[@class='art_box']/h1/text()")[0] else: title=html.xpath("//div[@class='artbox']/h1/text()")[0] except: title="" print("title:%s"%title) if len(html.xpath("//div[@id='contentText']"))>0: div1=html.xpath("//div[@id='contentText']")[0] elif len(html.xpath("//div[@class='article']"))>0: div1=html.xpath("//div[@class='article']")[0] else: raise cleaner = Cleaner(scripts = True) for p in div1.xpath("./p"): p=cleaner.clean_html(p) try: text+=p.text_content().strip()+"\n" except: pass print("text:%s"%text) flag=True except Exception,e: print(e)
class HTMLSanitiser: def __init__(self): self.Cleaner = Cleaner(scripts = False, javascript = False, comments = False, links = False, meta = True, page_structure = False, processing_instructions = False, embedded = False, frames = False, forms = False, annoying_tags = False, remove_unknown_tags = False, safe_attrs_only = True, allow_tags=ALLOWED_TAGS) #self.Cleaner = Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False) def IsValidURL(self, URL): ParsedURL = urlparse(URL) return (ParsedURL.scheme in ALLOWED_URL_SCHEMES) def CleanURLs(self, HTML): # Largely Inspired from: http://stackoverflow.com/questions/5789127/how-to-replace-links-using-lxml-and-iterlinks ParsedHTML = lxml.html.document_fromstring(HTML) #print dir(ParsedHTML) #'iter', 'iterancestors', 'iterchildren' #for Element in ParsedHTML.iterchildren(): # print dir(Element) # print Element.tag for Element, Attribute, Link, Pos in ParsedHTML.iterlinks(): if not self.IsValidURL(Link): Element.set(Attribute, Link.replace(Link, '')) return lxml.html.tostring(ParsedHTML) def CleanThirdPartyHTML(self, HTML): # 1st clean URLs, 2nd get rid of basics, 3rd apply white list return self.Cleaner.clean_html(clean_html(self.CleanURLs(HTML))) def TestPrint(self, TestInfo, TestOutput): TestInfo += "_" * (60 - len(TestInfo)) # Make info visually easier to compare print TestInfo + TestOutput
def html_cleanup(input): cleaner = Cleaner( scripts = True, javascript = True, comments = True, style = False, links = True, meta = True, page_structure = True, processing_instructions = True, embedded = False, frames = False, forms = True, annoying_tags = True, allow_tags = ['a', 'img', 'span', 'div', 'p', 'br', 'iframe', # for google cal 'strong', 'em', 'b', 'i', 'u', 'strike', 'blockquote', 'sub', 'sup', 'ul', 'ol', 'li', 'table', 'tdata', 'tr', 'th', 'td', 'h1', 'h2', 'h3', 'h4'], remove_unknown_tags = False, safe_attrs_only = True, host_whitelist = ['youtube.com', 'www.google.com'], whitelist_tags = ['iframe', 'embed', 'script', 'img'] ) sane = cleaner.clean_html("<div>%s</div>"%input) return sane[len('<div>'):-len('</div>')]
def _load(self): """ Load the ElementTree from the source """ # Convert directional quotation marks to regular quotes double_quotes = ur'[\u201c\u201d]' self.source = re.sub(double_quotes, u'"', self.source) single_quotes = ur'[\u2019\u2018]' self.source = re.sub(single_quotes, u"'", self.source) # Convert colons self.source = self.source.replace(u'\uff1a', u':') # Remove line breaks and tabs self.source = self.source.replace(u'\n', u'') self.source = self.source.replace(u'\t', u'') # There are also some "zero width joiners" in random places in the text # Should remove them here, since they make string search unreliable # these are the codes: ‍,   (nbsp), \xa0 (nbsp), \u200d zero_width_joiners = u'\u200d' self.source = self.source.replace(zero_width_joiners, u'') # Also previously had some non breaking spaces in unicode \u00a0, but this # may have been fixed by changing the parser below # Use the lxml cleaner cleaner = Cleaner() parser = HTMLParser(encoding='utf-8') # Finally, load the cleaned string to an ElementTree self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
def _statistica_(url_string): """Implementa la logica per estrarre documento e metadati da rivista-statistica """ url = urlparse.urlparse(url_string) conn = httplib.HTTPConnection(url.hostname) conn.request("GET", url.path) res = conn.getresponse() body = res.read() my_page = html.fromstring(body) # Rimuovi il banner dei cookie del ***** for el in my_page.xpath('//*[@id="cookiesAlert"]'): el.getparent().remove(el) # Rimuovi tutti i tag script e il loro contenuto cleaner = Cleaner() cleaner.javascript = True my_page = cleaner.clean_html(my_page) title = my_page.xpath('//*[@id="articleTitle"]/h3') full_content = my_page.xpath('//*[@id="content"]') doi = my_page.xpath('//*[@id="pub-id::doi"]') full_content = ''.join( [etree.tostring(fix_links(el, url_string)) for el in full_content]) result = { 'title': title[0].text_content(), 'content': full_content, 'doi': doi[0].text_content() } return json.JSONEncoder().encode(result)
def visit(url): if url.startswith(base_url) == False: return try: resp = urlopen(url) except URLError as e: return page = resp.read() cleaner = Cleaner() cleaner.javasript = True cleaner.style = True cleaner.kill_tags = ELEMENTS_TO_IGNORE # soup = BeautifulSoup(page, "lxml") # for link in soup.findAll('a'): # if link.has_attr('href'): # if link.has_attr('class') and 'history' in link['class']: # continue # next_link = urljoin(url,link['href']) # next_link = urldefrag(next_link)[0] # if next_link not in visited_pages: # visited_pages.append(next_link) # pages_to_visit.append(next_link) clean_page = cleaner.clean_html(page) soup = BeautifulSoup(clean_page, "lxml") extract(soup, url)
def test_allow_tags(self): html = """ <html> <head> </head> <body> <p>some text</p> <table> <tr> <td>hello</td><td>world</td> </tr> <tr> <td>hello</td><td>world</td> </tr> </table> <img> </body> </html> """ html_root = lxml.html.document_fromstring(html) cleaner = Cleaner( remove_unknown_tags = False, allow_tags = ['table', 'tr', 'td']) result = cleaner.clean_html(html_root) self.assertEqual(12-5+1, len(list(result.iter())))
def createPages(): items = source.contentItems() for item in items: doc = parse(item).getroot() cleaner = Cleaner(style=True, links=False, page_structure=True, safe_attrs_only=False) cleaned = cleaner.clean_html(doc) # get the pagetitle titles = cleaned.find_class('Pagetitle') # snag the page title - method returns list. . there's really only one title = titles[0].text_content() # get the description descrips = cleaned.find_class('Summarytext') descrip = descrips[0].text_content() #Need to have temporary id id = str(random.randint(0, 99999999)) target.invokeFactory("Document", id=uid) obj = target[uid] obj.setTitle(title) obj.setDescription(descrip) obj.setText.getBodyText() # Will finish Archetypes content item creation process, # rename-after-creation and such obj.processForm() return obj
def analyze(request): url = request.GET['url'] opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1')] response = opener.open(url) raw_html = response.read() cleaner = Cleaner(kill_tags = ['style', 'script', 'head'], allow_tags = [''], remove_unknown_tags = False) raw_text = cleaner.clean_html(raw_html) ptn = re.compile('<div>|</div>') raw_text = re.sub(ptn, '', raw_text) ptn = re.compile('\s+') raw_text = re.sub(ptn, ' ', raw_text) raw_text = raw_text.strip().lower() prd, score = MLearn.predict(raw_text) donut = score * 100 results = MLearn.predict_other(raw_text) related_headline = results[0][2] related_verdict = results[0][0] related_score = results[0][1] * 100 context = { 'url': url, 'verdict': prd, 'score': donut, 'related_headline': related_headline, 'related_verdict': related_verdict, 'related_score': related_score, 'results': results, } return render(request, 'results.html', context)
def learn_stopwords(self): req = urllib2.Request(self.html_url, headers={'Host':'github.com', 'Referer':'https://github.com', 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36'}) r = urllib2.urlopen(req) page = r.read() tree = html.fromstring(page) # get readme part readme_tree = tree.xpath('//*[@id="readme"]/article') if len(readme_tree) < 1: return readme_tree = readme_tree[0] self.origin_readme = readme_tree.text_content() cleaner = Cleaner(allow_tags=['p','h1','h2','h3','h4','h5','pre'], remove_unknown_tags=False) readme_tree = cleaner.clean_html(readme_tree) header = "" # iterate each header and paragraph for sub in readme_tree.iterchildren(): if sub is None: break if sub.tag == 'pre' and header: self.add_stopwords(self.filter_all(header)) header = "" elif sub.tag in ['h1','h2','h3','h4'] and sub.text is not None: header = sub.text.strip().lower()
def handle(self, **options): since = get_last_change() writer = get_writer() last_change = since while True: doc = {} changes = settings.db.changes(since=since) since = changes["last_seq"] if since != last_change: print("Detected new tasks ".format(len(changes))) print("=== changes ===") pprint(changes) for changeset in changes["results"]: try: doc = settings.db[changeset["id"]] except couchdb.http.ResourceNotFound: print("resource not found") continue if not ("type" in doc and "page" in doc["type"]): if since != last_change: print("not processing doc: {}".format(str(doc))) last_change = since continue print("indexing", doc["url"]) ##### # raw, html, text ##################### raw = doc['content'] print("type(RAW) = %s" % type(raw)) tree = document_fromstring(str(raw)) title = ' '.join([title for title in tree.xpath('//title/text()')]) # enable filters to remove Javascript and CSS from HTML document cleaner = Cleaner() cleaner.javascript = True cleaner.style = True cleaner.html = True cleaner.page_structure = False cleaner.meta = False cleaner.safe_attrs_only = False cleaner.links = False html = cleaner.clean_html(tree) text_content = html.text_content() lxml.html.tostring(html) description = ' '.join( tree.xpath("//meta[@name='description']/@content")) writer.update_document( title=title, url=doc['url'], desc=description, rank=doc['rank'], content='\n'.join([title, doc['url'], text_content]), raw=raw, ) writer.commit() writer = get_writer() set_last_change(since) last_change = since
import lxml.html from lxml.html.clean import Cleaner # ダウンロードした XHTML ファイルのファイル名を書きます。 # ちなみに 789_14547.html は《吾輩は猫である》です。 FILE_NAME = 'data/xhtml/789_14547.html' with open(FILE_NAME, encoding='shift_jis') as f: data = f.read().encode('shift_jis') cleaner = Cleaner(page_structure=False, remove_tags=( 'ruby', 'br'), kill_tags=('rt', 'rp')) cln_html = cleaner.clean_html(data).decode('utf-8') plain_text = lxml.html.fromstring(cln_html).find_class('main_text')[ 0].text_content() # print(plain_text) # 別ファイルへの保存 PLAIN_TEXT = FILE_NAME.replace('xhtml', 'text').replace('.html', '.txt') print(PLAIN_TEXT) with open(PLAIN_TEXT, 'w') as f: f.write(plain_text)
def scrape_links(links): maincleaner = Cleaner(allow_tags=['div'], remove_unknown_tags=False, remove_tags=['div']) # funtion to remove every tag # while True: for link in links: # Loop through all the links if link == last_link: # Check if this link has already been scraped (this will eventually be changed to check dates) break # If we've hit something we've already scraped, break out of the loop # try: linkhtml = scraperwiki.scrape(link).decode('latin_1') # scrape the contents of the current link and decode from Windows-1252 encoding print link root = lxml.html.fromstring(linkhtml) # turn scraped content into an HTML object # GET TITLE title = root.cssselect("h1")[0].text.encode('utf-8') # grab the page header (title) and return its text as unicode title = replace_all(title, subDic) # replace alphanumeric obfuscations with letters # GET DATE date = root.cssselect("div.adInfo")[0].text # get the text of the html entity that contains the date and time of the post cleandate = re.sub(r'(\S+\s+\d+,\s+\d\d\d\d)(?:,?) (\d+\:\d+ \w\w)', r'\1 \2', date.strip()) # get date into a standard format cleandate = re.search(r'\S+\s+\d+, \d\d\d\d \d+\:\d+ \w\w', cleandate).group(0) # find the date string on the page rawdate = datetime.strptime(cleandate,'%B %d, %Y %I:%M %p') # encode the date as a date using format Month dd, YYYY date = rawdate.strftime('%Y-%m-%d %H:%M') # decode that date back into a string of format YYYY-mm-dd # GET MAIN BODY TEXT mainwithtags = root.cssselect("div.postingBody")[0] # grabs the body text of the post main = maincleaner.clean_html(mainwithtags).text.encode('utf-8') # gets rid of all HTML tags main = replace_all(main, subDic) # replace alphanumeric obfuscations with letters # GET PHONE NUMBER(S) stripped = replace_all(main.lower(), wordDic) # replaces common phone number obfuscations with actual numbers phonecomp = re.compile("[\s\-/=\.,{}_\!\@\#\$\%\^\&\*\(\)\~]") # list of known phone number dividers stripped = phonecomp.sub('',stripped) # remove phone number dividers phone = re.findall(r'(?:1?)[1-9]\d{9}',stripped) # search for groups of 10 consecutive numbers (with an optional preceding 1) phone = list(set(phone)) # gets rid of duplicate numbers by turning list into a set and back phone = ", ".join(phone) # formats phone numbers as "phone1, phone2,... phoneN" # GET LISTED AGE if root.cssselect("p.metaInfoDisplay"): # does the entry have metainfo? 
listedage = root.cssselect("p.metaInfoDisplay")[0] # get the the first html metainfo element listedage = re.sub("[^\d]","",listedage.text) # get rid of all non-numeric text in the text of the element else: # if there's no metainfo listedage = "" # set the listed age to an empty string # GET LOCATION if re.findall(r'Location\:(.*?)\</div\>',linkhtml, flags=re.DOTALL): # location = re.findall('Location\:(.*?)\</div\>',linkhtml, flags=re.DOTALL)[0].encode('utf-8') # location = removeNonAscii(location) #if any(x in NEIGHBORHOODS) in location: # print x, 'x' # area = x area = None for neighborhood in NEIGHBORHOODS: if neighborhood in location.lower(): area = neighborhood print repr(area) print repr(location) else: location = "" picturelist=[] pictures = root.cssselect('ul#viewAdPhotoLayout img') for i in range(len(pictures)): largepic = re.sub('/medium/','/large/',pictures[i].get('src')) picturelist.append(largepic) print picturelist picturelist = " ".join(picturelist) x = urllib.urlopen(largepic).read() piccode = base64.encodestring(x) print piccode # except: # print 'FAILED TO LOAD: ' + link # continue # record = {} # record['Title'] = 'LOAD FAILURE' # Set up our data record - we'll need it later record = {} record['Title'] = title #.encode('ascii', 'ignore').strip() record['Date'] = date record['Main'] = main #.encode('ascii', 'ignore').strip() record['Pictures'] = picturelist record['Phone'] = phone record['Listed Age'] = listedage #.encode('ascii', 'ignore').strip() record['Location'] = location record['area']= area record['PicCode'] = piccode #.encode('ascii', 'ignore').strip() # Print out the data we've gathered #print record, '------------' # Finally, save the record to the datastore - 'Artist' is our unique key scraperwiki.sqlite.save(["Title"], record) time.sleep(2)
def get_message_tree(self): tree = { 'id': self.get_msg_info(self.index.MSG_ID), 'tags': self.get_msg_info(self.index.MSG_TAGS).split(','), 'summary': self.get_msg_summary(), 'headers': {}, 'headers_lc': {}, 'attributes': {}, 'text_parts': [], 'html_parts': [], 'attachments': [], 'conversation': [], } conv_id = self.get_msg_info(self.index.MSG_CONV_ID) if conv_id: conv = Email(self.index, int(conv_id, 36)) tree['conversation'] = convs = [conv.get_msg_summary()] for rid in conv.get_msg_info(self.index.MSG_REPLIES).split(','): if rid: convs.append(Email(self.index, int(rid, 36)).get_msg_summary()) # FIXME: Decide if this is strict enough or too strict...? html_cleaner = Cleaner(page_structure=True, meta=True, links=True, javascript=True, scripts=True, frames=True, embedded=True, safe_attrs_only=True) msg = self.get_msg() for hdr in msg.keys(): tree['headers'][hdr] = self.index.hdr(msg, hdr) tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr) # Note: count algorithm must match that used in extract_attachment above count = 0 for part in msg.walk(): mimetype = part.get_content_type() if mimetype.startswith('multipart/'): continue count += 1 if (part.get('content-disposition', 'inline') == 'inline' and mimetype in ('text/plain', 'text/html')): payload, charset, openpgp = self.decode_payload(part) # FIXME: Do something with the openpgp data! if (mimetype == 'text/html' or '<html>' in payload or '</body>' in payload): tree['html_parts'].append({ 'openpgp_status': openpgp and openpgp[0] or '', 'openpgp_data': openpgp and openpgp[1] or '', 'charset': charset, 'type': 'html', 'data': (payload.strip() and html_cleaner.clean_html(payload)) or '' }) else: tree['text_parts'].extend(self.parse_text_part(payload, charset, openpgp)) else: tree['attachments'].append({ 'mimetype': mimetype, 'count': count, 'part': part, 'length': len(part.get_payload(None, True) or ''), 'content-id': part.get('content-id', ''), 'filename': part.get_filename() or '' }) if self.is_editable(): tree['is_editable'] = True tree['editing_string'] = self.get_editing_string(tree) return tree
deboilFile = lzma.open(options.outDir + "/" + options.prefix + "deboilerplate_html.xz", "w") for record in f: # We convert into UTF8 first of all orig_encoding, text = convert_encoding(record.payload.read()) url = record.url if orig_encoding is None: logging.info("Encoding of document " + url + " could not be identified") if len(text) > 0: # HTML is then normalized cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False) tree="" try: cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE)) tree = ftfy.fix_text(cleanhtml, fix_entities=False, fix_character_width=False) #document = html5lib.parse(fixedtext, treebuilder="lxml", namespaceHTMLElements=False) #tree = etree.tostring(document, encoding="utf-8") except Exception as ex: sys.stderr.write(str(ex)+"\n") continue cleantree = tree.replace(" ", " ") cleantree = cleantree.replace("\t", " ") # lang id #printable_str = ''.join(x for x in cleantree if x in string.printable) lang = guess_lang_from_data2(tree) if len(languages) > 0 and lang not in languages: logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.") else:
def html_clean(str): """ Clean up HTML to be safe """ cleaner = Cleaner(safe_attrs_only=True) return cleaner.clean_html(str)
class Tokenizer: def __init__(self): self.cleaner = Cleaner(scripts = True, javascript = True, style = True, \ meta = True, annoying_tags = True, embedded = True, page_structure = False, \ kill_tags = ['img', 'CDATA', 'form'], remove_tags = ['a','div'], remove_unknown_tags = True, comments = True) self.cleanerBody = Cleaner( page_structure=False, kill_tags=['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) self.stopwords = set(stopwords.words('english')) self.ps = PS() def parseHTML(self, content): cleaned = self.cleaner.clean_html(content) cleanedBody = self.cleanerBody.clean_html(cleaned) #print(cleanedBody) soup = BeautifulSoup(cleaned, 'lxml') soupBody = BeautifulSoup(cleanedBody, 'lxml') #print(soup) if soup.html != None: title = self.getTitle(soup) heading = self.getHeadings(soup) body = self.getBody(soupBody) #print(title) #print(heading) #print(body) return (title, heading, body) def getTitle(self, soup): try: title = soup.title.getText().lower() #print(title) tokens = RegexpTokenizer(r'[a-z]+').tokenize(title) stemmed = [self.ps.stem(t) for t in tokens] filtered = [t for t in stemmed if not t in self.stopwords] #print(filtered) return filtered #print(tokens) except AttributeError: pass #print('No <title> tag in this file') def getHeadings(self, soup): result = [] for i in range(1, 3): try: headerList = soup.find_all("h" + str(i)) for h in headerList: replaced = re.sub('<[/*a-zA-Z0-9]*>', ' ', str(h).lower()) #print(replaced) tokens = RegexpTokenizer(r'[a-z]+').tokenize(replaced) stemmed = [self.ps.stem(t) for t in tokens] filtered = [t for t in stemmed if not t in self.stopwords] result.extend(filtered) #print(filtered) return filtered except AttributeError: pass def getBody(self, soup): try: replaced = re.sub('<[/*a-zA-Z0-9]*>', ' ', str(soup.body).lower()) #print(replaced) tokens = RegexpTokenizer(r'[a-z]+').tokenize(replaced) stemmed = [self.ps.stem(t) for t in tokens] filtered = [t for t in stemmed if not t in self.stopwords] #print(filtered) return filtered except AttributeError: pass
def as_clean_html(value): cleaner = Cleaner(style=True, scripts=True) return cleaner.clean_html(value)
def _clean_html(self, html): cleaner = Cleaner(style=True, scripts=True) return cleaner.clean_html(html)
# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines) # -- CLICK THE 'RUN' BUTTON BELOW # Check the 'Console' tab again, and you'll see how we're extracting # the HTML that was inside <td></td> tags. # We use lxml, which is a Python library especially for parsing html. # ----------------------------------------------------------------------------- html = html.replace('<br>', ' ') html = re.sub(r'(\&.*?;)|(\n|\t|\r)', ' ', html) print html issues = [] root = lxml.html.fromstring(html) # turn our HTML into an lxml object cleaner = Cleaner(remove_tags=['font', 'span'], links=False, remove_unknown_tags=False) root = cleaner.clean_html(root) newhtml = lxml.html.tostring(root) record = {} datestring = re.findall("Updated (.*?)</p>", newhtml)[0] date = time.strptime( datestring, '%b %d, %Y') # encode the date as a date using format Month dd, YYYY date = time.strftime( '%Y-%m-%d', date) # decode that date back into a string of format YYYY-mm-dd if scraperwiki.sqlite.get_var( 'last_update' ) == None or scraperwiki.sqlite.get_var('last_update') != date: record["Date"] = date
def get_message_tree(self, want=None): msg = self.get_msg() tree = {'id': self.get_msg_info(self.index.MSG_ID)} for p in 'text_parts', 'html_parts', 'attachments': if want is None or p in want: tree[p] = [] if want is None or 'summary' in want: tree['summary'] = self.get_msg_summary() if want is None or 'tags' in want: tree['tags'] = self.get_msg_info(self.index.MSG_TAGS).split(',') if want is None or 'conversation' in want: tree['conversation'] = {} conv_id = self.get_msg_info(self.index.MSG_THREAD_MID) if conv_id: conv = Email(self.index, int(conv_id, 36)) tree['conversation'] = convs = [conv.get_msg_summary()] for rid in conv.get_msg_info( self.index.MSG_REPLIES).split(','): if rid: convs.append( Email(self.index, int(rid, 36)).get_msg_summary()) if (want is None or 'headers' in want or 'editing_string' in want or 'editing_strings' in want): tree['headers'] = {} for hdr in msg.keys(): tree['headers'][hdr] = self.index.hdr(msg, hdr) if want is None or 'headers_lc' in want: tree['headers_lc'] = {} for hdr in msg.keys(): tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr) if want is None or 'header_list' in want: tree['header_list'] = [(k, self.index.hdr(msg, k, value=v)) for k, v in msg.items()] # FIXME: Decide if this is strict enough or too strict...? html_cleaner = Cleaner(page_structure=True, meta=True, links=True, javascript=True, scripts=True, frames=True, embedded=True, safe_attrs_only=True) # Note: count algorithm must match that used in extract_attachment # above count = 0 for part in msg.walk(): mimetype = part.get_content_type() if (mimetype.startswith('multipart/') or mimetype == "application/pgp-encrypted"): continue try: if (mimetype == "application/octet-stream" and part.cryptedcontainer is True): continue except: pass count += 1 if (part.get('content-disposition', 'inline') == 'inline' and mimetype in ('text/plain', 'text/html')): payload, charset = self.decode_payload(part) if (mimetype == 'text/html' or '<html>' in payload or '</body>' in payload): if want is None or 'html_parts' in want: tree['html_parts'].append({ 'charset': charset, 'type': 'html', 'data': ((payload.strip() and html_cleaner.clean_html(payload)) or '') }) elif want is None or 'text_parts' in want: text_parts = self.parse_text_part(payload, charset) if want is None or 'text_parts' in want: tree['text_parts'].extend(text_parts) elif want is None or 'attachments' in want: tree['attachments'].append({ 'mimetype': mimetype, 'count': count, 'part': part, 'length': len(part.get_payload(None, True) or ''), 'content-id': part.get('content-id', ''), 'filename': part.get_filename() or '' }) if self.is_editable(): if not want or 'editing_strings' in want: tree['editing_strings'] = self.get_editing_strings(tree) if not want or 'editing_string' in want: tree['editing_string'] = self.get_editing_string(tree) if want is None or 'crypto' in want: if 'crypto' not in tree: tree['crypto'] = { 'encryption': msg.encryption_info, 'signature': msg.signature_info } else: tree['crypto']['encryption'] = msg.encryption_info tree['crypto']['signature'] = msg.signature_info return tree
def clean_comment_text(string): doc = document_fromstring(string) cleaner = Cleaner() return cleaner.clean_html(doc).text_content()
def clean_project_desc(string): cleaner = Cleaner(remove_tags=["a"]) return cleaner.clean_html(string)
class TransformHtmlProceedingsToXml(object): """Get proceedings of the European Parliament.""" @timeit def __init__(self): self.cli() self.infiles = self.get_files(self.indir, self.pattern) self.n_proceedings = 0 self.rm_a = Cleaner(remove_tags=['a']) self.main() def __str__(self): message = "Information for {} MEPs extracted!".format( str(self.n_proceedings)) return message def get_files(self, directory, fileclue): """Get all files in a directory matching a pattern. Keyword arguments: directory -- a string for the input folder path fileclue -- a string as glob pattern """ matches = [] for root, dirnames, filenames in os.walk(directory): for filename in fnmatch.filter(filenames, fileclue): matches.append(os.path.join(root, filename)) return matches def read_html(self, infile): """Parse a HTML file.""" with open(infile, encoding='utf-8', mode='r') as input: return html.parse(input) def serialize(self, infile, root): ofile_name = os.path.splitext(os.path.basename(infile))[0] ofile_path = os.path.join(self.outdir, ofile_name+'.xml') xml = etree.tostring( root, encoding='utf-8', xml_declaration=True, pretty_print=True).decode('utf-8') with open(ofile_path, mode='w', encoding='utf-8') as ofile: ofile.write(xml) pass def get_name(self, tree): name = tree.xpath('//li[@class="mep_name"]')[0] name = self.rm_a.clean_html(name) name = html.tostring(name).decode('utf-8') name = re.sub(r'[\t\n]', r'', name) name = name.split('<br>') name = [html.fromstring(x).text_content() for x in name] name = ' '.join(name) return name def get_nationality(self, tree): nationality = tree.find_class('nationality')[0] nationality = nationality.text.strip() return nationality def get_id(self, infile): id = os.path.splitext(os.path.basename(infile))[0] return id def parse_date(self, a_date, a_pattern): output = datetime.datetime.strptime(a_date, a_pattern).date() return output def get_birth(self, tree): birth = tree.xpath('.//span[@class="more_info"]') birth_date = None birth_place = None death_date = None death_place = None for i in birth: if i.text is not None: birth_text = re.sub(r'[\t\n]', r'', i.text) birth_text = birth_text.strip() if re.match(r'^Date of birth: (.+?), (.+)$', birth_text): info = re.match( r'^Date of birth: (.+?), (.+)$', birth_text) birth_date = self.parse_date(info.group(1), "%d %B %Y") birth_place = info.group(2) elif re.match(r'^Date of birth: (.+?)$', birth_text): info = re.match(r'^Date of birth: (.+?)$', birth_text) birth_date = self.parse_date(info.group(1), "%d %B %Y") birth_place = None elif re.match(r'^Date of death: (.+?), (.+)$', birth_text): info = re.match( r'^Date of death: (.+?), (.+)$', birth_text) death_date = self.parse_date(info.group(1), "%d %B %Y") death_place = info.group(2) elif re.match(r'^Date of death: (.+?)$', birth_text): info = re.match(r'^Date of death: (.+?)$', birth_text) death_date = self.parse_date(info.group(1), "%d %B %Y") death_place = None return birth_date, birth_place, death_date, death_place def get_political_groups(self, tree, id): political_groups = tree.xpath('.//div[@class="boxcontent nobackground"]/h4[contains(., "Political groups")]/following-sibling::ul[1]//li') output = [] for i in political_groups: info = i.text info = re.sub(r'\n', r'', info) info = re.sub(r'\t+', r'\t', info) info = re.sub(r' \t/ ', r'\t', info) info = re.sub(r'\t:\t', r'\t', info) info = re.sub(r' - ', r'\t', info) info = re.sub(r'\t$', r'', info) info = info.strip() info = info.split('\t') info = [x.strip() for x in info] m_state = i.attrib['class'] s_date = 
self.parse_date(info[0], "%d.%m.%Y") if info[1] == '...': e_date = self.date else: e_date = self.parse_date(info[1], "%d.%m.%Y") p_group = info[2] p_group_role = info[3] output.append({ 'id': id, 'm_state': m_state, 's_date': s_date, 'e_date': e_date, 'p_group': p_group, 'p_group_role': p_group_role}) return output def get_national_parties(self, tree, id): political_groups = tree.xpath('.//div[@class="boxcontent nobackground"]/h4[contains(., "National parties")]/following-sibling::ul[1]//li') output = [] for i in political_groups: info = i.text info = re.sub(r'\n', r'', info) info = re.sub(r'\t+', r'\t', info) info = re.sub(r' \t/ ', r'\t', info) info = re.sub(r'\t:\t', r'\t', info) info = re.sub(r' - ', r'\t', info) info = re.sub(r'\t$', r'', info) info = info.strip() info = info.split('\t') info = [x.strip() for x in info] s_date = self.parse_date(info[0], "%d.%m.%Y") if info[1] == '...': e_date = self.date else: e_date = self.parse_date(info[1], "%d.%m.%Y") n_party = info[2] output.append({ 'id': id, 's_date': s_date, 'e_date': e_date, 'n_party': n_party}) return output def extract_info(self, infile): id = self.get_id(infile) tree = self.read_html(infile).getroot() name = self.get_name(tree) nationality = self.get_nationality(tree) birth_date, birth_place, death_date, death_place = self.get_birth(tree) self.meps[id] = { 'name': name, 'nationality': nationality, 'birth_date': birth_date, 'birth_place': birth_place, 'death_date': death_date, 'death_place': death_place } self.political_groups = ( self.political_groups + self.get_political_groups(tree, id)) self.national_parties = ( self.national_parties + self.get_national_parties(tree, id)) pass def serialize_dict_of_dicts(self, dict_of_dicts, ofile_name): df = pd.DataFrame.from_dict(dict_of_dicts, orient='index') opath = os.path.join(self.outdir, ofile_name) df.to_csv( opath, sep='\t', mode='w', encoding='utf-8', index_label='id') pass def serialize_list_of_dicts(self, list_of_dicts, ofile_name, col_order): df = pd.DataFrame(list_of_dicts) df = df[col_order] opath = os.path.join(self.outdir, ofile_name) df.to_csv(opath, sep='\t', mode='w', encoding='utf-8', index=False) pass def main(self): self.meps = {} self.political_groups = [] self.national_parties = [] for infile in self.infiles: print(infile) if self.date is None: self.date = datetime.datetime.fromtimestamp( os.path.getmtime(infile)).date() self.extract_info(infile) self.n_proceedings += 1 self.serialize_dict_of_dicts(self.meps, 'meps.csv') self.serialize_list_of_dicts( self.political_groups, 'political_groups.csv', ['id', 'm_state', 's_date', 'e_date', 'p_group', 'p_group_role']) self.serialize_list_of_dicts( self.national_parties, 'national_parties.csv', ['id', 's_date', 'e_date', 'n_party']) pass def cli(self): """CLI parses command-line arguments""" parser = argparse.ArgumentParser() parser.add_argument( "-i", "--input", required=True, help="path to the input directory.") parser.add_argument( "-o", "--output", required=True, help="path to the output directory.") parser.add_argument( '-p', "--pattern", required=False, default="*.html", help="glob pattern to filter files.") parser.add_argument( '-d', "--date", required=False, default=None, help="date of download of HTML files.") args = parser.parse_args() self.indir = args.input self.outdir = args.output if not os.path.exists(self.outdir): os.makedirs(self.outdir) self.pattern = args.pattern self.date = args.date pass
def __init__(self, html): cleaner = Cleaner(style=True, page_structure=False) self.html = cleaner.clean_html(html)
def sanitize(text): if text.strip(): cleaner = Cleaner(safe_attrs_only=False, style=True) return cleaner.clean_html(text) else: return text
def clean_summernote_html(string): cleaner = Cleaner() return cleaner.clean_html(string)
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'img', 'area', 'map' ] args = { 'meta': False, 'safe_attrs_only': False, 'page_structure': False, 'scripts': True, 'style': True, 'links': True, 'remove_tags': tags } cleaner = Cleaner(**args) path = '/html/body' body = doc.xpath(path)[0] result = cleaner.clean_html( body).text_content().encode('ascii', 'ignore') dict_result[el] += "\n\n " + " ".join( str(result).split(" ")[:count_of_words]) except: print("error at ", el[:100]) dict_result[el] = "" else: dict_result[el] = "" idx += 1 count += 1 if idx >= block: idx = 0 print("processing item " + str(count) + " of " + str(ntotal)) print("work with: ", el[:100] + "...")
class DataHandler(object): def __init__(self): # 需要保留的标签 allow_tags = ['p', 'br', 'img', 'video'] # 需要保留的属性 allow_attrs = ['src', 'controls'] self.cleaner = Cleaner(style=True, scripts=True, comments=True, javascript=True, page_structure=True, safe_attrs_only=True, remove_unknown_tags=False, safe_attrs=frozenset(allow_attrs), allow_tags=allow_tags) self.fdfs_sender = Sender() @property def current_timestamp(self): return datetime.datetime.now() @property def current_time(self): # time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') @staticmethod def str_to_string(value): """字符串-->标准时间字符串""" return parse(re.sub(r'年|月|日', '-', value)).strftime('%Y-%m-%d %H:%M:%S') @staticmethod def datetime_to_string(dt): """把Datetime格式转成字符串""" return dt.strftime('%Y-%m-%d %H:%M:%S') @staticmethod def datetime_to_timestamp(date_time): """把Datetime类型转成时间戳形式""" # return time.mktime(date_time.timetuple()) return date_time.timestamp() @staticmethod def string_to_datetime(string): """把字符串转化成Datetime格式""" return datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S") @staticmethod def timestamp_to_datetime(stamp): """把时间戳转化为Datetime格式""" return datetime.datetime.timestamp(stamp) @staticmethod def timestamp_to_string(stamp): """把时间戳转成字符串形式""" return time.strftime("%Y-%m-%d-%H", time.localtime(stamp)) def string_to_timestamp(self, str_time): """把字符串转成时间戳形式""" return time.mktime(self.string_to_datetime(str_time).timetuple()) def remove_needless_elements(self, text): """去标签化""" text = self.cleaner.clean_html(text).replace('<div>', '').replace('</div>', '') return text def handle_images(self, url, text, headers=None): has_images = True download_images_success = True failure_time = 0 response = HtmlResponse(url=url, body=text, encoding="utf-8") image_urls = response.xpath("//img/@src").extract() if not image_urls: has_images = False for old_url in image_urls: if not (old_url.startswith("https://") or old_url.startswith("http://")): old_url = urljoin(url, old_url) new_url = self.fdfs_sender.download_upload_image(image_url=old_url, headers=headers) if new_url: text = text.replace(old_url, new_url) print(f'图片下载->上传->替换成功,{old_url},===》{new_url}') else: failure_time += 1 download_images_success = False # 表明其中有图片下载失败了 return has_images, download_images_success, text
def _clean_html_body(request, email, body, charset): """Clean up a html part as best we can Doesn't catch LXML errors """ html_tree = lxml_html.fromstring(body, parser=inboxen_parser) # if the HTML doc says its a different encoding, use that for meta_tag in html_tree.xpath("/html/head/meta"): if meta_tag.get("http-equiv", None) == "Content-Type": try: content = meta_tag.attrib["content"] content = content.split(";", 1)[1] charset = dict(HEADER_PARAMS.findall(content))["charset"] break except (KeyError, IndexError): pass elif "charset" in meta_tag.attrib: charset = meta_tag.attrib["charset"] break try: # check there's a body for premailer if html_tree.find("body") is not None: html_tree = InboxenPremailer(html_tree).transform() except Exception as exc: # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything messages.info( request, _("Part of this message could not be parsed - it may not display correctly" )) _log.warning("Failed to render CSS for %s: %s", email["eid"], exc) # Mail Pile uses this, give back if you come up with something better cleaner = Cleaner( allow_tags=HTML_ALLOW_TAGS, kill_tags=["style"], # remove style tags, not attrs remove_unknown_tags=False, safe_attrs=HTML_SAFE_ATTRS, safe_attrs_only=True, style=False, # keep style attrs ) html_tree = cleaner.clean_html(html_tree) # filter images if we need to if not email["display_images"]: for img in html_tree.xpath("//img"): try: # try to delete src first - we don't want to add a src where there wasn't one already del img.attrib["src"] # replace image with 1px png img.attrib["src"] = staticfiles_storage.url( "imgs/placeholder.svg") email["has_images"] = True except KeyError: pass for link in html_tree.xpath("//a"): try: # proxy link url = link.attrib["href"] link.attrib["href"] = proxy_url(url) except KeyError: pass # open link in tab link.attrib["target"] = "_blank" # and prevent window.opener bug (noopener is only supported in newer # browsers, plus we already set noreferrer in the head) link.attrib["rel"] = "noreferrer" # finally, export to unicode body = unicode_damnit(etree.tostring(html_tree, method="html"), charset) return safestring.mark_safe(body)
def cleaner(content_str): # 补全标签 if content_str is None: return None try: soup = BeautifulSoup(content_str, 'lxml') html_str = soup.prettify() except: html_str = content_str # 去掉style,scripts clean = Cleaner(style=True, scripts=True, comments=True, javascript=True, page_structure=False, safe_attrs_only=False) tree = html.fromstring(html_str) content = html.tostring(clean.clean_html(tree), encoding='UTF-8') # 删除其他标签,只保留p与img con = remove_tags(content, keep=('img', 'p')) # 去掉空格,换行 enter = re.compile('\n') con = enter.sub('', con).replace(' ', '') # 清理img其他属性 img_attr1 = re.compile(r'<img(.*?)src') con = img_attr1.sub('<img src', con) img_attr3 = re.compile(r'<img(.*?)data-original', re.S) con = img_attr3.sub('<img src', con) try: img_attr2 = re.findall(r'src=".*?"(.*?)>', con) for attr in img_attr2: con = con.replace(attr, '') except: pass # 清理p标签 p_class = re.compile(r'<p(.*?)>') con = p_class.sub('<p>', con) # 删除空的p标签 con = con.replace(r'<p></p>', '') # 删除img外围的p标签 imgs = re.findall(r'<img[^>]+>', con, re.S) for img in imgs: try: con = con.replace('<p>' + img + '</p>', img) except Exception as e: pass # 国际在线站点文章末尾清理 p_last = re.compile(r'>标签:.*') con = p_last.sub('>', con) return con
def clean(content, title=None): content = content.decode("utf-8") # We're parsing the content html twice! # TODO: This one can probably be removed # LXML parsing is used to get title and meta head info from HTML html_doc = html.fromstring(content, parser=html.HTMLParser(encoding="utf-8")) head_doc = html_doc.find('head') reconstructed_body = "<html><body>" + content + "</body></html>" # Get title so it can be added as an H1 tag, but remove it from # the html itself - so that Pandoc doesn't use it if not title: title = html_doc.find('.//title') title.getparent().remove(title) title = title.text_content() title = title[:title.rfind('-')] # Add in the title if "<body><h1>" not in reconstructed_body: reconstructed_body = reconstructed_body.replace( "<body>", "<body><h1>" + title + "</h1>") # Remove stuff that readability didn't remove doc = html.fromstring(reconstructed_body) # Use lxml's cleaner to remove all useless tags # (currently, this removes styles, even when not asked to) cleaner = Cleaner( scripts=True, javascript=True, comments=True, links=True, forms=True, annoying_tags=True, style=True, inline_style=False, ) doc = cleaner.clean_html(doc) body_doc = doc.find('body') bad_body_xpaths = [ "//nav", "//footer", "//button", "//form[@id='interview_experience_form']", "//div[@id='author']", "//div[@id='video']", "//div[@id='share-buttons']", "//div[@id='ide_link']", "//div[@id='disqus_thread']", "//div[@id='secondary']", "//div[@id='personalNoteDiv']", "//div[@id='practiceLinkDiv']", "//div[@class='leftSideBarParent']", "//div[@class='author_info_box']", "//div[@class='plugins']", "//div[@class='no-p-tag']", "//div[@class='comments-main']", "//ins[@class='adsbygoogle']", "//h3", "//h1[@class='entry-title']", "//h2[not(@class='tabtitle')]", "//hr", # This requires XPath 2.0 # "//a[ends-with(@href, 'sudo-gate')]", "//a[contains(@href, 'sudo-gate')]", "//p[contains(., '*****@*****.**')]", "//p[starts-with(., 'Please write comments if you find')]", ] bad_parent_xpaths = [ "//h2[starts-with(text(), 'Recommended')]", ] # This one has to be removed first, so h2's parent can die! 
remove_xpaths(body_doc, bad_parent_xpaths, parent=True) remove_xpaths(body_doc, bad_body_xpaths) # Convert all language tags to p tags # H1 is used only for post title for lang_h1 in body_doc.xpath("//h2[@class='tabtitle']"): lang_p = '<p><strong>%s</strong></p>' % lang_h1.text_content() lang_h1.addnext(lxml.etree.XML(lang_p)) lang_h1.getparent().remove(lang_h1) # Not too sure if this is needed - but at this point # I don't want to remove any code that works for pre_tag in body_doc.xpath("//pre"): if 'class' in pre_tag.attrib: pre_tag.attrib.pop('class') if 'title' in pre_tag.attrib: pre_tag.attrib.pop('title') try: # Add Source link to doc - this may fail for various reasons src_url = head_doc.cssselect('meta[property="og:url"]')[0].get( 'content') # noqa src_link = "<p><a href='" + src_url + "' rel='tag'>" + src_url + "</a></p>" # noqa post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0] post_content_doc.append(lxml.etree.XML("<h3>Source</h3>")) post_content_doc.append(lxml.etree.XML(src_link)) except: # noqa pass # Code in the HTML is in the form of a table # We convert the table into a single pre / code tag for code_tag in body_doc.xpath('//div[starts-with(@id,"highlighter")]'): code = str(code_tag.text_content()).replace("\n\n", "") code = html_escape(code) code = "<pre> <code>" + code + "</code> </pre>" code_tag.addnext(lxml.etree.XML(code)) code_tag.getparent().remove(code_tag) result = html.tostring(body_doc).decode("utf-8") return result
def parse_article(self, response):
    due_date = response.meta['due_date']
    last_update = response.meta['last_update']
    lines = response.css('.center .MsoNormal')
    pattern = re.compile(r'^(\d{6})\s*(\D+)$')
    html = response.css('.xilan_con').extract_first()
    cleaner = Cleaner(page_structure=False, style=True)
    html = cleaner.clean_html(html)
    html = lxml.html.fragment_fromstring(html)
    text = lxml.html.tostring(html, method='text', encoding='unicode')
    text = text.replace('代码', '', 1).replace('名称', '', 1)
    text = ''.join(text.split())
    parts = re.split(r'(\d{6})', text)
    # print('text:', parts[:30])
    i = 0
    count = len(parts)
    provinces = []
    province_count = 0
    city_count = 0
    district_count = 0
    while i < count:
        part = parts[i]
        if not part:
            i += 1
            continue
        if part.isdigit():
            code = part
            i += 1
            area = parts[i]
            if code.endswith('0000'):
                a = Province(code=code, name=area)
                provinces.append(a)
                province_count += 1
            elif code.endswith('00'):
                a = City(code=code, name=area)
                province = provinces[-1]
                cities = province.get('cities') or []
                cities.append(a)
                province['cities'] = cities
                city_count += 1
            else:
                a = District(code=code, name=area)
                city = provinces[-1]['cities'][-1]
                districts = city.get('districts') or []
                districts.append(a)
                city['districts'] = districts
                district_count += 1
        else:
            raise CloseSpider('行政区划代码无效: %s' % (part, ))
        i += 1
    print('(更新于 %s) 截止至 %s 县及县以上行政区划代码: '
          '采集到省份 %d 个,城市 %d 个,区县 %d 个' % (
              last_update.strftime('%Y-%m-%d'),
              due_date.strftime('%Y-%m-%d'),
              province_count, city_count, district_count))
    yield Areas(due_date=due_date, last_update=last_update,
                provinces=provinces)
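As an aside, the province/city/district branching in parse_article keys purely off the suffix of the 6-digit division code. Below is a minimal sketch of that rule with invented sample codes (not taken from the spider's data), just to make the classification explicit:

# Illustrative only: the sample codes are made up, not from the scraped page.
for code in ('110000', '110100', '110101'):
    if code.endswith('0000'):
        level = 'province'   # XX0000
    elif code.endswith('00'):
        level = 'city'       # XXYY00
    else:
        level = 'district'   # XXYYZZ
    print(code, level)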
class Session(object):
    def __init__(self, encoding='utf8'):
        # Session object, reused across subsequent requests. Needed so the
        # headers are set once instead of being passed to every request
        # separately.
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,'
                      'application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # 'Host': 'www.krs-online.com.pl',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) '
                          'Gecko/20100101 Firefox/28.0',
            # 'Referer': 'http://www.krs-online.com.pl/muzeum-slaska-opolskiego-krs-1260077.html',
            # 'Cookie': 'krs_fk45=h5mfc4oblmd1e1nokkpu4694e5; krs_cookie_accepted=true',
            # 'DNT': '1',
        })
        self.parser = HTMLParser(encoding=encoding)
        self.cleaner = Cleaner(
            # strip scripts, styles and comments
            scripts=True,
            javascript=True,
            comments=True,
            style=True,
            # keep head and body
            page_structure=False)

    def get_session(self):
        return self.session

    def clean(self, dirty_text):
        return self.cleaner.clean_html(dirty_text)

    def get(self, address, params={}):
        response = self.session.get(address, params=params)
        response.raise_for_status()
        return response.text

    def parse(self, raw_text):
        return fromstring(raw_text, parser=self.parser)

    def get_site(self, address, params={}):
        text = self.get(address, params)
        text = self.clean(text)
        text = self.parse(text)
        return text

    def post(self, address, params={}):
        response = self.session.post(address, data=params)
        response.raise_for_status()
        return response.text

    def post_to_site(self, address, params={}):
        text = self.post(address, params)
        text = self.clean(text)
        text = self.parse(text)
        return text
def cleanup(data, tags):
    cleaner = Cleaner(remove_tags=tags)
    clean = cleaner.clean_html(data)
    root = lxml.html.fromstring(clean)
    return root
def clean_html(text):
    cleaner = Cleaner(style=False)
    return cleaner.clean_html(text)
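For quick reference, a minimal usage sketch of the two small helpers above, assuming both happen to be importable from one module; the sample markup and the expected effects are illustrative, not taken from the original sources:

import lxml.html

# Illustrative input; not from the original code.
sample = '<div><script>alert(1)</script><span style="color:red">Hi</span></div>'

# clean_html(): default Cleaner options apply except style=False, so <script>
# is dropped while the harmless inline style attribute is left alone.
print(clean_html(sample))

# cleanup(): remove_tags drops the listed tags but keeps their text content,
# so "Hi" survives inside the returned lxml element.
root = cleanup(sample, ['span'])
print(lxml.html.tostring(root))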
def handle(self, *args, **options):
    individuals = Individual.objects.all()
    for y1, y2 in year_ranges:
        url = url_pattern % (y1, y2, y1, y2)
        r = requests.get(url)
        r.encoding = "utf-8"
        output = r.text
        root = etree.HTML(output)
        dates = [d.text for d in root.xpath(
            "//h2[@class=\"h3_style\"]/a[contains(@href,\"agenda\")]")]
        tables = root.xpath("//table[@class=\"interlaced\"]")
        if len(dates) != len(tables):
            raise Exception("Dates and Questions Mismatch! %d <> %d" %
                            (len(dates), len(tables)))
        for i in range(0, len(dates)):
            date = datetime.strptime(dates[i], '%d.%m.%Y')
            print date
            table = tables[i]
            for row in table.xpath(".//tr")[1:]:
                cells = row.xpath("td")
                if all_text(cells[3]).strip() == '-':
                    continue
                legislator_name = cells[1].text
                if legislator_name.startswith(u"郭偉强"):
                    legislator_name = u"郭偉強"
                title = all_text(cells[2])
                question_type_text = all_text(cells[0])
                individual = None
                for p in individuals:
                    if legislator_name.startswith(p.name_ch):
                        individual = p
                        break
                if individual is None:
                    print(legislator_name)
                    raise Exception("Individual not found. ", legislator_name)
                link = cells[3].xpath(".//a")[0].attrib['href']
                key = str(md5.new(link).hexdigest())
                m = re.match(r"(.*[0-9]+|UQ)[\(]{0,1}(.*)\)",
                             question_type_text)
                if m is None:
                    raise Exception("Undefined Question Type", link,
                                    question_type_text)
                question_type = m.group(2)
                detail_r = requests.get(link)
                detail_r.encoding = "big5"
                output = detail_r.text
                cleaner = Cleaner(comments=False)
                output = cleaner.clean_html(output)
                detail_root = etree.HTML(output)
                try:
                    press_release = all_text(
                        detail_root.xpath("//div[@id=\"pressrelease\"]")[0])
                except IndexError:
                    detail_r = requests.get(link)
                    detail_r.encoding = "utf-8"
                    output = detail_r.text
                    output = cleaner.clean_html(output)
                    detail_root = etree.HTML(output)
                    press_release = all_text(
                        detail_root.xpath("//span[@id=\"pressrelease\"]")[0])
                question_start = press_release.find(u'以下')
                reply_start = press_release.rfind(u'答覆:')
                question_text = press_release[question_start:reply_start]
                answer_text = press_release[reply_start + 3:]
                #print(question_text)
                #print(answer_text)
                #print link
                #print date
                #print individual.name_en
                #print key
                #print question_type
                question = Question()
                question.key = key
                question.individual = individual
                question.date = date
                question.question_type = question_type
                question.question = question_text
                question.answer = answer_text
                question.title = title
                question.link = link
                question.title_ch = title
                try:
                    question.save()
                except IntegrityError:
                    print("%s %s already exists" % (str(date), title))
try:
    email = kinoroot.cssselect("div.fliesstext a")[0]
except:
    continue
try:
    url = kinoroot.cssselect("div.fliesstext a")[1]
except:
    continue
# print lxml.html.tostring(anschrift1)
data = {
    'kinoname': kinoname,
    'anschrift1': re.sub(
        '<\/{0,1}div>', ' ',
        cleaner.clean_html(
            lxml.html.tostring(anschrift1, encoding=unicode))),
    'anschrift2': re.sub(
        '<\/{0,1}div>', ' ',
        cleaner.clean_html(
            lxml.html.tostring(anschrift2, encoding=unicode))),
    'anschrift3': re.sub(
        '<\/{0,1}div>', ' ',
        cleaner.clean_html(
            lxml.html.tostring(anschrift3, encoding=unicode))),
    'anschrift4': re.sub(
        '<\/{0,1}div>', ' ',
        cleaner.clean_html(
            lxml.html.tostring(anschrift4, encoding=unicode))),
<head>
    <title>Test</title>
</head>
<body>
    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
    <h2>Hello World</h2>
    <p>hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh</p>
    <a><img src="/test.png"></img></a>
    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
    <img src="/test.png"></img>
</body>
</html>
"""

string = get_document("http://news.sina.com.cn/c/nd/2016-04-06/doc-ifxrcizs6891671.shtml")

allow_tags = ("b", "blod", "big", "em", "font", "h1", "h2", "h3", "h4", "h5",
              "h6", "i", "italic", "small", "strike", "sub", "a", "p",
              "strong", "div", "img", "tt", "u", "html", "meta", "body",
              "head", "br", "sup", "title", "article")

cleaner = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                  links=True, meta=False, add_nofollow=False,
                  page_structure=False, processing_instructions=True,
                  embedded=False, frames=False, forms=False,
                  annoying_tags=False, remove_tags=None,
                  remove_unknown_tags=False, safe_attrs_only=False,
                  allow_tags=allow_tags)
string = cleaner.clean_html(string)
extract(string=string)
#!/usr/bin/python3
import sys
sys.path.append('..')

from app import db
from app.models import Review
from lxml.html.clean import Cleaner

review = Review.query.order_by(Review.id).first()
while review is not None:
    print(review.id)
    cleaner = Cleaner(safe_attrs_only=False, style=False)
    new_content = cleaner.clean_html(review.content)
    if new_content != review.content:
        print('=======')
        print(review.content)
        print('-------')
        print(new_content)
        print('=======')
        review.content = new_content
        db.session.commit()
    review = Review.query.filter(Review.id > review.id).order_by(
        Review.id).first()