def update(self, parser_entry):
    """Copy fields from ``parser_entry`` (a feedparser entry) onto this
    object, only assigning attributes whose value actually changed.

    Plain fields are listed in ``self._entry_fields``.  Date fields are
    listed in ``self._date_fields`` and are read from the feedparser
    ``<field>_parsed`` time tuples.  The entry's ``content`` list is
    sanitized and joined into ``self.content_html``.
    """
    # _marker is a module-level sentinel distinguishing "attribute absent"
    # from "attribute is None", so a first assignment always happens.
    for field in self._entry_fields:
        value = parser_entry.get(field)
        if getattr(self, field, _marker) != value:
            setattr(self, field, value)
    for field in self._date_fields:
        # feedparser exposes parsed dates as 9-item time tuples under
        # '<field>_parsed'; the first six items build a datetime.
        value = parser_entry.get(field + '_parsed')
        if value is not None:
            value = datetime.datetime(*value[:6])
        if getattr(self, field, _marker) != value:
            setattr(self, field, value)
    entry_content = parser_entry.get('content')
    if entry_content:
        content_list = []
        for content in entry_content:
            # *Always* sanitize the content HTML.
            # FeedParser sometimes doesn't sanitize, such as when
            # the input is base64 encoded.
            # NOTE(review): _sanitizeHTML is passed only two arguments
            # here; feedparser 5.x also takes a mimetype argument --
            # confirm the pinned feedparser version accepts this call.
            value = feedparser._sanitizeHTML(content.value, 'utf-8')
            content_list.append(value)
        content_html = '\n'.join(content_list)
    else:
        # Entries without content clear any previously stored HTML.
        content_html = None
    if getattr(self, 'content_html', _marker) != content_html:
        self.content_html = content_html
def from_feed_entry(entry):
    """Read and construct Post object from ``entry``.

    ``entry`` should be a post object as returned by ``feedparser.parse``.

    If the post is invalid, raise a ``MalformedPostError``.

    This leaves the `blog` field empty; this must be filled in before the
    post is added to the database.
    """
    # A post without these fields is unusable; fail loudly up front.
    for field in 'title', 'summary', 'link':
        if field not in entry:
            raise MalformedPostError("Post has no %s: %r" % (field, entry))
    post = Post()
    post.timestamp = to_dbtime(Post._get_pub_date(entry))
    post.title = entry['title']
    post.summary = entry['summary']
    if hasattr(entry, 'id'):
        post.guid = entry.id
    # The summary detail attribute lets us find the mime type of the
    # summary. feedparser doesn't escape it if it's text/plain, so we need
    # to do it ourselves. Unfortunately, there's a bug (likely #412) in
    # feedparser, and sometimes this attribute is unavailable. If it's
    # there, great, use it. Otherwise, we'll just assume it's html, and
    # sanitize it ourselves.
    if hasattr(entry, 'summary_detail'):
        mimetype = entry.summary_detail.type
    else:
        mimetype = 'application/xhtml'
    # Sanitize the html; who knows what feedparser did or didn't do.
    # XXX: _sanitizeHTML is a private function to the feedparser
    # library! unfortunately, we don't have many better options. This
    # statement is the reason the version number for the feedparser
    # dependency is fixed at 5.1.3; any alternate version will need to
    # be vetted carefully, as by doing this we lose any api stability
    # guarantees.
    post.summary = unicode(
        feedparser._sanitizeHTML(
            # _sanitizeHTML expects an encoding, so rather than do more
            # guesswork than we already have...
            post.summary.encode('utf-8'),
            'utf-8',
            # _sanitizeHTML is only ever called within the library with
            # this value:
            u'text/html',
        ),
        'utf-8')
    if mimetype == 'text/plain':
        # feedparser doesn't sanitize the summary if it's plain text, so we
        # need to do it manually. We're using jinja2's autoescape feature
        # for this, which feels like a bit of a hack to me (Ian), but it
        # works -- there's probably a cleaner way to do this.
        tmpl = jinja2.Template('{{ text }}', autoescape=True)
        post.summary = tmpl.render(text=post.summary)
    post.page_url = entry['link']
    return post
def from_feed_entry(entry):
    """Read and construct Post object from ``entry``.

    ``entry`` should be a post object as returned by ``feedparser.parse``.

    If the post is invalid, raise a ``MalformedPostError``.

    This leaves the `blog` field empty; this must be filled in before the
    post is added to the database.
    """
    # A post without these fields is unusable; fail loudly up front.
    for field in 'title', 'summary', 'link':
        if field not in entry:
            raise MalformedPostError("Post has no %s: %r" % (field, entry))
    post = Post()
    post.timestamp = to_dbtime(Post._get_pub_date(entry))
    post.title = entry['title']
    post.summary = entry['summary']
    if hasattr(entry, 'id'):
        post.guid = entry.id
    # The summary detail attribute lets us find the mime type of the
    # summary. feedparser doesn't escape it if it's text/plain, so we need
    # to do it ourselves. Unfortunately, there's a bug (likely #412) in
    # feedparser, and sometimes this attribute is unavailable. If it's
    # there, great, use it. Otherwise, we'll just assume it's html, and
    # sanitize it ourselves.
    if hasattr(entry, 'summary_detail'):
        mimetype = entry.summary_detail.type
    else:
        mimetype = 'application/xhtml'
    # Sanitize the html; who knows what feedparser did or didn't do.
    # XXX: _sanitizeHTML is a private function to the feedparser
    # library! unfortunately, we don't have many better options. This
    # statement is the reason the version number for the feedparser
    # dependency is fixed at 5.1.3; any alternate version will need to
    # be vetted carefully, as by doing this we lose any api stability
    # guarantees.
    post.summary = unicode(feedparser._sanitizeHTML(
        # _sanitizeHTML expects an encoding, so rather than do more
        # guesswork than we already have...
        post.summary.encode('utf-8'),
        'utf-8',
        # _sanitizeHTML is only ever called within the library with
        # this value:
        u'text/html',
    ), 'utf-8')
    if mimetype == 'text/plain':
        # feedparser doesn't sanitize the summary if it's plain text, so we
        # need to do it manually. We're using jinja2's autoescape feature
        # for this, which feels like a bit of a hack to me (Ian), but it
        # works -- there's probably a cleaner way to do this.
        tmpl = jinja2.Template('{{ text }}', autoescape=True)
        post.summary = tmpl.render(text=post.summary)
    post.page_url = entry['link']
    return post
def sanitise_html(html):
    """
    Return *html* run through feedparser's HTML sanitiser, which strips
    constructs usable for basic html insertion attacks.

    >>> sanitise_html("<p>hello</p>")
    '<p>hello</p>'

    >>> sanitise_html("<script>alert('what')</script>")
    ''
    """
    cleaned = feedparser._sanitizeHTML(html, "utf-8", "text/html")
    return cleaned
def sanitize_html(html, force_https=True):
    """
    Return *html* run through feedparser's HTML sanitizer, guarding
    against basic html insertion attacks.  When ``force_https`` is true,
    embedded ``src`` attributes pointing at http are rewritten to https.

    >>> sanitize_html("<p>hello</p>")
    '<p>hello</p>'

    >>> sanitize_html("<script>alert('what')</script>")
    ''
    """
    cleaned = feedparser._sanitizeHTML(html, "utf-8", "text/html")
    if not force_https:
        return cleaned
    return cleaned.replace('src="http://', 'src="https://')
def htmlmail(sbj,recip,msg,template='',texttemplate='',textmsg='',images=(),
             recip_name='',sender=settings.DEFAULT_FROM_EMAIL,sender_name='',charset=charset):
    """Build and send a multipart (HTML + plain text) e-mail over SMTP.

    if you want to use Django template system:
    use `msg` and optionally `textmsg` as template context (dict)
    and define `template` and optionally `texttemplate` variables.
    otherwise msg and textmsg variables are used as html and text message
    sources.

    if you want to use images in html message, define physical paths and ids
    in tuples. (image paths are relative to MEDIA_ROOT)
    example:
    images=(('email_images/logo.gif','img1'),('email_images/footer.gif','img2'))
    and use them in html like this:
    <img src="cid:img1"> ... <img src="cid:img2">
    """
    html = render(msg, template)
    if texttemplate or textmsg:
        text = render((textmsg or msg), texttemplate)
    else:
        # No explicit text part: derive one from the sanitized HTML.
        text = html2text(_sanitizeHTML(html, charset))
    msgRoot = MIMEMultipart('related')
    msgRoot['Subject'] = sbj
    msgRoot['From'] = named(sender, sender_name)
    msgRoot['To'] = named(recip, recip_name)
    msgRoot.preamble = 'This is a multi-part message in MIME format.'
    msgAlternative = MIMEMultipart('alternative')
    msgRoot.attach(msgAlternative)
    # Text first, HTML last: clients prefer the last alternative part.
    msgAlternative.attach(MIMEText(text, _charset=charset))
    msgAlternative.attach(MIMEText(html, 'html', _charset=charset))
    for img in images:
        # Context manager closes the file even if read()/MIMEImage raises
        # (the original leaked the handle on error).
        with open(settings.MEDIA_ROOT + img[0], 'rb') as fp:
            msgImage = MIMEImage(fp.read())
        msgImage.add_header('Content-ID', '<' + img[1] + '>')
        msgRoot.attach(msgImage)
    smtp = SMTP()
    smtp.connect(smtp_server)
    try:
        if smtp_user:
            smtp.login(smtp_user, smtp_pass)
        smtp.sendmail(sender, recip, msgRoot.as_string())
    finally:
        # Always close the connection, even when login/sendmail fails
        # (the original left the socket open on error).
        smtp.quit()
def send_html_mail_nt(subject, sender=settings.DEFAULT_FROM_EMAIL, recip="",
                      context=None, html_template="", text_template="",
                      sender_name="", html_content="", text_content="",
                      recip_list=None, sender_formatted=""):
    """Prepare HTML/text bodies and open an authenticated SMTP connection.

    Bodies come either from rendering ``html_template``/``text_template``
    with ``context`` or directly from ``html_content``/``text_content``;
    a missing text body is derived from the sanitized HTML.

    NOTE(review): after connecting and logging in, this function returns
    without building or sending any message -- it appears unfinished or
    truncated.  Confirm against callers before relying on it.
    """
    # Local imports keep these optional dependencies off the module path
    # until the function is actually used.
    from stripogram import html2text
    from feedparser import _sanitizeHTML
    if not context:
        context = {}
    if html_template:
        html = render(context, html_template)
    else:
        html = html_content
    if text_template:
        text = render(context, text_template)
    else:
        text = text_content
    if not text:
        # NOTE(review): _sanitizeHTML is called with two arguments here
        # (feedparser 4.x style); feedparser 5.x also takes a mimetype.
        text = html2text(_sanitizeHTML(html, charset))
    if not recip_list:
        recip_list = []
    if recip:
        recip_list.append(recip)
    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD)
    except Exception, e:
        # Best-effort: connection problems are printed and swallowed.
        print e
        return
def sanitise_html(html, baseurl, inline, config, type):
    """Attempt to turn arbitrary feed-provided HTML into something
    suitable for safe inclusion into the rawdog output. The inline
    parameter says whether to expect a fragment of inline text, or a
    sequence of block-level elements."""
    if html is None:
        return None
    # Turn character references into a form the sanitizer can handle.
    html = encode_references(html)
    # sgmllib handles "<br/>/" as a SHORTTAG; this workaround from
    # feedparser.
    html = re.sub(r'(\S)/>', r'\1 />', html)
    # Make relative links absolute, then strip unsafe markup, both via
    # feedparser's private helpers.
    html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type)
    html = feedparser._sanitizeHTML(html, "UTF-8", type)
    if not inline and config["blocklevelhtml"]:
        # If we're after some block-level HTML and the HTML doesn't
        # start with a block-level element, then insert a <p> tag
        # before it. This still fails when the HTML contains text, then
        # a block-level element, then more text, but it's better than
        # nothing.
        if block_level_re.match(html) is None:
            html = "<p>" + html
    if config["tidyhtml"]:
        # Optional cleanup pass through mx.Tidy; plugins may adjust the
        # tidy arguments first.
        import mx.Tidy
        args = { "wrap": 0, "numeric_entities": 1 }
        plugins.call_hook("mxtidy_args", config, args, baseurl, inline)
        output = mx.Tidy.tidy(html, None, None, **args)[2]
        # Keep only the body content from tidy's full-document output.
        html = output[output.find("<body>") + 6
                      : output.rfind("</body>")].strip()
    html = html.decode("UTF-8")
    # Let plugins post-process the cleaned HTML before returning it.
    box = plugins.Box(html)
    plugins.call_hook("clean_html", config, box, baseurl, inline)
    return box.value
def send_html_mail_nt(
        subject, sender=settings.DEFAULT_FROM_EMAIL, recip="", context=None,
        html_template="", text_template="", sender_name="",
        html_content="", text_content="", recip_list=None,
        sender_formatted=""
):
    """Prepare HTML/text bodies and open an authenticated SMTP connection.

    Bodies come either from rendering ``html_template``/``text_template``
    with ``context`` or directly from ``html_content``/``text_content``;
    a missing text body is derived from the sanitized HTML.

    NOTE(review): after connecting and logging in, this function returns
    without building or sending any message -- it appears unfinished or
    truncated.  Confirm against callers before relying on it.
    """
    # Local imports keep these optional dependencies off the module path
    # until the function is actually used.
    from stripogram import html2text
    from feedparser import _sanitizeHTML
    if not context:
        context = {}
    if html_template:
        html = render(context, html_template)
    else:
        html = html_content
    if text_template:
        text = render(context, text_template)
    else:
        text = text_content
    if not text:
        # NOTE(review): _sanitizeHTML is called with two arguments here
        # (feedparser 4.x style); feedparser 5.x also takes a mimetype.
        text = html2text(_sanitizeHTML(html,charset))
    if not recip_list:
        recip_list = []
    if recip:
        recip_list.append(recip)
    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(
                settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD
            )
    except Exception, e:
        # Best-effort: connection problems are printed and swallowed.
        print e
        return
def home(request):
    """Django view: refresh posts for every watched URL via the Google
    Feeds API, then render the home page with all posts, newest first.

    For each WatchedUrl, the feed is fetched as JSON, each entry is
    date-parsed and sanitized, and new entries are stored as Post rows;
    known links are skipped, and the per-feed import stops early after
    DUPLICATE_POSTS_THRESHOLD consecutive duplicates.
    """
    #Get JSON data of RSS feed:
    watched_urls = WatchedUrl.objects.all() #need to get the full object
    for each_wu in watched_urls:
        #Encode the URI so that the key=value pairs don't get parsed in the Google url:
        url = urllib.quote(each_wu.url)
        #TODO: Differentiate between first import and subsequent imports. We do not
        # need num=-1 and scoring=h for subsequent imports.
        gfeed_url = 'http://ajax.googleapis.com/ajax/services/feed/load?q=' + \
            url + \
            '&v=1.0&num=-1&scoring=h' + \
            '&key=' + settings.GOOGLE_FEEDS_API_KEY
        #The above key is for localhost only.
        json_result = urllib.urlopen(gfeed_url)
        #Parse the JSON string into python data structures:
        result = simplejson.loads(json_result.read()) #Use read() to return string
        entries = result['responseData']['feed']['entries']
        skip_count = 0
        for entry in entries:
            #Parse the date. We remove the last part which is the time offset since
            #python's support for parsing it is highly variable:
            temp = entry['publishedDate'].split()
            date_no_offset = ' '.join(temp[0:-1]) #remove the last field
            dt = datetime.datetime.strptime(date_no_offset,
                                            '%a, %d %b %Y %H:%M:%S')
            #Sanitize the HTML
            #Not necessarily needed. We can trust that both CL and Google Feeds do
            #some level of sanitization.
            #NOTE(review): two-argument _sanitizeHTML is the feedparser 4.x
            #signature; feedparser 5.x also takes a mimetype argument.
            entry['title'] = feedparser._sanitizeHTML(entry['title'], 'utf-8')
            entry['content'] = feedparser._sanitizeHTML(entry['content'], 'utf-8')
            #Put in database. To prevent duplicates, try to get the url first from
            #database:
            try:
                post = Post.objects.get(link = entry['link'])
                #If the post already exists, then skip it.
                skip_count += 1
                if skip_count >= settings.DUPLICATE_POSTS_THRESHOLD:
                    break #we don't need to keep updating anymore
                continue
            except Post.MultipleObjectsReturned:
                #Technically, this shouldn't happen if other code works
                continue #skip creating new entry
            except Post.DoesNotExist:
                #Create a new entry
                post = Post()
                post.watched_url = each_wu
                post.date = dt
                post.title = entry['title']
                #post.content = pickle.dumps(entry)
                post.content = entry['content']
                post.link = entry['link']
                post.save()
    #Now select the entries in chronological order
    entries = Post.objects.order_by('-date') #DESC order
    #Create add url form:
    add_url_form = AddURLForm()
    return render_to_response('home.html', {'entries': entries,
                                            'add_url_form': add_url_form,
                                            })
def sanitize(value):
    """Return *value* cleaned by feedparser's private HTML sanitizer
    (UTF-8 input, treated as text/html)."""
    cleaned = feedparser._sanitizeHTML(value, 'UTF-8', 'text/html')
    return cleaned
def parse_feed_json(source_feed, feed_content, interval, response):
    """Parse a JSON Feed document and store new/updated posts.

    Returns a ``(ok, changed, interval)`` tuple: ``ok`` is False on parse
    failure, empty feed, or expiry; ``changed`` is True when at least one
    new Post was created; ``interval`` is the (possibly backed-off) poll
    interval.  Progress is written to ``response``.
    """
    ok = True
    changed = False
    try:
        f = json.loads(feed_content)
        entries = f['items']
        if len(entries):
            source_feed.last_success = datetime.datetime.utcnow().replace(tzinfo=utc) #in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            # Back off polling for empty feeds.
            interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        interval += 120
        ok = False
    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now interval to max
            interval = (24*3*60)
            source_feed.last_result = "This feed has expired"
            return (False,False,interval)
        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = f["title"]
        except Exception as ex:
            pass
        #response.write(entries)
        entries.reverse() # Entries are typically in reverse chronological order - put them in right order
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"] # prefer html over text
            body = fix_relative(body,source_feed.site_url)
            # GUID preference: explicit id, then url, then an md5 of the body.
            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()
            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                response.write("EXISTING " + guid + "\n")
            except Exception as ex:
                response.write("NEW " + guid + "\n")
                p = Post(index=0)
                p.found = datetime.datetime.utcnow().replace(tzinfo=utc)
                changed = True
                p.source = source_feed
            try:
                title = e["title"]
            except Exception as ex:
                title = ""
            # borrow the RSS parser's sanitizer
            # NOTE(review): two-argument _sanitizeHTML is the feedparser
            # 4.x signature; 5.x also takes a mimetype argument.
            body = feedparser._sanitizeHTML(body, "utf-8") # TODO: validate charset ??
            title = feedparser._sanitizeHTML(title, "utf-8") # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates
            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''
            p.title = title
            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                response.write("CREATED ERROR")
                p.created = datetime.datetime.utcnow().replace(tzinfo=utc)
            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""
            try:
                p.body = body
                p.save()
                # response.write(p.body)
            except Exception as ex:
                #response.write(str(sys.exc_info()[0]))
                response.write("\nSave error for post:" + str(sys.exc_info()[0]))
                # sys.exc_traceback is Python-2-only.
                traceback.print_tb(sys.exc_traceback,file=response)
    return (ok,changed,interval)
def parse_feed_json(source_feed, feed_content, output):
    """Parse a JSON Feed document and sync posts, enclosures and tags.

    Returns ``(ok, changed)``: ``ok`` is False on parse failure, empty
    feed, or expiry (expiry returns a 3-tuple including the interval);
    ``changed`` is True when at least one new Post was created.
    Progress and errors are written to ``output``.
    """
    ok = True
    changed = False
    try:
        f = json.loads(feed_content)
        entries = f["items"]
        if len(entries):
            source_feed.last_success = (
                timezone.now()
            )  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            # Back off polling for empty/broken feeds.
            source_feed.interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False
    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)
        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = update_source_name(source_feed.name, f["title"])
        except Exception as ex:
            pass
        if "description" in f:
            _customize_sanitizer(parser)
            source_feed.description = parser._sanitizeHTML(
                f["description"], "utf-8", "text/html")
        _customize_sanitizer(parser)
        source_feed.name = update_source_name(
            source_feed.name,
            parser._sanitizeHTML(source_feed.name, "utf-8", "text/html"),
        )
        if "icon" in f:
            source_feed.image_url = f["icon"]
        # output.write(entries)
        entries.reverse(
        )  # Entries are typically in reverse chronological order - put them in right order
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text
            body = fix_relative(body, source_feed.site_url)
            # GUID preference: explicit id, then url, then md5 of the body.
            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()
            try:
                p = Post.objects.filter(source=source_feed).filter(
                    guid=guid)[0]
                output.write("EXISTING " + guid + "\n")
            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed
            try:
                title = e["title"]
            except Exception as ex:
                title = ""
            # borrow the RSS parser's sanitizer
            _customize_sanitizer(parser)
            body = parser._sanitizeHTML(
                body, "utf-8", "text/html")  # TODO: validate charset ??
            _customize_sanitizer(parser)
            title = parser._sanitizeHTML(
                title, "utf-8", "text/html")  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates
            if "banner_image" in e:
                p.image_url = e["banner_image"]
            if "image" in e:
                p.image_url = e["image"]
            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ""
            p.title = title
            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                output.write("CREATED ERROR")
                p.created = timezone.now()
            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""
            p.save()
            # Reconcile stored enclosures with the entry's attachments.
            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                try:
                                    ee.length = int(pe["size_in_bytes"])
                                except:
                                    ee.length = 0
                                try:
                                    file_type = pe["mime_type"]
                                except:
                                    file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe
                                ee.type = file_type
                                ee.save()
                                break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)
                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            if pe["url"] not in seen_files:
                                try:
                                    length = int(pe["size_in_bytes"])
                                except:
                                    length = 0
                                try:
                                    filetype = pe["mime_type"]
                                except:
                                    filetype = "audio/mpeg"
                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=filetype)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))
            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)
            try:
                if "tags" in e:
                    for t in e["tags"]:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                # NOTE(review): if the exception fires before the loop
                # binds ``tag``, this f-string raises NameError/UnboundLocalError.
                output.write(f"couldn't add tag {tag} to post {p}")
    return (ok, changed)
def sanitize_html(html):
    """Pass-through "sanitizer": returns *html* completely unchanged.

    The widget mechanism requires iframes to survive, so running the
    markup through feedparser._sanitizeHTML(html, 'utf-8') is not yet
    possible; until that changes this function is deliberately a no-op.
    """
    return html
def parse_feed_json(source_feed, feed_content, output):
    """Parse a JSON Feed document and sync posts and enclosures.

    Returns ``(ok, changed)``: ``ok`` is False on parse failure, empty
    feed, or expiry (expiry returns a 3-tuple including the interval);
    ``changed`` is True when at least one new Post was created.
    Progress goes to the ``logging`` module; ``output`` is unused here.
    """
    ok = True
    changed = False
    try:
        f = json.loads(feed_content)
        entries = f['items']
        if entries:
            source_feed.last_success = timezone.now(
            )  #in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            # Back off polling for empty/broken feeds.
            source_feed.interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False
    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = (24 * 3 * 60)
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)
        try:
            source_feed.site_url = f["home_page_url"]
            if not source_feed.name:
                source_feed.name = f["title"]
        except Exception as ex:
            pass
        if "description" in f:
            _customize_sanitizer(feedparser)
            source_feed.description = feedparser._sanitizeHTML(
                f["description"], "utf-8", 'text/html')
        _customize_sanitizer(feedparser)
        if not source_feed.name:
            source_feed.name = feedparser._sanitizeHTML(
                source_feed.name, "utf-8", 'text/html')
        if "icon" in f:
            source_feed.image_url = f["icon"]
        entries.reverse(
        )  # Entries are typically in reverse chronological order - put them in right order
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text
            body = fix_relative(body, source_feed.site_url)
            # GUID preference: explicit id, then url, then md5 of the body.
            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()
            try:
                p = Post.objects.filter(source=source_feed).filter(
                    guid=guid)[0]
                logging.info("EXISTING: %s", guid)
            except Exception as ex:
                logging.info("Creating new post %s.", guid)
                p = Post(index=0, body=' ')
                p.found = timezone.now()
                changed = True
                p.source = source_feed
            try:
                title = e["title"]
            except Exception as ex:
                title = ""
            # borrow the RSS parser's sanitizer
            _customize_sanitizer(feedparser)
            body = feedparser._sanitizeHTML(
                body, "utf-8", 'text/html')  # TODO: validate charset ??
            _customize_sanitizer(feedparser)
            title = feedparser._sanitizeHTML(
                title, "utf-8", 'text/html')  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates
            if "banner_image" in e:
                p.image_url = e["banner_image"]
            if "image" in e:
                p.image_url = e["image"]
            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''
            p.title = title
            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                logging.exception('Unable to parse published date.')
                p.created = timezone.now()
            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""
            p.save()
            # Reconcile stored enclosures with the entry's attachments.
            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                ee.length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee.type = typ
                                ee.save()
                                break
                    # DANGEROUS! This deletes everything if a glitch in the feed removes enclosures.
                    # if not found_enclosure:
                    #     ee.delete()
                    seen_files.append(ee.href)
                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            # Since many RSS feeds embed trackers into their URL that constantly change, yet almost always only include a single enclosure,
                            # we'll only create a new enclosure when we see a new url if there are no enclosure records created yet.
                            # This is a most robust way of preventing logical duplicates due to tracker URL changes then by trying to predict and strip out
                            # all known tracker prefixes.
                            if pe["url"] not in seen_files and not p.enclosures.all(
                            ).exists():
                                length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=typ)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                logging.exception("No enclosures")
            try:
                p.body = body
                p.save()
            except Exception as ex:
                logging.exception('Unable to save body A2.')
    return (ok, changed)
def save(self, *args, **kwargs):
    """Derive the plain-text template from the HTML one when it is
    missing, then delegate to the parent save()."""
    if not self.texttemplate:
        cleaned = _sanitizeHTML(self.htmltemplate, CHARSET)
        self.texttemplate = html2text(cleaned)
    super(Letter, self).save(*args, **kwargs)