def sanitize_html(answer):
    """Sanitize a student response's HTML to block malicious script
    injection or other unwanted content.

    answer - any string
    return - the cleaned string, or the input unchanged if cleaning fails
    """
    try:
        linked = autolink_html(answer)
        cleaner = WhiteListCleaner(
            style=True,
            links=True,
            add_nofollow=False,
            page_structure=True,
            safe_attrs_only=True,
            whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',),
        )
        cleaned = cleaner.clean_html(linked)
        # Unwrap the outer <p> the cleaner adds, then preserve line breaks.
        cleaned = re.sub(r'^<p>', '', cleaned)
        cleaned = re.sub(r'</p>$', '', cleaned)
        cleaned = re.sub("\n", "<br/>", cleaned)
    except Exception:
        # Best-effort: fall back to the raw input when parsing fails.
        cleaned = answer
    return cleaned
def clean_comment(self):
    """Sanitize the submitted comment and hyperlink URLs and emails."""
    comment = self.cleaned_data['comment']
    comment = clean_html(comment)
    comment = autolink_html(comment)
    comment = autolink_email(comment)
    self.cleaned_data['comment'] = comment
    return comment
def clean_content(self):
    """
    Do our usual HTML cleanup. Do we want to mangle the markup field
    to always be "html"?
    """
    content = autolink_html(clean_html(self.cleaned_data['content']))
    self.cleaned_data['content'] = content
    return content
def sanitize(html):
    """Clean the given HTML, then linkify bare email addresses and
    @twitter handles in the text (non-tag) portions only.

    Returns the input unchanged when it is empty/None.
    """
    if not html:
        return html
    cleaner = Cleaner(allow_tags=_safe_tags, safe_attrs_only=True,
                      safe_attrs=_safe_attrs, remove_unknown_tags=False)
    html = autolink_html(cleaner.clean_html(html))
    # Split into alternating text / "<...>" tag tokens so we never rewrite
    # content that is already inside a tag or inside an <a>...</a> pair.
    parts = re.split('(<.*?>)', html)
    output = ''
    in_a_tag = False
    for part in parts:
        if not len(part):
            continue
        is_tag = part[0] == '<'
        if is_tag or in_a_tag:
            # Pass tags and anchor body text through untouched, tracking
            # whether we are inside an <a>...</a> element.
            output += part
            if part[0:2].lower() == '<a':
                in_a_tag = True
            elif part[0:3].lower() == '</a':
                in_a_tag = False
            continue
        # Turn bare email addresses into mailto: links.
        part = re.sub("([a-zA-Z0-9_\\-+\\.\']*[a-zA-Z0-9]@[0-9a-zA-Z\\-\\.]+\\.[a-zA-Z]{2,})", '<a href="mailto:\\1">\\1</a>', part)
        # After linking up emails, only look for twitter in the remaining parts
        sub_parts = re.split('(<.*?>)', part)
        part = ''
        for sub_part in sub_parts:
            part += re.sub("(?<![a-zA-Z0-9])@([0-9a-zA-Z_]{1,15})", '<a href="https://twitter.com/\\1">@\\1</a>', sub_part)
        output += part
    return output
def sanitize_html(html_text: str) -> str:
    """Strip dangerous tags from the HTML and convert urls into anchors."""
    result = str(autolink_html(clean_html(html_text)))
    # clean_html wraps bare text in a single <p>...</p> element; when the
    # output contains exactly that one tag pair, unwrap it so users who
    # entered plain text are not confused by markup they never wrote.
    if result.count("<") == 2:
        result = re.sub("</?p>", "", result)
    return result
def clean_up_html(html, method='html'):
    """Sanitize *html*: linkify URLs, demote <h1> headings to <h2>, and
    force links to open in a new tab. Returns serialized markup."""
    linked = autolink_html(html, link_regexes=_link_regexes)
    doc = lxml.html.fromstring(cleaner.clean_html(linked))
    for heading in doc.findall('h1'):
        heading.tag = 'h2'
    for anchor in doc.cssselect('a'):
        anchor.attrib['target'] = '_blank'
    return lxml.html.tostring(doc, encoding='utf-8', method=method)
def cleanup_chat_text(html):
    """Auto-link URLs in chat *html* and make every anchor open in a new
    tab; returns the markup as a unicode string."""
    linked = autolink_html(html, link_regexes=_link_regexes)
    doc = lxml.html.fromstring(linked)
    for anchor in doc.cssselect('a'):
        anchor.attrib['target'] = '_blank'
    return lxml.html.tostring(doc).decode('utf-8')
def clean_input(comment):
    """Linkify and sanitize a comment, converting newlines to <br/>."""
    data = comment
    # Only auto-link when the author did not already include anchors.
    if 'href' not in data:
        data = autolink_html(data, avoid_elements=['a'])
    tag_cleaner = Cleaner(add_nofollow=True, allow_tags=ALLOWED_TAGS,
                          remove_unknown_tags=False)
    return tag_cleaner.clean_html(data).replace('\n', '<br/>')
def clean_body(self):
    """Validate and sanitize the HTML body, auto-linking plain URLs."""
    # Additional options at http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    body = self.cleaned_data.get('body', '')
    body = autolink_html(clean_html(body))
    self.cleaned_data['body'] = body
    return body
def html_clean(body):
    """Sanitize HTML content and hyperlink both URLs and email addresses."""
    # Additional options at http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    return autolink_email(autolink_html(clean_html(body)))
def sanitize_html(answer):
    """Sanitize a student response to prevent malicious script injection.

    answer - any string
    return - the cleaned string, or the input unchanged if cleaning fails
    """
    try:
        answer = autolink_html(answer)
        cleaner = Cleaner(style=True, links=True, add_nofollow=False,
                          page_structure=True, safe_attrs_only=True,
                          host_whitelist=open_ended_image_submission.TRUSTED_IMAGE_DOMAINS,
                          whitelist_tags=set(['embed', 'iframe', 'a', 'img']))
        clean_html = cleaner.clean_html(answer)
        # Unwrap the outer <p> tag the cleaner adds around plain text.
        clean_html = re.sub(r'</p>$', '', re.sub(r'^<p>', '', clean_html))
    except Exception:
        # BUG FIX: previously a bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt. Best-effort fallback to input.
        clean_html = answer
    return clean_html
def send_mail(subject=None, txtMessage=None, htmlMessage=None, fromemail=None,
              recipients=None, shortname=None, priority=None, context=None,
              use_template=True, lang='en'):
    """Sanitize, render and queue an Email record.

    Derives htmlMessage from txtMessage (and vice versa) when one is
    missing, cleans/auto-links the HTML, and optionally wraps both in
    the email templates before creating the Email row.

    BUG FIX: `context` previously defaulted to a shared mutable dict.
    """
    if context is None:
        context = {}
    # try to be backwards-compatible with an older argument order
    if htmlMessage and not recipients:
        recipients = fromemail
        fromemail = htmlMessage
        htmlMessage = None
    if not htmlMessage:
        htmlMessage = txtMessage.replace("\n", "<br/>")
    htmlMessage = clean_html(htmlMessage)
    htmlMessage = autolink_html(htmlMessage)
    if not txtMessage:
        txtMessage = htmlMessage
        context['do_text_conversion'] = True
        # TODO: do a fancy strip tags thing
    subject = force_unicode(subject)
    txtMessage = force_unicode(txtMessage)
    htmlMessage = force_unicode(htmlMessage)
    if use_template:
        if not context.get('do_text_conversion', None):
            context['do_text_conversion'] = False
        context['body'] = htmlMessage
        htmlMessage = loader.get_template("email_template.html").render(Context(context))
        context['body'] = txtMessage
        txtMessage = loader.get_template("email_template.txt").render(Context(context))
    recips = ",".join(recipients)
    if shortname:
        shortname = shortname.lower()
    # BUG FIX: both branches of the old if/else created identical records;
    # collapsed into a single create call.
    Email.objects.create(recipients=recips, shortName=shortname,
                         sender=fromemail, subject=subject,
                         textMessage=txtMessage, htmlMessage=htmlMessage,
                         lang=lang)
def fmt_part(self, part):
    """Render one parsed message part as an ('html', markup) tuple.

    Escapes the part's data, adds PGP verify controls for signed parts,
    and auto-links URLs in the resulting paragraph.
    """
    what = [part['type'], escape_html(part['data'])]
    if what[0] == 'pgpbeginsigned':
        # Offer a button to fetch the signer's key and verify the signature.
        what[1] = ('<input type="submit" name="gpg_recvkey"'
                   ' value="Get PGP key and Verify">' + what[1])
    if what[0] in ('pgpsignature', 'pgpbeginsigned'):
        # Pull the key ID out of the (already escaped) signature text, if
        # present, so the verify form knows which key to request.
        key_id = re.search('key ID ([0-9A-Fa-f]+)', what[1])
        if key_id:
            what[1] += ('<input type="hidden" name="gpg_key_id" value="0x%s">'
                        ) % key_id.group(1)
    return ('html', autolink_html('<p class="%s">%s</p>' % tuple(what)))
def save(self, force_insert=False, force_update=False):
    """Sanitize the body, attach the parent group, persist, and emit a
    post_save signal on behalf of the Topic model."""
    # validate HTML content; additional options at
    # http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    self.body = autolink_html(clean_html(self.body))
    # set parent group
    self.parent_group = BaseGroup.objects.get(id=self.object_id)
    super(GroupTopic, self).save(force_insert, force_update)
    post_save.send(sender=Topic, instance=GroupTopic.objects.get(id=self.id))
def save(self, force_insert=False, force_update=False):
    """Attach the parent group, sanitize any content, and persist."""
    # set parent group
    self.parent_group = BaseGroup.objects.get(id=self.object_id)
    # validate HTML content; additional options at
    # http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    if self.content and self.content.strip():
        self.content = autolink_html(clean_html(self.content))
    super(Whiteboard, self).save(force_insert, force_update)
def find_links_in_message(text, name, avatar):
    """Render a chat message: extract link previews, sanitize and
    auto-link the text, linkify #hashtag and $room references, and
    return the rendered user_message.html fragment.
    """
    # Collect every URL lxml can find after auto-linking the raw text.
    link_list = [i for i in iterlinks(autolink_html(text))]
    extracted_links = list()
    for link in link_list:
        extract_dict = extract_link(link)
        if extract_dict:
            extracted_links.append(extract_dict)
    txt = autolink_html(cleaner.clean_html(text))
    # Linkify #hashtags and $room names (Latin and Cyrillic characters).
    txt = re.sub(ur'(#[а-яА-ЯёЁA-Za-z0-9-]+)', add_user_link, txt)
    txt = re.sub(ur'(\$[а-яА-ЯёЁA-Za-z0-9-]+)', add_room_link, txt)
    user_info = render_template(
        'user_message.html',
        avatar=avatar,
        name=name,
        time=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S Z"),
        # sid: message id derived from author name + current timestamp.
        sid=md5(name + datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S Z")).hexdigest(),
        txt=Markup(txt),
        extracted_links=extracted_links
    )
    return user_info
def post_to_wordpress(self, url, blog, author, hour, format=True):
    """
    SIMPLISTIC WAY TO GET WHAT COULD BE AN EMAIL ONTO A WORDPRESS BLOG
    REQUIRES wp-cli https://github.com/wp-cli/wp-cli

    url    -- target site URL; falls back to the SITES config when falsy
    blog   -- blog path appended to the URL (multisite --url flag)
    author -- wp-cli user identifier; resolved to a numeric ID below
    hour   -- object supplying tm_hour/tm_min (struct_time-like) for the
              scheduled publication time
    format -- when True, run self.prepare_formatting() first
    """
    if format:
        self.prepare_formatting()
    path_to_wordpress = config_get_section_attribute('SITES', 'path_to_docroot', required=True)
    path_to_wpcli = config_get_section_attribute('SITES', 'path_to_wpcli', required=True)
    if not url:
        wordpress_url = config_get_section_attribute('SITES', 'url', required=True)
    else:
        wordpress_url = url
    # Get the user information first (wp-cli emits JSON we can parse).
    command = "{} --path={} user get {} --format=json ".format(path_to_wpcli, path_to_wordpress, author)
    to_call = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    result, err = to_call.communicate()
    import json
    user = json.loads(result.decode())
    # Now clean up the html, add links if not there and remove errant tags, also clean up for passing on
    try:
        from lxml.html.clean import Cleaner
        from lxml.html.clean import autolink_html
    except ImportError:
        click.secho('We need lxml!', fg='red')
    content = self.get_html()
    cleaner = Cleaner(remove_tags=['p', 'div'])  # Moodle's editor has loads of lonely p and div tags
    content = cleaner.clean_html(content)
    content = autolink_html(content)
    replace_apostrophes = "'\\''"
    content = content.replace("'", replace_apostrophes).replace('\r', ' ')  # escape apostrophes for bash
    # NOTE(review): month/day/minute are not zero-padded here — confirm
    # wp-cli accepts e.g. '2020-1-5 9:5:00' as --post_date.
    date_as_string = '{}-{}-{} {}:{}:00'.format(self.date.year, self.date.month, self.date.day, hour.tm_hour, hour.tm_min)
    d = {
        'title': self.get_subject(),  # remove the 'Student Notices for' part
        'author': user['ID'],
        'content': content,
        'date': date_as_string,
        'blog': blog,
        'url': wordpress_url,
        'path_to_wpcli': path_to_wpcli,
        'path_to_docroot': path_to_wordpress
    }
    # Schedule the post for the future via wp-cli; runs through the shell,
    # relying on the apostrophe escaping performed above.
    command = """{path_to_wpcli} post create --path={path_to_docroot} --post_type=post --post_title='{title}' --post_content='{content}' --post_author={author} --post_status=future --post_date='{date}' --url={url}/{blog}""".format(**d)
    subprocess.call(command, shell=True)
def clean_description(self):
    """Sanitize the description and hyperlink URLs and email addresses."""
    body = self.cleaned_data.get('description', '')
    if body:
        # validate HTML content; additional options at
        # http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
        body = autolink_email(autolink_html(clean_html(body)))
        self.cleaned_data['description'] = body
    return body
def save(self, force_insert=False, force_update=False):
    '''Generate the slug from the title, set the parent group, and
    sanitize the description before saving.'''
    self.slug = slugify(self.title)
    # set the parent_group property
    # (to be honest, we could probably do away with generic foreign keys altogether)
    self.parent_group = BaseGroup.objects.get(id=self.object_id)
    # validate HTML content; additional options at
    # http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    self.description = autolink_html(clean_html(self.description))
    super(Event, self).save(force_insert, force_update)
def display_message(self, email, tree, raw=False, sep='', fd=None):
    """Render a parsed email into the buffered-HTML output.

    raw=True streams the undecoded message source line by line instead.
    """
    if raw:
        # Raw mode: emit the original message text, trying UTF-8 first,
        # then Latin-1, with a placeholder when both fail.
        for line in email.get_file().readlines():
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    line = line.decode('iso-8859-1')
                except:
                    line = '(MAILPILE DECODING FAILED)\n'
            self.say(line, newline='', fd=fd)
    else:
        self.buffered_html.append(('html', '<div class=headers>'))
        for hdr in ('From', 'Subject', 'To', 'Cc'):
            value = email.get(hdr, '')
            if value:
                html = '<b>%s:</b> %s<br>' % (hdr, escape_html(value))
                self.buffered_html.append(('html', html))
        self.buffered_html.append(('html', '</div><br>'))
        if tree['text_parts']:
            self.buffered_html.append(
                ('html', '<div class="message plain">'))
            # '<bogus>' can never equal real part data, so the first part
            # is always emitted; consecutive duplicates are skipped.
            last = '<bogus>'
            for part in tree['text_parts']:
                if part['data'] != last:
                    self.buffered_html.append(self.fmt_part(part))
                    last = part['data']
        else:
            self.buffered_html.append(
                ('html', '<div class="message html">'))
            last = '<bogus>'
            for part in tree['html_parts']:
                if part['data'] != last:
                    self.buffered_html.append(
                        ('html', autolink_html(part['data'])))
                    last = part['data']
        if tree['attachments']:
            self.buffered_html.append(
                ('html', '</div><div class="attachments"><ul>'))
            for att in tree['attachments']:
                desc = (
                    '<a href="./att:%(count)s">Attachment: %(filename)s</a> '
                    '(%(mimetype)s, %(length)s bytes)') % att
                self.buffered_html.append(('html', '<li>%s</li>' % desc))
            self.buffered_html.append(('html', '</ul>'))
        self.buffered_html.append(('html', '</div>'))
def clean_input(comment):
    """Sanitize *comment*: auto-link URLs (unless anchors already exist),
    whitelist tags with nofollow, and convert newlines to <br/>.

    Raises logic.ValidationError when the comment cannot be parsed;
    other errors are logged at debug level and the function returns None.
    """
    try:
        data = comment
        if 'href' not in data:
            data = autolink_html(data, avoid_elements=['a'])
        tag_cleaner = Cleaner(add_nofollow=True, allow_tags=ALLOWED_TAGS,
                              remove_unknown_tags=False)
        return tag_cleaner.clean_html(data).replace('\n', '<br/>')
    except Exception as e:
        if type(e).__name__ == "ParserError":
            raise logic.ValidationError("Comment text is required")
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        log.debug(template.format(type(e).__name__, e.args))
def display_message(self, email, tree, raw=False, sep='', fd=None):
    """Render a parsed email into the buffered-HTML output.

    raw=True streams the undecoded message source line by line instead.
    """
    if raw:
        # Raw mode: decode best-effort (UTF-8, then Latin-1) and stream.
        for line in email.get_file().readlines():
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    line = line.decode('iso-8859-1')
                except:
                    line = '(MAILPILE DECODING FAILED)\n'
            self.say(line, newline='', fd=fd)
    else:
        self.buffered_html.append(('html', '<div class=headers>'))
        for hdr in ('From', 'Subject', 'To', 'Cc'):
            value = email.get(hdr, '')
            if value:
                html = '<b>%s:</b> %s<br>' % (hdr, escape_html(value))
                self.buffered_html.append(('html', html))
        self.buffered_html.append(('html', '</div><br>'))
        if tree['text_parts']:
            self.buffered_html.append(('html', '<div class="message plain">'))
            # Sentinel never matches real data; dedupes consecutive parts.
            last = '<bogus>'
            for part in tree['text_parts']:
                if part['data'] != last:
                    self.buffered_html.append(self.fmt_part(part))
                    last = part['data']
        else:
            self.buffered_html.append(('html', '<div class="message html">'))
            last = '<bogus>'
            for part in tree['html_parts']:
                if part['data'] != last:
                    self.buffered_html.append(('html', autolink_html(part['data'])))
                    last = part['data']
        if tree['attachments']:
            self.buffered_html.append(('html', '</div><div class="attachments"><ul>'))
            for att in tree['attachments']:
                desc = ('<a href="./att:%(count)s">Attachment: %(filename)s</a> '
                        '(%(mimetype)s, %(length)s bytes)') % att
                self.buffered_html.append(('html', '<li>%s</li>' % desc))
            self.buffered_html.append(('html', '</ul>'))
        self.buffered_html.append(('html', '</div>'))
def sanitize(html):
    """Whitelist-clean *html*, auto-link URLs, then linkify bare email
    addresses and @twitter handles in text outside of tags and anchors."""
    if not html:
        return html
    cleaner = Cleaner(allow_tags=_safe_tags, safe_attrs_only=True,
                      safe_attrs=_safe_attrs, remove_unknown_tags=False)
    html = autolink_html(cleaner.clean_html(html))
    pieces = []
    inside_anchor = False
    # re.split with a captured group yields alternating text / "<...>"
    # tag tokens; tag tokens always start with '<'.
    for chunk in re.split('(<.*?>)', html):
        if not chunk:
            continue
        if chunk[0] == '<' or inside_anchor:
            # Tags and anchor body text pass through untouched.
            pieces.append(chunk)
            if chunk[0:2].lower() == '<a':
                inside_anchor = True
            elif chunk[0:3].lower() == '</a':
                inside_anchor = False
            continue
        # Bare email addresses become mailto: links.
        chunk = re.sub(
            "([a-zA-Z0-9_\\-+\\.\']*[a-zA-Z0-9]@[0-9a-zA-Z\\-\\.]+\\.[a-zA-Z]{2,})",
            '<a href="mailto:\\1">\\1</a>', chunk)
        # After linking up emails, only look for twitter in the remaining parts
        linked = []
        for sub_chunk in re.split('(<.*?>)', chunk):
            linked.append(re.sub("(?<![a-zA-Z0-9])@([0-9a-zA-Z_]{1,15})",
                                 '<a href="https://twitter.com/\\1">@\\1</a>',
                                 sub_chunk))
        pieces.append(''.join(linked))
    return ''.join(pieces)
def sanitize_html(answer):
    """
    Take a student response and sanitize the HTML to prevent malicious
    script injection or other unwanted content.

    answer - any string
    return - a cleaned version of the string
    """
    try:
        safe = WhiteListCleaner(
            style=True, links=True, add_nofollow=False,
            page_structure=True, safe_attrs_only=True,
            whitelist_tags=('embed', 'iframe', 'a', 'img', 'br',)
        ).clean_html(autolink_html(answer))
        # Drop the wrapping <p> element and keep line breaks visible.
        safe = re.sub(r'^<p>', '', safe)
        safe = re.sub(r'</p>$', '', safe)
        safe = re.sub("\n", "<br/>", safe)
        return safe
    except Exception:
        # Best-effort: return the raw input when cleaning fails.
        return answer
def send_mail(subject=None, txtMessage=None, htmlMessage=None, fromemail=None,
              recipients=None, shortname=None, priority=None, context=None,
              use_template=True, lang='en', cc=None, bcc=None,
              content_object=None, reply_to=None):
    """Sanitize, render and queue an Email record, optionally linked to a
    content object via a generic foreign key.

    BUG FIXES: `context` previously defaulted to a shared mutable dict;
    `recipients == None` replaced with `is None`; the two identical
    `Email.objects.create` branches are collapsed into one call.
    """
    if context is None:
        context = {}
    # try to be backwards-compatible with an older argument order
    if htmlMessage and recipients is None:
        recipients = fromemail
        fromemail = htmlMessage
        htmlMessage = None
    if not htmlMessage:
        htmlMessage = txtMessage.replace("\n", "<br/>")
    htmlMessage = clean_html(htmlMessage)
    htmlMessage = autolink_html(htmlMessage)
    if not txtMessage:
        txtMessage = htmlMessage
        context['do_text_conversion'] = True
        # TODO: do a fancy strip tags thing
    subject = force_unicode(subject)
    txtMessage = force_unicode(txtMessage)
    htmlMessage = force_unicode(htmlMessage)
    if not context.get('do_text_conversion', None):
        context['do_text_conversion'] = False
    if use_template:
        context['body'] = htmlMessage
        htmlMessage = loader.get_template("email_template.html").render(Context(context))
        context['body'] = txtMessage
        txtMessage = loader.get_template("email_template.txt").render(Context(context))
    else:
        context['body'] = txtMessage
        txtMessage = loader.get_template("email_template_clean.txt").render(Context(context))
    recips = ",".join(recipients)
    cc_string = ",".join(cc) if cc else None
    bcc_string = ",".join(bcc) if bcc else None
    if content_object:
        type_id = ContentType.objects.get_for_model(content_object)
        message_id = '*****@*****.**' % (int(round(time.time())), type_id.id, content_object.id)
    else:
        message_id = '*****@*****.**' % int(round(time.time()))
    if shortname:
        shortname = shortname.lower()
    e = Email.objects.create(recipients=recips, shortName=shortname,
                             sender=fromemail, subject=subject,
                             textMessage=txtMessage, htmlMessage=htmlMessage,
                             lang=lang, cc=cc_string, bcc=bcc_string,
                             reply_to=reply_to, message_id=message_id)
    if content_object:
        e.content_object = content_object
        e.save()
def send_mail(subject=None, txtMessage=None, htmlMessage=None, fromemail=None,
              recipients=None, shortname=None, priority=None, context=None,
              use_template=True, lang='en', cc=None, bcc=None):
    """Sanitize, render and queue an Email record with optional cc/bcc.

    BUG FIXES: `context` previously defaulted to a shared mutable dict;
    `recipients == None` replaced with `is None`; the two identical
    `Email.objects.create` branches are collapsed into one call.
    """
    if context is None:
        context = {}
    # try to be backwards-compatible with an older argument order
    if htmlMessage and recipients is None:
        recipients = fromemail
        fromemail = htmlMessage
        htmlMessage = None
    if not htmlMessage:
        htmlMessage = txtMessage.replace("\n", "<br/>")
    htmlMessage = clean_html(htmlMessage)
    htmlMessage = autolink_html(htmlMessage)
    if not txtMessage:
        txtMessage = htmlMessage
        context['do_text_conversion'] = True
        # TODO: do a fancy strip tags thing
    subject = force_unicode(subject)
    txtMessage = force_unicode(txtMessage)
    htmlMessage = force_unicode(htmlMessage)
    if not context.get('do_text_conversion', None):
        context['do_text_conversion'] = False
    if use_template:
        context['body'] = htmlMessage
        htmlMessage = loader.get_template("email_template.html").render(
            Context(context))
        context['body'] = txtMessage
        txtMessage = loader.get_template("email_template.txt").render(
            Context(context))
    else:
        context['body'] = txtMessage
        txtMessage = loader.get_template("email_template_clean.txt").render(
            Context(context))
    recips = ",".join(recipients)
    cc_string = ",".join(cc) if cc else None
    bcc_string = ",".join(bcc) if bcc else None
    if shortname:
        shortname = shortname.lower()
    Email.objects.create(recipients=recips, shortName=shortname,
                         sender=fromemail, subject=subject,
                         textMessage=txtMessage, htmlMessage=htmlMessage,
                         lang=lang, cc=cc_string, bcc=bcc_string)
def cleanhtml(html='', cleaner=None):
    """Parse *html* with the soup parser, clean it (default: the module
    sanitizer), auto-link URLs, and return serialized markup."""
    doc = soupparser.fromstring(html)
    active_cleaner = cleaner if cleaner else sanitizer
    return lxml.html.tostring(autolink_html(active_cleaner.clean_html(doc)))
def clean_html(html, host_whitelist=()):
    """Bleach *html* (allowing embedded hosts only from *host_whitelist*)
    and convert plain URLs into anchors."""
    bleacher = Bleacher(host_whitelist=host_whitelist)
    # Wrap in <body> so fragment input parses as a complete document body.
    cleaned = bleacher.clean_html('<body>' + html + '</body>')
    return clean.autolink_html(cleaned)
def autolink(html):
    """Auto-link URLs in *html*, forcing generated anchors to open in a
    new tab; empty/None input is returned unchanged."""
    if not html:
        return html
    return autolink_reg.sub(r'<a \1 target="_blank">', autolink_html(html))
def clean_comment(self):
    """Sanitize the comment HTML and convert plain URLs into anchors."""
    cleaned = autolink_html(clean_html(self.cleaned_data['comment']))
    self.cleaned_data['comment'] = cleaned
    return cleaned
def cleanhtml(html='', cleaner=None):
    """Strip control characters, parse and clean *html* (default: the
    module sanitizer), auto-link URLs, and return serialized markup."""
    doc = soupparser.fromstring(remove_control_chars(html))
    active_cleaner = cleaner or sanitizer
    return lxml.html.tostring(autolink_html(active_cleaner.clean_html(doc)))
def cleanhtml(html=''):
    """Parse *html*, run the module-level sanitizer over it, auto-link
    URLs, and return the serialized markup."""
    cleaned = sanitizer.clean_html(soupparser.fromstring(html))
    return lxml.html.tostring(autolink_html(cleaned))
def render(content):
    """Return *content* with plain URLs converted into anchor tags."""
    return autolink_html(content)
def parse_body(msg):
    """Extract an HTML body from an email message, strip quoted reply
    text, sanitize it, and auto-link URLs and email addresses.

    Raises BounceError when no usable body remains.
    """
    body = None
    if msg.is_multipart():
        html = None
        txt = None
        # Prefer a text/html part, falling back to text/plain; looks one
        # level into nested multiparts (e.g. multipart/alternative).
        for part in msg.get_payload():
            if part.is_multipart():
                for part2 in part.get_payload():
                    if part2.get_content_type() == 'text/html':
                        html = part2.get_payload(decode=True)
                    elif part2.get_content_type() == 'text/plain':
                        txt = part2.get_payload(decode=True)
            if part.get_content_type() == 'text/html':
                html = part.get_payload(decode=True)
            elif part.get_content_type() == 'text/plain':
                txt = part.get_payload(decode=True)
        if html:
            body = html
        elif txt:
            body = txt.replace("\n", "<br/>\n")
    else:
        body = msg.get_payload(decode=True)
        body = body.replace("\n", "<br/>\n")
    try:
        # Best-effort decode using the message's declared charset.
        decoder = codecs.getdecoder(msg.get_content_charset())
        body = decoder(body)[0]
    except:
        pass
    # strip out reply text
    # http://stackoverflow.com/questions/278788/parse-email-content-from-quoted-reply may be a better way
    quoting_gmail = r'<div(?:.*)gmail_quote(?:.*)>'  # gmail puts their quotes in <div class="gmail_quote">
    body = re.split(quoting_gmail, body)[0]
    quoting_thunderbird = r'<blockquote(?:.*)cite(?:.*)>'  # thunderbird uses <blockquote type="cite">
    body = re.split(quoting_thunderbird, body)[0]
    quoting_outlook = r'<(?:.*)style(?:.*)border-top: #B5C4DF(?:.*)>'  # outlook is just a pain
    body = re.split(quoting_outlook, body)[0]
    quoting_text = r'<br/>\n*(.*)<br/>\n*(>(.*)<br/>\n*)+[(?:<br/>)\n]*$'  # takes any block of end-of-message >-prefix lines, plus the one line preceeding it
    body = re.sub(quoting_text, '', body)
    if not body:
        raise BounceError("I wasn't able to understand the email you sent; it was in a format that is not supported.")
    # validate HTML content
    # Additional options at http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    body = clean_html(body)
    body = autolink_html(body)
    body = autolink_email(body)
    # TODO: strip out in-reference-to text in replies?
    return body