def scrubstring(string):
    """Sanitize an HTML byte string and return it re-encoded as UTF-8.

    The input is decoded as ASCII when possible, falling back to UTF-8,
    then run through a Scrubber configured to autolink bare URLs.
    """
    from scrubber import Scrubber

    cleaner = Scrubber(autolink=True)
    try:
        decoded = string.decode('ascii')
    except UnicodeDecodeError:
        # Not plain ASCII -- treat the bytes as UTF-8 instead.
        decoded = string.decode('utf-8')
    return cleaner.scrub(decoded).encode('utf-8')
def main():
    # Load the input JSON document: expects a "signal" key (the raw text)
    # and "asets" (annotation sets, each with a "type" and "annots" list).
    inp_json = open(args.inpf).read()
    inp_json = json.loads(inp_json)
    text = inp_json["signal"]

    # Collect the gold PERSON annotations; each annot appears to be a
    # [start, end, ...] character-offset pair into the signal text.
    annots = []
    for aset in inp_json["asets"]:
        if aset["type"] == "PERSON":
            annots = aset["annots"]
    annots.sort(key=lambda k: k[0])

    # Normalise the text.  The CRLF -> "^^\n" widening keeps every CRLF
    # three characters wide -- presumably to match how the annotator
    # counted newlines; TODO confirm against the annotation producer.
    text = text.encode("utf-8").decode("utf-8")
    text = text.strip()
    text = text.replace("\r\n", "^^\n")

    current_annot = (annots[0] if annots else None)
    processed_length = 0  # characters of `text` consumed so far

    for line in text.split('\n'):
        if not line:
            # NOTE(review): empty lines are skipped without advancing
            # processed_length, and split('\n') drops the newline itself,
            # so per-line offsets may drift by one per line -- verify.
            continue
        # Annotations the scrubber itself detects on this line.
        script_annots = Scrubber.dry_clean(line)[1]
        if not current_annot:
            continue
        line_len = len(line)
        line_annots = []
        # Consume every gold annotation that falls within this line,
        # rebasing its offsets to be line-relative.
        while True:
            if not current_annot:
                break
            start = current_annot[0] - processed_length
            end = current_annot[1] - processed_length
            if (start <= line_len and end <= line_len):
                annots.pop(0)
                line_annots.append([start, end])
                current_annot = (annots[0] if annots else None)
            else:
                break
        processed_length += len(line)
        line_annots = merge_consecutive_markings(line_annots)
        script_annots = merge_consecutive_markings(script_annots)
        # check_converage (sic -- name defined elsewhere) compares gold vs
        # scrubber-found markings; presumably it updates the global
        # counters printed below -- confirm.
        check_converage(line, line_annots, script_annots)

    print "Total number of docs: %s" % total_number_of_docs
    print "Docs with no PIIs / all PIIs detected: %s" % docs_with_all_pii_detected
    print "Docs with only partially missed PIIs: %s" % docs_with_partially_detected_pii
    print "Docs with fully missed PIIs: %s" % docs_with_full_missed_pii
import os from scrubber import Scrubber import BeautifulSoup #import BeautifulSoup # initialise the scrubber! all this stuff is overriding scrubber defaults so hack it to bits if you want! scrubber = Scrubber(autolink=False) scrubber.allowed_tags = set(( 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'blockquote', 'br', 'center', 'cite', 'code', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5',
import os
from scrubber import Scrubber
import BeautifulSoup
#import BeautifulSoup

# initialise the scrubber! all this stuff is overriding scrubber defaults
# so hack it to bits if you want!
scrubber = Scrubber(autolink=False)

# Whitelist of tags that survive scrubbing untouched.
scrubber.allowed_tags = set((
    'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'blockquote', 'br',
    'center', 'cite', 'code', 'dd', 'del', 'dfn', 'div', 'dl', 'dt',
    'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins',
    'kbd', 'li', 'ol', 'param', 'pre', 'p', 'q', 's', 'samp', 'small',
    'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td',
    'th', 'thead', 'tr', 'tt', 'ul', 'u', 'var', 'wbr',
))

# Tags stripped entirely, but whose inner content is kept.
scrubber.disallowed_tags_save_content = set((
    'blink', 'body', 'html', 'font',
))

# Attributes allowed to remain on whitelisted tags.
scrubber.allowed_attributes = set((
    'align', 'alt', 'border', 'cite', 'dir', 'height', 'href', 'src',
    'title', 'type', 'width',
    'face', 'size',  # font tags
    'flashvars',  # Not sure about flashvars - if any harm can come from it
    'classid',  # FF needs the classid on object tags for flash
    'name', 'value', 'quality', 'data', 'scale',  # for flash embed param tags, could limit to just param if this is harmful
    'salign', 'align', 'wmode',
))
# Bad attributes: 'allowscriptaccess', 'xmlns', 'target'

# Normalise presentational tags to their semantic equivalents.
scrubber.normalized_tag_replacements = {'b': 'strong', 'i': 'em'}

# any giveaway classes that definitely identify a footer.
class ScrubberTestCase(unittest.TestCase):
    """End-to-end scrubber tests: each (input, expected) pair is run
    through Scrubber.scrub() and the output compared exactly."""

    # (dirty_html, expected_clean_html) pairs.
    tests = (
        ( # Invalid HTML
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            "" if BeautifulSoup.__version__.startswith('3.1') else """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        ( # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        ( # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        ( # No enocoding of pre-encoded urls during autolink:
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        ( # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        ( # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        ( # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""",
            """<br />"""
        ),
        ( # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""
        ),
        ( # Encoded script (decimal)
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Encoded script (hex)
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Test unicode
            u"""Mitä kuuluu""",
            u"""Mitä kuuluu"""
        ),
        ( # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        ( # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""
        ),
        ( # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers <p>foo</p><em>bar</em>"""
        ),
        ( # Stripping empty attributed
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""
        ),
        ( # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        ( # Remove comments
            "Foo <!-- bar -->",
            "Foo "
        ),
        ( # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        ( # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>",
            "Foo"
        ),
        ( # Character entities shouldn't get autolinked
            """http://www.google.com """,
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a> """
        ),
        ( # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        ( # Test mailto: links
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        ( # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""
        ),
        ( # Make keeping content for incomplete tags works
            "<blink><br><br>",
            "<br /><br />"
        ),
    )

    def setUp(self):
        # Fresh default-configured scrubber for every test method.
        self.scrubber = Scrubber()

    def testScrubber(self):
        for html, expected in self.tests:
            # assertEqual replaces the long-deprecated failUnlessEqual
            # alias (removed from unittest in Python 3.12).
            self.assertEqual(self.scrubber.scrub(html), expected)
def setUp(self):
    # Build a fresh, default-configured Scrubber for each test case.
    self.scrubber = Scrubber()
# # Web frontend for CVE -> RHSA report generator. # # Requires CherryPy, Mako, Scrubber, Python <= 2.7 import rhsac import cherrypy import os import re import sqlite3 from mako.template import Template from mako.lookup import TemplateLookup from scrubber import Scrubber tlu = TemplateLookup(directories=['templates']) scrubber = Scrubber(autolink=True) curdir = os.path.join(os.getcwd(), os.path.dirname(__file__)) cherrypy.config.update({ 'tools.staticdir.root': curdir, 'server.environment': 'production' }) fixemph = re.compile('!!FIX!!') class RHSAGenWeb: @cherrypy.expose def index(self, snmp=False): return tlu.get_template("index.html").render(snmp=snmp)
class ScrubberTestCase(unittest.TestCase):
    """End-to-end scrubber tests: each (input, expected) pair is run
    through Scrubber.scrub() and the output compared exactly."""

    # (dirty_html, expected_clean_html) pairs.
    tests = (
        ( # Invalid HTML
            """<div notRealAttribute="value\n"onmouseover="\nexecuteMe();\n"foo="bar">\nI will execute here, too, if you mouse over me\n</div>""",
            "" if BeautifulSoup.__version__.startswith('3.1') else """<div>\nI will execute here, too, if you mouse over me\n</div>"""
        ),
        ( # Autolink
            """www.example.com<br>""",
            """<a href="http://www.example.com" rel="nofollow">www.example.com</a><br />"""
        ),
        ( # No autolinking of existing links
            """<a href="http://www.example.com">Example</a>""",
            """<a href="http://www.example.com" rel="nofollow" class="external">Example</a>"""
        ),
        ( # No enocoding of pre-encoded urls during autolink:
            """http://www.example.com/aaa%20bbb/test%20test.jpg<br/>""",
            """<a href="http://www.example.com/aaa%20bbb/test%20test.jpg" rel="nofollow">http://www.example.com/aaa%20bbb/test%20test.jpg</a><br />"""
        ),
        ( # Strip scripts
            """<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>""",
            """<div>safe description</div>""",
        ),
        ( # Remove target from links
            """<a href="www.google.com" target="_new">Google</a>""",
            """<a href="http://www.google.com" rel="nofollow" class="external">Google</a>"""
        ),
        ( # General cleaning (remove <br clear="all">, ...)
            """<br clear="all">""",
            """<br />"""
        ),
        ( # Converting b and i to strong and em
            """<b>strong</b> <i>em</i>""",
            """<strong>strong</strong> <em>em</em>"""
        ),
        ( # Encoded script (decimal)
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Encoded script (hex)
            """<span style="any: expression(window.location='http://example.org/')">safe</span>""",
            """<span>safe</span>"""
        ),
        ( # Test unicode
            u"""Mitä kuuluu""",
            u"""Mitä kuuluu"""
        ),
        ( # Test embed
            """<embed src='http://videomedia.ign.com/ev/ev.swf' flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type='application/x-shockwave-flash' width='433' height='360' ></embed>""",
            """<embed src="http://videomedia.ign.com/ev/ev.swf" flashvars='object_ID=949610&downloadURL=http://pspmovies.ign.com/psp/video/article/852/852594/patapon_021508_flvlowwide.flv&allownetworking="all"' type="application/x-shockwave-flash" width="433" height="360"></embed>"""
        ),
        ( # Test evil code
            """<img src=""http://www.a.com/a.jpg<script type=text/javascript src="http://1.2.3.4:81/xss.js">" /><<img src=""http://www.a.com/a.jpg</script>""",
            ""
        ),
        ( # Bad font tags
            """<font size=+0>test</font> <font>wowzers</font> <font></font> <font><p>foo</p><i>bar</i></font>""",
            """test wowzers <p>foo</p><em>bar</em>"""
        ),
        ( # Stripping empty attributed
            """<font style="">Foo</font> <span id="">Bar</span>""",
            """Foo <span>Bar</span>"""
        ),
        ( # a0 == nbsp
            u"""test\xa0www.this.com""",
            u"""test\xa0<a href="http://www.this.com" rel="nofollow">www.this.com</a>"""
        ),
        ( # Remove comments
            "Foo <!-- bar -->",
            "Foo "
        ),
        ( # Layered font tags
            """<div><font size=+0><font size=+0><a href="http://www.google.com">test</a></font><font>ing</font> 123</font> abc</div>""",
            """<div><a href="http://www.google.com" rel="nofollow" class="external">test</a>ing 123 abc</div>"""
        ),
        ( # Save contents of tags specified in 'disallowed_tags_save_content'
            "<blink>Foo</blink>",
            "Foo"
        ),
        ( # Character entities shouldn't get autolinked
            """http://www.google.com """,
            """<a href="http://www.google.com" rel="nofollow">http://www.google.com</a> """
        ),
        ( # Test unicode with autolinker
            u"""http://www.google.com/?q=mitä""",
            u"""<a href="http://www.google.com/?q=mit%C3%A4" rel="nofollow">http://www.google.com/?q=mit\xe4</a>""",
        ),
        ( # Test mailto: links
            """<a href="mailto:[email protected]">Mail Test</a>""",
            """<a href="mailto:[email protected]" rel="nofollow" class="external">Mail Test</a>"""
        ),
        ( # Test removing a node but keeping the contents
            """<html><head><title>Title</title></head><body><div><blink>Hello</blink> World!</blink></div></body></html>""",
            """<div>Hello World!</div>"""
        ),
        ( # Make keeping content for incomplete tags works
            "<blink><br><br>",
            "<br /><br />"
        ),
    )

    def setUp(self):
        # Fresh default-configured scrubber for every test method.
        self.scrubber = Scrubber()

    def testScrubber(self):
        for html, expected in self.tests:
            # assertEqual replaces the long-deprecated failUnlessEqual
            # alias (removed from unittest in Python 3.12).
            self.assertEqual(self.scrubber.scrub(html), expected)
import gzip
import os.path
from scrubber import Scrubber

# Scrubber threshold -- units unknown from here (presumably a distance
# in degrees or km); TODO confirm against the Scrubber implementation.
s = Scrubber(.025)

file = './all.tsv.gz'

# Make sure we've got the data file locally
if not os.path.exists(file):
    print 'Missing GPS data:', file
else:
    gps = gzip.open(file, 'r').readlines()
    # secondary j iterator used to prevent double testing / dropping of points
    j = 0
    for i in xrange(1, len(gps)):
        # NOTE(review): j is only re-synced to i after it has run past the
        # end of the list; on early iterations j lags i (j starts at 0, so
        # gps[j-1] is gps[-1], the LAST row) -- looks off-by-one; verify.
        j = i if j == len(gps) else j
        p1 = gps[j-1].strip().split('\t')
        p2 = gps[j].strip().split('\t')
        # Only compare points belonging to the same track -- column 0 is
        # presumably a track/route id; confirm against the TSV schema.
        if (p1[0] == p2[0]):
            # Columns 2 and 3 are read as lat/lon floats.
            keep = s.keep( [ float(p1[2]), float(p1[3]) ], [ float(p2[2]), float(p2[3])] )
            if not keep:
                # Skip past the dropped point so it isn't tested again.
                j += 1
                print 'Dropping point:', p2 #i, keep, p1, p2
        j += 1
import gzip
import os.path
from scrubber import Scrubber

# Scrubber threshold -- units unknown from here (presumably a distance
# in degrees or km); TODO confirm against the Scrubber implementation.
s = Scrubber(.025)

file = './all.tsv.gz'

# Make sure we've got the data file locally
if not os.path.exists(file):
    print 'Missing GPS data:', file
else:
    gps = gzip.open(file, 'r').readlines()
    # secondary j iterator used to prevent double testing / dropping of points
    j = 0
    for i in xrange(1, len(gps)):
        # NOTE(review): j is only re-synced to i after it has run past the
        # end of the list; on early iterations j lags i (j starts at 0, so
        # gps[j - 1] is gps[-1], the LAST row) -- looks off-by-one; verify.
        j = i if j == len(gps) else j
        p1 = gps[j - 1].strip().split('\t')
        p2 = gps[j].strip().split('\t')
        # Only compare points belonging to the same track -- column 0 is
        # presumably a track/route id; confirm against the TSV schema.
        if (p1[0] == p2[0]):
            # Columns 2 and 3 are read as lat/lon floats.
            keep = s.keep([float(p1[2]), float(p1[3])], [float(p2[2]), float(p2[3])])
            if not keep:
                # Skip past the dropped point so it isn't tested again.
                j += 1
                print 'Dropping point:', p2 #i, keep, p1, p2
        j += 1
from django import template
from django.utils.safestring import mark_safe
import markdown
from scrubber import Scrubber

register = template.Library()

# Scrubber's autolink doesn't handle ftp://
scrubber = Scrubber( autolink=False, nofollow=False)

# Markdown renderer: newline-to-<br> plus markdown's own autolinking
# (used instead of Scrubber's -- see note above).
md = markdown.Markdown(extensions=['nl2br', 'autolink'])


@register.filter(is_safe=True)
def safe_markdown(value, arg=''):
    # Render markdown to HTML, scrub the result, and mark it safe for
    # template output.  `arg` is the standard (unused here) filter
    # argument slot required by Django's filter calling convention.
    return mark_safe(scrubber.scrub(md.reset().convert(value)))