def test_scrubadub_clean(self):
    """Exercise the deprecated module-level ``scrubadub.clean`` API."""

    def _reset_lookup():
        # The old identifier-replacement API keeps numbering state on the
        # Filth class; install a fresh Lookup so each check starts at zero.
        scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup()

    def _assert_identifier_clean_deprecated(message, expected):
        # replace_with='identifier' is the legacy path: it must still
        # produce the expected output AND emit a DeprecationWarning.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            try:
                self.assertEqual(
                    scrubadub.clean(message, replace_with='identifier'),
                    expected,
                )
            finally:
                warnings.simplefilter("default")
        self.assertTrue(
            sum(issubclass(w.category, DeprecationWarning) for w in caught) > 0
        )

    # Default placeholder behaviour of the old API.
    self.assertEqual(scrubadub.clean(u"John is a cat"), "{{NAME}} is a cat")

    _reset_lookup()
    _assert_identifier_clean_deprecated(u"John is a cat", "{{NAME-0}} is a cat")
    _reset_lookup()
    _assert_identifier_clean_deprecated(
        "John spoke with Doug.",
        "{{NAME-0}} spoke with {{NAME-1}}.",
    )
    _reset_lookup()
def test_top_level(self):
    """Test that locales work at the top level.

    A French phone format is filth only under fr_FR, and a UK format
    only under en_GB; the other combination passes through untouched.
    """
    cases = [
        ("Localisation is important for phone numbers '06 87 49 77 56'",
         'en_GB',
         "Localisation is important for phone numbers '06 87 49 77 56'"),
        ("Localisation is important for phone numbers '06 87 49 77 56'",
         'fr_FR',
         "Localisation is important for phone numbers '{{PHONE}}'"),
        ("Localisation is important for phone numbers '(0121) 496 0852'",
         'en_GB',
         "Localisation is important for phone numbers '{{PHONE}}'"),
        ("Localisation is important for phone numbers '(0121) 496 0852'",
         'fr_FR',
         "Localisation is important for phone numbers '(0121) 496 0852'"),
    ]
    for message, locale, expected in cases:
        self.assertEqual(scrubadub.clean(message, locale=locale), expected)
def test_quickstart(self):
    """Test the example given in the quick start docs."""
    original = "My cat can be contacted on [email protected], or 1800 555-5555"
    expected = 'My cat can be contacted on {{EMAIL}}, or {{PHONE}}'
    self.assertEqual(expected, scrubadub.clean(original))
def test_clean_on_database(self):
    """Remove pii from records loaded from database."""
    records = Raw.query.all()

    def scrubbed_with(scrub_fn):
        # Run every record's free-text comment through one scrubber.
        return [scrub_fn(r.comment_further_comments) for r in records]

    # Default scrubber: everything is detected.
    defaults = scrubbed_with(clean)
    self.assertTrue('{{NINO}}' in defaults[0])
    self.assertTrue('{{PHONE+PASSPORT}}' in defaults[1])
    self.assertTrue('{{VEHICLE+NAME}}' in defaults[2])
    self.assertTrue('{{URL}}' in defaults[3])

    # Scrubber with name (and url) detection disabled.
    no_names = scrubbed_with(self.no_names_scrubber.clean)
    self.assertTrue('{{NINO}}' in no_names[0])
    self.assertTrue('{{PHONE+PASSPORT}}' in no_names[1])
    self.assertTrue('{{VEHICLE}}' in no_names[2])
    self.assertTrue('{{URL}}' not in no_names[3])

    # Scrubber with vehicle detection disabled as well.
    no_vehicles = scrubbed_with(self.no_vehicles_scrubber.clean)
    self.assertTrue('{{NINO}}' in no_vehicles[0])
    self.assertTrue('{{PHONE+PASSPORT}}' in no_vehicles[1])
    self.assertTrue('NAME' not in no_vehicles[2])
    self.assertTrue('VEHICLE' not in no_vehicles[2])
    self.assertTrue('{{VEHICLE+NAME}}' not in no_vehicles[2])
    self.assertTrue('{{URL}}' not in no_vehicles[3])
def transform_text(self):
    """Anonymise the text in the input widget into the output widget.

    Shows a success dialog when text was processed, or a warning dialog
    when the input box is empty (tk's END always includes one trailing
    newline, hence the length-1 threshold).
    """
    source_text = self.Scrolledtext1.get('1.0', tk.END)
    if len(source_text) <= 1:
        # Nothing but the implicit trailing newline — warn and bail out.
        messagebox.showwarning("Εισαγωγή δεδομένων", "Δεν υπάρχουν δεδομένα για ανωνυμοποίηση!")
        return
    self.Scrolledtext2.insert(tk.END, scrubadub.clean(source_text))
    messagebox.showinfo("Ανωνυμοποίηση δεδομένων", "Η ανωνυμοποίηση ολοκληρώθηκε με επιτυχία!")
def test_scrubber_clean(self):
    """test older scrubber API

    NOTE(review): the scrubber instance below is configured but never
    invoked — the assertion goes through the module-level
    scrubadub.clean, which is why {{EMAIL}} still appears despite
    remove_detector('email'). Presumably this only checks that the old
    construction API works; confirm before "fixing".
    """
    scrubber = scrubadub.Scrubber()
    scrubber.remove_detector('email')
    message = "contact Joe Duffy at [email protected]"
    self.assertEqual(
        scrubadub.clean(message),
        "contact {{NAME}} {{NAME}} at {{EMAIL}}",
    )
def textCleaning(text):
    """Normalise a raw free-text string for NLP feature extraction.

    Pipeline (order matters): HTML-unescape, PII scrubbing, placeholder
    removal, apostrophe expansion, ASCII filtering, punctuation stripping,
    capitalised-run splitting, digit removal, stemming, stop-word removal.

    NOTE(review): this is Python 2 code — ``text.decode("utf8")`` on a str
    and ``text.translate(None, string.punctuation)`` both fail on Python 3.
    """
    # 1. Escape HTML characters
    text = html_parser.unescape(text)
    # 2. Remove name, url, email, phone, skype, ssn
    # Pad each filth placeholder with spaces so it tokenises as its own word.
    scrubadub.filth.base.Filth.prefix = u' '
    scrubadub.filth.base.Filth.suffix = u' '
    text = scrubadub.clean(text, replace_with='placeholder')
    scrub_placeholder_list = ["NAME", "URL", "EMAIL", "PHONE", "SKYPE", "SSN"]
    # Split contractions ("don't" -> "don 't") before whitespace tokenising,
    # then drop the placeholder tokens inserted by scrubadub above.
    tokenized_words = text.replace("'", " '").split()
    placeholder_void_words = [
        word for word in tokenized_words if word not in scrub_placeholder_list
    ]
    # 3. Remove apostrophes
    # NOTE(review): "'m" maps to "am" with no leading space, unlike the other
    # expansions — harmless here since tokens are space-joined below, but
    # confirm it is intentional.
    APOSTROPHES = {
        "'s": " is", "'re": " are", "'m": "am", "'ve": "have",
        "'d": "would", "'t": "not", "'ll": "will", "'clock": "clock"
    }
    appostophes_removed = [
        APOSTROPHES[word] if word in APOSTROPHES else word
        for word in placeholder_void_words
    ]
    text = ' '.join(appostophes_removed)
    # 4. Remove non - ASCII characters
    text = text.decode("utf8").encode('ascii', 'ignore')
    # 5. Remove punctuation
    text = text.translate(None, string.punctuation)
    # 6. Split Attached Words
    # Keeps only runs starting at a capital letter, so any text before the
    # first capital is discarded here.
    text = " ".join(re.findall('[A-Z][^A-Z]*', text))
    # 7. Remove numbers
    number_removed = [i for i in text.split() if not i.isdigit()]
    # 8. Stemming
    text_tokens = [stemmer.stem(item.lower()) for item in number_removed]
    # 9. Stop word removal
    stop_words_removed = [
        word for word in text_tokens if word not in stopwords.words('english')
    ]
    # 10. Join cleaned text (single-character tokens are dropped)
    text = ' '.join([w for w in stop_words_removed if len(w) > 1])
    return text
def _scrubabdub(self, value: Any) -> str:
    """
    Take a given value, cast to string, and apply the Scrubadub clean
    method.

    Returns original value if cast to string raises a ValueError

    Args:
        value: Incoming value from Pandas Series

    Returns:
        String from Scrubadub clean method
    """
    try:
        text = str(value)
    except ValueError:
        # Could not be stringified — hand back the untouched value.
        return value
    else:
        return scrubadub.clean(text)
# Tail of get_bucket_and_file(...) — the function's def line and body are
# outside this chunk.
return bucket_name, source_file, dest_file

if args.input is None:
    print("Error Args are None")
    exit(1)
else:
    # Split the s3://bucket/key style argument into its parts.
    mybucket, infile, outfile = get_bucket_and_file(args.input)

s3 = boto3.resource('s3')
try:
    s3.meta.client.download_file(mybucket, infile, temp_in_file)
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        print('Error getting file {}, {}, {} {}'.format(
            mybucket, infile, temp_in_file, e))

# NOTE(review): a failed download only prints — the open() below will then
# raise on a missing temp_in_file; confirm this best-effort flow is intended.
fh = open(temp_in_file, "r")
mytext = fh.read()
# Scrub PII from the downloaded text before re-uploading it.
clean = scrubadub.clean(mytext)
print('Cleaned text')
# print(clean)
fh.close()
fh2 = open(temp_out_file, 'w')
fh2.write(clean)
fh2.close()
print("written clean to {}".format(temp_out_file))
s3.meta.client.upload_file(temp_out_file, mybucket, outfile)
print('file uploaded to {} - {}'.format(mybucket, outfile))
"""This is the basic usage of the scrubadub module. It exposes three different methods for obfuscating personally identifiable information and uses high recall methods for identifying filth. Precision can be improved by further customization. """ import scrubadub # this should have very smart defaults, with high recall and relatively low # precision. the placeholder method is default and uses {{}} notation to # signify when text has been obfuscated clean_text = scrubadub.clean(text) clean_text = scrubadub.clean(text, replace_with="placeholder") # the surrogate replacement method makes it easy to replace phone numbers with # fake phone numbers, for example. this makes it easy to read the content clean_text = scrubadub.clean(text, replace_with="surrogate") # the identifier replacement method replaces the personal information # associated with each person in lookup with the same unique id to make it easy # to detect the same person across document records. clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
def clean_text(text):
    """Scrub PII, mask @-handles, and strip non-latin1 representable text.

    The @\\S* handles are replaced after scrubbing because scrubadub does
    not treat them as filth.
    """
    scrubbed = scrubadub.clean(text)
    masked = re.sub(r'@\S*', "{{USERNAME}}", scrubbed)
    # latin1-encode / utf8-decode round trip, preserved from the original —
    # it silently drops characters the two encodings disagree on.
    return masked.encode('latin1', 'ignore').decode('utf8', 'ignore')
def clean(self, text, **kwargs):
    """Scrub *text*, resetting scrubadub's shared Lookup first whenever a
    replace_with strategy is requested, so identifier numbering restarts
    at zero on every call instead of accumulating across calls."""
    wants_replacement = 'replace_with' in kwargs
    if wants_replacement:
        scrubadub.filth.base.Filth.lookup = scrubadub.utils.Lookup()
    return scrubadub.clean(text, **kwargs)
def clean(self, text):
    """Thin wrapper: scrub *text* with scrubadub's default settings."""
    return scrubadub.clean(text)
def clean(self, text, **kwargs):
    """Delegate to ``scrubadub.clean``.

    When the caller asks for any replacement strategy, a fresh Lookup is
    installed on the Filth class first — the identifier strategy numbers
    filth via this class-level table, so clearing it makes ids start
    from zero for this call.
    """
    if 'replace_with' in kwargs:
        scrubadub.filth.base.Filth.lookup = scrubadub.utils.Lookup()
    return scrubadub.clean(text, **kwargs)
def test_clean(self):
    """Test the top level clean api."""
    scrubbed = scrubadub.clean("This is a test message for [email protected]")
    self.assertEqual(scrubbed, "This is a test message for {{EMAIL}}")
def test_bad_locale(self):
    """An unknown locale string makes clean() raise ValueError."""
    with self.assertRaises(ValueError):
        scrubadub.clean(
            "Localisation is important for phone numbers '(0121) 496 0852'",
            # deliberately not a valid locale identifier
            locale='non_existant',
        )
def test_clean_works_with_no_pii(self):
    """Text without personal data passes through clean() unchanged."""
    message = 'This string does not contain pii'
    self.assertEqual(clean(message), message)
print('Parsing {0}\'s media...'.format(user_name), flush=True) for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, posts)): media_dest = os.path.join(media_root, str(post_count)) L.download_pic(media_dest, post.url, post.date, filename_suffix=None) post_count += 1 likes = post.likes time = post.date_local.strftime("%#I:%M %p") if platform.system() == 'Windows' else post.date_local.strftime("%-I:%M %p") date = post.date_local.date() unrem = '' for word in post.caption.split(): if word[0] is '@': unrem += '{{USERNAME}} ' else: unrem += word + ' ' caption = scrubadub.clean(unrem) comments = '' for comment in post.get_comments(): unrem = '' for word in comment[2].split(): if word[0] is '@': unrem += '{{USERNAME}} ' else: unrem += word + ' ' comments += '"' + scrubadub.clean(unrem) + '", ' entry = [date, time, media_dest, caption, likes, comments] posts_parsed.append(entry) print('Scrubbing {0}\'s media...'.format(user_name), flush=True) for filename in os.listdir(media_root):
# Progress indicator; '\r' keeps it on one terminal line.
print('Parsing {0} of {1} posts...'.format(post_counter, len(posts)),
      end='\r', flush=True)
post_counter += 1
# Extract comment details
# Skip anything older than roughly six months (183 days).
# NOTE(review): this fromtimestamp() is naive local time while the one
# below is UTC-aware — confirm the mismatch is intended.
if datetime.fromtimestamp(
        post['timestamp']) < datetime.now() - timedelta(days=183):
    continue
timestamp = datetime.fromtimestamp(post['timestamp'], timezone.utc)
post_date = timestamp.date()
# %#I is the Windows spelling of %-I (hour without leading zero).
post_time = timestamp.strftime("%#I:%M %p") if platform.system(
) == 'Windows' else timestamp.strftime("%-I:%M %p")
# PII-scrub whichever text field this post carries.
# NOTE(review): if neither branch matches, `caption` is never assigned and
# the `caption +=` below raises NameError; `media` can likewise stay unset.
# Confirm upstream guarantees one of these keys is always present.
if 'data' in post:
    if 'post' in post['data'][0]:
        caption = scrubadub.clean(post['data'][0]['post'])
elif 'title' in post:
    caption = scrubadub.clean(post['title'])
if 'attachments' in post:
    attachments = post['attachments'][0]['data']
    for attachment in attachments:
        if 'media' in attachment:
            content = attachment['media']
            media = content['uri']
        elif 'external_context' in attachment:
            content = attachment['external_context']
            caption += ': ' + content['url']
            media = ''
        # NOTE(review): indentation reconstructed from a collapsed source —
        # this check may belong inside the elif branch above; as written it
        # runs for either attachment type.
        if 'description' in content:
            caption = scrubadub.clean(content['description'])
import os
import re

import scrubadub

# Source directory of raw text files and destination TSV of scrubbed rows.
dirPath = "batch10"
cleanFilePath = "batch10/clean/batch10.tsv"

# Fixes: output file managed by a with-block (was closed manually, leaking
# on any exception); input handle no longer shadows a builtin name; the
# whitespace regex is a raw string (non-raw '\s' is an invalid-escape
# warning on modern Python).
with open(cleanFilePath, 'w') as cleanFile:
    for name in os.listdir(dirPath):
        fileName = os.path.join(dirPath, name)
        if os.path.isfile(fileName):
            with open(fileName) as infile:
                body = infile.read()
            # Scrub PII, then collapse all whitespace runs so each input
            # file becomes one single-line, label-prefixed TSV row.
            cleanBody = scrubadub.clean(body)
            cleanFile.write('1' + '\t' + re.sub(r'\s+', ' ', cleanBody) + '\n')