예제 #1
0
    def test_scrubadub_clean(self):
        """Exercise the legacy top-level ``scrubadub.clean`` API.

        The default call is checked first. Each ``replace_with='identifier'``
        call is then checked for its numbered placeholders and for at least
        one recorded ``DeprecationWarning``.
        """
        text = u"John is a cat"
        self.assertEqual(scrubadub.clean(text), "{{NAME}} is a cat")

        identifier_cases = (
            (text, "{{NAME-0}} is a cat"),
            ("John spoke with Doug.", "{{NAME-0}} spoke with {{NAME-1}}."),
        )
        for raw, expected in identifier_cases:
            # Swap in a fresh lookup before each case (presumably so that
            # identifier numbering restarts at 0 -- confirm against Lookup).
            scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup()
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                try:
                    cleaned = scrubadub.clean(raw, replace_with='identifier')
                    self.assertEqual(cleaned, expected)
                finally:
                    warnings.simplefilter("default")
                deprecations = [
                    entry for entry in caught
                    if issubclass(entry.category, DeprecationWarning)
                ]
                self.assertTrue(len(deprecations) > 0)

        # Leave a fresh lookup behind once the test is done.
        scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup()
예제 #2
0
 def test_top_level(self):
     """Check that ``scrubadub.clean`` honours its ``locale`` argument.

     Each case is ``(input text, locale, expected output)``. The same two
     phone numbers are cleaned under both ``en_GB`` and ``fr_FR``.
     """
     cases = (
         (
             "Localisation is important for phone numbers '06 87 49 77 56'",
             'en_GB',
             "Localisation is important for phone numbers '06 87 49 77 56'",
         ),
         (
             "Localisation is important for phone numbers '06 87 49 77 56'",
             'fr_FR',
             "Localisation is important for phone numbers '{{PHONE}}'",
         ),
         (
             "Localisation is important for phone numbers '(0121) 496 0852'",
             'en_GB',
             "Localisation is important for phone numbers '{{PHONE}}'",
         ),
         (
             "Localisation is important for phone numbers '(0121) 496 0852'",
             'fr_FR',
             "Localisation is important for phone numbers '(0121) 496 0852'",
         ),
     )
     for raw, locale_code, expected in cases:
         cleaned = scrubadub.clean(raw, locale=locale_code)
         self.assertEqual(cleaned, expected)
예제 #3
0
 def test_quickstart(self):
     """Test the example given in the quick start docs.

     The input holds one e-mail address and one phone number; both are
     expected to be replaced by their placeholders.
     """
     # The input must contain a well-formed e-mail address, otherwise the
     # expected {{EMAIL}} placeholder cannot be produced. The literal had
     # been replaced by the "[email protected]" obfuscation marker.
     text = "My cat can be contacted on example@example.com, or 1800 555-5555"
     self.assertEqual(
         'My cat can be contacted on {{EMAIL}}, or {{PHONE}}',
         scrubadub.clean(text),
     )
예제 #4
0
    def test_clean_on_database(self):
        '''
        Remove pii from records loaded from database.

        The ``comment_further_comments`` field of every ``Raw`` record is
        cleaned three ways: with the module-level ``clean``, with
        ``self.no_names_scrubber`` and with ``self.no_vehicles_scrubber``.
        The placeholders expected in the first four comments are checked
        after each pass.
        '''
        records = Raw.query.all()

        # assertIn / assertNotIn report both operands on failure, unlike
        # assertTrue(x in y), which only reports "False is not true".
        comments = [clean(i.comment_further_comments) for i in records]
        self.assertIn('{{NINO}}', comments[0])
        self.assertIn('{{PHONE+PASSPORT}}', comments[1])
        self.assertIn('{{VEHICLE+NAME}}', comments[2])
        self.assertIn('{{URL}}', comments[3])

        comments = [
            self.no_names_scrubber.clean(i.comment_further_comments)
            for i in records
        ]
        self.assertIn('{{NINO}}', comments[0])
        self.assertIn('{{PHONE+PASSPORT}}', comments[1])
        self.assertIn('{{VEHICLE}}', comments[2])
        self.assertNotIn('{{URL}}', comments[3])

        comments = [
            self.no_vehicles_scrubber.clean(i.comment_further_comments)
            for i in records
        ]
        self.assertIn('{{NINO}}', comments[0])
        self.assertIn('{{PHONE+PASSPORT}}', comments[1])
        self.assertNotIn('NAME', comments[2])
        self.assertNotIn('VEHICLE', comments[2])
        self.assertNotIn('{{VEHICLE+NAME}}', comments[2])
        self.assertNotIn('{{URL}}', comments[3])
예제 #5
0
 def transform_text(self):
     """Anonymise the contents of ``Scrolledtext1`` into ``Scrolledtext2``.

     When more than one character is read from the first widget, the
     ``scrubadub.clean`` output is appended to the second widget and an
     information dialog is shown. Otherwise a warning dialog is shown.
     """
     raw_text = self.Scrolledtext1.get('1.0', tk.END)

     # NOTE(review): the threshold is 1 rather than 0, presumably because
     # the widget's text always ends with a newline -- confirm.
     if len(raw_text) <= 1:
         messagebox.showwarning("Εισαγωγή δεδομένων",
                                "Δεν υπάρχουν δεδομένα για ανωνυμοποίηση!")
         return

     anonymised = scrubadub.clean(raw_text)
     self.Scrolledtext2.insert(tk.END, anonymised)
     messagebox.showinfo("Ανωνυμοποίηση δεδομένων",
                         "Η ανωνυμοποίηση ολοκληρώθηκε με επιτυχία!")
예제 #6
0
 def test_scrubber_clean(self):
     """Test older scrubber API.

     A ``Scrubber`` with its 'email' detector removed is created, but the
     assertion goes through the module-level ``scrubadub.clean``, which is
     still expected to replace the e-mail address.
     """
     scrubber = scrubadub.Scrubber()
     scrubber.remove_detector('email')
     # The input must contain a well-formed e-mail address, otherwise the
     # expected {{EMAIL}} placeholder cannot be produced. The literal had
     # been replaced by the "[email protected]" obfuscation marker.
     text = "contact Joe Duffy at joe@example.com"
     self.assertEqual(
         scrubadub.clean(text),
         "contact {{NAME}} {{NAME}} at {{EMAIL}}",
     )
예제 #7
0
def textCleaning(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The pipeline runs in order: HTML unescape, scrubadub placeholder
    removal, apostrophe expansion, non-ASCII removal, punctuation removal,
    splitting of attached capitalised words, number removal, stemming and
    English stop-word removal. Tokens of length 1 are dropped from the
    final result.

    Side effect: ``scrubadub.filth.base.Filth.prefix`` and ``.suffix`` are
    set to a single space so that placeholders come out as bare words.

    NOTE(review): steps 4 and 5 rely on Python 2 ``str`` semantics
    (``str.decode`` and two-argument ``str.translate``) -- confirm the
    target interpreter before porting.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, with tokens joined by single spaces.
    """
    # 1. Escape HTML characters
    text = html_parser.unescape(text)

    # 2. Remove name, url, email, phone, skype, ssn
    scrubadub.filth.base.Filth.prefix = u' '
    scrubadub.filth.base.Filth.suffix = u' '
    text = scrubadub.clean(text, replace_with='placeholder')
    scrub_placeholder_list = ["NAME", "URL", "EMAIL", "PHONE", "SKYPE", "SSN"]
    # Detach apostrophe suffixes ("it's" -> "it 's") so step 3 can map them.
    tokenized_words = text.replace("'", " '").split()
    placeholder_void_words = [
        word for word in tokenized_words if word not in scrub_placeholder_list
    ]

    # 3. Remove apostrophes
    APOSTROPHES = {
        "'s": " is",
        "'re": " are",
        "'m": "am",
        "'ve": "have",
        "'d": "would",
        "'t": "not",
        "'ll": "will",
        "'clock": "clock"
    }
    appostophes_removed = [
        APOSTROPHES.get(word, word) for word in placeholder_void_words
    ]

    text = ' '.join(appostophes_removed)

    # 4. Remove non - ASCII characters
    text = text.decode("utf8").encode('ascii', 'ignore')

    # 5. Remove punctuation
    text = text.translate(None, string.punctuation)

    # 6. Split Attached Words
    # The second alternative keeps any text before the first capital
    # letter; the capital-only pattern silently discarded it.
    text = " ".join(re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', text))

    # 7. Remove numbers
    number_removed = [i for i in text.split() if not i.isdigit()]

    # 8. Stemming
    text_tokens = [stemmer.stem(item.lower()) for item in number_removed]

    # 9. Stop word removal
    # Fetch the stop-word list once and use a set for membership tests
    # instead of re-fetching the list for every token.
    english_stop_words = set(stopwords.words('english'))
    stop_words_removed = [
        word for word in text_tokens if word not in english_stop_words
    ]

    # 10. Join cleaned text
    text = ' '.join([w for w in stop_words_removed if len(w) > 1])
    return text
예제 #8
0
    def _scrubabdub(self, value: Any) -> str:
        """
        Run the Scrubadub clean method over the string form of a value.

        The value is converted with ``str`` first. If that conversion
        raises a ValueError, the value is handed back unchanged.

        Args:
            value: Incoming value from Pandas Series

        Returns:
            String from Scrubadub clean method
        """
        try:
            text = str(value)
        except ValueError:
            return value
        else:
            return scrubadub.clean(text)
예제 #9
0
    return bucket_name, source_file, dest_file


# Download the input object from S3, scrub it with scrubadub, and upload the
# cleaned text back to the same bucket under the destination key.
if args.input is None:
    print("Error Args are None")
    exit(1)
else:
    mybucket, infile, outfile = get_bucket_and_file(args.input)
    s3 = boto3.resource('s3')
    try:
        s3.meta.client.download_file(mybucket, infile, temp_in_file)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            print('Error getting file {}, {}, {} {}'.format(
                mybucket, infile, temp_in_file, e))
        # Nothing was downloaded, so stop here. Carrying on would either
        # fail on open() or scrub and upload a stale temp file.
        exit(1)

    # Context managers close both files even if reading or writing fails.
    with open(temp_in_file, "r") as fh:
        mytext = fh.read()
    clean = scrubadub.clean(mytext)
    print('Cleaned text')
    with open(temp_out_file, 'w') as fh2:
        fh2.write(clean)
    print("written clean to {}".format(temp_out_file))

    s3.meta.client.upload_file(temp_out_file, mybucket, outfile)
    print('file uploaded to {} - {}'.format(mybucket, outfile))
"""This is the basic usage of the scrubadub module. It exposes three different
methods for obfuscating personally identifiable information and uses high
recall methods for identifying filth. Precision can be improved by further
customization.
"""

import scrubadub

# This should have very smart defaults, with high recall and relatively low
# precision. The placeholder method is the default and uses {{}} notation to
# signify where text has been obfuscated.
# NOTE(review): `text` is not defined in this snippet -- presumably supplied
# by the surrounding context; confirm.
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")

# The surrogate replacement method makes it easy to replace, for example,
# phone numbers with fake phone numbers, which keeps the content easy to read.
clean_text = scrubadub.clean(text, replace_with="surrogate")

# The identifier replacement method replaces the personal information
# associated with each person in `lookup` with the same unique id, making it
# easy to detect the same person across document records.
# NOTE(review): `lookup` is not defined in this snippet -- confirm where it
# comes from.
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
예제 #11
0
def clean_text(text):
    """Scrub *text* with scrubadub and mask ``@``-prefixed tokens.

    Every ``@`` followed by any run of non-whitespace characters becomes
    ``{{USERNAME}}``. The result is then encoded as Latin-1 and decoded as
    UTF-8, ignoring errors in both directions.
    """
    scrubbed = scrubadub.clean(text)
    masked = re.sub(r'@\S*', "{{USERNAME}}", scrubbed)
    latin1_bytes = masked.encode('latin1', 'ignore')
    return latin1_bytes.decode('utf8', 'ignore')
예제 #12
0
 def clean(self, text, **kwargs):
     """Forward to ``scrubadub.clean`` and return its result.

     When the caller passes ``replace_with``, the shared ``Filth.lookup``
     is replaced with a new ``Lookup`` before cleaning.
     """
     wants_replacement = 'replace_with' in kwargs
     if wants_replacement:
         scrubadub.filth.base.Filth.lookup = scrubadub.utils.Lookup()
     cleaned = scrubadub.clean(text, **kwargs)
     return cleaned
예제 #13
0
파일: base.py 프로젝트: jb08/scrubadub
 def clean(self, text):
     """Return *text* as cleaned by the top-level ``scrubadub.clean``."""
     cleaned = scrubadub.clean(text)
     return cleaned
예제 #14
0
 def clean(self, text, **kwargs):
     """Clean *text* through ``scrubadub.clean``, passing *kwargs* along.

     If ``replace_with`` is among the keyword arguments, ``Filth.lookup``
     is first replaced with a new ``Lookup`` instance.
     """
     if kwargs.get('replace_with', kwargs) is not kwargs:
         # The key is present (whatever its value): install a new lookup.
         scrubadub.filth.base.Filth.lookup = scrubadub.utils.Lookup()
     result = scrubadub.clean(text, **kwargs)
     return result
예제 #15
0
 def test_clean(self):
     """Test the top level clean api.

     The message contains a single e-mail address, which is expected to be
     replaced by the ``{{EMAIL}}`` placeholder.
     """
     # The input must contain a well-formed e-mail address, otherwise the
     # expected {{EMAIL}} placeholder cannot be produced. The literal had
     # been replaced by the "[email protected]" obfuscation marker.
     self.assertEqual(
         scrubadub.clean("This is a test message for example@example.com"),
         "This is a test message for {{EMAIL}}",
     )
예제 #16
0
 def test_bad_locale(self):
     """Check that an unknown locale makes ``scrubadub.clean`` raise ValueError."""
     self.assertRaises(
         ValueError,
         scrubadub.clean,
         "Localisation is important for phone numbers '(0121) 496 0852'",
         locale='non_existant',
     )
예제 #17
0
    def test_clean_works_with_no_pii(self):
        """Check that text without pii comes back from ``clean`` unchanged."""
        no_pii = 'This string does not contain pii'
        cleaned = clean(no_pii)
        self.assertEqual(cleaned, no_pii)
예제 #18
0
def _mask_mentions(text):
    """Return *text* with every word that starts with '@' masked.

    The text is split on whitespace. Each word beginning with '@' becomes
    ``{{USERNAME}}``, and every word, masked or not, is followed by a
    single space. ``None`` or empty input yields an empty string.
    """
    # NOTE(review): None is treated as empty text on the assumption that a
    # post can have no caption -- confirm against the instaloader API.
    return ''.join(
        ('{{USERNAME}}' if word[0] == '@' else word) + ' '
        for word in (text or '').split()
    )


# Download every post selected by the SINCE / UNTIL date filters below and
# collect one row of scrubbed metadata per post.
print('Parsing {0}\'s media...'.format(user_name), flush=True)
for post in takewhile(lambda p: p.date > UNTIL, dropwhile(lambda p: p.date > SINCE, posts)):
    media_dest = os.path.join(media_root, str(post_count))
    L.download_pic(media_dest, post.url, post.date, filename_suffix=None)
    post_count += 1

    likes = post.likes
    # '%#I' (Windows) and '%-I' (elsewhere) are the platform-specific ways
    # to format the hour without a leading zero.
    time = post.date_local.strftime("%#I:%M %p") if platform.system() == 'Windows' else post.date_local.strftime("%-I:%M %p")
    date = post.date_local.date()

    # Mask @mentions first, then let scrubadub remove the remaining PII.
    caption = scrubadub.clean(_mask_mentions(post.caption))
    # Each comment becomes '"<scrubbed text>", '; index 2 of a comment is
    # used as its text.
    comments = ''.join(
        '"' + scrubadub.clean(_mask_mentions(comment[2])) + '", '
        for comment in post.get_comments()
    )

    entry = [date, time, media_dest, caption, likes, comments]
    posts_parsed.append(entry)

print('Scrubbing {0}\'s media...'.format(user_name), flush=True)
for filename in os.listdir(media_root):
예제 #19
0
            # Progress indicator: end='\r' returns the cursor to the start
            # of the line so the next update overwrites this one.
            print('Parsing {0} of {1} posts...'.format(post_counter,
                                                       len(posts)),
                  end='\r',
                  flush=True)
            post_counter += 1
            # Extract post details, skipping posts older than 183 days.
            if datetime.fromtimestamp(
                    post['timestamp']) < datetime.now() - timedelta(days=183):
                continue
            # The age check above compares naive local datetimes; the
            # timestamp used for output below is timezone-aware UTC.
            timestamp = datetime.fromtimestamp(post['timestamp'], timezone.utc)
            post_date = timestamp.date()
            # '%#I' (Windows) and '%-I' (elsewhere) format the hour without
            # a leading zero.
            post_time = timestamp.strftime("%#I:%M %p") if platform.system(
            ) == 'Windows' else timestamp.strftime("%-I:%M %p")
            # NOTE(review): if 'data' is present but its first item has no
            # 'post' key, or if neither 'data' nor 'title' is present,
            # `caption` is not assigned here and keeps whatever value it
            # had before -- confirm this is intended.
            if 'data' in post:
                if 'post' in post['data'][0]:
                    caption = scrubadub.clean(post['data'][0]['post'])
            elif 'title' in post:
                caption = scrubadub.clean(post['title'])

            if 'attachments' in post:
                # Only the first entry of post['attachments'] is inspected.
                attachments = post['attachments'][0]['data']
                for attachment in attachments:
                    if 'media' in attachment:
                        content = attachment['media']
                        media = content['uri']
                    elif 'external_context' in attachment:
                        content = attachment['external_context']
                        caption += ': ' + content['url']
                        media = ''
                    # NOTE(review): `content` is only assigned in the two
                    # branches above; an attachment with neither key reuses
                    # the value from an earlier iteration -- confirm.
                    # A description, when present, replaces the caption
                    # built so far.
                    if 'description' in content:
                        caption = scrubadub.clean(content['description'])
예제 #20
0
import os
import re
import scrubadub

# Scrub every regular file directly inside dirPath and write one TSV row per
# file to cleanFilePath: the label '1', a tab, then the scrubbed body with
# every run of whitespace collapsed to a single space.
dirPath = "batch10"
cleanFilePath = "batch10/clean/batch10.tsv"

# The context manager closes and flushes the output file even if reading or
# scrubbing one of the inputs raises.
with open(cleanFilePath, 'w') as cleanFile:
    for name in os.listdir(dirPath):
        fileName = dirPath + '/' + name
        # Directories (such as the one holding cleanFilePath) are skipped.
        if os.path.isfile(fileName):
            with open(fileName) as file:
                body = file.read()
            cleanBody = scrubadub.clean(body)
            # Raw string: '\s' in a plain literal is an invalid escape.
            cleanFile.write('1' + '\t' + re.sub(r'\s+', ' ', cleanBody) + '\n')