Exemplo n.º 1
0
class PiiAnalyzer(object):
    """Extract personally identifiable information (PII) from a CSV file.

    Combines regex extraction via ``CommonRegex`` (emails, phone numbers,
    street addresses, credit cards, IPs) with Stanford NER tagging
    (people, locations, organizations).
    """

    def __init__(self, filepath):
        # Path of the CSV file to analyze.
        self.filepath = filepath
        self.parser = CommonRegex()
        # NOTE: attribute keeps the original "standford" spelling for
        # backward compatibility with existing callers.
        self.standford_ner = StanfordNERTagger(
            'classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        """Scan the CSV file and collect PII matches by category.

        Returns:
            dict: keys ``people``, ``locations``, ``organizations``,
            ``emails``, ``phone_numbers``, ``street_addresses``,
            ``credit_cards`` and ``ips``, each mapped to a list of
            matches (possibly empty).
        """
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        # BUG FIX: 'rU' mode was removed in Python 3.11 (raises
        # ValueError). The csv module docs recommend newline='' instead.
        with open(self.filepath, 'r', newline='') as filedata:
            reader = csv.reader(filedata)

            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    # Strip all whitespace so numbers broken up by
                    # spaces still match the phone regex.
                    phone_numbers.extend(
                        self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))

        # De-duplicate cell values before NER tagging to avoid tagging
        # the same string twice. A token has exactly one tag, so the
        # branches are mutually exclusive.
        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            elif tag == 'LOCATION':
                locations.append(title)
            elif tag == 'ORGANIZATION':
                organizations.append(title)

        return {
            'people': people,
            'locations': locations,
            'organizations': organizations,
            'emails': emails,
            'phone_numbers': phone_numbers,
            'street_addresses': street_addresses,
            'credit_cards': credit_cards,
            'ips': ips
        }
Exemplo n.º 2
0
class PiiAnalyzer(object):
    """Extract PII from a CSV file using CommonRegex plus Stanford NER."""

    def __init__(self, filepath):
        # CSV file to scan.
        self.filepath = filepath
        self.parser = CommonRegex()
        # Attribute name preserves the original "standford" typo so
        # external callers keep working.
        self.standford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        """Return a dict of PII lists: people, locations, organizations,
        emails, phone_numbers, street_addresses, credit_cards, ips."""
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        # BUG FIX: the 'U' in 'rU' was removed in Python 3.11 and raises
        # ValueError; open with newline='' as the csv docs recommend.
        with open(self.filepath, 'r', newline='') as filedata:
            reader = csv.reader(filedata)

            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    # Remove whitespace so split-up numbers still match.
                    phone_numbers.extend(self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))

        # Tag unique values only; one tag per token, so use elif.
        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            elif tag == 'LOCATION':
                locations.append(title)
            elif tag == 'ORGANIZATION':
                organizations.append(title)

        return {'people': people, 'locations': locations, 'organizations': organizations,
                'emails': emails, 'phone_numbers': phone_numbers, 'street_addresses': street_addresses,
                'credit_cards': credit_cards, 'ips': ips
                }
Exemplo n.º 3
0
def extractEntries(zipCode, page):
    """Scrape directory entries from the current browser page.

    Relies on module-level globals: ``driver`` (Selenium WebDriver on the
    target page), ``dedupeKeys`` (set of previously seen name+email keys)
    and ``extracted`` (accumulator list the new records are appended to).

    Args:
        zipCode: zip code being scraped (stored on each record, logged).
        page: page number (logging only).

    Returns:
        bool: True if at least one new entry was added to ``extracted``.
    """

    def _first(attr, keep=None):
        # CommonRegex exposes results either as callables (method API) or
        # as precomputed lists (property API); handle both. Returns the
        # first (optionally filtered) match, or None.
        values = attr() if hasattr(attr, '__call__') else attr
        if keep is not None:
            values = [v for v in values if keep(v)]
        return values[0] if values else None

    def _is_site(link):
        # BUG FIX: the original tested `'hotmail' not in 'link'` against
        # the *literal* string 'link' (always true), so hotmail URLs were
        # never filtered. Skip personal-mail domains.
        return ('gmail' not in link and 'yahoo' not in link
                and 'hotmail' not in link)

    entries = driver.find_elements_by_css_selector("td")
    countNewEntries = 0
    countDupes = 0
    countElements = 0
    pageDupes = set()

    for entry in entries:
        # Only cells with exactly one linked, bold name are entries.
        # (The original queried this same selector twice.)
        nameElements = entry.find_elements_by_css_selector("div b a")
        if len(nameElements) != 1:
            continue

        countElements += 1

        name = phone = address = email = url = None

        possibleNames = [elem.text for elem in nameElements if elem.text]
        if possibleNames:
            name = possibleNames[0]

        # Parse line by line; for each field, a later line with a match
        # overwrites an earlier one (same as the original behavior).
        for line in entry.text.splitlines():
            parsed_text = CommonRegex(line)
            url = _first(parsed_text.links, _is_site) or url
            email = _first(parsed_text.emails) or email
            phone = _first(parsed_text.phones) or phone
            address = _first(parsed_text.street_addresses) or address

        dataText = entry.text.replace("\n", " --- ")

        if name or phone or email:
            data = {
                "name": name,
                "phone": phone,
                "address": address,
                "email": email,
                "zip": zipCode,
                "url": url,
                "data": dataText
            }

            dedupeKey = str(data['name']) + str(data['email'])

            if dedupeKey not in dedupeKeys:
                countNewEntries += 1
                extracted.append(data)
                dedupeKeys.add(dedupeKey)
            elif dedupeKey not in pageDupes:
                # Count each duplicate key at most once per page.
                countDupes += 1

            pageDupes.add(dedupeKey)

    print(f"    {zipCode}@{page}: Added {countNewEntries} new entries. Had {countDupes} dupes. Examined {countElements} elements")

    return countNewEntries > 0
Exemplo n.º 4
0
# NOTE(review): this fragment assumes `parser`, `standford_ner`, `people`
# and `organizations` are defined earlier — not visible in this snippet.
locations = []
emails = []
phone_numbers = []
street_addresses = []
credit_cards = []
ips = []
data = []

with open('sample-data.csv', 'r') as filedata:
    reader = csv.reader(filedata)
    for row in reader:
        data.extend(row)
    # NOTE(review): this loop sits OUTSIDE the `for row in reader` loop,
    # so only the LAST row is scanned for regex-based PII — this looks
    # like an indentation bug; indenting it one level would scan every
    # row (compare the class-based versions of this same code).
    for text in row:
        emails.extend(parser.emails(text))
        # Whitespace is stripped so split-up numbers still match.
        phone_numbers.extend(parser.phones("".join(text.split())))
        street_addresses.extend(parser.street_addresses(text))
        credit_cards.extend(parser.credit_cards(text))
        ips.extend(parser.ips(text))

# Tag de-duplicated cell values with Stanford NER and bucket by entity type.
for title, tag in standford_ner.tag(set(data)):
    if tag == 'PERSON':
        people.append(title)
    if tag == 'LOCATION':
        locations.append(title)
    if tag == 'ORGANIZATION':
        organizations.append(title)

# NOTE(review): the dict literal below is truncated in this snippet
# (never closed) — the remaining keys/closing brace are not visible here.
sa = {
    'people': people,
    'locations': locations,
    'organizations': organizations,
Exemplo n.º 5
0
class PiiAnalyzer(object):
    """Extract selected categories of PII from a CSV file.

    Regex categories come from ``CommonRegex``; entity categories
    (people, locations, organizations) come from Stanford NER.
    """

    def __init__(self, filepath):
        # CSV file to scan.
        self.filepath = filepath
        self.parser = CommonRegex()
        # "standford" typo kept in the attribute name for compatibility.
        self.standford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self, **kwargs):
        """Scan the CSV file and return only the requested PII categories.

        Keyword Args:
            people, organizations, locations, emails, phone_numbers,
            street_addresses, credit_cards, ips (bool): pass True to
            request that category. With no keyword arguments at all,
            every category is collected and returned.

        Returns:
            dict: one key per requested category, each mapped to a list
            of matches (possibly empty).
        """
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        # No kwargs at all means "collect everything".
        get_all = len(kwargs) == 0
        get_people = get_all or kwargs.get('people', False)
        get_organizations = get_all or kwargs.get('organizations', False)
        get_locations = get_all or kwargs.get('locations', False)
        get_emails = get_all or kwargs.get('emails', False)
        get_phone_numbers = get_all or kwargs.get('phone_numbers', False)
        get_street_addresses = get_all or kwargs.get('street_addresses', False)
        get_credit_cards = get_all or kwargs.get('credit_cards', False)
        get_ips = get_all or kwargs.get('ips', False)

        # BUG FIX: 'rU' mode was removed in Python 3.11 (raises
        # ValueError); open with newline='' per the csv module docs.
        with open(self.filepath, 'r', newline='') as filedata:
            reader = csv.reader(filedata)

            for row in reader:
                data.extend(row)
                for text in row:
                    if get_emails:
                        emails.extend(self.parser.emails(text))
                    if get_phone_numbers:
                        # Strip whitespace so split-up numbers match.
                        phone_numbers.extend(self.parser.phones("".join(text.split())))
                    if get_street_addresses:
                        street_addresses.extend(self.parser.street_addresses(text))
                    if get_credit_cards:
                        credit_cards.extend(self.parser.credit_cards(text))
                    if get_ips:
                        ips.extend(self.parser.ips(text))

        for title, tag in self.standford_ner.tag(set(data)):
            if get_people and tag == 'PERSON':
                people.append(title)
            if get_locations and tag == 'LOCATION':
                locations.append(title)
            if get_organizations and tag == 'ORGANIZATION':
                organizations.append(title)

        # BUG FIX: the original gated each key on `<list> is not None`,
        # which is always true for a list, so every category was returned
        # even when not requested. Gate on the request flags instead.
        pii_results = {}
        if get_people: pii_results['people'] = people
        if get_locations: pii_results['locations'] = locations
        if get_organizations: pii_results['organizations'] = organizations
        if get_emails: pii_results['emails'] = emails
        if get_phone_numbers: pii_results['phone_numbers'] = phone_numbers
        if get_street_addresses: pii_results['street_addresses'] = street_addresses
        if get_credit_cards: pii_results['credit_cards'] = credit_cards
        if get_ips: pii_results['ips'] = ips

        return pii_results