import unittest

from commonregex import CommonRegex


class TestPhones(unittest.TestCase):

    def setUp(self):
        self.parser = CommonRegex()

    def test_phones(self):
        formats = ["12345678900", "1234567890", "1 234 567 8900",
                   "234-567-8900", "1-234-567-8900", "1.234.567.8900",
                   "5678900", "567-8900"]
        for f in formats:
            # Each format should be matched exactly once, as the whole string.
            self.assertEqual(self.parser.phones(f), [f])
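# Minimal sketch of a direct test run (standard unittest entry point; this
# guard is an addition, not part of the original snippet):
if __name__ == '__main__':
    unittest.main()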
import csv

from commonregex import CommonRegex
from nltk.tag import StanfordNERTagger


class PiiAnalyzer(object):
    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.standford_ner = StanfordNERTagger(
            'classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []
        with open(self.filepath, 'r') as filedata:
            reader = csv.reader(filedata)
            for row in reader:
                data.extend(row)
                for text in row:
                    # Regex-based extraction per CSV cell; phone numbers are
                    # matched after stripping whitespace from the cell.
                    emails.extend(self.parser.emails(text))
                    phone_numbers.extend(
                        self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))
        # Named-entity recognition over the de-duplicated cell values.
        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            if tag == 'LOCATION':
                locations.append(title)
            if tag == 'ORGANIZATION':
                organizations.append(title)
        return {'people': people, 'locations': locations,
                'organizations': organizations, 'emails': emails,
                'phone_numbers': phone_numbers,
                'street_addresses': street_addresses,
                'credit_cards': credit_cards, 'ips': ips}
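# Hedged usage sketch for the class above. The CSV path is an assumption, and
# StanfordNERTagger needs a local Stanford NER jar plus the classifier file
# (CLASSPATH / STANFORD_MODELS configured) before .tag() will run.
if __name__ == '__main__':
    import pprint

    analyzer = PiiAnalyzer('sample-data.csv')  # hypothetical input file
    pprint.pprint(analyzer.analysis())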
from commonregex import CommonRegex


def extractEntries(zipCode, page):
    # Relies on module-level state: `driver` (a Selenium 3-style WebDriver),
    # `extracted` (list of result dicts) and `dedupeKeys` (set of seen keys).
    entries = driver.find_elements_by_css_selector("td")
    countNewEntries = 0
    countDupes = 0
    countElements = 0
    pageDupes = set()
    for entry in entries:
        children = entry.find_elements_by_css_selector("div b a")
        if len(children) != 1:
            continue
        countElements += 1
        nameElements = entry.find_elements_by_css_selector("div b a")
        name = None
        phone = None
        address = None
        email = None
        url = None
        if nameElements:
            possibleNames = [elem.text for elem in nameElements if elem.text]
            if possibleNames:
                name = possibleNames[0]
        lines = entry.text.splitlines()
        for line in lines:
            parsed_text = CommonRegex(line)
            valid_urls = []
            # CommonRegex exposes result lists when built with text and bound
            # methods when built empty, so both shapes are handled.
            if hasattr(parsed_text.links, '__call__'):
                if parsed_text.links():
                    valid_urls = [link for link in parsed_text.links()
                                  if 'gmail' not in link and 'yahoo' not in link
                                  and 'hotmail' not in link]
            else:
                if parsed_text.links:
                    valid_urls = [link for link in parsed_text.links
                                  if 'gmail' not in link and 'yahoo' not in link
                                  and 'hotmail' not in link]
            if valid_urls:
                url = valid_urls[0]
            if hasattr(parsed_text.emails, '__call__'):
                if parsed_text.emails():
                    email = parsed_text.emails()[0]
            else:
                if parsed_text.emails:
                    email = parsed_text.emails[0]
            if hasattr(parsed_text.phones, '__call__'):
                if parsed_text.phones():
                    phone = parsed_text.phones()[0]
            else:
                if parsed_text.phones:
                    phone = parsed_text.phones[0]
            if hasattr(parsed_text.street_addresses, '__call__'):
                if parsed_text.street_addresses():
                    address = parsed_text.street_addresses()[0]
            else:
                if parsed_text.street_addresses:
                    address = parsed_text.street_addresses[0]
        dataText = entry.text.replace("\n", " --- ")
        if name or phone or email:
            data = {
                "name": name,
                "phone": phone,
                "address": address,
                "email": email,
                "zip": zipCode,
                "url": url,
                "data": dataText
            }
            dedupeKey = str(data['name']) + str(data['email'])
            if dedupeKey not in dedupeKeys:
                countNewEntries += 1
                extracted.append(data)
                dedupeKeys.add(dedupeKey)
            elif dedupeKey not in pageDupes:
                countDupes += 1
                pageDupes.add(dedupeKey)
    print(f" {zipCode}@{page}: Added {countNewEntries} new entries. "
          f"Had {countDupes} dupes. Examined {countElements} elements")
    return countNewEntries > 0
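# Hedged sketch of the module-level setup extractEntries() assumes but never
# creates itself. Firefox and the navigation call are illustrative choices;
# the find_elements_by_css_selector calls above imply a Selenium 3 era driver.
from selenium import webdriver

driver = webdriver.Firefox()   # any WebDriver instance would do
extracted = []                 # result dicts accumulated across pages
dedupeKeys = set()             # name+email keys already recorded

# Typical call pattern (listing_url is hypothetical):
# driver.get(listing_url)
# had_new = extractEntries("94103", 1)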
import csv

from commonregex import CommonRegex
from nltk.tag import StanfordNERTagger

# Setup assumed by this script fragment: a CommonRegex parser and a Stanford
# NER tagger (classifier path mirrors the class-based examples above).
parser = CommonRegex()
standford_ner = StanfordNERTagger(
    'classifiers/english.conll.4class.distsim.crf.ser.gz')

people = []
organizations = []
locations = []
emails = []
phone_numbers = []
street_addresses = []
credit_cards = []
ips = []
data = []

with open('sample-data.csv', 'r') as filedata:
    reader = csv.reader(filedata)
    for row in reader:
        data.extend(row)
        for text in row:
            emails.extend(parser.emails(text))
            phone_numbers.extend(parser.phones("".join(text.split())))
            street_addresses.extend(parser.street_addresses(text))
            credit_cards.extend(parser.credit_cards(text))
            ips.extend(parser.ips(text))

for title, tag in standford_ner.tag(set(data)):
    if tag == 'PERSON':
        people.append(title)
    if tag == 'LOCATION':
        locations.append(title)
    if tag == 'ORGANIZATION':
        organizations.append(title)

sa = {'people': people, 'locations': locations,
      'organizations': organizations, 'emails': emails,
      'phone_numbers': phone_numbers, 'street_addresses': street_addresses,
      'credit_cards': credit_cards, 'ips': ips}
import csv

from commonregex import CommonRegex
from nltk.tag import StanfordNERTagger


class PiiAnalyzer(object):
    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.standford_ner = StanfordNERTagger(
            'classifiers/english.conll.4class.distsim.crf.ser.gz')

    # def analysis(self, people=False, organizations=False, locations=False,
    #              emails=False, phone_numbers=False, street_addresses=False,
    #              credit_cards=False, ips=False):
    def analysis(self, **kwargs):
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        # With no keyword arguments every category is extracted; otherwise
        # only the categories explicitly requested with <name>=True are.
        get_all = len(kwargs) == 0
        get_people = get_all or kwargs.get('people', False)
        get_organizations = get_all or kwargs.get('organizations', False)
        get_locations = get_all or kwargs.get('locations', False)
        get_emails = get_all or kwargs.get('emails', False)
        get_phone_numbers = get_all or kwargs.get('phone_numbers', False)
        get_street_addresses = get_all or kwargs.get('street_addresses', False)
        get_credit_cards = get_all or kwargs.get('credit_cards', False)
        get_ips = get_all or kwargs.get('ips', False)

        with open(self.filepath, 'r') as filedata:
            reader = csv.reader(filedata)
            for row in reader:
                data.extend(row)
                for text in row:
                    if get_emails:
                        emails.extend(self.parser.emails(text))
                    if get_phone_numbers:
                        phone_numbers.extend(
                            self.parser.phones("".join(text.split())))
                    if get_street_addresses:
                        street_addresses.extend(
                            self.parser.street_addresses(text))
                    if get_credit_cards:
                        credit_cards.extend(self.parser.credit_cards(text))
                    if get_ips:
                        ips.extend(self.parser.ips(text))

        for title, tag in self.standford_ner.tag(set(data)):
            if get_people and tag == 'PERSON':
                people.append(title)
            if get_locations and tag == 'LOCATION':
                locations.append(title)
            if get_organizations and tag == 'ORGANIZATION':
                organizations.append(title)

        # Only report the categories that were actually requested.
        pii_results = dict()
        if get_people:
            pii_results['people'] = people
        if get_locations:
            pii_results['locations'] = locations
        if get_organizations:
            pii_results['organizations'] = organizations
        if get_emails:
            pii_results['emails'] = emails
        if get_phone_numbers:
            pii_results['phone_numbers'] = phone_numbers
        if get_street_addresses:
            pii_results['street_addresses'] = street_addresses
        if get_credit_cards:
            pii_results['credit_cards'] = credit_cards
        if get_ips:
            pii_results['ips'] = ips
        return pii_results
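# Hedged usage sketch for the kwargs-driven variant above: with no keyword
# arguments every category is extracted, while explicit flags restrict the
# work (and the returned keys) to the requested categories. The file name is
# an assumption.
if __name__ == '__main__':
    analyzer = PiiAnalyzer('sample-data.csv')   # hypothetical input file
    everything = analyzer.analysis()            # all categories
    contact_info = analyzer.analysis(emails=True, phone_numbers=True)
    print(contact_info['emails'])
    print(contact_info['phone_numbers'])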