class TestDates(unittest.TestCase):
    def setUp(self):
        self.parser = CommonRegex()

    def test_numeric(self):
        formats = ["1-19-14", "1.19.14", "1/19/14", "01.19.14"]
        for f in formats:
            self.assertEqual(self.parser.dates(f), [f])

    def test_verbose(self):
        formats = ["January 19th, 2014", "Jan. 19th, 2014",
                   "Jan 19 2014", "19 Jan 2014"]
        for f in formats:
            self.assertEqual(self.parser.dates(f), [f])
class PiiAnalyzer(object):
    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.standford_ner = StanfordNERTagger(
            'classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []
        with open(self.filepath, 'rU') as filedata:
            reader = csv.reader(filedata)
            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    phone_numbers.extend(
                        self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))
        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            if tag == 'LOCATION':
                locations.append(title)
            if tag == 'ORGANIZATION':
                organizations.append(title)
        return {
            'people': people,
            'locations': locations,
            'organizations': organizations,
            'emails': emails,
            'phone_numbers': phone_numbers,
            'street_addresses': street_addresses,
            'credit_cards': credit_cards,
            'ips': ips
        }
def addresses(data):
    st_address = []
    loc = ''
    loc_list = []
    parsed_text = CommonRegex(data)
    st_address = parsed_text.street_addresses
    words = nltk.word_tokenize(data)
    taggsets = nltk.pos_tag(words)
    namedEnt = nltk.ne_chunk(taggsets, binary=False)
    for subtree in namedEnt.subtrees():
        if subtree.label() == 'GPE':
            l = []
            # print((subtree.leaves()))
            for leaf in subtree.leaves():
                l.append(leaf[0])
            loc = ' '.join(l)
            if loc not in loc_list:
                loc_list.append(loc)
    loc_list.extend(st_address)
    # for add in loc_list:
    #     data = data.replace(add, block)
    return data, loc_list
def check_for_address_format(addresses):
    final = []
    for item in addresses:
        parsed = CommonRegex(str(item))
        if parsed.street_addresses:
            final.append(parsed)
    return len(final)
def dates(data):
    parsed_text = CommonRegex(data)
    dates_list = parsed_text.dates
    # for date in dates_list:
    #     data = data.replace(date, block)
    return data, dates_list
class TestTimes(unittest.TestCase):
    def setUp(self):
        self.parser = CommonRegex()

    def test_times(self):
        formats = ["09:45", "9:45", "23:45", "9:00am", "9am",
                   "9:00 A.M.", "9:00 pm"]
        for f in formats:
            self.assertEqual(self.parser.times(f), [f])
def getFields(filename):
    filenamer = filename
    text = open(filenamer).read()
    text = text.upper()
    # print(text)
    # parser.parse(text)
    # print(Parser.contact)
    test_address = parsed_text = CommonRegex(text)
    addresses = parsed_text.street_addresses
    print(addresses)
def index():
    if request.method == 'GET':
        return 'OK'
    elif request.method == 'POST':
        # Store the IP address of the requester
        request_ip = ipaddress.ip_address(u'{0}'.format(request.remote_addr))
        # If GHE_ADDRESS is specified, use it as the hook_blocks.
        if os.environ.get('GHE_ADDRESS', None):
            hook_blocks = [os.environ.get('GHE_ADDRESS')]
        # Otherwise get the hook address blocks from the API.
        else:
            hook_blocks = requests.get(
                'https://api.github.com/meta').json()['hooks']
        if request.headers.get('X-GitHub-Event') == "ping":
            return json.dumps({'msg': 'Hi!'})
        if request.headers.get('X-GitHub-Event') == 'pull_request':
            merge_state = request.json['pull_request']['state']
            merge_body = request.json['pull_request']['body']
            if (merge_state == 'closed'):
                print('Merge state closed')
                print('Merge Body: ' + merge_body)
                parsed_bounty_issue = re.findall(r"#(\w+)", merge_body)[0]
                repository_path_encode = str(repository_path)
                repository_path_encode = repository_path.encode('utf-8')
                bounty_issue_encode = str(parsed_bounty_issue)
                bounty_issue_encode = bounty_issue_encode.encode('utf-8')
                passphrase = hashlib.sha256(repository_path +
                                            issue_title).hexdigest()
                addresses = CommonRegex(merge_body).btc_addresses[0]
                parsed_bounty_issue = re.findall(r"#(\w+)", merge_body)
                bounty_address = github.get_address_from_issue(
                    parsed_bounty_issue)
                amount = utils.get_address_balance(bounty_address)
                with open(DEFAULT_WALLET_PATH, 'r') as f:
                    json_data = json.load(f)
                    issue_name = json_data[parsed_bounty_issue]
                multisig_wallet.send_bitcoin(str(issue_name), str(addresses),
                                             int(amount * 1e8), str(passphrase))
                return json.dumps({'message': 'Pull request received'})
            return json.dumps({'message': 'Pull request payout failed'})
        if request.headers.get('X-GitHub-Event') == 'issue_comment':
            comment_data = {
                'url': request.json['comment']['issue_url'],
                'payout_address': request.json['issue']['labels'][0]['name'],
                'payout_amount': request.json['issue']['labels'][1]['name'],
                'body': request.json['comment']['body']
            }
            print(comment_data)
            return json.dumps({'message': 'Issue comment received'})
class TestLinks(unittest.TestCase):
    def setUp(self):
        self.parser = CommonRegex()

    def test_links(self):
        formats = ["www.google.com", "http://www.google.com",
                   "www.google.com/?query=dog", "sub.example.com",
                   "http://www.google.com/%&#/?q=dog"]
        for f in formats:
            self.assertEqual(self.parser.links(f), [f])
def contains_personal_info(text):
    parsed_text = CommonRegex(text)
    return any([
        bool(parsed_text.links),
        bool(parsed_text.emails),
        bool(parsed_text.ips),
        bool(parsed_text.ipv6s),
        bool(parsed_text.credit_cards),
        bool(parsed_text.btc_addresses),
    ])
class TestPhones(unittest.TestCase):
    def setUp(self):
        self.parser = CommonRegex()

    def test_phones(self):
        formats = ["12345678900", "1234567890", "1 234 567 8900",
                   "234-567-8900", "1-234-567-8900", "1.234.567.8900",
                   "5678900", "567-8900"]
        for f in formats:
            self.assertEqual(self.parser.phones(f), [f])
def extract_email(values):
    emaillist = []
    perm_email = []
    for item in values:
        Email = CommonRegex(str(item))
        if Email.emails:
            emaillist.append(Email.emails)
    for i in emaillist:
        for j in i:
            perm_email.append(j)
    return perm_email
def extract_phonenumbers(values):
    phonelist = []
    perm_phone = []
    for item in values:
        phonenumbers = CommonRegex(str(item))
        if phonenumbers.phones:
            phonelist.append(phonenumbers.phones)
    for i in phonelist:
        for j in i:
            perm_phone.append(j)
    return perm_phone
def find_numbers(data):
    numbers = []
    data1 = CommonRegex(data)
    if data1.phones:
        for n in data1.phones:
            numbers.append(n)
    size = len(numbers)
    stats = (
        "The number of unique phone numbers replaced in the given file is %d \n" % size)
    return numbers, stats
def find_address(data):
    addresses = []
    data1 = CommonRegex(data)
    if data1.street_addresses:
        for n in data1.street_addresses:
            addresses.append(n)
    size = len(addresses)
    stats = (
        "The number of unique addresses replaced in the given file is %d \n" % size)
    return addresses, stats
def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
    """Scan the text and return the first PiiType found, if any"""
    regex_result = CommonRegex(datum)
    if regex_result.phones:  # pylint: disable=no-member
        return Phone()
    if regex_result.emails:  # pylint: disable=no-member
        return Email()
    if regex_result.credit_cards:  # pylint: disable=no-member
        return CreditCard()
    if regex_result.street_addresses:  # pylint: disable=no-member
        return Address()
    return None
def find_dates(data):
    data1 = CommonRegex(data)
    date5 = []
    if data1.dates:
        for n in data1.dates:
            date5.append(n)
    for n in date5:
        data = data.replace(n, "█" * len(n))
    size = len(date5)
    stats = ("The number of unique dates replaced in the given file is %d \n" % size)
    return date5, stats
def scan(self, text):
    """Scan the text and return an array of PiiTypes that are found"""
    regex_result = CommonRegex(text)
    types = []
    if regex_result.phones:  # pylint: disable=no-member
        types.append(PiiTypes.PHONE)
    if regex_result.emails:  # pylint: disable=no-member
        types.append(PiiTypes.EMAIL)
    if regex_result.credit_cards:  # pylint: disable=no-member
        types.append(PiiTypes.CREDIT_CARD)
    if regex_result.street_addresses:  # pylint: disable=no-member
        types.append(PiiTypes.ADDRESS)
    return types
def get_address_from_issue(issue_number):
    github_url = ("https://api.github.com/repos/" + repository['path'] +
                  '/issues/' + issue_number)
    headers = {
        "Authorization": "token " + GITHUB_TOKEN,
        "Content-Type": "application/json"
    }
    # Setup the request
    req = urllib.request.Request(github_url, headers=headers)
    # Make the request, capture the response
    res = urllib.request.urlopen(req).read()
    res = json.loads(res.decode())
    body = res['body']
    address = CommonRegex(body).btc_addresses[0]
    return address
def find_info(text, attribute):
    if text is None:
        return []
    parsed_text = CommonRegex(text)
    if attribute == Information.link:
        return parsed_text.links
    if attribute == Information.email:
        return parsed_text.emails
    if attribute == Information.time:
        return parsed_text.times
    if attribute == Information.date:
        return parsed_text.dates
    if attribute == Information.phone:
        return parsed_text.phones
    if attribute == Information.price:
        return parsed_text.prices
    if attribute == Information.address:
        return parsed_text.street_addresses
def dates(data):
    text_parsed = CommonRegex(data)
    list_of_dates = text_parsed.dates
    # Alternatively, dates could be found with a hand-written regular
    # expression; the pattern can be defined however we want:
    """
    init_file = []
    total_data = data
    dates_red = []
    for i in range(len(total_data)):
        red = total_data[i]
        red_dates = re.findall(r"([A-Z]\w\w\w+\s\d+,\s\d\d\d\d)", red)
        dates_red.append(red_dates)
    for i in range(len(dates_red)):
        for j in dates_red:
            red = red.replace(j, '██')
        init_file.append(red)
    return init_file
    """
    return data, list_of_dates
def is_stop_sentence(sentence):
    words = {
        'copyright', 'unsubscribe', 'instagram', 'twitter', 'facebook',
        'youtube', 'style='
    }
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        if token.lower() in words:
            return True
    parsed_text = CommonRegex(sentence)
    if len(parsed_text.street_addresses) > 0 or len(
            parsed_text.emails) > 0 or len(parsed_text.phones) > 0 or len(
                parsed_text.times) > 0 or len(parsed_text.dates) > 0 or len(
                    parsed_text.links) > 0 or len(parsed_text.zip_codes) > 0:
        return True
    return False
def parsedContent(this):
    openObject = open(this.path, "r")
    this.contents = openObject.read()
    this.parsed_text = CommonRegex(this.contents)
def parsedContent(self):
    openObject = open(self.path, "r")
    self.contents = openObject.read()
    self.parsed_text = CommonRegex(self.contents)
def extractEntries(zipCode, page):
    entries = driver.find_elements_by_css_selector("td")
    countNewEntries = 0
    countDupes = 0
    countElements = 0
    pageDupes = set()
    for entry in entries:
        children = entry.find_elements_by_css_selector("div b a")
        if len(children) != 1:
            continue
        countElements += 1
        nameElements = entry.find_elements_by_css_selector("div b a")
        name = None
        phone = None
        address = None
        email = None
        url = None
        if nameElements:
            possibleNames = [elem.text for elem in nameElements if elem.text]
            if possibleNames:
                name = possibleNames[0]
        lines = entry.text.splitlines()
        for line in lines:
            parsed_text = CommonRegex(line)
            valid_urls = []
            if hasattr(parsed_text.links, '__call__'):
                if parsed_text.links():
                    valid_urls = [link for link in parsed_text.links()
                                  if 'gmail' not in link and 'yahoo' not in link
                                  and 'hotmail' not in link]
            else:
                if parsed_text.links:
                    valid_urls = [link for link in parsed_text.links
                                  if 'gmail' not in link and 'yahoo' not in link
                                  and 'hotmail' not in link]
            if valid_urls:
                url = valid_urls[0]
            if hasattr(parsed_text.emails, '__call__'):
                if parsed_text.emails():
                    email = parsed_text.emails()[0]
            else:
                if parsed_text.emails:
                    email = parsed_text.emails[0]
            if hasattr(parsed_text.phones, '__call__'):
                if parsed_text.phones():
                    phone = parsed_text.phones()[0]
            else:
                if parsed_text.phones:
                    phone = parsed_text.phones[0]
            if hasattr(parsed_text.street_addresses, '__call__'):
                if parsed_text.street_addresses():
                    address = parsed_text.street_addresses()[0]
            else:
                if parsed_text.street_addresses:
                    address = parsed_text.street_addresses[0]
        dataText = entry.text.replace("\n", " --- ")
        if name or phone or email:
            data = {
                "name": name,
                "phone": phone,
                "address": address,
                "email": email,
                "zip": zipCode,
                "url": url,
                "data": dataText
            }
            dedupeKey = str(data['name']) + str(data['email'])
            if dedupeKey not in dedupeKeys:
                countNewEntries += 1
                extracted.append(data)
                dedupeKeys.add(dedupeKey)
            elif dedupeKey not in pageDupes:
                countDupes += 1
                pageDupes.add(dedupeKey)
    print(f" {zipCode}@{page}: Added {countNewEntries} new entries. Had {countDupes} dupes. Examined {countElements} elements")
    return countNewEntries > 0
def index():
    if request.method == 'GET':
        return 'OK'
    elif request.method == 'POST':
        # Store the IP address of the requester
        request_ip = ipaddress.ip_address(u'{0}'.format(request.remote_addr))
        # If GHE_ADDRESS is specified, use it as the hook_blocks.
        if os.environ.get('GHE_ADDRESS', None):
            hook_blocks = [os.environ.get('GHE_ADDRESS')]
        # Otherwise get the hook address blocks from the API.
        else:
            hook_blocks = requests.get(
                'https://api.github.com/meta').json()['hooks']
        if request.headers.get('X-GitHub-Event') == "ping":
            return json.dumps({'msg': 'Hi!'})
        if request.headers.get('X-GitHub-Event') == 'pull_request':
            if (request.json['pull_request']['user']['site_admin'] == 'false'):
                return json.dumps(
                    {'message': 'Pull request not submitted by site admin'})
            merge_state = request.json['pull_request']['state']
            merge_body = request.json['pull_request']['body']
            if (merge_state == 'closed'):
                print('Merge state closed')
                print('Merge Body: ' + merge_body)
                parsed_bounty_issue = re.findall(r"#(\w+)", merge_body)[0]
                addresses = CommonRegex(merge_body).btc_addresses[0]
                bounty_address = github.get_address_from_issue(
                    parsed_bounty_issue)
                amount = multisig_wallet.get_address_balance(bounty_address)
                try:
                    # use username to look up wallet Id
                    with open(DEFAULT_WALLET_PATH, 'r') as wallet:
                        data = json.loads(wallet.read())
                        for user in data:
                            try:
                                if (user['issue_number'] == int(
                                        parsed_bounty_issue)):
                                    print('Wallet found')
                                    wallet_name = user['wallet_name']
                                    walletId = user[wallet_name]['walletId']
                            except:
                                print('Loading wallet..')
                except:
                    print('Wallet not found, creating new user...')
                # Set up sending of the bounty
                issue_title = wallet_name
                repository_path_encode = repository_path.encode('utf-8')
                issue_title_encode = issue_title.encode('utf-8')
                passphrase = hashlib.sha256(repository_path_encode +
                                            issue_title_encode).hexdigest()
                multisig_wallet.send_bitcoin_simple(walletId, str(addresses),
                                                    amount, passphrase)
                # Set up sending of the tweet
                usd_per_btc = requests.get(
                    'https://bitpay.com/api/rates/usd').json()['rate']
                bounty_in_btc = round((int(bounty_in_satoshi) / 10**8), 3)
                bounty_in_usd = round(bounty_in_btc * usd_per_btc, 2)
                url = ('https://github.com/21hackers/git-money/issues/' +
                       parsed_bounty_issue)
                twitter.send('Bounty Granted (' + amount + ' bits ~ $' +
                             bounty_in_usd + '): ' + issue_title + ' ' + url)
                return json.dumps({'message': 'Pull request received'})
            return json.dumps({'message': 'Pull request payout failed'})
        if request.headers.get('X-GitHub-Event') == 'issue_comment':
            comment_data = {
                'url': request.json['comment']['issue_url'],
                'payout_address': request.json['issue']['labels'][0]['name'],
                'payout_amount': request.json['issue']['labels'][1]['name'],
                'body': request.json['comment']['body']
            }
            print(comment_data)
            return json.dumps({'message': 'Issue comment received'})
def setUp(self):
    self.parser = CommonRegex()
def echo():
    # Getting the data
    InputData = request.args.get('echoValue')  # unicode
    if InputData:
        EncodedInputData = InputData.encode("utf-8")
    else:
        return jsonify(Empty='Empty')
    SpacySmallData = SpacySmall(InputData)
    SpacyMediumData = SpacyMedium(InputData)
    places1 = GeoText(EncodedInputData)
    # Email Regex expression library
    email = []
    parsed_text = CommonRegex(EncodedInputData)
    email.append(parsed_text.emails)
    # URL Finding (URL Regex)
    URLMatcher = re.search(
        r'\(?\b(http://|www[.])[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]',
        EncodedInputData)
    url = []
    if URLMatcher:
        url.append(URLMatcher.group(0))
    places = GeoText(EncodedInputData)
    # Finding name using the Spacy library
    name = []
    for ent in SpacySmallData.ents:
        if ent.label_ == 'PERSON':
            name.append((ent.text))
    names = list(set(name) - set(places1.countries) - set(places1.cities))
    name1 = [x for x in names if not any(x1.isdigit() for x1 in x)]
    org = []
    # Finding the Organisation using the Spacy library
    for ent in SpacyMediumData.ents:
        if ent.label_ == 'ORG':
            org.append((ent.text))
    if not org:
        for ent in SpacySmallData.ents:
            if ent.label_ == 'ORG':
                org.append((ent.text))
    places2 = GeoText(EncodedInputData)
    language = ""
    # Detecting the language
    try:
        language = detect(InputData)
    except:
        pass
    # Distinguishing between Phone number, Fax number and Mobile Number
    TrainedSpacy = TrainedSpacyData(InputData)
    mob = []
    fax = []
    tel = []
    for ent in TrainedSpacy.ents:
        if ent.label_ == 'TEL':
            tel.append((ent.text))
        if ent.label_ == 'MOB':
            mob.append(ent.text)
        if ent.label_ == 'FAX':
            fax.append(ent.text)
    # Finding the word in front of the phone number and classifying it as
    # Phone, Fax or Mob --- this is done using the phonenumbers library.
    tel1 = []
    fax1 = []
    mob1 = []
    for match in phonenumbers.PhoneNumberMatcher(InputData, None):
        cnew = NaiveBayesClassifier(train)
        a = match.start
        start = match.start
        end = match.end
        # print a
        my = InputData
        # if (a > 1):
        try:
            word = my[:(a - 1)].split()[-1]
            # print word
            if (any(x.isalpha() for x in word)):
                if cnew.classify(word) == 'tel':
                    tel1.append(my[start:end])
                elif cnew.classify(word) == 'fax':
                    fax1.append(my[start:end])
                elif cnew.classify(word) == 'mob':
                    mob1.append(my[start:end])
            else:
                word2 = my[:(a - 1)].split()[-2]
                if cnew.classify(word2) == 'tel':
                    tel1.append(my[start:end])
                elif cnew.classify(word2) == 'fax':
                    fax1.append(my[start:end])
                elif cnew.classify(word2) == 'mob':
                    mob1.append(my[start:end])
        except IndexError:
            tel1.append(my[start:end])
    # Find the word in front of the phone number and classify it as
    # Phone/Fax/Mob - this is done using the trained model
    phone = TrainedSpacyData(InputData)
    for ent in phone.ents:
        if (ent.label_ == 'TEL' or ent.label_ == 'FAX' or ent.label_ == 'MOB'):
            cnew = NaiveBayesClassifier(train)
            a = ent.start_char
            # print a
            my = InputData
            try:
                word = my[:(a - 1)].split()[-1]
                if (any(x.isalpha() for x in word)):
                    # print (word)
                    if cnew.classify(word) == 'tel':
                        tel1.append(ent.text)
                    elif cnew.classify(word) == 'fax':
                        fax1.append(ent.text)
                    elif cnew.classify(word) == 'mob':
                        mob1.append(ent.text)
                else:
                    word2 = my[:(a - 1)].split()[-2]
                    # print (word2)
                    if cnew.classify(word2) == 'tel':
                        tel1.append(ent.text)
                    elif cnew.classify(word2) == 'fax':
                        fax1.append(ent.text)
                    elif cnew.classify(word2) == 'mob':
                        mob1.append(ent.text)
            except IndexError:
                tel1.append(ent.text)
    tel1 = [x for x in tel1 if sum(c.isdigit() for c in x) > 9]
    fax1 = [x for x in fax1 if sum(c.isdigit() for c in x) > 9]
    mob1 = [x for x in mob1 if sum(c.isdigit() for c in x) > 9]
    tel2 = list(set(tel1))
    fax2 = list(set(fax1))
    mob2 = list(set(mob1))
    # Remove alphabetic characters in telephone numbers
    tel2 = removeAplhabet(tel2)
    fax2 = removeAplhabet(fax2)
    mob2 = removeAplhabet(mob2)
    # Title detection
    # Finding Position entity by using library and model
    pos = []
    data = EncodedInputData.decode("utf-8")
    data = data.title()
    title = finder.findall(data)
    y = 0
    for x in title:
        pos.append(InputData[title[y][0]:title[y][1]])
        y += 1
    if not pos:
        for ent in TrainedSpacy.ents:
            if ent.label_ == 'POS':
                pos.append(ent.text)
    # Continuation of the Organisation to remove duplicates from the position
    for x in pos:
        for y in org:
            if x in y:
                org.remove(y)
    org1 = list(
        set(org) - set(places1.countries) - set(places1.cities) - set(name1))
    orgg = []
    for x in org1:
        orgg.append((x))
    # Address Detection
    AddressList = []
    # Using the PYAP library
    addresses = pyap.parse(EncodedInputData, country='US')
    for address in addresses:
        AddressList.append(str(address))
    # Identifying Address using Trained Spacy Model
    if not AddressList:
        for ent in phone.ents:
            if ent.label_ == 'add':
                AddressList.append(ent.text)
    # Identifying Address by finding the line where it has City
    if not AddressList:
        for line in EncodedInputData.splitlines():
            for city in places2.cities:
                if city in line:
                    AddressList.append(line)
    AddressList = [x for x in AddressList if "Mobile" not in x]  # Limitation
    AddressList = [x for x in AddressList if "MOBILE" not in x]  # Limitation
    # Identifying Address using Regex Expression
    if not AddressList:
        if places2.cities:
            r2 = re.compile(
                r'([(\d|-|/|/s|(A-Z)?){1-7}]+[,|-|\s]+[A-zZ]+[Aa-zZ]+.*)')
            add = r2.findall(EncodedInputData)
            # print add
            for text in add:
                for text2 in places.cities:
                    if text2 in text:
                        AddressList.append(text)
    wotex = list(set(AddressList))
    # Passing Address into the Geocoder-GOOGLE Library to extract into components
    import geocoder
    add_1 = []
    city_1 = []
    country_1 = []
    code_1 = []
    zip_1 = []
    county_1 = []
    state_1 = []
    for x in wotex:
        try:
            g1 = geocoder.google(x)
            if g1.postal:
                zip_1.append(g1.postal)
            add1 = ""
            if g1.housenumber:
                add1 = g1.housenumber
            if g1.street:
                add1 = add1 + " " + g1.street
            add_1.append(add1)
            if g1.country:
                code_1.append(g1.country)
            # city_1.append(g1.city)
            if g1.city:
                city_1.append(g1.city)
            if g1.country_long:
                country_1.append(g1.country_long)
            if g1.state_long:
                state_1.append(g1.state_long)
            if g1.county:
                county_1.append(g1.county)
            if not g1.city:
                place_new = GeoText(x)
                y = str(place_new.cities)
                g2 = geocoder.google(y)
                if g2.country:
                    code_1.append(g2.country)
                if g2.country_long:
                    country_1.append(g2.country_long)
                if g2.city:
                    city_1.append(g2.city)
                if g2.postal:
                    zip_1.append(g2.postal)
                if g2.county:
                    county_1.append(g2.county)
        except Exception:
            pass
    # Passing Address into the Geocoder-OSM Library to extract into components
    add_OSM = []
    city_OSM = []
    zip_OSM = []
    code_OSM = []
    country_OSM = []
    county_OSM = []
    state_OSM = []
    for y in wotex:
        try:
            g21 = geocoder.osm(y)
            add2 = ""
            if g21.json and 'housenumber' in g21.json:
                add2 = g21.json['housenumber']
            if g21.json and 'street' in g21.json:
                add2 = add2 + ' ' + g21.json['street']
            add_OSM.append(add2)
            if g21.json and 'city' in g21.json:
                city_OSM.append(g21.json['city'])
            if g21.json and 'country' in g21.json:
                country_OSM.append(g21.json['country'])
            if g21.json and 'postal' in g21.json:
                zip_OSM.append(g21.json['postal'])
            if g21.json and g21.json['raw']['address']['country_code']:
                code_OSM.append(g21.json['raw']['address']['country_code'])
            if g21.json and 'county' in g21.json:
                county_OSM.append(g21.json['county'])
            if g21.json and 'state' in g21.json:
                state_OSM.append(g21.json['state'])
            if not city_OSM:
                placess = GeoText(y)
                x = str(placess.cities)
                print(x)
                g3 = geocoder.osm(x)
                print(g3.json)
                if g3.json and 'city' in g3.json:
                    city_OSM.append(g3.json['city'])
                if g3.json and 'country' in g3.json:
                    country_OSM.append(g3.json['country'])
                if g3.json and 'postal' in g3.json:
                    zip_OSM.append(g3.json['postal'])
                if g3.json and g3.json['raw']['address']['country_code']:
                    code_OSM.append(g3.json['raw']['address']['country_code'])
                if g3.json and 'county' in g3.json:
                    county_OSM.append(g3.json['county'])
                if g3.json and 'state' in g3.json:
                    state_OSM.append(g3.json['state'])
        except Exception:
            pass
    email_N = ','.join(map(unicode, parsed_text.emails))
    url_N = ','.join(map(unicode, url))
    name1_N = ','.join(map(unicode, name1))
    orgg_N = ','.join(map(unicode, orgg))
    try:
        wotex_N = ','.join(map(unicode, wotex))
    except Exception:
        wotex_N = ','.join(map(str, wotex))
    pos_N = ','.join(map(unicode, pos))
    tel_N = ','.join(map(unicode, tel))
    mob_N = ','.join(map(unicode, mob))
    fax_N = ','.join(map(unicode, fax))
    tel2_N = ','.join(map(unicode, tel2))
    fax2_N = ','.join(map(unicode, fax2))
    mob2_N = ','.join(map(unicode, mob2))
    add_1_N = ','.join(map(unicode, add_1))
    zip_1_N = ','.join(map(unicode, zip_1))
    code_1_N = ','.join(map(unicode, code_1))
    add_OSM_N = ','.join(map(unicode, add_OSM))
    zip_OSM_N = ','.join(map(unicode, zip_OSM))
    code_OSM_N = ','.join(map(unicode, code_OSM))
    city_1_N = ','.join(map(unicode, city_1))
    city_OSM_N = ','.join(map(unicode, city_OSM))
    country_1_N = ','.join(map(unicode, country_1))
    country_OSM_N = ','.join(map(unicode, country_OSM))
    state_1_N = ','.join(map(unicode, state_1))
    county_1_N = ','.join(map(unicode, county_1))
    # state_OSM_N = ','.join(map(unicode, state_OSM))
    # county_OSM_N = ','.join(map(unicode, county_OSM))
    return jsonify(Email=email_N, Www=url_N, Name=name1_N, Organization=orgg_N,
                   FullAddress=wotex_N, Role=pos_N, Tel1=tel_N, Mob1=mob_N,
                   Fax1=fax_N, Phone=tel2_N, Fax=fax2_N, Mobile=mob2_N,
                   Address1Google=add_1_N, ZipCodeGoogle=zip_1_N,
                   CountryCodeGoogle=code_1_N, Address1Osm=add_OSM_N,
                   ZipCodeOsm=zip_OSM_N, CountryCodeOsm=code_OSM_N,
                   CityGoogle=city_1_N, CityOsm=city_OSM_N,
                   CountryGoogle=country_1_N, CountryOsm=country_OSM_N,
                   StateGoogle=state_1_N, CountyGoogle=county_1_N,
                   Language=language)
entities_list = []
from chunk import chunker
for sent in nltk.sent_tokenize(text):
    # print pos_tag(tokenize(sent))
    for chunk in chunker.parse(pos_tag(tokenize(sent))):
        # print chunk
        entities_list.append(chunk)
with open('out.txt', 'w') as file:
    file.write(str(entities_list))
# print entities_list

import unicodecsv as csv
from commonregex import CommonRegex
from nltk.tag.stanford import StanfordNERTagger

parser = CommonRegex()
standford_ner = StanfordNERTagger(
    'classifiers/english.conll.4class.distsim.crf.ser.gz', 'stanford-ner.jar')
people = []
organizations = []
locations = []
emails = []
phone_numbers = []
street_addresses = []
credit_cards = []
ips = []
data = []
with open('sample-data.csv', 'r') as filedata:
    reader = csv.reader(filedata)
    for row in reader:
def __init__(self, filepath):
    self.filepath = filepath
    self.parser = CommonRegex()
    self.standford_ner = StanfordNERTagger(
        'classifiers/english.conll.4class.distsim.crf.ser.gz')
def is_other(df):
    other_dic = {}
    parser = CommonRegex()
    find_name = NameDataset()
    checks = [
        is_person_name, is_business_name, is_phone_number, is_address,
        is_street_name, is_city_agency, is_city, is_neighborhood,
        is_lat_lon_cord, is_zip_code, is_borough, is_school_name, is_color,
        is_car_make, is_area_of_study, is_subject_in_school, is_school_level,
        is_college_name, is_website, is_building_classification,
        is_vehicle_type, is_location_type, is_park_playground, is_letter
    ]
    for check in checks:
        other_label, other_count = check(df)
        other_dic[other_label] = other_count
    other_label_list = []
    other_label_count = []
    k = Counter(other_dic)
    top_three = k.most_common(3)
    for i in top_three:
        other_label_list.append(i[0])
        other_label_count.append(i[1])
    return other_label_list, other_label_count
    for i, d in enumerate(dictionary):
        if d[0] == word:
            wordID = i
            features_matrix[0, wordID] = words.count(word)
    return features_matrix


test_doc = 'travel-nontravel/tr3.txt'
doc_matrix = extract_features_for_single_doc(test_doc)
result3 = model1.predict(doc_matrix)
if result3 == 0:
    print("non travel")
else:
    print("travel")
print(str(result3) + "\n")
if result3 == 1:
    f = open(test_doc, "r")
    if f.mode == 'r':
        contents = f.read()
        print(contents)
        # matches = datefinder.find_dates(contents)
        # for match in matches:
        #     print(match)
        parsed_text = CommonRegex(contents)
        print(parsed_text.times)
        print(parsed_text.dates)
        print(parsed_text.street_addresses)
        print(parsed_text.btc_addresses)
import requests
import json
from commonregex import CommonRegex
from .exceptions import *
from .helper import make_request
from sentry_sdk import capture_message, capture_exception
from urllib.parse import urlencode
import html

parser = CommonRegex()


class SongParser(object):
    base_url = "https://songwhip.com/api/"
    headers = {
        'Origin': 'https://songwhip.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53',
        'Content-Type': 'application/json'
    }
    # payload = "{\"url\":\"{}\",\"country\":\"IN\"}"
    payload = {"country": "IN", "url": ""}
    SONG_SERVICES_MAP = {
        "tidal": "Tidal",
        "deezer": "Deezer",
        "itunes": "iTunes",
        "pandora": "Pandora",
        "spotify": "Spotify",
        "youtube": "YouTube",
        "googleplay": "Google Play",
        "itunesStore": "iTunes Store",
def __mark_merchant_line(text_lines, line_rect_list):
    """ Check and mark the Merchant lines """
    merchant_rect = None
    name_line_list = []
    page_rect = line_rect_list['rect'][0]
    list_no_name = ['welcome', 'thank you', 'customer', 'copy', 'only', '*',
                    'ticket', '(', ')', ':', 'invoice', '!', 'more',
                    'congratulation', 'bill']
    for i in range(len(text_lines)):
        # pre-processing of text line
        for j in range(i + 1, len(line_rect_list['text'])):
            if text_lines[i] == line_rect_list['text'][j]:
                break
        line_rect = line_rect_list['rect'][j]
        text_lines[i] = text_lines[i].replace('Welcome to', '')
        text_lines[i] = text_lines[i].strip('-')
        # check contains of key in list_no_name
        f_check_no_list = False
        for j in range(len(list_no_name)):
            if text_lines[i].lower().__contains__(list_no_name[j]):
                f_check_no_list = True
                break
        if f_check_no_list:
            continue
        # check validation of key
        if len(text_lines[i]) <= 2:
            continue
        elif len(name_line_list) > 0 and name_line_list[-1] + 1 != i:
            break
        elif len(name_line_list) > 0 and text_lines[i].__contains__(
                text_lines[name_line_list[-1]]):
            continue
        elif len(name_line_list) > 2:
            continue
        elif len(name_line_list) > 1 and not text_lines[i].isupper():
            continue
        elif text_lines[i][0] == '#':
            continue
        elif len(CommonRegex(text_lines[i]).dates) > 0:
            continue
        elif len(CommonRegex(text_lines[i]).phones) > 0:
            continue
        elif len(CommonRegex(text_lines[i]).links) > 0:
            continue
        elif len(text_lines[i].replace('@', '').replace('&', '').split()) > 5:
            continue
        elif len(text_lines[i].split()) > 3 and text_lines[i].__contains__('-'):
            continue
        elif text_lines[i].replace('-', '').replace(' ', '').isdigit():  # '305337 - 1'
            continue
        elif len(name_line_list) > 0 and line_rect[1] > 2 * merchant_rect[3] - merchant_rect[1]:
            continue
        elif (line_rect[0] + line_rect[2]) > (page_rect[0] + page_rect[2]) * 1.3:  # check the position
            continue
        name_line_list.append(i)
        merchant_rect = line_rect
    return name_line_list
def sanitize(text, sanitized_filename):
    # Remove web URLs
    sanitized = re.sub(r"http\S+", "", text)
    # Remove HTML tags
    sanitized = re.sub('<[^<]+?>', '', sanitized)
    # Remove HTML characters
    sanitized = re.sub('&[^ ]+;', '', sanitized)
    # Remove state abbreviations
    sanitized = re.sub(
        r'(?<!\w)(?:,\s+)?(?:A[LKZR]|C[AOT]|DE|FL|GA|HI|I[ADLN]|K[SY]|LA|M[EDAINSOT]|N[EVHJMYCD]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY]),?\s?(?!\w)',
        '', sanitized)
    fo = open('unique_senders.txt')
    try:
        text_ = fo.read()
    finally:
        fo.close()
    # Load natural language processor and add custom organizations
    nlp = spacy.load("en_core_web_lg")
    ruler = nlp.add_pipe("entity_ruler")
    unique_senders = set(text_.split('\n'))
    patterns = list()
    for unique_sender in unique_senders:
        pattern = {"label": "ORG", "pattern": [{"LOWER": unique_sender}]}
        patterns.append(pattern)
    ruler.add_patterns(patterns)
    lemmatizer = WordNetLemmatizer()
    sentences = nltk.sent_tokenize(sanitized)
    counter = 0
    num_sentences = len(sentences)
    res = ''
    print('There are %d sentences' % (num_sentences))
    # Write the result
    try:
        sanitized_file = open(sanitized_filename, "w")
        for sentence in sentences:
            counter += 1
            if counter % 1000 == 0 or counter == num_sentences:
                print('%d sentences sanitized (%.2f%% complete)' %
                      (counter, float(counter / num_sentences) * 100))
            if is_stop_sentence(sentence):
                continue
            # print('1 ' + sentence)
            doc = nlp(sentence)
            for ent in doc.ents:
                # print('ent: %s ent.label: %s' % (ent.text, ent.label_))
                if ent.label_ == 'ORG' or ent.label_ == 'NORP':
                    sentence = sentence.replace(ent.text, 'BUSINESSNAME')
                elif ent.label_ == 'PRODUCT':
                    sentence = sentence.replace(ent.text, 'PRODUCTNAME')
                elif ent.label_ == 'PERSON' or ent.label_ == 'GPE':
                    sentence = sentence.replace(ent.text, '')
            # print('2 ' + sentence)
            # Remove times, dates, and prices
            parsed_text = CommonRegex(sentence)
            for price in parsed_text.prices:
                sentence = sentence.replace(price, '')
            # print('3 ' + sentence)
            # Remove non-dictionary words
            words = set(nltk.corpus.words.words())
            puncts = {'.', '"', "'", ',', '-', '%', '!', '?'}
            etcs = {'email', 'online'}
            words = words.union(puncts).union(etcs)
            tokens = nltk.wordpunct_tokenize(sentence)
            pos_tags = nltk.pos_tag(tokens)
            # print(pos_tags)
            for token_idx, token in enumerate(tokens):
                syntactic_category = get_syntactic_category(
                    pos_tags[token_idx][1])
                if syntactic_category != None:
                    lemmatized_token = lemmatizer.lemmatize(
                        token.lower(), syntactic_category)
                    # print('lemmatized: %s original: %s' % (lemmatized_token, token))
                    if (lemmatized_token.lower() not in words
                            and token != 'BUSINESSNAME'
                            and token != 'PRODUCTNAME'
                            and not token.isnumeric()):
                        sentence = sentence.replace(token, '')
                else:
                    if (token.lower() not in words
                            and token != 'BUSINESSNAME'
                            and token != 'PRODUCTNAME'
                            and not token.isnumeric()):
                        sentence = sentence.replace(token, '')
            # print('4 ' + sentence)
            # Remove extra whitespaces
            sentence = re.sub(r'\s+', ' ', sentence)
            sentence = re.sub(r' , ', ' ', sentence)
            sentence = re.sub(r' \.', '.', sentence)
            sentence = re.sub(r' !', '!', sentence)
            sentence = re.sub(r' \?', '?', sentence)
            # print('5 ' + sentence)
            sanitized_file.write(sentence + '\n')
    finally:
        sanitized_file.close()