def splitAddress(self):
    """Break the mailing address and each location address into parts.

    The mailing address is read from ``self.policy_data["Mailing_Address_Full"]``
    and, when pyap (country ``'CA'``) finds at least one address in it, the
    ``Mailing_Address_*`` keys of ``self.policy_data`` are overwritten from
    the first match.

    For every entry in ``self.locations`` the text after the first ``":"`` in
    ``location.data["Location_Of_Insured_Property"]`` is parsed the same way
    and written to the ``Physical_Address_*`` keys of ``location.data``.
    A value without a ``":"`` prints ``"Error Parsing Address"`` and is
    skipped.
    """
    # (destination key, pyap key) pairs, listed in the order they are written.
    mailing_keys = (
        ("Mailing_Address_Full", 'full_address'),
        ("Mailing_Address_Unit", 'street_number'),
        ("Mailing_Address_Street", 'street_name'),
        ("Mailing_Address_Province", 'region1'),
        ("Mailing_Address_PostalCode", 'postal_code'),
        ("Mailing_Address_City", 'city'),
    )
    physical_keys = (
        ("Physical_Address_Full", 'full_address'),
        ("Physical_Address_Unit", 'street_number'),
        ("Physical_Address_Street", 'street_name'),
        ("Physical_Address_Province", 'region1'),
        ("Physical_Address_PostalCode", 'postal_code'),
        ("Physical_Address_City", 'city'),
    )

    def store_first_match(raw_text, target, key_pairs):
        # Copy the parts of the first parsed address (if any) into `target`.
        matches = pyap.parse(raw_text, country = 'CA')
        if len(matches) > 0:
            parts = matches[0].as_dict()
            if parts is not None:
                for destination, source in key_pairs:
                    target[destination] = parts[source]

    mailing = self.policy_data["Mailing_Address_Full"]
    if mailing is not None:
        store_first_match(mailing, self.policy_data, mailing_keys)

    for location in self.locations:
        described = location.data["Location_Of_Insured_Property"]
        if described is None:
            continue
        try:
            # The address is expected after a "label:" prefix.
            store_first_match(described.split(":")[1], location.data, physical_keys)
        except IndexError:
            print("Error Parsing Address")
def address(text):
    """
    Extracts the address information

    Commas and ``#<digits>`` tokens are stripped from the input before it is
    handed to pyap (country ``'US'``).

    text: input text from where address to be extracted

    Returns the first address pyap finds, or None when it finds none.
    """
    # Clean and parse once; the original repeated both steps for the return.
    cleaned = re.sub(r',|#\d+', "", text)
    matches = pyap.parse(cleaned, country='US')
    return matches[0] if matches else None
def find_info(value_matrix, sheet):
    """Extract phone, URL and address for each provider and write them out.

    Each row ``v`` of ``value_matrix`` supplies the provider name in ``v[0]``
    and free text in ``v[1]``.  For every row one output row
    ``[provider, phone, url, address]`` is built, and the collected rows are
    sent to ``sheet.update_values("C:H", ...)``.

    A missing address or URL is stored as ``""``.  When several phone numbers
    are found in a row, the last one wins.
    """
    url_extractor = URLExtract()  # built once instead of once per row
    data_matrix = []
    for v in value_matrix:
        # Provider name from matrix
        provider_string = v[0]
        try:
            # Find address from matrix value index 1
            add_string = str(pyap.parse(v[1], country='US')[0])
        except Exception:
            # Expect error when no address found. Make value "" to add to matrix
            add_string = ""
        try:
            # Find URL from matrix value index 1
            web_string = url_extractor.find_urls(v[1])[0]
        except Exception:
            # Expect error when no URL found. Make value "" to add to matrix
            web_string = ""
        phone_string = ""
        # Find phone number from matrix value index 1
        for match in phonenumbers.PhoneNumberMatcher(v[1], "US"):
            phone_string = phonenumbers.format_number(
                match.number, phonenumbers.PhoneNumberFormat.NATIONAL)
        data_matrix.append(
            [provider_string, phone_string, web_string, add_string])
    # Update cell range with found values
    sheet.update_values("C:H", data_matrix)
def _extract_variants_from_text(self, field, text: str):
    """Return one geocoded value per address found in *text*.

    pyap (country ``'US'``) is tried first; ``get_addresses`` is used only
    when pyap finds nothing.  Every hit is passed through
    ``AddressField._get_from_geocode``.
    """
    found = list(pyap.parse(text, country='US')) or list(get_addresses(text))
    variants = []
    for candidate in found:
        variants.append(AddressField._get_from_geocode(candidate))
    return variants
def test_parse_address():
    """Check that the US parser ignores plain text and finds an embedded address."""
    # Both the public and the private entry point find nothing in plain text.
    assert not parser.AddressParser(country='US').parse('No address here')
    assert not parser.AddressParser(country='US')._parse_address('No address here')

    # An address wrapped in filler must come back without the filler.
    sample_text = "xxx 225 E. John Carpenter Freeway, Suite 1500 Irving, Texas 75062 xxx"
    expected = "225 E. John Carpenter Freeway, Suite 1500 Irving, Texas 75062"
    found = parser.AddressParser(country='US').parse(sample_text)
    assert found[0].full_address == expected
def Address_Search(self, test_address):  # NOT in use
    """Print each US address pyap finds in *test_address*, then its parts."""
    for found in pyap.parse(test_address, country='US'):
        print(found)            # the address as text
        print(found.as_dict())  # the individual address parts
def find_address(responce, contact_link):
    """Return the first US address found in the page text, or None.

    responce: object whose ``.body`` holds the page HTML.  The misspelled
        parameter name is kept so keyword callers keep working; the original
        body read an undefined ``response`` and failed with NameError.
    contact_link: URL of a contact page, used only for the fallback below.

    ``<script>`` and ``<style>`` elements are removed before the text is
    handed to pyap (country ``'US'``).  Any parsing failure, including
    "no address found", yields None.
    """
    soup = BeautifulSoup(responce.body, 'lxml')
    for script in soup(["script", "style"]):
        script.extract()

    try:
        address = str(pyap.parse(soup.text, country='US')[0])
    except Exception:
        # Best effort: no match (IndexError) or a parser failure both mean
        # "no address".
        address = None

    # Same condition as before (non-empty link and an empty address string),
    # written without `assert`, which is stripped under -O, and without
    # relying on a swallowed TypeError from len(None).
    if contact_link and address is not None and len(address) == 0:
        try:
            # NOTE(review): this request is built but never returned or
            # yielded, so it has no visible effect here — confirm how the
            # contact page is meant to be scheduled.
            request = SeleniumRequest(url=contact_link, callback=find_address, meta={'splash': {'endpoint': 'render.html', 'args': {'html': 1, 'png': 1, 'width': 600, 'render_all': 1, 'wait': 0.5}}})
        except Exception:
            pass
    return address
def get_string(text):
    """Parse US addresses out of the file at path *text*.

    The file is read in full, newlines are removed, and the result is passed
    to pyap.  The list of matches is printed and returned.
    """
    with open(text, 'r') as handle:
        contents = handle.read()
    flattened = contents.replace('\n', '')
    found = pyap.parse(flattened, country='US')
    print(found)
    return found
def parse_url(self, response):
    """Collect addresses and phone numbers from a page into ``self.data``.

    The entry is keyed by ``response.meta['key']`` and created on first use
    with empty ``name``, ``phone`` and ``address`` lists.  Addresses are
    stored as the dicts returned by pyap's ``as_dict()``; phone numbers are
    stored as the matched text.
    """
    # get all text on page as plain text
    text = ''.join(response.xpath('//body//text()').extract())
    key = response.meta.get('key')
    # url might not be here if usr is passing in plain file
    if key not in self.data:
        self.data[key] = {
            'url': key,
            'name': [],
            'phone': [],
            'address': []
        }
    entry = self.data[key]

    # parse out address
    for addr in pyap.parse(text, country='US'):
        entry['address'].append(addr.as_dict())

    # https://stackoverflow.com/questions/34527917/extracting-phone-numbers-from-a-free-form-text-in-python-by-using-regex
    # Must be a raw string: in a normal literal "\b" is a backspace character,
    # so the pattern could never match an ordinary phone number.
    phone_pattern = r'\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b'
    for phone in re.finditer(phone_pattern, text):
        number = phone.group()
        print(number)
        # Store the matched text rather than the re.Match object.
        entry['phone'].append(number)
def location(address):
    """Return the first US address pyap finds in *address*.

    Any failure (no match, unusable input) results in None.
    """
    try:
        return pyap.parse(address, country='US')[0]
    except Exception:
        return None
def listAddresses(myText):
    """Return the word tokens of every US address found in *myText*.

    Each address is converted to text and split with ``nltk.word_tokenize``;
    the tokens of all addresses are returned as one flat list, in order.
    """
    tokens = []
    for found in pyap.parse(myText, country='US'):
        tokens.extend(nltk.word_tokenize(str(found)))
    return tokens
def address_extract(text):
    """Return whatever pyap finds when parsing *text* as US addresses."""
    return pyap.parse(text, country='US')
def addressFinder(self, string):
    """Find the first US address in *string* and geocode it.

    Returns ``(street_address, city, lat, lng)``, where ``street_address`` is
    the geocoder's house number and street joined by a space.  On any failure
    (no address found, geocoder error, missing field) four empty strings are
    returned.
    """
    try:
        found_address = str(pyap.parse(string, country='US')[0])
        g = geocoder.google(found_address)
        address = g.housenumber + " " + g.street
        return address, g.city, g.lat, g.lng
    except Exception:
        # Still best effort, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare `except:` did.
        return "", "", "", ""
def addr(P):
    """Mask every US address in *P* with block characters.

    Each address found by the parser is replaced, wherever its text occurs,
    by a run of "█" of the same length.  Returns ``(masked_text, count)``,
    where ``count`` is the number of addresses the parser reported.
    """
    masked = P
    hits = 0
    for hit in ap.parse(masked, country='US'):
        literal = str(hit)
        masked = masked.replace(literal, "█" * len(literal))
        hits += 1
    return masked, hits
def pyapGetEvent_Locations(searchString):
    """Return the US addresses pyap finds in *searchString*.

    The result is a dict with ``"numLocations"`` (the number of matches) and
    ``"addresses"`` (a list of the ``as_dict()`` form of each match).
    """
    # Only encode where `str` is the byte type (Python 2), as the original
    # did.  On Python 3 the text is passed through unchanged, because the
    # encoded bytes are not text.
    # NOTE(review): assumes the installed pyap wants text on Python 3 — confirm.
    text = searchString.encode("utf-8") if str is bytes else searchString
    addresses = pyap.parse(text, country='US')
    return {
        "numLocations": len(addresses),
        "addresses": [address.as_dict() for address in addresses],
    }
def test_full_address_parts():
    """Tests that the right parts of the address are picked up by the right regex"""
    expected_addresses = [
        {
            'full_address': '9 Shaun glen, East Joan, LN4 1LE',
            'street_name': 'Shaun glen',
            'street_number': '9',
            'postal_code': 'LN4 1LE',
        },
        {
            'full_address': '11-59 High Road\nEast Finchley London\nN2 8AW, UK',
            'street_name': 'High Road',
            'street_number': '11-59',
            'postal_code': 'N2 8AW',
            'country': 'UK',
        },
        {
            'full_address': 'Studio 53, Harrison cove, Smithbury, G88 4US, United Kingdom',
            'occupancy': 'Studio 53',
            'street_name': 'Harrison cove',
            'postal_code': 'G88 4US',
            'country': 'United Kingdom',
        },
    ]
    filler_text = "This is filler text that can be inserted both before and after addresses"
    separators = ["\n", ", ", ". ", " "]
    normalize = pyap.parser.AddressParser._normalize_string

    for expected in expected_addresses:
        # Every combination of filler before / after the address ...
        for pad_before, pad_after in itertools.product([False, True], [False, True]):
            # ... joined to the address with each separator in turn.
            for separator in separators:
                prefix = (filler_text + separator) if pad_before else ''
                suffix = (separator + filler_text) if pad_after else ''
                address_text = prefix + expected['full_address'] + suffix

                parsed = pyap.parse(address_text, country='GB')
                print(normalize(address_text))

                # Exactly one address must be found.
                assert len(parsed) == 1
                first = parsed[0]
                for field, value in six.iteritems(expected):
                    if field == 'full_address':
                        assert first.full_address == normalize(value)
                    else:
                        # Every other expected part must match the parsed attribute.
                        assert first.__getattribute__(field) == value
def iter_filth(self, text, document_name: Optional[str] = None):
    """Yield a filth object for every address occurrence found in *text*.

    Addresses are proposed by pyap (country ``self.region``) and dropped when:

    * the address contains any of ``self.ignored_words`` (case-insensitive);
    * ``self.minimum_address_sections`` is positive and the ``postal`` parse
      has fewer parts than that;
    * ``self.match_pyap_postal_fields`` is non-empty and, for any
      ``pyap_field -> postal_field`` pair, the lower-cased pyap value is not
      among the postal parts with that label.

    Surviving addresses are located in the original text with a regex that
    tolerates whitespace, newline and dash differences, and one
    ``self.filth_cls`` instance is yielded per occurrence.
    """
    for address in pyap.parse(text, country=self.region):
        full_address = address.full_address

        # Ignore any addresses containing any explicitly ignored words
        lowered = full_address.lower()
        if any(word.lower() in lowered for word in self.ignored_words):
            continue

        postal_address = None
        if self.minimum_address_sections > 0:
            postal_address = postal.parser.parse_address(full_address)
            # Ensure that there are enough parts of the address to be a real address
            if len(postal_address) < self.minimum_address_sections:
                continue

        if len(self.match_pyap_postal_fields) > 0:
            if postal_address is None:
                postal_address = postal.parser.parse_address(full_address)
            # Check the two parses agree on part of the address.  The original
            # `continue` sat inside the field loop, so it only moved on to the
            # next field and never rejected the address.
            parses_agree = all(
                address.__getattribute__(pyap_field).lower() in [
                    part[0] for part in postal_address if part[1] == postal_field
                ]
                for pyap_field, postal_field in self.match_pyap_postal_fields.items()
            )
            if not parses_agree:
                continue

        # It seems to be a real address, lets look for it in the text
        # This is needed as pyap does some text normalisation, this undoes that normalisation
        # See _normalize_string() in https://github.com/vladimarius/pyap/blob/master/pyap/parser.py
        pattern = re.escape(full_address)
        pattern = pattern.replace(r',\ ', r'\s*([\n,]\s*)+')
        pattern = pattern.replace(r'\ ', r'\s+')
        # re.escape() emits "\-"; replacing only "-" left a stray backslash
        # that turned the character class into a literal "[".  Unescape first,
        # and accept the ASCII hyphen alongside the Unicode dashes.
        pattern = pattern.replace(r'\-', '-').replace('-', '[-‐‑‒–—―]')
        pattern = r'\b' + pattern + r'\b'
        found_strings = re.finditer(pattern, text, re.MULTILINE | re.UNICODE)

        # Iterate over each found string matching this regex and yield some filth
        for instance in found_strings:
            yield self.filth_cls(
                beg=instance.start(),
                end=instance.end(),
                text=instance.group(),
                detector_name=self.name,
                document_name=document_name,
                locale=self.locale,
            )
def hello_from_body(args):
    """Extract addresses from the posted text and store the posted HTML.

    Reads ``textBlob``, ``htmlBlob`` and ``url`` from *args* (each defaulting
    to ``""``).  The HTML is handed to ``writeToGcs`` under the URL.
    Returns ``({"addresses": [...], "urlId": ...}, 200)``, where the
    addresses are the text form of every US address pyap found.
    """
    text = args.get("textBlob", "")
    addr = [str(found) for found in pyap.parse(text, country='US')]

    html = args.get("htmlBlob", "")
    url = args.get("url", "")
    writeToGcs(url, html)

    return {"addresses": addr, "urlId": getUrlId(url)}, 200
def parse_adress(colonne, no_match=no_match):
    """Return the first Canadian address in *colonne* as a JSON string.

    The parts of the first pyap match (country ``'CA'``) are serialised with
    ``json.dumps``.  When nothing is found, or a TypeError is raised (in which
    case *colonne* is printed), the JSON encoding of ``""`` is returned.
    ``no_match`` is accepted but not used by this function.
    """
    parsed = ""
    try:
        candidates = pyap.parse(colonne, country='CA')
        try:
            parsed = candidates[0].as_dict()
        except IndexError:
            # No address found: keep the empty-string placeholder.
            pass
    except TypeError:
        print(colonne)
    return json.dumps(parsed)
def parse(page):
    """Find addresses in *page* and run each through a detailed parser.

    pyap is tried with country ``'US'`` first and with ``'CA'`` only when the
    US pass returns an empty list.  Duplicate matches are dropped, keeping
    first-seen order.  Each remaining address is re-parsed from its text form
    with ``US.parse`` (US pass) or ``AddressParser().parse`` (CA pass), and
    the list of those results is returned.
    """
    is_us = True
    found = pyap.parse(page, country='US')
    if found == []:
        is_us = False
        found = pyap.parse(page, country='CA')

    unique = []
    for candidate in found:
        if candidate not in unique:
            unique.append(candidate)

    locations = []
    for candidate in unique:
        if is_us:
            detail = US.parse(str(candidate))
        else:
            detail = AddressParser().parse(str(candidate))
        locations.append(detail)
    return locations
def process_text(q):
    """Run the NLP pipeline on *q* and add pyap addresses as entities.

    The text is processed with ``nlp``, matcher hits are merged in through
    ``merge_and_add_ents``, and every US address pyap finds is located in the
    document with ``find_span`` and appended to ``doc.ents`` under
    ``ADDRESS_ID``.

    Returns ``(doc, filtered_ents)``, where ``filtered_ents`` holds the
    entities whose ``label_`` is in ``ENT_LIST``.
    """
    doc = nlp(q)
    matches = matcher(doc)
    merge_and_add_ents(doc, matches)
    for address in pyap.parse(q, country='US'):
        address_text = str(address)
        # Decode only when str() produced bytes (Python 2).  On Python 3,
        # `str` has no .decode() and the original call raised AttributeError.
        if isinstance(address_text, bytes):
            address_text = address_text.decode('utf-8')
        spn = find_span(doc, nlp(address_text))
        doc.ents += ((ADDRESS_ID, spn[0], spn[1]), )
    filtered_ents = [ent for ent in doc.ents if ent.label_ in ENT_LIST]
    return doc, filtered_ents
def fetch_address(arr_body):
    """Return the US addresses found in a message body.

    arr_body: list of body lines; it is passed through ``preprocess`` and the
        result is joined with newlines before parsing.

    Returns a list of the addresses pyap found (empty when there are none).
    """
    msg_body = '\n'.join(preprocess(arr_body))
    addresses = pyap.parse(msg_body, country='US')
    # A plain copy replaces the append loop that was wrapped in a bare
    # `except: pass`; an empty or missing result still gives [].
    return list(addresses) if addresses else []
def _extract_from_possible_value(self, field, possible_value):
    """Resolve *possible_value* to a geocoded address.

    * A falsy value returns None.
    * A dict (or dict subclass) contributes its ``'address'`` entry.
    * Anything else is converted to text and searched with pyap (country
      ``'US'``), then with ``get_addresses`` if pyap finds nothing; the first
      hit is used, or the text itself when there is no hit.

    The chosen value is passed to ``AddressField._get_from_geocode`` and that
    result is returned.
    """
    if not possible_value:
        return None

    # isinstance() also accepts dict subclasses, which `type(x) is dict`
    # sent down the text-parsing branch.
    if isinstance(possible_value, dict):
        address = possible_value.get('address')
    else:
        text = str(possible_value)  # convert once, reuse below
        addresses = list(pyap.parse(text, country='US'))
        if not addresses:
            addresses = list(get_addresses(text))
        address = addresses[0] if addresses else text
    return AddressField._get_from_geocode(address)
def find_base(soup, country='us'):
    """
    Find addresses using pyap package

    soup: parsed document; its ``<script>`` and ``<style>`` elements are
        removed (in place) before the text is extracted.
    country: country code handed to pyap.  It was previously ignored in
        favour of a hard-coded ``'us'``; the default keeps that behaviour.

    Returns every address found, each preceded by a single space and joined
    into one string (``''`` when nothing is found).
    """
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    found = pyap.parse(text, country=country)
    # Same output as the old `address = address + ' ' + str(item)` loop.
    return ''.join(' ' + str(item) for item in found)
def _extract_variants_from_text(self, field, text: str, **kwargs):
    """Return one geocoded value per address found in *text*.

    pyap (country ``'US'``) is tried first; ``get_addresses`` is used only
    when pyap finds nothing.  Each address is resolved through
    ``AddressField._get_from_geocode``.  Results are cached per address for
    the duration of the call, so a repeated address is geocoded again only if
    its cached result is None.
    """
    addresses = list(pyap.parse(text, country='US'))
    if not addresses:
        addresses = list(get_addresses(text))

    resolved_addresses = {}
    result = []
    # Plain iteration replaces `while addresses: addresses.pop(0)`, which
    # shifted the whole list on every step.
    for address in addresses:
        resolved_address = resolved_addresses.get(address)
        if resolved_address is None:
            resolved_address = AddressField._get_from_geocode(address)
            resolved_addresses[address] = resolved_address
        result.append(resolved_address)
    return result
def allot_values():
    """Fill the structured columns of every data row in ``final_sheet``.

    The first row of ``final_sheet`` is skipped.  For each remaining row
    ``v`` the free text in ``v[12]`` is mined and written back into the row:

    * ``v[1]``..``v[5]`` - street line, secondary line (floor / building /
      occupancy), city, region and postal code of the first pyap match;
    * ``v[7]`` / ``v[11]`` - a found item containing ``'@'`` / any other
      found URL;
    * ``v[10]`` - the number following the word "fax", which is also
      replaced by ``'[Redacted]'`` inside ``v[12]``;
    * ``v[8]``, ``v[9]``, ``v[10]`` - remaining phone numbers, each written
      to the first of those cells that is still empty;
    * ``v[6]`` and ``v[14:19]`` - set to fixed values.

    The statement order matters: the fax number is redacted from ``v[12]``
    before the general phone scan runs over it.
    """
    # TODO: Address finder breaks when state abbrev missing. Could find by zip code.
    for v in final_sheet[1:]:
        # TODO: Address finder also breaks "Fort Wayne" into "Fort Way" "NE"
        address = pyap.parse(v[12].upper(), country='US')  # Made Upper because Lower and Title confuse pyap
        if address:
            # Street line: keep only the parts pyap actually filled in.
            address_list = [address[0].as_dict()['street_number'], title(address[0].as_dict()['street_name']),
                            title(address[0].as_dict()['street_type']), address[0].as_dict()['route_id'],
                            address[0].as_dict()['post_direction']]
            address1 = [x for x in address_list if x]
            v[1] = ' '.join(address1)
            # Secondary line: floor, building and occupancy, again skipping empty parts.
            address2 = [title(address[0].as_dict()['floor']), title(address[0].as_dict()['building_id']),
                        title(address[0].as_dict()['occupancy'])]
            address2 = [x for x in address2 if x]
            v[2] = ' '.join(address2)
            v[3] = title(address[0].as_dict()['city'])
            v[4] = address[0].as_dict()['region1']
            v[5] = address[0].as_dict()['postal_code']
        urls = URLExtract(extract_email=True).find_urls(v[12].lower())
        if urls:
            # Walk the hits in reverse so the first hit of each kind is the one left in the cell.
            for url in urls[::-1]:
                if '@' in url:  # This is a simplistic way to find email, a url could also have an @
                    v[7] = url  # Overwriting because I have nowhere to store additional urls/emails
                else:
                    v[11] = url
        fax = v[12].lower().find('fax')
        if fax > -1:  # Find returns -1 if no instance found
            try:
                # First phone number at or after the word "fax".
                match = phonenumbers.PhoneNumberMatcher(v[12][fax:], 'US').next()
                # Cut the number out of the text so the scan below does not pick it up again.
                v[12] = v[12][:fax] + v[12][fax:][0:match.start] + '[Redacted]' + v[12][fax:][match.end:]
                # presumably [7:] drops a "tel:+1-" prefix from the RFC3966 form — TODO confirm
                v[10] = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.RFC3966)[7:]
            except StopIteration:
                # No number after "fax": leave the row as it is.
                pass
        # Remaining numbers go to the first still-empty cell of v[8], v[9], v[10].
        for match in phonenumbers.PhoneNumberMatcher(v[12], "US"):
            if not v[8]:
                v[8] = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.RFC3966)[7:]
            elif not v[9]:
                v[9] = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.RFC3966)[7:]
            elif not v[10]:
                v[10] = phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.RFC3966)[7:]
        v[6] = 'service'
        v[14:19] = ['08:00-17:00'] * 5
def extract_entities(self, text):
    """Append the entities found in *text* to ``self.entities`` and return it.

    Locations come first: pyap's US matches if there are any, otherwise the
    matches of ``utils.SHORT_ADDRESS_REGEX``.  Each is recorded with type
    ``self.CODEC['LOCATION']`` and score 1.  Then the regex-based types are
    applied in the order listed below, without a score.  The first occurrence
    of every recorded entity is blanked out of the working text, so later
    patterns do not see it.
    """
    patterns = {
        'TIME': utils.TIME_REGEX,
        'DATE': utils.DATE_REGEX,
        'ZIP': utils.ZIP_REGEX,
        'EMAIL': utils.EMAIL_REGEX,
        'CURRENCY': utils.CURRENCY_REGEX,
        'TAX_ID': utils.TAX_REGEX,
        'PHONE_NUMBER': utils.PHONE_NUMBER_REGEX,
    }
    long_address_matches = pyap.parse(text, country='US')
    short_address_matches = re.findall(utils.SHORT_ADDRESS_REGEX, text)

    # Full addresses take precedence; short ones are only a fallback.
    if len(long_address_matches) != 0:
        location_hits = long_address_matches
    elif len(short_address_matches) != 0:
        location_hits = short_address_matches
    else:
        location_hits = []

    for hit in location_hits:
        text = text.replace(str(hit), ' ', 1)
        self.entities.append({
            'text': str(hit),
            'type': self.CODEC['LOCATION'],
            'score': 1
        })

    for label, regex in patterns.items():
        for ent in re.findall(regex, text):
            # With capture groups findall yields tuples; use the first group.
            ent = ent[0] if isinstance(ent, tuple) else ent
            text = text.replace(ent, ' ', 1)
            self.entities.append({
                'text': str(ent),
                'type': self.CODEC[label],
            })
    return self.entities
def fetch_address(arr_body):
    '''takes input list of body and extract address from it

    The list is refined with ``preprocess``, joined into one newline-separated
    string and parsed by pyap with country 'US'.  Returns a list of the
    addresses found (empty when there are none).
    '''
    msg_body = preprocess(arr_body)  # refine the body
    msg_body = '\n'.join(msg_body)  # make a string out of the list
    # selecting the country as 'US' parse the body and extract address
    addresses = pyap.parse(msg_body, country='US')
    # A plain copy replaces the append loop that was wrapped in a bare
    # `except: pass`; an empty or missing result still gives [].
    return list(addresses) if addresses else []
def addresser():
    """Parse the posted ``address`` form field and hand the parts to ``cd``.

    Every US address pyap finds contributes one row to a DataFrame with the
    columns ``full_street``, ``state``, ``zip_code``, ``street_name`` and
    ``street_number``; the frame is passed to ``cd.get_data``.

    Returns a JSON response: a fixed message when no address is present,
    otherwise the parts of the last address found.
    """
    test_address = request.form.get("address")
    # A missing or empty form field is treated as "no addresses" instead of
    # passing None to the parser.
    addresses = pyap.parse(test_address, country='US') if test_address else []
    print(addresses)
    if not addresses:
        data = 'There is no addresses present'
        return jsonify(data)

    rows = [address.as_dict() for address in addresses]
    # Column order matches the original concat.  The last column used to be
    # mislabelled 'street_name', which produced two columns with that name.
    data = pd.DataFrame({
        'full_street': [row['full_street'] for row in rows],
        'state': [row['region1'] for row in rows],
        'zip_code': [row['postal_code'] for row in rows],
        'street_name': [row['street_name'] for row in rows],
        'street_number': [row['street_number'] for row in rows],
    })
    cd.get_data(data)
    # NOTE(review): only the last address is returned to the client, as
    # before — confirm whether all of them should be.
    return jsonify(rows[-1])
def Text_to_String(self, filename):
    """Scan the ``*.txt`` files of a folder for US addresses, then delete it.

    filename: folder name, resolved against ``self.og``.

    The process changes directory into that folder.  Every ``*.txt`` file is
    read, searched with pyap and then removed.  Afterwards the process moves
    one directory up and removes the folder.

    Returns ``[addylist, noaddylist]``: ``addylist`` holds ``[file, address]``
    pairs (first address found per file) and ``noaddylist`` holds the names
    of files without an address.
    """
    noaddylist = []
    addylist = []
    if self.debug:
        print(os.path.isdir(filename))
        print(filename)
    cwd = os.path.join(self.og, filename)
    os.chdir(cwd)
    print(cwd)
    for file in glob.glob('*.txt'):
        # Close the handle before the file is removed below.
        with open(file, 'r') as handle:
            temp = handle.read().strip()
        addy = [str(address) for address in pyap.parse(temp, country='US')]
        if not addy:
            # The original printed addy[0] here, which raised IndexError for
            # every file without an address.
            print(file, "no addres!")
            noaddylist.append(file)
        else:
            print(addy[0], "found address with name", file)
            addylist.append([file, addy[0]])
        os.remove(file)
    ret = [addylist, noaddylist]
    print(ret, "= ret")
    # The old trailing print of os.chdir()'s return value (always None) is dropped.
    os.chdir("../")
    os.rmdir(filename)
    return ret
def extract_cities(document):
    """Print the cities, countries and address-like lines found in *document*.

    Cities and countries come from ``GeoText``.  When at least one city is
    found, every regex hit in the document that mentions one of those cities
    is printed.  Otherwise pyap (country ``'US'``) is used as a fallback and
    its addresses are printed, or ``'No Address Found'`` when it finds none.
    """
    places = GeoText(document)
    # print() calls instead of Python 2 print statements; the output text is
    # the same.
    print('Cities :', places.cities)
    print('Countries :', places.countries)

    if places.cities:
        print('Address')
        r2 = re.compile(r'([(\d|-|/){1-5}]+[,|-|\s]+[A-zZ]+[Aa-zZ]+.*)')
        for text in r2.findall(document):
            for text1 in places.cities:
                if text1 in text:
                    print(text)
        return

    # The fallback used to be guarded by `places.cities is None`, which is
    # never true here (an empty result is falsy, not None), so it never ran.
    addresses = pyap.parse(document, country='US')
    if addresses:
        for address in addresses:
            # shows found address
            print(address)
    else:
        print('No Address Found')