def _normalize_address_str(address_val): """ Normalize the address to conform to short abbreviations. If an invalid address_val is provided, None is returned. If a valid address is provided, a normalized version is returned. """ # if this string is empty the regular expression in the sa wont # like it, and fail, so leave returning nothing if not address_val: return None # now parse the address into number, street name and street type parser = StreetAddressParser() addr = parser.parse(str(address_val)) # TODO: should probably use unicode() normalized_address = '' if not addr: return None if 'house' in addr and addr['house'] is not None: normalized_address = addr['house'].lstrip("0") #some addresses have leading zeros, strip them here if 'street_name' in addr and addr['street_name'] is not None: normalized_address = normalized_address + ' ' + addr['street_name'] if 'street_type' in addr and addr['street_type'] is not None: normalized_address = normalized_address + ' ' + addr['street_type'] formatter = StreetAddressFormatter() normalized_address = formatter.abbrev_street_avenue_etc(normalized_address) return normalized_address.lower().strip()
class TestStreetAddress(unittest.TestCase):
    """Checks StreetAddressFormatter against StreetAddressParser output."""

    def setUp(self):
        # Each test gets its own parser and formatter instance.
        self.addr_formatter = StreetAddressFormatter()
        self.addr_parser = StreetAddressParser()

    def test_success_abbrev_street_avenue_etc(self):
        # The full street of '221B Baker Street' is expected to come back
        # from the formatter as 'Baker St'.
        parsed = self.addr_parser.parse('221B Baker Street')
        street_full = parsed['street_full']
        abbreviated = self.addr_formatter.abbrev_street_avenue_etc(street_full)
        eq_(abbreviated, 'Baker St')
def _normalize_address_str(address_val): """ Normalize the address to conform to short abbreviations. If an invalid address_val is provided, None is returned. If a valid address is provided, a normalized version is returned. """ # if this string is empty the regular expression in the sa wont # like it, and fail, so leave returning nothing if not address_val: return None # now parse the address into number, street name and street type addr = usaddress.tag(str(address_val))[0] # TODO: should probably use unicode() normalized_address = '' if not addr: return None if 'AddressNumber' in addr and addr['AddressNumber'] is not None: normalized_address = addr['AddressNumber'].lstrip("0") # some addresses have leading zeros, strip them here if 'StreetNamePreDirectional' in addr and addr['StreetNamePreDirectional'] is not None: normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePreDirectional']) if 'StreetName' in addr and addr['StreetName'] is not None: normalized_address = normalized_address + ' ' + addr['StreetName'] if 'StreetNamePostType' in addr and addr['StreetNamePostType'] is not None: # remove any periods from abbreviations normalized_address = normalized_address + ' ' + _normalize_address_post_type(addr['StreetNamePostType']) if 'StreetNamePostDirectional' in addr and addr['StreetNamePostDirectional'] is not None: normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePostDirectional']) formatter = StreetAddressFormatter() normalized_address = formatter.abbrev_street_avenue_etc(normalized_address) return normalized_address.lower().strip()
def normalize_address_str(address_val):
    """
    Normalize the address to conform to short abbreviations.

    Falsy input returns None. Any other value is coerced to a utf-8
    encoded str, the sequences listed in the replacement table are removed,
    and the result is tagged with usaddress (with 'CornerOf' mapped to
    'AddressNumber'). The tagged number, directionals, street name, post
    type, occupancy type and occupancy identifier are rebuilt into one
    space-separated string and passed through
    StreetAddressFormatter.abbrev_street_avenue_etc.

    If usaddress raises RepeatedLabelError or UnicodeEncodeError, the
    cleaned input string is used instead of a rebuilt one.

    Every non-None result is lower-cased and stripped.
    """
    # An empty string makes the parser's regular expression fail, so stop
    # before doing any work.
    if not address_val:
        return None

    # encode the string as utf-8
    if isinstance(address_val, basestring):
        address_val = str(address_val.encode('utf-8'))
    else:
        address_val = str(address_val)

    # Do some string replacements to remove odd characters that we come across
    odd_sequences = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for needle, substitute in odd_sequences.items():
        address_val = address_val.replace(needle, substitute)

    # now parse the address into number, street name and street type
    try:
        # Add in the mapping of CornerOf to the AddressNumber.
        tagged = usaddress.tag(
            str(address_val), tag_mapping={'CornerOf': 'AddressNumber'})[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some kind
        # of odd character issue that is not handled yet: keep the string.
        result = str(address_val)
    else:
        # Address can be parsed, so let's format it.
        result = ''
        number = tagged.get('AddressNumber')
        if number is not None:
            result = _normalize_address_number(number)

        # (label, cleaner) pairs in output order; a cleaner of None means
        # the tagged value is appended unchanged.
        component_cleaners = (
            ('StreetNamePreDirectional', _normalize_address_direction),
            ('StreetName', None),
            # remove any periods from abbreviations
            ('StreetNamePostType', _normalize_address_post_type),
            ('StreetNamePostDirectional', _normalize_address_direction),
            ('OccupancyType', None),
            ('OccupancyIdentifier', None),
        )
        for label, cleaner in component_cleaners:
            value = tagged.get(label)
            if value is None:
                continue
            if cleaner is not None:
                value = cleaner(value)
            result = result + ' ' + value

        result = StreetAddressFormatter().abbrev_street_avenue_etc(result)

    return result.lower().strip()
addr_formatter = StreetAddressFormatter()

# An address supplied through opts.addr replaces the stripped entries of
# `tests` as the list of inputs.
lst = [opts.addr] if opts.addr else map(str.strip, tests)

for t in lst:
    # Skip empty entries.
    if not t:
        continue

    print('"%s"' % t)
    logging.info('addr_str: ' + unicode(t))
    addr = addr_parser.parse(t)

    if addr['street_full'] is not None:
        # Apply each formatter step in turn, logging the intermediate
        # street string after every step.
        street = addr_formatter.append_TH_to_street(addr['street_full'])
        logging.info('After append_TH_to_street: ' + street)

        street = addr_formatter.abbrev_direction(street)
        logging.info('After abbrev_direction: ' + street)

        street = addr_formatter.abbrev_street_avenue_etc(street)
        logging.info('After abbrev_street_avenue_etc: ' + street)

        # Second pass with abbrev_only_last_token switched off.
        street = addr_formatter.abbrev_street_avenue_etc(
            street, abbrev_only_last_token=False)
        logging.info('After abbrev_street_avenue_etc (aggressive): ' + street)

    # Dump the parsed address with its keys sorted.
    print(json.dumps(addr, sort_keys=True))
addr_parser = StreetAddressParser()
addr_formatter = StreetAddressFormatter()

# Use the address from opts.addr when one is set; otherwise fall back to
# the stripped entries of `tests`.
lst = [opts.addr] if opts.addr else map(str.strip, tests)

for t in lst:
    if not t:
        # Nothing to parse for an empty entry.
        continue

    print('"%s"' % t)
    logging.info('addr_str: ' + unicode(t))
    addr = addr_parser.parse(t)

    if addr['street_full'] is not None:
        # Run the formatter steps one after another and log the street
        # string produced by each.
        street = addr_formatter.append_TH_to_street(addr['street_full'])
        logging.info('After append_TH_to_street: ' + street)

        street = addr_formatter.abbrev_direction(street)
        logging.info('After abbrev_direction: ' + street)

        street = addr_formatter.abbrev_street_avenue_etc(street)
        logging.info('After abbrev_street_avenue_etc: ' + street)

        # Repeat the abbreviation with abbrev_only_last_token disabled.
        street = addr_formatter.abbrev_street_avenue_etc(
            street, abbrev_only_last_token=False)
        logging.info('After abbrev_street_avenue_etc (aggressive): ' + street)

    # Emit the parsed address as JSON with sorted keys.
    print(json.dumps(addr, sort_keys=True))
def normalize_address_str(address_val):
    """
    Normalize the address to conform to short abbreviations.

    Falsy input returns None. Any other value is converted with unicode()
    and encoded as utf-8, the sequences listed in the replacement table are
    removed, and the result is tagged with usaddress. The tagged number,
    directionals, street name and post type are rebuilt into one
    space-separated string and passed through
    StreetAddressFormatter.abbrev_street_avenue_etc.

    If usaddress raises RepeatedLabelError or UnicodeEncodeError, the
    cleaned input string is used instead of a rebuilt one.

    Every non-None result is lower-cased and stripped.
    """
    # An empty string makes the parser's regular expression fail, so stop
    # before doing any work.
    if not address_val:
        return None

    as_unicode = unicode(address_val)
    address_val = as_unicode.encode('utf-8')

    # Do some string replacements to remove odd characters that we come across
    odd_sequences = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for needle, substitute in odd_sequences.items():
        address_val = address_val.replace(needle, substitute)

    # now parse the address into number, street name and street type
    try:
        tagged = usaddress.tag(str(address_val))[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some kind
        # of odd character issue that isn't handled yet: keep the string.
        result = str(address_val)
    else:
        # Address can be parsed, so let's format it.
        result = ''

        number = tagged.get('AddressNumber')
        if number is not None:
            result = _normalize_address_number(number)

        pre_direction = tagged.get('StreetNamePreDirectional')
        if pre_direction is not None:
            result = ' '.join(
                (result, _normalize_address_direction(pre_direction)))

        street_name = tagged.get('StreetName')
        if street_name is not None:
            result = ' '.join((result, street_name))

        post_type = tagged.get('StreetNamePostType')
        if post_type is not None:
            # remove any periods from abbreviations
            result = ' '.join(
                (result, _normalize_address_post_type(post_type)))

        post_direction = tagged.get('StreetNamePostDirectional')
        if post_direction is not None:
            result = ' '.join(
                (result, _normalize_address_direction(post_direction)))

        result = StreetAddressFormatter().abbrev_street_avenue_etc(result)

    return result.lower().strip()