Exemplo n.º 1
0
def _normalize_address_str(address_val):
    """
    Build a lower-cased, abbreviated street address from a raw value.

    The value is converted with str() and parsed by StreetAddressParser.
    The house number (leading zeros removed), the street name and the
    street type are concatenated with single spaces, run through
    StreetAddressFormatter.abbrev_street_avenue_etc, then lower-cased and
    stripped of surrounding whitespace.

    Returns None when address_val is falsy or when the parser yields a
    falsy result; otherwise returns the normalized string.
    """
    # Per the original note, the parser's regular expression does not cope
    # with an empty string, so bail out before parsing.
    if not address_val:
        return None

    # TODO: should probably use unicode()
    parsed = StreetAddressParser().parse(str(address_val))
    if not parsed:
        return None

    result = ''
    house_number = parsed.get('house')
    if house_number is not None:
        # some addresses have leading zeros, strip them here
        result = house_number.lstrip("0")

    # Append the remaining components, in order, when they are present.
    for component in ('street_name', 'street_type'):
        piece = parsed.get(component)
        if piece is not None:
            result = result + ' ' + piece

    abbreviated = StreetAddressFormatter().abbrev_street_avenue_etc(result)
    return abbreviated.lower().strip()
Exemplo n.º 2
0
class TestStreetAddress(unittest.TestCase):
    # Exercises StreetAddressFormatter on output of StreetAddressParser.

    def setUp(self):
        # A fresh parser and formatter are created before every test method.
        self.addr_parser = StreetAddressParser()
        self.addr_formatter = StreetAddressFormatter()

    def test_success_abbrev_street_avenue_etc(self):
        # Parse a full address, then abbreviate only its 'street_full' part:
        # "Baker Street" is expected to come back as "Baker St".
        parsed = self.addr_parser.parse('221B Baker Street')
        abbreviated = self.addr_formatter.abbrev_street_avenue_etc(
            parsed['street_full'])
        eq_(abbreviated, 'Baker St')
Exemplo n.º 3
0
def _normalize_address_str(address_val):
    """
    Build a lower-cased, abbreviated street address from a raw value.

    The value is converted with str() and tagged by usaddress.tag; only the
    first element of the returned pair (the tagged components) is used.
    The address number (leading zeros removed), pre-directional, street
    name, post type and post-directional are concatenated with single
    spaces, run through StreetAddressFormatter.abbrev_street_avenue_etc,
    then lower-cased and stripped of surrounding whitespace.

    Returns None when address_val is falsy or when no components were
    tagged; otherwise returns the normalized string.
    """
    # Per the original note, an empty string trips up the downstream
    # regular expression, so bail out before parsing.
    if not address_val:
        return None

    # TODO: should probably use unicode()
    tagged = usaddress.tag(str(address_val))[0]
    if not tagged:
        return None

    result = ''
    number = tagged.get('AddressNumber')
    if number is not None:
        # some addresses have leading zeros, strip them here
        result = number.lstrip("0")

    # (label, converter) pairs, appended in this order when present.
    # A converter of None means the tagged text is used unchanged.
    trailing_components = (
        ('StreetNamePreDirectional', _normalize_address_direction),
        ('StreetName', None),
        # the post-type helper removes any periods from abbreviations
        ('StreetNamePostType', _normalize_address_post_type),
        ('StreetNamePostDirectional', _normalize_address_direction),
    )
    for label, converter in trailing_components:
        piece = tagged.get(label)
        if piece is None:
            continue
        if converter is not None:
            piece = converter(piece)
        result = result + ' ' + piece

    abbreviated = StreetAddressFormatter().abbrev_street_avenue_etc(result)
    return abbreviated.lower().strip()
Exemplo n.º 4
0
def normalize_address_str(address_val):
    """
    Build a lower-cased, abbreviated street address from a raw value.

    Returns None when address_val is falsy.

    Otherwise the value is turned into a UTF-8 encoded string, a couple of
    known odd character sequences are removed, and the result is tagged by
    usaddress (with 'CornerOf' mapped onto 'AddressNumber').

    If usaddress raises RepeatedLabelError, or a UnicodeEncodeError occurs,
    the cleaned input string itself is returned, lower-cased and stripped.

    If tagging succeeds, the address number, directionals, street name,
    post type and occupancy fields are concatenated with single spaces,
    run through StreetAddressFormatter.abbrev_street_avenue_etc, and
    returned lower-cased and stripped.
    """
    # Per the original note, an empty string trips up the downstream
    # regular expression, so bail out before doing anything else.
    if not address_val:
        return None

    # encode the string as utf-8 (basestring is the Python 2 text base type)
    if isinstance(address_val, basestring):
        address_val = str(address_val.encode('utf-8'))
    else:
        address_val = str(address_val)

    # Do some string replacements to remove odd characters that we come across
    replacements = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for odd_sequence, substitute in replacements.items():
        address_val = address_val.replace(odd_sequence, substitute)

    # now parse the address into number, street name and street type
    try:
        # Add in the mapping of CornerOf to the AddressNumber.
        tagged = usaddress.tag(
            str(address_val), tag_mapping={'CornerOf': 'AddressNumber'})[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some kind of
        # odd character issue that is not handled yet: fall back to the
        # cleaned input, skipping the abbreviation step.
        return str(address_val).lower().strip()

    # Address can be parsed, so let's format it.
    result = ''
    number = tagged.get('AddressNumber')
    if number is not None:
        result = _normalize_address_number(number)

    # (label, converter) pairs, appended in this order when present.
    # A converter of None means the tagged text is used unchanged.
    trailing_components = (
        ('StreetNamePreDirectional', _normalize_address_direction),
        ('StreetName', None),
        # the post-type helper removes any periods from abbreviations
        ('StreetNamePostType', _normalize_address_post_type),
        ('StreetNamePostDirectional', _normalize_address_direction),
        ('OccupancyType', None),
        ('OccupancyIdentifier', None),
    )
    for label, converter in trailing_components:
        piece = tagged.get(label)
        if piece is None:
            continue
        if converter is not None:
            piece = converter(piece)
        result = result + ' ' + piece

    abbreviated = StreetAddressFormatter().abbrev_street_avenue_etc(result)
    return abbreviated.lower().strip()
    # NOTE(review): these statements directly follow a `return` at the same
    # indentation, so as written they can never run. They look like the body
    # of a separate command-line driver whose header is missing — confirm.
    # NOTE(review): `addr_parser` is used below but never assigned in the
    # visible lines; presumably it is created just before this point — confirm.
    # Python 2 syntax throughout (print statement, `unicode` builtin).
    addr_formatter = StreetAddressFormatter()

    # A single address given via opts.addr takes precedence; otherwise every
    # entry of `tests` is used after being passed through str.strip.
    if opts.addr:
        lst = [opts.addr]
    else:
        lst = map(str.strip, tests)

    for t in lst:
        # Skip entries that are empty.
        if t:
            # Echo the raw input, quoted, and log it.
            print '"%s"' % t
            logging.info('addr_str: ' + unicode(t))
            addr = addr_parser.parse(t)

            # The formatter steps below are chained on the street text and
            # only logged; their result is not written back into `addr`.
            if addr['street_full'] is not None:
                street = addr_formatter.append_TH_to_street(
                    addr['street_full'])
                logging.info('After append_TH_to_street: ' + street)

                street = addr_formatter.abbrev_direction(street)
                logging.info('After abbrev_direction: ' + street)

                street = addr_formatter.abbrev_street_avenue_etc(street)
                logging.info('After abbrev_street_avenue_etc: ' + street)

                # Second pass with abbrev_only_last_token=False (labelled
                # "aggressive" in the log message).
                street = addr_formatter.abbrev_street_avenue_etc(
                    street, abbrev_only_last_token=False)
                logging.info('After abbrev_street_avenue_etc (aggressive): ' +
                             street)

            # Emit the parser's result as JSON with sorted keys.
            print json.dumps(addr, sort_keys=True)
Exemplo n.º 6
0
    # Interior of a driver routine whose `def` line is outside this view.
    # Python 2 syntax throughout (print statement, `unicode` builtin).
    # One parser and one formatter are created and reused for every address.
    addr_parser = StreetAddressParser()
    addr_formatter = StreetAddressFormatter()

    # A single address given via opts.addr takes precedence; otherwise every
    # entry of `tests` is used after being passed through str.strip.
    if opts.addr:
        lst = [opts.addr]
    else:
        lst = map(str.strip,tests)

    for t in lst:
        # Skip entries that are empty.
        if t:
            # Echo the raw input, quoted, and log it.
            print '"%s"' % t
            logging.info('addr_str: ' + unicode(t))
            addr = addr_parser.parse(t)

            # The formatter steps below are chained on the street text and
            # only logged; their result is not written back into `addr`.
            if addr['street_full'] is not None:
                street = addr_formatter.append_TH_to_street(addr['street_full'])
                logging.info('After append_TH_to_street: ' + street)

                street = addr_formatter.abbrev_direction(street)
                logging.info('After abbrev_direction: ' + street)

                street = addr_formatter.abbrev_street_avenue_etc(street)
                logging.info('After abbrev_street_avenue_etc: ' + street)

                # Second pass with abbrev_only_last_token=False (labelled
                # "aggressive" in the log message).
                street = addr_formatter.abbrev_street_avenue_etc(street, abbrev_only_last_token=False)
                logging.info('After abbrev_street_avenue_etc (aggressive): ' + street)

            # Emit the parser's result as JSON with sorted keys.
            print json.dumps(addr, sort_keys=True)

Exemplo n.º 7
0
def normalize_address_str(address_val):
    """
    Build a lower-cased, abbreviated street address from a raw value.

    Returns None when address_val is falsy.

    Otherwise the value is converted with unicode() and encoded as UTF-8, a
    couple of known odd character sequences are removed, and the result is
    tagged by usaddress.

    If usaddress raises RepeatedLabelError, or a UnicodeEncodeError occurs,
    the cleaned input string itself is returned, lower-cased and stripped.

    If tagging succeeds, the address number, pre-directional, street name,
    post type and post-directional are concatenated with single spaces,
    run through StreetAddressFormatter.abbrev_street_avenue_etc, and
    returned lower-cased and stripped.
    """
    # Per the original note, an empty string trips up the downstream
    # regular expression, so bail out before doing anything else.
    if not address_val:
        return None

    address_val = unicode(address_val).encode('utf-8')

    # Do some string replacements to remove odd characters that we come across
    replacements = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for odd_sequence, substitute in replacements.items():
        address_val = address_val.replace(odd_sequence, substitute)

    # now parse the address into number, street name and street type
    try:
        tagged = usaddress.tag(str(address_val))[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some kind of
        # odd character issue that isn't handled yet: fall back to the
        # cleaned input, skipping the abbreviation step.
        return str(address_val).lower().strip()

    # Address can be parsed, so let's format it.
    result = ''
    number = tagged.get('AddressNumber')
    if number is not None:
        result = _normalize_address_number(number)

    # (label, converter) pairs, appended in this order when present.
    # A converter of None means the tagged text is used unchanged.
    trailing_components = (
        ('StreetNamePreDirectional', _normalize_address_direction),
        ('StreetName', None),
        # the post-type helper removes any periods from abbreviations
        ('StreetNamePostType', _normalize_address_post_type),
        ('StreetNamePostDirectional', _normalize_address_direction),
    )
    for label, converter in trailing_components:
        piece = tagged.get(label)
        if piece is None:
            continue
        if converter is not None:
            piece = converter(piece)
        result = result + ' ' + piece

    abbreviated = StreetAddressFormatter().abbrev_street_avenue_etc(result)
    return abbreviated.lower().strip()