Пример #1
0
def compare_addresses(address1, address2):
    """Return the tagged components that two address strings have in common.

    Both strings are tagged with ``usaddress.tag``; every (label, value)
    pair that appears in both tagged results is kept.

    Returns:
        A dict of the shared label/value pairs, or ``None`` when tagging
        either string raises ``RepeatedLabelError``.
    """
    try:
        first_tags = usaddress.tag(address1)[0]
        second_tags = usaddress.tag(address2)[0]
    except RepeatedLabelError:
        return None
    common_pairs = set(first_tags.items()).intersection(second_tags.items())
    return dict(common_pairs)
Пример #2
0
    def parse(self, address):
        """Tag *address* with usaddress and return its components and type.

        Args:
            address: the raw address string to parse.

        Returns:
            A ``(address_components, address_type)`` tuple exactly as
            produced by ``usaddress.tag``
            (https://github.com/datamade/usaddress).

        Any exception raised by ``usaddress.tag`` propagates to the caller.
        """
        # TODO: turning the result into JSON for the GET response (via the
        # serializer) is not done here and still has to be wired up.
        address_components, address_type = usaddress.tag(address)
        return address_components, address_type
    def get_address(self, text_block, key):
        """Extract address details from *text_block* into ``self.pat_dic``.

        Args:
            text_block: iterable of text lines (strings).
            key: prefix for every entry written to ``self.pat_dic``.

        Entries that may be written (each prefixed with ``key + '_'``):
            ``po_box``    - number following "po box" anywhere in the block.
            ``address``, ``PlaceName``, ``StateName``, ``ZipCode``
                          - from a line matching the "<street>, <city/state>
                            <5-digit zip>" pattern whose tagged state is in
                            ``US_STATES``.
            ``street``    - a lower-cased line that usaddress tags as a plain
                            street address with a numeric house number.

        Failures while tagging a single line are reported on stdout and do
        not stop the processing of the remaining lines.
        """
        block_string = ' '.join(text_block).lower()
        po_box = re.search(r'(po box)\s*\d+', block_string)
        if po_box is not None:
            # The match looks like "po box 123"; keep only the number.
            self.pat_dic[key + '_' + 'po_box'] = po_box[0].split()[-1]

        add_pattern = re.compile(
            r'([A-Z,a-z,0-9][^!\-:;,]+)[,|\s]+([A-Z,a-z][^.!\-:;]+?)\s*(\d{5})'
        )

        addresses = [add_pattern.findall(line.lower()) for line in text_block]

        print(addresses)
        for matches in addresses:
            if not matches:
                continue
            # Only the first match of a line is considered.
            candidate = ' '.join(matches[0]).replace('.', '')
            try:
                tags = usaddress.tag(candidate)[0]
                if ('PlaceName' in tags and 'StateName' in tags
                        and tags['StateName'].upper() in US_STATES):
                    self.pat_dic[key + '_' + 'address'] = candidate
                    self.pat_dic[key + '_' + 'PlaceName'] = tags['PlaceName']
                    self.pat_dic[key + '_' + 'StateName'] = tags['StateName']
                    self.pat_dic[key + '_' + 'ZipCode'] = tags['ZipCode']
            except Exception as exc:
                # Best effort: report the failure and move on to the next
                # line. KeyboardInterrupt/SystemExit are no longer swallowed.
                print("Unexpected error:", type(exc))

        for line in text_block:
            if not line:
                continue
            try:
                lowered = line.lower()
                tags, address_type = usaddress.tag(lowered)
                if ('StreetName' in tags and 'AddressNumber' in tags
                        and address_type == 'Street Address'
                        and 'SubaddressType' not in tags
                        and 'Recipient' not in tags
                        and tags['AddressNumber'].isdigit()):
                    print(tags)
                    self.pat_dic[key + '_' + 'street'] = lowered
            except Exception as exc:
                print("Unexpected error:", type(exc))
Пример #4
0
def audit_address(address):
    """Normalise the ``housenumber`` and ``street`` entries of *address*.

    The dict is modified in place and also returned.

    * ``housenumber`` is tagged with usaddress; a detected address number
      replaces it, and a detected pre-directional is prepended to
      ``street``.
    * ``street`` is tagged with usaddress. When a street name is found, the
      street is rebuilt as "<pre-directional> <name> <post type>" (which
      drops a leading house number), and a detected number fills in a
      missing ``housenumber``. Abbreviations listed in ``STREET_FIXES`` and
      ``DIRECTION_FIXES`` are then expanded.

    Args:
        address: mapping that may contain ``housenumber`` and/or ``street``.

    Returns:
        The same ``address`` mapping.
    """
    if 'housenumber' in address:
        number_tags = usaddress.tag(address['housenumber'])[0]
        if 'AddressNumber' in number_tags:
            address['housenumber'] = number_tags['AddressNumber']
        if 'street' in address and 'StreetNamePreDirectional' in number_tags:
            address['street'] = '{} {}'.format(
                number_tags['StreetNamePreDirectional'], address['street'])

    if 'street' in address:
        street = address['street']
        try:
            street_tags = usaddress.tag(street)[0]
        except Exception:
            # Nothing to parse; keep the street text as it is.
            street_tags = None

        # The original guarded this section with "'addresscalc' in locals()",
        # a name that is never assigned, so it could never run. An explicit
        # sentinel also avoids picking up the house-number tags from above.
        if street_tags is not None:
            if 'housenumber' not in address and 'AddressNumber' in street_tags:
                address['housenumber'] = street_tags['AddressNumber']

            # Remove the street number from the name. Only rebuild when a
            # street name was recognised, so the original text is never
            # replaced by an empty string.
            streetname = street_tags.get('StreetName', '')
            if streetname:
                direction = street_tags.get('StreetNamePreDirectional', '')
                streettype = street_tags.get('StreetNamePostType', '')
                street = '{} {} {}'.format(direction, streetname,
                                           streettype).strip()

        # Check that any common abbreviations are spelled out. Keys are
        # escaped because every match is looked up literally in the dict.
        pat = re.compile(
            r'\b(' + '|'.join(re.escape(k) for k in STREET_FIXES) + r')\b')
        street = pat.sub(lambda m: STREET_FIXES[m.group()], street)

        # Update directional abbreviations to full words.
        pat = re.compile(
            r'\b(' + '|'.join(re.escape(k) for k in DIRECTION_FIXES) + r')\b')
        street = pat.sub(lambda m: DIRECTION_FIXES[m.group()], street)

        address['street'] = street

    return address
Пример #5
0
def format_address_data(address_data, county_name):
    """Build the address dict for a county election office.

    Args:
        address_data: raw address string, tagged with ``usaddress.tag``
            using ``electionsaver.addressSchemaMapping``.
        county_name: county (or city) name used to derive the default
            location name.

    Returns:
        A dict that always has ``locationName`` and, when present in the
        tagged address, ``aptNumber``, ``streetNumberName`` and ``poBox``.
        A ``locationName`` found in the address overrides the default one.
    """
    mapping = electionsaver.addressSchemaMapping

    if county_name in ("Clay", "Jackson", "Platte", "St. Louis"):
        location_name = f"{county_name} County Board of Elections"
    elif county_name in ("Kansas City", "St. Louis City"):
        location_name = f"{county_name} Board of Elections"
    elif county_name == "St. Charles":
        # Was misspelled "Country" in the emitted name.
        location_name = "St. Charles County Election Authority"
    else:
        location_name = f"{county_name} County Election Office"

    parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]

    final_address = {"locationName": location_name}
    for field in ("aptNumber", "streetNumberName", "locationName", "poBox"):
        if field in parsed_data_dict:
            final_address[field] = parsed_data_dict[field]

    return final_address
Пример #6
0
def ZipCode(x):
    """Return the ZIP code usaddress finds in *x*, or ``None``.

    ``None`` is returned both when no ``ZipCode`` label is tagged and when
    tagging fails for any reason (best effort, e.g. for non-string input).
    """
    try:
        tags = usaddress.tag(x)[0]
    except Exception:
        # Deliberately broad: any tagging failure means "no ZIP code".
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # through.
        return None
    return tags.get('ZipCode')
Пример #7
0
def addresscheck(df, usadd, wrongaddress):
    """Sort the rows of *df* into parseable and problematic addresses.

    For every row the address columns are joined into one string and tagged
    with usaddress. Rows are appended in place (via ``.loc``) to:

    * ``wrongaddress`` - when tagging raises ``RepeatedLabelError``, or when
      the address type is ``'Ambiguous'`` and the row is not excluded;
    * ``usadd`` - when the address type is anything else and the row is not
      excluded.

    A row is excluded when its address contains '14261', '14260', 'Clement'
    or 'Goodyear', or its company contains 'University at Buffalo'.

    NOTE: relies on the Python 2 ``unicode`` builtin, as the original did.

    Args:
        df: DataFrame with 'Company', 'Address 1', 'Address 2', 'City',
            'State' and 'Zipcode' columns.
        usadd: DataFrame receiving the accepted rows.
        wrongaddress: DataFrame receiving the rejected rows.
    """
    address_columns = ('Address 1', 'Address 2', 'City', 'State', 'Zipcode')
    excluded_markers = ('14261', '14260', 'Clement', 'Goodyear')

    i = 0
    j = 0
    for _, row in df.iterrows():
        company = unicode(row['Company']).encode('utf-8')
        addressstring = ' '.join(
            unicode(row[column]).encode('utf-8') for column in address_columns)
        try:
            address_type = usaddress.tag(addressstring)[1]
        except usaddress.RepeatedLabelError:
            wrongaddress.loc[j] = row
            j += 1
            # Nothing was tagged for this row. Without this ``continue`` the
            # code below reused the previous row's result (or an unbound
            # name on the first row) and recorded the row a second time.
            continue

        excluded = ('University at Buffalo' in company
                    or any(marker in addressstring
                           for marker in excluded_markers))
        if excluded:
            continue

        # ``row`` is used instead of ``df.iloc[index]``: the index yielded by
        # iterrows is a label, which is only a valid position for a default
        # RangeIndex.
        if address_type != 'Ambiguous':
            usadd.loc[i] = row
            i += 1
        else:
            wrongaddress.loc[j] = row
            j += 1
Пример #8
0
def _check_address(address: Any, must_contain: Tuple[str, ...], clean: bool) -> Any:
    """
    Tag an address with usaddress and report whether it is acceptable.

    Parentheses and periods are removed from the value before it is tagged
    with ``usaddress.tag`` using ``TAG_MAPPING``; the tagged result is then
    passed to ``_check_status`` together with ``must_contain``.

    Parameters
    ----------
    address
        address value to be checked
    must_contain
        A tuple containing parts of the address that must be included for the
         address to be successfully cleaned
    clean
        If True, a tuple (tagged address or None, status) is returned, where
        status is "null", "unknown" or "success".
        If False, the function returns True/False to be used by the validate address function.
    """
    if address in NULL_VALUES:
        if clean:
            return None, "null"
        return False

    stripped = re.sub(r"[().]", "", str(address))

    try:
        tagged = usaddress.tag(stripped, TAG_MAPPING)[0]
    except usaddress.RepeatedLabelError:
        if clean:
            return None, "unknown"
        return False

    passed = _check_status(tagged, must_contain)

    if not clean:
        return True if passed else False
    return tagged, ("success" if passed else "unknown")
Пример #9
0
def format_address_data(address, county_name):
    """Build the address dict for a Texas county election office.

    Some counties get a hard-coded or patched address before the string is
    tagged with ``usaddress.tag`` using ``electionsaver.addressSchemaMapping``.

    Returns:
        A dict with ``state`` set to "Texas" plus whichever of
        ``streetNumberName``, ``city``, ``zipCode``, ``poBox``,
        ``locationName`` and ``aptNumber`` were tagged.
    """
    mapping = electionsaver.addressSchemaMapping

    # Counties whose incoming address is replaced wholesale.
    fixed_addresses = {
        "Knox": "100 W Cedar St, Benjamin, TX 79505",
        "Live Oak": "301 E Houston St George West, TX 78022",
        "Parker": "1112 Santa Fe Drive Weatherford, TX 76086",
        "Borden": "117 Wasson Rd, Gail, TX 79738",
    }
    if county_name in fixed_addresses:
        address = fixed_addresses[county_name]
    elif county_name == "Kleberg":
        address = address + " 78364"
    elif county_name == "Stephens":
        address = address.replace("Courthouse", "")

    tagged = usaddress.tag(address, tag_mapping=mapping)[0]

    final_address = {"state": "Texas"}
    optional_fields = ("streetNumberName", "city", "zipCode", "poBox",
                       "locationName", "aptNumber")
    for field in optional_fields:
        if field in tagged:
            final_address[field] = tagged[field]
    return final_address
Пример #10
0
def format_address_data(address_data, county_name):
    """Build the address dict for a Maine election office.

    The address is tagged with ``usaddress.tag`` using
    ``electionsaver.addressSchemaMapping``. When both ``city`` and
    ``zipCode`` were tagged, the result starts with ``city``, ``state``
    ("Maine") and ``zipCode``; otherwise an error line is printed and the
    result starts empty. Any tagged ``streetNumberName``, ``locationName``,
    ``aptNumber`` and ``poBox`` are then added.
    """
    # Per-town corrections applied before tagging.
    if county_name == "Jackson":
        address_data = "730 Moosehead Trail Hwy PO Box 393 Jackson, ME 04921"
    elif county_name == "Swans Island":
        address_data = address_data.replace("Swan'S", "Swans")

    mapping = electionsaver.addressSchemaMapping
    parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]

    try:
        city = parsed_data_dict["city"]
        zip_code = parsed_data_dict["zipCode"]
    except KeyError:
        print(f"Error with data {parsed_data_dict}")
        final_address = {}
    else:
        final_address = {"city": city, "state": "Maine", "zipCode": zip_code}

    for field in ("streetNumberName", "locationName", "aptNumber", "poBox"):
        if field in parsed_data_dict:
            final_address[field] = parsed_data_dict[field]
    return final_address
Пример #11
0
def format_address_data(address_data, county_name):
    """Build the address dict for a county election office.

    Daggett and San Juan use hard-coded addresses; every other county uses
    *address_data* as given. The address is tagged with ``usaddress.tag``
    using ``electionsaver.addressSchemaMapping``.

    Returns:
        A dict with ``locationName`` (the tagged one if present, otherwise
        "<county> County Election Office") plus any tagged ``aptNumber``,
        ``streetNumberName`` and ``poBox``.
    """
    mapping = electionsaver.addressSchemaMapping

    address_overrides = {
        # https://www.daggettcounty.org/16/ClerkTreasurer
        'Daggett': "95 North 1st West, P.O. Box 400",
        # http://sanjuancounty.org/index.php/clerkauditor/
        'San Juan': "117 South Main, P.O. Box 338",
    }
    address_data = address_overrides.get(county_name, address_data)

    tagged = usaddress.tag(address_data, tag_mapping=mapping)[0]

    final_address = {"locationName": f"{county_name} County Election Office"}
    for field in ("aptNumber", "streetNumberName", "locationName", "poBox"):
        if field in tagged:
            final_address[field] = tagged[field]

    return final_address
Пример #12
0
    def extract_mailing_address(self, input_dict):
        """Build the mailing-address fields from the MAILADD1..4 columns.

        Args:
            input_dict: row mapping with ``MAILADD1`` .. ``MAILADD4`` keys.

        Returns:
            A dict with ``MAIL_ADDRESS_LINE1``, ``MAIL_ADDRESS_LINE2``,
            ``MAIL_CITY``, ``MAIL_ZIP_CODE``, ``MAIL_STATE`` and
            ``MAIL_COUNTRY``, or an empty dict when ``MAILADD1`` is blank,
            the address is ambiguous, nothing was tagged, or usaddress
            raises ``RepeatedLabelError``. A warning is printed for the
            ambiguous and unparseable cases.
        """
        if not input_dict['MAILADD1'].strip():
            return {}

        try:
            tagged_address, address_type = usaddress.tag(' '.join([
                input_dict['MAILADD1'],
                input_dict['MAILADD2'],
                input_dict['MAILADD3'],
                input_dict['MAILADD4']
            ]))

            if address_type == 'Ambiguous':
                # Both values must be passed as one tuple; applying ``%`` to
                # ``address_type`` alone raised TypeError for a two-slot
                # format string.
                print("Warn - %s: Ambiguous mailing address falling back to residential (%s)"
                      % (address_type, input_dict['MAILADD1']))
                return {}

            if not tagged_address:
                return {}

            return {
                'MAIL_ADDRESS_LINE1': self.construct_mail_address_1(
                    tagged_address,
                    address_type,
                ),
                'MAIL_ADDRESS_LINE2': self.construct_mail_address_2(tagged_address),
                'MAIL_CITY': tagged_address.get('PlaceName', ""),
                'MAIL_ZIP_CODE': tagged_address.get('ZipCode', ""),
                'MAIL_STATE': tagged_address.get('StateName', ""),
                'MAIL_COUNTRY': ""
            }
        except usaddress.RepeatedLabelError as e:
            print('Warn: Can\'t parse mailing address. Falling back to residential (%s)' % e.parsed_string)
            return {}
Пример #13
0
    def populate_address(self, sosrec):
        """Tag ``sosrec['Address']`` and store its parts via ``self.update``.

        ``<br/>`` markers are replaced by spaces before tagging. The payload
        passed to ``self.update`` has the keys ``addr``, ``unit``, ``city``,
        ``state`` (defaults to "KANSAS") and ``zip``. Every tagged value
        that is not a unit, city, non-empty state or ZIP code is appended
        to ``addr``, separated by spaces.
        """
        flat_address = sosrec['Address'].replace('<br/>', ' ')
        tagged = usaddress.tag(flat_address)[0]

        payload = {
          'addr': "",
          'unit': "",
          'city': "",
          'state': "KANSAS",
          'zip': ""
        }
        # usaddress labels that map straight onto a payload field.
        direct_fields = {
            'OccupancyIdentifier': 'unit',
            'PlaceName': 'city',
            'ZipCode': 'zip',
        }
        for label, value in tagged.items():
            if label in direct_fields:
                payload[direct_fields[label]] = value
            elif label == 'StateName' and len(value) > 0:
                payload['state'] = value
            elif len(payload['addr']) > 0:
                payload['addr'] = ' '.join([payload['addr'], value])
            elif value != "No information available":
                # The placeholder text is never used to start the address.
                payload['addr'] = value

        self.update(payload)
Пример #14
0
def format_address_data(address_data, town_name):
    """Build the address dict for a New Hampshire town election office.

    Args:
        address_data: raw address string, tagged with ``usaddress.tag``
            using ``electionsaver.addressSchemaMapping``.
        town_name: town name, used for the default location name and in
            error messages.

    Returns:
        A dict with ``state`` ("NH") and ``locationName``, plus the
        title-cased ``city``, ``streetNumberName``, ``poBox`` and
        ``aptNumber`` and the unchanged ``zipCode`` when tagged.

    Raises:
        WalkTheVoteError: when tagging the address fails.
    """
    mapping = electionsaver.addressSchemaMapping

    # Edge cases
    if address_data == "20 PARK ST GORHAM":
        address_data = "20 PARK ST GORHAM 03581"

    try:
        parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]
    except Exception as e:
        # Report the input that failed; the parsed dict is still empty at
        # this point, so printing it carried no information.
        raise WalkTheVoteError(
            f"Error with data for {town_name} town, data is {address_data}"
        ) from e

    final_address = {"state": "NH"}

    if "city" in parsed_data_dict:
        final_address["city"] = parsed_data_dict["city"].title()
    if "zipCode" in parsed_data_dict:
        final_address["zipCode"] = parsed_data_dict["zipCode"]
    if "streetNumberName" in parsed_data_dict:
        final_address["streetNumberName"] = parsed_data_dict[
            "streetNumberName"].title()
    if "poBox" in parsed_data_dict:
        final_address["poBox"] = parsed_data_dict["poBox"].title()
    # Only the generated default is title-cased; a tagged name is kept as is.
    final_address["locationName"] = parsed_data_dict.get(
        "locationName", f"{town_name} City Election Office".title())
    if "aptNumber" in parsed_data_dict:
        final_address["aptNumber"] = parsed_data_dict["aptNumber"].title()
    return final_address
Пример #15
0
 def parse_address(self, input_str):
     """Tag *input_str* with usaddress and return a flat address dict.

     The raw tagging result and its type are printed for debugging.

     Returns:
         A dict with ``city``, ``country``, ``state``, ``street`` and
         ``zip``. ``street`` is "<number> <name> [<post type>]", prefixed
         with "<occupancy type> <occupancy id>, " when both are tagged, or
         ``None`` when the number or the street name is missing. Missing
         components are ``None``.
     """
     address = usaddress.tag(input_str)
     # Parenthesised single-argument print is valid on Python 2 and 3 and
     # prints the same text on both; the print statement is Python 2 only.
     print(json.dumps(address, indent=4))
     print(type(address))
     tags = address[0]

     # compose street
     prefix = ''
     if 'OccupancyType' in tags and 'OccupancyIdentifier' in tags:
         prefix = '%s %s, ' % (tags.get('OccupancyType'),
                               tags.get('OccupancyIdentifier'))
     if not tags.get('AddressNumber') or not tags.get('StreetName'):
         street = None
     elif not tags.get('StreetNamePostType'):
         street = prefix + '%s %s' % (tags.get('AddressNumber'),
                                      tags.get('StreetName'))
     else:
         street = prefix + '%s %s %s' % (
             tags.get('AddressNumber'), tags.get('StreetName'),
             tags.get('StreetNamePostType'))

     return {
         'city': tags.get('PlaceName'),
         'country': tags.get('CountryName'),
         'state': tags.get('StateName'),
         'street': street,
         'zip': tags.get('ZipCode')
     }
def geo_parser(location, gmaps_json):
    """Turn the first result of a places/geocoding response into a DataFrame.

    Args:
        location: the raw name that was searched for.
        gmaps_json: decoded response; its first ``results`` entry is read
            for ``name``, ``geometry.location`` and ``formatted_address``.

    Returns:
        A one-row DataFrame with the columns ``Raw_Name``, ``Name``,
        ``Latitude``, ``Longitude``, ``City`` and ``State``, or ``None``
        (after printing the response) when an ``IndexError`` occurs, e.g.
        because ``results`` is empty.
    """
    # parse json response
    try:
        results = gmaps_json["results"][0]
        std_name = results['name']
        print(std_name)
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
        std_address = results['formatted_address']
        # parse address
        try:
            tags = usaddress.tag(std_address)[0]
            city = tags['PlaceName']
            state = tags['StateName']
        except Exception:
            # The tagger failed or a label is missing: fall back to the raw
            # token list and collect the city/state tokens by hand.
            city_tokens = []
            state_tokens = []
            for addr_tup in usaddress.parse(std_address):
                print(addr_tup)
                if addr_tup[1] == 'PlaceName':
                    city_tokens.append(addr_tup[0])
                if addr_tup[1] == 'StateName':
                    state_tokens.append(addr_tup[0])
            city = ' '.join(city_tokens).strip()
            # The state used to keep a leading space in this branch.
            state = ' '.join(state_tokens).strip()
        print(city)
        return pd.DataFrame(
            [[location, std_name, lat, lng, city, state]],
            columns=['Raw_Name', 'Name', 'Latitude', 'Longitude', 'City',
                     'State'])

    except IndexError:
        print(gmaps_json)
        return None
Пример #17
0
    def usaddress_tag(self, address_str):
        """
        Tag an address string and classify it as PO Box or Street Address.

        The address type reported by usaddress is ignored (it almost always
        comes back as "Ambiguous"). Instead a simple convention is used: if
        a USPSBoxID was tagged the address is a PO Box, otherwise it is a
        Street Address.

        TODO: figure out how to handle usaddress parse errors.

        Input:
            address_str: string of address from input file
        Output:
            Tuple of (dictionary of tagged address parts, address type), or
            (None, None) when usaddress raises RepeatedLabelError.
        """
        try:
            tagged_parts = usaddress.tag(address_str)[0]
        except usaddress.RepeatedLabelError:
            # usaddress failed: return None so the caller can set the
            # VALIDATION_STATUS appropriately. The address has to be fixed
            # manually later.
            return None, None

        kind = 'PO Box' if 'USPSBoxID' in tagged_parts else 'Street Address'
        return tagged_parts, kind
Пример #18
0
    def parse(self, address):
        """Tag *address* into a small set of flat fields.

        usaddress labels are collapsed through a tag mapping into
        ``recipient``, ``address1``, ``address2``, ``city``, ``state``,
        ``zip_code`` and ``country``.

        Returns:
            The tagged mapping only; the address type is discarded.
        """
        # Every label that belongs on the first address line.
        address1_labels = (
            'AddressNumber',
            'AddressNumberPrefix',
            'AddressNumberSuffix',
            'StreetName',
            'StreetNamePreDirectional',
            'StreetNamePreModifier',
            'StreetNamePreType',
            'StreetNamePostDirectional',
            'StreetNamePostModifier',
            'StreetNamePostType',
            'CornerOf',
            'IntersectionSeparator',
            'LandmarkName',
            'USPSBoxGroupID',
            'USPSBoxGroupType',
            'USPSBoxID',
            'USPSBoxType',
            'OccupancyType',
            'OccupancyIdentifier',
            'SubaddressIdentifier',
            'SubaddressType',
        )
        field_map = dict.fromkeys(address1_labels, 'address1')
        field_map.update({
            'Recipient': 'recipient',
            'BuildingName': 'address2',
            'PlaceName': 'city',
            'StateName': 'state',
            'ZipCode': 'zip_code',
            'CountryName': 'country',
        })

        tagged, _address_type = usaddress.tag(address, tag_mapping=field_map)
        return tagged
Пример #19
0
def clean_street(address1,
                 address2="",
                 *,
                 zipcode=None,
                 strip_occupancy=False):
    """
    Clean street address.

    If a zipcode is passed in, we double check to make sure the address was
    parsed correctly.

    If strip_occupancy is true, remove components that don't affect geocoding,
    like floor, apartment, unit, etc.

    Returns the cleaned street string. The concatenated input is returned
    unchanged when it is empty, cannot be parsed, or is ambiguous; when
    ``address1`` is a "c/o" line, ``address2`` is returned as is.

    NOTE: `usaddress` was trained with full addresses, not just street address.
    """
    # matchers that rely on knowing address lines
    # Strip "care of" lines
    if re.match(r"c/o ", address1, re.IGNORECASE):
        return address2

    # concatenate line 1 and line 2
    address = "{} {}".format(address1, address2).strip()

    if not address:
        # XXX Should this be an exception?
        return address

    # Add arbitrary city/state/zip to get `usaddress` to parse the address
    # as just the street address and not a full address
    address_to_parse = ("{}, Austin, TX {}".format(address, zipcode)
                        if zipcode else address)
    try:
        addr, type_ = usaddress.tag(address_to_parse)
    except usaddress.RepeatedLabelError:
        logger.warning("Unparseable address: {}".format(address_to_parse))
        return address

    if zipcode:
        # Drop the helper city/state/zip again and verify the zip survived
        # the round trip. A plain comparison is used instead of ``assert``,
        # which is stripped under ``python -O`` and would silence the check.
        addr.pop("PlaceName", None)
        addr.pop("StateName", None)
        guessed_zip = addr.pop("ZipCode", None)
        if guessed_zip is None:
            logger.warning("Expected zipcode %s but found none", zipcode)
        elif guessed_zip != zipcode:
            logger.warning("Guessed the wrong zipcode {} != {}".format(
                guessed_zip, zipcode))
    if strip_occupancy:
        addr.pop("OccupancyType", None)
        addr.pop("OccupancyIdentifier", None)
    if type_ in ("Street Address", "PO Box"):
        return " ".join(
            [component_format(label, value) for label, value in addr.items()])
    logger.warning("Ambiguous address: {}".format(address), extra=addr)
    return address
Пример #20
0
def format_address_data(address_data, county_name):
    """Build the address dict for a South Carolina county election office.

    Args:
        address_data: raw address string, tagged with ``usaddress.tag``
            using ``electionsaver.addressSchemaMapping``.
        county_name: county name, used for the default location name and
            in error messages.

    Returns:
        A dict with ``state`` ("SC") and ``locationName``, plus the
        title-cased ``city``, ``streetNumberName``, ``poBox`` and
        ``aptNumber`` and the unchanged ``zipCode`` when tagged.

    Raises:
        WalkTheVoteError: when tagging the address fails.
    """
    mapping = electionsaver.addressSchemaMapping

    try:
        parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]
    except Exception as e:
        # Report the input that failed; the parsed dict is still empty at
        # this point, so printing it carried no information.
        raise WalkTheVoteError(
            f"Error with data for {county_name} county, data is {address_data}"
        ) from e

    final_address = {"state": "SC"}

    if "city" in parsed_data_dict:
        final_address["city"] = parsed_data_dict["city"].title()
    if "zipCode" in parsed_data_dict:
        final_address["zipCode"] = parsed_data_dict["zipCode"]
    if "streetNumberName" in parsed_data_dict:
        final_address["streetNumberName"] = parsed_data_dict[
            "streetNumberName"].title()
    if "poBox" in parsed_data_dict:
        final_address["poBox"] = parsed_data_dict["poBox"].title()
    # Only the generated default is title-cased; a tagged name is kept as is.
    final_address["locationName"] = parsed_data_dict.get(
        "locationName",
        f"{county_name} County Board of Voter Registration & Elections".title(
        ))
    if "aptNumber" in parsed_data_dict:
        final_address["aptNumber"] = parsed_data_dict["aptNumber"].title()
    return final_address
Пример #21
0
def test_api_parse_succeeds(client):
    """Check that tagging a simple address yields a tuple."""
    # TODO: Finish this test. Send a request to the API (the ``client``
    # fixture is not used yet) and confirm that the data comes back in the
    # appropriate format.
    tagged = usaddress.tag('123 main st chicago il')

    assert type(tagged) is tuple
Пример #22
0
def search_by_address(address):
    """Return the properties matching the given address.

    The address is tagged with usaddress. Only addresses classified as
    'Street Address' are sent to ``BASE_URL``; anything else yields an empty
    list without a request.

    :param address: (str) the address associated with the property
    :return: list of address candidates, sorted by score, best first
    """
    tags, address_type = usaddress.tag(address)
    if address_type != 'Street Address':
        return []

    # Skip components that were not tagged so the street query contains no
    # doubled or trailing spaces.
    street_parts = (
        tags.get('AddressNumber', ''),
        tags.get('StreetNamePreDirectional', ''),
        tags.get('StreetName', ''),
        tags.get('StreetNamePostType', ''),
    )
    street = ' '.join(part for part in street_parts if part)

    payload = (('Street', street), ('City', tags.get('PlaceName', '')),
               ('State', tags.get('StateName', '')),
               ('ZIP', tags.get('ZipCode', '')), ('f', 'json'))

    # TODO: asynchronous?
    r = requests.post(BASE_URL, params=payload)

    candidates = list(r.json()['candidates'])
    # Sort by 'score'
    candidates.sort(key=_sort_by_score, reverse=True)
    return candidates
Пример #23
0
    def usaddress_tag(self, address_str):
        """
        Tag an address string and classify it as PO Box or Street Address.

        The address type reported by usaddress is ignored (it almost always
        comes back as "Ambiguous"). Instead a simple convention is used: if
        a USPSBoxID was tagged the address is a PO Box, otherwise it is a
        Street Address.

        When usaddress raises RepeatedLabelError, the tagged parts are
        rebuilt from the error's parsed_string, keeping the last value seen
        for each label.

        Input:
            address_str: string of address from input file
        Output:
            Tuple of (dictionary of tagged address parts, address type)
        """
        try:
            tagged_parts = usaddress.tag(address_str)[0]
        except usaddress.RepeatedLabelError as err:
            # A later occurrence of a label overwrites an earlier one;
            # there is probably a better rule.
            tagged_parts = {}
            for token, label in err.parsed_string:
                tagged_parts[label] = token

        # if contains a PO Box ID consider it a PO Box, else a Street Address
        kind = 'PO Box' if 'USPSBoxID' in tagged_parts else 'Street Address'
        return tagged_parts, kind
Пример #24
0
def format_street_number(street_number_name, county_name):
    """Build the street-level address dict for a county election office.

    The input is tagged with ``usaddress.tag`` using
    ``electionsaver.addressSchemaMapping`` (for Glenn, ", 2nd Street" is
    removed first). San Francisco always gets a hard-coded City Hall
    address, but its input is still tagged beforehand.

    Returns:
        A dict with ``locationName`` (the tagged one if present, otherwise
        "<county> County Election Office") plus any tagged ``aptNumber``
        and ``streetNumberName``.
    """
    mapping = electionsaver.addressSchemaMapping

    text_to_tag = street_number_name
    if county_name == "Glenn":
        text_to_tag = text_to_tag.replace(", 2nd Street", "")

    tagged = usaddress.tag(text_to_tag, tag_mapping=mapping)[0]

    if county_name == "San Francisco":
        return {
            "streetNumberName": "1 Dr. Carlton B Goodlett Place",
            "locationName": "City Hall",
            "aptNumber": "Room 48",
        }

    final_address = {"locationName": f"{county_name} County Election Office"}
    for field in ("aptNumber", "streetNumberName", "locationName"):
        if field in tagged:
            final_address[field] = tagged[field]

    return final_address
Пример #25
0
def format_address_data(address, county_name):
    """Build the address dict for an Ohio county election office.

    The address is tagged with ``usaddress.tag`` using
    ``electionsaver.addressSchemaMapping``. A missing ``zipCode`` raises
    ``KeyError``. Vinton and Brown have hard-coded corrections for the
    street (when none was tagged) and the location name (when one was
    tagged); Vinton never gets an ``aptNumber``.

    Returns:
        A dict with ``state`` ("Ohio") and ``zipCode`` plus whichever of
        ``streetNumberName``, ``city``, ``poBox``, ``locationName`` and
        ``aptNumber`` apply.
    """
    mapping = electionsaver.addressSchemaMapping
    tagged = usaddress.tag(address, tag_mapping=mapping)[0]

    street_fallbacks = {
        "Vinton": "31935 OH-93",
        "Brown": "800 Mt. Orab Pike",
    }
    location_overrides = {
        "Vinton": "Community Building",
        "Brown": "Administrative Building",
    }

    final_address = {"state": "Ohio", "zipCode": tagged["zipCode"]}

    if "streetNumberName" in tagged:
        final_address["streetNumberName"] = tagged["streetNumberName"]
    elif county_name in street_fallbacks:
        final_address["streetNumberName"] = street_fallbacks[county_name]

    for field in ("city", "poBox"):
        if field in tagged:
            final_address[field] = tagged[field]

    if "locationName" in tagged:
        final_address["locationName"] = location_overrides.get(
            county_name, tagged["locationName"])

    if "aptNumber" in tagged and county_name != "Vinton":
        final_address["aptNumber"] = tagged["aptNumber"]
    return final_address
Пример #26
0
def standardize(address, code = "a"):
    """Return a standardised, space-joined form of *address*.

    The address is upper-cased and tagged with ``usaddress.tag`` using
    ``label_mappings``. Punctuation is stripped from every component except
    'HN', replacements are applied through ``clean`` with
    ``processing_dict``, and codes from ``code_dict`` are optionally added.

    Args:
        address: the address; converted with ``str()`` before tagging.
        code: "a" adds a ``<label>C`` code entry next to each coded
            component, "r" replaces the component with its code entry,
            "n" adds no codes.

    Returns:
        All resulting component values joined with single spaces. When
        'WSDESC1' and/or 'WSID1' are present, a combined 'WS' entry is
        included as well.

    Raises:
        InputError: when ``code`` is not one of "a", "r" or "n".
    """
    if code not in ("a", "r", "n"):
        raise InputError("code must be a (append), r (replace), or n (none)")

    # make case insensitive, apply usaddress parsing
    tagged = usaddress.tag(str(address).upper(), label_mappings)[0]

    # remove punctuation from results (not removed beforehand, as punctuation
    # can affect parsing)
    drop_punctuation = str.maketrans('', '', string.punctuation)
    stripped = {}
    for label, words in tagged.items():
        if label == 'HN':
            stripped[label] = words
        else:
            stripped[label] = words.translate(drop_punctuation).strip()

    # apply replacements
    substituted = clean(stripped, processing_dict)

    # add codes for directions, extensions, etc. if desired
    if code != "n":
        # Iterate over a snapshot because entries are added and removed.
        for label, word in list(substituted.items()):
            # confirm label is substitutable and substitution is known
            if label not in code_dict or word not in code_dict.get(label):
                continue
            substituted[label+"C"] = code_dict[label].get(word)
            # remove original value if requested
            if code == "r":
                substituted.pop(label)

    # add concatenated WSN
    has_desc = "WSDESC1" in substituted
    has_id = "WSID1" in substituted
    if has_desc and has_id:
        substituted["WS"] =" ".join([substituted["WSDESC1"], substituted["WSID1"]])
    elif has_desc:
        substituted["WS"] = substituted["WSDESC1"]
    elif has_id:
        substituted["WS"] = substituted["WSID1"]

    return ' '.join(str(value) for value in substituted.values())
Пример #27
0
def appendMailingAddress(outrow, row):
    """Fill in the mailing-address fields of ``outrow``.

    The four ``MAILADD*`` columns of ``row`` are joined and tagged with
    ``usaddress``.  When tagging yields components they are handed to
    ``PrepareUtils.appendMailingAddressFromTaggedFields``; when tagging fails
    with ``RepeatedLabelError`` or yields nothing, the mailing fields are
    derived from values already present in ``outrow``.

    :param outrow: output mapping, updated in place.
    :param row: input mapping holding ``MAILADD1`` .. ``MAILADD4``.
    """
    raw_mailing_address = ' '.join(
        [row['MAILADD1'], row['MAILADD2'], row['MAILADD3'], row['MAILADD4']])

    try:
        tagged_address, address_type = usaddress.tag(raw_mailing_address)
    except usaddress.RepeatedLabelError:
        print("Can't parse mailing address. Falling back to res")
        tagged_address = {}

    if len(tagged_address) > 0:
        PrepareUtils.appendMailingAddressFromTaggedFields(
            outrow, tagged_address, address_type)
        return

    # Fallback: build the mailing address from fields already in outrow.
    fallback_fields = {
        'MAIL_ADDRESS_LINE1': PrepareUtils.constructMailAddr1FromOutRow(outrow),
        'MAIL_ADDRESS_LINE2': PrepareUtils.constructMailAddr2FromOutRow(outrow),
        'MAIL_CITY': outrow['PLACE_NAME'],
        'MAIL_STATE': outrow['STATE_NAME'],
        'MAIL_ZIP_CODE': outrow['ZIP_CODE'],
        'MAIL_COUNTRY': 'USA',
    }
    outrow.update(fallback_fields)
Пример #28
0
 def getparts(self):
     """Tag ``self.prepped`` and return the tagged components.

     ``self._type`` is set to the address type reported by ``usaddress``.
     When tagging raises ``RepeatedLabelError`` the error's ``parsed_string``
     is wrapped in an ``OrderedDict`` and the type becomes ``"Questionable"``.
     """
     try:
         parts, kind = usaddress.tag(self.prepped)
     except usaddress.RepeatedLabelError as err:
         parts = OrderedDict(err.parsed_string)
         kind = "Questionable"
     self._type = kind
     return parts
Пример #29
0
def parse_address_string(addr_str):
    # type: (str) -> MutableMapping[str, str]
    """Split an address string into its usaddress components.

    The string is tagged with ``usaddress.tag``.  If usaddress reports the
    address type as ``'Ambiguous'``, or any resulting label appears in
    ``AMBIGUOUS_LABELS``, an ``AmbiguousAddressError`` is raised.  Otherwise
    the components are passed through ``handle_abnormal_occupancy`` together
    with the original string and that result is returned.

    :param addr_str: str address to be processed.
    :type addr_str: str
    :return: mapping of usaddress labels to values
    :rtype: MutableMapping
    :raises AmbiguousAddressError: when the parse is ambiguous as described.
    """
    parsed_addr, address_type = usaddress.tag(addr_str)

    # The address was parseable, but an ambiguous result could corrupt data
    # downstream, so it is rejected instead of being returned.
    if address_type == 'Ambiguous':
        raise AmbiguousAddressError()
    if any(label in AMBIGUOUS_LABELS for label in parsed_addr.keys()):
        raise AmbiguousAddressError()

    return handle_abnormal_occupancy(parsed_addr, addr_str)
Пример #30
0
    def get(self, request):
        """Tag the ``address`` query parameter and return it as a ``Response``.

        The raw result of ``usaddress.tag`` is printed and then returned
        unchanged inside the response.
        """
        tagged = usaddress.tag(request.GET.get('address'))
        print(tagged)
        return Response(tagged)
Пример #31
0
    def extract_mailing_address(self, input_dict):
        """Build the ``MAIL_*`` fields from the mailing columns of a record.

        :param input_dict: mapping with ``MAIL_ADDR1``, ``MAIL_ADDR2`` and
            ``MAIL_ADDR3`` string values.
        :return: dict of ``MAIL_*`` fields, or an empty dict when
            ``MAIL_ADDR1`` is blank, the address is ambiguous, tagging yields
            nothing, or usaddress raises ``RepeatedLabelError``.
        """
        if not input_dict['MAIL_ADDR1'].strip():
            return {}

        try:
            raw_address = ' '.join([
                input_dict['MAIL_ADDR1'],
                input_dict['MAIL_ADDR2'],
                input_dict['MAIL_ADDR3'],
            ])
            tagged_address, address_type = usaddress.tag(raw_address)

            if address_type == 'Ambiguous':
                print("Warn - {}: Ambiguous mailing address falling back to residential ({})".format(address_type, input_dict['MAIL_ADDR1']))
                tagged_address = {}

            if not len(tagged_address) > 0:
                return {}

            return {
                'MAIL_ADDRESS_LINE1': self.construct_mail_address_1(
                    tagged_address,
                    address_type,
                ),
                'MAIL_ADDRESS_LINE2': self.construct_mail_address_2(tagged_address),
                'MAIL_CITY': tagged_address.get('PlaceName', ""),
                'MAIL_ZIP_CODE': tagged_address.get('ZipCode', ""),
                'MAIL_STATE': tagged_address.get('StateName', ""),
                'MAIL_COUNTRY': ""
            }
        except usaddress.RepeatedLabelError as e:
            print("Warn: Can't parse mailing address. Falling back to residential ({})".format(e.parsed_string))
            return {}
Пример #32
0
def label_parsing(address):
    """Tag an address and return its cleaned components as a dict.

    The address is upper-cased and tagged using ``label_mappings``.
    Punctuation is stripped from every value except ``HN``.  If ``HN``
    contains non-numeric characters that split it into exactly two parts,
    ``HN1``, ``HNSEP`` and ``HN2`` are added.  A combined ``WS`` entry is
    added when ``WSDESC1`` and/or ``WSID1`` are present.

    :param address: address string to parse.
    :return: dict of label -> value, or ``{'ERROR': address}`` when tagging
        raises ``RepeatedLabelError``.
    """
    try:
        # Upper-casing makes the parsing case insensitive.
        tagged = tag(address.upper(), label_mappings)
    except RepeatedLabelError:
        # Parsing failed: hand the untouched input back under 'ERROR'.
        return {'ERROR' : address}

    components = tagged[0]

    # Punctuation is removed after tagging since it can affect the parse.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = {}
    for label, words in components.items():
        if label == 'HN':
            stripped[label] = words
        else:
            stripped[label] = words.translate(punctuation_table).strip()

    if 'HN' in stripped:
        house_number = stripped['HN']
        # Everything that is not a numeric character forms the separator.
        separator = "".join(ch for ch in house_number if not ch.isnumeric())
        if separator:
            pieces = house_number.split(separator)
            if len(pieces) == 2:
                stripped["HN1"] = pieces[0]
                stripped["HNSEP"] = separator
                stripped["HN2"] = pieces[1]

    # Concatenate whichever WS parts exist, description first.
    ws_parts = [stripped[key] for key in ("WSDESC1", "WSID1") if key in stripped]
    if ws_parts:
        stripped["WS"] = " ".join(ws_parts)
    return stripped
def process_row(row, commit=False):
    """Create a ``CSSCase`` from one input row when ``commit`` is true.

    Columns 1, 2, 5, 6 and 9 of ``row`` are read as address number, street
    name, description, resolution and the closed flag.  Number and street are
    re-parsed with ``usaddress``; a case is created only when the result is
    typed ``'Street Address'`` and has an integer address number and a
    street name.  Nothing is persisted when ``commit`` is false.

    :param row: indexable record with at least ten columns.
    :param commit: when false the row is only read, never stored.
    :return: always ``None``.
    """
    address_number = row[1]
    street_name = row[2]
    description = row[5].strip()
    resolution = row[6].strip()
    closed = row[9]

    if not commit:
        return

    closed_status, _ = CaseStatus.objects.get_or_create(name='Closed')
    active_status, _ = CaseStatus.objects.get_or_create(name='Active')

    address = "{} {}".format(address_number, street_name)
    tagged, address_type = usaddress.tag(address)
    if address_type != 'Street Address':
        return

    address_number = tagged.get('AddressNumber')
    street_name = tagged.get('StreetName')

    try:
        address_number = int(address_number)
    except (TypeError, ValueError):
        # Missing (None) or malformed address number: skip the row instead
        # of crashing on int(None).
        return

    if address_number and street_name:
        CSSCase.objects.get_or_create(
            description=description,
            resolution=resolution,
            status=closed_status if closed else active_status,
            address_number=address_number,
            street_name=street_name.upper()
        )
Пример #34
0
def formatAddressData(address, countyName):
    """Convert a raw address string into the final address dictionary.

    The address is tagged with ``usaddress`` using
    ``electionsaver.addressSchemaMapping`` as the tag mapping, then copied
    into a new dict with a fixed ``state`` of ``"Illinois"`` and a few
    county-specific overrides for Cumberland and Mason.

    :param address: raw address string.
    :param countyName: county name used to select the overrides.
    :return: dict with ``state``, ``zipCode`` and whichever optional keys
        could be filled in.
    :raises KeyError: when the parsed address has no ``zipCode``.
    """
    parsedDataDict = usaddress.tag(
        address, tag_mapping=electionsaver.addressSchemaMapping)[0]

    finalAddress = {
        "state": "Illinois",
        "zipCode": parsedDataDict["zipCode"],
    }

    # Street line: the parsed value wins; otherwise two counties have a
    # hard-coded fallback.
    fallbackStreets = {
        "Cumberland": "140 COURTHOUSE SQUARE",
        "Mason": "100 NORTH BROADWAY",
    }
    if "streetNumberName" in parsedDataDict:
        finalAddress["streetNumberName"] = parsedDataDict["streetNumberName"].title()
    elif countyName in fallbackStreets:
        finalAddress["streetNumberName"] = fallbackStreets[countyName].title()

    # City: only set when parsed; Mason is always forced to Havana.
    if "city" in parsedDataDict:
        if countyName == "Mason":
            finalAddress["city"] = "HAVANA".title()
        else:
            finalAddress["city"] = parsedDataDict["city"].title()

    # PO box: only set when parsed; two counties override the parsed value.
    if "poBox" in parsedDataDict:
        poBoxOverrides = {
            "Cumberland": 'PO BOX 146',
            "Mason": "PO BOX 77",
        }
        if countyName in poBoxOverrides:
            finalAddress["poBox"] = poBoxOverrides[countyName].title()
        else:
            finalAddress["poBox"] = parsedDataDict["poBox"]

    if "locationName" in parsedDataDict:
        finalAddress["locationName"] = parsedDataDict["locationName"]
    if "aptNumber" in parsedDataDict:
        finalAddress["aptNumber"] = parsedDataDict["aptNumber"].title()
    return finalAddress
Пример #35
0
def lookup_geo(g, ady, verbose=False):
    """Geocode an address string and return its location schema object.

    The address is tagged with ``usaddress``; the house number, the joined
    ``StreetName*`` components and a normalized borough are passed to
    ``g.address``.  The returned mapping is used to build a ``RefLocation``.

    :param g: geocoder object exposing ``address(number, street, borough)``.
    :param ady: address string to look up.
    :param verbose: when true, print the input and the parsed pieces.
    :return: ``RefLocation(...).schema_object()`` for the geocoded address.
    """
    # print() with a single argument behaves the same on Python 2 and 3.
    if verbose:
        print('Lookup_geo:\n\t%s' % ady)

    tagged = usaddress.tag(ady)
    tags, _ = tagged

    addressNumber = tags.get('AddressNumber', '')
    streetName = ' '.join(
        v for k, v in tags.items() if k.startswith('StreetName'))
    borough = tags.get('PlaceName', '').lower()

    # Collapse the place name to a canonical borough; the first match wins.
    if 'ny' in borough or 'manhattan' in borough:
        borough = 'manhattan'
    else:
        for name in ('queens', 'brooklyn', 'bronx', 'staten island'):
            if name in borough:
                borough = name
                break

    if verbose:
        # Reuse the parse result instead of tagging the address again.
        print(tagged)
        print('adNumber: %s\t\tstName: %s\t\tBorough:%s\n\n' % (addressNumber,
                                                                streetName,
                                                                borough))

    dic = g.address(addressNumber, streetName, borough)
    zipcode = dic.get('zipCode', '')
    streetAddress = '%s %s' % (dic.get('houseNumber', ''),
                               dic.get('firstStreetNameNormalized', ''))

    borough = dic.get('firstBoroughName', '')
    longitude = dic.get('longitude', '')
    latitude = dic.get('latitude', '')

    place = RefLocation(streetAddress, borough, zipcode, latitude, longitude)
    return place.schema_object()
Пример #36
0
def searchParse(query, filters):
    """Redirect a search query to the report or the listings view.

    The query is stringified, normalized with ``string.capwords`` and tagged.
    A result typed ``'Street Address'`` redirects to ``.report``; anything
    else redirects to ``.listings`` with the given filters.
    """
    query = string.capwords(str(query))
    # tag() returns (parsed components, address type).
    address_type = usaddress.tag(query)[1]
    if address_type != 'Street Address':
        return redirect(url_for('.listings', address=query, filters=filters))
    return redirect(url_for('.report', address=query))
Пример #37
0
def tag_address(address):
    """Tag an address by its individual components.

    :return: the ``usaddress.tag`` result, or ``None`` when tagging raises
        ``RepeatedLabelError`` or ``UnicodeEncodeError``.
    """
    try:
        return usaddress.tag(address)
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        return None
Пример #38
0
 def get_region(self):
     """Derive ``self.region`` from the place name in ``self.incd_address``.

     Does nothing when ``self.incd_address`` is empty.  Otherwise the address
     is tagged; if a ``PlaceName`` is found, the text after its last comma
     (stripped) is stored in ``self.region``.  When usaddress raises
     ``RepeatedLabelError``, ``self.region`` is set to ``None``.
     """
     if not self.incd_address:
         return
     try:
         tag = usaddress.tag(self.incd_address)[0]
     except usaddress.RepeatedLabelError:
         self.region = None
         return
     # `in` replaces dict.has_key(), which does not exist on Python 3.
     if 'PlaceName' in tag:
         # str.split always yields at least one element.
         self.region = tag['PlaceName'].split(",")[-1].strip()
Пример #39
0
def generateListing(query):
    """Return the ``Listing`` for ``query``, creating it when missing.

    An existing listing whose ``raw_add`` equals ``query`` is returned as is.
    Otherwise the query is tagged with ``usaddress``, a new ``Listing`` is
    built from the parsed parts plus values derived from ``getHash(query)``,
    related ``Tax``, ``School``, ``Crime``, ``Geo`` and ``Image`` rows are
    created, and everything is committed.

    :param query: raw address string.
    :return: the existing or newly created ``Listing``.
    """
    existing = Listing.query.filter_by(raw_add=query).first()
    if existing:
        return existing

    # not currently in db, note not robust
    add_Dict = usaddress.tag(query)[0]
    state = add_Dict["StateName"] if "StateName" in add_Dict else "Unknown"
    city = add_Dict["PlaceName"] if "PlaceName" in add_Dict else "Unknown"
    zipcode = add_Dict["ZipCode"] if "ZipCode" in add_Dict else "Unknown"

    # The number is added bare; the later parts are each prefixed by a space.
    street_address = ""
    street_parts = (
        ("AddressNumber", ""),
        ("StreetName", " "),
        ("StreetNamePostType", " "),
    )
    for label, prefix in street_parts:
        if label in add_Dict:
            street_address += prefix
            street_address += add_Dict[label]

    hashed = getHash(query)
    lis = Listing(
        raw_add=query,
        street_address=street_address,
        state=state,
        city=city,
        zipcode=zipcode,
        area=getArea(hashed),
        price=getPrice(hashed),
        bedrooms=getRm(hashed),
        bathrooms=(getRm(hashed) - 1),
        realtor=getRealtor(hashed),
        seller=getSeller(hashed),
        school_district=getSchoolDis(hashed),
    )
    db.session.add(lis)

    # populate tabs
    tax = Tax(rate=getTaxRate(hashed), listing=lis)
    schools = School(elementarySchool=getElemSchool(hashed), highSchool=getHighSchool(hashed), listing=lis)
    crim = Crime(rate=getCrimeRate(hashed), most_frequent_crime=getFrequentCrime(hashed), listing=lis)
    geo = Geo(most_frequent_incident=getGeoIncident(hashed), most_recent_incident=getDate(hashed), listing=lis)
    img = Image(path=getHouseImage(hashed), listing=lis)
    for record in (img, tax, schools, crim, geo):
        db.session.add(record)
    db.session.commit()
    return lis
Пример #40
0
def _parse(address_string):
    """Parse an address string into atx address parts.

    The string goes through ``_sanitize`` and ``_pre_hack``, is tagged with
    ``usaddress``, then converted by ``_translate_to_atx`` and ``_post_hack``.

    :raises TypeError: when ``address_string`` is not exactly a ``str``.
    :return: the value produced by ``_post_hack``.
    """
    if type(address_string) is not str:
        raise TypeError(Messages.str_req)

    sanitized = _sanitize(address_string)
    prepared = _pre_hack(sanitized)
    tagged = usaddress.tag(prepared)
    translated = _translate_to_atx(tagged)
    return _post_hack(translated)
Пример #41
0
def addresscheck(df):
    """Sort the rows of ``df`` into four module-level result frames.

    For each row, columns 1, 2, 4 and 5 are joined into an address string and
    tagged with ``usaddress``.  Rows whose type is not ``'Ambiguous'`` go to
    ``buffadd`` (address mentions "University at Buffalo") or ``usadd``;
    ambiguous rows go to ``wrongbuff`` or ``wrongaddress`` on the same basis.

    :param df: DataFrame whose rows are classified; it is not modified.
    """
    for _, row in df.iterrows():
        addressstring = ' '.join([row[1], row[2], row[4], row[5]])
        address_type = usaddress.tag(addressstring)[1]
        is_campus = 'University at Buffalo' in addressstring

        if address_type != 'Ambiguous':
            target = buffadd if is_campus else usadd
        else:
            target = wrongbuff if is_campus else wrongaddress

        # Append the row itself: iterrows() yields index *labels*, which are
        # not valid positions for df.iloc unless the index is 0..n-1.
        target.loc[len(target)] = row
Пример #42
0
    def parse_with_usaddress_tag(self, addr_str):
        """
        Parse an address string with usaddress's `tag()` function.

        Returns a list of ``{'code': ..., 'value': ...}`` dicts, one per
        tagged component, where ``code`` is looked up in
        ``self.standard_part_mapping``.  Raises ``AddressParserError`` when
        usaddress raises ``RepeatedLabelError``.
        """
        try:
            tagged_parts = usaddress.tag(addr_str)[0]
        except usaddress.RepeatedLabelError:
            # FIXME: Add richer logging here with contents of `rle` or chain exception w/ Python 3
            # FIXME: Shouldn't leak details of 'tag' method since it is no longer a param
            raise AddressParserError("Could not parse address '{}' with 'tag' method".format(addr_str))

        addr_parts = []
        for label, value in tagged_parts.items():
            addr_parts.append({'code': self.standard_part_mapping[label], 'value': value})
        return addr_parts
Пример #43
0
def listings(address,filters):
    """Render the listings page for a searched address.

    The address is tagged with ``usaddress``.  Listings are looked up by zip
    code when one was parsed, otherwise by city and state when both were
    parsed; if neither applies a message is flashed and no listings are
    shown.  The result is passed through ``apply_filters`` before rendering.

    :param address: raw address string typed by the user.
    :param filters: active filters, forwarded to ``apply_filters`` and the
        template.
    :return: the rendered ``listings.html`` template.
    """
    parsed = usaddress.tag(address)[0]
    results = []
    if 'ZipCode' in parsed:
        results = list(Listing.query.filter_by(zipcode=parsed['ZipCode']).order_by(Listing.timestamp.desc()))
    # Each key needs its own membership test: `'A' and 'B' in d` only
    # checks 'B'.
    elif 'PlaceName' in parsed and 'StateName' in parsed:
        state = parsed['StateName'].upper()
        city = parsed['PlaceName']
        if 'StreetNamePreDirectional' in parsed and 'StreetName' in parsed:
            # Prefix the place name with the directional and street-name parts.
            city = parsed['StreetNamePreDirectional'] + " " + parsed['StreetName'] + " " + city
        results = list(Listing.query.filter_by(city=city,state=state).order_by(Listing.timestamp.desc()))
    else:
        flash('Please enter a valid address including either a zipcode or city name and state.')
    results = apply_filters(results, filters)
    return render_template('listings.html',act_filters=filters,query=address, lots=False,address=address,favs=False,search=g.search, searchbar=True, listings=results, count=len(results))
Пример #44
0
 def rend(self):
     """Split ``self.addr`` on ``';'`` into a two-element tuple.

     * No ``';'``: returns ``(self.addr, '')``.
     * Non-empty text after the first ``';'``: returns the first two parts.
     * Empty second part: the first part is tagged with ``usaddress``.  If it
       lacks a ``PlaceName`` or a ``StateName`` the result is
       ``(first, None)``; if it has both the result is ``('', first)``; if
       tagging raises ``RepeatedLabelError`` the result is ``(first, '')``.
     """
     if ';' not in self.addr:
         return self.addr, ''

     temp = self.addr.split(';')
     if temp[1] != '':
         return temp[0], temp[1]

     try:
         # Tag once and reuse the result for both membership checks.
         tagged = usaddress.tag(temp[0])[0]
     except usaddress.RepeatedLabelError:
         return temp[0], ''

     if 'PlaceName' not in tagged or 'StateName' not in tagged:
         return temp[0], None
     return '', temp[0]
Пример #45
0
def addresscheck(df,usadd,wrongaddress):
    """Sort the rows of ``df`` into ``usadd`` and ``wrongaddress``.

    Each row's ``Address 1``, ``Address 2``, ``City``, ``State`` and
    ``Zipcode`` values are joined and tagged with ``usaddress``:

    * tagging raises ``RepeatedLabelError``: the row goes to
      ``wrongaddress``;
    * the address contains ``14261``, ``14260``, ``Clement`` or ``Goodyear``:
      the row is skipped;
    * otherwise the row goes to ``usadd`` unless the address type is
      ``'Ambiguous'``, in which case it goes to ``wrongaddress``.

    :param df: DataFrame to classify; it is not modified.
    :param usadd: DataFrame receiving rows with a usable address.
    :param wrongaddress: DataFrame receiving unparseable/ambiguous rows.
    """
    columns = ('Address 1', 'Address 2', 'City', 'State', 'Zipcode')
    skipped_markers = ('14261', '14260', 'Clement', 'Goodyear')

    for _, row in df.iterrows():
        addressstring = ' '.join(unicode(row[column]) for column in columns)
        try:
            address_type = usaddress.tag(addressstring)[1]
        except usaddress.RepeatedLabelError:
            # Record the failure and move on; falling through would read a
            # missing or stale parse result from the previous row.
            wrongaddress.loc[len(wrongaddress)] = row
            continue

        if any(marker in addressstring for marker in skipped_markers):
            continue

        target = usadd if address_type != 'Ambiguous' else wrongaddress
        # Append the row itself; iterrows() labels are not iloc positions.
        target.loc[len(target)] = row
Пример #46
0
def clean_street(address1, address2='', zipcode=None):
    """
    Clean street address.

    Lines starting with "c/o " (any case) are dropped in favor of
    ``address2``.  Otherwise both lines are joined, tagged with
    ``usaddress`` and re-assembled through ``component_format``.

    :param address1: first address line.
    :param address2: optional second address line.
    :param zipcode: optional zip code; when given, a dummy city/state plus
        this zip are appended before tagging and stripped again afterwards.
    :return: the cleaned street string, or the joined input when it is empty,
        unparseable, or not a street address / PO box.
    """
    # matchers that rely on knowing address lines
    # Strip "care of" lines
    if re.match(r'c/o ', address1, re.IGNORECASE):
        return address2

    # concatenate line 1 and line 2
    address = '{} {}'.format(address1, address2).strip()

    if not address:
        return address

    # Add arbitrary city/state/zip to get `usaddress` to parse the address
    # as just the street address and not a full address
    address_to_parse = '{}, Austin, TX {}'.format(address, zipcode) if zipcode else address
    try:
        addr, type_ = usaddress.tag(address_to_parse)
    except usaddress.RepeatedLabelError:
        logger.warning('Unparseable address: {}'.format(address_to_parse))
        return address

    if zipcode:
        # Remove the dummy city/state again and sanity-check the zip.
        addr.pop('PlaceName', None)
        addr.pop('StateName', None)
        guessed_zip = addr.pop('ZipCode', None)
        if guessed_zip is None:
            logger.warning('Expected a zipcode {} but found none'.format(zipcode))
        elif guessed_zip != zipcode:
            logger.warning('Guessed the wrong zipcode {} != {}'
                           .format(guessed_zip, zipcode))

    if type_ in ('Street Address', 'PO Box'):
        return ' '.join(
            [component_format(label, value) for label, value in addr.items()])

    logger.warning('Ambiguous address: {}'.format(address), extra=addr)
    return address
Пример #47
0
def create_address_url(raw_address_text):
    """Build URL fragments from a raw address string.

    The address is tagged with ``usaddress``.  The street part is made from
    the number, street name, post type and occupancy components that are
    present; the city/state/zip part from ``PlaceName``, ``StateName`` and
    ``ZipCode``.  Spaces are replaced by ``+``.

    :param raw_address_text: address string to convert.
    :return: tuple ``(street, citystatezip, street + "," + citystatezip)``.
    """
    address_ordered_dict = usaddress.tag(raw_address_text)[0]

    street_keys = ('AddressNumber', 'StreetName', 'StreetNamePostType',
                   'OccupancyType', 'OccupancyIdentifier')
    street_parts = [
        address_ordered_dict[key]
        for key in street_keys
        if address_ordered_dict.get(key) is not None
    ]
    address_url_encode = ' '.join(street_parts).replace(' ','+').strip()

    # "%2C" is the URL-encoded comma between city and state.
    citystatezip_string = ''.join([
        address_ordered_dict.get('PlaceName',''),
        '%2C ',
        address_ordered_dict.get('StateName',''),
        ' ',
        address_ordered_dict.get('ZipCode',''),
    ])
    citystatezip_url_encode = citystatezip_string.strip().replace(' ','+')

    address_for_walkscore = address_url_encode + "," + citystatezip_url_encode
    return address_url_encode, citystatezip_url_encode, address_for_walkscore
Пример #48
0
def _normalize_address_str(address_val):
    """
    Normalize the address to conform to short abbreviations.

    If an invalid address_val is provided, None is returned.

    If a valid address is provided, a normalized version is returned.
    """

    # if this string is empty the regular expression in the sa wont
    # like it, and fail, so leave returning nothing
    if not address_val:
        return None

    # now parse the address into number, street name and street type
    addr = usaddress.tag(str(address_val))[0]  # TODO: should probably use unicode()
    normalized_address = ''

    if not addr:
        return None

    if 'AddressNumber' in addr and addr['AddressNumber'] is not None:
        normalized_address = addr['AddressNumber'].lstrip("0")  # some addresses have leading zeros, strip them here

    if 'StreetNamePreDirectional' in addr and addr['StreetNamePreDirectional'] is not None:
        normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePreDirectional'])

    if 'StreetName' in addr and addr['StreetName'] is not None:
        normalized_address = normalized_address + ' ' + addr['StreetName']

    if 'StreetNamePostType' in addr and addr['StreetNamePostType'] is not None:
        # remove any periods from abbreviations
        normalized_address = normalized_address + ' ' + _normalize_address_post_type(addr['StreetNamePostType'])

    if 'StreetNamePostDirectional' in addr and addr['StreetNamePostDirectional'] is not None:
        normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePostDirectional'])

    formatter = StreetAddressFormatter()
    normalized_address = formatter.abbrev_street_avenue_etc(normalized_address)

    return normalized_address.lower().strip()
Пример #49
0
    def __parse(self, d):
        """Tag ``d['address']`` and write the parsed parts back into ``d``.

        Keys that may be written: ``street_name``, ``house_number``,
        ``pre_direction``, ``street_type``, ``suf_direction`` and ``unit``.
        Returns ``None`` in all cases; ``d`` is left untouched when the
        parsed address has no ``StreetName``.

        Raises a plain ``Exception`` when anything goes wrong while reading
        or tagging ``d['address']``.
        """
        from usaddress import tag

        try:
            # Note that we replace . with space
            addr = tag(d['address'].replace('.', ' ').upper())[0]
        except Exception:
            # Any failure above is reported as one generic parse error.
            raise Exception('Unable to parse address %s' % (d['address'],))

        if 'StreetName' not in addr:
            return

        # Street names are stored with their internal spaces removed.
        d['street_name'] = addr['StreetName'].replace(' ', '')

        if 'AddressNumber' in addr:
            # NOTE(review): presumably extract_numeric keeps only the digits
            # of the value - confirm against StrLib.
            d['house_number'] = StrLib.extract_numeric(addr['AddressNumber'])
            if not d['house_number'].isnumeric():
                # Not a usable number: fold the raw value into the street
                # name and blank the house number.
                d['street_name'] = '%s %s' % (addr['AddressNumber'], d['street_name'])
                d['house_number'] = ''

        if 'StreetNamePreType' in addr:
            # The pre-type is glued onto the front without a separator.
            d['street_name'] = '%s%s' % (addr['StreetNamePreType'], d['street_name'])
        if 'StreetNamePreDirectional' in addr:
            d['pre_direction'] = addr['StreetNamePreDirectional'].replace('.', '')
            if d['pre_direction'] not in self.__directions:
                # Unknown direction: treat it as part of the street name.
                d['street_name'] = '%s %s' % (d['pre_direction'], d['street_name'])
                d['pre_direction'] = ''
        if 'StreetNamePostType' in addr:
            d['street_type'] = addr['StreetNamePostType'].replace('.', '')
            # Accept the type only if it is a key or a value of street_abbrs;
            # otherwise append it to the street name without a separator.
            if d['street_type'] not in street_abbrs and \
                    d['street_type'] not in street_abbrs.values():
                d['street_name'] = '%s%s' % (d['street_name'], d['street_type'])
                d['street_type'] = ''
        if 'StreetNamePostDirectional' in addr:
            d['suf_direction'] = addr['StreetNamePostDirectional'].replace('.', '')
            if d['suf_direction'] not in self.__directions:
                d['street_name'] = '%s %s' % (d['street_name'], d['suf_direction'])
                # Reset to None here, whereas the other resets above use ''.
                d['suf_direction'] = None
        if 'OccupancyIdentifier' in addr:
            d['unit'] = addr['OccupancyIdentifier']
Пример #50
0
def main(**kwargs):
    """The main function of the script. Performs most of the address parsing work and all output.

    Every input line is stripped of double quotes and surrounding whitespace,
    anything following a 5-digit sequence and a comma is removed, and the
    remainder is tagged with ``usaddress``.  Parsed addresses are written to
    the output, one comma-joined line each; lines that could not be parsed
    are echoed to stderr.  Both file objects are closed.

    :param input: a text file object that will be read from. Should contain address-like data, one address per line
    :param output: a text file object where parsed output will be written. Parsed output will be similar to CSV data
    :param remove_post_zip: documented as a boolean switch for the post-zip
        removal, but not read by this function; the removal is always applied
    :type input: text file object in read mode
    :type output: text file object in write mode
    :type remove_post_zip: bool
    :return: always 0
    """
    source = kwargs['input']
    lines = [line.replace('"', '').strip() for line in source]
    source.close()

    print('{} lines in input'.format(len(lines)))

    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    county = re.compile(r'(\d{5}),.*')

    parsed = []
    errored = []

    for line in lines:
        try:
            parsed.append(usaddress.tag(county.sub(r'\1', line)))
        except usaddress.RepeatedLabelError:
            errored.append(line)

    sink = kwargs['output']
    try:
        for address in parsed:
            sink.write(','.join(parsed_address_to_human(address)) + '\n')
    finally:
        # Close the output even if formatting or writing a line fails.
        sink.close()

    for error in errored:
        sys.stderr.write('{}\n'.format(error))
    if errored:
        sys.stderr.write('{} lines unable to be parsed\n'.format(len(errored)))

    return 0
Пример #51
0
    def __init__(self, address_str):
        """Parse ``address_str`` with usaddress and store its components.

        :param address_str: raw address string; kept as ``self.address_str``.
        :raises ValueError: when the address number, street name, street
            post type, place name or state name cannot be found.
        """
        # Retaining the raw string
        self.address_str = address_str

        # The tag method merges consecutive components, strips commas and
        # returns a (components, address type) tuple, e.g.
        # (OrderedDict([('AddressNumber', u'123'), ('StreetName', u'Main'),
        # ('StreetNamePostType', u'St.'), ('OccupancyType', u'Suite'),
        # ('OccupancyIdentifier', u'100'), ('PlaceName', u'Chicago'),
        # ('StateName', u'IL')]), 'Street Address')
        address_dict, _address_type = usaddress.tag(address_str)

        def required(label):
            # `raise` is a statement and cannot appear inside a conditional
            # expression, so the mandatory lookups go through this helper.
            if label not in address_dict:
                raise ValueError('Address {} not provided'.format(label))
            return address_dict[label]

        # Mandatory values
        self.addressNumber = required('AddressNumber')
        self.streetName = required('StreetName')
        self.streetNamePostType = required('StreetNamePostType')
        self.placeName = required('PlaceName')
        self.stateName = required('StateName')

        # Optional values
        # NOTE(review): the original text tried to raise for the two
        # occupancy fields although they sit under "Optional values"; they
        # are treated as optional here - confirm the intended contract.
        self.occupancyIdentifier = address_dict.get('OccupancyIdentifier')
        self.occupancyType = address_dict.get('OccupancyType')
        self.streetNamePreDirectional = address_dict.get('StreetNamePreDirectional')
        self.zipCode = address_dict.get('ZipCode')
Пример #52
0
    def region_parse(self, region):
        """Split a free-form region string into city / state / zip parts.

        A US-style zip code sends the region to ``self.domestic``; a
        "dublin <n>" match or one of the international postal patterns sends
        it to ``self.international``; anything else falls back to
        ``self.domestic``.

        :param region: region text, or ``None``.
        :return: for ``None`` a dict with blank ``City``, ``State`` and
            ``Zipcode``; otherwise whatever ``self.domestic`` or
            ``self.international`` returns for the (possibly trimmed) region
            and the partially filled dict.
        """
        # International postal-code patterns, tried in order.  Raw strings
        # keep the backslashes without relying on invalid escape sequences.
        regex_base = [r'NSW \d{4}', r'\W(GIR|[A-Z]\d[A-Z\d]??|[A-Z]{2}\d[A-Z\d]??)[ ]??(\d[A-Z]{2})\W',
                      r'\W((?:0[1-46-9]\d{3})|(?:[1-357-9]\d{4})|(?:[4][0-24-9]\d{3})|(?:[6][013-9]\d{3}))\W',
                      r'\W([ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ])\ {0,1}(\d[ABCEGHJKLMNPRSTVWXYZ]\d)\W',
                      r'\W(F-)?((2[A|B])|[0-9]{2})[0-9]{3}\W', r'\W(V-|I-)?[0-9]{5}\W', r'\W[^\W\d_]{2}-\d{4}\W',
                      r'\W\d{6}\W',
                      r'\W(0[289][0-9]{2})|([1345689][0-9]{3})|(2[0-8][0-9]{2})|(290[0-9])|(291[0-4])|(7[0-4][0-9]{2})|(7[8-9][0-9]{2})\W',
                      r'\W[1-9][0-9]{3}\s?([a-zA-Z]{2})?\W', r'\W([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}\W',
                      r'\W([D-d][K-k])?( |-)?[1-9]{1}[0-9]{3}\W', r'\W(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}\W',
                      r'\W[1-9]{1}[0-9]{3}\W',
                      r'\W[^\W\d_]{2}\d \d[^\W\d_]{2}\W', r'\W[^\W\d_]{2}/d[^\W\d_] /d[^\W\d_]{2}\W', r'\W\d{3}-\d{4}\W',
                      r'\W\d{3}-\d{3}\W', r'\W[^\W\d_]{2}\d \d[^\W\d_]\W',
                      r'\W[^\W\d_]\d{6}\W', r'\W[^\d_]{2}\d\W', r'\W[^\W\d_]{2}-\d{4}\W',
                      r'\W[^\W\d_]{2}\d \d[^\W\d_]{2}\W', r'\W[^\W\d_]{3} \d{4}\W', r'\W[^\W\d_]-\d{4}\W',
                      r'\W[^\W\d_]\d{2} [^\W\d_]{3}\W']

        # Blank values under valid keys, so the method can bail out at any
        # point and still return a well-formed structure.
        filtered_region = {'City': '',
                           'State': '',
                           'Zipcode': ''}

        # Must come before any parsing: there is nothing to split.
        if region is None:
            return filtered_region

        region = ' '.join(region.split())

        # Look for a PO box; when found it is cut out of the region text and
        # stored under 'PO'.  A failed parse is ignored on purpose: this
        # step is best effort.
        try:
            holder = usaddress.tag(region)
            if 'USPSBoxType' in holder[0] and 'USPSBoxID' in holder[0]:
                po = "%s %s" % (holder[0]['USPSBoxType'], holder[0]['USPSBoxID'])
                region = region.replace(po, '')
                filtered_region['PO'] = po
        except usaddress.RepeatedLabelError:
            pass

        dublin = re.search(r"dublin \d(\d)?", region.lower())
        if dublin:
            dubb = dublin.group(0)
            dub = dubb.split(' ')
            filtered_region['City'] = dub[0].capitalize()
            filtered_region['Zipcode'] = dub[1]
            region = ((region.lower()).replace(dubb, ',')).capitalize()
            return self.international(region, filtered_region)

        # A US zip code (5 digits, optional +4) means domestic parsing.
        if re.search(r"\W\d{5}([\-]?\d{4})?\W", " %s " % region):
            return self.domestic(region, filtered_region)

        # Otherwise find an international postal code and strip it from the
        # region string.
        zippy = None
        for reg in regex_base:
            print(region)
            region = ' '.join(region.split())
            results = re.search(reg, " %s " % region)
            if results:
                zippy = (results.group(0)).strip()
                filtered_region['Zipcode'] = zippy
                region = region.replace(zippy, ',')
                break

        if zippy:
            # International parsing, with its own special cases.
            return self.international(region, filtered_region)

        # No postal code of any kind: fall back to domestic parsing.
        return self.domestic(region, filtered_region)
Пример #53
0
import json
import sys

import usaddress

# Command-line helper: tag the address given as the first argument with
# usaddress and print the result as JSON on stdout.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit('usage: %s ADDRESS' % sys.argv[0])

addr = sys.argv[1]
# tag() returns a (tagged components, address type) pair; json.dumps emits it
# as a two-element JSON array.
addr_parsed = usaddress.tag(addr)
print(json.dumps(addr_parsed))



Пример #54
0
    def domestic(region, filtered_region):
        """Fill ``filtered_region`` with the components of a domestic address.

        ``region`` is the free-text address; ``filtered_region`` is the dict
        that collects the results. The dict is updated in place and also
        returned.

        Keys written: ``City`` always on the single-token path; otherwise
        ``Street``, ``State``, ``Zipcode`` and ``City`` (empty string when the
        parser produced no value), plus ``PO``, ``Country``, ``Recipient`` and
        ``LandmarkName`` only when the parser returned the matching labels.

        If ``usaddress.tag`` raises ``RepeatedLabelError`` the dict is
        returned without any of the parsed keys added.
        """
        region = region.strip()
        # A lone token with no space or comma is taken to be the city as-is.
        if ' ' not in region and ',' not in region:
            filtered_region['City'] = region
            return filtered_region

        citi = None
        # Washington DC is special-cased: when the text fuzzily matches it,
        # the city is fixed up front and the DC fragments are cut out of the
        # string before it is handed to the parser.
        # NOTE(review): `fuzz` is presumably fuzzywuzzy's module - confirm.
        if fuzz.partial_ratio(region, 'Washington D.C') > 90:
            citi = 'Washington, D.C.'
            for fragment in ('Washington', 'DC', 'D.C.'):
                region = region.replace(fragment, '')

        region = region.strip(',')
        try:
            clean = usaddress.tag(region)
        except usaddress.RepeatedLabelError:
            # Unparseable input: hand back what was collected so far.
            return filtered_region

        # Only the first element of the tag() result (label -> value) is used.
        tagged = clean[0]

        # The street line is assembled from whichever of these labels exist;
        # missing ones contribute an empty string and the extra whitespace
        # they leave behind is collapsed afterwards.
        street_labels = (
            'AddressNumber',
            'StreetNamePreDirectional',
            'StreetNamePreModifier',
            'StreetName',
            'StreetNamePostType',
            'StreetNamePostDirectional',
        )
        street = ' '.join(tagged.get(label, '') for label in street_labels)
        filtered_region['Street'] = ' '.join(street.split())

        filtered_region['State'] = tagged.get('StateName', '')
        filtered_region['Zipcode'] = tagged.get('ZipCode', '')

        # The Washington DC override wins over whatever the parser found.
        if citi:
            filtered_region['City'] = citi
        else:
            filtered_region['City'] = tagged.get('PlaceName', '')

        # A PO box is recorded only when both its type and its id were tagged.
        if 'USPSBoxType' in tagged and 'USPSBoxID' in tagged:
            filtered_region['PO'] = "%s %s" % (tagged['USPSBoxType'], tagged['USPSBoxID'])

        if 'CountryName' in tagged:
            filtered_region['Country'] = tagged['CountryName']

        if 'Recipient' in tagged:
            filtered_region['Recipient'] = tagged['Recipient']

        if 'LandmarkName' in tagged:
            filtered_region['LandmarkName'] = tagged['LandmarkName']

        return filtered_region
Пример #55
0
import usaddress
from usaddress import RepeatedLabelError
import csv

# Walk every row of uscleaned.csv and tag the address text held in column
# index 4 with usaddress.
# NOTE(review): this is Python 2 code (print statement below); the excerpt is
# cut off inside the try block, so the error handling and whatever fills
# new_rows / problem_rows are not visible here.
readfile = open('uscleaned.csv', 'rb')
reader = csv.reader(readfile)
new_rows = []
problem_rows = []
for row in reader:
    # Remove every literal 'US' substring from the address field before tagging.
    row[4] = row[4].replace('US', '')
    print row[4]
    try:
        # Only the first element of the tag() result (label -> value) is kept.
        address = usaddress.tag(row[4])[0]
        # Pull out each component, falling back to '' when the label is absent.
        try:
            city = address['PlaceName']
        except KeyError:
            city = ''
        try:
            state = address['StateName']
        except KeyError:
            state = ''
        try:
            zipcode = address['ZipCode']
        except KeyError:
            zipcode = ''
        # NOTE(review): the usaddress label for this appears to be
        # 'CountryName', in which case 'Country' is never present and country
        # is always '' - TODO confirm.
        try:
            country = address['Country']
        except KeyError:
            country = ''
        # Overwrite the already-extracted city/state entries with None.
        address['PlaceName'] = None
        address['StateName'] = None
Пример #56
0
 def tagger(self, field):
     """Run ``usaddress.tag`` on ``field`` and return its result unchanged."""
     tagged = usaddress.tag(field)
     return tagged
Пример #57
0
 def test_broadway(self):
     """Smoke test: tagging this Broadway address string must not raise."""
     # No assertion on the result; the call completing is the whole check.
     address = '1775 Broadway And 57th, Newyork NY'
     usaddress.tag(address)
Пример #58
0
def normalize_attorney_contact(c, fallback_name=''):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: jdoe@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline-separated contact string. A falsy value
        short-circuits to an empty result.
    :param fallback_name: Name to use when the address parser finds no
        recipient. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A two-tuple ``(address_info, atty_info)``. ``atty_info`` is a
        dict with the keys ``email``, ``fax`` and ``phone`` (empty strings
        when not found). ``address_info`` is the normalized address dict, or
        ``{}`` when ``c`` is falsy, the address cannot be parsed, or the
        parse is ambiguous / contains a country; in those failure cases
        ``atty_info`` still carries whatever was extracted before parsing.
    """
    atty_info = {
        'email': '',
        'fax': '',
        'phone': '',
    }
    if not c:
        return {}, atty_info

    address_lines = []
    lines = c.split('\n')
    for i, line in enumerate(lines):
        line = re.sub(r'Email:\s*', '', line).strip()
        line = re.sub(r'pro se', '', line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info['email'] = line
            continue

        # Perhaps a phone/fax number? Strip parens, slashes and whitespace
        # so only the label and digits/punctuation remain.
        clean_line = re.sub(r'(\(|\)|\\|/|\s+)', '', line)
        if clean_line.startswith('Fax:'):
            clean_line = re.sub(r'Fax:', '', clean_line)
            if phone_digits_re.search(clean_line):
                atty_info['fax'] = normalize_us_phone_number(clean_line)
            # A line labelled as a fax is never part of the address, even
            # when no number could be pulled out of it.
            continue
        if phone_digits_re.search(clean_line):
            atty_info['phone'] = normalize_us_phone_number(clean_line)
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u'&' in line and i == 0:
            fallback_name = line
            continue

        if re.search(r'[a-zA-Z]', line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    # Collapse usaddress's fine-grained labels onto our address fields.
    mapping = {
        'Recipient': 'name',
        'AddressNumber': 'address1',
        'AddressNumberPrefix': 'address1',
        'AddressNumberSuffix': 'address1',
        'StreetName': 'address1',
        'StreetNamePreDirectional': 'address1',
        'StreetNamePreModifier': 'address1',
        'StreetNamePreType': 'address1',
        'StreetNamePostDirectional': 'address1',
        'StreetNamePostModifier': 'address1',
        'StreetNamePostType': 'address1',
        # When corner addresses are given, you have two streets in an address
        'SecondStreetName': 'address1',
        'SecondStreetNamePreDirectional': 'address1',
        'SecondStreetNamePreModifier': 'address1',
        'SecondStreetNamePreType': 'address1',
        'SecondStreetNamePostDirectional': 'address1',
        'SecondStreetNamePostModifier': 'address1',
        'SecondStreetNamePostType': 'address1',
        'CornerOf': 'address1',
        'IntersectionSeparator': 'address1',
        'LandmarkName': 'address1',
        'USPSBoxGroupID': 'address1',
        'USPSBoxGroupType': 'address1',
        'USPSBoxID': 'address1',
        'USPSBoxType': 'address1',
        'BuildingName': 'address2',
        'OccupancyType': 'address2',
        'OccupancyIdentifier': 'address2',
        'SubaddressIdentifier': 'address2',
        'SubaddressType': 'address2',
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
        'ZipPlus4': 'zip_code',
    }
    try:
        address_info, address_type = usaddress.tag(
            u', '.join(address_lines),
            tag_mapping=mapping,
        )
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # See https://github.com/datamade/probableparsing/issues/2 for why we
        # catch the UnicodeEncodeError. Oy.
        logger.warning("Unable to parse address (RepeatedLabelError): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop('NotAddress', None)

    # Ambiguous parses and anything carrying a country are rejected outright.
    if address_type == 'Ambiguous' or 'CountryName' in address_info:
        logger.warning("Unable to parse address (Ambiguous address type): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    if address_info.get('name') is None and fallback_name:
        address_info['name'] = fallback_name
    if address_info.get('state'):
        address_info['state'] = normalize_us_state(address_info['state'])

    address_info = normalize_address_info(dict(address_info))
    address_info['lookup_key'] = make_address_lookup_key(address_info)
    return address_info, atty_info