def get_var_from_overlay(df_shp1, df_shp2, varsToGet):
    '''
    Moves variables from df_shp2 to df_shp1 based on a spatial join, e.g. grabbing information from census geographies.

    Rows whose geometry is None are dropped from both frames, both frames are brought to ``default_crs``,
    and df_shp1 is left-joined to df_shp2 on geometry intersection. When a row of df_shp1 intersects
    several shapes in df_shp2, only the first joined row is kept, so the result has at most one row per
    (non-null-geometry) row of df_shp1.

    :param df_shp1: GeoDataFrame that receives the variables (left side of the join); it is not modified
    :param df_shp2: GeoDataFrame that supplies the variables (right side of the join)
    :param varsToGet: list of column names in df_shp2 to attach to df_shp1; columns with the same
        names already present in df_shp1 are dropped before the join
    :return: df_shp1 (minus null-geometry rows) with the columns from the join appended
    '''

    def _to_default_crs(frame, label):
        # Bring one frame to default_crs: label it when it has no CRS at all,
        # reproject it when it carries a different one.
        if str(frame.crs) == default_crs:
            return frame
        write_to_log(
            f"{label} crs  is {str(frame.crs)}, assigning {default_crs}",
            2,
            doPrint=True)
        if frame.crs is None:
            frame.crs = default_crs
            return frame
        return frame.to_crs(default_crs)

    # dropping nonetype rows from shp files
    shp1_has_geom = df_shp1.geometry.type.notnull()
    shp2_has_geom = df_shp2.geometry.type.notnull()
    write_to_log(
        "Droppping {} and {} rows from df_shp1 and df_shp2 whose geometries are None"
        .format((~shp1_has_geom).sum(), (~shp2_has_geom).sum()),
        2,
        doPrint=True)
    # copy so that the temporary '__ID__' column never appears on the caller's frame
    df_shp1 = df_shp1[shp1_has_geom].copy()
    df_shp2 = df_shp2[shp2_has_geom]

    # makes a unique ID for df_shp1 which will later be deleted, in case it intersects multiple in df_shp2
    df_shp1['__ID__'] = range(df_shp1.shape[0])

    df_shp1 = _to_default_crs(df_shp1, "df_shp1")
    df_shp2 = _to_default_crs(df_shp2, "df_shp2")

    df_shp1 = df_shp1.drop(columns=varsToGet, errors="ignore")
    # NOTE(review): newer geopandas releases spell this keyword `predicate`; left as `op`
    # because the installed version is not visible from here.
    df_shp1_overlay = gpd.sjoin(df_shp1,
                                df_shp2[list(varsToGet) + ['geometry']],
                                how="left",
                                op='intersects')

    # A left join keeps every df_shp1 row, so a match has to be detected from the
    # join's right-index column rather than from the presence of the row itself.
    ids = df_shp1_overlay['__ID__']
    if 'index_right' in df_shp1_overlay.columns:
        matched_ids = ids[df_shp1_overlay['index_right'].notna()]
    else:
        matched_ids = ids
    n_matched = matched_ids.nunique()
    n_multiple = ids[ids.duplicated()].nunique()
    n_unmatched = df_shp1.shape[0] - n_matched
    write_to_log(
        "{} rows in df_shp1 intersected a shape in df_shp2, {} intersected multiple shapes, {} intersected no shapes"
        .format(str(n_matched), str(n_multiple), str(n_unmatched)),
        1,
        doPrint=True)

    # diagnostic only; skipped when GEOID10 was not among the joined columns
    if 'GEOID10' in df_shp1_overlay.columns:
        print(
            df_shp1_overlay['GEOID10'].isna().sum() /
            df_shp1_overlay['GEOID10'].shape[0], "percent GEOID10 that is NA")

    # keep the first match per df_shp1 row and remove the temporary ID
    df_shp1_overlay = df_shp1_overlay.drop_duplicates(subset=['__ID__']).drop(
        columns=['__ID__'])
    return df_shp1_overlay


def make_location_vars(panel_df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``num_locations_<column>`` column for each identifying column.

    For each of the four identifier columns, the identifier and ``year``
    columns are handed to ``make_chain_var`` and the result is stored on
    ``panel_df`` under ``num_locations_<identifier>``.

    :param panel_df: panel data containing ``year`` and the four identifier
        columns listed below; it is modified in place
    :return: the same ``panel_df`` object with the new columns added
    """
    identifier_cols = (
        "cleaned_dba_name",
        "cleaned_business_name",
        "cleaned_ownership_name",
        "business_id",
    )
    for identifier in identifier_cols:
        identifier_by_year = panel_df[[identifier, "year"]]
        target_col = f"num_locations_{identifier}"
        panel_df[target_col] = make_chain_var(
            identifier_by_year, time_col="year", name_col=identifier)
    return panel_df


if __name__ == "__main__":

    write_to_log(f'Starting clean business data at {WTL_TIME}')
    # initialize data dict
    # data_dict = make_data_dict(use_seagate=False)
    #
    for city in [
            # "sf",
            # "sd",
            "sac",
            # "la",
            # "chi",
            # "seattle",
            # "baton_rouge",
            # "philly",
            # "orlando",
            # "stl"
    ]:
Exemplo n.º 3
0
def clean_unit_vectorized(column:pd.Series) -> pd.Series:
    """Clean a Series of address/unit strings using only vectorized string operations.

    Lower-cases the text, removes periods, apostrophes and other punctuation, normalizes
    whitespace, rewrites unit designators (apt, suite, ste, unit, rm, ...) to ``#``, spaces out
    commas and ampersands, normalizes PO boxes and turns empty / ``'nan'`` strings into NaN.
    Does not convert floor numbers to numbers like clean_unit.

    :param column: Series of strings to clean
    :return: the cleaned Series; ``column`` itself is returned untouched when it is entirely
        null or is not a string-typed Series
    """
    if column.isnull().all():
        return column
    if not pd.api.types.is_string_dtype(column):
        write_to_log("column is not of type object/string")
        return column

    column = column.str.lower()

    # strip punctuation outright (note: this also removes the parenthesis characters themselves)
    column = column.str.replace(r'\.|!|@|\$|~|\(|\)|\\|\||\*|/|"|`', "", regex=True)

    column = column.str.replace(r"'", "", regex=True)

    column = column.str.strip()

    # drop a dangling '&' or ',' at the end of the string
    column = column.str.replace(r'^(.+)([&,]\s?)$', r'\g<1>', regex=True)

    column = column.str.replace(r'\s{2,}', " ", regex=True,flags=re.IGNORECASE)

    # "12 - 14" -> "12-14"
    column = column.str.replace(r"([0-9]+)(\s-\s|\s-|-\s)([0-9]+)",r"\g<1>-\g<3>", regex=True,flags=re.IGNORECASE)

    # replace unit designators with '#'
    column = column.str.replace(
        r'([\s^-])(rm\s|space\s|room\s|units?\b|suite\b|apt\b|un\s|ste\b|number\b|no\b)', r'\g<1>#', regex=True)

    column = column.str.replace(
        r'\s(rm|room|unit|suite|apt|un|ste|number\b|no\b)(\s)([0-9a-zA-Z]+)', r'#\g<3>', regex=True)

    column = column.str.replace(r'##', r'#', regex=True)
    column = column.str.replace(r'#\s', '#', regex=True, flags=re.IGNORECASE)

    # split digits from a following run of letters; the letter set omits d, h, n, r, s, t
    # (presumably so ordinal suffixes such as "1st"/"2nd"/"3rd"/"4th" stay attached)
    column = column.str.replace(
        r'([^#])([0-9]{1,})([abcefgijklmopquvwxyz]{2,})',r'\g<1>\g<2> \g<3>', regex=True,flags=re.IGNORECASE)

    column = column.str.replace(r'([a-z])(#)',r'\g<1> \g<2>', regex=True,flags=re.IGNORECASE)

    column = column.str.replace(r'(no\s?)([0-9-]+)',r'#\g<2>', regex=True,flags=re.IGNORECASE)

    # put a space after ',' and '&' (doubled spaces are collapsed again further down)
    column = column.str.replace(r'([^ ])(,|&)([ ])', r'\g<1>\g<2> \g<3>',regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'([^ ])(,|&)([^\s])', r'\g<1>\g<2> \g<3>',regex=True, flags=re.IGNORECASE)

    # The next three patterns previously contained the malformed group "(a-z])", which only
    # matches the literal text "a-z]" and therefore never fired. With the character class
    # restored they catch what the non-overlapping pass above leaves behind
    # (e.g. the second comma in "a,b,c", or "&b" preceded by a space).
    column = column.str.replace(r'([a-z])(,)([a-z])', r'\g<1>\g<2> \g<3>', regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'([a-z])(&)([a-z])', r'\g<1> \g<2> \g<3>',regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'(&)([a-z])', r'\g<1> \g<2>',regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'([a-z])(&)', r'\g<1> \g<2>',regex=True, flags=re.IGNORECASE)

    # split letters from a following run of two or more digits
    column = column.str.replace(r'([^#])([a-z]{1,})([0-9]{2,})',r'\g<1>\g<2> \g<3>', regex=True,flags=re.IGNORECASE)

    column = column.str.replace(r'([a-z]{2,})([#])', r'\g<1> \g<2>', regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'\(.+?\)', "", regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'##', r"#", regex=True, flags=re.IGNORECASE)
    # drop a trailing "baltimore[, md|maryland]"
    column = column.str.replace(r'(.+)[\s,\s]{1,3}baltimore,?\s?(md|maryland)?$', r"\g<1>", regex=True,
                                flags=re.IGNORECASE)

    # delete parenthesis
    column = column.str.replace(r'\(.+?\)', r"", regex=True)
    # second normalization pass over whitespace, number ranges and unit designators
    column = column.str.replace(r'\s{2,}', " ", regex=True, flags=re.IGNORECASE)
    column = column.str.replace(r"([0-9]+)(\s-\s|\s-|-\s)([0-9]+)",r"\g<1>-\g<3>", regex=True,flags=re.IGNORECASE)
    column = column.str.replace(
        r'\s(rm|room|space|unit|suite|apt|un|ste|number\b)(\s)([0-9a-zA-Z]+)', r'#\g<3>', regex=True)
    column = column.str.replace(r'##', r"#", regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'#\s', '#', regex=True, flags=re.IGNORECASE)

    # "#a 12" -> "#a12"
    column = column.str.replace(r'(#)([a-z]{1,2})(\s)([0-9]+)',r"\g<1>\g<2>\g<4>", regex=True, flags=re.IGNORECASE)

    column = column.str.replace(r'(\s$|^\s)', '', regex=True,flags=re.IGNORECASE)

    column = column.str.replace(r'(p\s?o)\s?(box)', 'po box', regex=True, flags=re.IGNORECASE)
    column = column.str.replace(r'(p\.?o\.?)\s?(box)', 'po box', regex=True,flags=re.IGNORECASE)

    # empty strings and the literal text 'nan' become missing values
    column = column.replace('', np.nan, regex=False)
    column = column.replace(r'nan', np.nan, regex=False)
    if column.isnull().all():
        # Nothing left to clean; returning here also avoids calling .str on a column
        # that may no longer be string-typed once every value is NaN.
        return column

    column = column.str.replace(r'(.+)[\s,\s]{1,3}baltimore,?\s?(md|maryland)?$', r"\g<1>",regex=True,
                                flags=re.IGNORECASE)
    # drop a trailing "1st floor" / "2nd floor" / "3rd floor"
    column = column.str.replace(r'(\s(1st|2nd|3rd)\sfloor)$', r"",regex=True, flags=re.IGNORECASE)

    column = column.str.strip()
    return column
Exemplo n.º 4
0
def parse_address(dataframe, address_col, unit_col, st_num_col, st_name_col, st_sfx_col, st_d_col,
                  zipcode_col, city_col, state_col, st_num2_col,
                  prefix='parsed_', legal_description_col=False, raise_error_on_na=True):

    col_list = [address_col, unit_col, st_num_col, st_name_col, st_num2_col,
                st_sfx_col, st_d_col, zipcode_col, city_col, state_col]
    cols_to_check = [address_col, st_name_col]
    if legal_description_col is not False:
        col_list.append(legal_description_col)
        cols_to_check.append(legal_description_col)
    for cols in col_list:
        if cols not in dataframe.columns:
            dataframe[cols] = np.nan
    if all([dataframe[col].isna().all() for col in cols_to_check]):
        if raise_error_on_na == True:
            raise ValueError('All parsable address components are completely NA!')
        else:
            write_to_log('All parsable address components are completely NA! Returning dataframe', warn=True)
            return dataframe
    parsed_unit_col = prefix + 'addr_u'
    parsed_st_num1_col = prefix + 'addr_n1'
    parsed_st_num2_col = prefix + 'addr_n2'
    parsed_st_name_col = prefix + 'addr_sn'
    parsed_ss_col = prefix + 'addr_ss'
    parsed_sd_col = prefix + 'addr_sd'
    parsed_zip_col = prefix + 'addr_zip'
    parsed_address_name = prefix + 'addr_name'
    parsed_city_col = prefix + 'city'
    parsed_state_col = prefix + 'state'
    # replace empty strings w/ np.nan
    for col in col_list:
        if (is_string_dtype(dataframe[col])) and (dataframe[col].isna().all() == False):
            try:
                dataframe[col] = dataframe[col].replace(r'', np.nan, regex=True,)
                dataframe[col] = dataframe[col].replace(r'nan', np.nan, regex=False)
            except AttributeError as error:
                print('{} is numeric, not attempting to remove whitespace'.format(col))
    dataframe_full = dataframe.copy(deep=True)
    merge_cols = [address_col, st_name_col, unit_col, st_num_col, st_sfx_col, st_d_col,
                  zipcode_col, city_col, state_col, st_num2_col]
    initial_shape = dataframe_full.shape[0]
    if legal_description_col is not False:
        dataframe = dataframe[[address_col, unit_col, st_num_col, st_name_col, st_sfx_col,
                               st_d_col, zipcode_col, city_col, legal_description_col, state_col, st_num2_col]]
        dataframe.drop_duplicates(
            subset=merge_cols + [legal_description_col], inplace=True)
    else:
        dataframe = dataframe[
            [address_col, unit_col, st_num_col, st_name_col, st_sfx_col, st_d_col, zipcode_col, city_col, state_col,
             st_num2_col]]
        dataframe.drop_duplicates(
            subset=merge_cols, inplace=True)
    for col in [parsed_zip_col, parsed_sd_col, parsed_unit_col, parsed_address_name, parsed_st_name_col,
                parsed_st_num2_col,
                parsed_st_num1_col, parsed_ss_col]:
        dataframe[col] = np.nan
    final_shape = dataframe_full.shape[0]
    if initial_shape != final_shape:
        write_to_log('initial shape is {} and final shape is {}'.format(initial_shape, final_shape))
        raise ValueError('Initial and final shapes do not agree! Copy was not deep enough')
    og_types = (dataframe_full[[address_col, st_name_col, st_num_col, st_sfx_col, st_d_col]].dtypes.apply(
        lambda x: x.name).to_dict())
    # create new address col
    new_address_col = 'new_' + address_col
    # regex definitions
    st_num = r'([0-9]+-?[a-z]?)'
    st_d = r'([nsewrl]{1,2}\s|rear\s|side\s)'
    st_name = r'(\w{3,}|\w{3,}\s\w{3,}|[a-z\s-]{4,}|[a-z]+|[0-9]+)'
    st_sfx = r'(aly|ave|byu|blf|blvd|bd|br|cswy|ctr|cir|ct|cr|cv|crk|cres|xing|curv|dr|est|expy|ext|frk|ft|fwy|gdn|gtwy|hvn|' \
             r'hwy|hl|jct|ky|lks|ln|lgt|lp|mall|mnr|mdw|msn|mtn|pkwy|pass|path|pl|plz|pt|prt|pw|rst|rdge|rd|rte|rw|shr|' \
             r'sq|st|strm|trak|trl|turnpike|ter|tl|vly|vws|walk|way)'
    zipcode = r'([0-9]{5}|[0-9]{5}-[0-9]{4})'
    unit = r'#\s?([a-z0-9-]+|[0-9-]+-?[a-z]?|[a-z]-?[0-9]+|[up][0-9]?[abcd]|un\s[0-9abcd]|[a-z])'
    unit2 = r'\s([a-z]+-?[0-9]{1,4}|[uptcl][-\s]?[tl]?[0-9]{1,4}[abcd]?|un\s[0-9abcd]{1,4})'  # use this for when you don't need to know that there's a # before
    state = 'AL|AK|AS|AZ|AR|CA|CO|CT|DE|DC|FL|GA|GU|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|MP|OH|OK|OR|PA|PR|RI|SC|SD|TN|TX|UT|VT|VI|VA|WA|WV|WI|WY'
    state_dict = {
        'AL': 'Alabama',
        'AK': 'Alaska',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'AR': 'Arkansas',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DE': 'Delaware',
        'DC': 'District of Columbia',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'IA': 'Iowa',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'ME': 'Maine',
        'MD': 'Maryland',
        'MA': 'Massachusetts',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MS': 'Mississippi',
        'MO': 'Missouri',
        'MT': 'Montana',
        'NE': 'Nebraska',
        'NV': 'Nevada',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NY': 'New York',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'MP': 'Northern Mariana Islands',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VT': 'Vermont',
        'VI': 'Virgin Islands',
        'VA': 'Virginia',
        'WA': 'Washington',
        'WV': 'West Virginia',
        'WI': 'Wisconsin',
        'WY': 'Wyoming'}
    # filter rows that have already been parsed
    # create df_list to hold parsed names
    df_list = []
    # fill in where already parsed
    dataframe[parsed_unit_col] = dataframe[unit_col]
    dataframe[parsed_ss_col] = dataframe[st_sfx_col]
    dataframe[parsed_st_name_col] = dataframe[st_name_col]
    dataframe[parsed_sd_col] = dataframe[st_d_col]
    dataframe[parsed_city_col] = dataframe[city_col]
    dataframe[parsed_state_col] = dataframe[state_col]
    dataframe[parsed_st_num2_col] = np.where(
        dataframe[st_num2_col].isna(),
        dataframe[st_num2_col],
        dataframe[st_num2_col].astype(str)
    )
    # do additional parsing from parsed columns
    # (i.e st suffix from st name, st num 1 & 2 from st_num) standardize strings
    if is_string_dtype(dataframe[st_num_col]):
        dataframe['TEMP'] = np.where(
            dataframe[st_num_col].isna(),
            dataframe[st_num_col],
            dataframe[st_num_col].astype(str)
        )
        split1 = dataframe['TEMP'].str.extract(st_num + '\s?[-&]?\s?' + st_num + '?', flags=re.IGNORECASE)
        dataframe[parsed_st_num1_col].fillna(split1[0], inplace=True)
        dataframe[parsed_st_num2_col].fillna(split1[1], inplace=True)
        dataframe.drop(columns=['TEMP'], inplace=True)
        # dataframe[parsed_st_num2_col] = np.nan

    dataframe[parsed_st_num1_col] = np.where(dataframe[st_num_col].notnull(),
                                             dataframe[st_num_col].astype(str),
                                             dataframe[parsed_st_num1_col]
                                             )
    # clean columns
    for col in [parsed_ss_col, parsed_st_name_col, parsed_sd_col, parsed_city_col]:
        try:
            if any(dataframe[col] == 'nan'):
                print('super not glamorous')
                raise ValueError
            # print('cleaning {}'.format(col))
            dataframe[col] = clean_address_col(dataframe[col])
            if any(dataframe[col] == 'nan'):
                print('not glamorous {}'.format(col))
                raise ValueError
        except AttributeError:
            print(f'{col} cant be cleaned')
    # see if you can parse suffix from st_name col (happens if people put full st address in parsed column)
    if is_string_dtype(dataframe[parsed_st_name_col]):
        split2 = dataframe[parsed_st_name_col].str.extract(st_name + '\s' + st_sfx + "$", flags=re.IGNORECASE)
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].fillna(split2[0])
        dataframe[parsed_ss_col] = dataframe[parsed_ss_col].fillna(split2[1])
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.replace(st_name + '\s' + st_sfx + "$", r'\g<1>',
                                                                                  flags=re.IGNORECASE, regex=True)
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.replace('\s{2,}', ' ', flags=re.IGNORECASE,
                                                                                  regex=True)
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.strip()
    # see if you can parse directional from st_suffix column
    if is_string_dtype(dataframe[parsed_st_name_col]):
        split2 = dataframe[parsed_st_name_col].str.extract("(\s|^)([nsew])(\s|$)")
        dataframe[parsed_sd_col] = dataframe[parsed_sd_col].fillna(split2[2])
        dataframe[parsed_st_name_col] = np.where(
            ~(dataframe[parsed_st_name_col].str.contains("^[nsew]$", na=True)),
            dataframe[parsed_st_name_col].str.replace("(\s|^)([nsew])(\s|$)", r"\g<3>", regex=True),
            dataframe[parsed_st_name_col]
        )
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.replace('\s{2,}', ' ', flags=re.IGNORECASE, regex=True)
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.strip()
    # clean zipcode column
    dataframe[parsed_zip_col] = dataframe[zipcode_col]
    if dataframe[parsed_zip_col].isna().all() == False:
        dataframe[parsed_zip_col] = np.where(dataframe[parsed_zip_col].isna(),
                                             dataframe[parsed_zip_col],
                                             dataframe[parsed_zip_col].astype(str)
                                             )
        dataframe[parsed_zip_col] = dataframe[parsed_zip_col].str.replace(r'(.+)(\.[0-9])', r'\g<1>', regex=True)
        dataframe[parsed_zip_col] = dataframe[parsed_zip_col].str.strip()
        dataframe[parsed_zip_col] = np.where(dataframe[parsed_zip_col].str.contains('^[0-9]{6}$'),
                                             dataframe[parsed_zip_col].str[0:5],
                                             dataframe[parsed_zip_col])
        dataframe[parsed_zip_col] = dataframe[parsed_zip_col].str.strip()
        dataframe[parsed_zip_col] = np.where(dataframe[parsed_zip_col].str.contains('^[0-9]{4}$'),
                                             '0' + dataframe[parsed_zip_col],
                                             dataframe[parsed_zip_col])

        dataframe[parsed_zip_col] = dataframe[parsed_zip_col].str.replace(r'([0-9]{5})-([0-9]+)?', r'\g<1>', regex=True)
        dataframe[parsed_zip_col] = '_' + dataframe[parsed_zip_col]
        dataframe[parsed_zip_col] = dataframe[parsed_zip_col].str.replace(r'^(_)\s?([0-9]{4})$', r'\g<1>0\g<2>', regex=True)

    # extract unit_col from address column
    dataframe[parsed_unit_col] = np.where(
        dataframe[parsed_unit_col] == '',
        np.nan,
        dataframe[parsed_unit_col]
    )
    # dataframe['fillUnit'] = dataframe[address_col].str.extract(unit,flags=re.IGNORECASE).iloc[:,0]
    # try a bunch of different units
    if is_string_dtype(dataframe[address_col]):
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].fillna(
            dataframe[address_col].str.extract(unit,
                                               flags=re.IGNORECASE).iloc[:, 0])
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].fillna(
            dataframe[address_col].str.extract(unit2,
                                               flags=re.IGNORECASE).iloc[:, 0])
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].fillna(
            dataframe[address_col].str.extract(r'\su[-:\s]{1,3}([0-9]+[a-z]+)',
                                               flags=re.IGNORECASE).iloc[:, 0])
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].fillna(
            dataframe[address_col].str.extract(r'\su[-:\s]{1,3}([0-9]+)',
                                               flags=re.IGNORECASE).iloc[:, 0])
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].fillna(
            dataframe[address_col].str.extract(r'1-([0-9]+[a-z]+)',
                                               flags=re.IGNORECASE).iloc[:, 0])
        # replace unit_col column w/ empty string
        dataframe[new_address_col] = dataframe[address_col].str.replace(unit, '', regex=True)
        dataframe[new_address_col] = dataframe[new_address_col].str.strip()
        if (dataframe[new_address_col].isna().sum()) > (dataframe[address_col].isna().sum()):
            raise ValueError
        state_names = [keys for keys in state_dict.keys()] + [values for values in state_dict.values()]
        # clean the city column
        if dataframe[parsed_city_col].isna().all() == False:
            # parse state from city col
            dataframe[parsed_state_col] = dataframe[parsed_state_col].fillna(dataframe[parsed_city_col].
                                                                             str.extract(
                '\s({})(\s|$)'.format('|'.join(state_names)),
                flags=re.IGNORECASE).iloc[:, 0])
            # remove state from city
            dataframe[parsed_city_col] = dataframe[parsed_city_col].str.replace(r'(.+)[,\s]({})(\s|$)'.
                                                                                format('|'.join(state_names)), r'\g<1>',
                                                                                flags=re.IGNORECASE, regex=True)
        # parse state from full address
        dataframe[parsed_state_col] = dataframe[parsed_state_col].fillna(
            dataframe[address_col].str.extract('({})'.format('|\b'.join(state_names)),
                                               flags=re.IGNORECASE).iloc[:, 0])
        # standardize parsed state names
        if dataframe[parsed_state_col].isna().all() == False:
            for key in state_dict.keys():
                dataframe[parsed_state_col] = np.where(
                    dataframe[parsed_state_col].isna(),
                    dataframe[parsed_state_col],
                    dataframe[parsed_state_col].str.replace('{}'.format(state_dict[key]), key, regex=True)

                )

        added_cols = [parsed_st_num1_col, parsed_st_name_col, parsed_ss_col]
        dataframe1 = dataframe[~dataframe[added_cols].notna().all(1)]
        dataframe0 = dataframe[dataframe[added_cols].notna().all(1)]
        dataframe0['parsed_from'] = 'already_parsed'
        df_list.append(dataframe0)

        def parse_from_legal_description(dataframe, log=False):
            st_sfx_la = r'(aly|ave|byu|blf|blvd|cl|cswy|ctr|cir|cv|crk|cres|xing|curv|dr|est|expy|ext|frk|ft|fwy|gdn|gtwy|hvn|' \
                        r'hwy|hl|jct|ky|lks|ln|lgt|lp|mall|mnr|mdw|msn|mtn|pkwy|pass|path|plz|pt|prt|rst|rdge|rd|rte|rw|shr|' \
                        r'sq|st|strm|trak|turnpike|vly|vws|walk|way)'
            st_name_la = r'([a-z\s-]+)'
            dataframe_no_ld = dataframe[dataframe[legal_description_col].isna()]
            dataframe_yes_ld = dataframe[~dataframe[legal_description_col].isna()]
            if is_string_dtype(dataframe[legal_description_col]):
                split3 = dataframe_yes_ld[legal_description_col].str.extract(r'(#)\s?([0-9a-z-]{1,6})',
                                                                             flags=re.IGNORECASE)
                dataframe_yes_ld[parsed_unit_col].fillna(split3[1], inplace=True)
                split4 = dataframe_yes_ld[legal_description_col].str.extract(r'\b([0-9a-z-]+)\sof\s([0-9]+)',
                                                                             flags=re.IGNORECASE)
                dataframe_yes_ld[parsed_unit_col].fillna(split4[0], inplace=True)
                dataframe_yes_ld[parsed_st_num1_col].fillna(split4[1])
                dataframe_yes_ld['new' + legal_description_col] = dataframe_yes_ld[legal_description_col].str.replace(
                    'of\s', '', flags=re.IGNORECASE)
                dataframe_yes_ld['new' + legal_description_col] = dataframe_yes_ld[
                    'new' + legal_description_col].str.replace(r'#\s?[0-9a-z]{1,5}', '', flags=re.IGNORECASE, regex=True)
                # parse where string contains num? st suffix
                split1 = dataframe_yes_ld['new' + legal_description_col].str.extract(
                    st_num + r'?-?' + st_num + r'?\b' + r'([nsewrl])?\s?' + st_name_la +
                    r'\s' + st_sfx_la, flags=re.IGNORECASE
                    )
                dataframe_yes_ld[parsed_st_num1_col] = dataframe_yes_ld[parsed_st_num1_col].fillna(split1[0])
                dataframe_yes_ld[parsed_st_num2_col] = dataframe_yes_ld[parsed_st_num2_col].fillna(split1[1])
                dataframe_yes_ld[parsed_sd_col] = dataframe_yes_ld[parsed_sd_col].fillna(split1[2])
                dataframe_yes_ld[parsed_st_name_col] = dataframe_yes_ld[parsed_st_name_col].fillna(split1[3])
                dataframe_yes_ld[parsed_ss_col] = dataframe_yes_ld[parsed_ss_col].fillna(split1[4])
                # parse where string contains #unit? <some word> + condo
                split2 = dataframe_yes_ld['new' + legal_description_col].str.extract(
                    r'(of\s)?#?([0-9]+[a-z]?)?\s?(of\s)?([a-z\s]+)\scondo', flags=re.IGNORECASE)
                dataframe_yes_ld[parsed_unit_col] = dataframe_yes_ld[parsed_unit_col].fillna(split2[1])
                dataframe_yes_ld[parsed_address_name] = dataframe_yes_ld[parsed_address_name].fillna(split2[3])
                dataframe_yes_ld = dataframe_yes_ld.drop(columns=['new' + legal_description_col])
                dataframe_yes_ld['parsed_from'] = 'legal_desc'
            else:
                if log is not False:
                    print('not parsing legal address column')
            dataframe = pd.concat([dataframe_yes_ld, dataframe_no_ld])
            return dataframe
            # drop duplicates based on address & legal address column

        # dataframe2.drop_duplicates(subset=[new_address_col, legal_description_col], inplace=True)
        # extract where format is num st sfx unit, but unit does not need a # signifier
        split0 = dataframe1[new_address_col].str.extract(
            st_num + r'-?\s?' + st_num + r'?\s' + st_d + r'?[\s-]?' + st_name + r'\s' + st_sfx + r'[-\s,]{1,3}' +
            r'#?([0-9-]{1,4}-?[a-z]?|[a-z]-?[0-9]{1,4}|[up][0-9]{1,4}[abcd]|un\s[0-9abcd]{1,4}|[abcd])' + r'([^0-9]+)?',
            flags=re.IGNORECASE)
        dataframe1[parsed_st_num1_col].fillna(split0[0], inplace=True)
        dataframe1[parsed_st_num2_col].fillna(split0[1], inplace=True)
        dataframe1[parsed_sd_col].fillna(split0[2], inplace=True)
        dataframe1[parsed_st_name_col].fillna(split0[3], inplace=True)
        dataframe1[parsed_ss_col].fillna(split0[4], inplace=True)
        dataframe1[parsed_unit_col].fillna(split0[5], inplace=True)

        dataframe2 = dataframe1[~dataframe1[added_cols].notna().all(1)]
        dataframe1 = dataframe1[dataframe1[added_cols].notna().all(1)]

        dataframe1['parsed_from'] = 'num_st_sfx_u'
        df_list.append(dataframe1)

        # extract where format is num num? st sfx zipcode?
        split1 = dataframe2[new_address_col].str.extract(
            st_num + r'-?\s?' + st_num + r'?\s' + st_d + r'?[\s-]?' + st_name +
            r'\s' + st_sfx + r'\s?' + r'#?([0-9-]{1,4}-?[a-z]?|[a-z]-?[0-9]{1,4}|[up][0-9]{1,4}[abcd]|un\s[0-9abcd]{1,4})' + '?'
                                                                                                                             r',?\b' + zipcode + '?',
            flags=re.IGNORECASE)
        dataframe2[parsed_st_num1_col].fillna(split1[0], inplace=True)
        dataframe2[parsed_st_num2_col].fillna(split1[1], inplace=True)
        dataframe2[parsed_sd_col].fillna(split1[2], inplace=True)
        dataframe2[parsed_st_name_col].fillna(split1[3], inplace=True)
        dataframe2[parsed_ss_col].fillna(split1[4], inplace=True)
        dataframe2[parsed_unit_col].fillna(split1[5], inplace=True)
        dataframe2[parsed_zip_col].fillna(split1[6], inplace=True)
        if legal_description_col is not False:
            dataframe2 = parse_from_legal_description(dataframe=dataframe2)

        dataframe3 = dataframe2[~dataframe2[added_cols].notna().all(1)]
        dataframe2 = dataframe2[dataframe2[added_cols].notna().all(1)]
        dataframe2['parsed_from'] = 'num_st_sfx'
        df_list.append(dataframe2)

        # extract where format is num st_d? st
        split2 = dataframe3[new_address_col].str.extract(
            r'^' + st_num + r'-?\s?' + st_num + r'?\s' + st_d + '?' + st_name + '\s?,?\s?#?([0-9a-z\s]{1,4})?$',
            flags=re.IGNORECASE)
        dataframe3[parsed_st_num1_col].fillna(split2[0], inplace=True)
        dataframe3[parsed_st_num2_col].fillna(split2[1], inplace=True)
        dataframe3[parsed_sd_col].fillna(split2[2], inplace=True)
        dataframe3[parsed_st_name_col].fillna(split2[3], inplace=True)
        dataframe3[parsed_unit_col].fillna(split2[4], inplace=True)
        dataframe4 = dataframe3[~dataframe3[[parsed_st_num1_col, parsed_st_name_col]].notna().all(1)]
        dataframe3 = dataframe3[dataframe3[[parsed_st_num1_col, parsed_st_name_col]].notna().all(1)]
        dataframe3['parsed_from'] = 'num_st'
        df_list.append(dataframe3)

        # extract where format is st_st_sfx
        split3 = dataframe4[new_address_col].str.extract(
            '^' + st_d + r'?\s?' + st_name + r'\s' + st_sfx + r'\s?[,-]?\s?$', flags=re.IGNORECASE)
        dataframe4[parsed_sd_col].fillna(split3[0], inplace=True)
        dataframe4[parsed_st_name_col].fillna(split3[1], inplace=True)
        dataframe4[parsed_ss_col].fillna(split3[2], inplace=True)
        dataframe5 = dataframe4[~dataframe4[[parsed_ss_col, parsed_st_name_col]].notna().all(1)]
        dataframe4 = dataframe4[dataframe4[[parsed_ss_col, parsed_st_name_col]].notna().all(1)]
        dataframe4['parsed_from'] = 'st_st_sfx'
        df_list.append(dataframe4)

        # extract where format is st_st_sfx_unit
        split4 = dataframe5[new_address_col].str.extract('^' + st_d + r'?\s?' + st_name +
                                                         r'\s' + st_sfx + r'\s' + unit2 + '.+$', flags=re.IGNORECASE)
        dataframe5[parsed_sd_col].fillna(split4[0], inplace=True)
        dataframe5[parsed_st_name_col].fillna(split4[1], inplace=True)
        dataframe5[parsed_ss_col].fillna(split4[2], inplace=True)
        dataframe5[parsed_unit_col].fillna(split4[3], inplace=True)
        dataframe6 = dataframe5[~dataframe5[[parsed_ss_col, parsed_st_name_col]].notna().all(1)]
        dataframe5 = dataframe5[dataframe5[[parsed_ss_col, parsed_st_name_col]].notna().all(1)]
        dataframe5['parsed_from'] = 'st_st_sfx_unit'
        df_list.append(dataframe5)

        split5 = dataframe6[new_address_col].str.extract('^' + st_d + r'?\s?' + st_name +
                                                         r'\s' + st_sfx + '\s.+$', flags=re.IGNORECASE)
        dataframe6[parsed_sd_col].fillna(split5[0], inplace=True)
        dataframe6[parsed_st_name_col].fillna(split5[1], inplace=True)
        dataframe6[parsed_ss_col].fillna(split5[2], inplace=True)
        dataframe7 = dataframe6[~dataframe6[[parsed_ss_col, parsed_st_name_col]].notna().all(1)]
        dataframe6 = dataframe6[dataframe6[[parsed_ss_col, parsed_st_name_col]].notna().all(1)]
        dataframe6['parsed_from'] = 'st_st_sfx_otherStuff'
        df_list.append(dataframe6)

        # extract where format is (.+)way
        # has to be done like this because broadway doesnt have a st sfx
        split6 = dataframe7[new_address_col].str.extract('^' + st_num + r'?\s?' + '([a-z]+way)' + '\s?' +
                                                         r'#?\s?([0-9-]+-?[a-z]?|[a-z]-?[0-9]+|[up][0-9]?[abcd]|un\s[0-9abcd])' + '?')
        dataframe7[parsed_st_name_col].fillna(split6[1], inplace=True)
        dataframe7[parsed_st_num1_col].fillna(split6[0], inplace=True)
        dataframe7[parsed_unit_col].fillna(split6[2], inplace=True)
        #     dataframe7.drop(columns=['temp_col'], inplace=True)
        dataframe_not_parsed = dataframe7[~dataframe7[[parsed_st_num1_col, parsed_st_name_col]].notna().all(1)]
        dataframe7 = dataframe7[dataframe7[[parsed_st_num1_col, parsed_st_name_col]].notna().all(1)]
        dataframe7['parsed_from'] = 'way'
        dataframe_not_parsed['parsed_from'] = 'not parsed'
        df_list.append(dataframe7)

        dataframe = pd.concat(df_list)
    else:
        dataframe[new_address_col] = dataframe[address_col]
        dataframe_not_parsed = pd.DataFrame(columns=dataframe.columns)
    if is_string_dtype(dataframe[parsed_st_num1_col]):
        dataframe[parsed_st_num1_col] = dataframe[parsed_st_num1_col].str.replace('-', '')
    if is_string_dtype(dataframe[parsed_st_num1_col]):
        dataframe[parsed_st_num1_col] = dataframe[parsed_st_num1_col].str.replace('-', '')
    if is_string_dtype(dataframe[parsed_zip_col]):
        dataframe[parsed_zip_col] = dataframe[parsed_zip_col].str.replace('_$', '', regex=True)
    dataframe[parsed_st_num1_col] = np.where(dataframe[parsed_st_num1_col].isna(),
                                             dataframe[parsed_st_num1_col],
                                             dataframe[parsed_st_num1_col].astype('int32', errors='ignore'))
    for col in [parsed_unit_col, parsed_st_num1_col, parsed_sd_col, parsed_st_name_col, parsed_ss_col,
                parsed_zip_col, parsed_city_col]:
        try:
            dataframe[col] = dataframe[col].str.replace(r'\s{2,}', " ",
                                                        flags=re.IGNORECASE, regex=True)
            dataframe[col] = dataframe[col].str.strip()
            dataframe[col] = dataframe[col].str.replace(r'\.0', " ",
                                                        flags=re.IGNORECASE, regex=True)
            dataframe[col] = np.where(dataframe[col].str.contains('^(\s+)?$', na=True), np.nan, dataframe[col])
            dataframe[col] = np.where(dataframe[col].str.contains('^nan$', na=True), np.nan, dataframe[col])
        except AttributeError as error:
            print('{} is numeric, not attempting to remove whitespace'.format(col))
    # fill unit col w/ n2 col that has letters
    try:
        dataframe['fill_unit'] = np.where((dataframe[parsed_unit_col].isna()) & (
            dataframe[parsed_st_num2_col].str.contains('[a-z]', flags=re.IGNORECASE)),
                                          dataframe[parsed_st_num2_col], np.nan
                                          )
        dataframe[parsed_unit_col].fillna(dataframe['fill_unit'], inplace=True)
        dataframe.drop(columns='fill_unit', inplace=True)
        dataframe[parsed_st_num2_col] = np.where(
            dataframe[parsed_st_num2_col].str.contains('[a-z]', flags=re.IGNORECASE),
            np.nan,
            dataframe[parsed_st_num2_col])
    except AttributeError:
        print('{} is numeric, not attempting to remove fill unit'.format(parsed_st_num2_col))
    # remove n2 that have letters
    # clean unit col
    try:
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].str.replace(r'-|\.|!|@|\$|~|\(|\)|\\|\||\*|/|"|`|\s|#',
                                                                            "", regex=True)
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].str.replace('(u)([0-9]+[a-z]?)', r'\g<2>', regex=True)
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].str.lower()
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].str.strip()
        # clean unit col
        dataframe[parsed_unit_col] = dataframe[parsed_unit_col].str.replace(r'-|\.|!|@|\$|~|\(|\)|\\|\||\*|/|"|`|\s|#',
                                                                            "", regex=True)
        # parse directional from address and fill na
        split_d = dataframe[new_address_col].str.extract('\s' + '([nsew]{1,2})' + '(\s|,|$)', flags=re.IGNORECASE)
        dataframe[parsed_sd_col].fillna(split_d[0], inplace=True)
        # remove directionals from parsed street name
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.replace(r'^([nsewrl])\s(.+)', r'\g<2>', regex=True)
        dataframe[parsed_st_name_col] = dataframe[parsed_st_name_col].str.replace(r'(.+)\s([nsewrl])$', r'\g<1>', regex=True)
    except AttributeError:
        print('{} is not string type, not attempting to remove whitespace'.format(col))
    # standardize street name
    dataframe[parsed_st_name_col] = string_standardize_column_vectorized(dataframe[parsed_st_name_col])
    dataframe[prefix + 'fullAddress'] = combine_names(dataframe[[parsed_unit_col, parsed_st_num1_col,
     parsed_sd_col, parsed_st_name_col, parsed_ss_col,
                             parsed_zip_col, parsed_city_col]],
                  name_cols=[parsed_unit_col, parsed_st_num1_col, parsed_sd_col, parsed_st_name_col, parsed_ss_col,
                             parsed_zip_col, parsed_city_col],  fill='empty'
                  )
    dataframe = pd.concat([dataframe, dataframe_not_parsed])

    dataframe[prefix + 'fullAddress'].fillna(dataframe[address_col], inplace=True)
    if legal_description_col is not False:
        dataframe[prefix + 'fullAddress'].fillna(dataframe[legal_description_col], inplace=True)

    if legal_description_col is not False:
        dataframe_full = pd.merge(dataframe_full, dataframe, how='left',
                                  on=merge_cols + [legal_description_col], indicator=True)
    else:
        dataframe_full = pd.merge(dataframe_full, dataframe, how='left',
                                  on=merge_cols, indicator=True)
    if dataframe_full['_merge'].isin(['left_only']).any():

        raise ValueError('Some addresses didnt get merged right, buddy')
    if dataframe_full.shape[0] != initial_shape:
        raise ValueError('Some addresses got lost, buddy')
    dataframe_full.drop(columns=['_merge'], inplace=True)
    dataframe_full[prefix + 'fullAddress'] = dataframe_full[prefix + 'fullAddress'].str.replace('\.0', '', regex=True)
    return dataframe_full
# Example no. 5
# 0
def string_standardize_column_vectorized(column:pd.Series,  log=False) -> pd.Series:
    """Abbreviate address words in a string column (intended to follow US Postal Standards).

    Every pattern listed in ``replacement_dict`` is matched case-insensitively
    as a whole word (``\\b`` on both sides) and replaced by its dictionary key.
    Runs of two or more spaces are then collapsed to a single space.

    :param column: Series of address-like strings.
    :param log: when not ``False`` and the column is entirely NA, a message is
        written with ``write_to_log`` before returning.
    :return: the standardized Series. The input is returned untouched when it
        is entirely NA or when ``is_string_dtype(column)`` is false.
    """
    if column.isna().all():
        if log is not False:
            # Report the column's name; formatting the Series itself would dump
            # its whole repr into the log line.
            write_to_log('{} is completely NA... Not attempting to clean'.format(column.name))
        return column
    if not is_string_dtype(column):
        return column
    replacement_dict = {
        'apt': ['apa?rtme?nt', 'apts'],
        'aly': ['allee', 'alle?y'],
        'ave': ['av', 'ave?nu?e?'],
        'blvd': ['boulevard', 'boulv?'],
        'brg': ['br', 'bri?dge?'],
        'canyn': ['canyon', 'cnyn'],
        "condo": ["CONDOMINIUM|CO?NDO?MI?NI?U?MS?|CONDOS|CONDOS|COND"],
        'ctr': ['cent?', 'center', 'ce?ntre?'],
        'cir': ['ci?rcl?e?'],
        'ct': ['co?u?rt'],
        'dr': ['dri?ve?'],
        'e': ['east'],
        'est': ['estate'],
        'expy': ['expr?e?s?s?', 'expressway', 'expw'],
        'ext': ['exte?nsi?o?n'],
        'ft': ['fo?rt'],
        'fwy': ['fre?e?wa?y'],
        'gdn': ['ga?rde?n'],
        'hbr': ['harb', 'ha?rbo?r'],
        'hts': ['ht', 'heights', 'hgts?'],
        'hwy': ['highway', 'highwy', 'hiway', 'hiwy', 'hway', 'hw'],
        'jct': ['ju?ncti?o?n'],
        'ln': ['lane' ],
        'lp': ['lo?o?p'],
        'mt': ['mntain', 'mntn', 'mountain', 'mountin', 'mtin', 'mtn', 'mount'],
        'n': ['north', 'no'],  # no can unfortunately be number or north depending
        'rd': ['road'],
        'pk': ['parks?'],
        'pl': ['place'],
        'plz': ['plaza'],
        'pkwy': ['parkway'],
        'rdge': ['ridge'],
        'riv': ['rive?r'],
        'rte': ['route'],
        'rw': ['row?'],
        'sq': ['squ?a?re?'],
        'ste': ['suite'],
        'st': ['street', 'str', 'saint'],
        's': ['south', 'so'],
        'ter': ['terr', 'terrace', 'ter$', 'te'],
        'trl': ['trail', 'trl'],
        'w': ['west'],
        'way': ['wy'],
        '1': ['one'],
        '2': ['two'],
        '100': ['hundred'],
        '1st': ['first', '01st'],
        '2nd': ['second', '02nd'],
        '3rd': ['third', '03rd'],
        '4th': ['fourth', '04th'],
        '5th': ['fifth', '05th'],
        '6th': ['sixth', '06th'],
        '7th': ['seventh', '07th'],
        '8th': ['eighth', '08th'],
        '9th': ['ninth', '09th'],
        '10th': ['tenth'],
        'AL': ['Alabama'],
        'AK': ['Alaska'],
        'AS': ['American Samoa'],
        'AZ': ['Arizona'],
        'AR': ['Arkansas'],
        'CA': ['California'],
        'CO': ['Colorado'],
        'CT': ['Connecticut'],
        'DE': ['Delaware'],
        'DC': ['District of Columbia'],
        'FL': ['Florida'],
        'GA': ['Georgia'],
        'GU': ['Guam'],
        'HI': ['Hawaii'],
        'ID': ['Idaho'],
        'IL': ['Illinois'],
        'IN': ['Indiana'],
        'IA': ['Iowa'],
        'KS': ['Kansas'],
        'KY': ['Kentucky'],
        'LA': ['Louisiana'],
        'ME': ['Maine'],
        'MD': ['Maryland'],
        'MA': ['Massachusetts'],
        'MI': ['Michigan'],
        'MN': ['Minnesota'],
        'MS': ['Mississippi'],
        'MO': ['Missouri'],
        'MT': ['Montana'],
        'NE': ['Nebraska'],
        'NV': ['Nevada'],
        'NH': ['New Hampshire'],
        'NJ': ['New Jersey'],
        'NM': ['New Mexico'],
        'NY': ['New York'],
        'NC': ['North Carolina'],
        'ND': ['North Dakota'],
        'MP': ['Northern Mariana Islands'],
        'OH': ['Ohio'],
        'OK': ['Oklahoma'],
        'OR': ['Oregon'],
        'PA': ['Pennsylvania'],
        'PR': ['Puerto Rico'],
        'RI': ['Rhode Island'],
        'SC': ['South Carolina'],
        'SD': ['South Dakota'],
        'TN': ['Tennessee'],
        'TX': ['Texas'],
        'UT': ['Utah'],
        'VT': ['Vermont'],
        'VI': ['Virgin Islands'],
        'VA': ['Virginia'],
        'WA': ['Washington'],
        'WV': ['West Virginia'],
        'WI': ['Wisconsin'],
        'WY': ['Wyoming'],
        # boston specific
        # "onwealth" is 8 letters, so the class must repeat up to 8 times for
        # "commonwealth" to match as a whole word.
        'comm': ['co?mm[onwealth]{1,8}'],
        'center': ['centre'],
        'whaler': ['whalers'],
        # 'lakehore' is kept for backward compatibility with the old pattern
        'lake shore': ['lakeshore', 'lakehore'],
        # baltimore specific
        'cold spring': ['coldspring']
    }  # update as needed
    # Multi-word phrases must be handled before single-word rules: otherwise
    # 'north'/'south'/'west' are abbreviated first and names such as
    # "West Virginia" or "North Carolina" can never match their own rule.
    # sorted() is stable, so dictionary order is preserved inside each group.
    ordered_rules = sorted(
        replacement_dict.items(),
        key=lambda rule: not any(' ' in pattern for pattern in rule[1]),
    )
    for replacement, patterns in ordered_rules:
        re_string = r'(\b)(%s)(\b)' % '|'.join(sorted(patterns))
        # \g<1> / \g<3> keep numeric keys such as '1' from being read as part
        # of a group reference.
        re_replace = r'\g<1>%s\g<3>' % replacement
        column = column.str.replace(re_string, re_replace, flags=re.IGNORECASE, regex=True)
    # replace runs of spaces with a single one
    column = column.str.replace(r' {2,}', r' ', regex=True)
    return column