Example #1
def import_addresses(dataset):
    '''
    Reads in one of three address datasets (specified with a string). Returns a
    dataframe.
    '''

    print('Reading in {} addresses'.format(dataset.upper()))

    # Read in the COOK address dataset; rename a column
    if dataset == 'cook':
        df = ad.read_cook_addr()
        df = df.rename(columns={'ID': 'VendorName'}, index=str)

    # Read in the IRS dataset; rename a column and standardize names
    elif dataset == 'irs':
        df = ad.read_irs()
        df = df.rename(columns={'OrganizationName': 'VendorName'}, index=str)
        df['VendorName'] = df['VendorName'].apply(stdname)

    # Read in the IL address dataset; standardize names
    elif dataset == 'il':
        df = ad.read_il_addr()
        df['VendorName'] = df['VendorName'].apply(stdname)

    # Guard against an unrecognized dataset name (otherwise df would be
    # unbound below and raise a NameError)
    else:
        raise ValueError('Unknown dataset: {}'.format(dataset))

    # Convert text fields to uppercase
    df = u.upper(df)

    return df
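
The u.upper utility referenced above isn't defined in these examples. A minimal sketch of what it might look like, assuming it simply uppercases every string-typed column (Example #2 notes the real helper also skips URL columns):

def upper(df):
    # Hypothetical sketch of the u.upper helper: uppercase every
    # object-typed column. Note that non-string entries in an object
    # column come out as NaN under the .str accessor.
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].str.upper()
    return df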
Example #2
def read_contracts():
    '''
    Reads in the contracts dataset via the MERGE_CONTRACTS module. Returns a
    dataframe.
    '''

    # Initialize an empty list to hold the dataframes
    dfs = []

    # For every (filename, label) tuple:
    for fname_tuple in mc.FNAMES:
        # Read in and process the dataset
        df = mc.process_dataset(fname_tuple)
        # If the label == 'CHI':
        if fname_tuple[-1] == 'CHI':
            # Send the dataframe through the round2 address cleaner
            df = addclean.round2(df)
            # Send the Address1 field through the address cleaner
            df['Address1'] = df['Address1'].apply(addclean.address_cleaner)
        # Add the newly processed dataframe into the list
        dfs.append(df)

    # Concatenate all the dataframes
    merged = pd.concat(dfs)

    # Convert the text columns (except for the URLs) to uppercase
    merged = u.upper(merged)

    # The resulting dataframe should contain 6591 records
    return merged
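
The shape of mc.FNAMES isn't shown; per the loop comments, it's presumably a list of (filename, label) tuples. A hypothetical illustration (the filenames are made up), followed by a sanity check against the count noted above:

# Hypothetical illustration of the (filename, label) tuples that
# mc.FNAMES presumably holds; the real module defines the actual paths.
FNAMES = [
    ('contracts_chicago.csv', 'CHI'),
    ('contracts_cook.csv', 'COOK'),
]

# Sanity-check usage against the expected record count:
merged = read_contracts()
assert len(merged) == 6591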
Example #3
def import_pb(fname):
    '''
    Reads in the PurpleBinder dataset. Splits each record into multiple rows,
    one per location in the locations field. Splits the location column into
    its component parts (Address1, Address2, City, State, and ZipCode) and
    then converts all the strings to uppercase.
    Returns a dataframe.
    '''

    # Read in the json file
    df = read_pb(fname)

    # Split the locations into multiple rows (one row per location)
    splitR = split_rows(df)

    # Split the location column into its component parts
    splitC = split_cols(splitR)

    # Convert string columns to uppercase
    df_upper = u.upper(splitC)

    # There are serious problems with some of the geocoding in the PB data, so
    # drop the coordinates
    df_upper = df_upper.drop(['Latitude', 'Longitude'], axis=1)

    return df_upper
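
split_rows and split_cols aren't shown either. A plausible sketch of split_rows, assuming the locations field holds a list per record:

def split_rows(df):
    # Hypothetical sketch: give each location its own row. Assumes a
    # 'locations' column holding a list per record; DataFrame.explode
    # repeats the remaining fields for every list element.
    return df.explode('locations').reset_index(drop=True)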
Example #4
def import_dfss(fname):
    '''
    Reads in the DFSS dataset, converting strings to uppercase. Assigns an ID.
    Returns a dataframe.
    '''

    df = read_dfss(fname)

    df_upper = u.upper(df)

    return df_upper
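
read_dfss isn't shown, but the docstring says an ID is assigned. A hypothetical sketch, assuming a CSV input and a simple sequential ID (both are assumptions):

import pandas as pd

def read_dfss(fname):
    # Hypothetical sketch: read the DFSS file (format assumed to be CSV)
    # and assign a sequential ID, as the import_dfss docstring describes.
    df = pd.read_csv(fname)
    df['ID'] = range(1, len(df) + 1)
    return df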
Example #5
def import_wchi(fname):
    '''
    Reads in the West Chi dataset. Splits the address field into its component
    parts. Converts strings to uppercase. Returns a dataframe.
    '''

    # Read in the WESTCHI file
    df = read_wc(fname)

    # Split addresses into their component parts
    split = split_addr(df)

    # Convert strings to uppercase
    df_upper = u.upper(split)

    return df_upper
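
split_addr isn't shown. One way it might break a full address string into components; the 'Address' column name and the regex are assumptions, and real address parsing is likely more robust:

def split_addr(df):
    # Hypothetical sketch: extract Address1, City, State, and ZipCode
    # from a single comma-separated address string via named groups.
    parts = df['Address'].str.extract(
        r'^(?P<Address1>[^,]+),\s*(?P<City>[^,]+),\s*'
        r'(?P<State>[A-Z]{2})\s*(?P<ZipCode>\d{5})'
    )
    return df.drop(columns=['Address']).join(parts)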
Example #6
def import_mc(fname, sheetname):
    '''
    Reads in one MapsCorps dataset. Replaces str(np.NaN) with the empty string.
    Converts string values to uppercase. Drops duplicates. Returns a dataframe.
    '''

    # Extracts the year from the sheetname
    year = get_year(sheetname)

    # Uses a different function to read in the file based on the year
    if year == 2009:
        df = read_2009(fname, sheetname)
    elif year == 2016:
        df = read_2016(fname, sheetname)

    # Guard against a sheetname with an unrecognized year (otherwise df
    # would be unbound below and raise a NameError)
    else:
        raise ValueError('Unrecognized year: {}'.format(year))

    # Replaces the string 'nan' (str(np.NaN)) with the empty string and converts
    # strings to uppercase
    df = df.replace('nan', '')
    df_upper = u.upper(df)

    return df_upper.drop_duplicates().reset_index(drop=True)
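
get_year isn't shown. A minimal sketch, assuming the sheet name embeds a four-digit year:

import re

def get_year(sheetname):
    # Hypothetical sketch: pull the first four-digit year out of the
    # sheet name (e.g. 'MapsCorps_2016') and return it as an int;
    # returns None if no year is found.
    match = re.search(r'(19|20)\d{2}', sheetname)
    return int(match.group(0)) if match else None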