# NOTE(review): the two statements below reference `process_df`, `file_list`
# and `bls_data_dict`, none of which are defined in this chunk — they appear
# to be the tail of a function defined elsewhere in the file.
df = pd.concat([process_df(file) for file in file_list])
df.to_csv(bls_data_dict['appended zipcode'] + "appended_zip.csv", index=False)


def filter_bls_data(df, address_df):
    """Restrict BLS data to zipcodes present in the parsed address data.

    :param df: BLS dataframe with a 'zip' column
    :param address_df: dataframe with 'parsed_city' and 'parsed_addr_zip'
        columns
    :return: df inner-merged with the distinct (parsed_city, parsed_addr_zip)
        pairs, gaining a 'parsed_city' column; rows whose zip does not appear
        in the address data are dropped
    """
    # zero-pad zipcode to 5 digits so it lines up with parsed_addr_zip.
    # FIX: the original prepended a stray "_" after padding, which made every
    # zip 6 characters and could never match parsed_addr_zip (make_qc_aggs
    # elsewhere in this file merges the bare 5-digit zip against
    # parsed_addr_zip with no underscore).
    df['zip'] = df['zip'].astype(str).str.pad(5, side="left", fillchar="0")
    # attach city names by zipcode; inner join drops zips outside the
    # address data
    df = df.merge(
        address_df[['parsed_city', 'parsed_addr_zip']].drop_duplicates(),
        how="inner",
        left_on="zip",
        right_on="parsed_addr_zip",
    )
    # QC printouts: row counts per city, then zip counts / distinct zips
    print(df['parsed_city'].value_counts())
    print(df.groupby(['parsed_city']).agg(**{
        "num_zip": ('zip', 'count'),
        "num_unique_zip": ('zip', 'nunique'),
    }))
    return df


if __name__ == "__main__":
    # process_raw_data()
    data_dict = make_data_dict(use_seagate=True)
    # regex filters used to subset parsed_city per market; an empty string
    # matches every row
    city_dict = {
        "stl": "",
        "sf": "^san francisco$",
        "seattle": "^seattle$",
        "sd": "^san diego$",
        "chicago": "^chicago$",
        "baton_rouge": "^baton rouge$",
        "la": "^los angeles$",
        'philly': "",
    }

    def filter_df(df, city):
        # keep rows whose parsed_city matches the city regex; NaN cities are
        # treated as "" so they match only the catch-all "" patterns
        df = df[df['parsed_city'].fillna("").str.contains(city)]
        # print(city, df['parsed_city'].value_counts())
        return df
id. startDate endDate type dataframes Main function takes in cleaned business dataframe, makes misc business vars,converts business dataframe into panel and writes business dataframe to csv """
# NOTE(review): the text above is the tail of the module docstring; its
# opening quotes are earlier in the file, outside this chunk.

# third-party
import pandas as pd
import numpy as np
# stdlib
import math
import re
# project-local helpers
from helper_functions import write_to_log, WTL_TIME, fuzzy_merge, get_nearest_address, make_panel
from data_constants import make_data_dict, filePrefix
from name_parsing import parse_business
from clean_address_data import parallelize_dataframe
from typing import Union

# Module-level data dictionary of file paths.
# NOTE(review): built with use_seagate=False here, while other call sites in
# this file (the __main__ block and make_qc_aggs) use use_seagate=True —
# confirm which is intended.
data_dict = make_data_dict(use_seagate=False)


# function that determines if variable is a chain in a given year
# NOTE(review): this definition is cut off in this chunk — its docstring is
# unterminated here and the body continues beyond the visible text.
def make_chain_var(df, name_col='business_id', time_col='year', loc_col='num_locations'):
    """
    Function takes in a dataframe with a name and time column and returns columns containing: the number of observations with the same name and an indicator variable if the number of observations is greater than the threshold
    :param df: dataframe
    :param name_col: string type column of names
    :param time_col: usually is something like year, but technically can be any second variable to group on
    :param loc_col: name of num_observations column to be made
def make_qc_aggs(bls_df: pd.DataFrame, city: str, make_naics_aggs=False):
    """Write QC aggregates comparing BLS establishment counts against the
    business-location file for one city.

    Builds city/year and city/zip/year establishment counts from both
    sources, outer-merges them so gaps in either source stay visible, and
    writes the results to the qc/ directory. When ``make_naics_aggs`` is
    truthy, also builds city/naics/year and city/zip/naics/year aggregates.

    :param bls_df: BLS dataframe with 'parsed_city', 'zip', 'year' and 'est'
        columns (plus 'naics' when make_naics_aggs is truthy)
    :param city: city key into data_dict['final']; also used in the output
        filenames
    :param make_naics_aggs: when truthy, additionally build NAICS-level
        aggregates
    """
    data_dict = make_data_dict(use_seagate=True)
    # One row per (business name, dba name, address, year); rows classified
    # as "person" are not businesses and are dropped.
    bus_df = pd.read_csv(
        data_dict['final'][city]['business_location'] + "business_locations.csv",
        usecols=["year", "parsed_city", "parsed_addr_zip", "is_business",
                 "cleaned_business_name", "cleaned_dba_name",
                 "primary_cleaned_fullAddress"],
    ).drop_duplicates(subset=["cleaned_business_name", "cleaned_dba_name",
                              "primary_cleaned_fullAddress", "year"])
    # Sequential row id used purely as a count column in the groupbys below.
    bus_df = bus_df.assign(index=np.arange(bus_df.shape[0]))
    bus_df = bus_df[bus_df['is_business'] != "person"]
    # Keep only the cities/years that the business-location data covers, so
    # the two sources are compared over the same support.
    bls_df = bls_df[(bls_df['parsed_city'].isin(bus_df['parsed_city'])) &
                    (bls_df['year'].isin(bus_df['year']))]

    # city x year establishment counts from each source
    bls_city_agg = (bls_df
                    .groupby(['parsed_city', 'year'])
                    .agg(**{"num_establishments": ('est', 'sum')})
                    .reset_index())
    bus_df_city_agg = (bus_df
                       .groupby(['parsed_city', 'year'])
                       .agg(**{"num_establishments": ('index', 'count')})
                       .reset_index())
    city_agg = pd.merge(bls_city_agg, bus_df_city_agg,
                        how="outer",
                        suffixes=["_bls", "_business_loc"],
                        on=["parsed_city", "year"])

    # repeat at the city x zip x year level
    bls_city_zip_agg = (bls_df
                        .groupby(['parsed_city', 'zip', 'year'])
                        .agg(**{"num_establishments": ('est', 'sum')})
                        .reset_index())
    bus_df_city_zip_agg = (bus_df
                           .groupby(['parsed_city', 'parsed_addr_zip', 'year'])
                           .agg(**{"num_establishments": ('index', 'count')})
                           .reset_index())
    city_zip_agg = pd.merge(
        bls_city_zip_agg, bus_df_city_zip_agg,
        how="outer",
        suffixes=["_bls", "_business_loc"],
        left_on=["parsed_city", "year", 'zip'],
        right_on=["parsed_city", "year", 'parsed_addr_zip'])

    # FIX: was `if make_naics_aggs is not False:` — plain truthiness test is
    # equivalent for the documented bool usage and idiomatic.
    if make_naics_aggs:
        # TODO(review): bus_df is read with a usecols list that does NOT
        # include 'naics', so the bus_df groupbys below will raise KeyError
        # as written. Either add 'naics' to usecols (if the CSV has it) or
        # drop the bus_df side of these aggregates — confirm the CSV schema.
        # FIX: added .reset_index() to all four aggregates for consistency
        # with the city/zip aggregates above, so the merges join on real
        # columns rather than index levels.
        bls_city_naics_agg = (bls_df
                              .groupby(['parsed_city', 'naics', 'year'])
                              .agg(**{"num_establishments": ('est', 'sum')})
                              .reset_index())
        bus_df_city_naics_agg = (bus_df
                                 .groupby(['parsed_city', 'naics', 'year'])
                                 .agg(**{"num_establishments": 'size'})
                                 .reset_index())
        city_naics_agg = pd.merge(bls_city_naics_agg, bus_df_city_naics_agg,
                                  how="outer",
                                  suffixes=["_bls", "_business_loc"],
                                  on=["parsed_city", "year", "naics"])

        bls_city_zip_naics_agg = (bls_df
                                  .groupby(['parsed_city', 'zip', 'naics', 'year'])
                                  .agg(**{"num_establishments": ('est', 'sum')})
                                  .reset_index())
        bus_df_city_zip_naics_agg = (bus_df
                                     .groupby(['parsed_city', 'parsed_addr_zip',
                                               'naics', 'year'])
                                     .agg(**{"num_establishments": 'size'})
                                     .reset_index())
        city_zip_naics_agg = pd.merge(
            bls_city_zip_naics_agg, bus_df_city_zip_naics_agg,
            how="outer",
            suffixes=["_bls", "_business_loc"],
            left_on=["parsed_city", "year", "naics", 'zip'],
            right_on=["parsed_city", "year", "naics", 'parsed_addr_zip'])

        city_naics_agg.to_csv(filePrefix + f"/qc/bls_{city}_naics_agg.csv",
                              index=False)
        city_zip_naics_agg.to_csv(filePrefix + f"/qc/bls_{city}_zip_naics_agg.csv",
                                  index=False)

    # unconditional city-level and zip-level outputs
    city_agg.to_csv(filePrefix + f"/qc/bls_{city}_agg.csv", index=False)
    city_zip_agg.to_csv(filePrefix + f"/qc/bls_{city}_zip_agg.csv", index=False)