def download_hospital_data(url, write_out_csv=True):
    """Fetch the hospital CSV from ``url``, geocode county FIPS codes,
    clean the bed counts, and optionally write the result to a dated CSV.

    Parameters
    ----------
    url : str
        Location of the raw hospital CSV file.
    write_out_csv : bool
        When True, write the processed frame to
        ``hospital_data_processed_YYYYMMDD.csv`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        The processed hospital data.
    """
    hospital_df = pd.read_csv(url)

    # TODO: Move this processing code probably into data processing class
    # TODO: Leverage geocoder class instead of this code
    fips_lookup = addfips.AddFIPS()  # reverse geocoder: county+state -> FIPS

    # Derive a county FIPS code from each row's COUNTY and STATE columns.
    hospital_df['fips'] = hospital_df.apply(
        lambda row: fips_lookup.get_county_fips(row['COUNTY'], row['STATE']),
        axis=1)

    # Clamp bed counts: negative BEDS values are replaced with 0.
    hospital_df['BEDS'] = hospital_df.apply(
        lambda row: remove_negative_values_replace_with_zero(row['BEDS']),
        axis=1)

    if write_out_csv:
        # File name carries today's date, e.g. hospital_data_processed_20240101.csv
        stamp = time.strftime("%Y%m%d")
        out_path = 'hospital_data_processed_' + stamp + '.csv'
        hospital_df.to_csv(out_path, index=False, header=True)

    return hospital_df
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """Add State and County FIPS IDs to a dataframe."""
    # Cast the lookup columns to the nullable string dtype so every missing
    # value is the same pd.NA sentinel, which we screen out before handing
    # names to addfips.
    df = df.astype({
        state_col: pd.StringDtype(),
        county_col: pd.StringDtype(),
    })
    fips_coder = addfips.AddFIPS(vintage=vintage)

    def _lookup_state(row):
        # Skip the lookup entirely when the state name is missing.
        if pd.notnull(row[state_col]):
            return fips_coder.get_state_fips(state=row[state_col])
        return pd.NA

    def _lookup_county(row):
        # Only the county name is checked for null here (the state is passed
        # through regardless), matching the original behavior.
        if pd.notnull(row[county_col]):
            return fips_coder.get_county_fips(
                state=row[state_col], county=row[county_col])
        return pd.NA

    df["state_id_fips"] = df.apply(_lookup_state, axis=1)
    logger.info(
        f"Assigned state FIPS codes for "
        f"{len(df[df.state_id_fips.notnull()])/len(df):.2%} of records.")

    df["county_id_fips"] = df.apply(_lookup_county, axis=1)
    # Cast the new code columns to nullable strings as well - the leading
    # zeros are important.
    df = df.astype({
        "county_id_fips": pd.StringDtype(),
        "state_id_fips": pd.StringDtype(),
    })
    logger.info(
        f"Assigned county FIPS codes for "
        f"{len(df[df.county_id_fips.notnull()])/len(df):.2%} of records.")
    return df
def handle(self, *args, **options):
    """Import USGS station rows from stations.csv into ``models.node``.

    Each non-header CSV row describes one station. Stations that already
    exist (matched by ``name`` == site number) are left untouched; new
    ones are created with position, county/state info and FIPS codes.
    """
    base_dir = settings.BASE_DIR
    fips_coder = addfips.AddFIPS()
    with open('%s/utils/crawlers/usgs_station/stations.csv' % (base_dir)) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for line_no, row in enumerate(csv_reader):
            if line_no == 0:
                # Skip the header row.
                continue
            # Renamed `long` -> `lon` and `id` -> `site_id` so locals no
            # longer shadow the builtins of the same names.
            (site_no, url, lat, lon, site_id, county, state,
             hydrolic_unit_id, id2) = row
            state_fips = fips_coder.get_state_fips(state)
            county_fips = fips_coder.get_county_fips(county, state=state)

            node = get_object_or_None(models.node, name=site_no)
            if node:
                log('node %s - exists' % (site_no), 'info')
                continue

            node = models.node()
            node.name = site_no
            # NOTE(review): Point implementations usually take
            # (x=longitude, y=latitude); the original (lat, lon) order is
            # preserved here — confirm it matches this Point's contract.
            node.position = Point(float(lat), float(lon))
            node.county = county
            node.state = getStateAbbrev(state)
            node.fips_state = state_fips
            node.fips_county = county_fips
            node.meta = json.dumps({'site_no': site_no,
                                    'hydrolic_unit_id': hydrolic_unit_id,
                                    'id2': id2})
            node.save()
            log('node %s - insert' % (site_no), 'success')
def get_fips(row):
    '''Return the county FIPS code for one dataframe row (a Series).

    The row must carry BUYER_COUNTY and BUYER_STATE fields; addfips
    resolves them to a combined state+county FIPS string.
    '''
    fips_coder = addfips.AddFIPS()
    enriched = fips_coder.add_county_fips(
        series_to_dict(row),
        county_field="BUYER_COUNTY",
        state_field="BUYER_STATE",
    )
    return enriched['fips']
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """Add State and County FIPS IDs to a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing state and county name columns.
    state_col : str
        Name of the column holding the state name/abbreviation.
    county_col : str
        Name of the column holding the county name.
    vintage : int
        Census vintage year passed through to addfips.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``state_id_fips`` and ``county_id_fips`` columns added.
    """
    af = addfips.AddFIPS(vintage=vintage)
    # BUG FIX: the original hard-coded x.state / x.county in the lambdas,
    # silently ignoring the state_col / county_col parameters; honor them
    # here (defaults keep the old behavior for existing callers).
    df["state_id_fips"] = df.apply(
        lambda x: af.get_state_fips(state=x[state_col]), axis=1)
    logger.info(
        f"Assigned state FIPS codes for "
        f"{len(df[df.state_id_fips.notnull()])/len(df):.2%} of records.")
    df["county_id_fips"] = df.apply(
        lambda x: af.get_county_fips(state=x[state_col], county=x[county_col]),
        axis=1)
    # NOTE: the original also ran `county_id_fips.fillna(pd.NA)` here, which
    # is a no-op (filling missing values with NA leaves them missing); that
    # dead statement has been removed.
    logger.info(
        f"Assigned county FIPS codes for "
        f"{len(df[df.county_id_fips.notnull()])/len(df):.2%} of records.")
    return df
def getFIPS_General(city_tuple, values):
    # not used as the addfips lib can not recognize some cities
    '''
    Look up a county FIPS code for each (state, county) pair and sort the
    codes together with their values via getSorted.

    :param city_tuple: sequence of (state, county) tuples
    :param values: list aligned index-by-index with city_tuple
    '''
    assert isinstance(values, list)
    af = addfips.AddFIPS()
    # One lookup per value; city_tuple[i] is (state, county).
    codes = [
        af.get_county_fips(city_tuple[i][1], state=city_tuple[i][0])
        for i in range(len(values))
    ]
    codes, values = getSorted(codes, values)
    return codes
def get_county_fips_code(fname, state_col, county_col):
    '''
    A function that uses the addfips library to assign fips codes
    accordingly to our data. FIPS codes help with geographic plots for our
    dataset, as they are unified numbers for states and counties.

    :fname: string. Name of input file to get FIPS codes from
    :state_col: column number where state name is located
    :county_col: column number where county name is stored.
    '''
    assert isinstance(fname, str) and isinstance(state_col, int) and isinstance(county_col, int)
    assert state_col >= 0 and county_col >= 0
    af = addfips.AddFIPS()
    # Pull the state and county columns from the input file.
    _, states = import_data(fname, read='col', samples=None, col_num=state_col)
    _, counties = import_data(fname, read='col', samples=None, col_num=county_col)
    assert len(states) == len(counties)
    # One FIPS code per (county, state) pair, in file order.
    return [af.get_county_fips(cty, st) for cty, st in zip(counties, states)]
def clean(data):
    """Normalize raw model rows: rename headers, parse dates, attach state
    FIPS codes, encode the model type, and fix column/record order."""
    df = pd.DataFrame(data)
    # Rename the file headers to our canonical variable names.
    df.rename(variables, axis="columns", inplace=True)
    # Reformat dates.
    df['date_modeled'] = pd.to_datetime(df['date_modeled'])
    # Turn state names into FIPS codes.
    af = addfips.AddFIPS()
    df['us_state_fips'] = [af.get_state_fips(name) for name in df['state']]
    # Drop records where the state FIPS code is empty.
    df = df.dropna(subset=['us_state_fips'])
    # Code model type.
    df['model'] = df['model'].map({'polls-only': '0', 'polls-plus': '1'})
    # Reorder so that the cnt and new are always next to each other in the
    # same order.
    df = df[[
        'election_cycle', 'race_type', 'name_incumbent', 'name_challenger',
        'name_third', 'us_state_fips', 'tipping', 'vpi',
        'win_state_incumbent', 'win_state_challenger', 'win_state_third',
        'win_ec_if_win_state_incumbent', 'win_ec_if_win_state_challenger',
        'win_state_if_win_ec_incumbent', 'win_state_if_win_ec_challenger',
        'voteshare_incumbent', 'voteshare_challenger', 'voteshare_third',
        'voteshare_other', 'voteshare_lo_incumbent',
        'voteshare_lo_challenger', 'voteshare_lo_third',
        'voteshare_lo_other', 'voteshare_hi_incumbent',
        'voteshare_hi_challenger', 'voteshare_hi_third',
        'voteshare_hi_other', 'margin', 'margin_lo', 'margin_hi', 'model',
        'date_modeled', 'simulation_timestamp', 'simulation_count'
    ]]
    # Order the records by date, newest first.
    df = df.sort_values(by=['date_modeled'], ascending=False)
    return df
def county_data_for_population_score(st, yr):
    """
    Returns a dataframe containing the 8 pop. char. indicators for each
    county in the defined state.

    st = abbrevation for state of interest (TX, FL, ...) - string
    yr = end year of interst for the ACS 5-year estimates - integer
    """
    # acs requests
    host, dataset = 'https://api.census.gov/data', 'acs/acs5'
    year = str(yr)  # convert year to string

    # pop. char. indicators - refer to documentation for more info
    variables = [
        "B03002_001E", "B03002_003E", "B17001_001E", "B17001_002E",
        "B16010_001E", "B16010_002E", "B01001_001E", "B01001_003E",
        "B01001_004E", "B01001_027E", "B01001_028E", "B01001_020E",
        "B01001_021E", "B01001_022E", "B01001_023E", "B01001_024E",
        "B01001_025E", "B01001_044E", "B01001_045E", "B01001_046E",
        "B01001_047E", "B01001_048E", "B01001_049E", "B25070_001E",
        "B25070_007E", "B25070_008E", "B25070_009E", "B25070_010E",
        "B25070_011E"
    ]

    # variable names for the pop. char. indicators
    col_names = [
        "name", "tpop_race", "tpop_nhwhite", "tpop_poverty",
        "tpop_impoverished", "tpop_education", "tpop_lthsgraduate",
        "tpop_age", "tpop_lt5M", "tpop_5to9M", "tpop_lt5F", "tpop_5to9F",
        "tpop_65to66M", "tpop_67to69M", "tpop70to74M", "tpop_75to79M",
        "tpop_80to84M", "tpop_over85M", "tpop_65to66F", "tpop_67to69F",
        "tpop70to74F", "tpop_75to79F", "tpop_80to84F", "tpop_over85F",
        "tpop_incomeonhousing", "tpop_30to34.9IonH", "tpop_35to39.9IonH",
        "tpop_40to49.9IonH", "tpop_over50IonH", "tpop_NAIonH", "state",
        "county"
    ]

    # BUG FIX: the original looped over `variables` and issued the exact
    # same full request once per variable (the loop variable was never used
    # in the query), throwing away every response but the last. A single
    # request already returns all variables at once.
    print("Fetching data for {} variables".format(len(variables)))
    url = "/".join([host, year, dataset])
    predicates = {
        "get": ",".join(["NAME"] + variables),
        "for": "county:*",
        "in": "state:" + addfips.AddFIPS().get_state_fips(st),
    }
    r = requests.get(url, params=predicates)
    # First row of the JSON payload is the header; skip it.
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])

    # create column of census tract names before the numeric cast below
    name_information = df.name
    # modify the dataframe to remove the name column and format values as
    # integers
    df = df[df.columns[~df.columns.isin(['name'])]].astype(int)

    # Hoist the repeated age-band sums so each is computed once.
    young = df.tpop_lt5M + df.tpop_5to9M + df.tpop_lt5F + df.tpop_5to9F
    seniors = (df.tpop_65to66M + df.tpop_67to69M + df.tpop70to74M +
               df.tpop_75to79M + df.tpop_80to84M + df.tpop_over85M +
               df.tpop_65to66F + df.tpop_67to69F + df.tpop70to74F +
               df.tpop_75to79F + df.tpop_80to84F + df.tpop_over85F)

    # do the calculations for each indicator and store it in a new dataframe
    indicators_df = pd.DataFrame()
    indicators_df = indicators_df.assign(
        name=name_information.apply(lambda x: x.split(',')[0]),
        population=df.tpop_race,
        nwpopulation=df.tpop_nhwhite,
        nwpopulation_p=100 * (df.tpop_nhwhite / df.tpop_race),
        poverty=df.tpop_impoverished,
        poverty_p=100 * (df.tpop_impoverished / df.tpop_poverty),
        educational_attainment_p=100 * (df.tpop_lthsgraduate / df.tpop_education),
        age_0to9=young,
        age_0to9_p=100 * (young / df.tpop_age),
        age_10to64=df.tpop_age - young - seniors,
        age_10to64_p=100 * ((df.tpop_age - young - seniors) / df.tpop_age),
        age_65=seniors,
        age_65_p=100 * (seniors / df.tpop_age),
        housing_burden_p=100 *
        ((df["tpop_30to34.9IonH"] + df["tpop_35to39.9IonH"] +
          df["tpop_40to49.9IonH"] + df["tpop_over50IonH"]) /
         (df.tpop_incomeonhousing - df.tpop_NAIonH)),
        county=df.county)

    # add health data to the indicators dataframe
    # this is for TX only. from the TX DSHS
    if st == 'TX':
        TX_health_indicators_df = pd.read_csv(
            './raw_data/TX_health_indicators.csv').rename(
                index=str, columns={'county': 'name'})
        indicators_df = indicators_df.merge(
            TX_health_indicators_df[[
                'name', 'low_birth_weight', 'cardiovascular_disease'
            ]],
            how='left',
            on="name")
        del df, TX_health_indicators_df

    # save a CSV copy when QA is on; the dataframe is returned either way
    # (the original duplicated the return in both branches).
    if QA:
        indicators_df.to_csv('county_population_df.csv', index=False)
    return indicators_df
def add_geodata_to_dataframe(ut, st, ogdf):
    """
    Returns modified dataframe with geodata added

    ut = unit of data (tract or county) - string
    st = abbreviation for state of interest - string
    ogdf = dataframe that will be modified - dataframe

    Raises ValueError when ``ut`` is neither 'tract' nor 'county'
    (the original printed a message and then crashed with NameError on
    the following ``del tempdf``, since tempdf was never assigned).
    """
    # Hoist the state FIPS lookup; the original built a fresh AddFIPS and
    # re-resolved the state inside each branch.
    state_fips = addfips.AddFIPS().get_state_fips(st)

    # filter shapefile by state and add it to the dataframe
    if ut == "tract":
        print("Fetching geographic {} data".format(ut))
        # read in shp file
        if st == 'TX':
            tempdf = gpd.read_file(
                "./boundary_data/tl_2018_48_tract/tl_2018_48_tract.shp")[[
                    'GEOID', 'geometry'
                ]]
        else:
            tempdf = gpd.read_file("./boundary_data/usatracts/usa_tracts.shp")[
                ['STATEFP', 'GEOID', 'geometry']]
            tempdf = tempdf.rename(index=str, columns={'STATEFP': 'state'})
            # filter shp file based on state
            tempdf = tempdf[tempdf['state'] == state_fips]
        # add geodata
        print("Adding geographic {} data to the dataframe".format(ut))
        # NOTE(review): tempdf carries 'GEOID' (upper case) while the merge
        # key here is 'geoid' — preserved from the original; confirm the
        # tract-level merge key against ogdf before relying on this path.
        geo_dataframe = tempdf.merge(ogdf, how='left', on='geoid')
    elif ut == "county":
        print("Fetching geographic {} data".format(ut))
        # read in shp file
        tempdf = gpd.read_file(
            "./boundary_data/tl_2019_us_county/tl_2019_us_county.shp")[[
                'STATEFP', 'COUNTYFP', 'GEOID', 'geometry'
            ]]
        tempdf = tempdf.rename(index=str, columns={
            'STATEFP': 'state',
            'COUNTYFP': 'county'
        })
        # filter shp file based on state
        tempdf = tempdf[tempdf['state'] == state_fips]
        tempdf['county'] = tempdf['county'].astype(int)
        # add geodata
        print("Adding geographic {} data to the dataframe".format(ut))
        geo_dataframe = tempdf.merge(ogdf, how='left', on='county')
    else:
        # unit is not clearly defined or unavailable - fail fast instead of
        # falling through to an undefined-variable crash.
        raise ValueError(
            "Please check 'ut' input to make sure it is either 'tract' or "
            "'county', not {}".format(ut))

    del tempdf
    # save a CSV copy when QA is on; the dataframe is returned either way
    # (the original duplicated the return in both branches).
    if QA:
        geo_dataframe.to_csv('geocounty_population_df.csv', index=False)
    return geo_dataframe
'r') as csv_summary_file_yesterday: csv_summary_yesterday = csv_summary_file_yesterday.read() if filecmp.cmp(csv_summary_file_name_today_tmp, csv_summary_file_name_yesterday): print('Website not updated: ', url) os.remove(csv_summary_file_name_today_tmp) sys.exit(1) else: print('Begin crawling') ##===process today's csv summary data=== csv_summary_today = csv_summary_today_tmp if os.path.exists(csv_summary_file_name_today_tmp): os.rename(csv_summary_file_name_today_tmp, csv_summary_file_name_today) print('Save today\'s csv summary data: ', csv_summary_file_name_today) df = pd.read_csv(csv_summary_file_name_today)[:-1] df['Case Count'] = df['Case Count'].astype('int64') df['Death Count'] = df['Death Count'].astype('int64') df['Hospitalized Count'] = df['Hospitalized Count'].astype('int64') df2 = df.groupby('County', as_index=True)[[ 'Case Count', 'Death Count', 'Hospitalized Count' ]].sum().reset_index() af = addfips.AddFIPS() df2['fips'] = df2['County'].apply(lambda x: af.get_county_fips(x, 'OH')) df2.to_csv(csv_county_summary_file_name_today, index=None, header=True) print('Save today\'s csv county summary data: ', csv_county_summary_file_name_today)
from . import settings import addfips from dateutil import parser as dateutil_parser import hashlib import os import pendulum import requests import threading import us _fips = addfips.AddFIPS() _fip_state_reverse = {} _fip_state_full_reverse = {} _fip_county_reverse = {} # Patches to the 'addfips' library, for counties _fip_patches = { # Added everywhere. '_global': { 'all': '000', 'unknown': '999', }, 'AK': { # There are technically two codes; this is used for ambiguous # entries. 'aleutian islands': '010', # These show up in the USDA census data. They're to be trusted. 'anchorage borough': '020', 'juneau borough': '110', 'kuskokwim division': '160', 'prince of wales-outer ketchikan census area': '201',
def __init__(self, county_fips_download_url):
    """Set up an addfips coder and load county FIPS info.

    :param county_fips_download_url: URL handed straight to
        ``download_county_fips_info`` — presumably points at the county
        FIPS data to fetch (TODO confirm against that method).
    """
    # Coder is created before the download; download_county_fips_info may
    # rely on self.af being set — can't tell from here, so order is kept.
    self.af = addfips.AddFIPS()
    self.download_county_fips_info(county_fips_download_url)
def get_fips_code(county, state_abr):
    """Resolve a county name to its FIPS code via addfips.

    :param county: county name
    :param state_abr: state abbreviation passed to addfips as ``state``
    :return: whatever ``AddFIPS.get_county_fips`` yields for the pair
    """
    return addfips.AddFIPS().get_county_fips(county, state=state_abr)