def write_local_files_debug(self):
    """Fetch the ACS tables and dump them to local CSV and JSON files.

    Intended purely as a debugging/convenience aid; it should not be used
    in production. Each table is written, relative to the current working
    directory, as ``table_<name>.csv`` and ``table_<name>.json``.
    """
    acs_metadata = fetch_acs_metadata(self.base_acs_url)
    var_map = parse_acs_metadata(acs_metadata, list(GROUPS.keys()))

    hispanic_race_json = fetch_acs_group(
        self.base_acs_url, HISPANIC_BY_RACE_CONCEPT, var_map, 2,
        self.county_level)

    # One frame per sex-by-age concept, keyed by concept, each passed
    # through update_col_types.
    sex_by_age_frames = {
        concept: update_col_types(
            gcs_to_bq_util.values_json_to_dataframe(
                fetch_acs_group(
                    self.base_acs_url, concept, var_map, 2,
                    self.county_level)))
        for concept in SEX_BY_AGE_CONCEPTS_TO_RACE
    }

    hispanic_race_frame = standardize_frame(
        update_col_types(
            gcs_to_bq_util.values_json_to_dataframe(hispanic_race_json)),
        get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
        [HISPANIC_COL, RACE_COL],
        self.county_level,
        POPULATION_COL)

    # NOTE(review): write_to_bq passes a second (total population) frame to
    # get_all_races_frame; confirm the one-argument call is still supported.
    race_table = self.get_table_name_by_race()
    race_frame = self.get_all_races_frame(hispanic_race_frame)
    sex_age_table = self.get_table_name_by_sex_age_race()
    sex_age_frame = self.get_sex_by_age_and_race(var_map, sex_by_age_frames)
    tables = {race_table: race_frame, sex_age_table: sex_age_frame}

    # All frames are built before anything is written to disk.
    for name, frame in tables.items():
        stem = "table_" + name
        frame.to_csv(stem + ".csv", index=False)
        frame.to_json(stem + ".json", orient="records")
def upload_to_gcs(self, gcs_bucket):
    """Upload population data from the census to a GCS bucket.

    One file is uploaded per concept (every sex-by-age concept followed by
    the Hispanic-by-race concept), then one more for the total population
    variable.

    gcs_bucket: The GCS bucket passed to url_file_to_gcs for each upload.

    Returns the ``or``-combination of the values returned by
    url_file_to_gcs.url_file_to_gcs across all uploads (presumably whether
    any uploaded file differed from the stored copy -- confirm against
    url_file_to_gcs).
    """
    acs_metadata = fetch_acs_metadata(self.base_acs_url)
    var_map = parse_acs_metadata(acs_metadata, list(GROUPS.keys()))

    all_concepts = [*SEX_BY_AGE_CONCEPTS_TO_RACE, HISPANIC_BY_RACE_CONCEPT]

    any_diff = False
    for concept in all_concepts:
        column_ids = list(get_vars_for_group(concept, var_map, 2))
        params = get_census_params(column_ids, self.county_level)
        # The upload must run for every concept, so its result is bound
        # before being folded in (no short-circuiting of the call).
        diff = url_file_to_gcs.url_file_to_gcs(
            self.base_acs_url, params, gcs_bucket,
            self.get_filename(concept))
        any_diff = any_diff or diff

    total_params = get_census_params(
        [TOTAL_POP_VARIABLE_ID], self.county_level)
    total_diff = url_file_to_gcs.url_file_to_gcs(
        self.base_acs_url, total_params, gcs_bucket,
        self.add_filename_suffix(TOTAL_POP_VARIABLE_ID))

    return any_diff or total_diff
def __init__(self, base_url):
    """Set up the ingester: load ACS metadata and the FIPS mappings.

    base_url: Base ACS URL, used for the metadata fetch and for both the
        state and county FIPS mapping lookups.
    """
    self.base_url = base_url

    # Only the "variables" section of the metadata response is used, and
    # it is trimmed to the median-income-by-race groups before parsing.
    all_variables = fetch_acs_metadata(self.base_url)["variables"]
    income_variables = trimMetadata(
        all_variables, MEDIAN_INCOME_BY_RACE_GROUPS.keys())
    wanted_keys = [MetadataKey.AGE, MetadataKey.INCOME, MetadataKey.RACE]
    self.metadata = parseMetadata(
        income_variables, wanted_keys, self.metadataInitializer)

    self.state_fips = get_state_fips_mapping(base_url)
    self.county_fips = get_county_fips_mapping(base_url)

    # Starts empty; nothing is stored here during construction.
    self.data = {}
def write_to_bq(self, dataset, gcs_bucket):
    """Write population data to BigQuery from the provided GCS bucket.

    dataset: The BigQuery dataset to write to
    gcs_bucket: The name of the gcs bucket to read the data from
    """
    def load_frame(filename):
        # Read one file from the bucket and pass it through
        # update_col_types.
        return update_col_types(
            gcs_to_bq_util.load_values_as_dataframe(gcs_bucket, filename))

    # TODO change this to have it read metadata from GCS bucket
    acs_metadata = fetch_acs_metadata(self.base_acs_url)
    var_map = parse_acs_metadata(acs_metadata, list(GROUPS.keys()))

    hispanic_race_frame = load_frame(
        self.get_filename(HISPANIC_BY_RACE_CONCEPT))
    hispanic_race_vars = get_vars_for_group(
        HISPANIC_BY_RACE_CONCEPT, var_map, 2)
    hispanic_race_frame = standardize_frame(
        hispanic_race_frame,
        hispanic_race_vars,
        [HISPANIC_COL, RACE_COL],
        self.county_level,
        POPULATION_COL)

    total_frame = standardize_frame(
        load_frame(self.add_filename_suffix(TOTAL_POP_VARIABLE_ID)),
        {TOTAL_POP_VARIABLE_ID: ['Total']},
        [RACE_OR_HISPANIC_COL],
        self.county_level,
        POPULATION_COL)

    sex_by_age_frames = {
        concept: load_frame(self.get_filename(concept))
        for concept in SEX_BY_AGE_CONCEPTS_TO_RACE
    }

    # Table names and frames are resolved in the same order as before,
    # and all frames are built before any BigQuery write happens.
    race_table = self.get_table_name_by_race()
    race_frame = self.get_all_races_frame(hispanic_race_frame, total_frame)
    sex_age_table = self.get_table_name_by_sex_age_race()
    sex_age_frame = self.get_sex_by_age_and_race(var_map, sex_by_age_frames)
    tables = {race_table: race_frame, sex_age_table: sex_age_frame}

    for table_name, frame in tables.items():
        # All breakdown columns are strings; only the population column
        # is overridden to an integer type.
        column_types = dict.fromkeys(frame.columns, 'STRING')
        column_types[POPULATION_COL] = 'INT64'
        gcs_to_bq_util.add_dataframe_to_bq(
            frame, dataset, table_name, column_types=column_types)
def __init__(self, base_url):
    """Set up the ingester: load ACS metadata and the FIPS mappings.

    base_url: Base ACS URL, used for the metadata fetch and for both the
        state and county FIPS mapping lookups.
    """
    self.base_url = base_url

    # Only the "variables" section of the metadata response is used, and
    # it is trimmed to the health-insurance-by-sex group prefix.
    all_variables = fetch_acs_metadata(self.base_url)["variables"]
    insurance_variables = trimMetadata(
        all_variables, [HEALTH_INSURANCE_BY_SEX_GROUPS_PREFIX])
    self.metadata = parseMetadata(
        insurance_variables,
        [MetadataKey.AGE, MetadataKey.SEX],
        lambda key: {},
    )

    # Entries that parsing left without a population value default to
    # the TOTAL population.
    for entry in self.metadata.values():
        if MetadataKey.POPULATION not in entry:
            entry[MetadataKey.POPULATION] = HealthInsurancePopulation.TOTAL

    self.state_fips = get_state_fips_mapping(base_url)
    self.county_fips = get_county_fips_mapping(base_url)

    # Starts empty; nothing is stored here during construction.
    self.data = {}
def __init__(self, base_url):
    """Set up the ingester: load ACS metadata and the FIPS mappings.

    base_url: Base ACS URL, used for the metadata fetch and for both the
        state and county FIPS mapping lookups.
    """
    self.base_url = base_url

    # Only the "variables" section of the metadata response is used, and
    # it is trimmed to the health-insurance-by-race group prefixes.
    all_variables = fetch_acs_metadata(self.base_url)["variables"]
    insurance_variables = trimMetadata(
        all_variables, HEALTH_INSURANCE_BY_RACE_GROUP_PREFIXES.keys())
    wanted_keys = [MetadataKey.AGE, MetadataKey.RACE]
    self.metadata = parseMetadata(
        insurance_variables, wanted_keys, self.metadataInitializer)

    # Entries that parsing left without a population value default to
    # the TOTAL population.
    for entry in self.metadata.values():
        if MetadataKey.POPULATION not in entry:
            entry[MetadataKey.POPULATION] = HealthInsurancePopulation.TOTAL

    self.state_fips = get_state_fips_mapping(base_url)
    self.county_fips = get_county_fips_mapping(base_url)

    # Starts empty; nothing is stored here during construction.
    self.data = {}