def get_all_races_frame(self, race_and_hispanic_frame):
    """Return one frame with every race category, with and without Hispanic.

    The frame is built from the results of
    `standardize_race_include_hispanic` and
    `standardize_race_exclude_hispanic`, extended with derived rows from
    `add_sum_of_rows`, and finally passed through `sort_race_frame`.

    race_and_hispanic_frame: Frame handed unchanged to both standardize
        helpers.
    """
    include_hisp = self.standardize_race_include_hispanic(
        race_and_hispanic_frame)
    exclude_hisp = self.standardize_race_exclude_hispanic(
        race_and_hispanic_frame).copy()

    # Each variant carries its own "Hispanic or Latino" group; keep only the
    # one from the include-Hispanic frame so it is not duplicated.
    is_not_hispanic = exclude_hisp[RACE_CATEGORY_ID_COL] != Race.HISP.value
    combined = pandas.concat([include_hisp, exclude_hisp[is_not_hispanic]])

    # The race column is dropped before deriving rows so it does not
    # interfere with grouping; it is re-added from the category id below.
    combined = combined.drop(columns=RACE_COL)

    # (derived category id, category ids that are summed to produce it)
    derived_rows = (
        (Race.TOTAL.value,
         list(RACE_STRING_TO_CATEGORY_ID_INCLUDE_HISP.values())),
        (Race.MULTI_OR_OTHER_STANDARD_NH.value,
         [Race.MULTI_NH.value, Race.OTHER_STANDARD_NH.value]),
        (Race.MULTI_OR_OTHER_STANDARD.value,
         [Race.MULTI.value, Race.OTHER_STANDARD.value]),
    )
    for derived_id, source_ids in derived_rows:
        combined = add_sum_of_rows(
            combined, RACE_CATEGORY_ID_COL, POPULATION_COL,
            derived_id, source_ids)

    add_race_columns_from_category_id(combined)
    return self.sort_race_frame(combined)
def get_sex_by_age_and_race(self, var_map, sex_by_age_frames):
    """Returns a DataFrame of population by sex and age and race.

    One standardized frame is produced per entry of
    SEX_BY_AGE_CONCEPTS_TO_RACE, tagged with that entry's race category id,
    and the frames are concatenated. Age brackets are renamed with
    `rename_age_bracket`, then total rows are added for age and for sex.

    var_map: ACS metadata variable map, as returned by
        `parse_acs_metadata`
    sex_by_age_frames: Map of concept to non-standardized DataFrame for
        that concept.
    """
    per_race_frames = []
    for concept, race_id in SEX_BY_AGE_CONCEPTS_TO_RACE.items():
        raw_frame = sex_by_age_frames[concept]
        concept_vars = get_vars_for_group(concept, var_map, 2)
        standardized = standardize_frame(
            raw_frame, concept_vars, [SEX_COL, AGE_COL],
            self.county_level, POPULATION_COL)

        # TODO reorder columns so population is last
        standardized[RACE_CATEGORY_ID_COL] = race_id
        per_race_frames.append(standardized)

    combined = pandas.concat(per_race_frames)
    combined[AGE_COL] = combined[AGE_COL].apply(rename_age_bracket)

    # Totals are added across age first, then across sex.
    for total_col in (AGE_COL, SEX_COL):
        combined = add_sum_of_rows(
            combined, total_col, POPULATION_COL, TOTAL_VALUE)

    add_race_columns_from_category_id(combined)
    return self.sort_sex_age_race_frame(combined)
def generate_breakdown(self, breakdown, df):
    """Build a per-state frame of determinant values for one breakdown.

    For every state in `df` and every value listed in
    BREAKDOWN_MAP[breakdown], one output row is produced holding the value
    of each determinant in UHC_DETERMINANTS_OF_HEALTH.

    breakdown: Breakdown column to generate; also the key into
        BREAKDOWN_MAP. For std_col.RACE_OR_HISPANIC_COL the rows are keyed
        by std_col.RACE_CATEGORY_ID_COL instead, and the race columns are
        added from the category id before returning.
    df: Source frame; the 'State Name', 'Measure Name' and 'Value' columns
        are read.

    Returns a DataFrame with the state name, COPD and diabetes percentage
    columns, and the breakdown column.

    Raises IndexError if a state has no row whose measure name equals a
    determinant exactly (needed for the 'All' breakdown value).
    """
    is_race = breakdown == std_col.RACE_OR_HISPANIC_COL
    breakdown_col = std_col.RACE_CATEGORY_ID_COL if is_race else breakdown
    columns = [
        std_col.STATE_NAME_COL,
        std_col.COPD_PCT,
        std_col.DIABETES_PCT,
        breakdown_col,
    ]

    measure_names = df['Measure Name']
    output = []
    for state in df['State Name'].drop_duplicates().to_list():
        # The state mask does not depend on the inner loops; compute it once.
        in_state = df['State Name'] == state

        for breakdown_value in BREAKDOWN_MAP[breakdown]:
            output_row = {std_col.STATE_NAME_COL: state}
            if is_race:
                output_row[breakdown_col] = UHC_RACE_GROUPS_TO_STANDARD[
                    breakdown_value]
            else:
                output_row[breakdown_col] = breakdown_value

            for determinant, pct_col in UHC_DETERMINANTS_OF_HEALTH.items():
                if breakdown_value == 'All':
                    # The overall value is the row whose measure name is
                    # exactly the determinant name.
                    output_row[pct_col] = df.loc[
                        in_state & (measure_names == determinant)
                    ]['Value'].values[0]
                    continue

                # Match both labels as literal substrings. The default
                # regex matching would misread labels that contain
                # metacharacters such as '+', '(' or '.'.
                matches = df.loc[
                    in_state
                    & measure_names.str.contains(determinant, regex=False)
                    & measure_names.str.contains(
                        breakdown_value, regex=False)]
                if len(matches) > 0:
                    pct = matches['Value'].values[0]
                    # Falsy values (e.g. 0 or an empty string) are left out
                    # of the row, so they end up missing in the output.
                    if pct:
                        output_row[pct_col] = pct

            output.append(output_row)

    output_df = pd.DataFrame(output, columns=columns)
    if is_race:
        std_col.add_race_columns_from_category_id(output_df)
    return output_df
def standardize_data(df):
    """Return a cleaned copy of `df` with standardized column names.

    String cells have double quotes removed and surrounding whitespace
    stripped; non-string cells are left untouched. Columns are renamed via
    COL_NAME_MAPPING, and when the race category id column is present the
    race metadata columns are added.

    df: Pandas dataframe to standardize.
    """
    def _clean_cell(value):
        # Only strings are cleaned; every other value passes through as-is.
        if isinstance(value, str):
            return value.replace('"', '').strip()
        return value

    cleaned = df.applymap(_clean_cell)
    standardized = cleaned.rename(columns=COL_NAME_MAPPING)

    has_race_ids = std_col.RACE_CATEGORY_ID_COL in standardized.columns
    if has_race_ids:
        std_col.add_race_columns_from_category_id(standardized)

    return standardized
def split_data_frames(self):
    """Split `self.data` into state-level and county-level income frames.

    Every key of `self.data` is unpacked as
    (state_fip, county_fip, race, age, income) and its value is stored in
    the population column. Entries whose county_fip is None go to the state
    frame; all others go to the county frame. Both frames get race columns
    added from the category id, are stored on `self`, and are collected in
    `self.frames`.
    """
    breakdown_cols = [
        RACE_CATEGORY_ID_COL,
        AGE_COL,
        INCOME_COL,
        POPULATION_COL,
    ]
    state_cols = [STATE_FIPS_COL, STATE_NAME_COL] + breakdown_cols
    county_cols = [
        STATE_FIPS_COL,
        STATE_NAME_COL,
        COUNTY_FIPS_COL,
        COUNTY_NAME_COL,
    ] + breakdown_cols

    state_rows = []
    county_rows = []
    for key, population in self.data.items():
        state_fip, county_fip, race, age, income = key
        state_name = self.state_fips[state_fip]
        values = [race, age, income, population]

        if county_fip is not None:
            county_rows.append(
                [
                    state_fip,
                    state_name,
                    state_fip + county_fip,
                    self.county_fips[(state_fip, county_fip)],
                ]
                + values
            )
            continue

        state_rows.append([state_fip, state_name] + values)

    # Build the pandas DataFrames with standardized columns.
    self.income_by_race_age_state_frame = pd.DataFrame(
        state_rows, columns=state_cols)
    self.income_by_race_age_county_frame = pd.DataFrame(
        county_rows, columns=county_cols)

    for frame in (
        self.income_by_race_age_state_frame,
        self.income_by_race_age_county_frame,
    ):
        add_race_columns_from_category_id(frame)

    # Frames keyed by name (the original comment calls these filenames).
    self.frames = {
        "household_income_by_race_age_state":
            self.income_by_race_age_state_frame,
        "household_income_by_race_age_county":
            self.income_by_race_age_county_frame,
    }
def split_data_frames(self):
    """Split `self.data` into six poverty frames (race/age/sex x state/county).

    Every key of `self.data` is unpacked as
    (state_fip, county_fip, age, sex, race). Exactly one of age, sex and
    race must be set; that value decides which dataset the row belongs to.
    Entries whose county_fip is None are state-level rows, all others are
    county-level rows. The resulting frames are stored on `self` and
    collected in `self.frames`.

    Raises AssertionError when a key does not have exactly one of age, sex
    and race set.
    """
    state_rows = {"race": [], "sex": [], "age": []}
    county_rows = {"race": [], "sex": [], "age": []}

    for key, population in self.data.items():
        state_fip, county_fip, age, sex, race = key
        above = population[PovertyPopulation.ABOVE]
        below = population[PovertyPopulation.BELOW]

        # Each row targets exactly one dataset. If this fails, check
        # custom_accumulations.
        present = [
            (dimension, value)
            for dimension, value in (("race", race), ("sex", sex), ("age", age))
            if value is not None
        ]
        if len(present) != 1:
            raise AssertionError(f"Invalid Tuple: {key}")
        dimension, value = present[0]

        if county_fip is None:
            state_rows[dimension].append(
                [state_fip, self.state_fips[state_fip], below, above, value]
            )
        else:
            county_rows[dimension].append(
                [
                    state_fip,
                    self.state_fips[state_fip],
                    state_fip + county_fip,
                    self.county_fips[(state_fip, county_fip)],
                    below,
                    above,
                    value,
                ]
            )

    poverty_cols = [BELOW_POVERTY_COL, ABOVE_POVERTY_COL]
    state_cols = [STATE_FIPS_COL, STATE_NAME_COL] + poverty_cols
    county_cols = [
        STATE_FIPS_COL,
        STATE_NAME_COL,
        COUNTY_FIPS_COL,
        COUNTY_NAME_COL,
    ] + poverty_cols

    # Race frames: built first, then given their race metadata columns.
    self.poverty_by_race_state = pd.DataFrame(
        state_rows["race"], columns=state_cols + [RACE_CATEGORY_ID_COL])
    self.poverty_by_race_county = pd.DataFrame(
        county_rows["race"], columns=county_cols + [RACE_CATEGORY_ID_COL])
    add_race_columns_from_category_id(self.poverty_by_race_state)
    add_race_columns_from_category_id(self.poverty_by_race_county)

    # Age frames.
    self.poverty_by_age_state = pd.DataFrame(
        state_rows["age"], columns=state_cols + [AGE_COL])
    self.poverty_by_age_county = pd.DataFrame(
        county_rows["age"], columns=county_cols + [AGE_COL])

    # Sex frames.
    self.poverty_by_sex_state = pd.DataFrame(
        state_rows["sex"], columns=state_cols + [SEX_COL])
    self.poverty_by_sex_county = pd.DataFrame(
        county_rows["sex"], columns=county_cols + [SEX_COL])

    # Frames keyed by name (the original comment calls these filenames).
    self.frames = {
        "poverty_by_race_state": self.poverty_by_race_state,
        "poverty_by_race_county": self.poverty_by_race_county,
        "poverty_by_age_state": self.poverty_by_age_state,
        "poverty_by_age_county": self.poverty_by_age_county,
        "poverty_by_sex_state": self.poverty_by_sex_state,
        "poverty_by_sex_county": self.poverty_by_sex_county,
    }
def split_data_frames(self):
    """Split `self.data` into state and county health-insurance frames.

    Every key of `self.data` is unpacked as
    (state_fip, county_fip, age, race); its value supplies the with,
    without and total health insurance counts. Entries whose county_fip is
    None go to the state frame, all others to the county frame. Both frames
    get race columns added from the category id, are stored on `self`, and
    are collected in `self.frames`.
    """
    shared_cols = [
        AGE_COL,
        RACE_CATEGORY_ID_COL,
        WITH_HEALTH_INSURANCE_COL,
        WITHOUT_HEALTH_INSURANCE_COL,
        TOTAL_HEALTH_INSURANCE_COL,
    ]
    state_cols = [STATE_FIPS_COL, STATE_NAME_COL] + shared_cols
    county_cols = [
        STATE_FIPS_COL,
        STATE_NAME_COL,
        COUNTY_FIPS_COL,
        COUNTY_NAME_COL,
    ] + shared_cols

    state_rows = []
    county_rows = []
    for key, population in self.data.items():
        state_fip, county_fip, age, race = key
        with_insurance = population[HealthInsurancePopulation.WITH]
        without_insurance = population[HealthInsurancePopulation.WITHOUT]
        total = population[HealthInsurancePopulation.TOTAL]
        shared_values = [age, race, with_insurance, without_insurance, total]

        if county_fip is None:
            location = [state_fip, self.state_fips[state_fip]]
            state_rows.append(location + shared_values)
        else:
            location = [
                state_fip,
                self.state_fips[state_fip],
                state_fip + county_fip,
                self.county_fips[(state_fip, county_fip)],
            ]
            county_rows.append(location + shared_values)

    self.state_race_frame = pd.DataFrame(state_rows, columns=state_cols)
    self.county_race_frame = pd.DataFrame(county_rows, columns=county_cols)

    for frame in (self.state_race_frame, self.county_race_frame):
        add_race_columns_from_category_id(frame)

    # Frames keyed by name (the original comment calls these filenames).
    self.frames = {
        "health_insurance_by_race_age_state": self.state_race_frame,
        "health_insurance_by_race_age_county": self.county_race_frame,
    }