def get_all_races_frame(self, race_and_hispanic_frame):
        """Combine both race standardizations into one frame with derived rows.

           race_and_hispanic_frame: Frame handed unchanged to both
                                    `standardize_race_include_hispanic` and
                                    `standardize_race_exclude_hispanic`.

           Returns the combined frame after race columns have been added from
           the category id and it has been passed through `sort_race_frame`."""
        with_hispanic = self.standardize_race_include_hispanic(
            race_and_hispanic_frame)
        without_hispanic = self.standardize_race_exclude_hispanic(
            race_and_hispanic_frame).copy()

        # Each standardization carries its own "Hispanic or Latino" group;
        # filter it out of the exclude-Hispanic variant so it is not
        # duplicated after concatenation.
        is_not_hispanic = (
            without_hispanic[RACE_CATEGORY_ID_COL] != Race.HISP.value)
        combined = pandas.concat(
            [with_hispanic, without_hispanic[is_not_hispanic]])

        # The race column has to go before derived rows are added so that it
        # does not interfere with the grouping.
        combined = combined.drop(RACE_COL, axis=1)

        # (derived category id, category ids summed to produce it), applied
        # in this order.
        derived_rows = (
            (Race.TOTAL.value,
             list(RACE_STRING_TO_CATEGORY_ID_INCLUDE_HISP.values())),
            (Race.MULTI_OR_OTHER_STANDARD_NH.value,
             [Race.MULTI_NH.value, Race.OTHER_STANDARD_NH.value]),
            (Race.MULTI_OR_OTHER_STANDARD.value,
             [Race.MULTI.value, Race.OTHER_STANDARD.value]),
        )
        for derived_id, source_ids in derived_rows:
            combined = add_sum_of_rows(
                combined, RACE_CATEGORY_ID_COL, POPULATION_COL, derived_id,
                source_ids)

        add_race_columns_from_category_id(combined)
        return self.sort_race_frame(combined)
    def get_sex_by_age_and_race(self, var_map, sex_by_age_frames):
        """Build a DataFrame of population broken down by sex, age and race.

           var_map: ACS metadata variable map, as returned by
                    `parse_acs_metadata`
           sex_by_age_frames: Map of concept to non-standardized DataFrame for
                              that concept.

           One standardized frame is produced per entry of
           `SEX_BY_AGE_CONCEPTS_TO_RACE`, tagged with that entry's race
           category id, and the frames are concatenated. Total rows are then
           added for age and for sex before sorting."""
        per_race_frames = []
        for concept, race_id in SEX_BY_AGE_CONCEPTS_TO_RACE.items():
            raw_frame = sex_by_age_frames[concept]
            concept_vars = get_vars_for_group(concept, var_map, 2)
            standardized = standardize_frame(
                raw_frame, concept_vars, [SEX_COL, AGE_COL],
                self.county_level, POPULATION_COL)

            # TODO reorder columns so population is last
            standardized[RACE_CATEGORY_ID_COL] = race_id
            per_race_frames.append(standardized)

        combined = pandas.concat(per_race_frames)
        combined[AGE_COL] = combined[AGE_COL].apply(rename_age_bracket)

        # Totals are added across age first, then across sex.
        for totalled_col in (AGE_COL, SEX_COL):
            combined = add_sum_of_rows(
                combined, totalled_col, POPULATION_COL, TOTAL_VALUE)

        add_race_columns_from_category_id(combined)
        return self.sort_sex_age_race_frame(combined)
Exemplo n.º 3
0
    def generate_breakdown(self, breakdown, df):
        """Build a per-state table of determinant values for one breakdown.

        One output row is produced for every distinct 'State Name' in `df`
        combined with every value listed for `breakdown` in `BREAKDOWN_MAP`.
        For each key of `UHC_DETERMINANTS_OF_HEALTH`, the first matching
        'Value' from `df` is stored under the mapped output column name.

        breakdown: Breakdown column name; a key of `BREAKDOWN_MAP`. When it
                   equals `std_col.RACE_OR_HISPANIC_COL`, rows are keyed by
                   `std_col.RACE_CATEGORY_ID_COL` (translated through
                   `UHC_RACE_GROUPS_TO_STANDARD`) instead of by `breakdown`.
        df: DataFrame with 'State Name', 'Measure Name' and 'Value' columns.

        Returns a DataFrame limited to the state name, `std_col.COPD_PCT`,
        `std_col.DIABETES_PCT` and the breakdown column (plus whatever
        `std_col.add_race_columns_from_category_id` adds for race).

        Raises IndexError when a state has no row whose 'Measure Name' equals
        a determinant exactly while processing the 'All' breakdown value.
        """
        by_race = breakdown == std_col.RACE_OR_HISPANIC_COL
        breakdown_col = std_col.RACE_CATEGORY_ID_COL if by_race else breakdown
        columns = [
            std_col.STATE_NAME_COL, std_col.COPD_PCT, std_col.DIABETES_PCT,
            breakdown_col
        ]

        output = []
        for state in df['State Name'].drop_duplicates().to_list():
            # Filter to this state's rows once instead of re-scanning the
            # whole frame for every (breakdown value, determinant) pair.
            state_rows = df.loc[df['State Name'] == state]
            measure_names = state_rows['Measure Name']

            for breakdown_value in BREAKDOWN_MAP[breakdown]:
                output_row = {std_col.STATE_NAME_COL: state}
                if by_race:
                    output_row[breakdown_col] = UHC_RACE_GROUPS_TO_STANDARD[
                        breakdown_value]
                else:
                    output_row[breakdown_col] = breakdown_value

                for determinant, output_col in (
                        UHC_DETERMINANTS_OF_HEALTH.items()):
                    if breakdown_value == 'All':
                        # The overall figure is the row whose measure name is
                        # exactly the determinant; it is required to exist.
                        matches = state_rows.loc[measure_names == determinant]
                        output_row[output_col] = matches['Value'].values[0]
                        continue

                    # NOTE(review): str.contains treats its argument as a
                    # regular expression by default - confirm no determinant
                    # or breakdown value relies on literal regex characters.
                    matches = state_rows.loc[
                        measure_names.str.contains(determinant)
                        & measure_names.str.contains(breakdown_value)]
                    if len(matches) > 0:
                        pct = matches['Value'].values[0]
                        # NOTE(review): falsy values (e.g. 0) are left out of
                        # the row, same as before - confirm this is intended.
                        if pct:
                            output_row[output_col] = pct

                output.append(output_row)

        output_df = pd.DataFrame(output, columns=columns)

        if by_race:
            std_col.add_race_columns_from_category_id(output_df)

        return output_df
Exemplo n.º 4
0
def standardize_data(df):
    """Standardizes the data by cleaning string values and standardizing column
    names.

    String cells have every double-quote character removed and surrounding
    whitespace stripped; non-string cells are left as they are. Columns are
    then renamed through `COL_NAME_MAPPING`, and when the race category id
    column is present after renaming,
    `std_col.add_race_columns_from_category_id` is applied to the frame.

    df: Pandas dataframe to standardize.

    Returns the standardized dataframe.
    """
    def _clean_cell(value):
        # Only strings are cleaned; numbers, NaN, etc. pass through untouched.
        if isinstance(value, str):
            return value.replace('"', '').strip()
        return value

    # Clean string values in the dataframe. DataFrame.applymap is deprecated
    # since pandas 2.1 in favour of DataFrame.map; check the class (not the
    # instance) so the fallback still works on older pandas versions.
    if hasattr(type(df), 'map'):
        df = df.map(_clean_cell)
    else:
        df = df.applymap(_clean_cell)

    # Standardize column names.
    df = df.rename(columns=COL_NAME_MAPPING)

    # Add race metadata columns.
    if std_col.RACE_CATEGORY_ID_COL in df.columns:
        std_col.add_race_columns_from_category_id(df)

    return df
    def split_data_frames(self):
        """Split `self.data` into state-level and county-level income frames.

        Every key of `self.data` is unpacked as
        (state_fip, county_fip, race, age, income) and maps to a population
        value. Keys whose county_fip is None become state rows; all other
        keys become county rows. The two resulting frames are stored on
        `self.income_by_race_age_state_frame` and
        `self.income_by_race_age_county_frame`, passed to
        `add_race_columns_from_category_id`, and registered in `self.frames`
        keyed by output name.
        """
        state_rows = []
        county_rows = []

        for key, population in self.data.items():
            state_fip, county_fip, race, age, income = key
            state_name = self.state_fips[state_fip]
            breakdown_values = [race, age, income, population]

            if county_fip is None:
                state_rows.append([state_fip, state_name, *breakdown_values])
                continue

            county_rows.append(
                [
                    state_fip,
                    state_name,
                    state_fip + county_fip,
                    self.county_fips[(state_fip, county_fip)],
                    *breakdown_values,
                ]
            )

        # Columns shared by both frames; they follow the location columns.
        breakdown_cols = [
            RACE_CATEGORY_ID_COL,
            AGE_COL,
            INCOME_COL,
            POPULATION_COL,
        ]
        state_frame = pd.DataFrame(
            state_rows,
            columns=[STATE_FIPS_COL, STATE_NAME_COL] + breakdown_cols,
        )
        county_frame = pd.DataFrame(
            county_rows,
            columns=[
                STATE_FIPS_COL,
                STATE_NAME_COL,
                COUNTY_FIPS_COL,
                COUNTY_NAME_COL,
            ]
            + breakdown_cols,
        )
        self.income_by_race_age_state_frame = state_frame
        self.income_by_race_age_county_frame = county_frame

        add_race_columns_from_category_id(state_frame)
        add_race_columns_from_category_id(county_frame)

        # Frames keyed by the name they are published under.
        self.frames = {
            "household_income_by_race_age_state": state_frame,
            "household_income_by_race_age_county": county_frame,
        }
Exemplo n.º 6
0
    def split_data_frames(self):
        """Split `self.data` into poverty frames by race, sex and age.

        Every key of `self.data` is unpacked as
        (state_fip, county_fip, age, sex, race) and maps to a population
        object indexed by `PovertyPopulation.ABOVE` / `.BELOW`. Exactly one
        of age, sex and race must be set on a key; that value decides which
        frame the row goes to. Keys whose county_fip is None produce state
        rows, all others county rows.

        Sets the six `self.poverty_by_<breakdown>_<level>` frames and
        registers them in `self.frames`. The two race frames are also passed
        to `add_race_columns_from_category_id`.

        Raises AssertionError when a key does not have exactly one of age,
        sex and race set.
        """
        state_rows = {"race": [], "sex": [], "age": []}
        county_rows = {"race": [], "sex": [], "age": []}

        for key, population in self.data.items():
            state_fip, county_fip, age, sex, race = key

            above = population[PovertyPopulation.ABOVE]
            below = population[PovertyPopulation.BELOW]

            # Each row targets a single dataset, so exactly one breakdown may
            # be present.  If this fails, check custom_accumulations
            present = [
                (name, value)
                for name, value in (("race", race), ("sex", sex), ("age", age))
                if value is not None
            ]
            if len(present) != 1:
                raise AssertionError(f"Invalid Tuple: {key}")
            ((breakdown_name, breakdown_value),) = present

            if county_fip is None:
                location_and_counts = [
                    state_fip,
                    self.state_fips[state_fip],
                    below,
                    above,
                ]
                bucket = state_rows[breakdown_name]
            else:
                location_and_counts = [
                    state_fip,
                    self.state_fips[state_fip],
                    state_fip + county_fip,
                    self.county_fips[(state_fip, county_fip)],
                    below,
                    above,
                ]
                bucket = county_rows[breakdown_name]
            bucket.append(location_and_counts + [breakdown_value])

        state_cols = [
            STATE_FIPS_COL,
            STATE_NAME_COL,
            BELOW_POVERTY_COL,
            ABOVE_POVERTY_COL,
        ]
        county_cols = [
            STATE_FIPS_COL,
            STATE_NAME_COL,
            COUNTY_FIPS_COL,
            COUNTY_NAME_COL,
            BELOW_POVERTY_COL,
            ABOVE_POVERTY_COL,
        ]

        def build_frame(rows, base_cols, breakdown_col):
            # The breakdown value is always the last element of each row.
            return pd.DataFrame(rows, columns=base_cols + [breakdown_col])

        # Race frames additionally get the race metadata columns.
        self.poverty_by_race_state = build_frame(
            state_rows["race"], state_cols, RACE_CATEGORY_ID_COL
        )
        self.poverty_by_race_county = build_frame(
            county_rows["race"], county_cols, RACE_CATEGORY_ID_COL
        )
        add_race_columns_from_category_id(self.poverty_by_race_state)
        add_race_columns_from_category_id(self.poverty_by_race_county)

        self.poverty_by_age_state = build_frame(
            state_rows["age"], state_cols, AGE_COL
        )
        self.poverty_by_age_county = build_frame(
            county_rows["age"], county_cols, AGE_COL
        )

        self.poverty_by_sex_state = build_frame(
            state_rows["sex"], state_cols, SEX_COL
        )
        self.poverty_by_sex_county = build_frame(
            county_rows["sex"], county_cols, SEX_COL
        )

        # Frames keyed by the name they are published under.
        self.frames = {
            "poverty_by_race_state": self.poverty_by_race_state,
            "poverty_by_race_county": self.poverty_by_race_county,
            "poverty_by_age_state": self.poverty_by_age_state,
            "poverty_by_age_county": self.poverty_by_age_county,
            "poverty_by_sex_state": self.poverty_by_sex_state,
            "poverty_by_sex_county": self.poverty_by_sex_county,
        }
Exemplo n.º 7
0
    def split_data_frames(self):
        """Split `self.data` into state and county health-insurance frames.

        Every key of `self.data` is unpacked as
        (state_fip, county_fip, age, race) and maps to a population object
        indexed by `HealthInsurancePopulation.WITH`, `.WITHOUT` and `.TOTAL`.
        Keys whose county_fip is None become state rows; all others become
        county rows. The frames are stored on `self.state_race_frame` and
        `self.county_race_frame`, passed to
        `add_race_columns_from_category_id`, and registered in `self.frames`
        keyed by output name.
        """
        state_rows = []
        county_rows = []

        for key in self.data:
            state_fip, county_fip, age, race = key

            population = self.data[key]
            breakdown_values = [
                age,
                race,
                population[HealthInsurancePopulation.WITH],
                population[HealthInsurancePopulation.WITHOUT],
                population[HealthInsurancePopulation.TOTAL],
            ]

            location = [state_fip, self.state_fips[state_fip]]
            if county_fip is None:
                state_rows.append(location + breakdown_values)
            else:
                location.append(state_fip + county_fip)
                location.append(self.county_fips[(state_fip, county_fip)])
                county_rows.append(location + breakdown_values)

        # Columns shared by both frames; they follow the location columns.
        breakdown_cols = [
            AGE_COL,
            RACE_CATEGORY_ID_COL,
            WITH_HEALTH_INSURANCE_COL,
            WITHOUT_HEALTH_INSURANCE_COL,
            TOTAL_HEALTH_INSURANCE_COL,
        ]
        self.state_race_frame = pd.DataFrame(
            state_rows,
            columns=[STATE_FIPS_COL, STATE_NAME_COL] + breakdown_cols,
        )
        self.county_race_frame = pd.DataFrame(
            county_rows,
            columns=[
                STATE_FIPS_COL,
                STATE_NAME_COL,
                COUNTY_FIPS_COL,
                COUNTY_NAME_COL,
            ]
            + breakdown_cols,
        )

        for frame in (self.state_race_frame, self.county_race_frame):
            add_race_columns_from_category_id(frame)

        # Frames keyed by the name they are published under.
        self.frames = {
            "health_insurance_by_race_age_state": self.state_race_frame,
            "health_insurance_by_race_age_county": self.county_race_frame,
        }