示例#1
0
def rate_count_switcher(df,
                        gbd_round_id,
                        decomp_step,
                        rate_cols=('mean', 'lower', 'upper')):
    """Convert rate-space estimate columns into count space.

    Multiplies each column in ``rate_cols`` by GBD population, producing a
    new ``count_<col>`` column per input column.  Attaches age_group_id,
    year_start/year_end and sex_id identifiers when they are not already
    present so the population merge can be performed.

    Parameters
        df: Pandas DataFrame
            data in rate space with demographic identifier columns.
        gbd_round_id: int
            GBD round forwarded to db_queries.get_population.
        decomp_step: str
            decomposition step forwarded to db_queries.get_population.
        rate_cols: iterable of str
            columns to convert from rate to count space.  The default is a
            tuple (not a list) to avoid the shared mutable default-argument
            pitfall; passing a list still works.

    Returns
        Pandas DataFrame with a 'population' column and one new
        'count_<col>' column per entry in rate_cols.
    """
    pre_cols = df.columns

    if 'age_group_id' not in pre_cols:
        # attach age_group_id via the standard hospital age groups
        good_ages = hosp_prep.get_hospital_age_groups()
        df = df.merge(good_ages[['age_start', 'age_group_id']],
                      how='left',
                      on='age_start',
                      validate='m:1')
    if 'year_id' in pre_cols:
        df['year_start'], df['year_end'] = df['year_id'], df['year_id']
    if 'sex_id' not in pre_cols:
        # only the two string sex values are supported; map them to GBD ids
        assert set(df['sex'].unique()) == set(['Male', 'Female'])
        df['sex_id'] = 2
        df.loc[df['sex'] == 'Male', 'sex_id'] = 1

    # NOTE: population years are hard-coded to 1990-2017 inclusive
    years = list(range(1990, 2018))
    ages = df.age_group_id.unique().tolist()
    locs = df.location_id.unique().tolist()

    pop = db_queries.get_population(gbd_round_id=gbd_round_id,
                                    decomp_step=decomp_step,
                                    year_id=years,
                                    age_group_id=ages,
                                    sex_id=[1, 2],
                                    location_id=locs)
    pop['year_start'], pop['year_end'] = pop['year_id'], pop['year_id']
    pop.drop(['year_id', 'run_id'], axis=1, inplace=True)

    # build a 5-year-binned population in case df uses aggregated years
    agg_pop = pop.copy()
    agg_pop = hosp_prep.year_binner(agg_pop)
    agg_pop = agg_pop.groupby(
        ['age_group_id', 'year_start', 'year_end', 'location_id',
         'sex_id']).agg({'population': 'mean'}).reset_index()

    if (df['year_start'] + 4 == df['year_end']).all():
        # data is in 5-year bins; merge on mean population within each bin
        df = df.merge(agg_pop,
                      how='left',
                      on=[
                          'age_group_id', 'sex_id', 'location_id',
                          'year_start', 'year_end'
                      ],
                      validate='m:1')
    else:
        # single-year data; merge on annual population
        df = df.merge(pop,
                      how='left',
                      on=[
                          'age_group_id', 'sex_id', 'location_id',
                          'year_start', 'year_end'
                      ],
                      validate='m:1')

    assert df['population'].isnull().sum() == 0

    for col in rate_cols:
        df["count_{}".format(col)] = df[col] * df['population']

    return df
示例#2
0
def get_sample_size(df, fix_group237=False):
    """
    Attach a population-based sample size to hospital data.

    Intended for sources with fully covered populations, where sample size
    is simply the GBD population estimate for each demographic group.
    Attaches age_group_id first if only age_start/age_end are present.

    Parameters
        df: Pandas DataFrame
            contains the data that you want to add sample_size to.  Will add
            pop to every row.
        fix_group237: bool
            if True, also pull component populations and aggregate them into
            age_group_id 237 before merging.
    """
    # Steps: attach age_group_id, pull population for it, merge it on.

    if 'age_group_id' not in df.columns:
        # map from age_start/age_end onto age_group_id
        age_map = hosp_prep.get_hospital_age_groups()

        rows_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == rows_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), \
            "age_group_id is missing for some rows"

    # population estimates used as sample size
    pop = get_population(QUERY)

    if fix_group237:
        # collapse two component age groups into the aggregate group 237
        fix_pop = get_population(QUERY)
        rows_before = fix_pop.shape[0]
        fix_pop['age_group_id'] = 237
        group_cols = fix_pop.columns.drop('population').tolist()
        fix_pop = fix_pop.groupby(group_cols).agg(
            {'population': 'sum'}).reset_index()
        # two source groups fold into one, halving the row count
        assert rows_before / 2 == fix_pop.shape[0]

        pop = pd.concat([pop, fix_pop], ignore_index=True)

    # align population columns with the hospital data's year columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("run_id", axis=1, inplace=True)

    demography = ['location_id', 'year_start', 'year_end',
                  'age_group_id', 'sex_id']

    # attach pop info to hosp; row count must be unchanged
    rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert rows_before == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(),\
        "population is missing for some rows. look at this df! \n {}".\
            format(df.loc[df.population.isnull(), demography].drop_duplicates())

    return df
示例#3
0
def get_sample_size(df):
    """
    Attach a population-based sample size to hospital data.

    Intended for sources whose populations are fully covered, so sample
    size is just the GBD population estimate for each demographic group.
    """
    # Steps: attach age_group_id if needed, pull population, merge it on.

    if 'age_group_id' not in df.columns:
        # map from age_start/age_end onto age_group_id
        age_map = hosp_prep.get_hospital_age_groups()

        rows_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == rows_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), \
            "age_group_id is missing for some rows"

    # pull population for exactly the demographics present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()))

    # align population columns with the hospital data's year columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("process_version_map_id", axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'
    ]

    # attach pop info to hosp; row count must be unchanged
    rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert rows_before == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(), \
        "population is missing for some rows"

    return df
示例#4
0
def all_group_id_start_end_switcher(df, remove_cols=True, ignore_nulls=False):
    """
    Switch a dataframe between age_start/age_end and age_group_id labelling.

    Takes a dataframe with age start/end OR age group ID and switches from
    one representation to the other.

    Args:
        df: (Pandas DataFrame) data to switch age labelling
        remove_cols: (bool)  If True, will drop the column that was switched
            from
        ignore_nulls: (bool)  If True, assertions about missing ages
            will be ignored.  Not a good idea to use in production but is
            useful for when you just need to quickly see what ages you have.
    """
    has_start_end = {'age_start', 'age_end'}.issubset(df.columns)
    has_group_id = 'age_group_id' in df.columns

    # decide which direction to switch based on the columns present
    if has_start_end and has_group_id:
        assert False,\
        "All age columns are present, unclear which output is desired. "\
        "Simply drop the columns you don't want"
    elif has_start_end:
        merge_on = ['age_start', 'age_end']
        switch_to = ['age_group_id']
    elif has_group_id:
        merge_on = ['age_group_id']
        switch_to = ['age_start', 'age_end']
    else:
        assert False, "Age columns not present or named incorrectly"

    ages = hosp_prep.get_hospital_age_groups()

    # detect whether df uses the standard hospital age set; if not, fall
    # back to the shared database age table below
    age_set = "hospital"
    ages_unique = ages[merge_on].drop_duplicates()
    df_unique = df[merge_on].drop_duplicates()
    for col in merge_on:
        if ages_unique.shape[0] != df_unique.shape[0]:
            age_set = 'non_hospital'
        elif (ages_unique[col].sort_values().reset_index(drop=True) !=
              df_unique[col].sort_values().reset_index(drop=True)).all():
            # NOTE(review): .all() fires only when *every* value differs;
            # .any() may have been intended -- behavior kept as-is
            age_set = 'non_hospital'

    if age_set == 'non_hospital':
        # pull the full shared age table and align its column names
        ages = query("""SQL""", conn_def="shared")
        ages.rename(columns={
            "age_group_years_start": "age_start",
            "age_group_years_end": "age_end"
        },
                    inplace=True)

        if 'age_end' in merge_on:
            # the terminal age group ends at 125 in the shared table;
            # NOTE(review): this mutates the caller's dataframe in place
            df.loc[df['age_end'] == 100, 'age_end'] = 125

        # drop age groups whose start/end duplicate other groups
        duped_ages = [294, 308, 27, 161, 38, 301]
        ages = ages[~ages.age_group_id.isin(duped_ages)]

    # remaining duplicated start/end pairs, shown in the assert message
    dupes = ages[ages.duplicated(['age_start', 'age_end'],
                                 keep=False)].sort_values('age_start')

    rows_before = df.shape[0]
    df = df.merge(ages, how='left', on=merge_on)
    assert rows_before == df.shape[0],\
        "Rows were duplicated, probably from these ages \n{}".format(dupes)

    if not ignore_nulls:
        # verify the switch produced a value for every row
        for s in switch_to:
            assert df[s].isnull().sum() == 0, (
                "{} contains missing values from "
                "the merge. The values with Nulls are {}".format(
                    s, df.loc[df[s].isnull(),
                              merge_on].drop_duplicates().sort_values(
                                  by=merge_on)))

    if remove_cols:
        # drop the representation we switched away from
        df.drop(merge_on, axis=1, inplace=True)
    return df
示例#5
0
def get_sample_size(df, gbd_round_id, decomp_step, fix_group237=False):
    """
    Attach a population-based sample size to hospital data.

    For sources with fully covered populations sample size is just the GBD
    population estimate.  Attaches age_group_id first if it is missing.

    Parameters
        df: Pandas DataFrame
            contains the data that you want to add sample_size to.  Will add
            pop to every row.
        gbd_round_id: int
            GBD round forwarded to get_population.
        decomp_step: str
            decomposition step forwarded to get_population.
        fix_group237: bool
            if True, aggregate age groups 235 and 32 into group 237 and
            append that population before merging.
    """
    # normalize year columns: a single year_id becomes a start/end pair
    if 'year_id' in df.columns:
        df.rename(columns={'year_id': 'year_start'}, inplace=True)
        df['year_end'] = df['year_start']

    if 'age_group_id' not in df.columns:
        # map from age_start/age_end onto age_group_id
        age_map = hosp_prep.get_hospital_age_groups()

        rows_before = df.shape[0]
        df = df.merge(age_map, how='left', on=['age_start', 'age_end'])
        assert df.shape[0] == rows_before, "number of rows changed during merge"
        assert df.age_group_id.notnull().all(), \
            "age_group_id is missing for some rows"

    # pull population for exactly the demographics present in the data
    pop = get_population(age_group_id=list(df.age_group_id.unique()),
                         location_id=list(df.location_id.unique()),
                         sex_id=[1, 2],
                         year_id=list(df.year_start.unique()),
                         gbd_round_id=gbd_round_id,
                         decomp_step=decomp_step)

    if fix_group237:
        # collapse age groups 235 and 32 into the aggregate group 237
        fix_pop = get_population(age_group_id=[235, 32],
                                 location_id=list(df.location_id.unique()),
                                 sex_id=[1, 2],
                                 year_id=list(df.year_start.unique()),
                                 gbd_round_id=gbd_round_id,
                                 decomp_step=decomp_step)
        rows_before = fix_pop.shape[0]
        fix_pop['age_group_id'] = 237
        group_cols = fix_pop.columns.drop('population').tolist()
        fix_pop = fix_pop.groupby(group_cols).agg(
            {'population': 'sum'}).reset_index()
        # two source groups fold into one, halving the row count
        assert rows_before / 2 == fix_pop.shape[0]

        pop = pd.concat([pop, fix_pop], sort=False, ignore_index=True)

    # align population columns with the hospital data's year columns
    pop.rename(columns={'year_id': 'year_start'}, inplace=True)
    pop['year_end'] = pop['year_start']
    pop.drop("run_id", axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'
    ]

    # attach pop info to hosp; row count must be unchanged
    rows_before = df.shape[0]
    df = df.merge(pop, how='left', on=demography)
    assert rows_before == df.shape[0], "number of rows don't match after merge"
    assert df.population.notnull().all(),\
        "population is missing for some rows. look at this df! \n {}".\
            format(df.loc[df.population.isnull(), demography].drop_duplicates())

    return df
示例#6
0
# ** parent inj **
# aggregate child injury causes up onto their parent injury causes
df = get_parent_injuries(df)

# verify child injury 'val' totals still sum to the parent totals
hosp_prep.check_parent_injuries(df, col_to_sum='val')

# ** correction factors **
# apply the outpatient correction factor (adds 'val_corrected')
df = apply_outpatient_correction(df)

# ** age sex restrictions **
# zero out demographic groups that are restricted for each cause
df = outpatient_restrictions(df)

# # test making square
# drop explicit zero rows before re-creating them when squaring below
df = df[df.val != 0]

ages = hosp_prep.get_hospital_age_groups()

df = df.merge(ages, how='left', on=['age_start', 'age_end'])

# make the data square: one row per demographic/bundle combination,
# filling in zeroes for val and val_corrected where data is absent
df_square = hosp_prep.make_zeroes(df, level_of_analysis='bundle_id',
                             cols_to_square=['val', 'val_corrected'], icd_len=5)


# NOTE(review): this difference is computed but never stored or asserted --
# presumably a manual check that squaring preserved the total of 'val'
np.abs(df.val.sum() - df_square.val.sum())

# sorted non-zero values before/after squaring, for manual comparison
pre_check_val = df['val'].sort_values().reset_index(drop=True)
post_check_val = df_square.loc[df_square['val'] > 0, 'val'].sort_values().reset_index(drop=True)

# merge the original data back onto the squared frame to inspect differences
merged = df_square.merge(df, how='left', on=['age_start', 'age_end', 'sex_id', 'location_id', 'year_start', 'source', 'nid',
                                            'age_group_id', 'bundle_id', 'year_end', 'facility_id', 'representative_id',
                                            ])
def prepare_envelope(df,
                     fix_maternal=False,
                     return_only_inj=False,
                     apply_age_sex_restrictions=True,
                     want_to_drop_data=True):
    """
    function that accepts as a parameter hospital dataframe and adds the
    the envelope to it, and adds meids and bundle_ids to it, duplicating
    baby sequela in the process (as expected)

    Parameters:
    df: DataFrame
        contains hospital inpatient primary diagnosis that has been mapped to
        Baby Sequelae
    fix_maternal: Boolean
        Should always be on, it's just an option for testing purposes to save
        time.
        switch that runs fix_maternal_denominators.  This function transforms
        hospital data from 'population' to 'live births' using Age Specific
        Fertility Rates.
        NOTE(review): this parameter is not referenced anywhere in the
        visible body -- confirm whether it is still used.
    return_only_inj: Boolean
        switch that will make it so this function only returns injuries data.
        should always be off, it's just there for testing.
    apply_age_sex_restrictions: Boolean
        switch that if True, will apply age and sex restrictions as determined
        by cause_set_id=9.  Defaults to True.  Useful if you want to look at
        all the data that was present in the source.
    want_to_drop_data: Boolean
        if True, run hosp_prep.drop_data on the incoming data first.
    """

    ###############################################
    # DROP DATA
    ###############################################

    if want_to_drop_data:
        df = hosp_prep.drop_data(df, verbose=False)

    ###############################################
    # MAP AGE_GROUP_ID ONTO DATA
    ###############################################
    # These are our weird hospital age_group_id: we use 235 for 95-99, and we
    # don't use age_group_ids 2,3,4, we use 28 in their place
    ages = hosp_prep.get_hospital_age_groups(
    )  # df that has age_group_id, age_start,
    # and age_end
    df = df.merge(ages, how='left', on=['age_start', 'age_end'])

    # comment out to test the new sources in the process
    # warn (rather than fail) so new/unsplit sources can flow through
    if not df.age_group_id.notnull().all():
        warnings.warn("""Shouldn't be any null age_group_id, there could be
        unsplit ages in the data, or maybe an age_start/age_end changed""")

    # maps = pd.read_csv("FILEPATH")
    # maps = maps[['nonfatal_cause_name', 'cause_id', 'level']].copy()
    # maps = maps[maps.cause_id.notnull()]
    # maps = maps[maps.level == 1]
    # maps.drop("level", axis=1, inplace=True)# no need to merge level onto data
    # maps.drop_duplicates(inplace=True)
    #
    # pre = df.shape[0]
    # df = df.merge(maps, how='left', on='nonfatal_cause_name')
    # assert df.shape[0] == pre, "df shape changed during merge"

    #########################################################
    # Get denominator for sources where we don't want to use envelope
    #########################################################

    # Make list of sources that we don't want to use envelope
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    if pd.Series(full_coverage_sources).isin(df.source.unique()).any():
        # make dataframe with sources
        full_coverage_df = df[df.source.isin(full_coverage_sources)].copy()

        # get denominator (aka population)
        full_coverage_df = gbd_hosp_prep.get_sample_size(full_coverage_df)

        full_coverage_df.rename(columns={'population': 'sample_size'},
                                inplace=True)
        # NOTE now there's going be a column "sample_size" that
        # is null for every source except the fully covered ones

        # make product: admissions per person (rate via population)
        full_coverage_df[
            'product'] = full_coverage_df.val / full_coverage_df.sample_size
    else:
        # make empty dataframe so that it exists and wont break anything
        full_coverage_df = pd.DataFrame(columns=list(df.columns) + ['product'])

    # drop these sources from dataframe for now
    df = df[~df.source.isin(full_coverage_sources)].copy()

    #########################################
    # CREATE CAUSE FRACTIONS
    #########################################
    df = hosp_prep.create_cause_fraction(df)

    ###############################################
    # APPLY CAUSE RESTRICTIONS
    ###############################################

    # NOTE it is okay to apply corrections after cause fractions, because restrictions
    # were also applied before the cause fractions were made ( right before splitting)
    if apply_age_sex_restrictions:
        df = hosp_prep.apply_restrictions(df, 'cause_fraction')
        full_coverage_df = hosp_prep.apply_restrictions(
            full_coverage_df, 'product')
        # drop the restricted values
        df = df[df['cause_fraction'].notnull()]
        full_coverage_df = full_coverage_df[
            full_coverage_df['product'].notnull()]

    #########################################
    # APPLY ENVELOPE
    #########################################
    # read envelope
    env_df = pd.read_csv("FILEPATH")
    locs = df.location_id.unique()  # all locations in our data
    # find locations that are missing age end 99
    missing_99 = list(set(locs) -\
        set(env_df.loc[(env_df.location_id.isin(locs))&(env_df.age_end == 99),
                       'location_id'].unique()))
    # change their age_ends to be 99
    env_df.loc[(env_df.location_id.isin(missing_99)) &
               (env_df.age_start == 95), 'age_end'] = 99
    # since we do NOT want to merge on by age_group_id
    env_df.drop('age_group_id', axis=1, inplace=True)

    demography = [
        'location_id', 'year_start', 'year_end', 'age_start', 'age_end',
        'sex_id'
    ]

    # merge envelope onto data
    pre_shape = df.shape[0]
    df = df.merge(env_df, how='left', on=demography)
    assert pre_shape == df.shape[0],\
        "The merge duplicated rows unexpectedly"

    # compute nfc hospitalization rate
    # aka "apply the envelope"
    df['product'] = df['cause_fraction'] * df['mean']
    df['upper_product'] = df['cause_fraction'] * df['upper']
    df['lower_product'] = df['cause_fraction'] * df['lower']

    ############################################
    # RE-ATTACH data that has sample size instead of
    ############################################

    df = pd.concat([df, full_coverage_df]).reset_index(drop=True)
    # NOTE now there's going be a column "sample_size" and "cases" that
    # is null for every source except the fully covered ones

    # if we just want injuries:
    if return_only_inj == True:
        inj_me = pd.read_csv("FILEPATH")
        inj_me = list(inj_me['level1_meid'])
        df = df[df['modelable_entity_id'].isin(inj_me)]

    # drop columns we don't need anymore
    # cause fraction: finished using it, was used to make product
    # mean: finished using it, was used to make product
    # upper: finished using, it was used ot make product_upper
    # lower: finished using, it was used ot make product_lower
    # val, numerator, denomantor: no longer in count space
    # NON FATAL_CAUSE_NAME IS LEFT TO SIGNIFY THAT THIS DATA IS STILL AT THE
    # NONFATAL_CAUSE_NAME LEVEL
    df.drop([
        'cause_fraction', 'mean', 'upper', 'lower', 'val', 'numerator',
        'denominator'
    ],
            axis=1,
            inplace=True)

    # pick the level of analysis for the summary stats below; if both
    # columns exist, nonfatal_cause_name wins (checked second)
    if "bundle_id" in df.columns:
        level_of_analysis = "bundle_id"
    if "nonfatal_cause_name" in df.columns:
        level_of_analysis = "nonfatal_cause_name"
    # write some summary stats of the env being applied
    # this is to review against our aggregated data to find large differences
    today = datetime.datetime.today().strftime("%Y_%m_%d").replace(":", "_")
    # Zeros can really skew the summary stats so drop them
    env_stats = df[df['product'] != 0].\
        groupby(['location_id', level_of_analysis])['product'].\
        describe().reset_index()
    env_stats.rename(columns={'level_2': 'stats'}, inplace=True)
    env_stats.to_csv("FILEPATH", index=False)

    return (df)