Example #1
def read_clean_map():
    """
    Read in the clean map file and do some light prepping to get it ready
    to merge onto the data

    Returns:
        Tuple of (Pandas DataFrame containing the map, int map version)
    """
    # read in clean map
    maps_path = r"FILEPATH"
    maps = pd.read_csv(maps_path, dtype={'cause_code': object})
    assert hosp_prep.verify_current_map(maps)

    # the map should carry exactly one version
    map_versions = maps['map_version'].unique()
    assert len(map_versions) == 1, "multiple map versions present"
    map_version = int(map_versions[0])

    # cast cause codes to uppercase strings
    maps['cause_code'] = maps['cause_code'].astype(str)
    maps['cause_code'] = maps['cause_code'].str.upper()

    # keep relevant columns
    maps = maps[['cause_code', 'nonfatal_cause_name',
                 'code_system_id']].copy()

    # drop duplicate values
    maps = maps.drop_duplicates()
    return maps, map_version
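
A minimal usage sketch, assuming pandas and hosp_prep are importable and the FILEPATH placeholder resolves to the real clean map:

import pandas as pd
import hosp_prep

maps, map_version = read_clean_map()
print("using clean map version {}".format(map_version))
# 'maps' now has cause_code / nonfatal_cause_name / code_system_id and is
# ready to merge onto hospital data on 'cause_code'
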
Example #2

def add_measure(df):
    """
    Adds the bundle measure onto the df and renames the columns to fit with
    the rest of the process.  Data must already have a bundle_id attached.

    Parameters:
        df: Pandas DataFrame
            Must have a 'bundle_id' column
    """

    assert "measure" not in df.columns, "'measure' already exists."
    assert "bundle_id" in df.columns, "'bundle_id' must exist."

    # Read in clean maps to merge measure onto data
    # need this for ELMO reqs
    clean_maps = pd.read_csv(root + r"{FILEPATH}" r"clean_map.csv")
    assert hosp_prep.verify_current_map(clean_maps)
    clean_maps = clean_maps[['bundle_id', 'bid_measure']]
    clean_maps.drop_duplicates(inplace=True)
    clean_maps.rename(columns={'bid_measure': 'measure'}, inplace=True)

    # remove null bundle ids in map
    clean_maps = clean_maps[clean_maps.bundle_id.notnull()]

    # check to make sure rows aren't duplicated
    pre_shape = df.shape[0]
    # merge measure onto data
    df = df.merge(clean_maps, how='left', on='bundle_id')

    # get injuries bids so we can check for missing measures
    pc_injuries = pd.read_csv(root + r"{FILEPATH}"
                              r"parent_child_injuries_gbd2017.csv")
    inj_bids = pc_injuries['Level1-Bundle ID'].unique()

    # some injuries bids didn't get measures!
    assert set(df[df.measure.isnull()].bundle_id).issubset(set(inj_bids)), """
        We expect that all null measures belong to injuries, but that is
        not the case. Something went wrong!"""

    # fix any injuries that are missing measure, all inj are inc:
    df.loc[(df.measure.isnull()) & (df.bundle_id.isin(inj_bids)),
           'measure'] = 'inc'

    # assert that all rows had a measure merged on
    hosp_prep.report_if_merge_fail(df,
                                   check_col='measure',
                                   id_cols='bundle_id',
                                   store=True,
                                   filename="measure_merge_failure")

    assert pre_shape == df.shape[0], "number of rows don't match after merge"

    return df
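
A hedged usage sketch with a hypothetical toy frame (assumes root is set and the clean map and parent/child injury files resolve):

import pandas as pd

df = pd.DataFrame({'bundle_id': [3.0, 75.0], 'val': [10, 4]})  # hypothetical
df = add_measure(df)
# every row should now carry 'inc' or 'prev' in the new 'measure' column
print(df[['bundle_id', 'measure']])
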
Example #3
def verify_missing(bundle, locs, age, sex, years, run_id, map_path=None):
    """
    pass a clean map, a bundle and set of demographic info and the func
    will return the data if it exists and a print statement if not
    """
    if map_path is None:
        map_path = "FILEPATH".format(run_id)

    if isinstance(locs, int):
        locs = [locs]
    if isinstance(years, int):
        years = [years]

    # read in location/source map
    loc_source = pd.read_csv(r"FILEPATH")
    loc_source = loc_source[loc_source.location_id.isin(locs)]
    df_list = []
    for source in loc_source.source.unique():
        for year in years:
            try:
                df = pd.read_hdf(r"FILEPATH" + source + "_" + str(year) +
                                 "FILEPATH",
                                 key='df')
                df_list.append(df)
                del df
            except Exception:
                print("couldn't read in " + source + " " + str(year))

    data = pd.concat(df_list, sort=False)
    del df_list
    amap = pd.read_csv(map_path)
    amap.rename({'icg_name': 'icg_id'}, axis=1, inplace=True)
    assert hosp_prep.verify_current_map(amap)

    amap = amap.query("bundle_id == @bundle")

    data = data[(data.cause_code.isin(amap.cause_code))]
    data = data.query("age_start == @age & sex_id == @sex")
    if data.shape[0] == 0:
        print("uuhhh, yeah. there's no data here")
        print("age {} sex {} bundle {} years {} locations {}".format(
            age, sex, bundle, years, locs))
    else:
        return data
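
A usage sketch with hypothetical demographic values:

# look for bundle 3 among 15 year old males in two locations (all values
# hypothetical)
df = verify_missing(bundle=3, locs=[102, 523], age=15, sex=1,
                    years=[2010, 2015], run_id=5)
if df is not None:
    print(df.shape)
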
Example #4
def expand_bundles(df, drop_null_bundles=True):
    """
    This Function maps groups of ICD codes to Bundles.
    """

    assert "bundle_id" not in df.columns, "bundle_id has already been attached"
    assert "nonfatal_cause_name" in df.columns,\
        "'nonfatal_cause_name' must be a column"

    # merge on bundle id
    maps = pd.read_csv(root + r"{FILEPATH}clean_map.csv")
    assert hosp_prep.verify_current_map(maps)
    maps = maps[['nonfatal_cause_name', 'bundle_id']].copy()
    maps = maps.drop_duplicates()

    df = df.merge(maps, how='left', on='nonfatal_cause_name')

    if drop_null_bundles:
        # drop rows without bundle id
        df = df[df.bundle_id.notnull()]

    return df
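
A hedged usage sketch with a hypothetical frame (assumes root is set and the clean map resolves):

import pandas as pd

df = pd.DataFrame({'nonfatal_cause_name': ['digest, gastritis'],
                   'val': [7]})  # hypothetical cause name
df = expand_bundles(df)
# rows duplicate when a nonfatal_cause_name maps to more than one bundle
print(df[['nonfatal_cause_name', 'bundle_id']])
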
Example #5
def get_bundle_paths(cfpath):
    """
    Return the correction factor bundle files in cfpath, warning if the set
    of bundle IDs present differs from the bundles in the current clean map.
    """
    cf_bundles = glob.glob(cfpath + "*.csv")
    cm = pd.read_csv(r"FILEPATH".format(root))
    assert hosp_prep.verify_current_map(cm), "The map version is not correct"

    # create a list of bundle id ints to check against the map
    bundles = [
        int(re.sub("[^0-9]", "", os.path.basename(f))) for f in cf_bundles
    ]
    map_bundles = cm[cm.bundle_id.notnull()].bundle_id.unique()
    bdiff = set(bundles).symmetric_difference(set(map_bundles))
    if bdiff:
        msg = """
        {} bundles are different between the current map and the CF folder. Bundles {}
        are present in the CFs but not the map and bundles {} are present in the map but
        not the CF folder.
        """.format(len(bdiff),
                   set(bundles) - set(map_bundles),
                   set(map_bundles) - set(bundles))
        warnings.warn(msg)
    return cf_bundles
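
A usage sketch, assuming the correction factor files in the directory are named with their bundle IDs:

import os

cf_files = get_bundle_paths("FILEPATH")
for f in cf_files[:3]:
    print(os.path.basename(f))
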
Example #6
def verify_missing(bundle, locs, age, sex, years,
                   map_path="FILEPATH"):
    """
    pass a clean map, a bundle and set of demographic info and the func
    will return the data if it exists and a print statement if not
    """
    if isinstance(locs, int):
        locs = [locs]
    if isinstance(years, int):
        years = [years]
    # read in location/source map
    loc_source = pd.read_csv(r"FILEPATH")
    loc_source = loc_source[loc_source.location_id.isin(locs)]
    df_list = []
    for source in loc_source.source.unique():
        for year in years:
            try:
                df = pd.read_hdf(r"FILEPATH")
                df_list.append(df)
                del df
            except Exception:
                print("couldn't read in " + source + " " + str(year))

    data = pd.concat(df_list)
    del df_list
    amap = pd.read_csv(map_path)
    assert hosp_prep.verify_current_map(amap)
    # get the icd codes associated with a bundle
    amap = amap.query("bundle_id == @bundle")
    # subset on icd codes
    data = data[(data.cause_code.isin(amap.cause_code))]
    data = data.query("age_start == @age & sex_id == @sex")
    if data.shape[0] == 0:
        print("uuhhh, yeah. there's no data here")
        print("age {} sex {} bundle {} years {} locations {}".format(age, sex, bundle, years, locs))
    else:
        return data
Example #7

import sys

import pandas as pd

import hosp_prep

# print warning message about latest version of maps:
print("=====================================")
print("=== PLEASE MAKE SURE YOU ARE USING===")
print("==== LATEST VERSION OF CLEAN MAP ====")
print("=====================================")

# read in clean maps
# TODO: change this to the real clean map (clean_map_9) when the time comes
df = pd.read_csv(root + r"FILEPATH")

map_versions = df.map_version.unique()
assert len(map_versions) == 1, "multiple map versions present"
map_vers = int(map_versions[0])
assert hosp_prep.verify_current_map(df)

df = df[['cause_code', 'bundle_id', 'code_system_id', 'bid_measure', 'level']]

# select only ICD9 parts
df = df[df['code_system_id'] == 1]

# get rid of null values
df.dropna(axis=0, inplace=True)

# make icd codes strings
df['cause_code'] = df['cause_code'].astype(str)

# drop code system id and level columns
df.drop(['code_system_id', 'level'], axis=1, inplace=True)
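
A hedged sketch of how the prepped ICD9 bundle map might be merged downstream (hypothetical hospital frame and ICD code):

hosp = pd.DataFrame({'cause_code': ['4019'], 'val': [3]})  # hypothetical
hosp = hosp.merge(df, how='left', on='cause_code')
# each ICD9 code now carries its bundle_id and bid_measure
print(hosp)
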
Example #8
def prep_weights(df, level):
    """
    Function that computes weights for use in age sex splitting.

    Parameters:
        df: pandas DataFrame
            input data to inform weights. Should already have the appropriate
            age groups.  Should have column "product" which is the product
            of cause_fraction and the hospital utilization envelope.
            age_group_id should be present.  df is used only as an input to
            make weights.
        level: string
            Must be "nonfatal_cause_name"; indicates that we're making cause
            level weights. ("bundle_id" is no longer supported and the
            function asserts if it is passed.)
    Returns:
        DataFrame that has weights computed at level of the Parameter 'level'
    """
    print("Getting {} weights...".format(level))

    # remove the all sexes sex id
    df = df[df.sex_id != 3].copy()

    # code is set up to use both age_start/age_end and age_group_id
    df = gbd_hosp_prep.all_group_id_start_end_switcher(df, remove_cols=False)

    for source in df.source.unique():
        age_max = df[df.source == source].age_start.max()
        assert age_max == 95,\
            "source {} doesn't have max age_start == 95".format(source)

    # keep relevant columns
    df = df[[
        'age_group_id', 'age_start', 'age_end', 'location_id', 'sex_id',
        'year_end', 'year_start', 'product', 'nonfatal_cause_name'
    ]].copy()

    # make square aka cartesian. We want population to be contributed by all
    # age-countries, regardless of whether there are any cases for a given
    # age-country pair. For every location-year that we already have, we want
    # all age-sex (and bundle/cause) combinations. This introduces nulls
    # where there wasn't any data, which are then filled with zeros
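    # e.g. if location 10 / year 2000 only has rows for (age 15, sex 1),
    # squaring adds rows for every other age/sex/cause combination in that
    # location-year; their 'product' gets filled with 0 below (location and
    # year are hypothetical, for illustration)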
    template = hosp_prep.make_square(df)
    df = template.merge(df,
                        how='left',
                        on=[
                            'age_group_id', 'sex_id', 'location_id',
                            'year_start', 'nonfatal_cause_name', 'year_end',
                            'age_start', 'age_end'
                        ])
    # fill zeros
    df.update(df['product'].fillna(0))

    # merge pop on so we can convert to count space, so we can do addition
    # get info for pop
    age_list = list(df.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # get pop
    pop = get_population(QUERY)

    # format pop
    pop.drop("run_id", axis=1, inplace=True)
    pop['year_start'] = pop['year_id']
    pop['year_end'] = pop['year_id']
    pop.drop('year_id', axis=1, inplace=True)

    # merge pop
    df = df.merge(
        pop,
        how='left',
        on=['location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'])

    # multiply by pop to get into count space so we can get to
    # age / sex / bundle groups
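    # e.g. a rate ('product') of 0.002 in a population of 50,000 becomes
    # 100 counts, which can be summed across locations and years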
    df['counts'] = df['product'] * df['population']

    if level == 'bundle_id':
        assert False, "Level can no longer be bundle id"
        # in this section you merge on bundle_id via nonfatal_cause_name, and
        # then drop nonfatal_cause_name
        maps = pd.read_csv(r"FILEPATH/clean_map.csv")
        assert hosp_prep.verify_current_map(maps)
        maps = maps[['nonfatal_cause_name', 'bundle_id', 'level']].copy()
        maps = maps[maps.level == 1].copy()
        maps = maps.drop('level', axis=1)
        maps = maps.dropna(subset=['bundle_id'])
        maps = maps.drop_duplicates()

        df = df.merge(maps, how='left', on='nonfatal_cause_name')

        df.drop("nonfatal_cause_name", axis=1, inplace=True)

    group_cols = ['age_end', 'age_start', 'age_group_id', 'sex_id', level]
    df = df.groupby(by=group_cols).agg({
        'counts': 'sum',
        'population': 'sum'
    }).reset_index()

    # divide by pop to get back into rate space ... and we have the start of
    # our weights
    df['weight'] = df['counts'] / df['population']

    return df
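
A usage sketch, assuming a prepped hospital df with 'product', 'source', and the age/sex/location/year columns described in the docstring:

weights = prep_weights(df, level='nonfatal_cause_name')
# one row per age/sex/cause with 'weight' back in rate space
print(weights[['age_group_id', 'sex_id', 'nonfatal_cause_name',
               'weight']].head())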