Example #1
def extract_age(df):

    # get ages as ints
    pre = df.shape[0]
    df = pd.concat([df, df.age.str.split("0", expand=True)], axis=1)
    assert df.shape[0] == pre
    df.rename(columns={
        0: 'age_start',
        1: 'age_end',
        2: 'single'
    },
              inplace=True)

    df.loc[df.single == '4', 'age_end'] = '4'
    df.loc[df.single == '9', 'age_end'] = '9'
    df.drop(['single', 3], axis=1, inplace=True)

    df['age_start'] = pd.to_numeric(df['age_start'])
    df['age_end'] = pd.to_numeric(df['age_end'])

    df['age_start'] = df['age_end'] - 4
    df.loc[df.age == '000', ['age_start', 'age_end']] = 0
    df.loc[df.age == "095", ['age_start', 'age_end']] = [95, 124]
    df.loc[df.age_end == 4, 'age_start'] = 1
    df['age_end'] = df['age_end'] + 1

    df = gbd_hosp_prep.all_group_id_start_end_switcher(df.copy(),
                                                       remove_cols=False)
    return df
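Example #1 (and Example #3 below) hinge on the same pandas pattern: str.split(..., expand=True) returns integer-named columns that are then renamed to age_start/age_end. A minimal, self-contained sketch of the pattern with toy "start-end" ages (not the real source format, which uses its own delimiter):

import pandas as pd

df = pd.DataFrame({'age': ['0-4', '5-9', '10-14']})
pre = df.shape[0]
parts = df.age.str.split('-', expand=True)  # integer-named columns 0 and 1
df = pd.concat([df, parts], axis=1)
assert df.shape[0] == pre
df.rename(columns={0: 'age_start', 1: 'age_end'}, inplace=True)
df['age_start'] = pd.to_numeric(df['age_start'])
df['age_end'] = pd.to_numeric(df['age_end'])
print(df)
#      age  age_start  age_end
# 0    0-4          0        4
# 1    5-9          5        9
# 2  10-14         10       14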
Example #2
def get_baby_age_year_combo(df, test=False):
    today = re.sub(r"\W", "_", str(datetime.datetime.now()))[0:10]

    if test:
        df = pd.read_csv(r"FILEPATH")
    else:
        df = metadata_reader('baby_age_pattern')

    df = gbd_hosp_prep.all_group_id_start_end_switcher(df)

    def expandgrid(*itrs):

        product = list(itertools.product(*itrs))
        return ({
            'Var{}'.format(i + 1): [x[i] for x in product]
            for i in range(len(itrs))
        })

    year_list = list(range(1990, 2016))

    age_list = [0, 1] + list(range(5, 100, 5))
    age_list = np.asarray(age_list)

    baby_list = df.nonfatal_cause_name.unique()

    square = expandgrid(age_list, year_list, baby_list)
    square = pd.DataFrame(square)
    square.columns = ['age_start', 'year_start', 'nonfatal_cause_name']

    for source in df.source.unique():

        df_source = df[df.source == source].copy()
        df_source = df_source[[
            'age_start', 'year_start', 'nonfatal_cause_name'
        ]].drop_duplicates()

        df_source = df_source[df_source.year_start >= 1990]

        age_year = df_source[[
            'age_start', 'year_start', 'nonfatal_cause_name'
        ]].drop_duplicates()
        age_year['present'] = 'yes'
        age_year = age_year[age_year.year_start >= 1990]

        exists = square.merge(
            age_year,
            how='left',
            on=['age_start', 'year_start', 'nonfatal_cause_name'])
        exists = exists.fillna('no')

        exists.rename(columns={
            'age_start': 'age',
            'year_start': 'year'
        },
                      inplace=True)

        exists.to_csv("FILENAME" "FILEPATH".format(source, today), index=False)
    return
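The expandgrid helper mirrors R's expand.grid: it takes any number of iterables and returns a dict of columns covering their full cartesian product, so every age/year/cause combination appears exactly once. A standalone check of the helper:

import itertools
import pandas as pd

def expandgrid(*itrs):
    product = list(itertools.product(*itrs))
    return {'Var{}'.format(i + 1): [x[i] for x in product]
            for i in range(len(itrs))}

square = pd.DataFrame(expandgrid([0, 1, 5], [1990, 1991], ['flu']))
square.columns = ['age_start', 'year_start', 'nonfatal_cause_name']
print(square.shape)  # (6, 3): 3 ages x 2 years x 1 cause, every combination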
Example #3
def extract_age(df):
    # get ages as ints
    pre = df.shape[0]
    df = pd.concat([df, df.age.str.split("-", expand=True)], axis=1)
    assert df.shape[0] == pre
    df.rename(columns={0: 'age_start', 1: 'age_end'}, inplace=True)
    df.loc[df.age_start == "85+", ['age_start', 'age_end']] = ["85", "124"]
    df.loc[df.age_start == "64", 'age_start'] = "65"
    df['age_start'] = pd.to_numeric(df['age_start'])
    df['age_end'] = pd.to_numeric(df['age_end'])

    df['age_end'] = df['age_end'] + 1
    df = gbd_hosp_prep.all_group_id_start_end_switcher(df.copy(),
                                                       remove_cols=False)
    return df
Example #4
def prep_weights(df, level, gbd_round_id, decomp_step, squaring_method, inp_pipeline=True):
    """
    Function that computes weights for use in age sex splitting.

    Parameters:
        df: pandas DataFrame
            input data to inform weights. Should already have the appropriate
            age groups. Should have a column "product", which is the product
            of cause_fraction and the hospital utilization envelope.
            age_group_id should be present. df is used only as an input to
            make weights.
        level: string
            Must be "bundle_id" or "icg_id". Indicates whether we're making
            bundle-level weights or cause-level weights. Note that bundle_id
            weights will currently fail.
        gbd_round_id: int
            GBD round ID, passed through to get_population.
        decomp_step: str
            Decomposition step, passed through to get_population.
        squaring_method: str
            Must be 'broad' or 'bundle_source_specific'. 'broad' adds zeros
            for bundles that are never coded for a data source, while
            'bundle_source_specific' does not.
        inp_pipeline: bool
            If True, keep only the columns used in the inpatient pipeline.
    Returns:
        DataFrame with weights computed at the level of the parameter 'level'
    """

    assert level in df.columns, "{} is not present in the data".format(level)
    assert 'product' in df.columns, "Product column must be present"
    assert 'source' in df.columns, "Source column must be present"

    if level not in ("bundle_id", "icg_id"):
        raise ValueError("level must either be 'bundle_id' or 'icg_id'")

    if squaring_method not in ('broad', 'bundle_source_specific'):
        raise ValueError("squaring method must be either 'broad' or 'bundle_source_specific'")

    print("Getting {} weights...".format(level))

    # remove the all-sexes sex_id
    df = df[df.sex_id != 3].copy()

    # the code is set up to use both age_start/age_end and age_group_id
    df = gbd_hosp_prep.all_group_id_start_end_switcher(df, remove_cols=False)

    test_weight_sources(df[['source', 'age_start', 'age_end']].drop_duplicates())

    if inp_pipeline:
        # keep only the columns the inpatient pipeline needs
        keep_cols = ['age_group_id', 'age_start', 'age_end', 'location_id',
                     'sex_id', 'year_end', 'year_start', 'product', 'icg_id']
        if squaring_method == 'bundle_source_specific':
            new_square_cols = ['source', 'nid', 'facility_id', 'representative_id']
            keep_cols = keep_cols + new_square_cols

        df = df[keep_cols].copy()


    # make square aka cartesian. We want population to be contributed by all
    # age-countries, regardless of whether there are any cases for all
    # age-country pairs. For every location-year that we already have, we want
    # all age-sex (and bundle/cause) combinations. This introduces nulls where
    # there wasn't any data, which are then filled with zeros
    if squaring_method == 'broad':
        template = hosp_prep.make_square(df)
        df = template.merge(df, how='left',
                            on=['age_group_id', 'sex_id', 'location_id',
                                'year_start', 'year_end', 'age_start', 'age_end',
                            level])
        # fill the introduced nulls with zeros
        df.update(df['product'].fillna(0))

    elif squaring_method == 'bundle_source_specific':
        df = hosp_prep.make_zeros(df, cols_to_square='product', etiology=level)
        # drop the columns that were only needed for squaring
        if inp_pipeline:
            df.drop(new_square_cols, axis=1, inplace=True)

    else:
        assert False, f"{squaring_method} isn't a recognized squaring method"
    print("Finished making the data square")

    # merge population on so we can convert to count space and do addition
    age_list = list(df.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # get pop
    pop = get_population(age_group_id=age_list, location_id=loc_list,
                         sex_id=[1, 2], year_id=year_list,
                         gbd_round_id=gbd_round_id, decomp_step=decomp_step)

    # format pop to align year columns
    pop.drop("run_id", axis=1, inplace=True)
    pop['year_start'] = pop['year_id']
    pop['year_end'] = pop['year_id']
    pop.drop('year_id', axis=1, inplace=True)

    # merge pop onto the data
    pre_shape = df.shape[0]
    df = df.merge(pop, how='left',
                  on=['location_id', 'year_start', 'year_end',
                      'age_group_id', 'sex_id'])
    assert pre_shape == df.shape[0], "Merging on pop changed the number of rows"
    print("Population has been merged onto the data")

    # multiply by pop to get into count space so we can aggregate to
    # age / sex / cause groups
    df['counts'] = df['product'] * df['population']


    # aggregate counts and population to the weight group level
    group_cols = ['age_end', 'age_start', 'age_group_id', 'sex_id', level]
    df = df.groupby(by=group_cols).agg({'counts': 'sum',
                                        'population': 'sum'}).reset_index()
    print("Groupby is complete, calculating first pass of weights")


    # divide by pop to get back into rate space, giving the start of our
    # weights
    df['weight'] = df['counts'] / df['population']

    return df
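The arithmetic at the heart of prep_weights is rate -> count -> aggregate -> rate: rates can't be summed across locations and years directly, so they're multiplied by population first, pooled, and divided by the pooled population. A toy illustration with made-up numbers and no GBD shared functions:

import pandas as pd

df = pd.DataFrame({
    'age_group_id': [5, 5, 6, 6],
    'sex_id':       [1, 1, 1, 1],
    'icg_id':       [10, 10, 10, 10],
    'product':      [0.010, 0.030, 0.020, 0.020],  # rates per location-year
    'population':   [1000, 3000, 2000, 2000],
})
df['counts'] = df['product'] * df['population']   # rate -> count
w = (df.groupby(['age_group_id', 'sex_id', 'icg_id'])
       .agg({'counts': 'sum', 'population': 'sum'})
       .reset_index())
w['weight'] = w['counts'] / w['population']       # pooled count -> rate
print(w)
# age_group_id 5 pools (10 + 90) / 4000 = 0.025; age group 6 stays at 0.020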
Example #5
def format_hospital(write_hdf=False, downsize=False, verbose=True, head=False):
    """
    Function that reads all our formatted data and concatenates them together.

    Arguments:
        write_hdf: (bool) If true, writes an HDF H5 file of the aggregated data
        downsize: (bool) If true, numeric types will be cast down to the
            smallest size
        verbose: (bool) If true will print progress and information
        head: (bool) If true will only grab first 1000 rows of each source.
    """
    start = time.time()
    today = datetime.datetime.today().strftime("%Y_%m_%d")

    print("reading gbd2015 hospital data...")

    us_filepath = (r"FILENAME" r"FILEPATH")
    us = pd.read_stata(us_filepath)

    us.rename(columns={
        'dx_mapped_': 'cause_code',
        'dx_ecode_id': 'diagnosis_id'
    },
              inplace=True)

    loop_list = [us]
    stata_sources = [
        'NOR_NIPH_08_12', 'USA_HCUP_SID_03', 'USA_HCUP_SID_04',
        'USA_HCUP_SID_05', 'USA_HCUP_SID_06', 'USA_HCUP_SID_07',
        'USA_HCUP_SID_08', 'USA_HCUP_SID_09', 'USA_NHDS_79_10', 'BRA_SIH',
        'MEX_SINAIS', 'NZL_NMDS', 'EUR_HMDB', 'SWE_PATIENT_REGISTRY_98_12'
    ]

    for source in stata_sources:
        if verbose:
            print(source)
        filepath = (r"FILEPATH" r"FILEPATH").format(source, source)
        if head:
            new_df = hosp_prep.read_stata_chunks(filepath,
                                                 chunksize=1000,
                                                 chunks=1)
        else:
            new_df = pd.read_stata(filepath)
        if source == "EUR_HMDB":
            new_df.drop(['metric_bed_days', 'metric_day_cases'],
                        axis=1,
                        inplace=True)
        if verbose:
            print_diff_cols(new_df, us)

        loop_list.append(new_df)

    df = pd.concat(loop_list, ignore_index=True)

    df.loc[df.source.str.startswith("USA_HCUP_SID_"),
           "source"] = "USA_HCUP_SID"

    df.loc[df.age_end > 1, 'age_end'] = df.loc[df.age_end > 1, 'age_end'] + 1
    print("Concatinated all gbd 2015 data together in {} minutes".format(
        (time.time() - start) / 60))

    rename_dict = {
        'cases': 'val',
        'sex': 'sex_id',
        'NID': 'nid',
        'dx_ecode_id': "diagnosis_id",
        'dx_mapped_': "cause_code",
        'icd_vers': 'code_system_id',
        'platform': 'facility_id',
        'national': 'representative_id',
        'year': 'year_id'
    }
    df.rename(columns=rename_dict, inplace=True)

    df.drop(['subdiv', 'iso3', 'deaths'], axis=1, inplace=True)

    df.loc[df['representative_id'] == 0, 'representative_id'] = 3

    df['nid'] = df['nid'].astype(int)
    assert len(df['code_system_id'].unique()) <= 2, (
        "We assume that there are "
        "only 2 ICD formats present: ICD 9 and ICD 10")
    df['code_system_id'].replace(['ICD9_detail', 'ICD10'], [1, 2],
                                 inplace=True)

    df['facility_id'].replace([1, 2],
                              ['inpatient unknown', 'outpatient unknown'],
                              inplace=True)

    df['outcome_id'] = 'case'
    df['metric_id'] = 1
    df['age_group_unit'] = 1
    df['year_start'] = df['year_id']
    df['year_end'] = df['year_id']
    df.drop('year_id', axis=1, inplace=True)

    df = df[df['val'] > 0]

    df.loc[(df.source == 'NZL_NMDS') & (df.age_end == 1), 'age_end'] = 5

    print("Done formating GBD 2015 in {} minutes".format(
        (time.time() - start) / 60))

    print("Reading in new data")

    h5_sources = [
        'KGZ_MHIF', 'IND_SNH', 'CHN_NHSIRS', 'TUR_DRGHID', 'CHL_MOH',
        'DEU_HSRS', 'PRT_CAHS', 'ITA_IMCH', 'PHL_HICC', 'NPL_HID', 'QAT_AIDA',
        'KEN_IMMS', 'GEO_COL', 'UK_HOSPITAL_STATISTICS', 'IDN_SIRS', 'VNM_MOH',
        "JOR_ABHD", 'AUT_HDD', 'ECU_INEC_97_14'
    ]

    loop_list = [df]
    for source in h5_sources:
        if verbose:
            print(source)
        filepath = (r"FILEPATH" r"FILEPATH").format(source, source)
        if head:
            new_df = pd.read_hdf(filepath, key="df", start=0, stop=1000)
        else:
            new_df = pd.read_hdf(filepath, key="df")
        assert set(df.columns).symmetric_difference(set(new_df.columns)) == \
            {'deaths'} or set(df.columns).symmetric_difference(set(new_df.columns)) == set(), \
            print_diff_cols(new_df, df)

        loop_list.append(new_df)

    df = pd.concat(loop_list, ignore_index=True)
    del loop_list
    print("Done reading in new data in {} minutes".format(
        (time.time() - start) / 60))

    df = df.loc[(df.source != "GEO_COL") | (~df.year_start.isin([2012, 2013]))]

    df = df[df.location_id != 4749]

    df = df[df['val'] > 0]

    df.loc[(df.sex_id != 1) & (df.sex_id != 2), 'sex_id'] = 3

    hosp_frmat_feat = [
        'location_id', 'year_start', 'year_end', 'age_group_unit', 'age_start',
        'age_end', 'sex_id', 'source', 'nid', 'representative_id',
        'facility_id', 'code_system_id', 'diagnosis_id', 'cause_code',
        'outcome_id', 'metric_id', 'val'
    ]

    columns_before = df.columns
    df = df[hosp_frmat_feat]
    columns_after = df.columns

    assert set(columns_before) == set(columns_after),\
        "You accidentally dropped a column while reordering"

    print("converting datatypes and downsizing...")

    if verbose:
        print(df.info(memory_usage='deep'))
    df['cause_code'] = df['cause_code'].astype('str')
    df['source'] = df['source'].astype('category')
    df['facility_id'] = df['facility_id'].astype('category')
    df['outcome_id'] = df['outcome_id'].astype('category')

    int_list = [
        'location_id', 'year_start', 'year_end', 'age_group_unit', 'age_start',
        'age_end', 'sex_id', 'nid', 'representative_id', 'code_system_id',
        'diagnosis_id', 'metric_id'
    ]

    if downsize:
        # downcast numeric columns to the smallest integer dtype that fits
        for col in int_list:
            try:
                df[col] = pd.to_numeric(df[col], errors='raise',
                                        downcast='integer')
            except Exception:
                print(col, "<- this col didn't work")
    if verbose:
        print(df.info(memory_usage='deep'))

    # sources whose age_end is still inclusive (e.g. 25-29) get shifted to
    # exclusive bins by adding 1
    bad_ends = df[(df.age_end == 29)].source.unique()
    for asource in bad_ends:
        df.loc[(df.age_end > 1) & (df.source == asource),
               'age_end'] = df.loc[(df.age_end > 1) &
                                   (df.source == asource), 'age_end'] + 1

    df.loc[df.age_start > 95, 'age_start'] = 95
    df.loc[df.age_start >= 95, 'age_end'] = 125
    df.loc[df.age_end == 100, 'age_end'] = 125

    df = gbd_hosp_prep.all_group_id_start_end_switcher(df)

    assert len(df['diagnosis_id'].unique()) <= 2,\
              "diagnosis_id should have 2 or fewer feature levels"
    assert len(df['code_system_id'].unique()) <= 2,\
              "code_system_id should have 2 or fewer feature levels"

    for asource in df.source.unique():
        outcomes = df[df['source'] == asource].outcome_id.unique()
        if "case" in outcomes:
            assert "death" not in outcomes
            assert "discharge" not in outcomes

    print("Done processing formatted files in {} minutes".format(
        (time.time() - start) / 60))
    if write_hdf:
        write_path = (r"FILENAME" r"FILEPATH".format(today))
        category_cols = ['cause_code', 'source', 'facility_id', 'outcome_id']
        for col in category_cols:
            df[col] = df[col].astype(str)
        df.to_hdf(write_path,
                  key='df',
                  format='table',
                  complib='blosc',
                  complevel=5,
                  mode='w')

    return df
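A note on the downsize branch: pd.to_numeric only shrinks a column's dtype when the downcast argument is passed, and the category casts handle the repetitive string columns. A small demonstration of both effects on toy data:

import pandas as pd

s = pd.Series(range(1000))                    # int64 on most platforms
small = pd.to_numeric(s, downcast='integer')  # int16: 0..999 fits
strings = pd.Series(['inpatient unknown'] * 1000)
cat = strings.astype('category')              # one copy of the string + codes
print(s.memory_usage(deep=True), small.memory_usage(deep=True))
print(strings.memory_usage(deep=True), cat.memory_usage(deep=True))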
Example #6
def apply_restrictions(df,
                       age_set,
                       cause_type,
                       map_version='current',
                       prod=True):
    """
    Apply age and sex restrictions by ICG or bundle to a dataframe of clinical data

    Params:
        df: (pd.DataFrame) clinical data
        age_set: (str) indicates whether the data is in individual-year ages,
                       binned age groups with start/end, or age_group_ids.
                       Acceptable values are "indv", "binned", "age_group_id"
        cause_type: (str) whether we want ICG or bundle restrictions

    Returns:
        df: (pd.DataFrame) with rows that fall outside of age-sex restrictions
            dropped
    """
    warnings.warn("apply_restrictions needs a testing suite!!")
    sex_diff = set(df.sex_id.unique()).symmetric_difference([1, 2])
    if sex_diff:
        warnings.warn(
            f"There are sex_id values that won't have restrictions applied to them. These are {sex_diff}"
        )

    assert age_set in ['indv', 'binned', 'age_group_id'], \
        "{} is not an acceptable age set".format(age_set)

    check_map_version(map_version)

    start_cols = df.columns

    if age_set == "age_group_id":
        import gbd_hosp_prep

        df = gbd_hosp_prep.all_group_id_start_end_switcher(df)
    elif age_set == 'indv':
        df = hosp_prep.age_binning(df,
                                   drop_age=False,
                                   terminal_age_in_data=False)

    df['to_keep'] = 1

    if cause_type == 'icg':
        restrict = get_clinical_process_data('age_sex_restrictions',
                                             map_version,
                                             prod=prod)
    elif cause_type == 'bundle':
        restrict = create_bundle_restrictions(map_version)
    else:
        assert False, "pick an acceptable restriction type"

    assert set(restrict.loc[restrict['yld_age_start'] < 1,
                            'yld_age_start'].unique()) == {0}

    keep_cols = [
        cause_type + '_id', 'male', 'female', 'yld_age_start', 'yld_age_end'
    ]

    pre = df.shape[0]
    df = df.merge(restrict[keep_cols], how='left', on=cause_type + '_id')
    assert pre == df.shape[0], ("merge made more rows, there's something wrong"
                                " in the restrictions file")

    # null out rows that violate the sex or age restrictions, then keep only
    # the rows where to_keep survived
    df.loc[(df['male'] == 0) & (df['sex_id'] == 1), 'to_keep'] = np.nan
    df.loc[(df['female'] == 0) & (df['sex_id'] == 2), 'to_keep'] = np.nan
    df.loc[df['age_end'] <= df['yld_age_start'], 'to_keep'] = np.nan
    df.loc[df['age_start'] > df['yld_age_end'], 'to_keep'] = np.nan

    df = df[df['to_keep'].notnull()]

    df.drop(['male', 'female', 'yld_age_start', 'yld_age_end', 'to_keep'],
            axis=1,
            inplace=True)

    if age_set == "age_group_id":

        df = gbd_hosp_prep.all_group_id_start_end_switcher(df)
    elif age_set == "indv":
        df.drop(['age_start', 'age_end'], axis=1, inplace=True)

    diff_cols = set(start_cols).symmetric_difference(set(df.columns))
    assert not diff_cols, "The diff columns are {}".format(diff_cols)

    return df
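The restriction step boils down to a left merge on the cause id followed by nulling a to_keep flag for rows outside the sex or age limits. A minimal sketch with one invented restriction row (the real restrictions come from get_clinical_process_data or create_bundle_restrictions):

import numpy as np
import pandas as pd

df = pd.DataFrame({'icg_id': [1, 1, 1],
                   'sex_id': [1, 2, 2],
                   'age_start': [0, 0, 50],
                   'age_end': [5, 5, 55]})
# invented restriction: females only, ages 0-45
restrict = pd.DataFrame({'icg_id': [1], 'male': [0], 'female': [1],
                         'yld_age_start': [0], 'yld_age_end': [45]})
df = df.merge(restrict, how='left', on='icg_id')
df['to_keep'] = 1.0  # float so it can hold NaN
df.loc[(df['male'] == 0) & (df['sex_id'] == 1), 'to_keep'] = np.nan
df.loc[(df['female'] == 0) & (df['sex_id'] == 2), 'to_keep'] = np.nan
df.loc[df['age_end'] <= df['yld_age_start'], 'to_keep'] = np.nan
df.loc[df['age_start'] > df['yld_age_end'], 'to_keep'] = np.nan
print(df[df['to_keep'].notnull()])  # only the female 0-5 row survives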
Example #7
def prep_weights(df, level):
    """
    Function that computes weights for use in age sex splitting.

    Parameters:
        df: pandas DataFrame
            input data to inform weights. Should already have the appropriate
            age groups.  Should have column "product" which is the product
            of cause_fraction and the hospital utilization envelope.
            age_group_id should be present.  df is used only as an input to
            make weights.
        level: string
            Must be "bundle_id" or "nonfatal_cause_name". indicates if we're making
            bundle level weights or cause level weights.
    Returns:
        DataFrame that has weights computed at level of the Parameter 'level'
    """
    print("Getting {} weights...".format(level))

    # remove the all sexes sex id
    df = df[df.sex_id != 3].copy()

    # code is set up to use both age_start/age_end and age_group_id
    df = gbd_hosp_prep.all_group_id_start_end_switcher(df, remove_cols=False)

    for source in df.source.unique():
        age_max = df[df.source == source].age_start.max()
        assert age_max == 95,\
            "source {} doesn't have max age_start == 95".format(source)

    # keep relevant columns
    df = df[[
        'age_group_id', 'age_start', 'age_end', 'location_id', 'sex_id',
        'year_end', 'year_start', 'product', 'nonfatal_cause_name'
    ]].copy()

    # make square aka cartesian. We want population to be contributed by all
    # age-countries, regardless of whether there are any cases for all
    # For every location-year that we already have, we want all age-sex
    # (and bundle/cause) combinations.  This introduces Nulls where there wasn't
    # any data, which are then filled with zeros
    template = hosp_prep.make_square(df)
    df = template.merge(df,
                        how='left',
                        on=[
                            'age_group_id', 'sex_id', 'location_id',
                            'year_start', 'nonfatal_cause_name', 'year_end',
                            'age_start', 'age_end'
                        ])
    # fill zeros
    df.update(df['product'].fillna(0))

    # merge pop on so we can convert to count space, so we can do addition
    # get info for pop
    age_list = list(df.age_group_id.unique())
    loc_list = list(df.location_id.unique())
    year_list = list(df.year_start.unique())

    # get pop
    pop = get_population(QUERY)

    # format pop
    pop.drop("run_id", axis=1, inplace=True)  # and lock it ??
    pop['year_start'] = pop['year_id']
    pop['year_end'] = pop['year_id']
    pop.drop('year_id', axis=1, inplace=True)

    # merge pop
    df = df.merge(
        pop,
        how='left',
        on=['location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id'])

    # multiply by pop to get into count space so we can get to
    # age / sex / bundle groups
    df['counts'] = df['product'] * df['population']

    if level == 'bundle_id':
        assert False, "Level can no longer be bundle id"
        # in this section you merge on bundle_id via nonfatal_cause_name, and
        # then drop nonfatal_cause_name
        maps = pd.read_csv(r"FILEPATH/clean_map.csv")
        assert hosp_prep.verify_current_map(maps)
        maps = maps[['nonfatal_cause_name', 'bundle_id', 'level']].copy()
        maps = maps[maps.level == 1].copy()
        maps = maps.drop('level', axis=1)
        maps = maps.dropna(subset=['bundle_id'])
        maps = maps.drop_duplicates()

        df = df.merge(maps, how='left', on='nonfatal_cause_name')

        df.drop("nonfatal_cause_name", axis=1, inplace=True)

    group_cols = ['age_end', 'age_start', 'age_group_id', 'sex_id', level]
    df = df.groupby(by=group_cols).agg({
        'counts': 'sum',
        'population': 'sum'
    }).reset_index()

    # divide by pop to get back into rate space ... and we have the start of
    # our weights
    df['weight'] = df['counts'] / df['population']

    return df
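hosp_prep.make_square is internal to this codebase, but the squaring it performs is conceptually a cartesian template left-joined back onto the data, with the introduced nulls filled as zeros. A rough pandas-only stand-in:

import itertools
import pandas as pd

# observed data only has sex_id 1
df = pd.DataFrame({'age_group_id': [5, 6], 'sex_id': [1, 1],
                   'product': [0.1, 0.2]})
# template: every age-sex combination, observed or not
template = pd.DataFrame(list(itertools.product([5, 6], [1, 2])),
                        columns=['age_group_id', 'sex_id'])
square = template.merge(df, how='left', on=['age_group_id', 'sex_id'])
square['product'] = square['product'].fillna(0)  # fill the introduced nulls
print(square)  # 4 rows; the unobserved sex_id 2 rows carry zeros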
Example #8
# `files` is a list of HDF file paths defined earlier in the script
df_list = []
for f in files:
    temp = pd.read_hdf(f)
    df_list.append(temp)
df_orig = pd.concat(df_list, ignore_index=True)
del df_list, temp

df = df_orig.copy()

df = drop_data_for_outpatient(df)
print "shape is {}".format(df.shape)
print "facility_id: {}".format(df.facility_id.value_counts())
print "source: {}".format(df.source.unique())
print "sex: {}".format(df.sex_id.unique())
df = gbd_hosp_prep.all_group_id_start_end_switcher(df, remove_cols=True)
df[['source', 'age_start', 'age_end']].drop_duplicates().sort_values(by=['source', 'age_start'])


# **fix age end**
df.loc[(df.age_start == 85)&(df.age_end == 90), ['age_start', 'age_end']] = [85, 125]
df[['source', 'age_start', 'age_end']].drop_duplicates().sort_values(['source', 'age_start'])

# ** mapping **
df = outpatient_mapping(df, which_map='current')

# ** parent inj **
checkpoint = df.copy()
df = checkpoint.copy()
df = get_parent_injuries(df.copy())
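The age-end fix above assigns a two-element list to two columns at once for every row matching the mask. That .loc pattern, checked in isolation:

import pandas as pd

df = pd.DataFrame({'age_start': [80, 85], 'age_end': [85, 90]})
df.loc[(df.age_start == 85) & (df.age_end == 90),
       ['age_start', 'age_end']] = [85, 125]
print(df)  # the 85-90 row becomes 85-125; the 80-85 row is untouched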
Example #9
def run_age_sex_splitting(df,
                          run_id,
                          gbd_round_id,
                          decomp_step,
                          round_id=0,
                          verbose=False,
                          write_viz_data=True,
                          level='icg_id',
                          weight_path=None,
                          inp_pipeline=True):
    """
    Takes in a dataframe of data that you want to split, and a list of sources
    that need splitting, and then uses age_sex_splitting.py to split. Returns
    the split data. Along the way saves pre-split and post-split data in a
    format that's good for plotting. Runs hosp_prep.drop_data() and
    hosp_prep.apply_restrictions(). Meant to be run on ALL hospital data!

    Parameters
        df: Pandas DataFrame
            The data to be split. Can contain data that doesn't need to be
            split; the age-sex splitting code checks what parts of the data
            need splitting.
        round_id: int
            Specifies what round of splitting to run. Used for file names. If
            it's set to 1 or 2, it'll save a file that can be used for
            visualization. If it's anything else nothing happens.
        verbose: Boolean
            Specifies if information about age groups before and after, and
            case counts before and after, should be printed to the screen.
        weight_path: str
            The location of the weights. If level is icg_id then it will pull
            them from within the inpatient run_id. If level is bundle_id then
            a weight path must be defined.
    """

    if write_viz_data:
        if round_id == 1 or round_id == 2:
            pre_split = df.copy()
            pre_split = gbd_hosp_prep.all_group_id_start_end_switcher(
                pre_split)
            pre_split = pre_split.merge(query("""
                                              SQL
                                              """,
                                              conn_def='shared'),
                                        how='left',
                                        on='location_id')
            pre_split.drop([
                'year_end', 'age_group_unit', 'age_end', 'nid', 'facility_id',
                'representative_id', 'diagnosis_id', 'outcome_id', 'metric_id'
            ],
                           axis=1,
                           inplace=True)
            pre_split.to_csv(r"FILENAME"
                             r"FILEPATH".format(round_id),
                             index=False,
                             encoding='utf-8')

    if inp_pipeline:
        df = hosp_prep.drop_data(df, verbose=False)

    df = clinical_mapping.apply_restrictions(df,
                                             age_set='age_group_id',
                                             cause_type=level[:-3],
                                             prod=True)

    pre_cols = df.columns

    df_list = []

    id_cols = [
        'source', 'nid', level, 'year_id', 'age_group_id', 'sex_id',
        'location_id'
    ]
    perfect_ages = [
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 28, 30, 31,
        32, 235
    ]
    perfect_sexes = [1, 2]
    rows = df.shape[0]
    numer = 0

    for source in df.source.unique():

        splitting_df = df[df.source == source].copy()
        numer += splitting_df.shape[0]
        print(source)
        print("working: {}% done".format(float(numer) / rows * 100))
        if verbose:
            print("{}'s age groups before:".format(source))
            print(splitting_df['age_group_id'].sort_values().unique())

        splitting_df['year_id'] = splitting_df['year_start']
        splitting_df.drop(['year_start', 'year_end'], axis=1, inplace=True)
        if verbose:
            print("now splitting {}".format(source))

        if set(splitting_df.age_group_id.unique()).\
                symmetric_difference(set(perfect_ages)) == set() and\
                set(splitting_df.sex_id.unique()).\
                symmetric_difference(set(perfect_sexes)) == set():
            df_list.append(splitting_df)
            continue

        split_df = split_age_sex(df=splitting_df,
                                 id_cols=id_cols,
                                 run_id=run_id,
                                 value_column='val',
                                 level_of_analysis=level,
                                 fix_gbd2016_mistake=False,
                                 gbd_round_id=gbd_round_id,
                                 decomp_step=decomp_step,
                                 weight_path=weight_path)
        if verbose:
            print("Orig value sum {} - New value sum {} = {} \n".format(
                splitting_df.val.sum(), split_df.val.sum(),
                splitting_df.val.sum() - split_df.val.sum()))
        pre_val = splitting_df.val.sum()
        post_val = split_df.val.sum()
        if pre_val - post_val > (pre_val * .005):
            warnings.warn(
                "Too many cases were lost, a {} percent change (1 - post/pre)".
                format((1 - (float(post_val) / float(pre_val))) * 100))

        if verbose:
            print("{}'s ages after:".format(source))
            print(split_df['age_group_id'].sort_values().unique())

        df_list.append(split_df)

    df = pd.concat(df_list, sort=False).reset_index(drop=True)

    # all 21 age groups in perfect_ages should be present after splitting
    assert df[['age_group_id']].drop_duplicates().shape[0] == 21

    df['year_start'] = df['year_id']
    df['year_end'] = df['year_id']

    df.drop(['year_id'], axis=1, inplace=True)
    assert set(df.columns).symmetric_difference(set(pre_cols)) == set()

    if write_viz_data:
        if round_id == 1 or round_id == 2:
            viz = df.merge(query("""
                                    SQL
                                    """,
                                 conn_def='shared'),
                           how='left',
                           on='location_id')

            viz = gbd_hosp_prep.all_group_id_start_end_switcher(viz)

            viz.drop([
                'year_end', 'age_group_unit', 'age_end', 'nid', 'facility_id',
                'representative_id', 'diagnosis_id', 'outcome_id', 'metric_id'
            ],
                     axis=1).to_csv(r"FILENAME"
                                    r"FILEPATH".format(round_id),
                                    index=False,
                                    encoding='utf-8')

    return df
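The case-loss guard inside the splitting loop tolerates up to 0.5% of cases disappearing between the pre-split and post-split totals. The check in isolation, with toy totals:

import warnings

pre_val, post_val = 1000.0, 992.0        # toy case totals, 0.8% lost
if pre_val - post_val > pre_val * .005:  # more than 0.5% lost
    warnings.warn("Too many cases were lost, a {} percent change "
                  "(1 - post/pre)".format((1 - post_val / pre_val) * 100))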
Example #10
def run_outpatient(run_id, gbd_round_id, decomp_step, run_icpc):
    """
    SETUP
    Gets the files for our three outpatient sources.  Creates a dataframe
    after reading these files and drops data that cannot be used for the
    outpatient process.

    :param df: None at this moment
    :return df_orig: the raw dataframe from reading outpatient sources
    """

    if run_icpc:
        launch_icpc(run_id, gbd_round_id, decomp_step)

    def file_name(file):
        return file.split('.')[0]

    sources = ['SWE_PATIENT_REGISTRY_98_12', 'USA_NAMCS', 'USA_NHAMCS_92_10']
    source_dir = "FILEPATH"\
                 "FILENAME".format(run_id)

    file_paths = [
        os.path.join(source_dir, file) for file in os.listdir(source_dir)
        if file_name(file) in sources
    ]
    dfs = [pd.read_hdf(file) for file in file_paths]
    df_orig = pd.concat(dfs, ignore_index=True, sort=False)
    del dfs

    assert sorted(df_orig.source.unique()) == sorted(sources),\
        "sources inconsistent after writing to df"

    location = "FILENAME" + str(run_id)

    df = otp.drop_data_for_outpatient(df_orig)
    """
    AGES
    Switches out age_group_id with age_start and age_end.  Then, age_end is
    changed to 125 for ages in the range 85 to 90 for USA_NAMCS
    and USA_NHAMCS to match SWEDEN's oldest age.
    """

    df = ghp.all_group_id_start_end_switcher(df, remove_cols=True)

    df.loc[(df.age_start == 85) &
           (df.source.isin(['USA_NAMCS', 'USA_NHAMCS_92_10'])),
           ['age_end']] = 125
    df = ghp.all_group_id_start_end_switcher(df)
    """
    MAPPING ICD TO ICG
    Implements outpatient mapping from ICD to ICG level.
    """

    df = cm.map_to_gbd_cause(df,
                             input_type='cause_code',
                             output_type='icg',
                             write_unmapped=False,
                             truncate_cause_codes=False,
                             extract_pri_dx=False,
                             prod=False,
                             groupby_output=True)

    df.to_hdf(location + "FILEPATH",
              key='df',
              complib='blosc',
              complevel=5,
              mode='w')

    out = df.copy()
    out['estimate_type'] = 'otp-any-unadjusted'
    out = out[[
        'location_id', 'year_start', 'age_group_id', 'sex_id', 'nid',
        'representative_id', 'facility_id', 'estimate_type', 'diagnosis_id',
        'icg_id', 'icg_name', 'val'
    ]]
    out = out.rename(columns={'year_start': 'year_id'})
    out[['location_id', 'year_id', 'representative_id',
         'sex_id', 'nid', 'icg_id']] = \
        out[['location_id', 'year_id', 'representative_id',
             'sex_id', 'nid', 'icg_id']].astype(int)
    """
    MAPPING ICG TO BUNDLE
    Implements outpatient mapping from ICG to bundle level.
    """

    df = cm.map_to_gbd_cause(df,
                             input_type='icg',
                             output_type='bundle',
                             write_unmapped=False,
                             truncate_cause_codes=False,
                             extract_pri_dx=False,
                             prod=False,
                             groupby_output=True)
    df = ghp.all_group_id_start_end_switcher(df)

    df.to_hdf(location + "FILEPATH",
              key='df',
              complib='blosc',
              complevel=5,
              mode='w')
    """
    INJ FACTORS
    Applies injuries, correction, and restrictions.  Checks to see if 3
    columns are correctly added to the new df.
    """
    df = otp.get_parent_injuries(df)

    df = otp.apply_outpatient_correction(df, run_id)

    (rows_before, cols_before) = df.shape
    col_names_before = df.columns

    df = otp.apply_inj_factor(df, run_id, fillna=True)

    (rows_after, cols_after) = df.shape
    assert cols_after == cols_before + 2
    assert set(df.columns).symmetric_difference(set(col_names_before))\
        == set(['val_inj_corrected', 'factor']),\
        "columns failed to add after applying inj factor"

    df = otp.outpatient_restrictions(df)
    """
    MAKE SQUARE
    The following is because of how our make zeros function works. It finds
    all the ages present, and makes every source have those ages. Because
    different outpatient sources have different ages, after being made
    square every source has every age. That's dumb, but it works in
    inpatient, so here we just fix it post hoc. - previous documentation

    Makes the data square.
    """
    df = ghp.all_group_id_start_end_switcher(df, remove_cols=True)

    swe_ages = list(
        df[df.source == "SWE_PATIENT_REGISTRY_98_12"].age_group_id.unique())
    nhamcs_ages = list(
        df[df.source == "USA_NHAMCS_92_10"].age_group_id.unique())
    namcs_ages = list(df[df.source == "USA_NAMCS"].age_group_id.unique())
    pre_check_val = df.loc[df['val'] > 0,
                           'val'].sort_values().reset_index(drop=True)
    df = hosp_prep.make_zeros(
        df,
        etiology='bundle_id',
        cols_to_square=['val', 'val_corrected', 'val_inj_corrected'],
        icd_len=5)

    source_age_dict = {
        "SWE_PATIENT_REGISTRY_98_12": swe_ages,
        "USA_NHAMCS_92_10": nhamcs_ages,
        "USA_NAMCS": namcs_ages
    }

    df_list = []
    for source in source_age_dict:
        temp = df[df.source == source].copy()
        temp = temp[temp.age_group_id.isin(source_age_dict[source])].copy()
        df_list.append(temp)
    df = pd.concat(df_list, ignore_index=True, sort=False)

    df = ghp.all_group_id_start_end_switcher(df, remove_cols=False)

    post_check_val = df.loc[df['val'] > 0,
                            'val'].sort_values().reset_index(drop=True)
    assert set(pre_check_val) - set(post_check_val) == set(), \
        "pre and post check val are different after making square"
    assert (post_check_val == pre_check_val).all(),\
        "pre and post check val are different after making square"
    """
    FIX NULLS AND DUPES
    Cleans up the data by filling in nulls and removing duplicates.
    """

    df['age_group_unit'] = 1
    df['metric_id'] = 1

    maps = cm.get_bundle_measure(prod=False)
    maps_dups = maps.loc[maps['bundle_id'].duplicated(
        keep=False)].sort_values('bundle_id')
    assert maps_dups.shape[0] == 0, (
        "Map has more than one measure for the same bundle at least once.")
    maps.rename(columns={'bundle_measure': 'measure'}, inplace=True)
    assert "measure" in maps.columns, "Rename to column measure didn't work."
    df = df.merge(maps, how='left', on='bundle_id')

    cause_id_info = query("SQL", conn_def='epi')

    null = df.loc[df.measure.isnull()]
    null = null.merge(cause_id_info, how='left', on='bundle_id')
    null['cause_id'] = null['cause_id'].astype(int)
    null_measure = null['cause_id'].unique()

    inj = get_cause_metadata(cause_set_id=3)
    inj = inj.loc[inj['cause_outline'].str[0:1] == 'C']

    for each_measure in null_measure:
        assert not inj.loc[inj['cause_id'] ==
                           each_measure].empty, "null measures aren't injuries"

    df.loc[df.measure.isnull(), 'measure'] = 'inc'

    df.loc[df.factor.isnull(), 'factor'] = 1

    assert df.isnull().sum().sum() == 0, "cannot be nulls before elmo"

    sum_cols = ['val', 'val_corrected', 'val_inj_corrected']
    sum_dict = dict(list(zip(sum_cols, ['sum'] * 3)))
    df = df.groupby(
        df.columns.drop(sum_cols).tolist()).agg(sum_dict).reset_index()

    duplicated_df = df[df[[
        'age_start', 'age_end', 'age_group_id', 'year_start', 'year_end',
        'location_id', 'sex_id', 'bundle_id', 'nid'
    ]].duplicated(keep=False)].copy()
    assert duplicated_df.shape[0] == 0, "duped rows??"

    df = otp.get_sample_size_outpatient(df,
                                        gbd_round_id=gbd_round_id,
                                        decomp_step=decomp_step)
    """
    ELMO
    """

    assert (df.loc[df.age_end > 1, 'age_end'].values %
            5 == 0).all(), "ages are not in correct bins"

    test = df[['source', 'nid', 'year_start', 'year_end']].drop_duplicates()
    nid_map = pd.read_excel("FILEPATH")
    test = test.merge(nid_map, how='left', on='nid')
    assert test.isnull().sum().sum() == 0, "there are null nids and years"
    assert test[test.merged_nid.isnull()].shape[0] == 0, \
        "there are null nids and years"

    done = df.copy()

    df['run_id'] = run_id
    df = reshape_long(df)

    df = df[df.cases < df.sample_size]
    assert 6607 not in df.bundle_id.unique(), (
        "6607 is a typo and shouldn't be in the map")

    nulls_ok_columns = ["mean", "lower", "upper"]
    test_df = df.drop(nulls_ok_columns, axis=1).copy()
    assert test_df.notnull().all().all(), "There are nulls:\n{}".format(
        test_df.isnull().sum())

    assert df[df[[
        'age_group_id', 'year_start', 'year_end', 'location_id', 'sex_id',
        'bundle_id', 'nid', 'estimate_id', 'diagnosis_id'
    ]].duplicated(keep=False)].shape[0] == 0, ("duplicate rows")

    done = otp.outpatient_elmo(done, gbd_round_id)

    nulls_ok_columns = [
        'mean', 'upper', 'lower', 'seq', 'underlying_nid', 'sampling_type',
        'recall_type_value', 'uncertainty_type', 'uncertainty_type_value',
        'input_type', 'standard_error', 'effective_sample_size',
        'design_effect', 'response_rate'
    ]
    test_df = done.drop(nulls_ok_columns, axis=1).copy()
    assert test_df.notnull().all().all(), "There are nulls:\n{}".format(
        test_df.isnull().sum())

    assert done[done[[
        'age_start', 'age_end', 'age_group_id', 'year_start', 'year_end',
        'location_id', 'sex', 'bundle_id', 'nid'
    ]].duplicated(keep=False)].shape[0] == 0, "duplicate rows"

    done.drop("age_group_id", inplace=True, axis=1)

    print("run_outpatient() has finished!")

    return done, df
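The groupby used before the duplicate check collapses repeated rows by summing the value columns over every remaining column. A self-contained version of that idiom:

import pandas as pd

df = pd.DataFrame({'bundle_id': [1, 1, 2],
                   'sex_id': [1, 1, 2],
                   'val': [3.0, 4.0, 5.0],
                   'val_corrected': [3.5, 4.5, 5.0]})
sum_cols = ['val', 'val_corrected']
sum_dict = dict(zip(sum_cols, ['sum'] * len(sum_cols)))
df = df.groupby(df.columns.drop(sum_cols).tolist()).agg(sum_dict).reset_index()
print(df)  # the two bundle_id 1 rows collapse into one with val 7.0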