Example #1
def combine_incidence(df):
    ''' Combines data by uid, preserving national_registry and full_coverage
        status for later reference
    '''
    print("   combining registries")
    uid_cols = get_uid_columns()
    combine_uids = uid_cols + [
        'country_id', 'national_registry', 'full_coverage'
    ]
    dont_combine = (df['sdi_quintile'].eq(5) & df['full_coverage'].eq(1) &
                    (df['national_registry'].eq(1)
                     | df['is_subnational'].eq(1)))
    df = df[combine_uids + [
        'sdi_quintile', 'is_subnational', 'registry_index', 'dataset_id',
        'NID', 'cases', 'pop'
    ]]
    combined_data = staging_functions.combine_uid_entries(
        df[~dont_combine], uid_cols=combine_uids, metric_cols=['cases', 'pop'])
    # Preferentially keep full_coverage data for the same uid
    existing = df.loc[dont_combine, :]
    output = pd.concat([existing, combined_data])
    # Assign the sorted result so that drop_duplicates keeps full_coverage rows
    output = output.sort_values(uid_cols + ['full_coverage'],
                                ascending=False).reset_index(drop=True)
    output = output.drop_duplicates(subset=uid_cols, keep="first")
    # Re-set sdi quintile to account for merges
    output = modeled_locations.add_sdi_quintile(output, delete_existing=True)
    assert not output.duplicated(uid_cols).any(), \
        "combine_incidence produced redundant entries"
    print("incidence estimates combined")
    return output
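For reference, a minimal sketch of what a helper like staging_functions.combine_uid_entries is assumed to do here (the real implementation is not shown in these examples): sum the metric columns within each uid group while collapsing source identifiers into tuples so provenance survives the merge.

import pandas as pd

def combine_uid_entries_sketch(df, uid_cols, metric_cols):
    # Hypothetical stand-in for staging_functions.combine_uid_entries:
    # sums metrics per uid and collapses source columns into sorted tuples.
    source_cols = [c for c in ('registry_index', 'dataset_id', 'NID')
                   if c in df.columns]
    agg = {m: 'sum' for m in metric_cols}
    agg.update({s: (lambda x: tuple(sorted(x.unique()))) for s in source_cols})
    return df.groupby(uid_cols, as_index=False).agg(agg)

demo = pd.DataFrame({'uid': ['A', 'A'], 'dataset_id': [10, 11],
                     'cases': [3.0, 4.0]})
print(combine_uid_entries_sketch(demo, ['uid'], ['cases']))
# -> uid 'A', cases 7.0, dataset_id (10, 11)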
Example #2
def project_incidence():
    ''' For each IHME location_id, projects estimates based on the input cancer
        rates. Includes generation of national estimates from subnational
        estimates where national estimates are not present
    '''
    print("   projecting data to ihme demographic specifications...")
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    pop_uids = [c for c in get_uid_columns() if c != 'acause']
    df = pd.read_csv(input_file)
    # define subset that can be projected to the IHME population
    df = modeled_locations.add_subnational_status(df)
    df = supplement_national_estimates(df)
    # Ensure validity of sdi_quintile
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Calculate rate of input
    df.loc[:, 'rate'] = df['cases'] / df['pop']
    df['registry_pop'] = df['pop']
    # Mark data to be projected
    project_to_ihme = (df['sdi_quintile'].eq(5))
    df_sdi5 = df.loc[project_to_ihme, :].copy()
    df_other = df.loc[~project_to_ihme, :].copy()
    # Add IHME population to applicable uids
    del df_sdi5['pop']
    ihme_pop = load_ihme_pop(
        list(df.loc[project_to_ihme, 'location_id'].unique()))
    df_sdi5 = df_sdi5.merge(ihme_pop)
    # Homogenize population by group where not applying IHME populations
    df_other = staging_functions.homogenize_pop(df_other, uid_cols=pop_uids)
    output = pd.concat([df_other, df_sdi5])
    # Reindex so the rate and pop series align when broadcasting below
    output = output.reset_index(drop=True)
    # Broadcast rates to the final population estimate for all locations
    output.loc[output['pop'].notnull() & output['rate'].notnull()
               & ~output['rate'].eq(np.inf),
               'cases'] = output['rate'] * output['pop']
    # Drop registry-specific tags
    output = output.drop([
        'national_registry', 'full_coverage', 'is_subnational', 'registry_pop'
    ],
                         axis=1,
                         errors='ignore')
    assert not output.duplicated(get_uid_columns()).any(), \
        "Duplicates exist after projection"
    assert not output['pop'].isnull().any(), "Missing population data"
    assert len(output) == len(df), "Error during estimate projection"
    output.to_csv(output_file, index=False)
    print("   data projected.")
    return output
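A toy illustration of the projection step above, with made-up numbers: the registry rate is held fixed while the denominator is swapped for the IHME population estimate, so cases = rate * pop re-expresses the estimate against the new population.

import pandas as pd

toy = pd.DataFrame({'location_id': [101],
                    'cases': [50.0],
                    'pop': [100000.0]})    # registry population
toy['rate'] = toy['cases'] / toy['pop']    # 0.0005
toy['pop'] = 250000.0                      # hypothetical IHME population
toy['cases'] = toy['rate'] * toy['pop']    # 125.0 projected cases
print(toy[['rate', 'pop', 'cases']])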
Example #3
def refine_by_cc_code(df):
    ''' Generates a 'cc_code' (a measure of the remaining difference between
            cancer mortality and all-cause mortality) and drops data that are
            not credible (cancer deaths > 70% of all-cause mortality)
    '''
    uid_cols = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause']]
    # Set max proportion of all-cause mortality that could possibly come from cancer
    max_pct_cancer = 0.70
    print("Entries before cc_code refinement: {}".format(len(df)))
    # Calculate cc_code as the difference between total cancer and all-cause mortality
    loc_list = df['location_id'].unique().tolist()
    loc_list = [l for l in loc_list if str(l) != 'nan']
    env = load_mortality_envelope(loc_list,
                                  df['age_group_id'].unique().tolist(),
                                  df['year_id'].unique().tolist())
    deaths_df = df.loc[
        ~df['acause'].str.contains("neo_leukemia_"), :]  # remove child causes
    deaths_df = deaths_df.groupby(uid_cols, as_index=False).agg({
        'deaths': 'sum',
        'pop': 'mean'
    }).rename(columns={'deaths': 'cancer_deaths'})
    cc_df = deaths_df.merge(
        env,
        how='inner',
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    cc_df['total_deaths'] = cc_df['death_rate'] * cc_df['pop']
    cc_df.loc[:, ['total_deaths', 'cancer_deaths']] = \
        cc_df[['total_deaths', 'cancer_deaths']].fillna(0)
    valid_estimates = (cc_df['cancer_deaths'] <=
                       max_pct_cancer * cc_df['total_deaths'])
    cc_df = cc_df.loc[valid_estimates, :]
    cc_df['deaths'] = cc_df['total_deaths'] - cc_df['cancer_deaths']
    cc_df['acause'] = "cc_code"
    cc_df['registry_index'] = "0.0.1"
    cc_df['NID'] = utils.get_gbd_parameter('generic_cancer_nid')
    cc_df['dataset_id'] = 3
    cc_df = cc_df.drop(['total_deaths', 'cancer_deaths', 'death_rate'], axis=1)
    cc_df.drop_duplicates(inplace=True)
    # Attach cc_code data to the main dataset: first subset df to only those
    #   uids with valid cc_code values, then append the full cc_code rows
    output = df.merge(cc_df[uid_cols], how='inner')
    print("Entries after cc_code refinement: {}".format(len(output)))
    output = pd.concat([output, cc_df])
    # Re-set sdi quintile to account for merges
    output = modeled_locations.add_sdi_quintile(output, delete_existing=True)
    print("Final entries with cc_code attached: {}".format(len(output)))
    assert not output.duplicated(get_uid_cols()).any(), \
        "Duplicate entries present at end of refine_by_cc_code"
    assert not output['deaths'].isnull().any(), \
        "Mortality estimates lost while calculating cc_code"
    return output
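A worked example of the cc_code arithmetic with made-up values, assuming the envelope supplies an all-cause death_rate per demographic group:

import pandas as pd

toy = pd.DataFrame({'pop':           [10000.0, 10000.0],
                    'death_rate':    [0.01,    0.01],   # all-cause rate
                    'cancer_deaths': [30.0,    90.0]})
toy['total_deaths'] = toy['death_rate'] * toy['pop']    # 100 deaths each
valid = toy['cancer_deaths'] <= 0.70 * toy['total_deaths']
toy = toy.loc[valid]                      # second row dropped: 90 > 70
toy['deaths'] = toy['total_deaths'] - toy['cancer_deaths']  # cc_code = 70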
Example #4
def add_subdiv(df):
    ''' Creates "site" information ('subdiv') displayed by CoD vis, plus
        source information
    '''
    print("adding subdiv (site labels)...")

    def _format_registries(reg_tup):
        if not isinstance(reg_tup, tuple):
            try:
                reg_tup = literal_eval(reg_tup)
            except (ValueError, SyntaxError):
                reg_tup = tuple(reg_tup)
        # remove country_id and any indexes for "[combined/multi] registry"
        reg_list = [
            r[(r.find(".") + 1):] for r in reg_tup
            if not r.startswith("0.0.") and not r.startswith("0.1.")
        ]
        return ", ".join(reg_list)

    def _format_dataset_id(ds_tup):
        if not isinstance(ds_tup, tuple):
            try:
                ds_tup = literal_eval(ds_tup)
            except (ValueError, SyntaxError):
                ds_tup = tuple(ds_tup)
        # Stringify entries so join succeeds even if dataset_ids are ints
        return ", ".join(str(d) for d in ds_tup)

    input_len = len(df)

    subdiv_uids = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause', 'age_group_id']]
    df = staging.combine_uid_entries(df,
                                     subdiv_uids,
                                     metric_cols=['deaths'],
                                     collapse_metrics=False)
    # Re-set sdi quintile to account for merges
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Generate subdiv values, unique to each cause
    df['subdiv'] = df['dataset_id'].apply(_format_dataset_id) + \
        ": " + df['registry_index'].apply(_format_registries)
    df.loc[df['subdiv'].str.len() >= 200,
           'subdiv'] = df.loc[df['subdiv'].str.len() >= 200,
                              'subdiv'].str[:197] + "..."
    # Test output
    assert len(df) == input_len, \
        "Error generating source label. Entries are not consistent"
    assert not df.duplicated(get_uid_cols()).any(), \
        "Duplicate values present at end of add_subdiv"
    return df
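For illustration, how _format_registries transforms a stored tuple (the values here are invented): combined-registry placeholders (0.0.*, 0.1.*) are dropped and the country prefix is stripped from each remaining index.

reg_tup = ("163.0.1", "0.0.2", "101.5.3")
reg_list = [r[(r.find(".") + 1):] for r in reg_tup
            if not r.startswith("0.0.") and not r.startswith("0.1.")]
print(", ".join(reg_list))   # -> "0.1, 5.3"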
Example #5
def add_required_columns(df):
    ''' Adds columns whose information is required throughout the process.
            These values must be populated for all uids throughout the pipeline
    '''
    # Add country_id and SDI_quintile
    df = modeled_locations.add_country_id(df)
    df.loc[df['registry_index'].str.startswith("163."), 'country_id'] = 163
    df.loc[df['dataset_id'].eq(391)
           & df['registry_index'].str.startswith("63."), 'location_id'] = 63
    df = modeled_locations.add_sdi_quintile(df)
    # Mark data that are modeled subnationally. Ensure that
    #   (location_id == country_id) for uids that are modeled only nationally
    df = modeled_locations.add_subnational_status(df)
    df = staging_functions.add_coverage_metadata(df)
    df.loc[~df['is_subnational'].eq(1), 'location_id'] = df['country_id']
    for col in ['national_registry', 'full_coverage']:
        df[col] = df[col].fillna(0)
    return df
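A toy example of the location collapse in the middle of this function (ids are made up): rows that are not modeled subnationally have their location_id replaced by the country_id.

import pandas as pd

toy = pd.DataFrame({'location_id':    [523, 6],
                    'country_id':     [102, 6],
                    'is_subnational': [0,   1]})
toy.loc[~toy['is_subnational'].eq(1), 'location_id'] = toy['country_id']
print(toy['location_id'].tolist())   # -> [102, 6]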
Example #6
def _add_ihme_pop_marker(df):
    ''' Returns the dataframe with an added 'ihme_pop_ok' column indicating whether
        ihme population estimates may be merged with the uid
    '''
    if 'sdi_quintile' not in df.columns:
        df = modeled_locations.add_sdi_quintile(df)
    if 'full_coverage' not in df.columns:
        df = add_coverage_metadata(df)
    ds_df = cdb.db_api().get_table("dataset")
    df.loc[:, 'ihme_pop_ok'] = 0
    for dsid in df['dataset_id'].unique():
        pop_ok = ds_df.loc[ds_df['dataset_id'] ==
                           dsid, 'can_use_ihme_pop'].values[0]
        if pop_ok == 1:
            df.loc[df['dataset_id'] == dsid, 'ihme_pop_ok'] = pop_ok
    ihme_pop_ok = (df['sdi_quintile'].eq(5) & df['full_coverage'].eq(1))
    df.loc[ihme_pop_ok, 'ihme_pop_ok'] = 1
    return df
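The per-dataset loop above could also be expressed as a single merge against the dataset table; a sketch, assuming ds_df carries dataset_id and can_use_ihme_pop columns as the loop implies (the frames below are invented):

import pandas as pd

# Hypothetical dataset table with the columns the loop relies on
ds_df = pd.DataFrame({'dataset_id': [1, 2], 'can_use_ihme_pop': [1, 0]})
df = pd.DataFrame({'dataset_id': [1, 1, 2, 3]})
flags = ds_df[['dataset_id', 'can_use_ihme_pop']].drop_duplicates()
df = df.merge(flags, on='dataset_id', how='left')
df['ihme_pop_ok'] = (df['can_use_ihme_pop'].fillna(0) == 1).astype(int)
df = df.drop(columns='can_use_ihme_pop')
print(df['ihme_pop_ok'].tolist())   # -> [1, 1, 0, 0]

The sdi_quintile/full_coverage override at the end of the function would still apply on top of this.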
Example #7
def apply_recode(df):
    ''' Applies the recode to mortality estimates (different from the incidence
            recode), then recombines the recoded mortality data
        -- Note: population should be unique by uid at this point, and should be
            included with the uid
    '''
    print("    recoding deaths...")
    input_framework = df[['location_id', 'year_id',
                          'sex_id']].drop_duplicates()
    df = df.loc[df['acause'].str.startswith("neo_"), :]
    # subset to make the recode faster
    young_ages = df.loc[df['age_group_id'] < 10, :]
    young_ages = recode(young_ages, data_type_id=3)
    uid_cols = get_uid_columns() + ['country_id']
    df = df[uid_cols +
            ['registry_index', 'dataset_id', 'NID', 'deaths', 'pop']]
    adults = df.loc[df['age_group_id'] >= 10,
                    [c for c in df.columns if c in young_ages.columns]]
    recoded = pd.concat([adults, young_ages])
    recoded = recoded[uid_cols +
                      ['registry_index', 'dataset_id', 'NID', 'deaths', 'pop']]
    # Unify data-source information and combine the recoded data with the rest
    print("    recombining re-coded data...")
    recoded = staging.combine_uid_entries(recoded,
                                          uid_cols + ['pop'],
                                          metric_cols=['deaths'])
    # Re-set sdi quintile to account for merges
    recoded = modeled_locations.add_sdi_quintile(recoded, delete_existing=True)
    # Test output
    check_len = len(recoded)
    recoded = recoded.merge(input_framework, how='outer')
    assert len(recoded) == check_len, \
        "Some uids lost after recode"  # ensure that no "null" entries are added on outer merge
    assert not recoded.duplicated(get_uid_columns()).any(), \
        "Duplicates exist after recode"
    assert not (recoded['deaths'] < 0).any(), "Erroneous death values exist"
    return recoded
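The outer-merge check at the end rests on a simple invariant: merging with the input location/year/sex framework only changes the row count if some uid disappeared during recoding. A toy demonstration with invented frames:

import pandas as pd

framework = pd.DataFrame({'location_id': [1, 2],
                          'year_id': [2010, 2010],
                          'sex_id': [1, 1]})
recoded = pd.DataFrame({'location_id': [1], 'year_id': [2010],
                        'sex_id': [1], 'deaths': [5.0]})
merged = recoded.merge(framework, how='outer')
# len(merged) == 2 != len(recoded) == 1: location 2 was lost, so the
# pipeline's length assertion would fail here.
print(len(merged), len(recoded))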
Example #8
def supplement_national_estimates(df):
    ''' Combines subnational estimates to create national estimates, then
        removes redundancy. These estimates are used for validation only
    '''
    print("      generating national estimations...")
    uid_cols = get_uid_columns()
    output_cols = uid_cols + ['cases', 'pop', 'dataset_id', 'NID']
    combine_uids = uid_cols + ['is_subnational', 'national_registry']
    # Subset to data used to create national projections.
    to_combine = (df['is_subnational'].eq(1) & ~df['national_registry'].eq(1)
                  & ~df['location_id'].eq(354))
    est_df = df.loc[to_combine, :].copy()  # copy to avoid chained assignment
    est_df.loc[:, 'location_id'] = est_df['country_id']
    est_df = staging_functions.combine_uid_entries(
        est_df, uid_cols=combine_uids, metric_cols=['cases', 'pop'])
    est_df['country_id'] = est_df['location_id']
    # Add sdi quintile information back to new estimates
    est_df = modeled_locations.add_sdi_quintile(est_df, delete_existing=True)
    # Preferentially keep existing data for the same uid
    df = pd.concat([df, est_df])
    # Assign the sorted result so drop_duplicates prefers national_registry rows
    df = df.sort_values(uid_cols + ['national_registry'],
                        ascending=False).reset_index(drop=True)
    df = df.drop_duplicates(subset=uid_cols, keep="first")
    return df
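A toy example of the keep-existing-data pattern used at the end (sort descending on the preference flag, then keep the first row per uid); the numbers are made up:

import pandas as pd

toy = pd.DataFrame({'uid':               ['A', 'A'],
                    'national_registry': [0,    1],   # 1 = existing national data
                    'cases':             [12,  10]})  # 12 = aggregated subnational
toy = toy.sort_values(['uid', 'national_registry'],
                      ascending=False).reset_index(drop=True)
toy = toy.drop_duplicates(subset=['uid'], keep='first')
print(toy)   # keeps the national_registry == 1 row (cases == 10)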