Exemplo n.º 1
def combine_incidence(df):
    ''' Combines data by uid, preserving national_registry and full_coverage
            status for later reference

        Rows from SDI quintile 5 with full coverage that are also either a
        national registry or subnationally modeled are kept untouched. All
        other rows are passed to staging_functions.combine_uid_entries with
        'cases' and 'pop' as the metric columns. When a uid is then present
        more than once, the entry with the highest full_coverage is kept.

        Args:
            df: DataFrame holding the uid columns (get_uid_columns()) plus
                'country_id', 'national_registry', 'full_coverage',
                'sdi_quintile', 'is_subnational', 'registry_index',
                'dataset_id', 'NID', 'cases' and 'pop'
        Returns:
            DataFrame with a single row per uid and a refreshed sdi_quintile
        Raises:
            AssertionError: if duplicate uids remain after combining
    '''
    # Local import: DataFrame.append was removed in pandas 2.0
    import pandas as pd

    print("   combining registries")
    uid_cols = get_uid_columns()
    combine_uids = uid_cols + [
        'country_id', 'national_registry', 'full_coverage'
    ]
    # NOTE(review): the first operand of the parenthesised OR was missing from
    #   the source; 'national_registry' is inferred from the columns this
    #   function preserves -- confirm against the upstream code
    dont_combine = (df['sdi_quintile'].eq(5) & df['full_coverage'].eq(1) &
                    (df['national_registry'].eq(1)
                     | df['is_subnational'].eq(1)))
    df = df[combine_uids + [
        'sdi_quintile', 'is_subnational', 'registry_index', 'dataset_id',
        'NID', 'cases', 'pop'
    ]]
    combined_data = staging_functions.combine_uid_entries(
        df[~dont_combine], uid_cols=combine_uids, metric_cols=['cases', 'pop'])
    # Preferentially keep full_coverage data for the same uid: sort the
    #   full_coverage flag in descending order so that drop_duplicates
    #   (keep="first") retains the full-coverage entry. The sorted frame must
    #   be re-assigned because sort_values is not in-place by default
    existing = df.loc[dont_combine, :]
    output = pd.concat([existing, combined_data])
    output = output.sort_values(
        uid_cols + ['full_coverage'],
        ascending=[True] * len(uid_cols) + [False])
    output = output.drop_duplicates(subset=uid_cols, keep="first")
    # Re-set sdi quintile to account for merges
    output = modeled_locations.add_sdi_quintile(output, delete_existing=True)
    # Test the duplicate mask itself rather than the values of duplicated rows
    assert not output.duplicated(uid_cols).any(), \
        "combine_incidence produced redundant entries"
    print("incidence estimates combined")
    return output
Exemplo n.º 2
def project_incidence():
    ''' Projects combined incidence onto the final population estimates and
        saves the result

        Reads the "combined_incidence" file, supplements it with national
        estimates generated from subnational data (see
        supplement_national_estimates), computes a rate from 'cases' and
        'pop', and then replaces 'pop':
            - SDI quintile 5 rows receive the population from load_ihme_pop
            - all other rows are passed to staging_functions.homogenize_pop
        'cases' is finally recomputed as rate * pop wherever both are usable.

        Returns:
            DataFrame of projected estimates (also written to the
            "projected_incidence" file)
        Raises:
            AssertionError: if uids are duplicated, population is missing, or
                the number of rows changes during projection
    '''
    print("   projecting data to ihme demographic specifications...")
    # NOTE(review): the second argument of this call was missing from the
    #   source; process="cod_mortality" is inferred from the input_file call
    #   directly below -- confirm
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    pop_uids = [c for c in get_uid_columns() if c != 'acause']
    df = pd.read_csv(input_file)
    # define subset that can be projected to the IHME population
    df = modeled_locations.add_subnational_status(df)
    df = supplement_national_estimates(df)
    # Ensure validity of sdi_quintile
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Calculate rate of input and keep the registry population for reference
    df['rate'] = df['cases'] / df['pop']
    df['registry_pop'] = df['pop']
    # Mark data to be projected
    project_to_ihme = df['sdi_quintile'].eq(5)
    df_sdi5 = df.loc[project_to_ihme, :].drop('pop', axis=1)
    df_other = df.loc[~project_to_ihme, :].copy()
    # Add IHME population to applicable uids. merge() joins on the columns
    #   shared by both frames; rows without a match are dropped and are caught
    #   by the length assertion below
    ihme_pop = load_ihme_pop(list(df_sdi5['location_id'].unique()))
    df_sdi5 = df_sdi5.merge(ihme_pop)
    # Homogenize population by group where not applying IHME populations
    df_other = staging_functions.homogenize_pop(df_other, uid_cols=pop_uids)
    output = pd.concat([df_other, df_sdi5])
    # reindex to allow multiplying series: the concatenated frame can carry
    #   repeated index labels, so create a new unique index
    output['index'] = np.arange(len(output))
    output = output.set_index('index')
    # Broadcast rates to the final population estimate for all locations.
    #   np.isfinite rejects NaN as well as +/- infinity
    can_project = output['pop'].notnull() & np.isfinite(output['rate'])
    output.loc[can_project, 'cases'] = output['rate'] * output['pop']
    # Drop registry-specific tags
    # NOTE(review): the closing of this call was missing from the source;
    #   axis=1 is inferred because the listed names are columns -- confirm
    output = output.drop([
        'national_registry', 'full_coverage', 'is_subnational', 'registry_pop'
    ], axis=1)
    assert not output.duplicated(get_uid_columns()).any(), \
        "Duplicates exist after projection"
    assert not output['pop'].isnull().any(), "Missing population data"
    assert len(output) == len(df), "Error during estimate projection"
    output.to_csv(output_file, index=False)
    print("   data projected.")
    return output
Exemplo n.º 3
def refine_by_cc_code(df):
    ''' Generates a 'cc_code' (measure of the remaining difference between cancer
            mortality and all-cause mortality) and drops data that are not
            credible (cancer deaths > 70% of all-cause mortality)

        Args:
            df: DataFrame of mortality estimates containing 'country_id', the
                uid columns (get_uid_cols()), 'deaths' and 'pop'
        Returns:
            DataFrame limited to the uids with a credible cc_code, with the
            cc_code rows (acause == "cc_code") appended
        Raises:
            AssertionError: if uids are duplicated or any 'deaths' value is
                missing in the result
    '''
    # Local import: DataFrame.append was removed in pandas 2.0
    import pandas as pd

    uid_cols = ['country_id'] + \
        [c for c in get_uid_cols() if c != 'acause']
    # Set max proportion of all-cause mortality that could possibly come from cancer
    max_pct_cancer = 0.70
    print("Entries before cc_code refinement: {}".format(len(df)))
    # Calculate cc_code as the difference between total cancer and all-cause mortality
    loc_list = [
        loc for loc in df['location_id'].unique().tolist()
        if str(loc) != 'nan'
    ]
    # NOTE(review): this call was truncated in the source after its first
    #   argument; any further arguments must be restored from the upstream
    #   code before this function is used
    env = load_mortality_envelope(loc_list)
    is_child_cause = df['acause'].str.contains("neo_leukemia_")
    deaths_df = df.loc[~is_child_cause, :]  # remove child causes
    deaths_df = deaths_df.groupby(uid_cols, as_index=False).agg({
        'deaths': 'sum',
        'pop': 'mean'
    }).rename(columns={'deaths': 'cancer_deaths'})
    # NOTE(review): the first argument of merge was missing from the source;
    #   'env' is inferred because it is otherwise unused and 'death_rate' is
    #   read from the merged frame -- confirm
    cc_df = deaths_df.merge(
        env,
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    cc_df['total_deaths'] = cc_df['death_rate'] * cc_df['pop']
    cc_df[['total_deaths', 'cancer_deaths']] = \
        cc_df[['total_deaths', 'cancer_deaths']].fillna(0)
    valid_estimates = (cc_df['cancer_deaths'] <=
                       max_pct_cancer * cc_df['total_deaths'])
    # copy so that the column assignments below do not act on a slice
    cc_df = cc_df.loc[valid_estimates, :].copy()
    cc_df['deaths'] = cc_df['total_deaths'] - cc_df['cancer_deaths']
    cc_df['acause'] = "cc_code"
    cc_df['registry_index'] = "0.0.1"
    cc_df['NID'] = utils.get_gbd_parameter('generic_cancer_nid')
    cc_df['dataset_id'] = 3
    cc_df = cc_df.drop(['total_deaths', 'cancer_deaths', 'death_rate'], axis=1)
    # Attach cc_code data to main dataset and return. First subset df to only
    #   those uids with valid cc_code values, then append the full cc_code values
    output = df.merge(cc_df[uid_cols], how='inner')
    print("Entries after cc_code refinement: {}".format(len(output)))
    output = pd.concat([output, cc_df])
    # Refresh sdi_quintile on the frame that is returned; the original
    #   refreshed (and then validated) the input frame, which is discarded
    output = modeled_locations.add_sdi_quintile(output, delete_existing=True)
    print("Final entries with cc_code attached: {}".format(len(output)))
    assert not output.duplicated(get_uid_cols()).any(), \
        "Duplicate entries present at end of refine_by_cc_code"
    assert not output['deaths'].isnull().any(), \
        "Mortality estimates lost while calulating cc_code"
    return output
Exemplo n.º 4
def add_subdiv(df):
    ''' Creates "site" information ('subdiv') displayed by CoD vis and source
            information

        'subdiv' is built as "<dataset ids>: <registries>" from the
        'dataset_id' and 'registry_index' columns and is capped at 200
        characters (longer labels are cut to 197 characters plus "...").

        Args:
            df: DataFrame containing 'country_id', the uid columns
                (get_uid_cols()), 'dataset_id' and 'registry_index'
        Returns:
            DataFrame with the added 'subdiv' column and refreshed sdi_quintile
        Raises:
            AssertionError: if the number of rows changes or uids are
                duplicated
    '''
    print("adding subdiv (site labels)...")

    def _as_tuple(value):
        # Values may arrive as the string form of a tuple/list
        if isinstance(value, tuple):
            return value
        return tuple(literal_eval(value))

    def _format_registries(reg_tup):
        # remove country_id and any indexes for "[combined/muli] registry"
        reg_list = [
            r[(r.find(".") + 1):] for r in _as_tuple(reg_tup)
            if not r.startswith(("0.0.", "0.1."))
        ]
        return ", ".join(reg_list)

    def _format_dataset_id(ds_tup):
        # str() each id: literal_eval yields ints for numeric ids, which
        #   str.join would reject
        return ", ".join(str(d) for d in _as_tuple(ds_tup))

    input_len = len(df)

    subdiv_uids = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause', 'age_group_id']]
    # NOTE(review): this call was truncated in the source after 'df'.
    #   subdiv_uids is passed because it is otherwise unused; any further
    #   arguments must be restored from the upstream code
    df = staging.combine_uid_entries(df, subdiv_uids)
    # Re-set sdi quintile to account for merges
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Generate subdiv values, unique to each cause
    df['subdiv'] = df['dataset_id'].apply(_format_dataset_id) + \
        ": " + df['registry_index'].apply(_format_registries)
    too_long = df['subdiv'].str.len() >= 200
    df.loc[too_long, 'subdiv'] = df.loc[too_long, 'subdiv'].str[:197] + "..."
    # Test output
    assert len(df) == input_len, \
        "Error generating source label. Entries are not consistent"
    assert not df.duplicated(get_uid_cols()).any(), \
        "Duplicate values present at end of add_subdiv"
    return df
Exemplo n.º 5
def add_required_columns(df):
    ''' Add columns whose information is required throughout the process
            These values must be populated for all uids throughout the pipeline

        Args:
            df: DataFrame containing 'registry_index' and 'location_id'
        Returns:
            DataFrame with country_id, sdi_quintile, subnational status and
            coverage metadata attached. 'national_registry' and
            'full_coverage' contain 0 where they were missing, and
            location_id equals country_id for rows not marked subnational
    '''
    # Add country_id and SDI_quintile
    df = modeled_locations.add_country_id(df)
    # na=False keeps the masks boolean when registry_index has missing values
    in_163 = df['registry_index'].str.startswith("163.", na=False)
    in_63 = df['registry_index'].str.startswith("63.", na=False)
    df.loc[in_163, 'country_id'] = 163
    # NOTE(review): the opening of this statement was missing from the source.
    #   A null-location_id guard is inferred as the lost first condition; it is
    #   the conservative choice because it never overwrites an existing
    #   location_id -- confirm against the upstream code
    df.loc[df['location_id'].isnull() & in_63, 'location_id'] = 63
    df = modeled_locations.add_sdi_quintile(df)
    # Mark data that are modeled subnationally. Ensure that
    #   (location_id == country_id) for uids that are modeled only nationally
    df = modeled_locations.add_subnational_status(df)
    df = staging_functions.add_coverage_metadata(df)
    df.loc[~df['is_subnational'].eq(1), 'location_id'] = df['country_id']
    # Missing coverage flags are treated as "not set"
    flag_cols = ['national_registry', 'full_coverage']
    df[flag_cols] = df[flag_cols].fillna(0)
    return df
Exemplo n.º 6
def _add_ihme_pop_marker(df):
    ''' Returns the dataframe with an added 'ihme_pop_ok' column indicating whether
        ihme population estimates may be merged with the uid

        'ihme_pop_ok' is 1 when the row's dataset has can_use_ihme_pop == 1 in
        the "dataset" table, or when the row satisfies the location-based rule
        below; otherwise it is 0. Dataset ids that are absent from the table
        are marked 0 rather than raising an IndexError.

        Args:
            df: DataFrame containing 'dataset_id'. 'sdi_quintile' and
                'full_coverage' are added if they are not present
        Returns:
            DataFrame with the added 'ihme_pop_ok' column
    '''
    if 'sdi_quintile' not in df.columns:
        df = modeled_locations.add_sdi_quintile(df)
    if 'full_coverage' not in df.columns:
        df = add_coverage_metadata(df)
    ds_df = cdb.db_api().get_table("dataset")
    # Look up the flag of each dataset; the first table row per dataset_id is
    #   used, matching the previous `.values[0]` behaviour
    dataset_flag = ds_df.drop_duplicates('dataset_id').set_index(
        'dataset_id')['can_use_ihme_pop']
    df['ihme_pop_ok'] = df['dataset_id'].map(dataset_flag).eq(1).astype(int)
    # NOTE(review): the second operand of this mask was missing from the
    #   source; 'full_coverage' is inferred because the function ensures the
    #   column exists but does not otherwise use it -- confirm
    location_ok = (df['sdi_quintile'].isin([5]) &
                   df['full_coverage'].isin([1]))
    df.loc[location_ok, 'ihme_pop_ok'] = 1
    # The original text had no return statement although the docstring
    #   promises the dataframe
    return df
Exemplo n.º 7
def apply_recode(df):
    ''' Apply recode to mortality estimates (different than incidence recode),
            then recombine recoded mortality data
        -- Note: population should be unique by uid at this point, and should be
            included with the uid

        Only causes starting with "neo_" are kept, and only rows with
        age_group_id < 10 are passed through recode().

        Args:
            df: DataFrame containing the uid columns (get_uid_columns()),
                'country_id', 'registry_index', 'dataset_id', 'NID',
                'deaths' and 'pop'
        Returns:
            DataFrame of recoded and recombined mortality data
        Raises:
            AssertionError: if uids are lost or duplicated, or if negative
                death counts are present
    '''
    # Local import: DataFrame.append was removed in pandas 2.0
    import pandas as pd

    print("    recoding deaths...")
    # NOTE(review): this selection was truncated in the source after
    #   'year_id'. The remaining demographic columns and the de-duplication
    #   are inferred from the outer-merge length check below -- confirm
    input_framework = df[['location_id', 'year_id',
                          'sex_id', 'age_group_id']].drop_duplicates()
    df = df.loc[df['acause'].str.startswith("neo_"), :]
    # subset to make the recode faster
    young_ages = df.loc[df['age_group_id'] < 10, :]
    young_ages = recode(young_ages, data_type_id=3)
    uid_cols = get_uid_columns() + ['country_id']
    keep_cols = uid_cols + \
        ['registry_index', 'dataset_id', 'NID', 'deaths', 'pop']
    df = df[keep_cols]
    shared_cols = [c for c in df.columns if c in young_ages.columns]
    adults = df.loc[df['age_group_id'] >= 10, shared_cols]
    recoded = pd.concat([adults, young_ages])
    recoded = recoded[keep_cols]
    # Ensure unify data source information and combine recoded data with other data
    print("    recombining re-coded data...")
    # NOTE(review): this call was truncated in the source after its second
    #   argument; metric_cols=['deaths'] is inferred from the columns retained
    #   above ('pop' is already part of the uid) -- confirm
    recoded = staging.combine_uid_entries(recoded,
                                          uid_cols + ['pop'],
                                          metric_cols=['deaths'])
    # Re-set sdi quintile to account for merges
    recoded = modeled_locations.add_sdi_quintile(recoded, delete_existing=True)
    # Test output
    check_len = len(recoded)
    recoded = recoded.merge(input_framework, how='outer')
    assert len(recoded) == check_len, \
        "Some uids lost after recode"  # ensure that no "null" entries are added on outer merge
    assert not recoded.duplicated(get_uid_columns()).any(), \
        "Duplicates exist after recode"
    assert not (recoded['deaths'] < 0).any(), "Erroneous death values exist"
    return recoded
Exemplo n.º 8
def supplement_national_estimates(df):
    ''' Combines subnational estimates to create national estimates, then
        removes redundancy. These estimates are used for validation only

        Subnational rows that are not national registries (location_id 354
        excluded) are relabelled with their country_id and passed to
        staging_functions.combine_uid_entries with 'cases' and 'pop' as the
        metric columns. The results are added to the input, and where a uid
        is then present more than once the entry with the highest
        national_registry value is kept.

        Args:
            df: DataFrame containing the uid columns (get_uid_columns()),
                'country_id', 'is_subnational', 'national_registry', 'cases'
                and 'pop'
        Returns:
            DataFrame with a single row per uid, including the generated
            national estimates
    '''
    # Local import: DataFrame.append was removed in pandas 2.0
    import pandas as pd

    print("      generating national estimations...")
    uid_cols = get_uid_columns()
    combine_uids = uid_cols + ['is_subnational', 'national_registry']
    # Subset to data used to create national projections.
    to_combine = (df['is_subnational'].eq(1) & ~df['national_registry'].eq(1)
                  & ~df['location_id'].eq(354))
    # copy so that relabelling the location does not act on a slice of df
    est_df = df.loc[to_combine, :].copy()
    est_df['location_id'] = est_df['country_id']
    est_df = staging_functions.combine_uid_entries(
        est_df, uid_cols=combine_uids, metric_cols=['cases', 'pop'])
    est_df['country_id'] = est_df['location_id']
    # Add sdi quintile information back to new estimates
    est_df = modeled_locations.add_sdi_quintile(est_df, delete_existing=True)
    # Preferentially keep existing data for the same uid: sort
    #   national_registry in descending order so that drop_duplicates
    #   (keep="first") retains flagged national data. The sorted frame must be
    #   re-assigned because sort_values is not in-place by default
    # NOTE(review): the remainder of the sort_values call was missing from the
    #   source; the sort direction is inferred from the comment above
    df = pd.concat([df, est_df])
    df = df.sort_values(
        uid_cols + ['national_registry'],
        ascending=[True] * len(uid_cols) + [False])
    df = df.drop_duplicates(subset=uid_cols, keep="first")
    return df