def save_model_results(df, metric_name, acause):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df: DataFrame holding the uid columns and the data columns of the
                metric. A 'me_tag' column is read for prevalence; for incidence
                the column is overwritten in place with 'primary_phase'
            metric_name: "incidence" or "prevalence"
            acause: cause passed to the nonfatalDataset, modelable-entity
                lookup and save_outputs calls

        Raises:
            ValueError: if metric_name has no associated measure_id parameter
    '''
    # Validate the metric before doing any work. Previously an unsupported
    #   metric left measure_id unbound and only failed once a save was attempted
    measure_parameters = {
        "incidence": 'incidence_measure_id',
        "prevalence": 'prevalence_measure_id',
    }
    if metric_name not in measure_parameters:
        raise ValueError(
            "Unsupported metric_name '{}'; expected one of {}".format(
                metric_name, sorted(measure_parameters)))
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns(metric_name)
    draw_cols = nd.get_columns("draw_cols")
    measure_id = utils.get_gbd_parameter(measure_parameters[metric_name])
    if metric_name == "incidence":
        # NOTE: modifies the caller's dataframe (unchanged from prior behavior)
        df.loc[:, 'me_tag'] = 'primary_phase'
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            # tags without a modelable entity are not saved
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        # Copy the subset so that renaming and adding columns below never
        #   operates on a slice of df
        output_data = df.loc[df['me_tag'].eq(this_tag),
                             uid_cols + data_cols].copy()
        # Metric-specific data columns are saved under the generic draw names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
示例#2
0
def save_worker(target_id, description, input_dir):
    ''' Passes the 'death_{location_id}.csv' files in input_dir to
            save_results_cod for cause target_id, covering both sexes and every
            year from 1980 through the 'max_year' gbd parameter, and marks the
            upload as best
    '''
    print("saving {}...".format(description))
    final_year = int(utils.get_gbd_parameter("max_year"))
    upload_settings = {
        'input_dir': input_dir,
        'input_file_pattern': 'death_{location_id}.csv',
        'cause_id': target_id,
        'description': description,
        'sex_id': [1, 2],
        'metric_id': 1,
        'year_id': list(range(1980, final_year + 1)),
        'mark_best': True,
    }
    save_results_cod(**upload_settings)
    print("model saved.")
示例#3
0
def refine_by_cc_code(df):
    ''' Generates a 'cc_code' (measure of the remaining difference between cancer
            mortality and all-cause mortality) and drops data that are not
            credible (cancer deaths > 70% of all-cause mortality)

        Args:
            df: DataFrame with 'acause', 'deaths', 'pop', 'country_id',
                'location_id', 'age_group_id', 'year_id' and the remaining
                columns named by get_uid_cols()

        Returns:
            DataFrame containing the rows of df whose uids passed the
            credibility check, plus one 'cc_code' row for each of those uids
    '''
    import pandas as pd  # function-scope so the block does not rely on a file-level alias

    uid_cols = ['country_id'] + \
        [c for c in get_uid_cols() if c not in ['acause']]
    # Set max proportion of all-cause mortality that could possibly come from cancer
    max_pct_cancer = 0.70
    print("Entries before cc_code refinement: {}".format(len(df)))
    # Calculate cc_code as the difference between total cancer and all-cause mortality
    env = load_mortality_envelope(df['location_id'].unique().tolist(),
                                  df['age_group_id'].unique().tolist(),
                                  df['year_id'].unique().tolist())
    deaths_df = df.loc[
        ~df['acause'].str.contains("neo_leukemia_"), :]  # remove child causes
    deaths_df = deaths_df.groupby(uid_cols, as_index=False).agg({
        'deaths': 'sum',
        'pop': 'mean'
    }).rename(columns={'deaths': 'cancer_deaths'})
    cc_df = deaths_df.merge(
        env,
        how='inner',
        on=['location_id', 'year_id', 'sex_id', 'age_group_id'])
    cc_df['total_deaths'] = cc_df['death_rate'] * cc_df['pop']
    cc_df.loc[:, ['total_deaths', 'cancer_deaths']] = \
        cc_df[['total_deaths', 'cancer_deaths']].fillna(0)
    valid_estimates = (cc_df['cancer_deaths'] <=
                       max_pct_cancer * cc_df['total_deaths'])
    # Copy the filtered rows so the column assignments below act on an
    #   independent frame rather than a slice
    cc_df = cc_df.loc[valid_estimates, :].copy()
    cc_df['deaths'] = cc_df['total_deaths'] - cc_df['cancer_deaths']
    cc_df['acause'] = "cc_code"
    cc_df['registry_index'] = "(\'0.0.0\',)"
    cc_df['NID'] = utils.get_gbd_parameter('generic_cancer_nid')
    cc_df['dataset_id'] = 3
    cc_df = cc_df.drop(['total_deaths', 'cancer_deaths', 'death_rate'], axis=1)
    cc_df = cc_df.drop_duplicates()
    # Attach cc_code data to main dataset and return. First subset df to only
    #   those uids with valid cc_code values, then append the full cc_code values
    output = df.merge(cc_df[uid_cols], how='inner')
    print("Entries after cc_code refinement: {}".format(len(output)))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    output = pd.concat([output, cc_df])
    # NOTE(review): the sdi quintile was previously attached to `df`, which is
    #   never returned, so the call had no effect on the result. It is applied
    #   to `output` here on the assumption that the returned data (including
    #   the new cc_code rows) was the intended target - confirm with callers
    output = modeled_locations.add_sdi_quintile(output, delete_existing=True)
    print("Final entries with cc_code attached: {}".format(len(output)))
    # Test the duplicate mask itself; checking the truthiness of the duplicated
    #   rows' values could miss duplicates whose values are all zero/empty
    assert not output.duplicated(get_uid_cols()).any(), \
        "Duplicate entries present at end of refine_by_cc_code"
    # Validate the data actually returned rather than the input frame
    assert not output['deaths'].isnull().any(), \
        "Mortality estimates lost while calulating cc_code"
    return output
示例#4
0
def extract_single_nid(nid_entry):
    ''' If multiple nids are present, returns a single nid

        Args:
            nid_entry: a single nid, a tuple or list of nids, or the string
                representation of any of those

        Returns:
            int: the lone nid when the entry holds exactly one value that is
            all digits and not '0'; otherwise the 'generic_cancer_nid' gbd
            parameter
    '''
    try:
        nid_entry = literal_eval(nid_entry)
    except (ValueError, SyntaxError, TypeError):
        # Not a python literal (or not a string at all): use the entry as-is.
        #   literal_eval raises SyntaxError for text that does not parse
        pass
    # Wrap scalars only. The earlier `not isinstance(x, tuple) or
    #   isinstance(x, list)` also re-wrapped lists, so a one-element list
    #   could never return its own nid
    if not isinstance(nid_entry, (tuple, list)):
        nid_entry = [nid_entry]
    if len(nid_entry) == 1:
        lone_nid = nid_entry[0]
        if str(lone_nid).isdigit() and str(lone_nid) != '0':
            return int(lone_nid)
    # Zero, several, or an unusable nid: fall back to the generic nid
    return int(utils.get_gbd_parameter('generic_cancer_nid'))
示例#5
0
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager

        Runs manage_split for the liver parent cause in a timestamped
        subdirectory of the 'cod_liver_splits' path, then prints whether the
        split succeeded. Takes no arguments and returns nothing
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    # NOTE(review): presumably listed in the same order as target_cids so the
    #   two can be paired positionally - confirm against manage_split
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path('cod_liver_splits',
                                      process='cancer_model')
    # Each run writes to its own timestamped directory
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    print(utils.display_timestamp())
    success = manage_split(source_cid, target_cids, proportion_meids, work_dir,
                           description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
def calc_procedure_tenplus(inc_df, proportions, acause, location_id):
    ''' Multiplies incidence draws by the procedure proportion and the absolute
            survival proportion at 10 years to estimate the number of cases
            surviving for at least 10 years

        Args:
            inc_df: DataFrame with the uid columns and the incidence columns
            proportions: DataFrame with the procedure proportions in the draw
                columns; merged to inc_df on their shared columns
            acause: cause used to load survival estimates and to determine the
                modelable entity or sequelae split
            location_id: location used to load survival and sequelae estimates

        Returns:
            DataFrame of uid columns and draw columns, with year_id advanced by
            ten years and age_group_id mapped through add_decade_to_age,
            limited to years up to the 'max_year' gbd parameter. The result is
            either passed through split_sequelae or tagged with the
            'procedure_sequelae' modelable_entity_id
    '''
    # Load known values
    print(
        "    calculating the incidence of procedures with surv > ten years...")
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('incidence')
    draw_cols = nd.get_columns("draw_cols")
    abs_surv = [nd.get_columns("absolute_survival")]
    # Cast to int, as the other users of 'max_year' do, so that the year
    #   comparison below is always numeric
    max_estimation_year = int(utils.get_gbd_parameter('max_year'))
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Estimate incidence of procedure
    mrg_df = inc_df.merge(proportions)
    # Copy so that new columns are added to an independent frame, not a slice
    adj_df = mrg_df[uid_cols].copy()
    num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values)
    # Share adj_df's index so rows align explicitly instead of relying on both
    #   frames happening to carry a default index
    adj_df[type_cols] = pd.DataFrame(
        num_procedures, index=adj_df.index).fillna(0)
    # Estimate number of procedures resulting in survival beyond ten years
    surv_df = load_estimates('survival', acause, location_id)
    surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months),
                          uid_cols + abs_surv]
    adj_df = adj_df.merge(surv_df)
    pbt_df = adj_df[uid_cols].copy()
    # The single survival column broadcasts across every incidence column
    num_procedures_10ys = adj_df[type_cols].values * \
        adj_df[abs_surv].values
    pbt_df[draw_cols] = pd.DataFrame(
        num_procedures_10ys, index=pbt_df.index).fillna(0)
    # Update years and age categories
    pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply(
        add_decade_to_age)
    pbt_df.loc[:, 'year_id'] += 10
    # drop data that are now out of scope
    pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :].copy()
    # For procedures whose sequelae are fractional, split the estimates by
    #   sequela; otherwise assign the single procedure_sequelae entity
    if sequelae_fractions(acause):
        pbt_df = split_sequelae(pbt_df, acause, location_id)
    else:
        pbt_df.loc[:, 'modelable_entity_id'] = \
            nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return pbt_df