示例#1
0
def generate_estimates(acause, location_id, faux_correct):
    ''' Applies procedure adjustments where necessary, then saves separate outputs
            by measure and cancer phase

        Args:
            acause: cancer cause used to look up datasets and procedure MEs
            location_id: location for which estimates are finalized
            faux_correct: draw-count flag (overridden below; see note)

        Returns:
            True once the success-marker file has been written
    '''
    print("Begin final adjustments...")
    # NOTE(review): the caller's faux_correct is unconditionally overridden
    #   here — confirm this hard-coded value is still intended
    faux_correct = False
    inc_df = load_estimates('incidence', acause, location_id, faux_correct)
    prev_input = load_estimates('prevalence', acause, location_id,
                                faux_correct)
    prev_df = calc_total_prevalence(prev_input,
                                    uid_cols=nd.nonfatalDataset(
                                        'prevalence', acause).uid_cols)
    pr_id = procedure_me_id(acause)
    if pr_id is not None:
        # Adjust prevalence to avoid double-counting procedures, then save
        #   the procedure inputs for upload
        prop_df = load_procedure_proportions(pr_id, location_id)
        prev_df = apply_procdedure_proportions(prev_df, prop_df, acause,
                                               'prevalence', faux_correct)
        proc_data = calc_procedure_tenplus(inc_df, prop_df, acause,
                                           location_id, faux_correct)
        save_procedure_inputs(proc_data, acause, location_id)
    # Results are saved identically whether or not a procedure adjustment
    #   applies (fixes the save calls previously duplicated in both branches)
    save_model_results(inc_df, 'incidence', acause, faux_correct)
    save_model_results(prev_df, 'prevalence', acause, faux_correct)
    success_file = nd.nonfatalDataset('final_results',
                                      acause).get_output_file("finalized_" +
                                                              str(location_id))
    # Touch the success file to mark completion for downstream monitoring
    open(success_file, 'a').close()
    print(str(success_file) + " saved.")
    return (True)
示例#2
0
 def im_draw(df, draw_num, surv_uids, faux_correct):
     ''' Returns the dataframe with the incremental-mortality estimate for the
         requested draw_num

         Args:
             df: survival frame containing the absolute-survival draw column
             draw_num: index of the draw to process
             surv_uids: uid columns identifying each survival row
             faux_correct: flag controlling the decomp column prefix

         Returns:
             df restricted to surv_uids plus the incremental-mortality column
     '''
     # Subset to only the necessary data
     decomp_str = decomp_prefix_cols(faux_correct)
     max_surv = nd.nonfatalDataset().max_survival_months
     draw_uids = nd.nonfatalDataset().uid_cols
     # Note: to run with draws, pass draw number to the two following get_columns calls
     abs_surv_col = '{}surv_abs_{}'.format(
         decomp_str, draw_num)  #nd.get_columns("absolute_survival")
     increm_mort_col = '{}incr_mort_{}'.format(
         decomp_str, draw_num
     )  # nd.get_columns("incremental_mortality") + '_{}'.format(draw_num)
     # Calculate incremental mortality, the number of people who have lived
     #   with the disease for each period (those who die in year one
     #   had the disease for only a year)
     df[increm_mort_col] = df.sort_values(surv_uids).groupby(
         draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
     # Calculate the number of people surviving with the disease at and
     #   beyond the maximum year
     at_max_surv_months = (df['survival_month'] == max_surv)
     mort_total = df[~at_max_surv_months].groupby(
         draw_uids, as_index=False)[increm_mort_col].agg(
             np.sum).rename(columns={increm_mort_col: 'total_mort'})
     df = df.merge(mort_total)
     df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
     # test and return
     # Fixed: the assert message previously referenced an undefined name `i`,
     #   which would raise NameError instead of the intended AssertionError
     assert not df.isnull().any().any(), \
         "Error in im_draw {}".format(draw_num)
     return (df.loc[:, surv_uids + [increm_mort_col]])
示例#3
0
def calc_prevalence(adjusted_sequelae_durations, mort_df, acause,
                    faux_correct):
    ''' Returns prevalence by uid and me_tag, calculated as sequela duration
            (months) multiplied by the mortality ('deaths_*') draws, then
            collapsed to the prevalence uids and converted to years

        Args:
            adjusted_sequelae_durations: frame of sequela durations by uid
            mort_df: frame of mortality draws by uid
            acause: cancer cause used to look up dataset definitions
            faux_correct: flag controlling decomp prefix and draw count
    '''
    print("    calculating prevalence...")
    decomp_str = decomp_prefix_cols(faux_correct)
    # Decomp runs use a reduced draw count
    if len(decomp_str) > 0:
        max_draws = 100
    else:
        max_draws = 1000
    prev_cols = nd.get_columns('prevalence')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality frames
    mrg_df = pd.merge(adjusted_sequelae_durations, mort_df)
    # Copy so the draw assignments below do not write onto a slice of mrg_df
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    #     by the number of people surviving for only that duration
    for i in list(range(0, max_draws)):
        df['prev_{}'.format(i)] = mrg_df['deaths_{}'.format(i)].mul(
            mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df,
                      combine_cols=prev_cols,
                      by_cols=prev_uids,
                      func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    # Fixed: the assert previously reported "im_draw", the wrong function
    assert not df.isnull().any().any(), "Error in calc_prevalence"
    return (df)
示例#4
0
def load_survival(acause, location_id, faux_correct):
    ''' Loads previously-saved survival estimates and returns only the uid
        and absolute-survival columns needed for prevalence estimation
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    surv_dataset = nd.nonfatalDataset("survival", acause)
    abs_surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    keep_cols = surv_dataset.uid_cols + abs_surv_draw_cols
    # Read the per-location survival output and subset to the needed columns
    surv_data = pd.read_csv(surv_dataset.get_output_file(location_id))
    return (surv_data[keep_cols])
示例#5
0
def load_incidence(acause, location_id, faux_correct):
    ''' Returns incidence estimation subset required for prevalence estimation

        Args:
            acause: cancer cause used to locate the incidence output
            location_id: location whose incidence file is loaded
            faux_correct: flag controlling the decomp column prefix
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    # Fixed: "incidence".format(decomp_str) had no placeholder, so the decomp
    #   prefix was silently dropped (cf. calc_mortality and load_estimates)
    inc_cols = nd.get_columns("{}incidence".format(decomp_str))
    # Cap at 1000 draw columns
    inc_cols = inc_cols[0:1000]
    input_file = nd.nonfatalDataset("incidence",
                                    acause).get_output_file(location_id)
    inc_data = pd.read_csv(input_file)[uid_cols + inc_cols]
    return (inc_data[uid_cols + inc_cols])
示例#6
0
def save_model_results(df, metric_name, acause, faux_correct):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df: estimates containing a 'me_tag' column plus uid/data columns
            metric_name: 'incidence' or 'prevalence'
            acause: cancer cause used to look up modelable entity ids
            faux_correct: flag controlling the decomp column prefix

        Raises:
            ValueError: if metric_name is not a supported metric
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    d_step = utils.get_gbd_parameter('current_decomp_step')
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        # All incidence estimates are attributed to the primary phase
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Fail fast: previously an unsupported metric left measure_id unbound
        #   and produced a NameError inside the loop below
        raise ValueError("Unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            # No modelable entity is mapped to this tag; skip it
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        output_data = df.loc[df['me_tag'].eq(this_tag), uid_cols + data_cols]
        # Rename the measure-specific data columns to standard draw names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        output_data['upper'] = np.NaN
        output_data['lower'] = np.NaN
        output_data['uncertainty_type_value'] = np.NaN
        output_data['is_outlier'] = 0
        output_data['step4_location_year'] = '{} updated estimates'.format(
            d_step)
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
示例#7
0
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database

        Args:
            df: procedure estimates with uid, modelable_entity_id, and draws
            acause: cancer cause the estimates belong to
            location_id: location of the estimates (attached as metadata)
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting
    # Collapse detailed oldest-age groups into the terminal age group (235)
    data.loc[df['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)

    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Load the ME table once (hoisted: it is loop-invariant and was
    #   previously reloaded on every iteration)
    me_table = nd.load_me_table()
    # Finalize and export
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data per
        #   epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs",
                        this_output,
                        acause,
                        bundle_id,
                        skip_testing=True)
示例#8
0
def load_rel_surv_values(acause, location_id, cnf_model_version_id, faux_correct):
    ''' Loads and returns survival best-case/worst-case estimations for the given
            acause

        Args:
            acause: cancer cause whose survival file is loaded
            location_id: location of the survival input
            cnf_model_version_id: run id used to locate the survival folder
            faux_correct: flag controlling the decomp column prefix
    '''
    print("       loading survival...")
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    uid_cols = uid_cols + ['surv_year']
    scaled_survival = nd.get_columns('{}scaled_survival'.format(decomp_str))
    # Single-sex causes are restricted to the relevant sex_id below
    sex_restrictions = {'neo_prostate': 1, 'neo_testicular': 1,
                        'neo_cervical': 2, 'neo_ovarian': 2, 'neo_uterine': 2}
    # Load specific input based on version_id
    surv_folder = load_surv_folder(cnf_model_version_id)
    input_file = "{}/{}/{}.csv".format(surv_folder, acause, location_id)
    # import and update names
    this_surv = pd.read_csv(input_file)
    this_surv = this_surv.loc[this_surv['surv_year'] <= 10,]
    this_surv.rename(columns={'year': 'year_id',
                              'sex': 'sex_id'}, inplace=True)
    # Add 'year 0' survival equal to 1 (no time has passed through which to survive)
    # Fixed: copy the slice before mutating so the writes cannot propagate to
    #   (or warn about) the parent frame
    tmp = this_surv.loc[this_surv['surv_year'].eq(1),].copy()
    tmp['surv_year'] = 0
    tmp[scaled_survival] = 1
    this_surv = this_surv.append(tmp)
    # Subset by sex
    if acause in sex_restrictions.keys():
        this_surv = this_surv.loc[this_surv['sex_id']
                                  == sex_restrictions[acause],: ]
    # Test and return
    assert not this_surv.isnull().any().any(), \
        "Null values found in relative survival input after formatting"
    validate_proportions(this_surv[scaled_survival])
    return(this_surv)
示例#9
0
def calc_mortality(surv_df, acause, location_id, faux_correct):
    ''' Calculate mortality, the number of people who die of the
        cause during the interval (year), where
            mort = incremental_mortality * incidence.
        Returns a dataframe of mortality ('deaths_*') draws by uid
    '''
    print("    estimating absolute mortality...")
    decomp_str = decomp_prefix_cols(faux_correct)
    # Decomp runs use a reduced draw count
    if len(decomp_str) > 0:
        max_draws = 100
    else:
        max_draws = 1000
    uid_cols = nd.nonfatalDataset("survival", acause).uid_cols
    # (removed unused locals: inc_cols, incr_mort_draw_cols, mort_cols)
    incr_mort_df = calc_increm_mort(surv_df, acause, location_id, faux_correct)
    inc_df = load_incidence(acause, location_id, faux_correct)
    mrg_df = incr_mort_df.merge(inc_df)
    # Copy so the draw assignments below do not write onto a slice of mrg_df
    df = mrg_df[uid_cols].copy()
    for i in list(range(0, max_draws)):
        df['deaths_{}'.format(i)] = \
            pd.DataFrame(mrg_df['inc_{}'.format(i)] * mrg_df['{}incr_mort_{}'.format(decomp_str, i)])
    df = df.merge(incr_mort_df)
    return (df)
示例#10
0
def calc_procedure_tenplus(inc_df, proportions, acause, location_id,
                           faux_correct):
    ''' Multiplies incidence draws by the procedure proportion and the absolute
            survival proportion at 10 years to estimate the number of cases
            surviving for at least 10 years

        Args:
            inc_df: incidence draws by uid
            proportions: procedure-proportion draws by uid
            acause: cancer cause used for lookups and sequela splitting
            location_id: location of the estimates
            faux_correct: flag controlling the decomp column prefix
    '''
    # Load known values
    print(
        "    calculating the incidence of procedures with surv > ten years...")
    decomp_str = decomp_prefix_cols(faux_correct)
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('{}incidence'.format(decomp_str))
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    abs_surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    max_estimation_year = utils.get_gbd_parameter('max_year')
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Estimate incidence of procedure
    mrg_df = inc_df.merge(proportions)
    # Copy so the column assignments do not write onto a slice of mrg_df
    adj_df = mrg_df[uid_cols].copy()
    num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values)
    adj_df[type_cols] = pd.DataFrame(num_procedures).fillna(0)
    # Estimate number of procedures resulting in survival beyond ten years
    surv_df = load_estimates('survival', acause, location_id, faux_correct)
    surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months),
                          uid_cols + abs_surv_draw_cols]
    adj_df = adj_df.merge(surv_df)
    # Copy for the same reason as above
    pbt_df = adj_df[uid_cols].copy()
    num_procedures_10ys = adj_df[type_cols].values * \
                                        adj_df[abs_surv_draw_cols].values
    pbt_df[draw_cols] = pd.DataFrame(num_procedures_10ys).fillna(0)
    # Update years and age categories: survivors are 10 years older and the
    #   events register 10 years after diagnosis
    pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply(
        add_decade_to_age)
    pbt_df.loc[:, 'year_id'] += 10
    # drop data that are now out of scope
    pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :]
    # For procedures whose sequelae are fractional,
    if sequelae_fractions(acause):
        pbt_df = split_sequelae(pbt_df, acause, location_id)
    else:
        pbt_df.loc[:, 'modelable_entity_id'] = \
            nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return (pbt_df)
示例#11
0
def save_splits(modelable_entity_id, cnf_model_version_id):
    ''' Launch jobs to upload each of the "split" modelable entities, generated
            by splitting the parent modelable entity

        Args:
            modelable_entity_id: parent ME whose split children are uploaded
            cnf_model_version_id: run id used to locate inputs and label jobs

        Returns:
            True if every upload job succeeded, False otherwise
    '''
    # Maps a child me_id to the file its upload job writes on success
    def output_file_func(id):
        return (nd.nonfatalDataset("upload", id).get_output_file('upload'))

    parent_me = modelable_entity_id
    work_dir = get_work_dir(parent_me, cnf_model_version_id)
    this_step = nd.nonfatalDataset("split", parent_me)
    success_file = this_step.get_output_file('upload')
    children_mes, skip_mes, me_tag = get_me_info(modelable_entity_id,
                                                 parent_me)
    measures = get_measures(me_tag)
    save_worker = utils.get_path('save_epi_worker', process='cancer_model')
    # Generate a list of arguments (one for each child me)
    description = "{}_run_{}".format(me_tag, cnf_model_version_id)
    save_args_template = ("--meid {meid} --meas_id {meas} --indir {input_dir}"
                          " --cnf_run_id {cnf_rid} --desc {desc}")
    save_arg_list = []
    for cm in children_mes:
        save_arg_list += [
            save_args_template.format(meid=cm,
                                      meas=" ".join([str(m)
                                                     for m in measures]),
                                      desc=description,
                                      input_dir="{}/{}".format(work_dir, cm),
                                      cnf_rid=cnf_model_version_id)
        ]
    # Start jobs
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header="lvr_save_epi",
                                         memory_request=90,
                                         id_list=children_mes,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()
    # Check for results
    job_description = str(modelable_entity_id) + " split upload"
    # Block until all jobs report (or the 30-minute timeout elapses)
    success_df = cluster_tools.wait_for_results(
        job_dict,
        jobs_description=job_description,
        noisy_checker=False,
        output_file_function=output_file_func,
        max_minutes=30)
    success = cluster_tools.validate_success(success_df, job_description)
    if success:
        # Preserve the per-job results as this step's success marker
        success_df.to_csv(success_file, index=False)
        return (True)
    else:
        print("Error during split")
        return (False)
示例#12
0
def create_estimation_frame(acause, location_id, cnf_model_version_id, faux_correct):
    ''' Builds and returns the dataframe of ages and covariates used to
            estimate survival and incremental mortality
    '''
    print("    creating estimation frame...")
    nf_info = nd.nonfatalDataset()
    max_surv = nf_info.max_survival_months
    uid_cols = nf_info.uid_cols
    keep_ages = list(range(1, 21)) + list(range(30, 34)) + [235]
    # Load the survival curve and restrict it to the estimation window
    surv_data = load_rel_surv_values(acause, location_id,
                                     cnf_model_version_id, faux_correct)
    surv_data['survival_month'] = surv_data['surv_year'] * 12
    surv_data = surv_data.loc[surv_data['survival_month'] <= max_surv]
    # Attach lambda values to form the survival estimation frame, keeping
    #   only the modeled age groups
    lambda_input = load_lambda_values(location_id, cnf_model_version_id)
    estim_frame = surv_data.merge(lambda_input[uid_cols + ['lambda']])
    age_mask = estim_frame['age_group_id'].isin(keep_ages)
    estim_frame = estim_frame.loc[age_mask, :]
    estim_frame['lambda_years'] = (
        estim_frame['lambda'] * estim_frame['surv_year'])
    return (estim_frame)
示例#13
0
def calc_increm_mort(surv_df, acause, location_id, faux_correct):
    ''' Returns a dataframe of mortality proportions equal to the mortality
            delta from the previous survival_year, by uid
    '''
    def im_draw(df, draw_num, surv_uids, faux_correct):
        ''' Returns the dataframe with the incremental-mortality estimate for
            the requested draw_num
        '''
        # Subset to only the necessary data
        decomp_str = decomp_prefix_cols(faux_correct)
        max_surv = nd.nonfatalDataset().max_survival_months
        draw_uids = nd.nonfatalDataset().uid_cols
        # Note: to run with draws, pass draw number to the two following get_columns calls
        abs_surv_col = '{}surv_abs_{}'.format(
            decomp_str, draw_num)  #nd.get_columns("absolute_survival")
        increm_mort_col = '{}incr_mort_{}'.format(
            decomp_str, draw_num
        )  # nd.get_columns("incremental_mortality") + '_{}'.format(draw_num)
        # Calculate incremental mortality, the number of people who have lived
        #   with the disease for each period (those who die in year one
        #   had the disease for only a year)
        df[increm_mort_col] = df.sort_values(surv_uids).groupby(
            draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
        # Calculate the number of people surviving with the disease at and
        #   beyond the maximum year
        at_max_surv_months = (df['survival_month'] == max_surv)
        mort_total = df[~at_max_surv_months].groupby(
            draw_uids, as_index=False)[increm_mort_col].agg(
                np.sum).rename(columns={increm_mort_col: 'total_mort'})
        df = df.merge(mort_total)
        df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
        # test and return
        # Fixed: the assert message previously read the outer loop's `i`
        #   through the closure; use this function's own draw_num instead
        assert not df.isnull().any().any(), \
            "Error in im_draw {}".format(draw_num)
        return (df.loc[:, surv_uids + [increm_mort_col]])

    # Generate incremental mortality draws
    decomp_str = decomp_prefix_cols(faux_correct)
    output_uids = nd.nonfatalDataset("survival", acause).uid_cols
    abs_surv_draw_cols = nd.get_columns(
        '{}absolute_survival'.format(decomp_str))
    incr_mort_draw_cols = nd.get_columns(
        '{}incremental_mortality'.format(decomp_str))
    output_df = surv_df.loc[:, output_uids]
    print("    estimating incremental mortality proportion...")
    # Note: this section remains written with a loop to facilitate future
    #   processing of absolute survival draws
    for i, as_col in enumerate(abs_surv_draw_cols):
        this_draw = im_draw(df=surv_df.loc[:, output_uids + [as_col]],
                            draw_num=i,
                            surv_uids=output_uids,
                            faux_correct=faux_correct)
        output_df = output_df.merge(this_draw, on=output_uids)
    return (output_df[output_uids + incr_mort_draw_cols])
示例#14
0
def generate_estimates(acause, location_id, faux_correct=False):
    ''' Runs the prevalence estimation pipeline
    '''
    # NOTE(review): the caller's faux_correct argument is unconditionally
    #   overridden here — confirm the hard-coded value is still intended
    faux_correct = False
    # NOTE(review): output_file is never used below; get_output_file may be
    #   called only for a side effect (e.g. creating the output path) — verify
    output_file = nd.nonfatalDataset("prevalence",
                                     acause).get_output_file(location_id)
    print("Begin prevalence estimation...")
    # Survival feeds both the mortality draws and the sequela framework
    surv_df = load_survival(acause, location_id, faux_correct)
    mort_df = calc_mortality(surv_df, acause, location_id, faux_correct)
    adjusted_sequelae_durations = load_sequela_framework(surv_df, acause)
    prev_df = calc_prevalence(adjusted_sequelae_durations, mort_df, acause,
                              faux_correct)
    nd.save_outputs("prevalence", prev_df, acause)
示例#15
0
def load_estimates(metric_name, acause, location_id, faux_correct):
    ''' Loads previously-generated estimates per the metric_name
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    dataset = nd.nonfatalDataset(metric_name, acause)
    # Survival estimates are stored under the "absolute_survival" columns;
    #   every other metric's columns match its name
    column_key = ('absolute_survival' if metric_name == "survival"
                  else metric_name)
    type_cols = nd.get_columns('{}{}'.format(decomp_str, column_key))
    input_data = pd.read_csv(dataset.get_output_file(location_id))
    return (input_data[dataset.uid_cols + type_cols])
示例#16
0
def split_sequelae(df, acause, location_id):
    ''' Splits estimates into sequela based on proportions from literature
    '''
    print("    splitting sequelae...")
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    # Build a frame of procedure_sequelae fractions keyed by ME id
    frac_map = sequelae_fractions(acause)
    fracs = pd.DataFrame().from_dict(frac_map, orient='index')
    fracs = fracs.reset_index().rename(
        columns={'index': 'modelable_entity_id'})
    fracs = fracs.loc[fracs['me_tag'].eq("procedure_sequelae"), :]
    fracs['acause'] = acause
    # Attach the fractions to the data and scale each draw accordingly
    df['acause'] = acause
    split_df = df.merge(fracs)
    split_df[draw_cols] = split_df[draw_cols].multiply(
        split_df['fraction'], axis='index')
    assert split_df[draw_cols].notnull().all().all(), "Nulls in split sequelae"
    return (split_df)
示例#17
0
def main(meid, desc, indir, run_id, meas_id):
    ''' Runs the epi save worker for the given modelable entity and records
            the upload if it succeeded

        Args:
            meid: modelable entity id to save
            desc: description attached to the upload
            indir: directory containing the inputs to save
            run_id: cnf run id for this upload
            meas_id: measure id(s) to save

        Returns:
            True when the save succeeded and was recorded, else False
    '''
    this_step = nd.nonfatalDataset("split", meid)
    success_file = this_step.get_output_file('upload')
    # Fixed: the format string had three placeholders for four arguments,
    #   silently dropping run_id from the message
    print("Working on {} ({}) in {}, run {}".format(meid, desc, indir, run_id))
    success_df = save_worker(meid=meid,
                             meas_ids=meas_id,
                             description=desc,
                             input_dir=indir,
                             cnf_run_id=run_id)
    # Validate save and preserve record if successful
    # Fixed: check the type before calling len() so a None (or other
    #   unsized) return cannot raise TypeError
    if isinstance(success_df, pd.DataFrame) and (len(success_df) > 0):
        if 'model_version_id' in success_df.columns:
            model_id = success_df.at[0, 'model_version_id']
            epi_upload.update_upload_record(
                meid,
                run_id,
                model_id,
                cancer_model_type="split_custom_epi",
                success=1)
            success_df.to_csv(success_file, index=False)
            return (True)
    else:
        print("Error during split")
    return (False)
示例#18
0
 def output_file_func(id):
     ''' Returns the upload output file for the first id in the list '''
     upload_step = nd.nonfatalDataset("upload", id[0])
     return (upload_step.get_output_file('upload'))
示例#19
0
def load_sequela_framework(surv_df, acause):
    ''' Adjust sequela duration based on survival
            First adjust incremental sequela duration (including controlled) to equal 
            total months from diagnosis (at midyear) to death (at end year). This is
            total amount of time someone may experience any of the sequela (from 
            diagnosis to death, separated out by the amount of time they are living 
            with cancer)

        Then iteratively adjust duration of each sequela so time lived with cancer is 
            equal to the sum of all sequela durations
            1) zero-out metastatic_phase and terminal_phase for events that occur
            at the maximum survival duration.
            2) Adjust durations to fit the number of survival years
               i) The terminal phase is set and not adjusted
               ii) The most flexible phase is controlled: Adjust controlled time
                to equal the difference between incremental_duration sequela 
                duration and the duration of each of the other sequelae
               iii) The next most flexible time is primary diagnosis and 
                treatment
               iv) Finally we can adjust the metastatic time if the totals still
                  do not add up
    '''
    def adjust_duration(df, stage, uid_cols):
        ''' Sets the duration of `stage` to whatever remains of the
            incremental duration after summing the durations of all other
            sequelae within each uid group, floored at zero
        '''
        sd_col = 'sequela_duration'
        input_cols = df.columns.tolist()
        this_phase = (df['me_tag'] == stage)
        # Total duration of every sequela other than `stage`, by uid group
        df = df.merge(df[~this_phase].groupby(
            uid_cols,
            as_index=False)[sd_col].sum().rename(columns={sd_col: 'tot_dur'}))
        df.loc[this_phase, sd_col] = df['incremental_duration'] - df['tot_dur']
        # Durations cannot be negative
        df.loc[this_phase & (df[sd_col] <= 0), sd_col] = 0
        assert not df.duplicated(uid_cols+['me_tag']).any(), \
            "ERROR: error when calculating sequelae_durations for {} stage".format(
                stage)
        return (df[input_cols])

    #
    print("    creating sequela framework...")
    nf_ds = nd.nonfatalDataset("survival", acause)
    uid_cols = nf_ds.uid_cols
    max_survival_months = nf_ds.max_survival_months
    seq_dur = load_durations(acause)
    # Add sequela durations
    surv_df.loc[:, 'acause'] = acause
    df = pd.merge(surv_df[uid_cols + ['acause']], seq_dur, on='acause')
    df.loc[:, 'raw_sequela_duration'] = df['sequela_duration']
    # Diagnosis at midyear: the incremental duration is survival months + 6
    df.loc[:, 'incremental_duration'] = df['survival_month'] + 6
    # Set the 'beyond maximum' survival years to the duration of survival
    #   for the final period
    end_of_period = (df['survival_month'].eq(max_survival_months))
    df.loc[end_of_period, 'incremental_duration'] = max_survival_months - 6
    # Set late-phase duration to 0 at the end of the survival period
    #   (if someone survives beyond the maximum duration, they are treated
    #   as 'survivors', so there are no terminal or metastatic phases)
    late_phase = (df['me_tag'].isin(["terminal_phase", "metastatic_phase"]))
    end_of_period = df['survival_month'].eq(max_survival_months)
    df.loc[late_phase & end_of_period, 'sequela_duration'] = 0
    # Iteratively adjust sequela duration (see docstring for explanation)
    for stage in ['controlled_phase', "primary_phase", "metastatic_phase"]:
        df = adjust_duration(df, stage, uid_cols)
    assert df['incremental_duration'].notnull().all(), \
        "error calculating sequela durations"
    return (df)
示例#20
0
def apply_procdedure_proportions(df, proportions, acause, metric_name,
                                 faux_correct):
    ''' Multiplies estimates by procedure proportions, adding to the dataframe
            a set of estimates for the number of cancer events that do not receive
            the given procedure
        -- Note:
            As of 2018-07-10, incidence data are adjusted after modeling
            and are not processed through this function, although the ability 
            to do so remains 

        Args:
            df: estimates by uid (and 'me_tag', for prevalence)
            proportions: procedure-proportion draws by uid
            acause: cancer cause used for lookups and sequela fractions
            metric_name: 'prevalence' or 'incidence'
            faux_correct: flag controlling the decomp column prefix
    '''
    decomp_str = decomp_prefix_cols(faux_correct)
    print("    adjusting to avoid double-counting procedures for {}...".format(
        metric_name))
    # Return if adjustment is unnecessary (if there is no rate id for the cause)
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("{}draw_cols".format(decomp_str))
    type_cols = nd.get_columns('{}{}'.format(decomp_str, metric_name))
    # Merge on all uids except me_tag (proportions carry their own me_tag)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    # For data where sequela are a fraction of the number of procedures, multiply
    #       the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe containing the fractions
        fracs = pd.DataFrame().from_dict(sequelae_fractions(acause),
                                         orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand
        proportions['acause'] = acause
        props = proportions.merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Determine fraction of population that does not receive the procedure
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    #   Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols]
    # Events without the procedure = estimate draws * proportion draws
    evnt_wo_proc = pd.DataFrame(mrg_df[type_cols].values *
                                mrg_df[draw_cols].values).fillna(0)
    evnt_wo_proc.columns = type_cols
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(
    ), "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        # Sum the sequela-adjusted estimates back to the merge uids
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        # Controlled-phase rows matching the adjusted uids, aligned by sort
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols]
        # Remainder of the controlled phase that received no procedure
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values)
        no_proc.columns = type_cols
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        adj_df = adj_df.append(nosq_df)
        output_data = df.append(adj_df)
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    #      is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])