Example #1
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):
    """String together processes for redistribution."""

    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    # the iso3 of this data
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    # the code system id
    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    # the data type
    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
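A minimal, self-contained sketch of the split/process/append pattern that run_phase drives above. The frame and process_split_group below are illustrative stand-ins only; the real pipeline submits a worker per split group and reads the results back with read_append_split_groups.

import pandas as pd

def process_split_group(split_df):
    # stand-in for the per-group redistribution worker
    return split_df

df = pd.DataFrame({'split_group': [1, 1, 2], 'deaths': [3.0, 1.0, 5.0]})
pieces = [process_split_group(df.loc[df['split_group'] == g])
          for g in df['split_group'].unique()]
df = pd.concat(pieces, ignore_index=True)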
Example #2
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid,
        'source',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    # rake subnational estimates when the data type and source call for it
    if ((data_type_id in [8, 9, 10] and (source != 'Other_Maternal'))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid,
                                    extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]

        if "HH_SURVEYS" in model_group:
            if model_group == 'MATERNAL-HH_SURVEYS-IND':
                print_log_message("Raking sub national estimates, "
                                  "applying double raking for India Maternal")
                raker = Raker(df, source, double=True)
                df = raker.get_computed_dataframe(location_hierarchy)
            else:
                print_log_message("Raking sub national estimates")
                raker = Raker(df, source)
                df = raker.get_computed_dataframe(location_hierarchy)


    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************

    # data with zero sample size is almost certainly an anomalous result of a
    # program generating data it shouldn't have, and it shouldn't be included
    # in CODEm models. It was probably already dropped before running noise
    # reduction anyway.
    df = df.query('sample_size != 0')

    # uploading data before 1980 is a waste of space because neither CODEm
    # nor CoDViz uses it
    df = df.loc[df['year_id'] >= 1980]

    print_log_message("Enforcing age sex restrictions")
    # this actually drops data from the dataframe if it violates age/sex
    # restrictions (e.g. male maternity disorders)
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************

    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df, cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************

    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw', 'cf_rd',
        'extract_type_id', 'location_id', 'nid', 'sample_size', 'sex_id',
        'site_id', 'year_id'
    ]
    # use draws to make uncertainty metrics for CODEm and CoDViz;
    # this also creates cf_final from the mean of the draws
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    # as in the old code: no cause fractions above 1 or below 0
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0
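    # note: the loop above is equivalent to
    # df[['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']] = \
    #     df[['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']].clip(lower=0, upper=1)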

    df = df[final_cols]

    return df
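The Raker class itself is not shown in these examples. The essential raking operation is to scale subnational estimates so they aggregate to the parent total; a minimal sketch of that step, with made-up column names:

import pandas as pd

sub = pd.DataFrame({'parent_id': [10, 10], 'location_id': [101, 102],
                    'deaths': [40.0, 40.0]})
national = pd.DataFrame({'parent_id': [10], 'national_deaths': [100.0]})

sub = sub.merge(national, on='parent_id')
# scale each subnational value so the subnationals sum to the national total
sub_total = sub.groupby('parent_id')['deaths'].transform('sum')
sub['deaths'] = sub['deaths'] * sub['national_deaths'] / sub_total
# each unit goes from 40 to 50, so the pair now sums to 100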
Example #3
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    cache_dir = CONF.get_directory('FILEPATH')

    orig_deaths = df['deaths'].sum()

    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    code_system_id = get_value_from_nid(nid,
                                        'code_system_id',
                                        extract_type_id=extract_type_id)

    # this queries the database, maybe should be passed in directly
    code_system = get_code_system_from_id(code_system_id)

    source = get_value_from_nid(nid, 'source', extract_type_id=extract_type_id)

    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)

    is_vr = data_type_id in [9, 10]

    if not skip_hiv_correction(source) and is_vr:

        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)

        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)

        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)

        # get the country
        iso3 = get_value_from_nid(
            nid,
            'iso3',
            extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), "Could not find iso3 for nid {}, " \
            "extract_type_id {}".format(nid, extract_type_id)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    df = combine_with_rd_raw(df, nid, extract_type_id, location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()

    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError("Change of {} deaths [{}] to [{}]".format(
            diff, old, new))

    return df
Example #4
def run_redistribution(input_data,
                       signature_ids,
                       proportion_ids,
                       cause_map,
                       package_folder,
                       residual_cause='cc_code',
                       diagnostic_output=False,
                       first_and_last_only=False,
                       rerun_cause_map=True):
    """Most granular method of whole redistribution process."""
    data, signature_metadata, proportion_metadata = prep_data(
        input_data,
        signature_ids,
        proportion_ids,
        residual_cause=residual_cause)
    print_log_message("Importing packages")
    packages = get_packages(package_folder, cause_map)
    print_log_message("Evaluating cause map restrictions")
    nid = int(input_data.nid.unique().item())
    extract_type_id = int(input_data.extract_type_id.unique().item())
    cm_file = "FILEPATH".format(nid, extract_type_id)
    if not os.path.isfile(cm_file):
        rerun_cause_map = True
    if rerun_cause_map:
        cause_map_evaluated = evaluate_cause_restrictions(
            cause_map, proportion_metadata)
    else:
        cause_map_evaluated = pd.read_csv(cm_file)
    print_log_message("Run redistribution!")
    diagnostics_all = []
    seq = 0
    if first_and_last_only:
        first = packages[0]
        last = packages[-1]
        packages = [first, last]
    for package in packages:

        if not data_has_any_package_garbage(data, package):
            continue

        print_log_message("    package: {}".format(package['package_name']))
        print_log_message("    package_description: {}".format(
            package['package_description']))
        print_log_message("        Deaths before = " + str(data.freq.sum()))
        print_log_message("        Rows before = " + str(len(data)))
        print_log_message("            ... calculating proportions")
        proportions = get_proportions(data,
                                      proportion_metadata,
                                      package,
                                      cause_map_evaluated,
                                      residual_cause=residual_cause)
        print_log_message("            ... redistributing data")
        data, diagnostics = redistribute_garbage(data, proportions, package)
        data = data.loc[(data['freq'] > 0) | (data['cause'] == residual_cause)]
        data = data.groupby(['proportion_id', 'signature_id',
                             'cause']).sum().reset_index()
        if diagnostic_output:
            diagnostics['seq'] = seq
            add_cols = [
                'shared_package_version_id', 'package_version_id',
                'package_name', 'package_id'
            ]
            for add_col in add_cols:
                diagnostics[add_col] = package[add_col]
            seq += 1
            diagnostics_all.append(diagnostics)
        print_log_message("        Deaths after = " + str(data.freq.sum()))
        print_log_message("        Rows after = " + str(len(data)))
    print_log_message("Done!")
    data = pd.merge(data, signature_metadata, on='signature_id')
    if diagnostic_output:
        diagnostics = pd.concat(diagnostics_all).reset_index(drop=True)
    return data.loc[data.freq > 0], diagnostics, \
        signature_metadata, proportion_metadata
Example #5
def correct_misdiagnosis(df, nid, extract_type_id, code_system_id, adjust_id,
                         remove_decimal):

    conf = Configurator('standard')
    mc_process_dir = conf.get_directory('mc_process_data')
    package_dir = conf.get_directory('rd_process_inputs') + "FILEPATH"
    misdiagnosis_path = conf.get_resource('misdiagnosis_prob_path')
    if adjust_id == 543:
        misdiagnosis_version_id = 4
    elif adjust_id in (544, 500):
        misdiagnosis_version_id = 3
    else:
        # guard against an unhandled adjust_id, which would otherwise fail
        # later with a NameError
        raise ValueError(
            "No misdiagnosis version configured for adjust_id {}".format(
                adjust_id))

    misdiagnosis_path = misdiagnosis_path.format(
        adjust_id=adjust_id,
        version_id=misdiagnosis_version_id,
        code_system_id=code_system_id)

    start_deaths = df['deaths'].sum()
    start_deaths_target = df.loc[df.cause_id == adjust_id, 'deaths'].sum()
    start_deaths_cc = df.loc[df.cause_id == 919, 'deaths'].sum()

    df = df.loc[df.deaths > 0]

    print_log_message("Adding packages")
    df = add_packages(df, code_system_id, remove_decimal, package_dir)
    print_log_message("Getting deaths to move")
    move_df = get_deaths_to_move(df, adjust_id, misdiagnosis_path,
                                 mc_process_dir, nid, extract_type_id,
                                 code_system_id)
    print_log_message("Jumbling up deaths")
    df = death_jumble(df, move_df, adjust_id, code_system_id)

    print_log_message("Checking deaths jumbled well")
    end_deaths = df['deaths'].sum()
    end_deaths_target = df.loc[df.cause_id == adjust_id, 'deaths'].sum()
    end_deaths_cc = df.loc[df.cause_id == 919, 'deaths'].sum()

    assert abs(int(start_deaths) - int(end_deaths)) <= 5, \
        'Bad jumble - added/lost deaths ' \
        '(started: {}, ended: {})'.format(str(int(start_deaths)),
                                          str(int(end_deaths)))

    print_log_message("Storing intermediate data")
    store_intermediate_data(df, move_df, mc_process_dir, adjust_id, nid,
                            extract_type_id)

    print_log_message('Deaths moved: ' + str(
        int((end_deaths_target + end_deaths_cc) -
            (start_deaths_target + start_deaths_cc))))

    return df
Example #6
def get_proportions(data,
                    proportion_metadata,
                    package,
                    cause_map_evaluated,
                    residual_cause='cc_code'):
    weight_groups = find_weight_groups(package,
                                       proportion_metadata,
                                       filter_impossible=True,
                                       verify_integrity=False)

    print_log_message("                -Identifying targets")
    targets = []
    for tg in package['target_groups']:
        temp = pd.DataFrame(
            {'cause': package['target_groups'][tg]['target_codes']})
        temp['target_group'] = tg
        targets.append(temp)
    targets = pd.concat(targets).reset_index(drop=True)

    print_log_message("                -Pulling data counts - 1")
    proportions = []
    for pid in weight_groups['proportion_id']:
        temp = targets.copy(deep=True)
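        # packages flagged to "create targets" seed every target with a tiny
        # frequency (0.001) instead of zero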
        if package['create_targets'] == 1:
            temp['freq'] = 0.001
        else:
            temp['freq'] = 0
        temp['proportion_id'] = pid
        proportions.append(temp)
    print_log_message("                -Pulling data counts - 2")
    tg_dict = {}
    for tg in package['target_groups']:
        tg_dict[tg] = package['target_groups'][tg]['target_codes']
    tg_df = pd.DataFrame.from_dict(tg_dict,
                                   orient='index').stack().reset_index()
    tg_df.columns = ['target_group', 'index', 'cause']
    tg_df = tg_df[['target_group', 'cause']]
    tg_df = tg_df.merge(data[['proportion_id', 'cause', 'freq']],
                        on='cause',
                        how='left')
    proportions.append(tg_df)
    print_log_message("                -Pulling data counts - 3")
    proportions = pd.concat(proportions)
    print_log_message("                -Pulling data counts - 4")
    proportions = proportions.sort_values(
        ['proportion_id', 'target_group', 'cause'], ).reset_index(drop=True)
    print_log_message("                -Pulling data counts - 5")
    proportions = proportions.set_index(
        ['proportion_id', 'target_group']).join(
            proportions.groupby(
                ['proportion_id', 'target_group'])[['freq']].sum().rename(
                    columns={'freq': 'total'}))
    print_log_message("                -Pulling data counts - 6")
    proportions.loc[proportions['total'] == 0, 'freq'] = 0.001
    proportions = proportions.drop('total', axis=1).reset_index()

    print_log_message("                -Pulling data counts - 7")
    proportions = pd.merge(proportions,
                           proportion_metadata,
                           on='proportion_id')
    print_log_message("                -Pulling data counts - 8")
    print_log_message("                -Merging on cause restrictions")
    # Merge on cause restrictions
    proportions = pd.merge(proportions,
                           cause_map_evaluated,
                           on=['proportion_id', 'cause'],
                           how='left')
    report_if_merge_fail(proportions, 'eval', ['proportion_id', 'cause'])
    # Zero out if the cause is restricted
    proportions.loc[~proportions['eval'], 'freq'] = 0
    # Calculate totals for each cause
    print_log_message("                -Calculating totals for each cause")
    proportions = proportions.loc[:, [
        'proportion_id', 'target_group', 'cause', 'freq'
    ]].groupby(['proportion_id', 'target_group', 'cause']).sum().reset_index()
    # Calculate totals for each target group & merge back on
    print_log_message(
        "                -Calculating totals for each target group")
    proportions = proportions.set_index(
        ['proportion_id', 'target_group']).join(
            proportions.groupby(
                ['proportion_id', 'target_group'])[['freq']].sum().rename(
                    columns={'freq': 'total'}))
    proportions = pd.merge(proportions.reset_index(),
                           weight_groups,
                           on='proportion_id')
    # Merge on weights
    print_log_message("                -Merging on weights")
    weights = []
    for tg in package['target_groups']:
        wg = 0
        for wgt in package['target_groups'][tg]['weights']:
            weights.append({
                'target_group': tg,
                'weight_group': str(wg),
                'weight': wgt
            })
            wg += 1
    weights = pd.DataFrame(weights)
    proportions = pd.merge(proportions,
                           weights,
                           on=['target_group', 'weight_group'])
    # Calculate final proportions to apply
    print_log_message("                -Reformatting data type")
    for c in ['freq', 'weight', 'total']:
        proportions[c] = proportions[c].astype('float64')
    print_log_message("                -Calculating proportions")
    proportions['proportion'] = (proportions.freq / proportions.total) * \
        proportions.weight

    print_log_message("                -Adding residual causes where needed")
    proportions = pd.concat([
        # weight groups with no observed deaths send their full weight to the
        # residual cause
        proportions.loc[proportions.total == 0,
                        ['proportion_id', 'target_group', 'weight']]
        .drop_duplicates()
        .assign(cause=residual_cause)
        .rename(columns={'weight': 'proportion'})
        .loc[:, ['proportion_id', 'proportion', 'cause']]
        .groupby(['proportion_id', 'cause']).sum().reset_index(),
        proportions.loc[proportions.total != 0,
                        ['proportion_id', 'cause', 'proportion']]
        .groupby(['proportion_id', 'cause']).sum().reset_index()
    ]).reset_index(drop=True)
    # Again make sure everything sums to 1
    print_log_message("                -Make sure everything sums to 1")
    proportions = proportions.set_index(['proportion_id']).join(
        proportions.groupby('proportion_id')[['proportion']].sum().rename(
            columns={'proportion': 'total'}))
    proportions['proportion'] = (proportions.proportion / proportions.total)
    proportions = proportions.reset_index()[[
        'proportion_id', 'cause', 'proportion'
    ]]
    return proportions
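Stripped of the package and weight-group bookkeeping, the core of this function is a group-wise normalization: deaths by cause within a group are divided by the group total so the proportions sum to 1. A small illustrative sketch (column names made up):

import pandas as pd

props = pd.DataFrame({'proportion_id': [1, 1, 2, 2],
                      'cause': ['a', 'b', 'a', 'b'],
                      'freq': [3.0, 1.0, 0.0, 5.0]})
group_total = props.groupby('proportion_id')['freq'].transform('sum')
props['proportion'] = props['freq'] / group_total
# each proportion_id now sums to 1: [0.75, 0.25] and [0.0, 1.0]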
Example #7
def redistribute_garbage(data, proportions, package):
    """Redistribute garbage deaths onto target causes."""
    diagnostics = []
    # Expand proportions so there is a row for every signature_id within
    # each proportion_id
    print_log_message("                -Expanding proportions to signature id")
    temp = data[['proportion_id', 'signature_id'
                 ]].drop_duplicates().reset_index(drop=True).copy(deep=True)
    proportions = pd.merge(temp, proportions, on='proportion_id')
    # Tag garbage
    print_log_message("                -Tagging garbage")
    causes = data[['cause']].drop_duplicates()
    causes['garbage'] = 0
    causes.loc[causes['cause'].isin(package['garbage_codes']), 'garbage'] = 1
    cause_garbage_map = causes.set_index('cause').to_dict()['garbage']
    data['garbage'] = data['cause'].map(cause_garbage_map)
    diagnostics.append(data.loc[data['garbage'] == 1])
    # Get total number of garbage codes for each signature_id
    print_log_message("                -Summing garbage per signature id")
    temp = data.loc[data['garbage'] == 1,
                    ['proportion_id', 'signature_id', 'freq']].groupby(
                        ['proportion_id', 'signature_id']).sum().reset_index()
    temp = temp.rename(columns={'freq': 'garbage'})
    print_log_message("                -Splitting garbage onto targets: merge")
    # Redistribute garbage onto targets
    additions = pd.merge(proportions,
                         temp,
                         on=['proportion_id', 'signature_id'],
                         how='outer')
    print_log_message(
        "                -Splitting garbage onto targets: multiply")
    for c in ['proportion', 'garbage']:
        additions[c] = additions[c].fillna(0)
    additions['freq'] = additions['proportion'] * additions['garbage']
    additions = additions.loc[
        additions['freq'] > 0,
        ['signature_id', 'proportion_id', 'cause', 'freq']]
    diagnostics.append(additions)
    print_log_message(
        "                -Appending split garbage onto non-garbage")
    # Zero out garbage codes
    data.loc[data['garbage'] == 1, 'freq'] = 0
    # Tack on redistributed data
    data = pd.concat([data, additions])
    data = data.loc[:, ['proportion_id', 'signature_id', 'cause', 'freq']]
    data = data.reset_index(drop=True)
    # Create diagnostics
    print_log_message("                -Making diagnostic dataframe")
    diagnostics = pd.concat(diagnostics)
    diagnostics['garbage'] = diagnostics['garbage'].fillna(0)
    # Collapse to proportion id
    diagnostics = diagnostics.groupby(['proportion_id', 'garbage',
                                       'cause'])['freq'].sum().reset_index()
    # Return outputs
    return data, diagnostics
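The redistribution step itself is a merge-and-multiply: garbage deaths per signature are split across target causes according to the proportions, the garbage rows are zeroed, and the new rows are appended. A toy sketch of the core arithmetic (not the example's real data):

import pandas as pd

garbage = pd.DataFrame({'signature_id': [1], 'garbage': [10.0]})
proportions = pd.DataFrame({'signature_id': [1, 1],
                            'cause': ['a', 'b'],
                            'proportion': [0.25, 0.75]})
additions = proportions.merge(garbage, on='signature_id')
additions['freq'] = additions['proportion'] * additions['garbage']
# cause 'a' receives 2.5 deaths and 'b' receives 7.5; the original garbage
# row is then set to zero and these rows are concatenated onto the data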
Example #8
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid,
        'source',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    if ((data_type_id in [8, 9, 10] and (source != "Other_Maternal"))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid,
                                    extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]

        if "HH_SURVEYS" in model_group:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)


    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    df = df.query('sample_size != 0')

    df = df.loc[df['year_id'] >= 1980]

    print_log_message("Enforcing age sex restrictions")

    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************

    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df, cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************

    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw', 'cf_rd',
        'extract_type_id', 'location_id', 'nid', 'sample_size', 'sex_id',
        'site_id', 'year_id'
    ]

    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]

    return df
Example #9
    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        Args:
            df (DataFrame): data with cause

        Returns:
            DataFrame: with any modifications
        """

        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }
        # Some SRS codes get redistributed differently than
        # other ICD10 datasets
        df = add_nid_metadata(
            df, 'source', **cache_args
        )

        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(
                df, 'value', code_system_id=code_system_id,
                **cache_args
            )

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups")
            )
            custom_grbg = custom_grbg.query('active == 1')
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(
                custom_grbg, 'code_id', code_system_id=code_system_id,
                merge_col='value', **cache_args
            )
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                gp_df = get_garbage_from_package(
                    code_system_id, package_id, package_arg_type="package_id"
                )
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
        # J96.00 - move five digit detail down to the four digit code J96.0
        # (this should be a rule in formatting: only keep 4 digit detail)
        five_dig_code = df['code_id'] == 13243
        df.loc[
            china_cdc_2008 & five_dig_code,
            'code_id'
        ] = 13242

        return df
Example #10
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    # get filepaths
    cache_dir = CONF.get_directory('db_cache')

    orig_deaths = df['deaths'].sum()

    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    code_system_id = get_value_from_nid(nid,
                                        'code_system_id',
                                        extract_type_id=extract_type_id)

    code_system = get_code_system_from_id(code_system_id,
                                          **standard_cache_options)

    source = get_value_from_nid(nid, 'source', extract_type_id=extract_type_id)

    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)

    is_vr = data_type_id in [9, 10]

    # run hiv correction on VR, but not Other_Maternal; the countries to
    # correct are further pruned by the master cause selections csv in the
    # hiv corrector class
    if not skip_hiv_correction(source) and is_vr:

        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)

        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)

        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)

        # get the country
        iso3 = get_value_from_nid(
            nid,
            'iso3',
            extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), "Could not find iso3 for nid {}, " \
            "extract_type_id {}".format(nid, extract_type_id)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    # apply redistribution of LRI to tb in under 15, non-neonatal ages based
    # on location/year specific proportions
    print_log_message(
        "Applying special redistribution of LRI to TB in under 15")
    lri_tb_redistributor = LRIRedistributor(df, cause_meta_df)
    df = lri_tb_redistributor.get_computed_dataframe()

    # merge in raw and rd here because recodes and bridge mapping should
    # also apply to the causes that are in previous phases (raw deaths for
    # secret codes need to be moved up to their parent cause, for example)
    df = combine_with_rd_raw(df, nid, extract_type_id, location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()

    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError("Change of {} deaths [{}] to [{}]".format(
            diff, old, new))

    return df
Example #11
def run_pipeline(year, source, int_cause, code_system_id, code_map_version_id,
                 cause_set_version_id, nid, extract_type_id, data_type_id,
                 inj_garbage, diagnostic_acauses=None,
                 explore=False, drop_p2=False):
    """Clean, map, and prep data for next steps."""

    print_log_message("Formatting data")
    formatting_method, args = get_formatting_method(
        source, data_type_id, year, drop_p2=drop_p2)
    df = formatting_method(*args)

    print_log_message("Dropping rows without multiple cause")
    df = drop_non_mcause(df, explore)

    print_log_message("Mapping data")
    Mapper = MCoDMapper(int_cause, code_system_id,
                        code_map_version_id, drop_p2=drop_p2)
    df = Mapper.get_computed_dataframe(df)

    cause_cols = [x for x in list(df) if "cause" in x
                  and not x.endswith("code_original")
                  and not x.endswith(f"{int_cause}")]
    cause_cols.remove("cause_id")
    # keep the original "cause" information: after mapping to cause ids, the
    # "cause" col holds a string name from the CoD cause map
    # (e.g. code id 103591)
    if source == "USA_NVSS":
        if code_system_id == 1:
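            # keep the mapped value only when it looks like an ICD-10 style
            # code (a letter followed by 2-4 digits) or is exactly "0000";
            # otherwise fall back to the original raw code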
            for col in cause_cols:
                df.loc[~(df[f"{col}"].str.match(
                    "(^[A-Z][0-9]{2,4}$)|(^0000$)")),
                    col] = df[f"{col}_code_original"]

    if inj_garbage:
        # FYI: this was a last-minute addition to make plots of %X59/Y34 of
        # injuries garbage for my manuscript; it's not needed for any analysis
        print_log_message(
            "subsetting to only rows with UCOD as injuries garbage codes")
        package_list = pd.read_excel(
            "/homes/agesak/thesis/maps/package_list.xlsx",
            sheet_name="mohsen_vetted")

        # get a list of all injuries garbage package names
        inj_packages = package_list.package_name.unique().tolist()

        # get the garbage codes associated with these garbage packages
        garbage_df = engine_room.get_package_list(
            code_system_or_id=code_system_id, include_garbage_codes=True)

        # subset df to only rows with injuries garbage as UCOD
        df = apply_garbage_map(df, garbage_df, inj_packages)
    else:
        causes = get_most_detailed_inj_causes(
            int_cause, cause_set_version_id=cause_set_version_id,
            **{'block_rerun': True, 'force_rerun': False})
        # subset to rows where the UCOD is a most detailed injury cause, or
        # the UCOD is garbage (cause_id 743) and the {int_cause} (X59/Y34)
        # flag is set
        df = df.loc[(df.cause_id.isin(causes)) | (
            (df[f"{int_cause}"] == 1) & (df.cause_id == 743))]

        df = format_for_bow(df, code_system_id)
        # drop other columns ending in {int_cause} or code_original and
        # columns starting with pII, keeping the summary int_cause,
        # pII_{int_cause}, and cause_{int_cause} columns
        df = df[[x for x in list(df) if not (x.endswith(f"{int_cause}")
                                             or x.endswith("code_original")
                                             or x.startswith("pII"))] + [
            int_cause, f"pII_{int_cause}", f"cause_{int_cause}"]]
    return df
Example #12
def main(model_group, location_set_version_id, cause_set_version_id,
         launch_set_id):
    print_log_message(
        "Beginning NR modeling for model_group {}".format(model_group)
    )

    cache_dir = CONF.get_directory('db_cache')
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    print_log_message("Preparing location hierarchy")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **read_file_cache_options
    )

    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **read_file_cache_options
    )

    age_meta_df = get_ages(
        **read_file_cache_options
    )

    print_log_message("Preparing model data")
    model_df = get_model_data(
        model_group, location_hierarchy, location_set_version_id, cause_meta_df
    )

    print_log_message("Got {} rows of model data".format(len(model_df)))
    if len(model_df) == 0:
        print_log_message("Exiting...")
        return

    # 'deaths' is re-made in nrmodelworker.R
    model_df = model_df.drop('deaths', axis=1)

    # get the unique cause_ids by code_system
    code_system_cause_dict = get_code_system_cause_ids(model_df)

    # square data for certain data types
    if (model_group.startswith("VR")) or (model_group.startswith("Cancer")):
        print_log_message(
            "Bringing back zeros (squaring) so noise reduction "
            "knows to depress time series"
        )
        squarer = Squarer(cause_meta_df, age_meta_df)
        model_df = squarer.get_computed_dataframe(model_df)

    elif "HH_SURVEYS" in model_group:
        model_df = square_dhs_data(model_df, cause_meta_df,
                                   age_meta_df, location_hierarchy)

    print_log_message(log_statistic(model_df))

    print_log_message("Restricting model data to only existing cause_ids")
    model_df = restrict_to_cause_ids(code_system_cause_dict, model_df)

    print_log_message("Adding NR location info")
    model_df = format_for_nr(model_df, location_hierarchy)
    if model_group_is_run_by_cause(model_group):
        run_phase_by_cause(model_df, model_group, launch_set_id)
    else:
        run_phase_by_model_group(model_df, model_group, launch_set_id)

    print_log_message("Job complete. Exiting...")
Example #13
def run_phase_by_cause(model_df, model_group, launch_set_id):
    """Run the model, parallelizing by country and cause."""
    # get cause list
    nocause = model_df[model_df['cause_id'].isnull()]
    if len(nocause) > 0:
        raise AssertionError("Have {} rows with missing cause: {}".format(
            len(nocause),
            nocause
        ))
    causes = list(set(model_df['cause_id']))
    causes = [int(cause) for cause in causes]

    print_log_message(
        "Writing NR input file and submitting jobs for "
        "{} causes".format(len(causes)))

    log_base_dir = "FILEPATH" \
                   "{launch_set_id}".format(
                       user=getpass.getuser(),
                       launch_set_id=launch_set_id)
    claude_dir = CONF.get_directory('claude_code')
    worker = "{claude}/run_phase_nrmodelworker.R".format(claude=claude_dir)
    slots = 5
    if model_group == 'VR-GBR':
        cores = 25
    else:
        cores = 15
    subnat_iso3s = CONF.get_id('subnational_modeled_iso3s')
    for subnat_iso3 in subnat_iso3s:
        if model_group == "VR-{}".format(subnat_iso3):
            slots = 18
    if model_group == 'VR-GBR':
        slots = 100
    language = "r"

    num_draws = CONF.get_resource('uncertainty_draws')
    if not modelgroup_has_redistribution_variance(model_group):
        num_draws = 0

    for cause_id in causes:
        write_nrmodel_data(
            model_df, model_group, launch_set_id, cause_id=cause_id)
        params = [
            model_group, str(launch_set_id),
            str(num_draws), str(cause_id)
        ]
        jobname = "claude_nrmodelworker_{model_group}_{cause_id}".format(
            model_group=model_group, cause_id=cause_id)

        submit_cod(
            jobname,
            slots,
            language,
            worker,
            cores=cores,
            params=params,
            verbose=(launch_set_id == 0),
            logging=True,
            log_base_dir=log_base_dir
        )

    wait("claude_nrmodelworker_{model_group}".format(
        model_group=model_group), 30)

    nr_dir = CONF.get_directory('nr_process_data')
    iso_dir = "FILEPATH".format(
        nrdir=nr_dir, model_group=model_group)
    causes_outpath = "FILEPATH".format(iso_dir=iso_dir, lsid=launch_set_id)
    cause_path = "FILEPATH"

    for cause_id in causes:
        outpath = cause_path.format(iso_dir=iso_dir, cause_id=cause_id,
                                    lsid=launch_set_id)

        just_keep_trying(
            os.path.exists,
            args=[outpath],
            max_tries=250,
            seconds_between_tries=6,
            verbose=True
        )

    print_log_message("Writing causes file to {}".format(causes_outpath))
    causes_df = pd.DataFrame({'cause_id': list(causes)})
    causes_df.to_csv(causes_outpath, index=False)
Example #14
def get_model_data(model_group, location_hierarchy,
                   location_set_version_id, cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()

    # need to be strings for the later test that what comes after "VA-" is a
    # super region (otherwise we would have to compare ints, and what's after
    # "VA-" might not be convertible to an int)
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')

    # location id here is the region id, and parent id is the super region id
    # becomes a dictionary from super region id to list of region ids
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict()
    )

    regions_to_ids = location_hierarchy.query(
        'level == 2').set_index('ihme_loc_id')['region_id']

    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}

    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = [8, 12]
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-Nepal-Burden":
            model_group_filters['source'] = "Nepal_Burden_VA"
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            # potential bug from GBD2016 - super region 158 keeps only
            # Pakistan, Nepal, and Bangladesh, and doesn't get India data
            # Also keep Bhutan in case we ever have VA there
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD', 'BTN']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True

    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"

    # keep data by source/iso3/survey type
    # model groups follow MATERNAL-{source}-{iso3} format
    # except for the household surveys within Other_Maternal
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = ["DHS", "RHS", "AHS",
                                                  "DLHS", "NFHS"]
        model_group_filters['iso3'] = model_group[-3:]

    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = [8, 12]
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    elif model_group == "CHAMPS":
        model_group_filters['data_type_id'] = [12]
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group)
        )

    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters
    )
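    # model_df now holds every active, non-dropped aggregation-phase row for
    # 1980 onward that matches the model-group filters assembled above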

    add_cols = ['code_system_id']

    if model_group.startswith(("VA", "MATERNAL", "malaria", "CHAMPS")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')

    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(
        model_df, add_cols, force_rerun=False, block_rerun=True,
        cache_dir='standard', cache_results=False
    )
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat this like Russia_FMD_1989_1998 for purpose of cause list,
        # as it has now been bridge mapped that way
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message(
            "Setting code system to {cs} for {s} "
            "source: {n} rows changed".format(
                cs=replace_csid, s=replace_source, n=num_replace)
        )
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(
        model_df, 'code_system_id', ['nid', 'extract_type_id']
    )

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)

    return model_df
Exemplo n.º 15
0
def square_dhs_data(model_df, cause_meta_df, age_meta_df, location_hierarchy):
    """Special squaring method for DHS.

    We want to represent DHS data as a continuous time series, but
    there are gaps in some location/years. The goal is to produce
    a squared dataframe that can be noise reduced and uploaded as a
    continuous time series.
    """
    # separate DHS from non-DHS data (e.g. NFHS, AHS, DLHS, RHS)
    dhs = model_df['survey_type'] == "DHS"
    non_dhs = model_df[~dhs]
    dhs = model_df[dhs]

    if len(dhs) > 0:
        # get df with id_cols to merge back on after squaring
        nid_loc_df = model_df[
            ['nid', 'location_id', 'site_id', 'extract_type_id']
        ].drop_duplicates()
        print_log_message(
            "Bringing back zeros (squaring) so noise reduction "
            "knows to depress time series"
        )
        squarer = Squarer(cause_meta_df, age_meta_df,
                          location_meta_df=location_hierarchy, dhs=True)
        dhs = squarer.get_computed_dataframe(dhs)

        # fill in some metadata
        dhs['code_system_id'] = dhs['code_system_id'].fillna(177)
        dhs['source'] = dhs['source'].fillna("Other_Maternal")
        dhs['survey_type'] = dhs['survey_type'].fillna("DHS")

        # merge on NIDs, etid
        dhs = nid_loc_df.merge(dhs, on=['location_id', 'site_id'], how='right')
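        # rows created by squaring have no nid/extract_type_id of their own;
        # the right merge keeps them, and pandas suffixes the overlapping
        # columns as *_x (from nid_loc_df) and *_y (from dhs)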

        # issue with Timor-Leste overlapping Indonesia; drop the Indonesia duplicates
        tls = dhs.query('location_id == 19')
        dhs = dhs.query('location_id != 19')
        tls = tls.drop_duplicates(
            subset=['location_id', 'year_id', 'cause_id', 'age_group_id',
                    'sex_id', 'nid_y', 'extract_type_id_y'], keep='first'
        )
        dhs = pd.concat([tls, dhs], ignore_index=True)

        # replace null values with merged
        dhs.loc[dhs['nid_y'].isnull(), 'nid_y'] = dhs['nid_x']
        dhs.loc[
            dhs['extract_type_id_y'].isnull(), 'extract_type_id_y'
        ] = dhs['extract_type_id_x']

        # clean up
        dhs.drop(['extract_type_id_x', 'nid_x', 'iso3'], axis=1, inplace=True)
        dhs.rename(columns={
            'nid_y': 'nid', 'extract_type_id_y': 'extract_type_id'
        }, inplace=True)

        # fill in sample size for rows that were newly created
        # want sample size to be > 0 for noise reduction
        dhs.loc[dhs['sample_size'] == 0, 'sample_size'] = 0.5

        # append all household survey types back together
        model_df = pd.concat([dhs, non_dhs], ignore_index=True)

    assert model_df.notnull().values.all()
    report_duplicates(
        model_df, ['year_id', 'sex_id', 'location_id', 'cause_id',
                   'age_group_id', 'nid', 'extract_type_id', 'site_id'])

    return model_df
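For orientation, here is a small, self-contained pandas sketch (not from this
codebase) of the squaring idea that Squarer implements above: build the full
set of id combinations and left-merge the observed data onto it, filling the
unobserved combinations with zero deaths. The real Squarer also applies
cause/age/sex restrictions and DHS-specific handling that this toy version
omits, and the ids used below are arbitrary placeholders.

import itertools

import pandas as pd


def toy_square(df, causes, ages, sexes, years):
    """Add explicit zero-death rows for unobserved id combinations."""
    full = pd.DataFrame(
        list(itertools.product(causes, ages, sexes, years)),
        columns=['cause_id', 'age_group_id', 'sex_id', 'year_id']
    )
    squared = full.merge(
        df, how='left',
        on=['cause_id', 'age_group_id', 'sex_id', 'year_id']
    )
    squared['deaths'] = squared['deaths'].fillna(0)
    return squared


observed = pd.DataFrame({
    'cause_id': [366], 'age_group_id': [8], 'sex_id': [2],
    'year_id': [2005], 'deaths': [3.0],
})
# the 2006 row comes back with deaths == 0, giving a continuous series
print(toy_square(observed, causes=[366], ages=[8], sexes=[2],
                 years=[2005, 2006]))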
Exemplo n.º 16
0
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()

    super_region_ids = [str(s) for s in super_region_ids]
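    # compared as strings because VA model groups encode the super region id
    # in the group name, e.g. "VA-158"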
    super_region_to_region_ids = location_hierarchy.query('level == 2')

    super_region_to_region_ids = (super_region_to_region_ids[[
        'location_id', 'parent_id'
    ]].groupby('parent_id').apply(
        lambda df: list(set(df['location_id']))).to_dict())

    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']

    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()

    model_group_filters = {}

    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = 8
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True

    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"

    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = [
                "DHS", "RHS", "AHS", "DLHS", "NFHS"
            ]
        model_group_filters['iso3'] = model_group[-3:]

    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = 8
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    else:
        bad_model_group = True
    if bad_model_group:
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))

    model_df = get_claude_data(phase="aggregation",
                               is_active=True,
                               is_dropped=False,
                               location_set_id=35,
                               year_id=range(1980, 2050),
                               assert_all_available=True,
                               location_set_version_id=location_set_version_id,
                               **model_group_filters)

    add_cols = ['code_system_id']

    if model_group.startswith("VA") or model_group.startswith("MATERNAL") or \
            model_group in ["VR-RUS", "VR-R9"] or model_group.startswith('malaria'):
        add_cols.append('source')

    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)

    # add on code_system_id
    model_df = add_nid_metadata(model_df,
                                add_cols,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir='standard',
                                cache_results=False)
    if model_group == "VR-RUS" or model_group == "VR-R9":

        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message("Setting code system to {cs} for {s} "
                          "source: {n} rows changed".format(cs=replace_csid,
                                                            s=replace_source,
                                                            n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid

    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])

    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)

    return model_df