Example #1
def get_korean_war_locations():
    side_a = ["China", "Russian Federation"]
    locs = get_current_location_hierarchy()
    side_a = pd.DataFrame(data=side_a, columns=["location_name"])
    side_a = pd.merge(side_a,
                      locs[['location_name', 'location_id']],
                      how='left')
    # keep the immediate subnational children of the side A countries
    side_a = list(locs[locs['parent_id'].isin(
        side_a['location_id'])]['location_id'])

    side_b_us = ['United States']
    side_b_us = pd.DataFrame(data=side_b_us, columns=["location_name"])
    side_b_us = pd.merge(side_b_us,
                         locs[['location_name', 'location_id']],
                         how='left')
    side_b_us = list(locs[locs['parent_id'].isin(
        side_b_us['location_id'])]['location_id'])

    side_b_uk = ['United Kingdom']
    side_b_uk = pd.DataFrame(data=side_b_uk, columns=["location_name"])
    side_b_uk = pd.merge(side_b_uk,
                         locs[['location_name', 'location_id']],
                         how='left')
    side_b_uk = list(locs[locs['parent_id'].isin(
        side_b_uk['location_id'])]['location_id'])
    # UK locations are nested deeper in the hierarchy, so descend two more levels
    side_b_uk = list(locs[locs['parent_id'].isin(side_b_uk)]['location_id'])
    side_b_uk = list(locs[locs['parent_id'].isin(side_b_uk)]['location_id'])
    location_id = [
        16, 76, 18, 179, 125, 82, 80, 89, 101, 71, 155, 44850, 44851
    ] + side_a + side_b_uk + side_b_us
    return location_id
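For context, get_current_location_hierarchy() evidently returns a DataFrame with at least location_name, location_id, and parent_id columns, and each isin(parent_id) pass descends one level of the hierarchy. A toy sketch of that drill-down (the hierarchy values below are invented for illustration):

import pandas as pd

# Toy hierarchy: 1 is a country, 10/11 its subnationals, 100 sits below 10.
locs = pd.DataFrame({
    "location_id": [1, 10, 11, 100],
    "parent_id": [0, 1, 1, 10],
    "location_name": ["CountryA", "ProvA1", "ProvA2", "DistrictA1a"],
})

# one drill-down step: the children of location 1
children = list(locs[locs["parent_id"].isin([1])]["location_id"])
print(children)  # [10, 11]

# a second pass descends another level, as done twice more for the UK above
grandchildren = list(locs[locs["parent_id"].isin(children)]["location_id"])
print(grandchildren)  # [100]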
Example #2
def aggregate_to_country_level(orig_df, location_set_version_id):
    """Aggregate sub nationals to country level."""
    df = orig_df.copy()

    # merge on country level location_ids
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id)
    country_location_ids = \
        get_country_level_location_id(df.location_id.unique(),
                                      location_meta_df)
    df = df.merge(country_location_ids, how='left', on='location_id')
    report_if_merge_fail(df, 'country_location_id', ['location_id'])

    # aggregate sub national locations to national level
    df = df[df['location_id'] != df['country_location_id']]
    df['location_id'] = df['country_location_id']
    df = df.drop(['country_location_id'], axis=1)
    group_cols = [col for col in df.columns if col not in VAL_COLS]
    df = df.groupby(group_cols, as_index=False)[VAL_COLS].sum()
    df['loc_agg'] = 1

    # append aggregates to original dataframe
    orig_df['loc_agg'] = 0
    df = pd.concat([df, orig_df])
    return df
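A toy run of the same aggregate-and-append pattern, with a hypothetical VAL_COLS and two subnational rows, shows how the loc_agg flag later distinguishes aggregates from the original data:

import pandas as pd

VAL_COLS = ["deaths"]  # assumed; stands in for the value columns summed above

df = pd.DataFrame({
    "location_id": [10, 11],        # two subnationals
    "country_location_id": [1, 1],  # both roll up to country 1
    "deaths": [3.0, 4.0],
})

agg = df[df["location_id"] != df["country_location_id"]].copy()
agg["location_id"] = agg["country_location_id"]
agg = agg.drop("country_location_id", axis=1)
group_cols = [col for col in agg.columns if col not in VAL_COLS]
agg = agg.groupby(group_cols, as_index=False)[VAL_COLS].sum()
agg["loc_agg"] = 1
print(agg)  # a single country-level row with deaths == 7.0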
Example #3
def format_source(release_date):
    # read the raw data and the WHO provided country/year map
    df = read_data(release_date)
    country_map = get_country_map(release_date)

    # subset to just the new loc/years
    # also apply location/year restrictions
    df = subset_location_years(df, country_map)

    # map location information
    loc_meta = get_current_location_hierarchy(
        location_set_id=CONF.get_id('location_set'),
        location_set_version_id=CONF.get_id('location_set_version'),
        force_rerun=False,
        block_rerun=True)
    df = get_gbd_locations(df, country_map, loc_meta)

    # replicate age adjustments for WHO data
    df = adjust_WHO_ages(df)

    # Limit the dataframe to the columns needed and melt ages wide to long
    df = melt_df(df)

    # assign age group ids
    df = get_age_group_ids(df)

    # map code ids and apply special remaps
    cause_map = get_cause_map(1, force_rerun=False, block_rerun=True)
    df = map_code_id(df, cause_map)

    # add manual cols and cleanup
    df = cleanup(df)

    # apply nids
    df = map_nids(df, release_date)

    # apply any final special adjustments
    df = apply_special_adjustments(df)

    # final grouping and finalize formatting
    df = df[FINAL_FORMATTED_COLS]
    assert df.notnull().values.all()
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()

    # run finalize formatting
    locals_present = finalize_formatting(df, SYSTEM_SOURCE, write=WRITE)
    nid_meta_df = locals_present['nid_meta_df']

    # update nid metadata status
    if WRITE:
        nid_extracts = nid_meta_df[[
            'nid', 'extract_type_id'
        ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            update_nid_metadata_status(nid,
                                       extract_type_id,
                                       is_active=IS_ACTIVE,
                                       is_mort_active=IS_MORT_ACTIVE)
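The metadata-update loop above iterates over unique (nid, extract_type_id) pairs produced by to_records; the pattern in isolation:

import pandas as pd

nid_meta_df = pd.DataFrame({
    "nid": [101, 101, 202],
    "extract_type_id": [1, 1, 2],
})
pairs = (nid_meta_df[["nid", "extract_type_id"]]
         .drop_duplicates()
         .to_records(index=False))
for nid, extract_type_id in pairs:
    # to_records yields numpy scalars, hence the int() casts in the example
    print(int(nid), int(extract_type_id))  # 101 1, then 202 2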
Example #4
def main(model_group, location_set_version_id, cause_set_version_id,
         launch_set_id):
    print_log_message(
        "Beginning NR modeling for model_group {}".format(model_group))

    cache_dir = CONF.get_directory('db_cache')
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    print_log_message("Preparing location hierarchy")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **read_file_cache_options)

    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **read_file_cache_options)

    age_meta_df = get_ages(**read_file_cache_options)

    print_log_message("Preparing model data")
    model_df = get_model_data(model_group, location_hierarchy,
                              location_set_version_id, cause_meta_df)

    print_log_message("Got {} rows of model data".format(len(model_df)))
    if len(model_df) == 0:
        print_log_message("Exiting...")
        return

    model_df = model_df.drop('deaths', axis=1)

    code_system_cause_dict = get_code_system_cause_ids(model_df)

    if model_group.startswith(("VR", "Cancer")):
        print_log_message("Bringing back zeros (squaring) so noise reduction "
                          "knows to depress time series")
        squarer = Squarer(cause_meta_df, age_meta_df)
        model_df = squarer.get_computed_dataframe(model_df)

    elif "HH_SURVEYS" in model_group:
        model_df = square_dhs_data(model_df, cause_meta_df, age_meta_df,
                                   location_hierarchy)

    print_log_message(log_statistic(model_df))

    print_log_message("Restricting model data to only existing cause_ids")
    model_df = restrict_to_cause_ids(code_system_cause_dict, model_df)

    print_log_message("Adding NR location info")
    model_df = format_for_nr(model_df, location_hierarchy)
    if model_group_is_run_by_cause(model_group):
        run_phase_by_cause(model_df, model_group, launch_set_id)
    else:
        run_phase_by_model_group(model_df, model_group, launch_set_id)

    print_log_message("Job complete. Exiting...")
Example #5
def add_rd_locations(df, lsvid):
    """Merge on location hierarchy specific to redistribution."""
    lhh = get_current_location_hierarchy(location_set_version_id=lsvid,
                                         force_rerun=False,
                                         block_rerun=True,
                                         cache_dir=CACHE_DIR)
    rd_lhh = get_redistribution_locations(lhh)
    df = pd.merge(df, rd_lhh, on='location_id', how='left')
    report_if_merge_fail(df, 'global', 'location_id')
    report_if_merge_fail(df, 'dev_status', 'location_id')

    return df
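report_if_merge_fail is called throughout these examples but never defined in them; a minimal sketch of the check it presumably performs, assuming it raises when a left merge leaves nulls behind (the real helper may report differently):

def report_if_merge_fail(df, check_col, merge_cols):
    """Raise if any rows failed to merge, i.e. check_col is null."""
    merge_fail = df[df[check_col].isnull()]
    if len(merge_fail) > 0:
        raise AssertionError(
            "Merge failed for {} rows on {}:\n{}".format(
                len(merge_fail), merge_cols, merge_fail))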
Example #6
    def __init__(self):
        self.cg = Configurator('standard')
        self.cache_dir = self.cg.get_directory('db_cache')
        # set test to True to skip writing any output files
        self.test = False
        self.cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': self.cache_dir
        }
        self.dataset_filters = {
            'data_type_id': [8, 9, 10, 12],
            'location_set_id': 35,
            'is_active': True,
            'year_id': range(1980, 2050)
        }
        self.national_nids = self.cg.get_resource("nid_replacements")

        # resources
        self.completeness = self.cg.get_resource("completeness")
        self.env_meta_df = get_env(env_run_id=self.cg.get_id('env_run'),
                                   **self.cache_options)
        self.location_meta_df = get_current_location_hierarchy(
            location_set_version_id=self.cg.get_id('location_set_version'),
            **self.cache_options)
        self.cod_ages = list(
            get_cod_ages(**self.cache_options)['age_group_id'].unique())

        # identifiers
        self.source_cols = ["source", "nid", "data_type_id"]
        self.geo_cols = ["location_id", "year_id"]
        self.meta_cols = ["nationally_representative", "detail_level_id"]
        self.value_cols = ['deaths']
        self.year_end = self.cg.get_id('year_end')
        self.full_time_series = "full_time_series"

        # directories
        self.current_best_version = "2018_04_03_151739"
        self.out_dir = "FILEPATH"
        self.arch_dir = "{}/_archive".format(self.out_dir)
        self.timestamp = cod_timestamp()
Example #7
def run_proportions_prep(shared_package_id, outdir, vr_pull_timestamp, data_id,
                         test=False):

    location_set_version = CONF.get_id('location_set_version')

    location_hierarchy = get_current_location_hierarchy(
        # location_set_version_id=location_set_version,
        gbd_round_id=5  # FIXME: change this when covars have all loc values!
    )

    reg_spec = get_regression_specification(shared_package_id)

    input_data_path = "FILEPATH".format(outdir, data_id)

    print_log_message("Running input data prep")
    df = pull_vr_data_for_rdp_reg(
        reg_spec, location_hierarchy, vr_pull_timestamp=vr_pull_timestamp,
        data_id=data_id, small_test=test
    )
    print_log_message("Formatting regression input")
    df = format_reg_data_for_modeling(
        df, reg_spec, location_hierarchy
    )
    # df = df.rename(columns={
    #     'prop_pkgtarg_target': 'cf_target',
    #     'prop_pkgtarg_garbage': 'cf_garbage'
    # })
    print_log_message("Writing regression input")
    df.to_csv(input_data_path, index=False)

    # make square df
    print_log_message("Writing square dataset")
    square_df = prepare_square_df(df, location_hierarchy, reg_spec)
    square_df = add_model_group(square_df, with_age=False)
    square_df.to_csv("FILEPATH".format(outdir, data_id),
                     index=False)

    print_log_message("Done")
Example #8
def run_pipeline(nid,
                 extract_type_id,
                 launch_set_id,
                 df,
                 code_system_id,
                 cause_set_version_id,
                 location_set_version_id,
                 pop_run_id,
                 env_run_id,
                 distribution_set_version_id,
                 diagnostic=False):
    """Run the full pipeline"""

    cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': CONF.get_directory('FILEPATH'),
        'verbose': False
    }

    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)

    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)

    source = get_value_from_nid(nid, "source", extract_type_id)

    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)

    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) > 0:
        # make sure six minor territories are grouped correctly
        assert_no_six_minor_territories(df)

        # run mapping
        print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
        Mapper = GBDCauseMapper(cause_set_version_id, code_map)
        df = Mapper.get_computed_dataframe(df, code_system_id)
        if diagnostic:
            write_phase_output(df,
                               'mapping',
                               nid,
                               extract_type_id,
                               launch_set_id,
                               sub_dirs='diagnostic')

        print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
        # run age sex splitting
        MySplitter = AgeSexSplitter(cause_set_version_id,
                                    pop_run_id,
                                    distribution_set_version_id,
                                    verbose=True,
                                    collect_diagnostics=False)

        df = MySplitter.get_computed_dataframe(df, location_meta_df)
        if diagnostic:
            diag_df = MySplitter.get_diagnostic_dataframe()
            write_phase_output(diag_df,
                               'agesexsplit',
                               nid,
                               extract_type_id,
                               launch_set_id,
                               sub_dirs='diagnostic')

        print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
        # run restrictions corrections
        Corrector = RestrictionsCorrector(code_system_id,
                                          cause_set_version_id,
                                          collect_diagnostics=False,
                                          verbose=True)
        df = Corrector.get_computed_dataframe(df)

        # calculate cc_code for some sources
        if source in ['Iran_maternal_surveillance', 'Iran_forensic']:
            env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
            df = calculate_cc_code(df, env_meta_df, code_map)
            print("\nDeaths after adding cc_code: {}".format(df.deaths.sum()))

        # adjust deaths for New Zealand by maori/non-maori ethnicities
        if source in ["NZL_MOH_ICD9", "NZL_MOH_ICD10"]:
            df = correct_maori_non_maori_deaths(df)
            print("\nDeaths after Maori/non-Maori adjustment: {}".format(
                df.deaths.sum()))

        print("\nDeaths at END: {}".format(df.deaths.sum()))

    return df
Example #9
def get_locations():
    """Fetch the best location hierarchy in the location set."""
    locations = get_current_location_hierarchy(
        location_set_version_id=CONF.get_id("location_set_version"))
    return locations
Example #10
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):

    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)

    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")

    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)

    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)

    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
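The closing sanity check tolerates a drift of up to 2% of the original deaths, floored at 5 deaths for small datasets; the rule in isolation:

# hedged restatement of the tolerance rule above, for illustration only
def deaths_within_tolerance(before, after):
    """Allow up to 2% of the original total or 5 deaths, whichever is greater."""
    return abs(before - after) < max(.02 * before, 5)

assert deaths_within_tolerance(1000, 1015)      # 15 < max(20, 5)
assert not deaths_within_tolerance(1000, 1030)  # 30 >= max(20, 5)
assert deaths_within_tolerance(100, 104)        # small totals get the 5-death floor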
Example #11
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):
    """String together processes for redistribution."""

    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    # the iso3 of this data
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    # the code system id
    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    # the data type
    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    # NO SPLIT GROUP NEEDED
    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
Example #12
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    cache_dir = CONF.get_directory('FILEPATH')

    orig_deaths = df['deaths'].sum()

    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    code_system_id = get_value_from_nid(nid,
                                        'code_system_id',
                                        extract_type_id=extract_type_id)

    # this queries the database, maybe should be passed in directly
    code_system = get_code_system_from_id(code_system_id)

    source = get_value_from_nid(nid, 'source', extract_type_id=extract_type_id)

    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)

    is_vr = data_type_id in [9, 10]

    if not skip_hiv_correction(source) and is_vr:

        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)

        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)

        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)

        # get the country
        iso3 = get_value_from_nid(
            nid,
            'iso3',
            extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), "Could not find iso3 for nid {}, " \
            "extract_type_id {}".format(nid, extract_type_id)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    df = combine_with_rd_raw(df, nid, extract_type_id, location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()

    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError("Change of {} deaths [{}] to [{}]".format(
            diff, old, new))

    return df
Example #13
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid,
        'source',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    # Rake if appropriate based on this logic
    if ((data_type_id in [8, 9, 10] and (source != 'Other_Maternal'))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid,
                                    extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]

        if "HH_SURVEYS" in model_group:
            if model_group == 'MATERNAL-HH_SURVEYS-IND':
                print_log_message("Raking sub national estimates," \
                    " applying double raking for India Maternal"
                )
                raker = Raker(df, source, double=True)
                df = raker.get_computed_dataframe(location_hierarchy)
            else:
                print_log_message("Raking sub national estimates")
                raker = Raker(df, source)
                df = raker.get_computed_dataframe(location_hierarchy)


    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************

    # data with zero sample size is almost certainly some anomalous result
    # of a program generating data it shouldn't have, and it shouldn't be
    # included in codem models. Was probably already dropped, anyway, before
    # running noise reduction.
    df = df.query('sample_size != 0')

    # uploading data before 1980 is a waste of space because neither codem
    # nor codviz uses it
    df = df.loc[df['year_id'] >= 1980]

    print_log_message("Enforcing age sex restrictions")
    # this actually drops data from the dataframe if it violates age/sex
    # restrictions (e.g. male maternity disorders)
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************

    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df, cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************

    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw', 'cf_rd',
        'extract_type_id', 'location_id', 'nid', 'sample_size', 'sex_id',
        'site_id', 'year_id'
    ]
    # Use draws to make metrics for uncertainty to
    # be used by CODEm and CoDViz
    # also creates cf_final from mean of draws
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    # as in the old code: no cfs over 1 or below 0
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]

    return df
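The clamping loop near the end of this example could equivalently use pandas' built-in clip, which treats out-of-range and NaN values the same way as the .loc assignments above:

for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
    df[cf_col] = df[cf_col].clip(lower=0, upper=1)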
Example #14
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid,
        'source',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    if ((data_type_id in [8, 9, 10] and (source != "Other_Maternal"))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(nid,
                                    extract_type_id,
                                    block_rerun=True,
                                    force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]

        if "HH_SURVEYS" in model_group:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)


    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    df = df.query('sample_size != 0')

    df = df.loc[df['year_id'] >= 1980]

    print_log_message("Enforcing age sex restrictions")

    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************

    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df, cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************

    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw', 'cf_rd',
        'extract_type_id', 'location_id', 'nid', 'sample_size', 'sex_id',
        'site_id', 'year_id'
    ]

    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]

    return df
Example #15
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    # get filepaths
    cache_dir = CONF.get_directory('db_cache')

    orig_deaths = df['deaths'].sum()

    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    code_system_id = get_value_from_nid(nid,
                                        'code_system_id',
                                        extract_type_id=extract_type_id)

    code_system = get_code_system_from_id(code_system_id,
                                          **standard_cache_options)

    source = get_value_from_nid(nid, 'source', extract_type_id=extract_type_id)

    data_type_id = get_value_from_nid(
        nid,
        'data_type_id',
        extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)

    is_vr = data_type_id in [9, 10]

    # run hiv correction on VR, but not Other_Maternal
    # countries to correct will be further pruned by the master cause
    # selections csv in the hiv corrector class
    if not skip_hiv_correction(source) and is_vr:

        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)

        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)

        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)

        # get the country
        iso3 = get_value_from_nid(
            nid,
            'iso3',
            extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), "Could not find iso3 for nid {}, " \
            "extract_type_id {}".format(nid, extract_type_id)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    # apply redistribution of LRI to tb in under 15, non-neonatal ages based
    # on location/year specific proportions
    print_log_message(
        "Applying special redistribution of LRI to TB in under 15")
    lri_tb_redistributor = LRIRedistributor(df, cause_meta_df)
    df = lri_tb_redistributor.get_computed_dataframe()

    # merge in raw and rd here because recodes and bridge mapping should
    # also apply to the causes that are in previous phases (raw deaths for
    # secret codes need to be moved up to their parent cause, for example)
    df = combine_with_rd_raw(df, nid, extract_type_id, location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()

    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError("Change of {} deaths [{}] to [{}]".format(
            diff, old, new))

    return df