Example #1
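The snippets below are excerpts from a larger codebase. Only the standard-library and pandas imports can be stated with confidence; helpers such as CONF, print_log_message, submit_cod, wait, and the claude_* workers come from the surrounding project and are not shown. A minimal sketch of the imports the snippets assume:

import getpass
import os
import subprocess

import pandas as pd
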
def run_phase_by_model_group(model_df, model_group, launch_set_id):
    """Run the model, parallelizing by country."""
    print_log_message("Writing NR input file")
    write_nrmodel_data(model_df, model_group, launch_set_id)

    claude_dir = CONF.get_directory('claude_code')

    params = [model_group, str(launch_set_id)]

    use_stata = model_group.startswith(("VA", "malaria", "CHAMPS"))
    if use_stata:
        worker = "{claude}/run_phase_nrmodelworkerVA.do".format(
            claude=claude_dir)
        work_dir = CONF.get_directory('nr_process_data')
        params.append(work_dir)

        jobname = "claude_nrmodelworker_{model_group}".format(
            model_group=model_group)
        slots = 1
        language = "stata"
        log_base_dir = "FILEPATH" \
                       "{launch_set_id}".format(
                           user=getpass.getuser(),
                           launch_set_id=launch_set_id)

        submit_cod(
            jobname,
            slots,
            language,
            worker,
            params=params,
            verbose=(launch_set_id == 0),
            logging=True,
            log_base_dir=log_base_dir
        )
        wait("claude_nrmodelworker_{model_group}".format(
            model_group=model_group), 30)
    else:
        num_draws = CONF.get_resource('uncertainty_draws')
        if not modelgroup_has_redistribution_variance(model_group):
            num_draws = 0
        params.append(str(num_draws))
        program = "FILEPATH"
        worker = "{claude}/run_phase_nrmodelworker.R".format(claude=claude_dir)

        command = [program, worker] + params

        print_log_message("Starting command: {}".format(command))
        retcode = subprocess.call(command)
        print(retcode)
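
A hypothetical invocation; the frame contents and the model group name are placeholders, not values from the source. Model groups prefixed "VA", "malaria", or "CHAMPS" take the Stata path and are submitted to the cluster, while anything else runs the R worker synchronously:

import pandas as pd

# Placeholder input; the real model_df is produced by the upstream NR prep.
model_df = pd.DataFrame({"location_id": [101], "cause_id": [743],
                         "deaths": [10.0]})

# "VA-example" is a hypothetical group name; its "VA" prefix routes it to
# the Stata worker, while e.g. "VR-example" would route to the R worker.
run_phase_by_model_group(model_df, "VA-example", launch_set_id=0)
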
Example #2
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):
    """String together processes for redistribution."""

    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)

    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")

    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)

    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)

    # tolerate the greater of 2% of original deaths or 5 deaths
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if diff >= diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    print_log_message(before_after_text)

    return df
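
A hypothetical call with placeholder ids; csvid, lsvid, cmvid, and the other version ids are supplied by the pipeline launcher and are not values from the source:

df = run_phase(
    df,
    csvid=1,              # cause_set_version_id (placeholder)
    nid=12345,            # NID of the source dataset (placeholder)
    extract_type_id=1,
    lsvid=1,              # location_set_version_id (placeholder)
    pop_run_id=1,
    cmvid=1,              # code_map_version_id (placeholder)
    launch_set_id=0,
    remove_decimal=True,  # strip "." from ICD codes in the cause map
)
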
Example #3
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):
    """String together processes for redistribution."""

    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    # the iso3 of this data
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    # the code system id
    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    # the data type
    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if diff >= diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    print_log_message(before_after_text)

    return df
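
Both redistribution examples block on a project helper, wait(jobname_pattern, seconds), whose implementation is not shown. A minimal sketch consistent with the call sites, assuming an SGE-style scheduler; the qstat polling here is an illustration, not the project's actual code:

import subprocess
import time

def wait(jobname_pattern, poll_seconds):
    # Sketch only: poll the scheduler until no queued or running jobs
    # whose names contain jobname_pattern remain.
    while True:
        queued = subprocess.run(["qstat", "-xml"], capture_output=True,
                                text=True).stdout
        if jobname_pattern not in queued:
            return
        time.sleep(poll_seconds)
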
Example #4
def run_phase_by_cause(model_df, model_group, launch_set_id):
    """Run the model, parallelizing by country and cause."""
    # get cause list
    nocause = model_df[model_df['cause_id'].isnull()]
    if len(nocause) > 0:
        raise AssertionError("Have {} rows with missing cause: {}".format(
            len(nocause),
            nocause
        ))
    causes = list(set(model_df['cause_id']))
    causes = [int(cause) for cause in causes]

    print_log_message(
        "Writing NR input file and submitting jobs for "
        "{} causes".format(len(causes)))

    log_base_dir = "FILEPATH" \
                   "{launch_set_id}".format(
                       user=getpass.getuser(),
                       launch_set_id=launch_set_id)
    claude_dir = CONF.get_directory('claude_code')
    worker = "{claude}/run_phase_nrmodelworker.R".format(claude=claude_dir)
    slots = 5
    if model_group == 'VR-GBR':
        cores = 25
    else:
        cores = 15
    subnat_iso3s = CONF.get_id('subnational_modeled_iso3s')
    for subnat_iso3 in subnat_iso3s:
        if model_group == "VR-{}".format(subnat_iso3):
            slots = 18
    if model_group == 'VR-GBR':
        slots = 100
    language = "r"

    num_draws = CONF.get_resource('uncertainty_draws')
    if not modelgroup_has_redistribution_variance(model_group):
        num_draws = 0

    for cause_id in causes:
        write_nrmodel_data(
            model_df, model_group, launch_set_id, cause_id=cause_id)
        params = [
            model_group, str(launch_set_id),
            str(num_draws), str(cause_id)
        ]
        jobname = "claude_nrmodelworker_{model_group}_{cause_id}".format(
            model_group=model_group, cause_id=cause_id)

        submit_cod(
            jobname,
            slots,
            language,
            worker,
            cores=cores,
            params=params,
            verbose=(launch_set_id == 0),
            logging=True,
            log_base_dir=log_base_dir
        )

    wait("claude_nrmodelworker_{model_group}".format(
        model_group=model_group), 30)

    nr_dir = CONF.get_directory('nr_process_data')
    iso_dir = "FILEPATH".format(
        nrdir=nr_dir, model_group=model_group)
    causes_outpath = "FILEPATH".format(iso_dir=iso_dir, lsid=launch_set_id)
    cause_path = "FILEPATH"

    for cause_id in causes:
        outpath = cause_path.format(iso_dir=iso_dir, cause_id=cause_id,
                                    lsid=launch_set_id)

        just_keep_trying(
            os.path.exists,
            args=[outpath],
            max_tries=250,
            seconds_between_tries=6,
            verbose=True
        )

    print_log_message("Writing causes file to {}".format(causes_outpath))
    causes_df = pd.DataFrame({'cause_id': causes})
    causes_df.to_csv(causes_outpath, index=False)
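
Example #4 polls for each cause's output file through another project helper, just_keep_trying; its implementation is also not shown, but a minimal retry loop consistent with the call site might look like this (an assumption, not the project's actual code):

import time

def just_keep_trying(func, args=None, max_tries=100,
                     seconds_between_tries=5, verbose=False):
    # Sketch only: call func(*args) until it returns a truthy value,
    # sleeping between attempts; give up after max_tries.
    args = args or []
    for attempt in range(1, max_tries + 1):
        if func(*args):
            return True
        if verbose:
            print("Attempt {} of {} failed; retrying".format(
                attempt, max_tries))
        time.sleep(seconds_between_tries)
    raise RuntimeError("Gave up after {} tries".format(max_tries))
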