Example #1
def _parallel_merge_split(meid_cause_map, interp_files, output_dir, tmpdir,
                          location_id):
    """Run the merge/split for a single location and write per-cause CSVs.

    Returns (location_id, 0) on success, or (location_id, traceback string)
    on failure, so the caller can collect errors.
    """
    try:
        epi_draw = []
        for f in interp_files:
            epi_draw.append(
                pd.read_hdf(f,
                            'draws',
                            where=["location_id=={}".format(location_id)]))
        epi_draws = pd.concat(epi_draw)

        cd = pd.read_hdf(os.path.join(tmpdir.name, 'source_cause_draws.h5'),
                         'draws',
                         where=['location_id=={}'.format(location_id)])

        draw_cols = [col for col in cd.columns if 'draw_' in col]
        epi_draws = epi_draws[epi_draws['age_group_id'].isin(
            cd['age_group_id'].unique())]

        # these columns are not needed and cause maths.merge_split to break
        drop_cols = ['measure_id', 'model_version_id', 'metric_id']
        cd.drop(drop_cols, axis=1, inplace=True, errors='ignore')
        epi_draws.drop(drop_cols, axis=1, inplace=True, errors='ignore')

        cout = merge_split(
            cd, epi_draws,
            ['year_id', 'age_group_id', 'sex_id', 'location_id'], draw_cols)

        cout = cout.merge(cd[[
            'year_id', 'age_group_id', 'sex_id', 'location_id', 'envelope'
        ]],
                          how='left')
        cout['cause_id'] = cout['modelable_entity_id']
        cout['cause_id'] = cout['cause_id'].replace(meid_cause_map)
        cout['measure_id'] = 1
        for cid in cout.cause_id.unique():
            cid_dir = '{}/{}'.format(output_dir, int(cid))
            cid_dir = cid_dir.replace("\r", "")
            if not os.path.exists(cid_dir):
                makedirs_safely(cid_dir)
            fn = '{}/death_{}.csv'.format(cid_dir, location_id)
            cout.query('cause_id=={}'.format(cid)).to_csv(fn, index=False)
        return location_id, 0
    except Exception:
        tb_str = traceback.format_exc()
        return location_id, tb_str
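
The call to merge_split above is assumed to rescale the source cause draws by the proportion draws, normalized within each group of the merge columns. Below is a minimal pandas sketch of that assumption; scale_source_by_proportions is a hypothetical helper written for illustration, not the real maths.merge_split.

# Minimal sketch of the assumed behaviour of merge_split: split source draws
# across proportion models by normalizing the proportions within each group.
# Illustration only; this is not the actual maths.merge_split implementation.
import pandas as pd

def scale_source_by_proportions(source_df, prop_df, group_cols, draw_cols):
    # Sum the proportion draws within each demographic group ...
    totals = prop_df.groupby(group_cols)[draw_cols].transform('sum')
    # ... normalize so the proportions sum to 1 within each group ...
    scaled = prop_df.copy()
    scaled[draw_cols] = prop_df[draw_cols] / totals
    # ... then multiply by the matching source draws for the same group.
    merged = scaled.merge(source_df[group_cols + draw_cols],
                          on=group_cols, suffixes=('', '_source'))
    source_draws = [f'{d}_source' for d in draw_cols]
    merged[draw_cols] = merged[draw_cols].values * merged[source_draws].values
    return merged.drop(columns=source_draws)
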
Example #2
    def run_all(self, n_processes: int = 8):
        """
        Creates a process_version (with db tables), performs
        cause decomposition, saves to file, and uploads results to
        the db. Use the n_processes argument to determine how many
        parallel processes to run simultaneously, keeping in mind
        that each hits the database a couple of times.
        """
        pvid = self.create_process_version()
        print(f"Process version id {pvid} created on the "
              f"gbd {self.env.value} database.")

        makedirs_safely(self.write_dir)
        print(f"Output directory {self.write_dir} created.")

        self.run_decomp(n_processes=n_processes)
        self.upload()
        process_version = GBDProcessVersion(pvid, env=self.env)
        process_version._update_status(
            gbd.gbd_process_version_status['ACTIVE'])
        print(f"Process version id {pvid} marked active. Run completed")
Example #3
def split_epi_model(source_meid,
                    target_meids,
                    prop_meids,
                    split_measure_ids=[5, 6],
                    prop_meas_id=18,
                    gbd_round_id=gbd.GBD_ROUND_ID,
                    decomp_step=None,
                    output_dir=None):
    """
    Splits modelable entity draws based on proportion models. Outputs are new
    draw files for each proportion model, tagged with the target_meid.

    Arguments:
        source_meid(int):  The id of the me to be split.

        target_meids(intlist): A list of modelable entity ids that will be
            used to tag the new draws. Essentially, these are the me_ids for
            the newly created models.

        prop_meids (intlist): A list of modelable entity ids corresponding
            to the proportion models used to do the split. Note: target_meids
            and prop_meids must have 1-to-1 parity with each other and are
            order dependent.

            Example: target_meids = [2891, 2892, 2893, 2894]
                       prop_meids = [1922, 1920, 1921, 1923]

            Proportion model 1922 will be used to create model 2891,
            proportion model 1920 will be used to create model 2892, and so on.

        split_measure_ids(int or intlist, optional): A list of measure_ids from
            the source model, to be split according to the proportion models.
            Default is [5, 6], prevalence and incidence.

        prop_meas_id(int, optional): The measure id used in the proportion
            models. Default is 18, proportion.

        gbd_round_id(int, optional): The gbd_round_id for the proportion models
            being used. This argument is used to retrieve the best model for a
            given round. Default is the current GBD round.

        decomp_step (str): Decomposition step. Allowed values are None,
            'iterative', 'step1', 'step2', 'step3', 'step4', and 'step5'
            depending on the value of gbd_round_id.

        output_dir(str, optional): The directory where the draw files are
            created. Subdirectories are created for each target modelable
            entity id, and files are tagged by location id.
            Example: output_dir/2891/35.h5 is a draw file for newly created
            model 2891, location_id 35.

    Returns:
        Pandas.DataFrame:
            A one-row dataframe containing either the output directory where
            the draws can be found, or the path to an errors logfile.

    Raises:
        IllegalSplitEpiArgument: If source_meid or prop_meas_id is not an
            integer, if target_meids or prop_meids is not a list of ints,
            if split_measure_ids is not an int or list of ints, or if
            target_meids and prop_meids do not have the same length.

        ValueError: If the decomp_step argument is invalid.
    """

    # Validate Arguments
    validate_decomp_step_input(decomp_step, gbd_round_id)

    validate_source_meid(source_meid)
    validate_measure_id(prop_meas_id)
    validate_meids(target_meids, 'Target')
    validate_meids(prop_meids, 'Proportion')
    validate_split_measure_ids(split_measure_ids)

    if len(target_meids) != len(prop_meids):
        raise IllegalSplitEpiArgument(
            "Target modelable_entity_ids and proportion modelable_entity_ids "
            "lists must represent a 1-to-1 mapping of modelable_entity_ids. "
            "Received: {t_ids} target ids and {p_ids} proportion ids.".format(
                t_ids=len(target_meids), p_ids=len(prop_meids)))

    if output_dir is None:
        output_dir = 'FILEPATH'

    try:
        makedirs_safely(output_dir)
    except Exception:
        pass

    res = launch_epi_splits(source_meid, target_meids, prop_meids,
                            split_measure_ids, prop_meas_id, output_dir,
                            gbd_round_id, decomp_step)

    errors = [r for r in res if r[1] != 0]

    if len(errors) == 0:
        return pd.DataFrame({'output_dir': output_dir}, index=[0])
    else:
        logfile = '{}/{}_errors.log'.format(output_dir, str(source_meid))
        with open(logfile, 'w') as f:
            estr = "\n".join([str(r) for r in errors])
            f.write(estr)

        return pd.DataFrame({'error_dir': logfile}, index=[0])
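
A usage sketch for split_epi_model, reusing the docstring's example me_id mapping; the source me_id, decomp_step, and output_dir below are placeholders.

# Illustrative call only: source_meid 1919, the decomp_step, and the
# output_dir are placeholders; the target/proportion me_ids reuse the
# docstring's example mapping.
result = split_epi_model(
    source_meid=1919,
    target_meids=[2891, 2892, 2893, 2894],
    prop_meids=[1922, 1920, 1921, 1923],
    split_measure_ids=[5, 6],          # prevalence and incidence
    decomp_step='step4',
    output_dir='/path/to/split_epi_output')
print(result)  # one-row DataFrame with 'output_dir' or 'error_dir'
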
Example #4
    parser.add_argument('--gbd_round_id', type=int)
    parser.add_argument('--intermediate_dir', type=str)
    parser.add_argument('--decomp_step', type=str, default=None)

    args = parser.parse_args()
    return (args.gbd_id, args.proportion_measure_id, args.gbd_round_id,
            args.sex_id, args.intermediate_dir, args.decomp_step)


if __name__ == '__main__':

    (gbd_id, measure_id, gbd_round_id, sex_id, outdir,
     decomp_step) = parse_arguments()

    if not os.path.exists(outdir):
        makedirs_safely(outdir)

    end_year = int(gbd_round_from_gbd_round_id(gbd_round_id))

    df = interpolate(gbd_id=gbd_id,
                     gbd_id_type='modelable_entity_id',
                     source='epi',
                     measure_id=measure_id,
                     reporting_year_start=1980,
                     reporting_year_end=end_year,
                     sex_id=sex_id,
                     gbd_round_id=gbd_round_id,
                     decomp_step=decomp_step,
                     num_workers=30)

    id_cols = [col for col in df.columns if col.endswith('_id')]
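
The __main__ block above suggests each interpolation runs as a per-me_id, per-measure, per-sex subjob. A hypothetical launch via subprocess is sketched below; the script name and all values are placeholders, and the --gbd_id, --proportion_measure_id, and --sex_id flags are inferred from the tuple returned by parse_arguments rather than shown in the excerpt.

# Hypothetical launch of the interpolation script for one me_id / measure /
# sex; the script name, flag values, and inferred flag names are placeholders.
import subprocess

cmd = [
    'python', 'interpolate_draws.py',   # placeholder script name
    '--gbd_id', '1922',
    '--proportion_measure_id', '18',
    '--gbd_round_id', '6',
    '--sex_id', '1',
    '--intermediate_dir', '/path/to/intermediate_1919',
    '--decomp_step', 'step4',
]
subprocess.run(cmd, check=True)
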
Example #5
def _launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                       prop_meas_id, gbd_round_id, decomp_step, output_dir,
                       project):
    """
    Split the given source_cause_id given target_meid proportions, saved
    to the target_cause_ids in output_dir.

    Arguments:
        source_cause_id (int): cause_id for the draws to be split
        target_cause_ids (intlist): list of cause_ids that will identify the
            newly output subcauses
        target_meids (intlist): list of proportion models' modelable_entity_ids
            by which the source_cause_id is split to produce the
            target_cause_ids. target_cause_ids and target_meids must be
            specified in the same order
        prop_meas_id (int): The measure_id that identifies the proportion
            in the target_meids to use for the split.
        gbd_round_id (int): the gbd_round_id for models being split.
        decomp_step (str): Specifies which decomposition step the returned
            estimates should be from. If using interpolate for GBD round 6 and
            above, must specify one of 'step1', 'step2', 'step3', 'step4',
            'step5', or 'iterative'.
        output_dir (str): directory where you want final results stored
        project (str): The SGE project to launch split_cod_model subjobs
            to using SplitCodSwarm.

    Returns:
        A list of tuples pairing each location_id with either 0 or an error
        message. This list is then parsed by the central function
        draw_ops.split_cod_model into errors or success messages.
    """

    # setup years, sex restrictions, most detailed locations, etc.

    if gbd_round_id >= 6:
        cause_set_id = COMPUTATION_CAUSE_SET_ID
    else:
        cause_set_id = REPORTING_CAUSE_SET_ID
    causes = get_cause_metadata(
        cause_set_id=cause_set_id,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step).query("cause_id==@source_cause_id")
    sex_ids = []
    if causes['male'].item() != 0:
        sex_ids.append(1)
    if causes['female'].item() != 0:
        sex_ids.append(2)
    if not sex_ids:
        raise ValueError(
            "Source_cause_id {} is restricted for both males and females, "
            "according to cause metadata".format(source_cause_id))

    most_detailed_locs = list(
        get_location_metadata(location_set_id=35,
                              gbd_round_id=gbd_round_id,
                              decomp_step=decomp_step).query(
                                  'most_detailed==1').location_id.unique())
    meid_cause_map = dict(zip(target_meids, target_cause_ids))

    # run interpolating/extrapolating
    intermediate_dir = os.path.join(output_dir,
                                    'intermediate_{}'.format(source_cause_id))
    if not os.path.exists(intermediate_dir):
        makedirs_safely(intermediate_dir)

    swarm = SplitCoDSwarm(source_id=source_cause_id,
                          proportion_ids=target_meids,
                          proportion_measure_id=prop_meas_id,
                          sex_ids=sex_ids,
                          gbd_round_id=gbd_round_id,
                          decomp_step=decomp_step,
                          intermediate_dir=intermediate_dir,
                          outdir=output_dir,
                          project=project)
    swarm.add_interpolate_tasks()
    exit_code = swarm.run()
    if exit_code != 0:
        raise RuntimeError(
            "Interpolating CoD years failed. Check logs in {}.".format(
                output_dir))

    # run splitting
    for cid in target_cause_ids:
        cid_dir = os.path.join(output_dir, str(cid))
        if not os.path.exists(cid_dir):
            makedirs_safely(cid_dir)
    file_list = glob.glob(os.path.join(intermediate_dir, '*.h5'))

    # read in draws for source cause
    source = _get_draws(source_cause_id, gbd_round_id, decomp_step, sex_ids)
    # create a temporary directory to store all the draws from the source cause
    tmpdir = tempfile.TemporaryDirectory(dir=output_dir)
    # save source cause draws to temporary directory
    source.to_hdf(
        os.path.join(tmpdir.name, 'source_cause_draws.h5'),
        key='draws',
        mode='w',
        format='table',
        data_columns=['location_id', 'year_id', 'sex_id', 'age_group_id'])

    run_splits = functools.partial(_parallel_merge_split, meid_cause_map,
                                   file_list, output_dir, tmpdir)
    pool = Pool(30)
    res = pool.map(run_splits, most_detailed_locs)
    pool.close()
    pool.join()
    # clean up tempdir
    tmpdir.cleanup()
    return res
Example #6
def split_cod_model(source_cause_id,
                    target_cause_ids,
                    target_meids,
                    project,
                    prop_meas_id=18,
                    decomp_step=None,
                    gbd_round_id=gbd.GBD_ROUND_ID,
                    output_dir=None):
    """Returns a dataframe containing only the name of the out_dir where this
    function will save your new draws.

    Arguments:
        source_cause_id (int): the cause_id to be split into target_cause_ids.

        target_cause_ids (intlist): the cause_ids that should be produced as a
            result of the split. These should be provided in the same order as
            the target_meids which define the split proportions.

        target_meids (intlist): the modelable_entity_ids containing the
            proportion models by which to split the source_cause_id. These
            should be provided in the same order as the target_cause_ids.
        
        project (str): the team-specific SGE cluster project to launch the
            subjobs of the split_cod_model swarm to.

        prop_meas_id (int): The measure_id that identifies the proportion in
            the target_meids to use for the split. Defaults to measure_id 18,
            proportion.

        decomp_step (str): Specifies which decomposition step the returned
            estimates should be from. Defaults to None. Allowed values are
            None, 'iterative', 'step1', 'step2', 'step3', 'step4', and 'step5'
            depending on the value of gbd_round_id.

        gbd_round_id (int): the gbd_round_id for models being split. Defaults
            to current GBD round id.

        output_dir (str): place where you want new draws to be saved.

    Returns:
        Pandas.DataFrame:
            A one-row dataframe containing either the output directory where
            the draws can be found, or the path to an errors logfile.

    Raises:
        IllegalSplitCoDArgument: If the source_cause_id, any of the
            target_cause_ids or target_meids are invalid, or if the lists
            of target causes and target meids are not one-to-one.
        RuntimeError: If any under-the-hood error is thrown; the message
            directs the user to the log file.
    """
    # Validate all incoming arguments.
    validate_decomp_step_input(decomp_step, gbd_round_id)

    # Validate source cause_id
    validate_ids(source_cause_id, VALID_CAUSE_IDS, 'source_cause_id')
    # Validate target_cause_ids
    validate_ids(target_cause_ids, VALID_CAUSE_IDS, 'target_cause_ids')

    # Validate modelable_entity_ids
    validate_ids(target_meids, VALID_MEIDS, 'proportion_meids')

    if len(target_cause_ids) != len(target_meids):
        raise IllegalSplitCoDArgument(
            "target_cause_ids and target_meids lists must represent a 1-to-1 "
            "mapping of cause_ids to modelable_entity_ids. Received: {t_ids} "
            "target ids and {p_ids} proportion ids.".format(
                t_ids=len(target_cause_ids), p_ids=len(target_meids)))

    if not output_dir:
        output_dir = 'FILEPATH'

    if not os.path.exists(output_dir):
        makedirs_safely(output_dir)

    res = _launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                             prop_meas_id, gbd_round_id, decomp_step,
                             output_dir, project)
    errors = [r for r in res if r[1] != 0]

    if len(errors) == 0:
        return pd.DataFrame({'output_dir': output_dir}, index=[0])
    else:
        logfile = '{}/{}_errors.log'.format(output_dir, str(source_cause_id))
        with open(logfile, 'w') as f:
            estr = "\n".join([str(r) for r in errors])
            f.write(estr)

        return pd.DataFrame({'error_dir': logfile}, index=[0])
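
A usage sketch for split_cod_model; every id, the project name, and the output_dir below are placeholders.

# Illustrative call only: all ids, the project, and the output_dir are
# placeholders.
result = split_cod_model(
    source_cause_id=589,
    target_cause_ids=[997, 998, 999],
    target_meids=[2891, 2892, 2893],
    project='proj_custom_models',
    prop_meas_id=18,                   # proportion
    decomp_step='step4',
    output_dir='/path/to/split_cod_output')
print(result)  # one-row DataFrame with 'output_dir' or 'error_dir'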