def match_with_dimensions(df, dimensions_df):
    merge_cols = [
        'measure_id', 'location_id', 'year_id', 'sex_id', 'age_group_id',
        'cause_id', 'rei_id', 'star_id'
    ]
    dimensions_df = get_dimensions(dimensions_df, merge_cols)
    MPGlobals.logger.info("match_with_dimensions df {}".format(
        get_index_columns(df)))
    MPGlobals.logger.info("match_with_dimensions dimensions_df {}".format(
        get_index_columns(dimensions_df)))
    df = pd.merge(df, dimensions_df, on=merge_cols)
    return df
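A minimal sketch of the merge semantics this relies on, using toy frames and a subset of merge_cols invented purely for illustration: pd.merge defaults to an inner join, so draw rows with no matching dimensions row are dropped.

import pandas as pd

# Toy stand-ins for the real draw and dimensions frames.
draws = pd.DataFrame({
    'measure_id': [1, 1],
    'location_id': [101, 102],
    'draw_0': [0.4, 0.7],
})
dimensions = pd.DataFrame({
    'measure_id': [1],
    'location_id': [101],
})

# Default how='inner', mirroring pd.merge(df, dimensions_df, on=merge_cols).
matched = pd.merge(draws, dimensions, on=['measure_id', 'location_id'])
print(matched)  # only the location_id == 101 row survives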
Example #2
def risk_attr_burden_to_paf(
    risk_cause_df,
    hundred_percent_pafs_df,
    value_cols,
    demographic_cols=[
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id'
    ],
):
    """Takes a dataframe whose values represent risk-attributable burden
    and convert those to PAFs"""
    # Get the cause-level envelope

    burden_by_cause = risk_cause_df.query(
        'rei_id == @gbd.risk.TOTAL_ATTRIBUTABLE and '
        'star_id == @gbd.star.ANY_EVIDENCE_LEVEL')

    logger.info("APPLY PAFS BEGIN burden_by_cause {}".format(
        get_index_columns(burden_by_cause)))
    # Merge cause-level envelope onto data
    paf_df = risk_cause_df.merge(burden_by_cause,
                                 on=demographic_cols + ['cause_id'],
                                 suffixes=('', '_bbc'))
    # Divide attributable burden by cause-level envelope
    bbc_vcs = ["{}_bbc".format(col) for col in value_cols]
    paf_df[value_cols] = paf_df[value_cols].values / paf_df[bbc_vcs].values
    paf_df[value_cols] = paf_df[value_cols].fillna(0)

    # Set certain cause-risk pairs to 100% PAFs.
    # This should not happen for age-standardized PAFs.
    if hundred_percent_pafs_df.empty:
        logger.debug("No hundred-percent PAFs detected")
    else:
        hundred_percent_pafs_df['full_paf'] = 1
        paf_df = pd.merge(paf_df,
                          hundred_percent_pafs_df,
                          on=['cause_id', 'rei_id'],
                          how='left')

        set_to_one = ((paf_df['full_paf'] == 1) &
                      (paf_df['age_group_id'] != gbd.age.AGE_STANDARDIZED))
        paf_df.loc[set_to_one, value_cols] = 1.0

    # Change metric to percent
    paf_df['metric_id'] = gbd.metrics.PERCENT
    logger.info("APPLY PAFS END")
    # Keep only the columns we need
    keep_cols = demographic_cols + ['cause_id', 'rei_id', 'star_id'
                                    ] + value_cols
    return paf_df[keep_cols]
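A minimal sketch of the arithmetic above, with toy values invented for illustration; rei_id 0 stands in here for the total-attributable envelope row that the real code selects via gbd.risk.TOTAL_ATTRIBUTABLE.

import pandas as pd

rows = pd.DataFrame({
    'cause_id': [302, 302],
    'rei_id':   [83, 0],       # 0 = hypothetical stand-in for the envelope row
    'draw_0':   [20.0, 80.0],  # attributable burden vs. cause-level envelope
})
# Pull out the envelope rows and merge them back on, as the function does.
envelope = rows.loc[rows.rei_id == 0, ['cause_id', 'draw_0']]
paf = rows.merge(envelope, on='cause_id', suffixes=('', '_bbc'))
# PAF = attributable burden / cause-level envelope.
paf['draw_0'] = paf['draw_0'] / paf['draw_0_bbc']
print(paf[['cause_id', 'rei_id', 'draw_0']])  # rei_id 83 gets a PAF of 0.25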
def write_draws(df, out_dir, measure_label, location_id, year_id,
                write_out_star_ids):
    """Write draws to the appropriate file for the given loc-year-measure"""
    if measure_label == 'death':
        measure_id = gbd.measures.DEATH
    elif measure_label == 'yll':
        measure_id = gbd.measures.YLL
    elif measure_label == 'yld':
        measure_id = gbd.measures.YLD
    elif measure_label == 'daly':
        measure_id = gbd.measures.DALY
    else:
        raise ValueError(
            "Unrecognized measure_label {!r}".format(measure_label))

    filename = get_input_args.calculate_output_filename(
        out_dir, measure_id, location_id, year_id)
    sink = HDFDataSink(filename)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    cols = get_index_columns(df)
    sink.write(df, id_cols=cols)
    MPGlobals.logger.info(
        "DONE write df at time {}, for measure_id={}, file {}".format(
            time.time(), measure_id, filename))
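As a design note, the label-to-id chain in write_draws could also be a lookup table. A hedged sketch, assuming the same gbd constants module the surrounding code imports; the helper name is hypothetical.

MEASURE_ID_BY_LABEL = {
    'death': gbd.measures.DEATH,
    'yll': gbd.measures.YLL,
    'yld': gbd.measures.YLD,
    'daly': gbd.measures.DALY,
}


def measure_id_for_label(measure_label):
    """Return the measure_id for a label, failing loudly on unknown labels."""
    try:
        return MEASURE_ID_BY_LABEL[measure_label]
    except KeyError:
        raise ValueError(
            "Unrecognized measure_label {!r}".format(measure_label))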
def back_calc_pafs(df, n_draws):
    """
    Back calculate PAFs for each cause-risk pair
    We should only back-calc in number space, or age-standardized.
    """
    MPGlobals.logger.info("start back-calculating PAFs, n_draws = {}, time = "
                          "{}".format(n_draws, time.time()))
    index_cols = get_index_columns(df)
    MPGlobals.logger.info("BCP columns = {}".format(index_cols))
    default_draw_cols = ['draw_{}'.format(dn) for dn in range(n_draws)]
    pafs_df = risk_attr_burden_to_paf(
        df[df.metric_id == gbd.metrics.NUMBER],
        MPGlobals.data_container['cause_risk_metadata'], default_draw_cols)
    age_std_pafs_df = risk_attr_burden_to_paf(
        df[df.age_group_id == gbd.age.AGE_STANDARDIZED],
        MPGlobals.data_container['cause_risk_metadata'], default_draw_cols)
    pafs_df = pd.concat([pafs_df, age_std_pafs_df])
    pafs_df = pafs_df.loc[pafs_df['rei_id'] != 0]
    MPGlobals.logger.info("back-calculating PAFs complete, time = "
                          "{}".format(time.time()))
    return pafs_df
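A minimal sketch of the two slices back_calc_pafs converts, using hypothetical stand-ins for the gbd constants: rows in number space plus age-standardized rows, per the docstring above.

import pandas as pd

NUMBER = 1             # hypothetical stand-in for gbd.metrics.NUMBER
RATE = 3               # hypothetical stand-in for gbd.metrics.RATE
AGE_STANDARDIZED = 27  # hypothetical stand-in for gbd.age.AGE_STANDARDIZED

df = pd.DataFrame({
    'age_group_id': [10, 10, AGE_STANDARDIZED],
    'metric_id':    [NUMBER, RATE, RATE],
})
number_rows = df[df.metric_id == NUMBER]
age_std_rows = df[df.age_group_id == AGE_STANDARDIZED]
print(len(number_rows), len(age_std_rows))  # 1 1 -- the plain rate row is skipped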
Example #5
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, cod_pattern, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           decomp_step, write_out_star_ids, cache_dir,
                           dual_upload):
    """Take a set of aggregated results and reformat them into draws consistent
    with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for
            cod (CoDCorrect) files are stored
        cod_pattern (str): file pattern for accessing CoD-or-FauxCorrect
            draws.  Example: '{measure_id}_{location_id}.h5'
        epi_dir (str): directory where the cause-level envelope for
            epi (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output
            draw files and CSV upload files
        dual_upload (bool): If True upload to column store as well
            as the gbd database.  Currently not implemented.
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warning('is when this event was logged.')

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')
    # df contains Attribute Burden, which is in Number space.
    # It is a subset of the total count for the parent metric,
    # i.e. the AB of YLLs for a cause attributable to a risk
    # (or to all known & unknown risks, i.e. rei_id == 0)

    # df is a list of data frames
    df = []
    for metric in ['burden']:
        input_file_pattern = ('FILEPATH')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(
                like='_id').columns].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {
            'location_id': location_id,
            'year_id': year_id
        },
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cod_dir=cod_dir,
        cod_pattern=cod_pattern,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df holds the cause-level envelope for this measure
    # (deaths, YLLs, YLDs, or DALYs), with no risk attribution
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    else:
        raise ValueError("Unexpected measure_id {}".format(measure_id))

    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data

    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df], sort=True)
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating ages sexes, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df,
                                draw_cols,
                                index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # df does not contain ABs any more, because they are RATES

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)

    # back_calc_pafs is part of the most detailed pipeline, reused from here.
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df], sort=True)
    logger.info("back-calculating PAFs complete, time = {}".format(
        time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "FILEPATH".format(draw_dir, location_id)
    write_sum.write_summaries(location_id,
                              year_id,
                              csv_dir,
                              df,
                              index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids,
                              dual_upload=dual_upload)

    # Save draws
    df = df.loc[(
        (df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
        (df['age_group_id'].isin(most_detailed_age_groups)) &
        (df['metric_id'].isin([gbd.metrics.NUMBER, gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('FILEPATH')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "FILEPATH".format(draw_dir, output_file_path)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("FILEPATH".format(SUCCESS_LOG_MESSAGE))
Example #6
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           write_out_star_ids,
                           cache_dir):
    """Take a set of aggregated results and reformat them into draws consistent
    with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for
            cod (CoDCorrect) files are stored
        epi_dir (str): directory where the cause-level envelope for
            epi (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output
            draw files and CSV upload files
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    df = []
    for metric in ['burden']:
        input_file_pattern = ('{root}/{metric}/'
                              '{location_id}/{measure_id}/'
                              '{measure_id}_{year_id}_{location_id}_*.h5')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir, metric=metric,
                                      location_id=location_id, year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(input_file_pattern.format(
            root=aggregated_draw_dir, metric=metric, location_id=location_id,
            year_id=year_id, measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(like='_id').columns
                           ].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {'location_id': location_id,
         'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        cod_dir=cod_dir,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df has all-cause, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    else:
        raise ValueError("Unexpected measure_id {}".format(measure_id))
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df])
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                (df['age_group_id'].isin(most_detailed_age_groups)) &
                (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating ages sexes, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)

    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df])
    logger.info("back-calculating PAFs complete, time = {}"
                .format(time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "{}/{}/upload/".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids)

    # Save draws
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                (df['age_group_id'].isin(most_detailed_age_groups)) &
                (df['metric_id'].isin([gbd.metrics.NUMBER,
                                      gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('{location_id}/'
                           '{measure_id}_{location_id}_{year_id}.h5')
    output_file_path = output_file_pattern.format(
        location_id=location_id, year_id=year_id, measure_id=measure_id)
    filename = "{}/{}".format(draw_dir, output_file_path)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename,
                       complib="zlib",
                       complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
Example #7
def risk_attr_burden_to_paf(
    risk_cause_df,
    hundred_percent_pafs_df,
    value_cols,
    demographic_cols=[
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id'
    ],
):
    """Takes a dataframe whose values represent risk-attributable burden
    and convert those to PAFs"""
    # Get the cause-level envelope
    if 'star_id' in risk_cause_df:
        burden_by_cause = risk_cause_df.query(
            'rei_id == @gbd.risk.TOTAL_ATTRIBUTABLE and '
            'star_id == @gbd.star.ANY_EVIDENCE_LEVEL')
    else:
        burden_by_cause = risk_cause_df.query(
            'rei_id == @gbd.risk.TOTAL_ATTRIBUTABLE')

    logger.info("APPLY PAFS BEGIN burden_by_cause {}".format(
        get_index_columns(burden_by_cause)))
    # Merge cause-level envelope onto data
    # Do not merge on star_id. For PAFs we don't care about the star,
    # just its proportion.
    # Left-hand side star_id will be in 1..5; the RHS will be == 6
    # (i.e. ANY_EVIDENCE_LEVEL).
    paf_df = risk_cause_df.merge(burden_by_cause,
                                 on=demographic_cols + ['cause_id'],
                                 suffixes=('', '_bbc'))
    # Divide attributable burden by cause-level envelope
    bbc_vcs = ["{}_bbc".format(col) for col in value_cols]
    paf_df[value_cols] = paf_df[value_cols].values / paf_df[bbc_vcs].values
    paf_df[value_cols] = paf_df[value_cols].fillna(0)

    # Set certain cause-risk pairs to 100% PAFs.
    # This should not happen for age-standardized PAFs.
    if hundred_percent_pafs_df.empty:
        logger.debug("No hundred-percent PAFs detected")
    else:
        hundred_percent_pafs_df['full_paf'] = 1
        paf_df = pd.merge(paf_df,
                          hundred_percent_pafs_df,
                          on=['cause_id', 'rei_id'],
                          how='left')

        set_to_one = ((paf_df['full_paf'] == 1) &
                      (paf_df['age_group_id'] != gbd.age.AGE_STANDARDIZED))
        paf_rows = paf_df.loc[set_to_one].index.tolist()
        # For all the 100% PAFs, make sure that the draws aren't all equal
        # to 0. If they are all 0, the pair is not 100% attributable.
        should_be_one_rows = paf_df.index.isin(paf_rows)
        not_actually_one_rows = (paf_df.loc[should_be_one_rows,
                                            value_cols] == 0).all(axis=1)
        paf_rows = list(
            set(paf_rows) -
            set(not_actually_one_rows[not_actually_one_rows].index))
        paf_df.loc[paf_rows, value_cols] = 1.0

    # Change metric to percent
    paf_df['metric_id'] = gbd.metrics.PERCENT
    logger.info("APPLY PAFS END")
    # Keep only the columns we need
    if 'star_id' in paf_df:
        keep_cols = demographic_cols + ['cause_id', 'rei_id', 'star_id'
                                        ] + value_cols
    else:
        keep_cols = demographic_cols + ['cause_id', 'rei_id'] + value_cols
    return paf_df[keep_cols]
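A minimal sketch (invented draw values) of the guard above: rows flagged as 100%-attributable are only forced to a PAF of 1.0 when at least one of their draws is non-zero.

import pandas as pd

value_cols = ['draw_0', 'draw_1']
paf_df = pd.DataFrame({
    'full_paf': [1, 1],
    'draw_0':   [0.0, 0.3],
    'draw_1':   [0.0, 0.6],
})
all_zero = (paf_df[value_cols] == 0).all(axis=1)
force_to_one = (paf_df['full_paf'] == 1) & ~all_zero
paf_df.loc[force_to_one, value_cols] = 1.0
print(paf_df)  # first row keeps its zeros; second row becomes 1.0 in both draws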