Example #1
def write_summaries_multi(location_id, start_year, end_year, csv_dir, df,
                          index_cols, write_out_star_ids):
    """Write multi-year summary CSVs, one file per measure, split into risk
    and etiology files when REI results are present."""
    logger.debug("Entering write_summaries_multi")
    year_dir = "DIRECTORY"  # redacted placeholder for the year sub-directory

    write_columns_order = [
        'measure_id', 'year_start_id', 'year_end_id', 'location_id', 'sex_id',
        'age_group_id', 'cause_id', 'metric_id', 'mean', 'upper', 'lower'
    ]
    if 'rei_id' in df.columns:
        # Add rei_id to write_columns_order
        cid_pos = write_columns_order.index('cause_id')
        write_columns_order.insert(cid_pos + 1, 'rei_id')
        # Merge on REI types
        rei_type_id_df = get_rei_type_id_df()
        df = pd.merge(df, rei_type_id_df, on='rei_id')

    # Strip star-rating ids unless they were requested in the output
    remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)

    for my_measure_id in (gbd.measures.DEATH, gbd.measures.DALY,
                          gbd.measures.YLD, gbd.measures.YLL):
        this_df = df[df['measure_id'] == my_measure_id]

        if not this_df.empty:
            this_out_dir = '{d}/{m}/{y}'.format(d=csv_dir,
                                                m=my_measure_id,
                                                y=year_dir)
            if 'rei_id' in df.columns:
                # Write out risks
                out_file_name = "upload_risk_{}_{}_{}.csv".format(
                    location_id, start_year, end_year)
                df_to_csv(this_df[this_df['rei_type_id'] == RISK_REI_TYPE],
                          index_cols, this_out_dir, out_file_name,
                          write_columns_order)
                # Write out etiologies
                out_file_name = "upload_eti_{}_{}_{}.csv".format(
                    location_id, start_year, end_year)
                df_to_csv(this_df[this_df['rei_type_id'] == ETI_REI_TYPE],
                          index_cols, this_out_dir, out_file_name,
                          write_columns_order)
            else:
                out_file_name = "upload_summary_{}_{}_{}.csv".format(
                    location_id, start_year, end_year)
                df_to_csv(this_df, index_cols, this_out_dir, out_file_name,
                          write_columns_order)
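
The df_to_csv helper is not shown in this example. The following is a minimal sketch of what it plausibly does, inferred from the call sites above: order the columns, sort the rows, and write a CSV into the target directory. The body here is an assumption, not the source implementation.

import os


def df_to_csv(df, index_cols, out_dir, file_name, write_columns_order):
    # Hypothetical sketch of the helper used above; the real module's
    # behavior may differ.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)  # the measure/year directory may not exist yet
    out = df.sort_values(index_cols)[write_columns_order]
    out.to_csv(os.path.join(out_dir, file_name), index=False)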
Example #2
def run_pipeline_burdenator(args):
    """
    Run the entire dalynator pipeline. Typically called from
    run_all->qsub->run_remote_pipeline->here

    Will raise ValueError if input files are not present.

    :param args:
    :return:
    """
    # Start logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START pipeline burdenator at {}".format(start_time))
    logger.info("START pipeline burdenator n_draws {}".format(args.n_draws))
    # Validate args before doing any heavy-lifting
    if not any([
            args.write_out_ylls_paf, args.write_out_ylds_paf,
            args.write_out_deaths_paf, args.write_out_dalys_paf
    ]):
        raise ValueError("must choose at least one of --ylls_paf, --ylds_paf,"
                         " --deaths_paf, or --dalys_paf ")

    # Share args across processes
    MPGlobals.args = args
    MPGlobals.logger = logger

    # Get detailed ages
    MPGlobals.most_detailed_age_groups = MetricConverter.get_detailed_ages()

    logger.info("START pipeline burdenator before data_container ")
    # Create a DataContainer, cache data to be shared across processes
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        daly_dir=args.daly_dir,
        paf_dir=args.paf_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check,
        cache_dir=args.cache_dir)

    # Fetch PAF input from RF team
    logger.info("start apply PAFs, time = {}".format(time.time()))
    yll_columns = ['paf_yll_{}'.format(x) for x in range(args.n_draws)]
    yld_columns = ['paf_yld_{}'.format(x) for x in range(args.n_draws)]
    draw_columns = ['draw_{}'.format(x) for x in range(args.n_draws)]
    pafs_filter = PAFInputFilter(yll_columns=yll_columns,
                                 yld_columns=yld_columns,
                                 draw_columns=draw_columns)
    paf_df = data_container['paf']
    pafs_filter.set_input_data_frame(paf_df)
    MPGlobals.pafs_filter = pafs_filter

    # Cache data and burdenate
    measures = []
    if args.write_out_ylls_paf:
        measures.append('yll')
        data_container['yll']  # indexing caches the draws for the pool workers
    if args.write_out_ylds_paf:
        measures.append('yld')
        data_container['yld']
    if args.write_out_deaths_paf:
        measures.append('death')
        data_container['death']

    MPGlobals.data_container = data_container
    pool_size = len(measures)
    pool = Pool(pool_size)
    results = map_and_raise(pool, burdenate_caught, measures)

    # Compute DALYs and associated summaries, if requested
    if args.write_out_dalys_paf:
        if not (args.write_out_ylls_paf and args.write_out_ylds_paf):
            raise ValueError("Can't compute risk-attributable DALYs unless "
                             "both ylls and ylds are also provided")
        measures.append('daly')
        yld_df = [i['draws'] for i in results if i['key'] == 'yld'][0]
        yll_df = [i['draws'] for i in results if i['key'] == 'yll'][0]
        daly_df = compute_dalys(yld_df[yld_df.measure_id == gbd.measures.YLD],
                                yll_df)
        results.append({'key': 'daly', 'draws': daly_df})

    # Write out meta-information for downstream aggregation step
    meta_df = pd.concat([get_dimensions(r['draws']) for r in results])
    meta_df = aggregate_dimensions(meta_df)
    meta_dict = generate_meta(meta_df)
    write_meta(args.out_dir, meta_dict)

    # Set the results as a Global, for use in summarization Pool
    MPGlobals.results = results

    # Summarize
    pool_size = len(measures)
    pool = Pool(pool_size)
    summ_dfs = map_and_raise(pool, summarize_caught, measures)

    summ_df = pd.concat(summ_dfs)
    summ_df = match_with_dimensions(summ_df, meta_df)
    summ_df.reset_index(drop=True, inplace=True)

    logger.info(
        "Risk attribution & daly computation complete, df shape {}".format(
            summ_df.shape))
    logger.info("FINAL burdenator result shape {}".format(summ_df.shape))

    # Write out the year summaries as CSV files
    rei_types = get_rei_type_id_df()
    # Drop rows with no risk attached (rei_id == 0)
    summ_df = summ_df.loc[summ_df['rei_id'] != 0]
    for measure_id in summ_df.measure_id.unique():
        for risk_type in [RISK_REI_TYPE, ETI_REI_TYPE]:

            # Get list of rei_ids of this type
            risks_of_type = rei_types[rei_types.rei_type_id == risk_type]
            # unique() stays list-like even when a single rei matches;
            # squeeze() would collapse that case to a scalar and break isin()
            risks_of_type = risks_of_type.rei_id.unique()

            # Compute filename
            summ_fn = get_summ_filename(args.out_dir, risk_type,
                                        args.location_id, args.year_id,
                                        measure_id)
            logger.info("Writing {}".format(summ_fn))

            # Write appropriate subset to file
            write_csv(
                summ_df[((summ_df.measure_id == measure_id) &
                         (summ_df.rei_id.isin(risks_of_type)))], summ_fn)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return summ_df.shape
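
map_and_raise and the *_caught wrappers are not shown. Below is a plausible minimal sketch of the pattern, assuming each wrapper returns either its result or the exception it caught so the parent process can re-raise the first failure. burdenate here is an assumed per-measure worker, not code from the source.

def burdenate_caught(measure):
    # Hypothetical wrapper: catch any worker error so pool.map completes
    # and the parent can surface it afterwards.
    try:
        return burdenate(measure)  # assumed per-measure worker function
    except Exception as e:
        return e


def map_and_raise(pool, func, work_items):
    # Hypothetical sketch: run func over work_items in the pool, then
    # re-raise the first exception a *_caught wrapper returned.
    results = pool.map(func, work_items)
    pool.close()
    pool.join()
    for res in results:
        if isinstance(res, Exception):
            raise res
    return results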
Example #3
def write_summaries(location_id,
                    year_id,
                    csv_dir,
                    df,
                    index_cols,
                    do_risk_aggr=False,
                    write_out_star_ids=False):
    """Write single-year summary CSVs, one file per measure, split into risk
    and etiology files when risk-aggregated results are present."""
    if do_risk_aggr:
        write_columns_order = [
            'measure_id', 'year_id', 'location_id', 'sex_id', 'age_group_id',
            'cause_id', 'rei_id', 'metric_id', 'mean', 'upper', 'lower'
        ]
    else:
        write_columns_order = [
            'measure_id', 'year_id', 'location_id', 'sex_id', 'age_group_id',
            'cause_id', 'metric_id', 'mean', 'upper', 'lower'
        ]

    write_columns_order = remove_unwanted_star_id_column(
        write_columns_order, write_out_star_ids)

    logger.debug("Entering write summaries")
    if do_risk_aggr:
        rei_type_id_df = get_rei_type_id_df()
        df = pd.merge(df, rei_type_id_df, on='rei_id')

        tmp_df = df
        measure_ids = [
            gbd.measures.DEATH, gbd.measures.YLL, gbd.measures.YLD,
            gbd.measures.DALY
        ]
        for measure_id in measure_ids:
            this_df = tmp_df[tmp_df['measure_id'] == measure_id]

            if not this_df.empty:
                logger.debug("rei non-zero {}".format(measure_id))
                this_out_dir = "{}/{}/single_year/".format(csv_dir, measure_id)
                logger.debug("this_out_dir={}".format(this_out_dir))

                out_file_name = ("upload_risk_" + str(location_id) + "_" +
                                 str(year_id) + ".csv")
                df_to_csv(this_df[this_df['rei_type_id'] == RISK_REI_TYPE],
                          index_cols, this_out_dir, out_file_name,
                          write_columns_order)

                out_file_name = ("upload_eti_" + str(location_id) + "_" +
                                 str(year_id) + ".csv")
                df_to_csv(this_df[this_df['rei_type_id'] == ETI_REI_TYPE],
                          index_cols, this_out_dir, out_file_name,
                          write_columns_order)
    else:
        tmp_df = df

        # Write out mortality measures (deaths and YLLs)
        mortality_measure_ids = [gbd.measures.DEATH, gbd.measures.YLL]
        for measure_id in mortality_measure_ids:
            this_df = tmp_df[tmp_df['measure_id'] == measure_id]
            if not this_df.empty:
                logger.debug("rei_id is 0, measures 1 & 4, " +
                             "measure == {}".format(measure_id))
                this_out_dir = "{}/{}/single_year/".format(csv_dir, measure_id)
                out_file_name = ("upload_risk_" + str(location_id) + "_" +
                                 str(year_id) + ".csv")
                df_to_csv(this_df, index_cols, this_out_dir, out_file_name,
                          write_columns_order)

        # Save DALY results
        for measure_id in [gbd.measures.DALY]:
            this_df = tmp_df[tmp_df['measure_id'] == measure_id]
            if not this_df.empty:
                logger.debug("measure 2, measure is {}".format(measure_id))
                this_out_dir = "{}/{}/single_year/".format(csv_dir, measure_id)
                logger.debug("this_out_dir={}".format(this_out_dir))
                out_file_name = ("upload_summary_" + str(location_id) + "_" +
                                 str(year_id) + ".csv")
                df_to_csv(this_df, index_cols, this_out_dir, out_file_name,
                          write_columns_order)
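
A hypothetical call showing how the two branches are selected. The argument values below are placeholders, not values from the source, and summary_df stands in for a DataFrame of summarized results.

# With do_risk_aggr=True, each non-empty measure produces an
# upload_risk_* and an upload_eti_* file under
# <csv_dir>/<measure_id>/single_year/.
write_summaries(location_id=101, year_id=2010, csv_dir="/tmp/summaries",
                df=summary_df, index_cols=['location_id', 'year_id'],
                do_risk_aggr=True, write_out_star_ids=False)

# With do_risk_aggr=False, mortality measures go to upload_risk_* files
# and DALYs to an upload_summary_* file.
write_summaries(location_id=101, year_id=2010, csv_dir="/tmp/summaries",
                df=summary_df, index_cols=['location_id', 'year_id'],
                do_risk_aggr=False)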