def read_all_cause(self, year_id):
    '''Pull the all-cause envelope for the given year.

    Only called when running the burdenator: the envelope is needed to
    back-calculate PAFs (it is the denominator) and to generate
    multi-year (percent-change) estimates in percent space.
    '''
    cause_data_container = DataContainer(
        {'location_id': self.args.location_id,
         'year_id': year_id},
        gbd_round_id=self.args.gbd_round_id,
        decomp_step=self.args.decomp_step,
        cache_dir=self.cache_dir,
        n_draws=self.args.n_draws,
        cod_dir=self.args.cod_dir,
        cod_pattern=self.args.cod_pattern,
        epi_dir=self.args.epi_dir)
    if self.args.measure_id == gbd.measures.DEATH:
        cause_df = cause_data_container['death']
    elif self.args.measure_id == gbd.measures.YLL:
        cause_df = cause_data_container['yll']
    elif self.args.measure_id == gbd.measures.YLD:
        cause_df = cause_data_container['yld']
    elif self.args.measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = cause_data_container['yll']
        yld_df = cause_data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_df = daly_computer.get_data_frame()
    cause_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL
    return cause_df.loc[cause_df['metric_id'] == gbd.metrics.NUMBER]
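For context, ComputeDalys combines the two envelopes: a DALY draw is the matching YLL draw plus the matching YLD draw. A minimal pandas sketch of that combination, reusing the gbd constants from the surrounding examples (the real ComputeDalys class may differ in detail):

import pandas as pd

def combine_ylls_and_ylds(yll_df, yld_df, draw_cols, index_cols):
    """Illustrative only: sum matching YLL and YLD draws into DALY draws."""
    # measure_id differs between the two frames, so drop it from the grouping
    group_cols = [c for c in index_cols if c != 'measure_id']
    combined = pd.concat([yll_df, yld_df], sort=True)
    dalys = combined.groupby(group_cols, as_index=False)[draw_cols].sum()
    dalys['measure_id'] = gbd.measures.DALY  # relabel the summed rows
    return dalys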
Example #2
    def instantiate_data_containers(self, cache_dir=None):
        if not cache_dir:
            self.cache_dir = '{}/cache'.format(self.args.out_dir)
        else:
            self.cache_dir = cache_dir

        self.data_container_start = DataContainer(
            location_id=self.args.location_id,
            year_id=self.args.start_year,
            gbd_round_id=self.args.gbd_round_id,
            cache_dir=self.cache_dir,
            n_draws=self.args.n_draws)
        self.data_container_end = DataContainer(
            location_id=self.args.location_id,
            year_id=self.args.end_year,
            gbd_round_id=self.args.gbd_round_id,
            cache_dir=self.cache_dir,
            n_draws=self.args.n_draws)
Example #3
    def instantiate_data_containers(self, cache_dir=None):
        if not cache_dir:
            self.cache_dir = 'FILEPATH'.format(self.args.out_dir)
        else:
            self.cache_dir = cache_dir

        self.data_container_start = DataContainer(
            {'location_id': self.args.location_id,
             'year_id': self.args.start_year},
            gbd_round_id=self.args.gbd_round_id,
            decomp_step=self.args.decomp_step,
            cache_dir=self.cache_dir,
            n_draws=self.args.n_draws)
        self.data_container_end = DataContainer(
            {'location_id': self.args.location_id,
             'year_id': self.args.end_year},
            gbd_round_id=self.args.gbd_round_id,
            decomp_step=self.args.decomp_step,
            cache_dir=self.cache_dir,
            n_draws=self.args.n_draws)
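The paired start-year and end-year containers feed the multi-year (percent-change) step mentioned in the first example. A minimal sketch of that computation, assuming both frames share the same index and draw columns (pct_change_draws is a hypothetical helper, not part of the source):

import pandas as pd

def pct_change_draws(start_df, end_df, index_cols, draw_cols):
    """Illustrative only: per-draw percent change between two years."""
    # year_id differs between the two frames, so it cannot be a merge key
    merge_cols = [c for c in index_cols if c != 'year_id']
    merged = pd.merge(start_df, end_df, on=merge_cols,
                      suffixes=('_start', '_end'))
    for d in draw_cols:
        merged[d] = ((merged[d + '_end'] - merged[d + '_start'])
                     / merged[d + '_start'])
    return merged[merge_cols + draw_cols]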
Example #4
    def __init__(self, location_set_id, year_id, rei_id, sex_id, measure_id,
                 gbd_round_id, n_draws, data_root, region_locs,
                 write_out_star_ids):
        self.location_set_id = location_set_id
        self.year_id = year_id
        self.rei_id = rei_id
        self.sex_id = sex_id
        self.measure_id = measure_id
        self.gbd_round_id = gbd_round_id
        self.n_draws = n_draws
        self.data_root = data_root
        self.region_locs = region_locs
        self.data_container = DataContainer(
            {'location_set_id': self.location_set_id,
             'year_id': self.year_id,
             'sex_id': self.sex_id},
            n_draws=self.n_draws, gbd_round_id=self.gbd_round_id,
            cache_dir=os.path.join(self.data_root, 'cache'))
        self.loctree = self.data_container[
            'location_hierarchy_{}'.format(self.location_set_id)]

        self.in_dir = os.path.join(self.data_root, 'draws')
        self.out_dir = os.path.join(self.data_root, 'loc_agg_draws/burden')
        mkds.makedirs_safely(self.out_dir)
        self.write_out_star_ids = write_out_star_ids

        # Remove old aggregates in case jobs failed in the middle
        aggregates = [n.id for n in self.loctree.nodes
                      if n not in self.loctree.leaves()]
        for loc in aggregates:
            filename = ('{o}/{loc}/{m}/{m}_{y}_{loc}_{r}_{s}.h5'
                        .format(o=self.out_dir, m=self.measure_id,
                                y=self.year_id, loc=loc, r=self.rei_id,
                                s=self.sex_id))
            logger.debug("Deleting potentially pre-existing loc-agg file"
                         "{e}: '{f}'".format(e=os.path.exists(filename),
                                             f=filename))
            with contextlib.suppress(FileNotFoundError):
                os.remove(filename)

        self.index_cols = ['measure_id', 'metric_id', 'sex_id', 'cause_id',
                           'rei_id', 'year_id', 'age_group_id']
        self.value_cols = ['draw_{}'.format(i) for i in range(self.n_draws)]
        self.draw_filters = {'metric_id': gbd.metrics.NUMBER,
                             'rei_id': self.rei_id,
                             'sex_id': self.sex_id,
                             'measure_id': self.measure_id,
                             'year_id': self.year_id}

        self.operator = self.get_operator()
        self.draw_source, self.draw_sink = self.get_draw_source_sink()
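The draw_filters dict defined above describes the single slice of draws this job operates on. A minimal sketch of applying such a filter to a pandas frame (apply_draw_filters is a hypothetical helper, not part of the source):

import pandas as pd

def apply_draw_filters(df, draw_filters):
    """Illustrative only: keep rows matching every id filter exactly."""
    mask = pd.Series(True, index=df.index)
    for col, value in draw_filters.items():
        mask &= (df[col] == value)
    return df.loc[mask]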
Example #5
def run_pipeline_burdenator(args):
    """
    Run the entire dalynator pipeline. Typically called from
    run_all->qsub->run_remote_pipeline->here

    Will raise ValueError if input files are not present.

    :param args:
    :return:
    """
    # Start logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START pipeline burdenator at {}".format(start_time))
    logger.info("START pipeline burdenator n_draws {}".format(args.n_draws))
    # Validate args before doing any heavy-lifting
    if not any([
            args.write_out_ylls_paf, args.write_out_ylds_paf,
            args.write_out_deaths_paf, args.write_out_dalys_paf
    ]):
        raise ValueError("must choose at least one of --ylls_paf, --ylds_paf,"
                         " --deaths_paf, or --dalys_paf ")

    # Share args across processes
    MPGlobals.args = args
    MPGlobals.logger = logger

    # Get detailed ages
    MPGlobals.most_detailed_age_groups = MetricConverter.get_detailed_ages()

    logger.info("START pipeline burdenator before data_container ")
    # Create a DataContainer, cache data to be shared across processes
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        daly_dir=args.daly_dir,
        paf_dir=args.paf_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check,
        cache_dir=args.cache_dir)

    # Fetch PAF input from RF team
    logger.info("start apply PAFs, time = {}".format(time.time()))
    yll_columns = ['paf_yll_{}'.format(x) for x in range(args.n_draws)]
    yld_columns = ['paf_yld_{}'.format(x) for x in range(args.n_draws)]
    draw_columns = ['draw_{}'.format(x) for x in range(args.n_draws)]
    pafs_filter = PAFInputFilter(yll_columns=yll_columns,
                                 yld_columns=yld_columns,
                                 draw_columns=draw_columns)
    paf_df = data_container['paf']
    pafs_filter.set_input_data_frame(paf_df)
    MPGlobals.pafs_filter = pafs_filter

    # Cache data and burdenate
    measures = []
    if args.write_out_ylls_paf:
        measures.append('yll')
        data_container['yll']
    if args.write_out_ylds_paf:
        measures.append('yld')
        data_container['yld']
    if args.write_out_deaths_paf:
        measures.append('death')
        data_container['death']

    MPGlobals.data_container = data_container
    pool_size = len(measures)
    pool = Pool(pool_size)
    results = map_and_raise(pool, burdenate_caught, measures)

    # Compute DALYs and associated summaries, if requested
    if args.write_out_dalys_paf:
        if not (args.write_out_ylls_paf and args.write_out_ylds_paf):
            raise ValueError("Can't compute risk-attributable DALYs unless "
                             "both ylls and ylds are also provided")
        measures.append('daly')
        yld_df = [i['draws'] for i in results if i['key'] == 'yld'][0]
        yll_df = [i['draws'] for i in results if i['key'] == 'yll'][0]
        daly_df = compute_dalys(yld_df[yld_df.measure_id == gbd.measures.YLD],
                                yll_df)
        results.append({'key': 'daly', 'draws': daly_df})

    # Write out meta-information for downstream aggregation step
    meta_df = pd.concat([get_dimensions(r['draws']) for r in results])
    meta_df = aggregate_dimensions(meta_df)
    meta_dict = generate_meta(meta_df)
    write_meta(args.out_dir, meta_dict)

    # Set the results as a Global, for use in summarization Pool
    MPGlobals.results = results

    # Summarize
    pool_size = len(measures)
    pool = Pool(pool_size)
    summ_df = map_and_raise(pool, summarize_caught, measures)

    summ_df = pd.concat(summ_df)
    summ_df = match_with_dimensions(summ_df, meta_df)
    summ_df.reset_index(drop=True, inplace=True)

    logger.info(
        "Risk attribution & daly computation complete, df shape {}".format(
            (summ_df.shape)))

    logger.info("  FINAL burdenator result shape {}".format(summ_df.shape))

    # Write out the year summaries as CSV files
    rei_types = get_rei_type_id_df()
    summ_df = summ_df.loc[summ_df['rei_id'] != 0]
    for measure_id in summ_df.measure_id.unique():
        for risk_type in [RISK_REI_TYPE, ETI_REI_TYPE]:

            # Get list of rei_ids of this type
            risks_of_type = rei_types[rei_types.rei_type_id == risk_type]
            risks_of_type = risks_of_type.rei_id.squeeze()

            # Compute filename
            summ_fn = get_summ_filename(args.out_dir, risk_type,
                                        args.location_id, args.year_id,
                                        measure_id)
            logger.info("Writing {}".format(summ_fn))

            # Write appropriate subset to file
            write_csv(
                summ_df[((summ_df.measure_id == measure_id) &
                         (summ_df.rei_id.isin(risks_of_type)))], summ_fn)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return summ_df.shape
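The per-measure work above is fanned out with map_and_raise over a multiprocessing Pool. Its implementation is not shown in this excerpt; a minimal sketch of such a helper, assuming each *_caught worker returns either a result or the exception it caught:

def map_and_raise(pool, func, work_items):
    """Illustrative only: map func over the pool, re-raising worker errors."""
    results = pool.map(func, work_items)
    pool.close()
    pool.join()
    for result in results:
        if isinstance(result, BaseException):
            raise result
    return results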
Example #6
    def _compute_most_detailed_df(self):
        """
        Computations only, does not write files. Makes testing easier.
        """

        start_time = time.time()
        logger.info("START location-year pipeline at {}".format(start_time))

        # Create a DataContainer
        data_container = DataContainer(
            {
                'location_id': self.location_id,
                'year_id': self.year_id
            },
            n_draws=self.n_draws,
            gbd_round_id=self.gbd_round_id,
            epi_dir=self.epi_dir,
            cod_dir=self.cod_dir,
            cache_dir=self.cache_dir,
            turn_off_null_and_nan_check=self.turn_off_null_and_nan_check)
        yll_df = data_container['yll']
        yld_df = data_container['yld']

        # Compute DALYs
        draw_cols = list(yll_df.filter(like='draw').columns)
        index_cols = list(set(yll_df.columns) - set(draw_cols))
        computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        df = computer.get_data_frame()

        logger.info("DALY computation complete, df shape {}".format(
            (df.shape)))
        logger.info(" input DF age_group_id {}".format(
            df['age_group_id'].unique()))

        draw_cols = list(df.filter(like='draw').columns)
        index_cols = list(set(df.columns) - set(draw_cols))
        existing_age_groups = df['age_group_id'].unique()

        logger.info("Preparing for sex aggregation")

        # Do sex aggregation
        my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
        df = my_sex_aggr.get_data_frame()
        logger.info("Sex aggregation complete")

        # Do age aggregation
        my_age_aggr = AgeAggregator(df,
                                    draw_cols,
                                    index_cols,
                                    data_container=data_container)
        df = my_age_aggr.get_data_frame()
        logger.info("Age aggregation complete")

        # Convert to rate and % space
        df = MetricConverter(df,
                             to_rate=True,
                             to_percent=True,
                             data_container=data_container).get_data_frame()

        logger.debug("new  DF age_group_id {}".format(
            df['age_group_id'].unique()))
        logger.info("  FINAL dalynator result shape {}".format(df.shape))
        end_time = time.time()
        elapsed = end_time - start_time
        logger.info(
            "DONE location-year pipeline at {}, elapsed seconds= {}".format(
                end_time, elapsed))
        logger.info("{}".format(SUCCESS_LOG_MESSAGE))

        return df, existing_age_groups
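MetricConverter moves NUMBER-space draws into rate (and optionally percent) space. A minimal sketch of the rate conversion, dividing each draw by population; it assumes a pop_df with a 'population' column and a gbd.metrics.RATE constant alongside the NUMBER and PERCENT constants used above (the real class pulls population from the data_container):

import pandas as pd

def to_rate_space(number_df, pop_df, draw_cols,
                  demo_cols=('location_id', 'year_id',
                             'age_group_id', 'sex_id')):
    """Illustrative only: rate = number / population for every draw."""
    rates = pd.merge(number_df, pop_df, on=list(demo_cols), how='left')
    rates[draw_cols] = rates[draw_cols].div(rates['population'], axis=0)
    rates['metric_id'] = gbd.metrics.RATE  # mark the converted rows
    return rates.drop(columns='population')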
Example #7
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, cod_pattern, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           decomp_step, write_out_star_ids, cache_dir,
                           dual_upload):
    """Take a set of aggregated results and reformat them into draws consistent
    with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for
            cod (CoDCorrect) files are stored
        cod_pattern (str): file pattern for accessing CoD-or-FauxCorrect
            draws.  Example: '{measure_id}_{location_id}.h5'
        epi_dir (str): directory where the cause-level envelope for
            epi (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output
            draw files and CSV upload files
        dual_upload (bool): If True upload to column store as well
            as the gbd database.  Currently not implemented.
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warning('is when this event was logged.')

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')
    # df contains Attributable Burden (AB), which is in NUMBER space.
    # It is a subset of the total count for the parent measure,
    # i.e. AB of YLLs for a cause attributable to a risk
    # (or to all known & unknown risks, i.e. rei_id == 0)

    # df is a list of data frames
    df = []
    for metric in ['burden']:
        input_file_pattern = ('FILEPATH')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(
                like='_id').columns].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {
            'location_id': location_id,
            'year_id': year_id
        },
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cod_dir=cod_dir,
        cod_pattern=cod_pattern,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df holds the all-cause envelope for this measure,
    # with no risk attribution
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()

    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data

    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df], sort=True)
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating ages sexes, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df,
                                draw_cols,
                                index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # df does not contain AB's any more, because they are RATES

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)

    # back_calc_pafs is part of the most detailed pipeline, reused from here.
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df], sort=True)
    logger.info("back-calculating PAFs complete, time = {}".format(
        time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "FILEPATH".format(draw_dir, location_id)
    write_sum.write_summaries(location_id,
                              year_id,
                              csv_dir,
                              df,
                              index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids,
                              dual_upload=dual_upload)

    # Save draws
    df = df.loc[(
        (df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
        (df['age_group_id'].isin(most_detailed_age_groups)) &
        (df['metric_id'].isin([gbd.metrics.NUMBER, gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('FILEPATH')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "FILEPATH".format(draw_dir, output_file_path)
    remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("FILEPATH".format(SUCCESS_LOG_MESSAGE))
Example #8
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           write_out_star_ids,
                           cache_dir):
    """Take a set of aggregated results and reformat them into draws consistent
    with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for
            cod (CoDCorrect) files are stored
        epi_dir (str): directory where the cause-level envelope for
            epi (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output
            draw files and CSV upload files
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    df = []
    for metric in ['burden']:
        input_file_pattern = ('{root}/{metric}/'
                              '{location_id}/{measure_id}/'
                              '{measure_id}_{year_id}_{location_id}_*.h5')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir, metric=metric,
                                      location_id=location_id, year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(input_file_pattern.format(
            root=aggregated_draw_dir, metric=metric, location_id=location_id,
            year_id=year_id, measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(like='_id').columns
                           ].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {'location_id': location_id,
         'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        cod_dir=cod_dir,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df has all-cause, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df])
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                (df['age_group_id'].isin(most_detailed_age_groups)) &
                (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating ages sexes, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)

    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df])
    logger.info("back-calculating PAFs complete, time = {}"
                .format(time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "{}/{}/upload/".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids)

    # Save draws
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                (df['age_group_id'].isin(most_detailed_age_groups)) &
                (df['metric_id'].isin([gbd.metrics.NUMBER,
                                      gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('{location_id}/'
                           '{measure_id}_{location_id}_{year_id}.h5')
    output_file_path = output_file_pattern.format(
        location_id=location_id, year_id=year_id, measure_id=measure_id)
    filename = "{}/{}".format(draw_dir, output_file_path)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename,
                       complib="zlib",
                       complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
Example #9
def run_pipeline(args):
    """
    Run the entire dalynator pipeline. Typically called from
    run_all -> qsub -> run_remote_pipeline -> here.

    Will raise ValueError if input files are not present.

    TBD Refactor as a ComputationElement followed by a DataSink at the end.

    :param args: parsed command-line arguments
    :return: shape of the final data frame
    """

    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START location-year pipeline at {}".format(start_time))

    # Create a DataContainer
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        cache_dir=args.cache_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check)
    yll_df = data_container['yll']
    yld_df = data_container['yld']

    # Compute DALYs
    draw_cols = list(yll_df.filter(like='draw').columns)
    index_cols = list(set(yll_df.columns) - set(draw_cols))
    computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
    df = computer.get_data_frame()

    logger.info("DALY computation complete, df shape {}".format((df.shape)))
    logger.info(" input DF age_group_id {}".format(df['age_group_id'].unique()))

    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    existing_age_groups = df['age_group_id'].unique()

    logger.info("Preparing for sex aggregation")

    # Do sex aggregation
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("Sex aggregation complete")

    # Do age aggregation
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("Age aggregation complete")

    # Convert to rate and % space
    df = MetricConverter(df, to_rate=True, to_percent=True,
                         data_container=data_container).get_data_frame()

    logger.debug("new  DF age_group_id {}".format(df['age_group_id'].unique()))
    logger.info("  FINAL dalynator result shape {}".format(df.shape))

    # Calculate and write out the year summaries as CSV files
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))

    csv_dir = args.out_dir + '/upload/'
    write_sum.write_summaries(args.location_id, args.year_id, csv_dir, df,
                              index_cols, False, args.gbd_round_id)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= {}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    # Adding any index-like column to the HDF index for later random access
    filename = get_input_args.calculate_output_filename(
        args.out_dir, gbd.measures.DALY, args.location_id, args.year_id)
    if args.no_sex_aggr:
        df = df[df['sex_id'] != gbd.sex.BOTH]

    if args.no_age_aggr:
        df = df[df['age_group_id'].isin(existing_age_groups)]

    sink = HDFDataSink(filename,
                       data_columns=[col for col in df if col.endswith("_id")],
                       complib="zlib", complevel=1)
    sink.write(df)
    logger.info("DONE write DF {}".format(time.time()))

    return df.shape
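The data_columns passed to HDFDataSink make every *_id column queryable on disk, which is what "later random access" refers to in the comment above. A minimal sketch of reading a slice back, assuming the sink writes a PyTables 'table'-format file (the path is hypothetical; the real one comes from calculate_output_filename):

import pandas as pd

# Hypothetical output path; example filter values in the where clause
draws = pd.read_hdf('/path/to/output.h5',
                    where='cause_id == 294 & metric_id == 1')
print(draws.shape)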