def read_all_cause(self, year_id):
    """Pull the all-cause envelope for the given year.

    Only called when running the burdenator, where the envelope is needed to
    back-calculate PAFs (it is used as the denominator) and to generate
    multi-year (percent-change) estimates in percent space.
    """
    cause_data_container = DataContainer(
        {'location_id': self.args.location_id, 'year_id': year_id},
        gbd_round_id=self.args.gbd_round_id,
        decomp_step=self.args.decomp_step,
        cache_dir=self.cache_dir,
        n_draws=self.args.n_draws,
        cod_dir=self.args.cod_dir,
        cod_pattern=self.args.cod_pattern,
        epi_dir=self.args.epi_dir)
    if self.args.measure_id == gbd.measures.DEATH:
        cause_df = cause_data_container['death']
    elif self.args.measure_id == gbd.measures.YLL:
        cause_df = cause_data_container['yll']
    elif self.args.measure_id == gbd.measures.YLD:
        cause_df = cause_data_container['yld']
    elif self.args.measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = cause_data_container['yll']
        yld_df = cause_data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_df = daly_ce.get_data_frame()
    cause_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL
    return cause_df.loc[cause_df['metric_id'] == gbd.metrics.NUMBER]
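
# Illustrative sketch, not called by the pipeline: the DALY branch above sums
# YLL and YLD draws over shared id columns before the envelope is returned.
# ComputeDalys is the class actually used; the toy helper below is only a
# conceptual stand-in. The 'draw_*' column naming and daly_measure_id=2 are
# assumptions, not read from this file.
def _sketch_sum_ylls_and_ylds(yll_df, yld_df, index_cols, draw_cols,
                              daly_measure_id=2):
    """Toy stand-in for ComputeDalys: DALY draws = YLL draws + YLD draws."""
    import pandas as pd
    # measure_id differs between the two inputs, so group on everything else
    group_cols = [c for c in index_cols if c != 'measure_id']
    combined = pd.concat([yll_df, yld_df], sort=True)
    daly_df = combined.groupby(group_cols, as_index=False)[draw_cols].sum()
    daly_df['measure_id'] = daly_measure_id
    return daly_df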
def instantiate_data_containers(self, cache_dir=None):
    if not cache_dir:
        self.cache_dir = '{}/cache'.format(self.args.out_dir)
    else:
        self.cache_dir = cache_dir
    self.data_container_start = DataContainer(
        location_id=self.args.location_id,
        year_id=self.args.start_year,
        gbd_round_id=self.args.gbd_round_id,
        cache_dir=self.cache_dir,
        n_draws=self.args.n_draws)
    self.data_container_end = DataContainer(
        location_id=self.args.location_id,
        year_id=self.args.end_year,
        gbd_round_id=self.args.gbd_round_id,
        cache_dir=self.cache_dir,
        n_draws=self.args.n_draws)
def instantiate_data_containers(self, cache_dir=None):
    if not cache_dir:
        self.cache_dir = 'FILEPATH'.format(self.args.out_dir)
    else:
        self.cache_dir = cache_dir
    self.data_container_start = DataContainer(
        {'location_id': self.args.location_id,
         'year_id': self.args.start_year},
        gbd_round_id=self.args.gbd_round_id,
        decomp_step=self.args.decomp_step,
        cache_dir=self.cache_dir,
        n_draws=self.args.n_draws)
    self.data_container_end = DataContainer(
        {'location_id': self.args.location_id,
         'year_id': self.args.end_year},
        gbd_round_id=self.args.gbd_round_id,
        decomp_step=self.args.decomp_step,
        cache_dir=self.cache_dir,
        n_draws=self.args.n_draws)
def __init__(self, location_set_id, year_id, rei_id, sex_id, measure_id,
             gbd_round_id, n_draws, data_root, region_locs,
             write_out_star_ids):
    self.location_set_id = location_set_id
    self.year_id = year_id
    self.rei_id = rei_id
    self.sex_id = sex_id
    self.measure_id = measure_id
    self.gbd_round_id = gbd_round_id
    self.n_draws = n_draws
    self.data_root = data_root
    self.region_locs = region_locs
    self.data_container = DataContainer(
        {'location_set_id': self.location_set_id,
         'year_id': self.year_id,
         'sex_id': self.sex_id},
        n_draws=self.n_draws,
        gbd_round_id=self.gbd_round_id,
        cache_dir=os.path.join(self.data_root, 'cache'))
    self.loctree = self.data_container[
        'location_hierarchy_{}'.format(self.location_set_id)]
    self.in_dir = os.path.join(self.data_root, 'draws')
    self.out_dir = os.path.join(self.data_root, 'loc_agg_draws/burden')
    mkds.makedirs_safely(self.out_dir)
    self.write_out_star_ids = write_out_star_ids

    # Remove old aggregates in case jobs failed in the middle
    aggregates = [n.id for n in self.loctree.nodes
                  if n not in self.loctree.leaves()]
    for loc in aggregates:
        filename = ('{o}/{lo}/{me}/{m}_{y}_{loc}_{r}_{s}.h5'
                    .format(o=self.out_dir, lo=loc, me=self.measure_id,
                            m=self.measure_id, y=self.year_id, loc=loc,
                            r=self.rei_id, s=self.sex_id))
        logger.debug("Deleting potentially pre-existing loc-agg file "
                     "{e}: '{f}'".format(e=os.path.exists(filename),
                                         f=filename))
        with contextlib.suppress(FileNotFoundError):
            os.remove(filename)

    self.index_cols = ['measure_id', 'metric_id', 'sex_id', 'cause_id',
                       'rei_id', 'year_id', 'age_group_id']
    self.value_cols = ['draw_{}'.format(i) for i in range(self.n_draws)]
    self.draw_filters = {'metric_id': gbd.metrics.NUMBER,
                         'rei_id': self.rei_id,
                         'sex_id': self.sex_id,
                         'measure_id': self.measure_id,
                         'year_id': self.year_id}
    self.operator = self.get_operator()
    self.draw_source, self.draw_sink = self.get_draw_source_sink()
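
# Illustrative sketch, not used by the class above: the constructor deletes
# any stale aggregate files laid out as
# {out_dir}/{location}/{measure}/{measure}_{year}_{location}_{rei}_{sex}.h5.
# The hedged helper below reconstructs just that path-building-and-removal
# step with all arguments supplied explicitly; the layout is taken from the
# format string above, nothing else is assumed.
def _sketch_remove_stale_aggregate(out_dir, location_id, measure_id, year_id,
                                   rei_id, sex_id):
    import contextlib
    import os
    filename = os.path.join(
        out_dir, str(location_id), str(measure_id),
        '{m}_{y}_{loc}_{r}_{s}.h5'.format(m=measure_id, y=year_id,
                                          loc=location_id, r=rei_id,
                                          s=sex_id))
    # A missing file is fine; the goal is only to avoid mixing old and new
    # aggregates after a partial failure.
    with contextlib.suppress(FileNotFoundError):
        os.remove(filename)
    return filename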
def run_pipeline_burdenator(args):
    """Run the entire burdenator pipeline for one location-year.

    Typically called from run_all -> qsub -> run_remote_pipeline -> here.

    Will raise ValueError if input files are not present.

    :param args: the parsed command-line arguments
    :return: the shape of the final summary DataFrame
    """
    # Start logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START pipeline burdenator at {}".format(start_time))
    logger.info("START pipeline burdenator n_draws {}".format(args.n_draws))

    # Validate args before doing any heavy lifting
    if not any([args.write_out_ylls_paf, args.write_out_ylds_paf,
                args.write_out_deaths_paf, args.write_out_dalys_paf]):
        raise ValueError("must choose at least one of --ylls_paf, "
                         "--ylds_paf, --deaths_paf, or --dalys_paf")

    # Share args across processes
    MPGlobals.args = args
    MPGlobals.logger = logger

    # Get detailed ages
    MPGlobals.most_detailed_age_groups = MetricConverter.get_detailed_ages()
    logger.info("START pipeline burdenator before data_container")

    # Create a DataContainer, caching data to be shared across processes
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        daly_dir=args.daly_dir,
        paf_dir=args.paf_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check,
        cache_dir=args.cache_dir)

    # Fetch PAF input from the RF team
    logger.info("start apply PAFs, time = {}".format(time.time()))
    yll_columns = ['paf_yll_{}'.format(x) for x in range(args.n_draws)]
    yld_columns = ['paf_yld_{}'.format(x) for x in range(args.n_draws)]
    draw_columns = ['draw_{}'.format(x) for x in range(args.n_draws)]
    pafs_filter = PAFInputFilter(yll_columns=yll_columns,
                                 yld_columns=yld_columns,
                                 draw_columns=draw_columns)
    paf_df = data_container['paf']
    pafs_filter.set_input_data_frame(paf_df)
    MPGlobals.pafs_filter = pafs_filter

    # Cache data and burdenate
    measures = []
    if args.write_out_ylls_paf:
        measures.append('yll')
        data_container['yll']
    if args.write_out_ylds_paf:
        measures.append('yld')
        data_container['yld']
    if args.write_out_deaths_paf:
        measures.append('death')
        data_container['death']
    MPGlobals.data_container = data_container

    pool_size = len(measures)
    pool = Pool(pool_size)
    results = map_and_raise(pool, burdenate_caught, measures)

    # Compute DALYs and associated summaries, if requested
    if args.write_out_dalys_paf:
        if not (args.write_out_ylls_paf and args.write_out_ylds_paf):
            raise ValueError("Can't compute risk-attributable DALYs unless "
                             "both ylls and ylds are also provided")
        measures.append('daly')
        yld_df = [i['draws'] for i in results if i['key'] == 'yld'][0]
        yll_df = [i['draws'] for i in results if i['key'] == 'yll'][0]
        daly_df = compute_dalys(
            yld_df[yld_df.measure_id == gbd.measures.YLD], yll_df)
        results.append({'key': 'daly', 'draws': daly_df})

    # Write out meta-information for the downstream aggregation step
    meta_df = pd.concat([get_dimensions(r['draws']) for r in results])
    meta_df = aggregate_dimensions(meta_df)
    meta_dict = generate_meta(meta_df)
    write_meta(args.out_dir, meta_dict)

    # Set the results as a global, for use in the summarization Pool
    MPGlobals.results = results

    # Summarize
    pool_size = len(measures)
    pool = Pool(pool_size)
    summ_df = map_and_raise(pool, summarize_caught, measures)
    summ_df = pd.concat(summ_df)
    summ_df = match_with_dimensions(summ_df, meta_df)
    summ_df.reset_index(drop=True, inplace=True)

    logger.info("Risk attribution & daly computation complete, "
                "df shape {}".format(summ_df.shape))
    logger.info(" FINAL burdenator result shape {}".format(summ_df.shape))

    # Write out the year summaries as CSV files
    rei_types = get_rei_type_id_df()
    summ_df = summ_df.loc[summ_df['rei_id'] != 0]
    for measure_id in summ_df.measure_id.unique():
        for risk_type in [RISK_REI_TYPE, ETI_REI_TYPE]:
            # Get the list of rei_ids of this type
            risks_of_type = rei_types[rei_types.rei_type_id == risk_type]
            risks_of_type = risks_of_type.rei_id.squeeze()
            # Compute the filename
            summ_fn = get_summ_filename(args.out_dir, risk_type,
                                        args.location_id, args.year_id,
                                        measure_id)
            logger.info("Writing {}".format(summ_fn))
            # Write the appropriate subset to file
            write_csv(summ_df[((summ_df.measure_id == measure_id) &
                               (summ_df.rei_id.isin(risks_of_type)))],
                      summ_fn)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return summ_df.shape
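
# Illustrative sketch, separate from the function above: the final loop writes
# one CSV per (measure_id, rei type) pair. The toy below shows the same
# subsetting with plain pandas; 'risks_of_type' stands in for the rei_ids that
# get_rei_type_id_df() would return for a single type, and 'out_path' is a
# made-up destination rather than the real get_summ_filename() layout.
def _sketch_write_measure_risk_subset(summ_df, measure_id, risks_of_type,
                                      out_path):
    subset = summ_df[(summ_df.measure_id == measure_id) &
                     (summ_df.rei_id.isin(risks_of_type))]
    subset.to_csv(out_path, index=False)
    return subset.shape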
def _compute_most_detailed_df(self):
    """Computations only; does not write files. Makes testing easier."""
    start_time = time.time()
    logger.info("START location-year pipeline at {}".format(start_time))

    # Create a DataContainer
    data_container = DataContainer(
        {'location_id': self.location_id,
         'year_id': self.year_id},
        n_draws=self.n_draws,
        gbd_round_id=self.gbd_round_id,
        epi_dir=self.epi_dir,
        cod_dir=self.cod_dir,
        cache_dir=self.cache_dir,
        turn_off_null_and_nan_check=self.turn_off_null_and_nan_check)
    yll_df = data_container['yll']
    yld_df = data_container['yld']

    # Compute DALYs
    draw_cols = list(yll_df.filter(like='draw').columns)
    index_cols = list(set(yll_df.columns) - set(draw_cols))
    computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
    df = computer.get_data_frame()

    logger.info("DALY computation complete, df shape {}".format(df.shape))
    logger.info(" input DF age_group_id {}".format(
        df['age_group_id'].unique()))

    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    existing_age_groups = df['age_group_id'].unique()

    logger.info("Preparing for sex aggregation")

    # Do sex aggregation
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("Sex aggregation complete")

    # Do age aggregation
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("Age aggregation complete")

    # Convert to rate and percent space
    df = MetricConverter(df, to_rate=True, to_percent=True,
                         data_container=data_container).get_data_frame()
    logger.debug("new DF age_group_id {}".format(
        df['age_group_id'].unique()))

    logger.info(" FINAL dalynator result shape {}".format(df.shape))

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return df, existing_age_groups
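
# Illustrative sketch, not called by the method above: SexAggregator produces
# a both-sex total by summing draws across the male and female rows. A minimal
# pandas rendering of that idea; both_sex_id=3 follows the usual GBD shared
# convention and is an assumption here, not read from this file.
def _sketch_aggregate_sexes(df, index_cols, draw_cols, both_sex_id=3):
    # Group on everything except sex_id, sum the draw columns, and relabel
    group_cols = [c for c in index_cols if c != 'sex_id']
    both = df.groupby(group_cols, as_index=False)[draw_cols].sum()
    both['sex_id'] = both_sex_id
    return both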
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, cod_pattern, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           decomp_step, write_out_star_ids, cache_dir,
                           dual_upload):
    """Take a set of aggregated results and reformat them into draws
    consistent with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for cod
            (CoDCorrect) files are stored
        cod_pattern (str): file pattern for accessing CoD-or-FauxCorrect
            draws. Example: '{measure_id}_{location_id}.h5'
        epi_dir (str): directory where the cause-level envelope for epi
            (COMO) files are stored
        turn_off_null_and_nan_check (bool): disable checks for NaNs and Nulls
        write_out_star_ids (bool): if True, include star_ids in output draw
            files and CSV upload files
        dual_upload (bool): if True, upload to the column store as well as
            the gbd database. Currently not implemented.
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warning('is when this event was logged.')

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    # df contains Attributable Burden, which is in Number space.
    # It is a subset of the total count for the parent metric,
    # i.e. the AB of YLLs for a cause attributable to a risk
    # (or to all known & unknown risks, i.e. rei_id == 0).
    # df starts out as a list of data frames.
    df = []
    for metric in ['burden']:
        input_file_pattern = ('FILEPATH')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(
                like='_id').columns].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get the cause envelope
    data_container = DataContainer(
        {'location_id': location_id,
         'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cod_dir=cod_dir,
        cod_pattern=cod_pattern,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df holds the all-cause envelope for this measure, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate the cause envelope with the data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df], sort=True)
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating sexes complete, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))
    # df no longer contains only number-space ABs, because rates are added

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)
    # back_calc_pafs is part of the most-detailed pipeline, reused here
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df], sort=True)
    logger.info("back-calculating PAFs complete, time = {}".format(
        time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "FILEPATH".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids,
                              dual_upload=dual_upload)

    # Save draws
    df = df.loc[(
        (df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
        (df['age_group_id'].isin(most_detailed_age_groups)) &
        (df['metric_id'].isin([gbd.metrics.NUMBER, gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('FILEPATH')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "FILEPATH".format(draw_dir, output_file_path)
    remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
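
# Illustrative sketch, not part of the cleanup job: back_calc_pafs (reused
# from the most-detailed pipeline) recovers PAFs by dividing risk-attributable
# draws by the matching all-cause envelope draws. A hedged toy version of that
# ratio with plain pandas; the merge keys and percent_metric_id=2 are
# assumptions, not read from this file.
def _sketch_back_calc_pafs(attrib_df, envelope_df, merge_cols, draw_cols,
                           percent_metric_id=2):
    import pandas as pd
    merged = pd.merge(attrib_df, envelope_df[merge_cols + draw_cols],
                      on=merge_cols, suffixes=('', '_env'))
    # PAF draw = attributable draw / envelope draw
    for col in draw_cols:
        merged[col] = merged[col] / merged[col + '_env']
    merged = merged.drop(columns=[c + '_env' for c in draw_cols])
    merged['metric_id'] = percent_metric_id
    return merged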
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, epi_dir, turn_off_null_and_nan_check,
                           gbd_round_id, write_out_star_ids, cache_dir):
    """Take a set of aggregated results and reformat them into draws
    consistent with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for cod
            (CoDCorrect) files are stored
        epi_dir (str): directory where the cause-level envelope for epi
            (COMO) files are stored
        turn_off_null_and_nan_check (bool): disable checks for NaNs and Nulls
        write_out_star_ids (bool): if True, include star_ids in output draw
            files and CSV upload files
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    df = []
    for metric in ['burden']:
        input_file_pattern = ('{root}/{metric}/'
                              '{location_id}/{measure_id}/'
                              '{measure_id}_{year_id}_{location_id}_*.h5')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(
                like='_id').columns].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get the cause envelope
    data_container = DataContainer(
        {'location_id': location_id,
         'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        cod_dir=cod_dir,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df holds the all-cause envelope, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate the cause envelope with the data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df])
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating sexes complete, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df])
    logger.info("back-calculating PAFs complete, time = {}".format(
        time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "{}/{}/upload/".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids)

    # Save draws
    df = df.loc[(
        (df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
        (df['age_group_id'].isin(most_detailed_age_groups)) &
        (df['metric_id'].isin([gbd.metrics.NUMBER, gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('{location_id}/'
                           '{measure_id}_{location_id}_{year_id}.h5')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "{}/{}".format(draw_dir, output_file_path)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
def run_pipeline(args):
    """Run the entire dalynator pipeline for one location-year.

    Typically called from run_all -> qsub -> run_remote_pipeline -> here.

    Will raise ValueError if input files are not present.

    TBD: refactor as a ComputationElement followed by a DataSink at the end.

    :param args: the parsed command-line arguments
    :return: the shape of the final DataFrame
    """
    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START location-year pipeline at {}".format(start_time))

    # Create a DataContainer
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        cache_dir=args.cache_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check)
    yll_df = data_container['yll']
    yld_df = data_container['yld']

    # Compute DALYs
    draw_cols = list(yll_df.filter(like='draw').columns)
    index_cols = list(set(yll_df.columns) - set(draw_cols))
    computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
    df = computer.get_data_frame()

    logger.info("DALY computation complete, df shape {}".format(df.shape))
    logger.info(" input DF age_group_id {}".format(
        df['age_group_id'].unique()))

    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    existing_age_groups = df['age_group_id'].unique()

    logger.info("Preparing for sex aggregation")

    # Do sex aggregation
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("Sex aggregation complete")

    # Do age aggregation
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("Age aggregation complete")

    # Convert to rate and percent space
    df = MetricConverter(df, to_rate=True, to_percent=True,
                         data_container=data_container).get_data_frame()
    logger.debug("new DF age_group_id {}".format(df['age_group_id'].unique()))

    logger.info(" FINAL dalynator result shape {}".format(df.shape))

    # Calculate and write out the year summaries as CSV files
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    csv_dir = args.out_dir + '/upload/'
    write_sum.write_summaries(args.location_id, args.year_id, csv_dir, df,
                              index_cols, False, args.gbd_round_id)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    # Add any index-like column to the HDF index for later random access
    filename = get_input_args.calculate_output_filename(
        args.out_dir, gbd.measures.DALY, args.location_id, args.year_id)
    if args.no_sex_aggr:
        df = df[df['sex_id'] != gbd.sex.BOTH]
    if args.no_age_aggr:
        df = df[df['age_group_id'].isin(existing_age_groups)]
    sink = HDFDataSink(
        filename,
        data_columns=[col for col in df if col.endswith("_id")],
        complib="zlib",
        complevel=1)
    sink.write(df)
    logger.info("DONE write DF {}".format(time.time()))

    return df.shape
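
# Illustrative sketch, separate from the pipeline above: MetricConverter with
# to_rate=True rescales number-space draws into rates using population. A
# minimal pandas rendering of that idea; the population column name and
# rate_metric_id=3 are assumptions, not read from this file.
def _sketch_numbers_to_rates(df, pop_df, merge_cols, draw_cols,
                             rate_metric_id=3, pop_col='population'):
    import pandas as pd
    merged = pd.merge(df, pop_df[merge_cols + [pop_col]], on=merge_cols)
    # rate draw = number draw / population
    for col in draw_cols:
        merged[col] = merged[col] / merged[pop_col]
    merged = merged.drop(columns=[pop_col])
    merged['metric_id'] = rate_metric_id
    return merged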