def _save_all_ylls_fauxcorrect(
        ylls: pd.DataFrame,
        yll_shocks: pd.DataFrame,
        parent_dir: str,
        location_id: int,
        sex_id: int,
        measure_id: int = 4
) -> None:
    """Save YLLs for given location and sex."""
    ylls_sink = DrawSink({
        'draw_dir': join(
            parent_dir, FilePaths.DRAWS_SCALED_DIR, FilePaths.YLLS_DIR
        ),
        'file_pattern': FilePaths.YLL_DRAWS_FILE_PATTERN.format(
            sex_id=sex_id, location_id=location_id
        ),
        'h5_tablename': Keys.DRAWS
    })
    ylls_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    ylls_sink.push(ylls, append=False)

    shocks_sink = DrawSink({
        'draw_dir': join(
            parent_dir,
            FilePaths.UNAGGREGATED_DIR,
            FilePaths.SHOCKS_DIR,
            FilePaths.YLLS_DIR
        ),
        'file_pattern': FilePaths.YLL_DRAWS_FILE_PATTERN.format(
            sex_id=sex_id, location_id=location_id
        ),
        'h5_tablename': Keys.DRAWS
    })
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(yll_shocks, append=False)
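The add_measure_id_to_sink transform used by these sinks is not shown in any of the examples. A minimal sketch of what it could look like, assuming (from the add_transform calls above) that a sink transform takes the DataFrame being pushed plus keyword arguments and returns the modified frame:

import pandas as pd

def add_measure_id_to_sink(df: pd.DataFrame, measure_id: int = 4) -> pd.DataFrame:
    # Stamp every row with the measure being written (e.g. 4 = YLLs)
    # so downstream readers can filter draws by measure_id.
    df['measure_id'] = measure_id
    return df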
def save_all_draws(parent_dir, ylls, yll_shocks, location_id, index_columns,
                   measure_id=4):
    # Save YLL data
    agg_rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    rescaled_sink = DrawSink(agg_rescaled_params)
    rescaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    rescaled_sink.push(ylls, append=False)

    agg_shocks_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    shocks_sink = DrawSink(agg_shocks_params)
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(yll_shocks, append=False)
def save_all_draws(parent_dir, index_columns, rescaled_data, shock_data,
                   unscaled_data, measure_id=1):
    # Cast index columns to int so they serialize consistently to HDF.
    for data in [rescaled_data, shock_data, unscaled_data]:
        for i in index_columns:
            data[i] = data[i].astype(int)

    rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    rescaled_sink = DrawSink(rescaled_params)
    rescaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    rescaled_sink.push(rescaled_data, append=False)

    unscaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/unscaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    unscaled_sink = DrawSink(unscaled_params)
    unscaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    unscaled_sink.push(unscaled_data, append=False)

    shocks_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    shocks_sink = DrawSink(shocks_params)
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(shock_data, append=False)
def aggregate_locations(
        aggregation_type: str,
        parent_dir: str,
        measure_id: int,
        gbd_round_id: int,
        location_set_id: int,
        year_id: int
) -> None:
    """
    Uses an AggMemEff aggregator to aggregate locations for deaths and YLLs.

    Arguments:
        aggregation_type (str): the type of data to be aggregated up a
            location hierarchy. One of 'aggregated/rescaled',
            'aggregated/shocks', 'aggregated/unscaled', 'scaled', or
            'unaggregated/shocks'.
        parent_dir (str): parent fauxcorrect directory, e.g. PATH/{version}
        measure_id (int): measure ID for deaths or YLLs
        gbd_round_id (int): GBD round ID for this fauxcorrect run
        location_set_id (int): location set ID with which to aggregate
        year_id (int): year ID of the draws

    Raises:
        ValueError: if measure_id is not deaths (1) or YLLs (4)
    """
    # Set up DrawSource and DrawSink.
    source_dir, sink_dir = _get_draw_source_sink_dirs(
        parent_dir, aggregation_type, measure_id)
    source, draw_filters = _get_draw_source_and_filters(
        aggregation_type, source_dir, year_id, measure_id)
    sink = DrawSink({
        'draw_dir': sink_dir,
        'file_pattern': FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(
            year_id=year_id),
        'h5_tablename': Keys.DRAWS
    })
    sink.add_transform(
        _apply_regional_scalars, parent_dir, gbd_round_id, location_set_id)
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)

    # Clean up old files we plan on writing.
    clean_aggregation_directory(
        root_dir=sink.params['draw_dir'],
        file_pattern=sink.params['file_pattern'],
        location_set_id=location_set_id,
        gbd_round_id=gbd_round_id)

    # Set up aggregator and location tree.
    index_cols = [col for col in Columns.INDEX if col != Columns.LOCATION_ID]
    operator = Sum(index_cols, Columns.DRAWS)
    agg = AggMemEff(
        source, sink, index_cols, Columns.LOCATION_ID, operator, chunksize=2)
    is_sdi_set = location_set_id == LocationSetId.SDI
    trees = loctree(
        location_set_id=location_set_id,
        gbd_round_id=gbd_round_id,
        return_many=is_sdi_set)

    logging.info(f"Aggregating locations, location_set_id: {location_set_id}")
    for tree in np.atleast_1d(trees):
        agg.run(tree, draw_filters=draw_filters, n_processes=10)
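For orientation, a call might look like the following. The argument values are illustrative only, not taken from the source: the parent directory is a hypothetical run path, and location_set_id 35 is an assumption (a standard GBD hierarchy).

aggregate_locations(
    aggregation_type='aggregated/rescaled',  # one of the types in the docstring
    parent_dir='/path/to/fauxcorrect/v1',    # hypothetical version directory
    measure_id=1,                            # deaths
    gbd_round_id=6,
    location_set_id=35,                      # assumed standard GBD hierarchy
    year_id=2019,
)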
draw_source = DrawSource(source_config)
output_pattern = '{measure_id}_{location_id}_{year_id}.h5'
sink_config = {
    'draw_dir': draw_dir,
    'file_pattern': output_pattern,
    'h5_tablename': 'draws'
}
draw_sink = DrawSink(sink_config)

# Apply regional scalar transform
region_locs = get_location_metadata(gbd_round_id=GBD.GBD_ROUND_ID,
                                    location_set_id=35)
region_locs = region_locs[region_locs.level == 2].location_id.tolist()
draw_sink.add_transform(apply_regional_scalars, region_locs=region_locs,
                        parent_dir=parent_dir)
draw_sink.add_transform(transform_add_measure, measure_id=measure_id)

# Create operator
logging.info("Reading regional scalars from flatfiles.")
index_cols = [col for col in index_cols if col != 'location_id']
operator = Sum(index_cols, draw_cols)

# Aggregate
logging.info("Instantiate aggregator.aggregators.AggMemEff.")
aggregator = AggMemEff(draw_source=draw_source,
                       draw_sink=draw_sink,
                       index_cols=index_cols,
                       aggregate_col='location_id',
                       operator=operator)
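The apply_regional_scalars transform above reads its scalars from flatfiles under parent_dir, but its body is not shown. A minimal sketch under stated assumptions: the flatfile name ('regional_scalars.h5'), the 'scalar' column, and the merge keys are all hypothetical, and non-region rows pass through with a neutral factor of 1.0.

import os
import pandas as pd

def apply_regional_scalars(df, region_locs, parent_dir):
    # Hypothetical flatfile path and column names -- the source only
    # logs "Reading regional scalars from flatfiles."
    scalars = pd.read_hdf(os.path.join(parent_dir, 'regional_scalars.h5'))
    draw_cols = [c for c in df.columns if c.startswith('draw_')]
    merged = df.merge(scalars, on=['location_id', 'year_id'], how='left')
    # Scale only region-level locations; everything else gets 1.0.
    merged['scalar'] = merged['scalar'].where(
        merged.location_id.isin(region_locs), 1.0).fillna(1.0)
    merged[draw_cols] = merged[draw_cols].mul(merged['scalar'], axis=0)
    return merged.drop(columns='scalar')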
def gen_draw_sink(self):
    sink = DrawSink(self._mem_io_params, mem_write_func)
    sink.add_transform(sort_index_columns, self.dimensions.index_names)
    return sink
def location_aggregate_birth_counts(gbd_round_id: int, decomp_step: str,
                                    constants_path: pathlib.PosixPath,
                                    location_set_id: int) -> List[str]:
    """
    For a given gbd_round_id, decomp_step, and location_set_id, build a
    complete set of location-aggregated live births and return the list
    of output filenames.
    """
    logger.info(f'aggregating for location_set_id {location_set_id}')
    multiple_tree_flag = (
        location_set_id in mmr_constants.MULTIPLE_ROOT_LOCATION_SET_IDS)
    scalars = get_regional_scalars(gbd_round_id, decomp_step)
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    cov_estimate_filename = (
        mmr_constants.COV_ESTIMATES_FORMAT_FILENAME.format(location_set_id))

    region_locs, most_detailed_locs = get_location_level_sets(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        location_set_id=location_set_id)

    save_birth_count_estimates(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cov_estimate_filepath=constants_path / cov_estimate_filename,
        location_set_id=location_set_id,
        most_detailed_locs=most_detailed_locs)

    loc_trees = dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id,
                                decomp_step=decomp_step,
                                return_many=multiple_tree_flag)
    if not multiple_tree_flag:
        loc_trees = [loc_trees]

    draw_source = DrawSource(params={
        'draw_dir': str(constants_path),
        'file_pattern': cov_estimate_filename
    })

    output_filenames = []
    for i, loc_tree in enumerate(loc_trees, start=1):
        output_filename = f'{location_set_id}_{i}.h5'
        draw_sink = DrawSink(params={
            'draw_dir': str(constants_path),
            'file_pattern': output_filename
        })
        draw_sink.add_transform(
            _apply_regional_scalars,
            regional_scalars_df=scalars.query('location_id in @region_locs'),
            gbd_round_id=gbd_round_id,
            decomp_step=decomp_step)

        op = Sum(index_cols=[s for s in index_cols if s != 'location_id'],
                 value_cols=[mmr_constants.Columns.LIVE_BIRTH_VALUE_COL])
        AggSynchronous(
            draw_source=draw_source,
            draw_sink=draw_sink,
            index_cols=[s for s in index_cols if s != 'location_id'],
            aggregate_col='location_id',
            operator=op).run(loc_tree, include_leaves=True)

        output_filenames.append(output_filename)
    return output_filenames
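A call per location set might look like this; the values are illustrative only (a hypothetical constants directory and an assumed round/step combination):

import pathlib

filenames = location_aggregate_birth_counts(
    gbd_round_id=6,
    decomp_step='step4',                                # illustrative decomp step
    constants_path=pathlib.Path('/path/to/constants'),  # hypothetical directory
    location_set_id=35,
)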