def _save_all_ylls_fauxcorrect(
        ylls: pd.DataFrame,
        yll_shocks: pd.DataFrame,
        parent_dir: str,
        location_id: int,
        sex_id: int,
        measure_id: int = 4
) -> None:
    """Save YLLs for given location and sex.

    Writes two outputs under ``parent_dir``: the scaled YLL draws and the
    YLL shock draws. Both sinks share the same file pattern and HDF table
    name, and each gets a transform that stamps ``measure_id`` onto the
    frame before writing (push with ``append=False``).
    """
    file_pattern = FilePaths.YLL_DRAWS_FILE_PATTERN.format(
        sex_id=sex_id, location_id=location_id)

    # (subdirectory path components, dataframe to write) per output
    targets = [
        ((FilePaths.DRAWS_SCALED_DIR, FilePaths.YLLS_DIR), ylls),
        ((FilePaths.UNAGGREGATED_DIR, FilePaths.SHOCKS_DIR,
          FilePaths.YLLS_DIR), yll_shocks),
    ]
    for subdirs, frame in targets:
        sink = DrawSink({
            'draw_dir': join(parent_dir, *subdirs),
            'file_pattern': file_pattern,
            'h5_tablename': Keys.DRAWS
        })
        sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
        sink.push(frame, append=False)
def write(self, data_frame, **kwargs):
    """Write *data_frame* to this object's HDF location via a DrawSink.

    Keyword arguments are layered on top of a deep copy of ``self.kwargs``
    and forwarded to ``DrawSink.push``. When the frame has an ``rei_id``
    column, rows where ``rei_id == risk.TOTAL_ATTRIBUTABLE`` are dropped
    before writing.
    """
    logger.info("Writing HDF file '{}'".format(self.file_path))
    self.check_paths()

    # merge instance-level push kwargs with per-call overrides
    push_kwargs = deepcopy(self.kwargs)
    push_kwargs.update(kwargs)

    sink = DrawSink(params={
        'file_pattern': self.file_name,
        'draw_dir': self.dir_name
    })

    to_write = data_frame
    if 'rei_id' in data_frame.columns:
        to_write = data_frame[data_frame.rei_id != risk.TOTAL_ATTRIBUTABLE]
    sink.push(to_write, append=False, **push_kwargs)

    logger.info(" finished write to HDF file {}".format(self.file_path))
def _import_static_draws_by_component(self, component):
    """Load on-disk draws for *component* into the in-memory mock store.

    Reads the component's HDF draws for this run's first location,
    applies the ``add_metric`` transform, filters to the simulation
    dimensions (but with the component's own age groups), and pushes the
    result into ``self.io_mock`` under the component's name.
    """
    dim = self.dimensions.get_dimension_by_component(
        component, self.measure_id)
    location = str(dim.index_dim.get_level("location_id")[0])
    draw_dir = os.path.join(
        self.como_version.como_dir, "draws", component, location)

    disk_source = DrawSource({
        "draw_dir": draw_dir,
        "file_pattern": "{measure_id}_{year_id}_{sex_id}.h5"
    })
    disk_source.add_transform(add_metric)

    mem_sink = DrawSink(
        {"draw_dict": self.io_mock, "name": component}, mem_write_func)

    # filters come from the simulation dimensions, except age groups,
    # which are taken from the component-specific dimension
    sim_dim = self.dimensions.get_simulation_dimensions(self.measure_id)
    filters = sim_dim.index_dim.levels.copy()
    filters["age_group_id"] = dim.index_dim.get_level("age_group_id")

    mem_sink.push(disk_source.content(filters=filters))
def save_all_draws(parent_dir, ylls, yll_shocks, location_id, index_columns,
                   measure_id=4):
    """Write aggregated rescaled YLL draws and YLL shock draws to HDF.

    Each frame is stamped with ``measure_id`` by a sink transform and
    written without appending, under ``parent_dir`` using the shared
    ``{measure_id}_{location_id}_{year_id}.h5`` file pattern.
    """
    # (output subdirectory, dataframe) per sink
    outputs = (
        ('aggregated/rescaled', ylls),
        ('aggregated/shocks', yll_shocks),
    )
    for subdir, frame in outputs:
        sink = DrawSink({
            'draw_dir': os.path.join(parent_dir, subdir),
            'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
            'h5_tablename': 'draws'
        })
        sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
        sink.push(frame, append=False)
def save_all_draws(parent_dir, index_columns, rescaled_data, shock_data,
                   unscaled_data, measure_id=1):
    """Cast index columns to int, then write the three draw sets to HDF.

    Writes rescaled, unscaled, and shock draws under ``parent_dir`` with a
    shared file pattern; each sink stamps ``measure_id`` onto the frame via
    a transform and pushes without appending.
    """
    # normalize index columns to integer dtype in-place on every frame
    for frame in (rescaled_data, shock_data, unscaled_data):
        for col in index_columns:
            frame[col] = frame[col].astype(int)

    # (output subdirectory, dataframe) per sink, in original write order
    outputs = (
        ('aggregated/rescaled', rescaled_data),
        ('aggregated/unscaled', unscaled_data),
        ('aggregated/shocks', shock_data),
    )
    for subdir, frame in outputs:
        sink = DrawSink({
            'draw_dir': os.path.join(parent_dir, subdir),
            'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
            'h5_tablename': 'draws'
        })
        sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
        sink.push(frame, append=False)
def _get_population(
        version: MachineParameters,
        location_set_id: int = constants.LocationSetId.OUTPUTS,
        agg_loc_sets: Optional[List[int]] = (
            constants.LocationAggregation.Ids.SPECIAL_LOCATIONS +
            [constants.LocationSetId.OUTPUTS])
) -> pd.DataFrame:
    """
    Unpacks arguments from version object to use with get_population
    function.

    Requests most detailed ages and most detailed sexes because
    age-sex population aggregates are created in the summarize module.

    Dependant on demographics team to upload population for majority of
    aggregate locations but currently uses AggSynchronous to create
    population information for select Norway locations in
    LocationSetId.OUTPUTS.

    Arguments:
        version (MachineParameters): object containing all the demographic
            and configuration data needed to query population
            estimates.
        location_set_id (int): The id for hierarchy to aggregate up
        agg_loc_sets (list): Additional location sets to create special
            aggregates

    Return:
        pd.DataFrame of population, with any special aggregate locations
        appended that were not already present in the base pull.

    Raises:
        ValueError: if agg_loc_sets contains duplicates or does not end
            with constants.LocationSetId.OUTPUTS.
    """
    pop = get_population(
        age_group_id=version.most_detailed_age_group_ids,
        location_id=version.location_ids,
        year_id=version.year_ids,
        sex_id=version.sex_ids,
        run_id=version.population_version_id,
        decomp_step=version.decomp_step,
        gbd_round_id=version.gbd_round_id)

    # stage the base pull in an in-memory draw store so AggSynchronous
    # can read from and write back to it
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"},
                        mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    index_cols = constants.Columns.DEMOGRAPHIC_INDEX
    data_cols = [constants.Columns.POPULATION]
    sink.push(pop[index_cols + data_cols])

    # location aggregation
    if agg_loc_sets:
        # NOTE: these were `assert` statements, which are stripped under
        # `python -O`; raise explicitly so validation always runs.
        if len(agg_loc_sets) != len(set(agg_loc_sets)):
            raise ValueError(
                "agg_loc_sets must not contain duplicates. Got: {}"
                .format(agg_loc_sets))
        if agg_loc_sets[-1] != constants.LocationSetId.OUTPUTS:
            raise ValueError(
                "agg_loc_sets must end with the OUTPUTS location set. "
                "Got: {}".format(agg_loc_sets))

        # every aggregation groups by all demographic columns except
        # location (computed once instead of per-use)
        non_location_cols = [
            col for col in index_cols
            if col != constants.Columns.LOCATION_ID
        ]
        for set_id in agg_loc_sets:
            loc_tree = dbtrees.loctree(location_set_id=set_id,
                                       gbd_round_id=version.gbd_round_id)
            operator = Sum(index_cols=non_location_cols,
                           value_cols=data_cols)
            aggregator = AggSynchronous(
                draw_source=source,
                draw_sink=sink,
                index_cols=non_location_cols,
                aggregate_col=constants.Columns.LOCATION_ID,
                operator=operator)
            aggregator.run(loc_tree)
        special_locations = source.content()
    else:
        special_locations = pd.DataFrame()

    # BUG FIX: the original unconditionally accessed
    # special_locations.location_id, which raises AttributeError on the
    # empty DataFrame produced when agg_loc_sets is falsy.
    if special_locations.empty:
        return pop

    # append only aggregate locations absent from the base pull
    return pd.concat(
        [
            pop,
            special_locations.loc[
                ~special_locations.location_id.isin(
                    pop.location_id.unique())]
        ],
        ignore_index=True)
def new_population(self, location_set_id, agg_loc_sets=None):
    """Pull population, build location/age/sex aggregates, persist to HDF.

    Pulls population for the simulation dimensions (plus birth age group
    164) across the given location hierarchy, aggregates it up each
    requested location set, the GBD compare age groups, and both sexes
    via an in-memory draw store, then writes the result to
    ``{como_dir}/info/population.h5`` (table ``draws``, overwriting).

    Arguments:
        location_set_id (int): hierarchy whose nodes define the base
            location pull.
        agg_loc_sets (list): additional location sets to aggregate up.
            Defaults to no extra aggregation.
    """
    # BUG FIX: default was a mutable `[]` shared across calls; use None
    # sentinel (behaviorally identical for callers).
    if agg_loc_sets is None:
        agg_loc_sets = []

    dim = self.nonfatal_dimensions.get_simulation_dimensions(
        self.measure_id)
    df = get_population(
        age_group_id=(
            dim.index_dim.get_level("age_group_id") + [164]),
        location_id=dbtrees.loctree(
            location_set_id=location_set_id,
            gbd_round_id=self.gbd_round_id).node_ids,
        sex_id=dim.index_dim.get_level("sex_id"),
        year_id=dim.index_dim.get_level("year_id"))

    index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
    data_cols = ["population"]

    # stage the pull in an in-memory draw store for AggSynchronous
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"},
                        mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    sink.push(df[index_cols + data_cols])

    def _aggregate(tree, agg_col):
        # Sum population over every index column except the one being
        # aggregated, then run the aggregation over the given tree.
        keep_cols = [col for col in index_cols if col != agg_col]
        operator = Sum(index_cols=keep_cols, value_cols=data_cols)
        AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=keep_cols,
            aggregate_col=agg_col,
            operator=operator).run(tree)

    # location aggregates for each requested location set
    for set_id in agg_loc_sets:
        _aggregate(
            dbtrees.loctree(location_set_id=set_id,
                            gbd_round_id=self.gbd_round_id),
            "location_id")

    # age aggregates for the standard GBD compare age groups
    for age_group_id in ComoSummaries._gbd_compare_age_group_list:
        _aggregate(dbtrees.agetree(age_group_id), "age_group_id")

    # both-sex aggregate
    _aggregate(dbtrees.sextree(), "sex_id")

    df = source.content()
    df.to_hdf(
        "{}/info/population.h5".format(self.como_dir),
        'draws',
        mode='w',
        format='table',
        data_columns=["location_id", "year_id", "age_group_id", "sex_id"])