def compute_sex_aggregates(self):
    """Aggregate draws over sex via a population-weighted sum.

    Builds a draw source squared on sex_id (so every leaf sex is
    present), then runs a synchronous aggregation over the sex tree
    into the draw sink.
    """
    sex_tree = sextree()
    # index columns shared by the fill, the operator and the aggregator
    non_sex_index = [
        col for col in self.dimensions.index_names if col != "sex_id"
    ]

    # make the source and sink
    source = self.gen_draw_source()
    source.add_transform(
        fill_square,
        index_cols=non_sex_index,
        square_col="sex_id",
        square_col_vals=[node.id for node in sex_tree.leaves()])
    sink = self.gen_draw_sink()

    # construct aggregator obj: population-weighted sum across sexes
    operator = WtdSum(
        index_cols=non_sex_index,
        value_cols=self.dimensions.data_list(),
        weight_df=self.population,
        weight_name="population",
        merge_cols=["location_id", "year_id", "age_group_id", "sex_id"])
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=non_sex_index,
        aggregate_col="sex_id",
        operator=operator)

    # run the tree
    aggregator.run(sex_tree)
def _agg_age_std_ages(self):
    """Produce the age-standardized aggregate for the draws.

    Squares the source on age_group_id over the leaves of the
    age-standardized age tree, then aggregates with a sum weighted by
    the standard age weights.
    """
    age_tree = agetree(age.AGE_STANDARDIZED)
    # index columns shared by the fill, the operator and the aggregator
    non_age_index = [
        col for col in self.dimensions.index_names if col != "age_group_id"
    ]

    # make the source and sink
    source = self.gen_draw_source()
    source.add_transform(
        fill_square,
        index_cols=non_age_index,
        square_col="age_group_id",
        square_col_vals=[node.id for node in age_tree.leaves()])
    sink = self.gen_draw_sink()

    # construct aggregator obj: sum weighted by standard age weights,
    # merged on age_group_id only
    operator = WtdSum(
        index_cols=non_age_index,
        value_cols=self.dimensions.data_list(),
        weight_df=self.std_age_weights,
        weight_name="age_group_weight_value",
        merge_cols=["age_group_id"])
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=non_age_index,
        aggregate_col="age_group_id",
        operator=operator)

    # run the tree
    aggregator.run(age_tree)
def _agg_pop_wtd_ages_birth(self, age_group_id):
    """Aggregate ages (including the birth age group) in count space.

    Converts rates to counts on the way in, sums over the age tree
    rooted at ``age_group_id`` (with the birth node attached under the
    root), and converts back to rates on the way out.
    """
    age_tree = agetree(age_group_id)
    age_tree.add_node(age.BIRTH, {}, age_tree.root.id)
    # index columns shared by the operator and the aggregator
    non_age_index = [
        col for col in self.dimensions.index_names if col != "age_group_id"
    ]

    # make the source and sink; transforms move draws between rate and
    # count space so a plain sum is the correct aggregation
    source = self.gen_draw_source()
    source.add_transform(
        convert_to_counts, self.population, self.dimensions.data_list())
    sink = self.gen_draw_sink()
    sink.add_transform(
        convert_to_rates, self.population, self.dimensions.data_list())

    # construct aggregator obj
    operator = Sum(
        index_cols=non_age_index,
        value_cols=self.dimensions.data_list())
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=non_age_index,
        aggregate_col="age_group_id",
        operator=operator)
    aggregator.run(age_tree)
def _get_population(
        version: MachineParameters,
        location_set_id: int = constants.LocationSetId.OUTPUTS,
        agg_loc_sets: Optional[List[int]] = tuple(
            constants.LocationAggregation.Ids.SPECIAL_LOCATIONS +
            [constants.LocationSetId.OUTPUTS])
) -> pd.DataFrame:
    """
    Unpacks arguments from version object to use with get_population
    function.

    Requests most detailed ages and most detailed sexes because
    age-sex population aggregates are created in the summarize module.

    Dependant on demographics team to upload population for majority
    of aggregate locations but currently uses AggSynchronous to create
    population information for select Norway locations in
    LocationSetId.OUTPUTS.

    Arguments:
        version (MachineParameters): object containing all the
            demographic and configuration data needed to query
            population estimates.
        location_set_id (int): The id for hierarchy to aggregate up
        agg_loc_sets (list): Additional location sets to create
            special aggregates. Must be duplicate-free and end with
            the OUTPUTS location set.

    Return:
        pd.DataFrame

    Raises:
        ValueError: if agg_loc_sets contains duplicates or does not
            end with constants.LocationSetId.OUTPUTS.
    """
    pop = get_population(age_group_id=version.most_detailed_age_group_ids,
                         location_id=version.location_ids,
                         year_id=version.year_ids,
                         sex_id=version.sex_ids,
                         run_id=version.population_version_id,
                         decomp_step=version.decomp_step,
                         gbd_round_id=version.gbd_round_id)
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    index_cols = constants.Columns.DEMOGRAPHIC_INDEX
    data_cols = [constants.Columns.POPULATION]
    sink.push(pop[index_cols + data_cols])

    # location
    if agg_loc_sets:
        # explicit validation instead of assert: asserts are stripped
        # when python runs with -O
        if len(agg_loc_sets) != len(set(agg_loc_sets)):
            raise ValueError(
                f"agg_loc_sets must not contain duplicates: {agg_loc_sets}")
        if agg_loc_sets[-1] != constants.LocationSetId.OUTPUTS:
            raise ValueError(
                "agg_loc_sets must end with the OUTPUTS location set id")
        # index columns shared by the operator and the aggregator
        non_location_index = [
            col for col in index_cols
            if col != constants.Columns.LOCATION_ID
        ]
        for set_id in agg_loc_sets:
            loc_tree = dbtrees.loctree(location_set_id=set_id,
                                       gbd_round_id=version.gbd_round_id)
            operator = Sum(index_cols=non_location_index,
                           value_cols=data_cols)
            aggregator = AggSynchronous(
                draw_source=source,
                draw_sink=sink,
                index_cols=non_location_index,
                aggregate_col=constants.Columns.LOCATION_ID,
                operator=operator)
            aggregator.run(loc_tree)
        special_locations = source.content()
    else:
        special_locations = pd.DataFrame()

    # keep the demographics-team population for any location present in
    # both; only append aggregates for locations it did not cover
    return pd.concat(
        [
            pop,
            special_locations.loc[~special_locations.location_id.isin(
                pop.location_id.unique())]
        ],
        ignore_index=True)
def new_population(self, location_set_id, agg_loc_sets=()):
    """Build and persist the population cache for this como run.

    Pulls most-detailed population (plus the birth age group, id 164),
    aggregates it over the requested location sets, the GBD compare
    age groups and both-sex, and writes the result to
    ``{como_dir}/info/population.h5``.

    Arguments:
        location_set_id (int): hierarchy whose node locations are
            pulled from get_population.
        agg_loc_sets (iterable of int): additional location sets to
            aggregate up. Defaults to an empty tuple (an immutable
            default avoids the shared-mutable-default pitfall of the
            previous ``[]`` default; behavior is unchanged).
    """
    dim = self.nonfatal_dimensions.get_simulation_dimensions(
        self.measure_id)
    df = get_population(
        age_group_id=(
            dim.index_dim.get_level("age_group_id") + [164]),
        location_id=dbtrees.loctree(location_set_id=location_set_id,
                                    gbd_round_id=self.gbd_round_id
                                    ).node_ids,
        sex_id=dim.index_dim.get_level("sex_id"),
        year_id=dim.index_dim.get_level("year_id"))

    index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
    data_cols = ["population"]

    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    sink.push(df[index_cols + data_cols])

    # location
    non_location_index = [
        col for col in index_cols if col != "location_id"
    ]
    for set_id in agg_loc_sets:
        loc_tree = dbtrees.loctree(
            location_set_id=set_id, gbd_round_id=self.gbd_round_id)
        operator = Sum(
            index_cols=non_location_index,
            value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=non_location_index,
            aggregate_col="location_id",
            operator=operator)
        aggregator.run(loc_tree)

    # age
    non_age_index = [
        col for col in index_cols if col != "age_group_id"
    ]
    for age_group_id in ComoSummaries._gbd_compare_age_group_list:
        age_tree = dbtrees.agetree(age_group_id)
        operator = Sum(
            index_cols=non_age_index,
            value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=non_age_index,
            aggregate_col="age_group_id",
            operator=operator)
        aggregator.run(age_tree)

    # sex
    non_sex_index = [col for col in index_cols if col != "sex_id"]
    sex_tree = dbtrees.sextree()
    operator = Sum(
        index_cols=non_sex_index,
        value_cols=data_cols)
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=non_sex_index,
        aggregate_col="sex_id",
        operator=operator)
    aggregator.run(sex_tree)

    # persist the fully aggregated population for downstream tasks
    df = source.content()
    df.to_hdf(
        "{}/info/population.h5".format(self.como_dir),
        'draws',
        mode='w',
        format='table',
        data_columns=["location_id", "year_id", "age_group_id", "sex_id"])
def location_aggregate_birth_counts(
        gbd_round_id: int, decomp_step: str,
        constants_path: pathlib.PosixPath,
        location_set_id: int) -> List[str]:
    """
    For given gbd_round, decomp_step, location_set_id, get a complete
    set of location-aggregated live births.

    Writes one HDF file per location tree under ``constants_path``
    (some location sets have multiple root trees) and returns the list
    of filenames written.

    Arguments:
        gbd_round_id (int): GBD round to aggregate for.
        decomp_step (str): decomp step to aggregate for.
        constants_path (pathlib.PosixPath): directory holding the
            covariate-estimate inputs and receiving the outputs.
        location_set_id (int): hierarchy to aggregate up.

    Return:
        List[str]: filenames of the aggregated files, one per tree.
        (The previous ``-> None`` annotation was wrong: the function
        has always returned this list.)
    """
    logger.info(f'aggregating for location_set_id {location_set_id}')
    multiple_tree_flag = (
        location_set_id in mmr_constants.MULTIPLE_ROOT_LOCATION_SET_IDS)

    scalars = get_regional_scalars(gbd_round_id, decomp_step)
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']

    cov_estimate_filename = (
        mmr_constants.COV_ESTIMATES_FORMAT_FILENAME.format(location_set_id))

    region_locs, most_detailed_locs = get_location_level_sets(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        location_set_id=location_set_id)

    save_birth_count_estimates(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cov_estimate_filepath=constants_path / cov_estimate_filename,
        location_set_id=location_set_id,
        most_detailed_locs=most_detailed_locs)

    loc_trees = dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id,
                                decomp_step=decomp_step,
                                return_many=multiple_tree_flag)
    # normalize to a list so single- and multi-tree sets share one loop
    if not multiple_tree_flag:
        loc_trees = [loc_trees]

    draw_source = DrawSource(params={
        'draw_dir': str(constants_path),
        'file_pattern': cov_estimate_filename
    })

    non_location_index = [s for s in index_cols if s != 'location_id']
    output_filenames = []
    for tree_num, loc_tree in enumerate(loc_trees, start=1):
        output_filename = f'{location_set_id}_{tree_num}.h5'
        draw_sink = DrawSink(params={
            'draw_dir': str(constants_path),
            'file_pattern': output_filename
        })
        # regional scalars correct for births in regions; applied only
        # to region-level rows on the way out
        draw_sink.add_transform(
            _apply_regional_scalars,
            regional_scalars_df=scalars.query('location_id in @region_locs'),
            gbd_round_id=gbd_round_id,
            decomp_step=decomp_step)

        op = Sum(index_cols=non_location_index,
                 value_cols=[mmr_constants.Columns.LIVE_BIRTH_VALUE_COL])

        AggSynchronous(
            draw_source=draw_source,
            draw_sink=draw_sink,
            index_cols=non_location_index,
            aggregate_col='location_id',
            operator=op).run(loc_tree, include_leaves=True)

        output_filenames.append(output_filename)

    return output_filenames