def run_task(self, location_set_version_id, component):
    source = self.get_source(component)
    sink = self.get_sink(component)
    dimensions = self.dimensions.get_dimension_by_component(
        component, self.measure_id)

    # get the tree we are aggregating
    loc_trees = dbtrees.loctree(
        location_set_version_id=location_set_version_id,
        return_many=True)
    for loc_tree in loc_trees:
        # get the weight vals
        pop = get_population(
            self.como_version,
            age_group_id=dimensions.index_dim.get_level("age_group_id"),
            location_id=[node.id for node in loc_tree.nodes],
            year_id=dimensions.index_dim.get_level("year_id"),
            sex_id=dimensions.index_dim.get_level("sex_id"))
        pop = pop[[
            "age_group_id", "location_id", "year_id", "sex_id", "population"
        ]]

        # set up our aggregation operator
        operator = WtdSum(
            index_cols=[col for col in dimensions.index_names
                        if col != "location_id"],
            value_cols=dimensions.data_list(),
            weight_df=pop,
            weight_name="population",
            merge_cols=["location_id", "year_id", "age_group_id", "sex_id"])

        # set up our aggregator
        aggregator = AggMemEff(
            draw_source=source,
            draw_sink=sink,
            index_cols=[col for col in dimensions.index_names
                        if col != "location_id"],
            aggregate_col="location_id",
            operator=operator,
            chunksize=self.chunksize[component])

        # run the tree
        aggregator.run(
            loc_tree,
            draw_filters={
                "measure_id": [self.measure_id],
                "year_id": dimensions.index_dim.get_level("year_id"),
                "sex_id": dimensions.index_dim.get_level("sex_id")
            },
            n_processes=self.chunksize[component])
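# For intuition, here is a minimal, self-contained pandas sketch of the
# population-weighted aggregation that the WtdSum operator performs
# conceptually: child-location draws are combined using population as the
# weight and emitted for the parent node. The exact normalization (weighted
# sum vs. weighted average) is defined inside the aggregator library; this
# sketch shows the weighted-average variant, and every name below is
# illustrative rather than part of the aggregator API.
import pandas as pd

def weighted_aggregate(draws, pop, value_cols, index_cols, parent_id):
    """Aggregate child-location draws to a single parent location."""
    merged = draws.merge(
        pop, on=["location_id", "year_id", "age_group_id", "sex_id"])
    # weight each draw column by population
    for col in value_cols:
        merged[col] = merged[col] * merged["population"]
    out = merged.groupby(index_cols, as_index=False)[
        value_cols + ["population"]].sum()
    # divide back out to obtain a population-weighted average
    for col in value_cols:
        out[col] = out[col] / out["population"]
    out["location_id"] = parent_id
    return out.drop(columns="population")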
def get_dataframe(self):
    start_time = time.time()
    logger.info("START aggregate locations, time = {}".format(start_time))
    AggMemEff(self.draw_source, self.draw_sink, self.index_cols,
              'location_id', self.operator, chunksize=2
              ).run(self.loctree, include_leaves=False, n_processes=8,
                    draw_filters=self.draw_filters)
    end_time = time.time()
    logger.info("location aggregation complete, time = {}"
                .format(end_time))
    elapsed = end_time - start_time
    logger.info("DONE location agg pipeline at {}, "
                "elapsed seconds = {}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
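# The START/DONE log bracketing above is a recurring pattern in these
# pipelines; a hedged sketch of factoring it into a context manager
# (the logger name and messages are illustrative, not taken from the
# production code):
import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def log_duration(step):
    start = time.time()
    logger.info("START %s, time = %s", step, start)
    try:
        yield
    finally:
        end = time.time()
        logger.info("DONE %s at %s, elapsed seconds = %s",
                    step, end, end - start)

# usage: with log_duration("aggregate locations"): aggregator.run(...)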
def aggregate_locations(
        aggregation_type: str,
        parent_dir: str,
        measure_id: int,
        gbd_round_id: int,
        location_set_id: int,
        year_id: int
) -> None:
    """
    Uses an AggMemEff aggregator to aggregate locations for deaths and YLLs.

    Arguments:
        aggregation_type (str): the type of data to be aggregated up a
            location hierarchy. One of 'aggregated/rescaled',
            'aggregated/shocks', 'aggregated/unscaled', 'scaled', or
            'unaggregated/shocks'.
        parent_dir (str): parent fauxcorrect directory, e.g. PATH/{version}
        measure_id (int): measure ID for deaths or YLLs
        gbd_round_id (int): GBD round ID for this fauxcorrect run
        location_set_id (int): location set ID with which to aggregate
        year_id (int): draws year ID

    Raises:
        ValueError: if measure_id is not deaths (1) or YLLs (4)
    """
    # Set up DrawSource and DrawSink.
    source_dir, sink_dir = _get_draw_source_sink_dirs(
        parent_dir, aggregation_type, measure_id)
    source, draw_filters = _get_draw_source_and_filters(
        aggregation_type, source_dir, year_id, measure_id)
    sink = DrawSink({
        'draw_dir': sink_dir,
        'file_pattern': FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(
            year_id=year_id),
        'h5_tablename': Keys.DRAWS
    })
    sink.add_transform(
        _apply_regional_scalars, parent_dir, gbd_round_id, location_set_id)
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)

    # Clean up old files we plan on writing.
    clean_aggregation_directory(
        root_dir=sink.params['draw_dir'],
        file_pattern=sink.params['file_pattern'],
        location_set_id=location_set_id,
        gbd_round_id=gbd_round_id)

    # Set up aggregator and location tree.
    index_cols = [col for col in Columns.INDEX if col != Columns.LOCATION_ID]
    operator = Sum(index_cols, Columns.DRAWS)
    agg = AggMemEff(
        source, sink, index_cols, Columns.LOCATION_ID, operator, chunksize=2)
    is_sdi_set = location_set_id == LocationSetId.SDI
    trees = loctree(
        location_set_id=location_set_id,
        gbd_round_id=gbd_round_id,
        return_many=is_sdi_set)

    logging.info(f"Aggregating locations, location_set_id: {location_set_id}")
    for tree in np.atleast_1d(trees):
        agg.run(tree, draw_filters=draw_filters, n_processes=10)
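# The sink transforms registered above are plain callables that receive
# each chunk of draws as a DataFrame (plus the extra arguments bound in
# add_transform) and return the modified DataFrame. A hedged sketch of the
# shape such a transform takes; this is an illustrative stand-in, not the
# actual fauxcorrect helper:
import pandas as pd

def add_measure_id(df: pd.DataFrame, measure_id: int) -> pd.DataFrame:
    # attach measure_id so downstream readers can filter on it
    df['measure_id'] = measure_id
    return df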
]
draw_cols = ['draw_{}'.format(i) for i in range(n_draws)]

for lsid in location_set_id:
    popfile = os.path.join(drawdir, 'population_{}.csv'.format(lsid))
    population = pd.read_csv(popfile)

    # aggregation operator
    operator = WtdSum(
        index_cols=index_cols,
        value_cols=draw_cols,
        weight_df=population,
        weight_name='population',
        merge_cols=['location_id', 'year_id', 'age_group_id', 'sex_id'])

    # run aggregation
    aggregator = AggMemEff(
        draw_source=source,
        draw_sink=sink,
        index_cols=index_cols,
        aggregate_col='location_id',
        operator=operator)
    if lsid == 40:
        loc_trees = loctree(location_set_id=lsid,
                            gbd_round_id=gbd_round_id,
                            return_many=True)
        for tree in loc_trees:
            aggregator.run(tree, draw_filters={'rei_id': rei_id})
    else:
        loc_tree = loctree(location_set_id=lsid,
                           gbd_round_id=gbd_round_id)
        aggregator.run(loc_tree, draw_filters={'rei_id': rei_id})
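# The if/else above exists because the SDI location set (id 40) is stored
# as several disjoint trees, so loctree must be called with
# return_many=True. As the other call sites in this section do, the branch
# can be collapsed with numpy's atleast_1d; a hedged sketch reusing the
# loctree and aggregator names from the snippet above:
import numpy as np

def run_for_location_set(aggregator, lsid, gbd_round_id, rei_id):
    trees = loctree(location_set_id=lsid,
                    gbd_round_id=gbd_round_id,
                    return_many=(lsid == 40))
    for tree in np.atleast_1d(trees):
        aggregator.run(tree, draw_filters={'rei_id': rei_id})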
region_locs = region_locs[region_locs.level == 2].location_id.tolist()
draw_sink.add_transform(apply_regional_scalars, region_locs=region_locs,
                        parent_dir=parent_dir)
draw_sink.add_transform(transform_add_measure, measure_id=measure_id)

# create operator
logging.info("Reading regional scalars from flatfiles.")
index_cols = [col for col in index_cols if col != 'location_id']
operator = Sum(index_cols, draw_cols)

# Aggregate
logging.info("Instantiate aggregator.aggregators.AggMemEff.")
aggregator = AggMemEff(
    draw_source=draw_source,
    draw_sink=draw_sink,
    index_cols=index_cols,
    aggregate_col='location_id',
    operator=operator,
    chunksize=2)

logging.info("Create location tree(s).")
is_sdi_set = False
if location_set_id == 40:
    is_sdi_set = True
tree = loctree(location_set_id=location_set_id,
               gbd_round_id=GBD.GBD_ROUND_ID,
               return_many=is_sdi_set)

logging.info("Run aggregator.")
for t in np.atleast_1d(tree):
    aggregator.run(t, draw_filters={
        'measure_id': measure_id,
    })
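# apply_regional_scalars is registered as a sink transform above;
# conceptually it multiplies draws for region-level locations by a
# correction factor read from flat files. A hedged, self-contained pandas
# sketch of that idea; the scalar lookup and all names here are
# illustrative, not the production implementation:
import pandas as pd

def apply_regional_scalars_sketch(df, region_locs, scalars):
    """Scale draws for region locations; `scalars` is assumed to carry
    location_id, year_id and a 'scalar' column."""
    draw_cols = [c for c in df.columns if c.startswith('draw_')]
    mask = df['location_id'].isin(region_locs)
    scaled = df[mask].merge(scalars, on=['location_id', 'year_id'],
                            how='left')
    # locations with no scalar on file are left unscaled
    scaled['scalar'] = scaled['scalar'].fillna(1.0)
    scaled[draw_cols] = scaled[draw_cols].mul(scaled['scalar'], axis=0)
    return pd.concat([df[~mask], scaled.drop(columns='scalar')],
                     ignore_index=True)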