Пример #1
0
    def run_task(self, location_set_version_id, component):
        """Aggregate component draws up every location hierarchy.

        For each tree in the location set version, builds a
        population-weighted-sum operator and runs a memory-efficient
        aggregation from the component's draw source into its sink.

        Args:
            location_set_version_id: location set version whose tree(s)
                define the aggregation hierarchy.
            component: component key used to look up source, sink,
                dimensions, and chunksize.
        """
        source = self.get_source(component)
        sink = self.get_sink(component)
        dimensions = self.dimensions.get_dimension_by_component(
            component, self.measure_id)

        # location_id is the column being aggregated, so it is excluded
        # from the index in both the operator and the aggregator.
        non_loc_index = [
            name for name in dimensions.index_names if name != "location_id"
        ]

        # some location set versions expand to multiple trees
        for tree in dbtrees.loctree(
                location_set_version_id=location_set_version_id,
                return_many=True):

            # population weights for every node in this tree
            weights = get_population(
                self.como_version,
                age_group_id=dimensions.index_dim.get_level("age_group_id"),
                location_id=[node.id for node in tree.nodes],
                year_id=dimensions.index_dim.get_level("year_id"),
                sex_id=dimensions.index_dim.get_level("sex_id"))
            weights = weights[[
                "age_group_id", "location_id", "year_id", "sex_id",
                "population"
            ]]

            # population-weighted sum over the draw value columns
            wtd_sum = WtdSum(
                index_cols=non_loc_index,
                value_cols=dimensions.data_list(),
                weight_df=weights,
                weight_name="population",
                merge_cols=[
                    "location_id", "year_id", "age_group_id", "sex_id"
                ])

            # build the aggregator and walk the tree in one go
            AggMemEff(
                draw_source=source,
                draw_sink=sink,
                index_cols=non_loc_index,
                aggregate_col="location_id",
                operator=wtd_sum,
                chunksize=self.chunksize[component]
            ).run(
                tree,
                draw_filters={
                    "measure_id": [self.measure_id],
                    "year_id": dimensions.index_dim.get_level("year_id"),
                    "sex_id": dimensions.index_dim.get_level("sex_id")
                },
                # NOTE(review): n_processes reuses the chunksize value —
                # mirrors the original; confirm this is intentional.
                n_processes=self.chunksize[component])
Пример #2
0
    def get_dataframe(self):
        """Run the location aggregation pipeline, logging wall-clock timing."""
        started = time.time()
        logger.info("START aggregate locations, time = {}".format(started))

        # aggregate up the tree, skipping leaf nodes (already present)
        aggregator = AggMemEff(self.draw_source, self.draw_sink,
                               self.index_cols, 'location_id', self.operator,
                               chunksize=2)
        aggregator.run(self.loctree, include_leaves=False, n_processes=8,
                       draw_filters=self.draw_filters)

        finished = time.time()
        logger.info("location aggregation complete, time = {}"
                    .format(finished))
        logger.info("DONE location agg pipeline at {}, "
                    "elapsed seconds= {}".format(finished, finished - started))
        logger.info("{}".format(SUCCESS_LOG_MESSAGE))
Пример #3
0
def aggregate_locations(aggregation_type: str, parent_dir: str,
                        measure_id: int, gbd_round_id: int,
                        location_set_id: int, year_id: int) -> None:
    """Aggregate deaths/YLL draws up a location hierarchy via AggMemEff.

    Arguments:
        aggregation_type (str): the type of data to be aggregated up a
            location hierarchy. One of 'aggregated/rescaled',
            'aggregated/shocks', 'aggregated/unscaled'
            'scaled', or 'unaggregated/shocks'.
        parent_dir (str): parent fauxcorrect directory
            e.g. PATH/{version}
        measure_id (int): measure ID for deaths or YLLs
        gbd_round_id (int): GBD round ID for this fauxcorrect run
        location_set_id (int): location set ID with which to aggregate
        year_id (int): draws year ID

    Raises:
        ValueError: if measure_id is not deaths (1) or YLLs (4)
    """
    # Resolve where draws are read from and written to, plus any filters
    # the source requires for this aggregation type.
    source_dir, sink_dir = _get_draw_source_sink_dirs(
        parent_dir, aggregation_type, measure_id)
    source, draw_filters = _get_draw_source_and_filters(
        aggregation_type, source_dir, year_id, measure_id)

    file_pattern = FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(
        year_id=year_id)
    sink = DrawSink({
        'draw_dir': sink_dir,
        'file_pattern': file_pattern,
        'h5_tablename': Keys.DRAWS
    })
    # Apply regional scalars and stamp measure_id on everything written.
    sink.add_transform(_apply_regional_scalars, parent_dir, gbd_round_id,
                       location_set_id)
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)

    # Remove stale outputs we are about to rewrite.
    clean_aggregation_directory(root_dir=sink.params['draw_dir'],
                                file_pattern=sink.params['file_pattern'],
                                location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id)

    # location_id is the aggregated column, so drop it from the index.
    index_cols = [col for col in Columns.INDEX if col != Columns.LOCATION_ID]
    agg = AggMemEff(source,
                    sink,
                    index_cols,
                    Columns.LOCATION_ID,
                    Sum(index_cols, Columns.DRAWS),
                    chunksize=2)

    # The SDI location set expands to multiple trees.
    trees = loctree(location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id,
                    return_many=location_set_id == LocationSetId.SDI)

    logging.info(f"Aggregating locations, location_set_id: {location_set_id}")
    for tree in np.atleast_1d(trees):
        agg.run(tree, draw_filters=draw_filters, n_processes=10)
Пример #4
0
    ]
    # One column name per draw: 'draw_0' ... 'draw_{n_draws - 1}'.
    draw_cols = ['draw_{}'.format(i) for i in range(n_draws)]

    for lsid in location_set_id:
        # Population weights come from a pre-staged CSV per location set.
        popfile = os.path.join(drawdir, 'population_{}.csv'.format(lsid))
        population = pd.read_csv(popfile)

        # aggregation operator: population-weighted sum of the draw columns
        operator = WtdSum(
            index_cols=index_cols,
            value_cols=draw_cols,
            weight_df=population,
            weight_name='population',
            merge_cols=['location_id', 'year_id', 'age_group_id', 'sex_id'])
        # run aggregation over location_id with the operator above
        aggregator = AggMemEff(draw_source=source,
                               draw_sink=sink,
                               index_cols=index_cols,
                               aggregate_col='location_id',
                               operator=operator)

        # NOTE(review): 40 appears to be the SDI location set (other code in
        # this file uses LocationSetId.SDI for the same branch — confirm);
        # it yields multiple trees, each aggregated separately.
        if lsid == 40:
            loc_trees = loctree(location_set_id=lsid,
                                gbd_round_id=gbd_round_id,
                                return_many=True)
            for tree in loc_trees:
                aggregator.run(tree, draw_filters={'rei_id': rei_id})
        else:
            # All other sets yield a single tree.
            loc_tree = loctree(location_set_id=lsid, gbd_round_id=gbd_round_id)
            aggregator.run(loc_tree, draw_filters={'rei_id': rei_id})
Пример #5
0
        # Regions sit at level 2 of the hierarchy; collect their location_ids
        # so the regional-scalar transform only touches those rows.
        region_locs = region_locs[region_locs.level == 2].location_id.tolist()
        draw_sink.add_transform(apply_regional_scalars,
                                region_locs=region_locs,
                                parent_dir=parent_dir)
        # Stamp measure_id onto every frame written through the sink.
        draw_sink.add_transform(transform_add_measure, measure_id=measure_id)

        # create operator: plain sum over all index columns except
        # location_id, which is the column being aggregated.
        # NOTE(review): the log message below mentions regional scalars but
        # this section builds the operator — message may be misplaced.
        logging.info("Reading regional scalars from flatfiles.")
        index_cols = [col for col in index_cols if col != 'location_id']
        operator = Sum(index_cols, draw_cols)

        # Aggregate
        logging.info("Instantiate aggregator.aggregators.AggMemEff.")
        aggregator = AggMemEff(draw_source=draw_source,
                               draw_sink=draw_sink,
                               index_cols=index_cols,
                               aggregate_col='location_id',
                               operator=operator,
                               chunksize=2)

        logging.info("Create location tree(s).")
        # Location set 40 (SDI) returns multiple trees from loctree.
        is_sdi_set = False
        if location_set_id == 40:
            is_sdi_set = True
        tree = loctree(location_set_id=location_set_id,
                       gbd_round_id=GBD.GBD_ROUND_ID,
                       return_many=is_sdi_set)
        logging.info("Run aggregator.")
        for t in np.atleast_1d(tree):
            aggregator.run(t,
                           draw_filters={
                               'measure_id': measure_id,