Example #1
File: ylls.py  Project: cheth-rowe/ihmexp
def _save_all_ylls_fauxcorrect(
        ylls: pd.DataFrame,
        yll_shocks: pd.DataFrame,
        parent_dir: str,
        location_id: int,
        sex_id: int,
        measure_id: int = 4
) -> None:
    """Save YLLs for given location and sex"""
    ylls_sink = DrawSink({
        'draw_dir': join(
            parent_dir, FilePaths.DRAWS_SCALED_DIR, FilePaths.YLLS_DIR
        ),
        'file_pattern': FilePaths.YLL_DRAWS_FILE_PATTERN.format(
            sex_id=sex_id, location_id=location_id
        ),
        'h5_tablename': Keys.DRAWS
    })
    ylls_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    ylls_sink.push(ylls, append=False)

    shocks_sink = DrawSink({
        'draw_dir': join(
            parent_dir,
            FilePaths.UNAGGREGATED_DIR,
            FilePaths.SHOCKS_DIR,
            FilePaths.YLLS_DIR
        ),
        'file_pattern': FilePaths.YLL_DRAWS_FILE_PATTERN.format(
            sex_id=sex_id, location_id=location_id
        ),
        'h5_tablename': Keys.DRAWS
    })
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(yll_shocks, append=False)
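
Every example in this listing registers add_measure_id_to_sink on the sink before pushing, but the helper itself is never shown. As a rough sketch, a DrawSink transform of this kind presumably takes the DataFrame first and returns it tagged with the measure; the signature and body below are assumptions, not the library's confirmed API:

import pandas as pd

def add_measure_id_to_sink(df: pd.DataFrame, measure_id: int) -> pd.DataFrame:
    # Assumed behavior: tag every draw row with the measure it represents
    # (1 = deaths, 4 = YLLs) so downstream readers can filter on measure_id.
    df['measure_id'] = measure_id
    return df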
Example #2
def save_all_draws(parent_dir,
                   ylls,
                   yll_shocks,
                   location_id,
                   index_columns,
                   measure_id=4):
    # Save YLL data
    agg_rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    rescaled_sink = DrawSink(agg_rescaled_params)
    rescaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    rescaled_sink.push(ylls, append=False)

    agg_shocks_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    shocks_sink = DrawSink(agg_shocks_params)
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(yll_shocks, append=False)
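
Compared with Example #1, this version writes to fixed 'aggregated/...' subdirectories and keeps all three placeholders in the file pattern, which the sink presumably fills from columns of the pushed data. A hedged usage sketch; the path, IDs, and single draw column are placeholders (real inputs carry draw_0..draw_999):

import pandas as pd

ylls = pd.DataFrame({
    'location_id': [102], 'year_id': [2019], 'age_group_id': [22],
    'sex_id': [3], 'cause_id': [294], 'draw_0': [0.1],
})
save_all_draws('/path/to/version', ylls, ylls.copy(),
               location_id=102,
               index_columns=['location_id', 'year_id', 'age_group_id',
                              'sex_id', 'cause_id'])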
Example #3
def save_all_draws(parent_dir,
                   index_columns,
                   rescaled_data,
                   shock_data,
                   unscaled_data,
                   measure_id=1):
    for data in [rescaled_data, shock_data, unscaled_data]:
        for i in index_columns:
            data[i] = data[i].astype(int)

    rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    rescaled_sink = DrawSink(rescaled_params)
    rescaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    rescaled_sink.push(rescaled_data, append=False)

    unscaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/unscaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    unscaled_sink = DrawSink(unscaled_params)
    unscaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    unscaled_sink.push(unscaled_data, append=False)

    shocks_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    shocks_sink = DrawSink(shocks_params)
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(shock_data, append=False)
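
The three parameter dicts above differ only in the output subdirectory. A hedged refactor sketch that factors out the repetition (the helper name _make_agg_sink is an invention, not part of the source):

import os

def _make_agg_sink(parent_dir, subdir, measure_id):
    # Build a DrawSink for one 'aggregated/...' subdirectory and register
    # the shared measure-id transform, as each block above does by hand.
    sink = DrawSink({
        'draw_dir': os.path.join(parent_dir, subdir),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    })
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    return sink

With it, the body reduces to three calls of the form _make_agg_sink(parent_dir, 'aggregated/rescaled', measure_id).push(rescaled_data, append=False).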
Example #4
def aggregate_locations(aggregation_type: str, parent_dir: str,
                        measure_id: int, gbd_round_id: int,
                        location_set_id: int, year_id: int) -> None:
    """
    Uses an AggMemEff aggregator to aggregate locations for deaths and
    YLLs.

    Arguments:
        aggregation_type (str): the type of data to be aggregated up a
            location hierarchy. One of 'aggregated/rescaled',
            'aggregated/shocks', 'aggregated/unscaled',
            'scaled', or 'unaggregated/shocks'.
        parent_dir (str): parent fauxcorrect directory
            e.g. PATH/{version}
        measure_id (int): measure ID for deaths or YLLs
        gbd_round_id (int): GBD round ID for this fauxcorrect run
        location_set_id (int): location set ID with which to aggregate
        year_id (int): draws year ID

    Raises:
        ValueError: if measure_id is not deaths (1) or YLLs (4)
    """
    # Set up DrawSource and DrawSink.
    source_dir, sink_dir = _get_draw_source_sink_dirs(parent_dir,
                                                      aggregation_type,
                                                      measure_id)
    source, draw_filters = _get_draw_source_and_filters(
        aggregation_type, source_dir, year_id, measure_id)

    sink = DrawSink({
        'draw_dir': sink_dir,
        'file_pattern': FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(
            year_id=year_id),
        'h5_tablename': Keys.DRAWS
    })
    sink.add_transform(_apply_regional_scalars, parent_dir, gbd_round_id,
                       location_set_id)
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)

    # Clean up old files we plan to overwrite.
    clean_aggregation_directory(root_dir=sink.params['draw_dir'],
                                file_pattern=sink.params['file_pattern'],
                                location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id)

    # Set up aggregator and location tree.
    index_cols = [col for col in Columns.INDEX if col != Columns.LOCATION_ID]
    operator = Sum(index_cols, Columns.DRAWS)

    agg = AggMemEff(source,
                    sink,
                    index_cols,
                    Columns.LOCATION_ID,
                    operator,
                    chunksize=2)
    is_sdi_set = location_set_id == LocationSetId.SDI
    trees = loctree(location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id,
                    return_many=is_sdi_set)

    logging.info(f"Aggregating locations, location_set_id: {location_set_id}")
    for tree in np.atleast_1d(trees):
        agg.run(tree, draw_filters=draw_filters, n_processes=10)
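
Note that add_transform here forwards extra positional arguments (parent_dir, gbd_round_id, location_set_id) to the transform after the DataFrame itself. The real _apply_regional_scalars is not shown in this excerpt; a hedged sketch of a regional-scalar transform in that shape, assuming a scalars table keyed by location_id and year_id with a scalar column:

import pandas as pd

def apply_scalars_sketch(df: pd.DataFrame,
                         scalars: pd.DataFrame) -> pd.DataFrame:
    # Merge per-location/year scalars, defaulting to 1.0 where a location
    # has no scalar, then multiply every draw column through.
    merged = df.merge(scalars, on=['location_id', 'year_id'], how='left')
    merged['scalar'] = merged['scalar'].fillna(1.0)
    draw_cols = [c for c in df.columns if c.startswith('draw_')]
    merged[draw_cols] = merged[draw_cols].mul(merged['scalar'], axis=0)
    return merged.drop(columns='scalar')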
Example #5
        draw_source = DrawSource(source_config)

        output_pattern = '{measure_id}_{location_id}_{year_id}.h5'
        sink_config = {
            'draw_dir': draw_dir,
            'file_pattern': output_pattern,
            'h5_tablename': 'draws'
        }
        draw_sink = DrawSink(sink_config)

        # Apply regional scalar transform
        region_locs = get_location_metadata(gbd_round_id=GBD.GBD_ROUND_ID,
                                            location_set_id=35)
        region_locs = region_locs[region_locs.level == 2].location_id.tolist()
        draw_sink.add_transform(apply_regional_scalars,
                                region_locs=region_locs,
                                parent_dir=parent_dir)
        draw_sink.add_transform(transform_add_measure, measure_id=measure_id)

        # Create operator
        logging.info("Reading regional scalars from flatfiles.")
        index_cols = [col for col in index_cols if col != 'location_id']
        operator = Sum(index_cols, draw_cols)

        # Aggregate
        logging.info("Instantiate aggregator.aggregators.AggMemEff.")
        aggregator = AggMemEff(draw_source=draw_source,
                               draw_sink=draw_sink,
                               index_cols=index_cols,
                               aggregate_col='location_id',
                               operator=operator)  # excerpt ends here in the original; any further arguments are not shown
Example #6
def gen_draw_sink(self):
    sink = DrawSink(self._mem_io_params, mem_write_func)
    sink.add_transform(sort_index_columns, self.dimensions.index_names)
    return sink
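
sort_index_columns is registered with the dimension index names as an extra argument; presumably it just fixes row order before the in-memory write. A minimal sketch of the assumed behavior:

import pandas as pd
from typing import List

def sort_index_columns(df: pd.DataFrame,
                       index_names: List[str]) -> pd.DataFrame:
    # Assumed behavior: deterministic row order by the index columns.
    return df.sort_values(index_names).reset_index(drop=True)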
Example #7
def location_aggregate_birth_counts(gbd_round_id: int, decomp_step: str,
                                    constants_path: pathlib.PosixPath,
                                    location_set_id: int) -> List[str]:
    """
    For the given gbd_round_id, decomp_step, and location_set_id, build a
    complete set of location-aggregated live births and return the list
    of output filenames written.
    """

    logger.info(f'aggregating for location_set_id {location_set_id}')
    multiple_tree_flag = (location_set_id
                          in mmr_constants.MULTIPLE_ROOT_LOCATION_SET_IDS)

    scalars = get_regional_scalars(gbd_round_id, decomp_step)
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']

    cov_estimate_filename = (
        mmr_constants.COV_ESTIMATES_FORMAT_FILENAME.format(location_set_id))

    region_locs, most_detailed_locs = get_location_level_sets(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        location_set_id=location_set_id)

    save_birth_count_estimates(gbd_round_id=gbd_round_id,
                               decomp_step=decomp_step,
                               cov_estimate_filepath=constants_path /
                               cov_estimate_filename,
                               location_set_id=location_set_id,
                               most_detailed_locs=most_detailed_locs)

    loc_trees = dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id,
                                decomp_step=decomp_step,
                                return_many=multiple_tree_flag)
    if not multiple_tree_flag:
        loc_trees = [loc_trees]

    draw_source = DrawSource(params={
        'draw_dir': str(constants_path),
        'file_pattern': cov_estimate_filename
    })

    output_filenames = []
    for i, loc_tree in enumerate(loc_trees, start=1):
        output_filename = f'{location_set_id}_{i}.h5'
        draw_sink = DrawSink(params={
            'draw_dir': str(constants_path),
            'file_pattern': output_filename
        })
        draw_sink.add_transform(
            _apply_regional_scalars,
            regional_scalars_df=scalars.query('location_id in @region_locs'),
            gbd_round_id=gbd_round_id,
            decomp_step=decomp_step)

        op = Sum(index_cols=[s for s in index_cols if s != 'location_id'],
                 value_cols=[mmr_constants.Columns.LIVE_BIRTH_VALUE_COL])

        AggSynchronous(
            draw_source=draw_source,
            draw_sink=draw_sink,
            index_cols=[s for s in index_cols if s != 'location_id'],
            aggregate_col='location_id',
            operator=op).run(loc_tree, include_leaves=True)

        output_filenames.append(output_filename)

    return output_filenames
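
A hedged usage sketch for Example #7; the round, decomp step, path, and location set below are placeholders:

import pathlib

output_files = location_aggregate_birth_counts(
    gbd_round_id=6,
    decomp_step='step4',
    constants_path=pathlib.Path('/path/to/constants'),
    location_set_id=35)
# One '{location_set_id}_{i}.h5' file per location tree is written
# alongside the inputs under constants_path.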