Пример #1
0
# do all the prep work
dep_map = pd.read_csv("FILEPATH" % dep_map_type,
                      header=0).dropna(axis='columns', how='all')
step_df = dep_map.ix[dep_map.step == 2].reset_index()

columns = maternal_fns.filter_cols()
columns.remove('measure_id')
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("maternal_custom.02_adjust_parent")
logger.info("Correcting underreporting for late maternal deaths")

# get list of locations
locs = maternal_fns.get_locations()

# get list of location/years that don't need correction from USER
logger.info('Pulling in adjustment csv from USER.')
adjust_df = pd.read_csv('%s/FILEPATH.csv' % (os.getcwd()))
adjust_df = adjust_df[['location_id', 'year_to_start']]

# get original codem envelope
logger.info("Pulling in codem envelope")
env = get_draws(gbd_id=366,
                gbd_id_type='cause_id',
                source='codem',
                version_id=int(env_model_vers),
                location_id=locs,
                age_group_id=list(range(7, 16)))
env = env[columns + ['envelope']]
Пример #2
0
def main(year):
    year = int(year)
    current_date = maternal_fns.get_time()
    # shorten to just today
    current_date = current_date[0:10]
    out_dir = 'FILEPATH' % current_date
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    diag_out_dir = '%s/diagnostics' % out_dir
    if not os.path.exists(diag_out_dir):
        os.makedirs(diag_out_dir)

    ############
    # Prep Work
    ############
    locations = maternal_fns.get_locations()
    columns = maternal_fns.filter_cols()
    rr_cols = []
    for i in range(0, 1000):
        rr_cols.append('rr_%d' % i)

    all_diag = pd.DataFrame()

    ############
    # Load data
    ############
    # load relative risks
    rr = pd.read_stata('FILEPATH')
    rr.drop('dummy', axis=1, inplace=True)
    rr = rr.reindex(range(0, 9))
    # propagate the relative risks down to all ages of interest
    rr['age_group_id'] = range(7, 16)
    rr.fillna(method='ffill', axis=0, inplace=True)
    rr.set_index('age_group_id', inplace=True)
    cols = rr.columns

    # rename all draw columns to be draw_0 to draw_999
    for idx, i in enumerate(cols):
        rr.rename(columns={'%s' % i: 'draw_%s' % idx}, inplace=True)

    print("pulling draws")
    all_pafs = pd.DataFrame()
    for geo in locations:
        hiv_prev = get_draws(gbd_id_type='modelable_entity_id',
                             gbd_id=9313,
                             source='epi',
                             sex_id=2,
                             location_id=geo,
                             year_id=year,
                             status='best',
                             age_group_id=range(7, 16),
                             gbd_round_id=6,
                             decomp_step='step2')

        hiv_prev = hiv_prev[columns]

        # we only want maternal age groups for hiv_prev
        hiv_prev.set_index('age_group_id', inplace=True)

        # replace all nulls with zeroes
        hiv_prev.fillna(0, inplace=True)

        # assert that HIV prevalence in pregnancy is between 0 and 1
        if (hiv_prev.values.any() < 0):
            raise ValueError(
                'HIV Prev values are below 0 for geo %s and year %s' %
                (geo, year))
            continue
        elif (hiv_prev.values.any() > 1):
            raise ValueError(
                'HIV Prev values are above 1 for geo %s and year %s' %
                (geo, year))
            continue

        ############
        # Math
        ############

        # multiply hiv prev for every loc/year/age by every rr draw using this
        pafs = (hiv_prev * (rr - 1)) / ((hiv_prev * (rr - 1)) + 1)

        # assert that all pafs are bounded between 0 and 1
        if (pafs.values.any() < 0):
            raise ValueError('PAF values are below 0 for geo %s and year %s' %
                             (geo, year))
            continue
        elif (pafs.values.any() > 1):
            raise ValueError('PAF values are above 1 for geo %s and year %s' %
                             (geo, year))
            continue

        pafs['location_id'] = geo
        pafs['year'] = year
        all_pafs = all_pafs.append(pafs)

        ############
        # Diagnostics
        ############
        # create paf_mean and find countries where paf is high to output seperately
        diagnostics = pafs.copy()
        diagnostics['paf_mean'] = diagnostics.mean(axis=1).to_frame()
        if diagnostics.paf_mean.any() >= .01:
            diagnostics['add'] = 0
        else:
            diagnostics['add'] = 1
        diagnostics['location_id'] = geo

        diagnostics.reset_index(inplace=True)
        all_diag = all_diag.append(diagnostics)

    ############
    # Save
    ############
    # output pafs as hdf
    print("saving")
    all_pafs.fillna(0, inplace=True)
    all_pafs.reset_index(inplace=True)
    all_pafs.convert_objects()
    all_pafs.to_hdf('%s/pafs_%s.h5' % (out_dir, year),
                    'pafs',
                    mode='w',
                    format='table',
                    data_columns=['location_id', 'age_group_id', 'year'])

    all_diag.to_csv('%s/diagnostics_%s.csv' % (diag_out_dir, year),
                    index=False,
                    encoding='utf-8')