# --- Exemplo n.º 1 (score: 0) — scraped-example separator, not code ---
def get_envelope_draws(meid, loc, gbd_round_id=4):
    """Pull dismod draws for a modelable entity and turn them into
    envelope scalars (each draw divided by the row's mean draw).

    Args:
        meid: modelable_entity_id to pull.
        loc: location_id(s) passed through to get_draws.
        gbd_round_id: GBD round to pull from. Defaults to 4, matching the
            other get_draws calls in this file; the original code had an
            unfilled ``{GBD ROUND ID}`` template placeholder here, which
            was a syntax error.

    Returns:
        DataFrame with env_0..env_999 columns holding draw/mean ratios;
        NaNs (rows whose mean draw is 0) are filled with 1.
    """
    # NOTE(review): yids, sids and agids are module-level globals — confirm
    # they are defined before this function is called.
    df = get_draws('modelable_entity_id',
                   meid,
                   'dismod',
                   location_ids=loc,
                   year_ids=yids,
                   sex_ids=sids,
                   age_group_ids=agids,
                   gbd_round_id=gbd_round_id)
    df = df.drop(['modelable_entity_id', 'model_version_id', 'measure_id'],
                 axis=1)
    # rename draw_* -> env_* so these can later coexist with other draws
    renames = {'draw_%s' % d: 'env_%s' % d for d in range(1000)}
    df.rename(columns=renames, inplace=True)
    draws = [col for col in list(df) if col.startswith('env')]
    # scale every draw by the row mean; a zero mean yields NaN -> filled 1
    df['mean'] = df[draws].mean(axis=1)
    for col in draws:
        df[col] = df[col] / df['mean']
    df.fillna(value=1, inplace=True)
    df = df.drop(['mean'], axis=1)
    return df
# --- Exemplo n.º 2 (score: 0) — scraped-example separator, not code ---
locations = stroke_fns.get_locations()
sexes = [1, 2]
# expected iteration count: locations x sexes x (presumably 2 MEs —
# confirm len(modelable_entity_ids) == 2)
loops = (len(locations) * 2) * 2

all_df = pd.DataFrame()
#all_df_list = []

# NOTE(review): count is printed below but never incremented in this
# visible fragment
count = 0
for me, mv in izip(modelable_entity_ids, models):
    for geo in locations:
        for sex in sexes:
            print 'On loop %s of %s' % (count, loops)
            # epi draws for this ME/location/sex (GBD round 4)
            draws = get_draws('modelable_entity_id',
                              me,
                              'epi',
                              location_ids=geo,
                              year_ids=year,
                              sex_ids=sex,
                              gbd_round_id=4)
            d_ages = draws.age_group_id.unique()
            # normalize the terminal age group: recode 235 to 33 (both
            # presumably represent the oldest/95+ group — confirm) so
            # draws from different model vintages line up
            if 235 in d_ages:
                draws = draws[draws.age_group_id.isin(ages2)]
                draws[['age_group_id']] = draws[['age_group_id'
                                                 ]].replace(to_replace=235,
                                                            value=33)
            elif 33 in d_ages:
                draws = draws[draws.age_group_id.isin(ages1)]

            # pull out incidence (measure 6) and EMR (measure 9) into
            # separate dfs
            incidence = draws[draws.measure_id == 6]
            emr = draws[draws.measure_id == 9]
# --- Exemplo n.º 3 (score: 0) — scraped-example separator, not code ---
def run_yld_compile(yld_tmp, yld_dir, yld_version, root_dir, location, ar,
                    n_draws):
    """Compile COMO all-cause YLD draws for one location.

    Pulls YLD rates (cause_id 294, measure 3), converts to counts with
    population, aggregates detailed under-1 ages into age_group_id 28 and
    both sexes into sex_id 3, converts back to rates, reshapes to a long
    draw format, writes a per-location draw CSV, and calls calc_summary.

    Args:
        yld_tmp: directory for the per-location draw CSV.
        yld_dir: directory passed through to calc_summary.
        yld_version: COMO version to pull; 0 means use the default version.
        root_dir: root directory containing PATH/pop.csv.
        location: location_id to process.
        ar: if truthy pull every year 1990-2016, else quinquennial years.
        n_draws: number of draws to pull/resample to.
    """
    if ar:
        years = range(1990, 2017)
    else:
        years = [1990, 1995, 2000, 2005, 2010, 2016]
    pops = pd.read_csv('{root_dir}/PATH/pop.csv'.format(root_dir=root_dir))
    index_cols = ['location_id', 'age_group_id', 'sex_id', 'year_id']

    # version 0 -> omit `version` and let get_draws pick the default
    if yld_version == 0:
        yld_draws = get_draws("cause_id",
                              source="como",
                              gbd_id=294,
                              measure_ids=3,
                              location_ids=location,
                              year_ids=years,
                              sex_ids=[1, 2],
                              n_draws=n_draws,
                              resample=True)
    else:
        yld_draws = get_draws("cause_id",
                              source="como",
                              gbd_id=294,
                              measure_ids=3,
                              location_ids=location,
                              year_ids=years,
                              sex_ids=[1, 2],
                              version=yld_version,
                              n_draws=n_draws,
                              resample=True)
    draw_cols = [col for col in yld_draws.columns if 'draw' in col]
    yld_draws = yld_draws[index_cols + draw_cols]

    # rate -> count so the age/sex aggregations below are plain sums
    yld_draws = yld_draws.merge(pops, on=index_cols)
    yld_draws = yld_draws.set_index(index_cols)
    yld_draws[draw_cols] = yld_draws[draw_cols].multiply(
        yld_draws['population'], axis='index')
    yld_draws.drop('population', axis=1, inplace=True)
    yld_draws = yld_draws.reset_index()

    # collapse the detailed under-1 ages (2-4) and birth (164) into 28;
    # Python 2: range() returns a list, so + concatenates
    yld_draws.loc[yld_draws['age_group_id'].isin(range(2, 5) + [164]),
                  'age_group_id'] = 28
    ages = range(5, 21) + range(30, 33) + [28, 235]
    yld_draws = yld_draws.loc[yld_draws['age_group_id'].isin(ages)]
    yld_draws = yld_draws.groupby(index_cols).sum().reset_index()
    # add a both-sexes (sex_id 3) aggregate alongside the sex-specific rows
    sex_agg = yld_draws.groupby(['age_group_id', 'location_id',
                                 'year_id']).sum().reset_index()
    sex_agg['sex_id'] = 3
    yld_draws = yld_draws.append(sex_agg)

    # count -> rate (assumes pops also carries sex_id 3 rows — TODO confirm)
    yld_draws = yld_draws.merge(pops, on=index_cols)
    yld_draws = yld_draws.set_index(index_cols)
    yld_draws[draw_cols] = yld_draws[draw_cols].divide(yld_draws['population'],
                                                       axis='index')
    yld_draws.drop('population', axis=1, inplace=True)
    yld_draws = yld_draws.reset_index()

    # reshape the wide draw_* columns to long (draw, yld_rate)
    yld_draws.rename(columns=(lambda x: x.replace('draw_', '')
                              if x in draw_cols else x),
                     inplace=True)
    new_draws = [col for col in yld_draws.columns if col.isdigit()]
    yld_draws = pd.melt(yld_draws,
                        id_vars=index_cols,
                        value_vars=new_draws,
                        var_name='draw',
                        value_name='yld_rate')
    csv_draws = yld_draws.set_index('location_id')
    csv_draws.to_csv('{yld_tmp}/{location}_draws.csv'.format(
        yld_tmp=yld_tmp, location=location))

    summ_cols = ['yld_rate']
    calc_summary(yld_draws, summ_cols, yld_dir, location)
def process_location_cc_draws(location_id, test=False):
    """Pull mortality numbers, limiting to desired ages by cause

    Gets all years >1990 and ages for the location id as mortality numbers
    from get_draws (codcorrect), sanity-checks the result, age-standardizes
    it, and writes the per-location output.
    """
    dfs = []
    # (cause list, age_group_ids to pull) pairs; Python 2 range() returns
    # a list so + concatenates
    cause_age_sets = [[
        dw.CC_ALL_AGE_CAUSE_IDS,
        range(2, 21) + range(30, 33) + [235]
    ], [dw.CC_THIRTY_SEVENTY_CAUSE_IDS,
        range(11, 19)], [dw.PRE_1990_CAUSES, [22]]]
    if test:
        years = [2016]
    else:
        # empty list presumably means "all years" to get_draws — confirm
        years = []
    for causes, ages in cause_age_sets:
        gbd_ids = {'cause_ids': causes}  # NOTE(review): never used below
        df = get_draws(gbd_id_field=['cause_id'] * len(causes),
                       gbd_id=causes,
                       source='codcorrect',
                       version=dw.CC_VERS,
                       location_ids=[location_id],
                       year_ids=years,
                       age_group_ids=ages,
                       sex_ids=[3],
                       measure_ids=1)
        dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)

    # keep relevant years (pre-1990 causes keep back to 1980)
    # NOTE(review): .ix is deprecated in pandas; .loc is the modern spelling
    df = df.ix[(df['year_id'] >= 1990) |
               ((df['cause_id'].isin(dw.PRE_1990_CAUSES)) &
                (df['year_id'] >= 1980))]

    # make sure index variables are ints
    for idvar in dw.CC_GROUP_COLS:
        df[idvar] = df[idvar].astype(int)

    # make sure it looks like we expect
    assert set(df.ix[df['cause_id'].isin(dw.PRE_1990_CAUSES)].age_group_id) == set([22]), \
        'unexpected age group ids found'
    assert set(df.ix[~df['cause_id'].isin(dw.PRE_1990_CAUSES)].age_group_id) == \
        set(range(2, 21) + range(30, 33) + [235]), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([3]), \
        'unexpected sex ids found'
    if not test:
        assert set(df.ix[df['cause_id'].isin(dw.PRE_1990_CAUSES)].year_id) == \
            set(range(1980, 2017)), \
            'unexpected year ids found'
        assert set(df.ix[
            ~df['cause_id'].isin(dw.PRE_1990_CAUSES)
        ].year_id) == \
            set(range(1990, 2017)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'

    # age standardize
    df = age_standardize(df, 'codcorrect')

    # write the output
    df = df[dw.CC_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'codcorrect', location_id)

    return df
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull incidence (or prevalence) rates and aggregate them to
    age-standardized, both-sex rates.

    Gets all years, ages, and sexes for the location id as rates from
    get_draws/interpolate, converts to cases via population, collapses
    sexes, converts back to rates, and age-standardizes (age_group_id 27).

    Args:
        location_id: location to process.
        measure_id: 6 for incidence, 5 for prevalence.
        test: if True pull only 2016 via get_draws instead of
            interpolating 1990-2016.

    Raises:
        ValueError: for any other measure_id.
    """
    db_pops = qry.get_pops()
    if measure_id == 6:
        causes = dw.COMO_INC_CAUSE_IDS
    elif measure_id == 5:
        causes = dw.COMO_PREV_CAUSE_IDS
    else:
        raise ValueError("bad measure_id: {}".format(measure_id))

    dfs = []
    if test:
        years = [2016]
    else:
        # empty list presumably means "all years" — only used in test mode
        years = []
    for cause_id in causes:
        print("pulling {c}".format(c=cause_id))
        if test:
            df = get_draws(gbd_id_field='cause_id',
                           gbd_id=cause_id,
                           source='como',
                           version=dw.COMO_VERS,
                           location_ids=[location_id],
                           year_ids=years,
                           age_group_ids=[],
                           sex_ids=[1, 2],
                           measure_ids=[measure_id])
        else:
            # interpolate fills in non-estimation years between 1990-2016
            df = interpolate(gbd_id_field='cause_id',
                             gbd_id=cause_id,
                             source='como',
                             version=dw.COMO_VERS,
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[1, 2],
                             measure_ids=[measure_id])

        # these pull in as rates
        df['metric_id'] = 3

        # make sure it looks like we expect
        # (Python 2: range() returns a list, so + concatenates)
        assert set(df.age_group_id) == set(range(2, 21) + range(30, 33) + [235]), \
            'unexpected age group ids found'
        assert set(df.sex_id) == set([1, 2]), \
            'unexpected sex ids found'
        if not test:
            assert set(df.year_id) == set(range(1990, 2017)), \
                'unexpected year ids found'
        assert set(df.location_id) == set([location_id]), \
            'unexpected location ids found'

        # compile
        dfs.append(df[dw.COMO_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)

    # merge with pops to transform to cases
    df = df.merge(db_pops, how='left')
    assert df.population.notnull().values.all(
    ), 'merge with populations failed'

    # concatenate the metadata with the draw cols times the pop
    # this multiplies each draw column by the population column
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)

    # now its numbers (this line is for readability)
    df['metric_id'] = 1

    # aggregate sexes
    df['sex_id'] = 3

    # collapse sexes together (population is summed too, so the divide
    # below uses the both-sex population)
    df = df.groupby(dw.COMO_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    wgts = custom_age_weights(0, 125)
    df = df.merge(wgts, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    # weight each age-specific rate, then sum to the standard (age 27)
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()

    df = df[dw.COMO_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'como', location_id)
    return df
def process_location_risk_exposure_draws(location_id, test=False):
    """Return yearly age standardized estimates of each rei_id.

    Pulls exposure draws for each risk, collapses sexes (via population
    weighting), and age-standardizes to age_group_id 27.

    Arguments:
        location_id: the location_id to process
        test: if True pull only 2016 via get_draws instead of
            interpolating 1990-2016.

    Returns:
        pandas dataframe like so:
        [ID_COLS] : [dw.DRAW_COLS]
    """
    dfs = []

    # version_df = pd.DataFrame()
    risks = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    if test:
        years = [2016]
    else:
        # empty list presumably means "all years" — only used in test mode
        years = []
    for rei_id in risks:
        print("pulling {r}".format(r=rei_id))
        # rei 166 (smoking, per the age filter below) is always pulled
        # directly; the rest are interpolated over 1990-2016
        if test or rei_id == 166:
            df = get_draws(gbd_id_field='rei_id',
                           gbd_id=rei_id,
                           source='risk',
                           location_ids=[location_id],
                           year_ids=years,
                           age_group_ids=[],
                           sex_ids=[],
                           draw_type='exposure')
        elif not test and rei_id == 86:
            # rei 86 (PM 2.5, per the all-ages override below) needs an
            # explicit measure_id
            df = interpolate(gbd_id_field='rei_id',
                             gbd_id=rei_id,
                             source='risk',
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[],
                             measure_ids=19,
                             draw_type='exposure')
        else:
            df = interpolate(gbd_id_field='rei_id',
                             gbd_id=rei_id,
                             source='risk',
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[],
                             draw_type='exposure')

        # remove any other ages besides main gbd ages
        df = df.query(
            '(age_group_id >= 2 & age_group_id <= 20) or age_group_id in [30, 31, 32, 235] and sex_id in [1, 2]'
        )
        df = df.query('year_id >= 1990')

        if rei_id == 166:
            # only keep 10+ for smoking
            df = df.query('age_group_id >= 7')
            df = df.query('parameter=="cat1"')

        # set the rei_id because it isnt in the get_draws pull
        df['rei_id'] = rei_id

        # these are prevalence rates
        df['metric_id'] = 3
        if rei_id == 86:
            df['measure_id'] = 19
        else:
            df['measure_id'] = 5

        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)

    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(qry.get_pops(), how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing; rei 167 stays
    # female-only (sex_id 2) — presumably a female-specific risk, confirm
    df['sex_id'] = df.rei_id.apply(lambda x: 2 if x == 167 else 3)

    # for stunting and wasting (where we only have under-5), keep only under-5 and aggregate ages
    # NOTE(review): .ix is deprecated in pandas; .loc is the modern spelling
    df.ix[df['rei_id'].isin(dw.RISK_EXPOSURE_REI_IDS_MALN), 'age_group_id'] = 1

    # make all ages for PM 2.5
    df.ix[df['rei_id'] == 86, 'age_group_id'] = 22

    # rate -> count so the sex collapse is a plain sum
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population'])
    ],
                   axis=1)
    # so unnecessary programmatically but good for documentation -
    #  these are now prev cases
    df['metric_id'] = 1
    # now that its in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS, as_index=False).sum()

    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(qry.get_pops(), how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    wgts = custom_age_weights(10, 125)  # FOR SMOKING ONLY
    df = df.merge(wgts, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()

    df = df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'risk_exposure', location_id)
    return df
def process_location_risk_burden_draws(location_id, test=False):
    ''' Given a list of rei_ids, use gopher to get attributable burden draws
    and save to out directory.

    Pulls burdenator draws attributable to each risk (deaths or DALYs
    depending on which list the rei_id is in), collapses sexes, converts
    to rates, interpolates intermediate years, and age-standardizes.

    NOTE(review): the `test` parameter is accepted but never used in this
    function body.
    '''

    dfs = []
    for rei_id in dw.RISK_BURDEN_REI_IDS + dw.RISK_BURDEN_DALY_REI_IDS:
        print(rei_id)
        # measure 1 = deaths, measure 2 = DALYs
        if rei_id in dw.RISK_BURDEN_REI_IDS:
            measure_id = 1
        elif rei_id in dw.RISK_BURDEN_DALY_REI_IDS:
            measure_id = 2
        else:
            raise ValueError("no measure found")
        print('Getting draws')
        df = get_draws(gbd_id_field=['cause_id', 'rei_id'],
                       gbd_id=[294, rei_id],
                       source='burdenator',
                       version=dw.BURDENATOR_VERS,
                       location_ids=location_id,
                       year_ids=[],
                       age_group_ids=[],
                       sex_ids=[],
                       num_workers=3,
                       n_draws=1000,
                       resample=True)

        # keep the measure/metric/ages/years we want
        # (Python 2: range() returns a list, so + concatenates)
        df = df.query('measure_id == {}'.format(measure_id))
        df = df.query('metric_id == 1')
        df = df.query('age_group_id in {} and sex_id in [1, 2]'.format(
            range(2, 21) + range(30, 33) + [235]))
        df = df.query('year_id in {}'.format(range(1990, 2011, 5) + [2016]))

        # aggregate to both sexes (draws are counts here, so a sum works)
        df['sex_id'] = 3
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        pops = qry.get_pops(both_sexes=True)
        df = df.merge(pops,
                      how='left',
                      on=['location_id', 'age_group_id', 'sex_id', 'year_id'])
        # count -> rate
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
        ],
                       axis=1)
        df['metric_id'] = 3

        # keep the right columns
        df = df[dw.RISK_BURDEN_GROUP_COLS + dw.DRAW_COLS]

        # interpolate years
        print('Interpolating')
        df = custom_interpolate(df)

        # age-standardize: weight each age-specific rate, then sum to 27
        age_weights = qry.get_age_weights(4)
        df = df.merge(age_weights)
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
        ],
                       axis=1)
        df['age_group_id'] = 27
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        dfs.append(df)

    df = pd.concat(dfs)
    write_output(df, 'risk_burden', location_id)
    return df
# --- Exemplo n.º 8 (score: 0) — scraped-example separator, not code ---
def draw_deaths(cause_id, location_id, out_path, send_Slack, slack, channel):
	"""
	draw_deaths: For a given location and Cause grab death estimates at the
	draw-level. Collapse it mean, upper, and lower.
	"""
	
	# year IDs
	YEAR_IDS = range(1990, 2017)
		
	# All ages that could be used
	ALL_AGES = range(2, 22) + [28] + range(30, 33) + [235]

	# Early and late ages of DisMod
	DISMOD_AGES = [2, 3, 4, 30, 31, 32, 235]

	# DisMod old ages
	OLD_DISMOD_AGES = [30, 31, 32, 235]

	# DisMod young ages
	YOUNG_DISMOD_AGES = [2, 3, 4]

	# Ages
	AGES = range(2, 5) + range(30, 34)

	# old ages
	OLD_AGES = range(30, 34)

	# young ages
	YOUNG_AGES = range(2, 5)

	# necessary columns
	group_cols = ['cause_id', 'sex_id', 'age_group_id', 'location_id']
	measure_cols = ['mean_death', 'upper_death', 'lower_death']

	# Draw columns
	draw_cols = (['draw_%s' % i for i in range(0, 1000)])
	hf_death_cols = (['hf_deaths_%s' % i for i in range(0, 1000)])

	# split <1 age group into early neonatal, late neonatal, and post neonatal
	young_age_groups = pd.DataFrame({'age_group_id':YOUNG_DISMOD_AGES,
									 'temp_id':1})

	# split 80+ age group into early neonatal, late neonatal, and post neonatal
	old_age_groups = pd.DataFrame({'age_group_id':OLD_DISMOD_AGES, 
								   'temp_id':1})

	# Query draws for the cause/location combo
	print cause_id, location_id
	try:
		DF = get_draws('cause_id', 
					   cause_id,
					   'codcorrect',
					   location_ids=location_id,
					   age_group_ids=ALL_AGES,
					   gbd_round_id=4,
					   #status="best",
					   status="latest",
					   #version_id=64,
					   sexes=[1,2],
					   location_set_id=35,
					   measure_ids=1,
					   numworkers=5)
		print DF['output_version_id'].unique(), " ", DF.shape
		
		DF = DF.query('year_id in {}'.format(YEAR_IDS))
		DF = DF.query('output_version_id == {}'.format(64))
		DF.fillna(0, inplace=True)
	except:
		if send_Slack == "YES":
			message = ("get_draws query FAILED for location_id={location_id} "
			           "and cause_id={cause_id}").format(\
					                                    location_id=location_id,
														cause_id=cause_id)
			slack.chat.post_message(channel, message)
		print message
			
	if send_Slack == "YES" and not len(DF):
		message = "Missing data for get_draws for location_id={location_id}" +\
				  " and cause_id={cause_id}".format(location_id=location_id,
													cause_id=cause_id)
		slack.chat.post_message(channel, message)

	# Compute 25 and 75 percentiles of the distribution for each row					
	stats = DF[draw_cols].transpose().describe(
			percentiles=[.25, .75]).transpose()[['mean', '25%', '75%']]		
	stats.rename(
			columns={'mean':'mean_death',
			         '25%': 'lower_death',
					 '75%': 'upper_death'}, inplace=True)

	# add these percentiles to the dataset		
	DF['mean_death'] = stats['mean_death']
	DF['lower_death'] = stats['lower_death']
	DF['upper_death'] = stats['upper_death']

	# To make this script robust filter by what ever the age groups are 
	# available if age group ID 235 is available go with that otherwise find 
	# another way to get/make an 80+ age group
	if pd.Series(DISMOD_AGES).isin(DF.age_group_id.unique()).all():
		ages = range(2,21) + [30, 31, 32, 235]
		DF = DF.query('age_group_id in {}'.format(ages))
		print "DISMOD_AGES"
		
	elif pd.Series(YOUNG_DISMOD_AGES).isin(DF.age_group_id.unique()).all() and \
		 not pd.Series(OLD_DISMOD_AGES).isin(DF.age_group_id.unique()).all():
		ages = range(2, 21) + range(30, 34)
		DF = DF.query('age_group_id in {}'.format(ages))
		DF['age_group_id'].replace(to_replace=33, value=235, inplace=True)
		print "YOUNG_DISMOD_AGES"
		
	elif pd.Series(OLD_DISMOD_AGES).isin(DF.age_group_id.unique()).all() and \
		not pd.Series(YOUNG_DISMOD_AGES).isin(DF.age_group_id.unique()).all():
		ages = range(5, 21) + [28] + [30, 31, 32, 235]
		DF = DF.query('age_group_id in {}'.format(ages))
		print "OLD_DISMOD_AGES"
		
	elif pd.Series(AGES).isin(DF.age_group_id.unique()).all():
		ages = range(2, 21) + range(30, 34)
		DF = DF.query('age_group_id in {}'.format(ages))
		print "AGES"
		
	elif pd.Series(OLD_AGES).isin(DF.age_group_id.unique()).all() and \
		not pd.Series(YOUNG_AGES).isin(DF.age_group_id.unique()).all():
		ages = range(5, 21) + [28] + range(30, 34)
		DF = DF.query('age_group_id in {}'.format(ages))
		print "OLD_AGES"

	elif pd.Series(YOUNG_AGES).isin(DF.age_group_id.unique()).all() and \
		not pd.Series(OLD_AGES).isin(DF.age_group_id.unique()).all():
		ages = range(2, 22)
		DF = DF.query('age_group_id in {}'.format(ages))
		print "YOUNG_AGES"

	else:
		ages = range(5, 22) + [28]
		DF = DF.query('age_group_id in {}'.format(ages))
		"A lot is missing"

	# split out neotals age groups if needed
	if pd.Series([28]).isin(DF.age_group_id.unique()).all():
		temp = DF.query('age_group_id == 28').copy()
		DF = DF.query('age_group_id != 28')
		
		# Make a temporary id to merge with age groups DataFrame
		temp.drop('age_group_id', axis=1, inplace=True)
		temp['temp_id'] = 1
		temp = temp.merge(young_age_groups, on='temp_id', how='inner')
		temp.drop('temp_id', axis=1, inplace=True)

		# Append the new df w/ age groups to the original
		# (excluding the <1 age composite).
		DF = DF.append(temp)

	# split out 80+ age groups if needed
	if pd.Series([21]).isin(DF.age_group_id.unique()).all():
		temp = DF.query('age_group_id == 21').copy()
		DF = DF.query('age_group_id != 21')
		
		# Make a temporary id to merge with age groups DataFrame
		temp.drop('age_group_id', axis=1, inplace=True)
		temp['temp_id'] = 1
		temp = temp.merge(old_age_groups, on='temp_id', how='inner')
		temp.drop('temp_id', axis=1, inplace=True)
		
		# Append the new df w/ age groups to the original
		# (excluding the 80+ age composite).
		DF = DF.append(temp)

	# The columns that are added up
	DF = DF.groupby(group_cols)[measure_cols].sum().reset_index()
	
	# fill in any missing data
	DF = make_square_matrix(DF)
	
	# Save it to a CSV on the cluster to be read back into the main script.					
	DF.to_csv('{out_path}/codcorrect_{cause}_{location}.csv'.format(\
			  out_path=out_path,
			  cause=cause_id,
			  location=location_id),
			  index=False, encoding='utf-8')
# --- Exemplo n.º 9 (score: 0) — scraped-example separator, not code ---
all_cols = keep_cols + index_cols
# GBD age groups (Python 2 list concatenation); ages1 uses terminal age
# id 33, ages2 uses 235 — presumably two vintages of the same 95+ group
ages1 = range(2,21) + [30,31,32,33]
ages2 = range(2,21) + [30,31,32,235]
# ages1 = [22,27]
# ages2 = [22,27]

all_acute_isch_list = []
all_acute_hem_list = []
all_chronic_isch_list = []
all_chronic_hem_list = []

count = 0
location_count = len(locations)
for geo in locations:
	# get acute isch
	csmr_isch = get_draws('modelable_entity_id', isch_me, 'epi', location_ids=geo,
			year_ids=year, sex_ids=[1,2], gbd_round_id=4)
	# keep CSMR rows only (measure 15)
	csmr_isch = csmr_isch[csmr_isch['measure_id']==15]
	isch_ages = csmr_isch.age_group_id.unique()
	# normalize the terminal age group: recode 235 to 33 so both vintages
	# line up
	if 235 in isch_ages:
		csmr_isch = csmr_isch[csmr_isch.age_group_id.isin(ages2)]
		csmr_isch[['age_group_id']] = csmr_isch[['age_group_id']].replace(to_replace=235,value=33)
	elif 33 in isch_ages:
		csmr_isch = csmr_isch[csmr_isch.age_group_id.isin(ages1)]
	csmr_isch = csmr_isch[all_cols]

	#get acute hem
	csmr_hem = get_draws('modelable_entity_id', hem_me, 'epi', location_ids=geo,
			year_ids=year, sex_ids=[1,2], gbd_round_id=4)
	csmr_hem = csmr_hem[csmr_hem['measure_id']==15]
	hem_ages = csmr_hem.age_group_id.unique()
	# NOTE(review): the scraped snippet is truncated here — the body of
	# this branch is missing from the source
	if 235 in hem_ages:
	def get_envelope(self):
		"""get the envelope"""
		
		print "get envelope"
		
		# columns
		draw_cols = (['draw_%s' % i for i in range(0, 1000)])
		env_cols = (['env_prev_%s' % i for i in range(0, 1000)])
		hf_cols = (['hf_prev_%s' % i for i in range(0, 1000)])
		
		# get overall prevalence of heart failure
		hf = get_draws('modelable_entity_id',
					   2412,
					   'dismod',
					   sex_ids=[1,2],
					   status="best",
					   measure_ids=5,
					   location_ids=self.location_id,
					   gbd_round_id=4,
					   num_workers=5)
		
		# drop unneeded columns
		hf = hf[self.group_cols + draw_cols]
		
		# drop unneeded age groups
		hf = hf.query('age_group_id in {}'.format((self.AGE_GROUPS_IDS)))
		
		# delete the prevalence due to Chagas
		chagas = get_draws('modelable_entity_id',
						   2413,
						   'dismod',
						   sex_ids=[1,2],
						   status="best",
						   measure_ids=5,
						   location_ids=self.location_id,
						   gbd_round_id=4,
						   num_workers=5)
		
		# drop unneeded columns
		chagas = chagas[self.group_cols + draw_cols]
		
		# drop unneeded age groups
		chagas = chagas.query('age_group_id in {}'.format((self.AGE_GROUPS_IDS)))
		
		# make sure the size of HF and Chagas is the same
		if len(hf) != len(chagas) and self.send_Slack == "YES":
				message = "Chagas and HF envelople have different number of rows for {location_id}"\
							.format(location_id=location_id)
				self.slack.chat.post_message(self.channel, message)
		
		#assert len(hf) == len(chagas), "matrices are not the same size."
		
		# rename HF prev draws
		for i in xrange(1000):
			hf.rename(columns={'draw_'+str(i):'env_prev_'+str(i)}, inplace=True)
			
		# merge the HF-chagas prev draws and the HF prev draws
		hf = hf.merge(chagas, on=self.group_cols, how='inner')
			
		# subtract prevalence of HF due to Chagas from total HF prevalence
		for i in xrange(1000):
			hf['nonchagas_prev_'+str(i)] = hf['env_prev_'+str(i)] - hf['draw_'+str(i)]

			hf.rename(columns={'nonchagas_prev_'+str(i):'hf_prev_'+str(i)}, inplace=True)
		
		# drop unneeded columns
		hf = hf[self.group_cols + hf_cols]
		
		hf[hf < 0] = 0
		
		# Make sure the Matrix is square
		assert_env_df_is_square(hf, self.send_Slack, self.slack, self.channel, self.location_id)
		
		# Make sure there aren't any duplicates
		if hf[self.group_cols].duplicated().any() and self.send_Slack == "YES":
			message = "The Chagas deleted HF envelope has duplicates for {location_id}"\
						.format(location_id=location_id)
			self.slack.chat.post_message(self.channel, message)
		#assert not hf[self.index_cols].duplicated().any(), 'duplicates introduced in custom cause generation'
		
		self.envelope = hf