def aggregate_locations_to_global(df, group_cols, denominator='population'):
    """Aggregate level-3 (national) locations to a single global estimate.

    Multiplies each draw column by the denominator to move into count
    space, sums over locations (with location_id set to 1, the global
    location), then divides by the summed denominator to return to the
    original metric.

    Args:
        df: DataFrame containing group_cols, dw.DRAW_COLS, and the
            denominator column.
        group_cols: id columns to group by; must include 'location_id'.
        denominator: name of the weight column (default 'population').

    Returns:
        DataFrame of global (location_id == 1) draws in the original metric.
    """
    assert denominator in df.columns, '{d} column not in dataframe'.format(
        d=denominator)
    assert df[denominator].notnull().values.all(
    ), 'merge with {d} failed'.format(d=denominator)
    print("aggregatings locations")
    # select only level 3 locations
    locs = qry.get_sdg_reporting_locations(level_3=True)
    df = df[df.location_id.isin(locs.location_id)]
    # move draws into count space (draw * denominator)
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x * df[denominator]),
        df[denominator]
    ], axis=1)
    # set location_id to global (1), groupby group_cols, and sum
    df['location_id'] = 1
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + [denominator]].sum()
    # return to appropriate metric (draw / denominator)
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x / df[denominator]),
        df[denominator]
    ], axis=1)
    # BUG FIX: drop the denominator column actually used, not a hard-coded
    # 'population' (which raised KeyError for any other denominator)
    df.drop(denominator, axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
def process_all_qx_files(test=True):
    """Sequentially process all qx files and return the combined dataframe.

    Args:
        test: if True, only process the first 40 qx files.

    Returns:
        DataFrame restricted to SDG reporting locations with id columns
        plus dw.DRAW_COLS.
    """
    qx_files = os.listdir(dw.QX_DIR)
    qx_files = [q for q in qx_files if 'qx_' in q]
    if test:
        qx_files = qx_files[0:40]
    dfs = []
    print('assembling qx files')
    n = len(qx_files)
    # BUG FIX: original tested `i % (n / 10) == 0`, which raises
    # ZeroDivisionError when n < 10 and is a float modulus on python 3.
    # Report progress roughly every 10% instead.
    step = max(n // 10, 1)
    for i, qx_file in enumerate(qx_files, start=1):
        df = process_qx_file(qx_file)
        dfs.append(df)
        if i % step == 0:
            print('{i}/{n} qx_files complete'.format(i=i, n=n))
    df = pd.concat(dfs, ignore_index=True)
    # keep only SDG reporting locations
    locsdf = qry.get_sdg_reporting_locations()
    df = df[df['location_id'].isin(locsdf['location_id'].values)]
    id_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id'
    ]
    df = df[id_cols + dw.DRAW_COLS]
    # sanity checks on expected demographics
    assert set(df.age_group_id) == {1, 42}, 'unexpected age group ids'
    assert set(df.year_id) == set(range(1990, 2017)), 'unexpected year ids'
    return df
def clean_compiled_indicators(df):
    """Filter compiled indicators to SDG locations/years and tidy id columns."""
    # replace disaster (1019) with a multi-year moving average
    if 1019 in df['indicator_id'].unique():
        df = multi_year_avg(df, 1019)
    # attach indicator metadata and confirm the merge succeeded
    meta = qry.get_indicator_table()
    meta_cols = ['indicator_id', 'invert', 'scale', 'indicator_stamp']
    df = df.merge(meta[meta_cols], how='left')
    assert df.invert.notnull().values.all(), 'merge with indicator meta fail'
    # restrict to sdg reporting locations
    sdg_locs = set(qry.get_sdg_reporting_locations().location_id)
    df = df.loc[df['location_id'].isin(sdg_locs)]
    # restrict to sdg reporting years (move this to global in config file)
    df = df.loc[df['year_id'].isin(range(1990, 2031))]
    # coerce every id column to integer
    for id_col in dw.INDICATOR_ID_COLS:
        df[id_col] = df[id_col].astype(int)
    return df
def compile_output(df, add_rank=False, collapse_means=True):
    """Attach indicator/location metadata and order columns for output.

    Args:
        df: indicator data with id columns (and draws or mean/upper/lower).
        add_rank: if True, merge in a per-year rank computed from the SDG
            index (indicator_id 1054).
        collapse_means: if True, collapse draw columns to mean/upper/lower.

    Returns:
        DataFrame with columns in col_order (with or without 'rank').
    """
    # collapse draws to means
    if collapse_means:
        df = collapse_to_means(df)
    # test that the data is square
    # sdg_test.df_is_square_on_indicator_location_year(df)
    # Data was square and still failing test
    # add indicator metadata
    indic_table = qry.get_indicator_table()
    indic_table = indic_table[[
        'indicator_id', 'indicator_short', 'indicator_stamp',
        'indicator_paperorder'
    ]]
    df = df.merge(indic_table, how='left')
    assert df.indicator_stamp.notnull().values.all(), \
        'merge with indic table failed'
    # add location metadata
    locs = qry.get_sdg_reporting_locations()
    locs = locs[['location_id', 'location_name', 'ihme_loc_id']]
    df = df.merge(locs, how='left')
    assert df.location_name.notnull().values.all(), \
        'merge with locations failed'
    # FIX: print statement -> print function (works on py2 and py3; the
    # rest of this file already uses the function form)
    print('Number of locations: {}'.format(len(df.location_id.unique())))
    # make sure its just reporting years
    df = df.loc[df.year_id.isin(range(1990, 2031))]
    # set column order
    col_order = [
        'indicator_id', 'location_id', 'year_id', 'indicator_short',
        'indicator_stamp', 'indicator_paperorder', 'ihme_loc_id',
        'location_name', 'rank', 'mean_val', 'upper', 'lower'
    ]
    # optionally add rank by sdg index
    if add_rank:
        # keep sdg index; FIX: .copy() avoids assigning into a .query()
        # slice (pandas SettingWithCopy — the 'rank' column could silently
        # fail to materialize)
        sdg_index = df.query('indicator_id==1054').copy()
        # rank locations within each year, highest mean_val first
        sdg_index['rank'] = sdg_index.groupby('year_id').mean_val.transform(
            lambda x: pd.Series.rank(x, method='first', ascending=False))
        # add it to the data
        df = df.merge(sdg_index[['location_id', 'year_id',
                                 'rank']].drop_duplicates(),
                      how='left')
        assert df['rank'].notnull().values.all(), 'merge failed'
        return df[col_order]
    else:
        col_order.remove('rank')
        return df[col_order]
def all_sdg_locations(df):
    """Test that all level three locations are present in df"""
    expected = set(qry.get_sdg_reporting_locations().location_id)
    missing_locs = expected - set(df.location_id)
    # raise with the full set of absentees so the caller can see what's gone
    if missing_locs:
        raise ValueError(
            "Found {n} missing locations: {l}".format(
                n=len(missing_locs), l=missing_locs
            )
        )
    return True
def run_all(indicator_type, test=False):
    """Run each cod correct location-year draw job"""
    # resolve scratch directories for the requested indicator type
    if indicator_type == 'codcorrect':
        temp_dir, delete_dir = dw.CC_TEMP_OUT_DIR, dw.CC_TEMP_OUT_DIR_DELETE
    elif indicator_type in ['como_prev', 'como_inc']:
        temp_dir, delete_dir = (dw.COMO_TEMP_OUT_DIR,
                                dw.COMO_TEMP_OUT_DIR_DELETE)
    elif indicator_type == 'risk_exposure':
        temp_dir, delete_dir = (dw.RISK_EXPOSURE_TEMP_OUT_DIR,
                                dw.RISK_EXPOSURE_TEMP_OUT_DIR_DELETE)
    elif indicator_type == 'risk_burden':
        temp_dir, delete_dir = (dw.RISK_BURDEN_TEMP_OUT_DIR,
                                dw.RISK_BURDEN_TEMP_OUT_DIR_DELETE)
    else:
        raise ValueError("{it} not supported".format(it=indicator_type))
    # determine which locations to process
    location_ids = list(
        qry.get_sdg_reporting_locations().location_id.unique())
    # small hand-picked subset for quick test runs
    if test:
        location_ids = [68, 165]
        # location_ids = [133, 6, 71, 39, 76, 173, 142, 208, 81, 116, 117, 217, 25, 93, 95]
    # rotate the temp directory: move the current one aside, purge it on a
    # background thread, and create a fresh one in its place
    print("making a new temp directory and starting a thread to delete old...")
    assert os.path.exists(temp_dir), '{} doesnt exist'.format(temp_dir)
    shutil.move(temp_dir, delete_dir)
    cleanup_thread = Thread(target=shutil.rmtree, args=(delete_dir, ))
    cleanup_thread.start()
    os.mkdir(temp_dir)
    print("processing draws...")
    # submit one draw job per location, remembering job ids to hold on
    job_ids = [
        submit_job_location_draws(indicator_type, loc_id)
        for loc_id in location_ids
    ]
    print('collecting all output')
    if not test:
        submit_job_collect(job_ids, indicator_type)
    print('waiting for temp dir deletion to finish...')
    cleanup_thread.join()
    print('... done deleting and all jobs submitted.')
def aggregate_locations_to_global(df, group_cols, denominator='population',
                                  age_standardized=False,
                                  age_group_years_start=None,
                                  age_group_years_end=None,
                                  age_group_id=None):
    """Aggregate level-3 (national) locations to a single global estimate.

    Multiplies draws by the denominator to move into count space, sums over
    locations (location_id set to 1, the global location), then divides by
    the summed denominator; optionally age-standardizes the result.

    Args:
        df: DataFrame containing group_cols, dw.DRAW_COLS, and denominator.
        group_cols: id columns to group by; must include 'location_id'.
        denominator: name of the weight column (default 'population').
        age_standardized: if truthy, apply age_standardize to the result.
        age_group_years_start / age_group_years_end / age_group_id:
            integer parameters forwarded to age_standardize; required when
            age_standardized is set.

    Returns:
        DataFrame of global (location_id == 1) draws.
    """
    assert denominator in df.columns, '{d} column not in dataframe'.format(
        d=denominator)
    assert df[denominator].notnull().values.all(
    ), 'merge with {d} failed'.format(d=denominator)
    print("aggregatings locations")
    # select only level 3 locations
    locs = qry.get_sdg_reporting_locations(level_3=True)
    df = df[df.location_id.isin(locs.location_id)]
    # move draws into count space (draw * denominator)
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x * df[denominator]),
        df[denominator]
    ], axis=1)
    # set location_id to global (1), groupby group_cols, and sum
    df['location_id'] = 1
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + [denominator]].sum()
    # return to appropriate metric (draw / denominator)
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x / df[denominator]),
        df[denominator]
    ], axis=1)
    # FIX: idiomatic truthiness test instead of '== True'
    if age_standardized:
        standardize_args = [
            age_group_years_start, age_group_years_end, age_group_id
        ]
        assert all(isinstance(arg, int) for arg in standardize_args
                   ), 'age-standardization parameters must be integers'
        df = age_standardize(df, group_cols, age_group_years_start,
                             age_group_years_end, age_group_id)
    return df
def main():
    """Read pm2.5 files, standardize columns, add location id and the
    England aggregate, interpolate pre-2010 years, and write draws to HDF.

    NOTE(review): the original docstring said "add china aggregate" but the
    code calls add_england_aggregate — docstring corrected to match.
    """
    # get all the things
    locsdf = qry.get_sdg_reporting_locations()
    dfs = []
    print('assembling pm25 files')
    n = len(locsdf['ihme_loc_id'])
    # BUG FIX: original tested `i % (n / 10) == 0`, which raises
    # ZeroDivisionError when n < 10 and is a float modulus on python 3
    step = max(n // 10, 1)
    i = 0
    for ihme_loc_id in locsdf['ihme_loc_id']:
        path = os.path.join(dw.MEAN_PM25_INDIR, ihme_loc_id + '.csv')
        if os.path.isfile(path):
            df = pd.read_csv(path)
            df['ihme_loc_id'] = ihme_loc_id
            dfs.append(df)
        else:
            print("Does not exist:" + path)
        i = i + 1
        if i % step == 0:
            print('{i}/{n} files complete'.format(i=i, n=n))
    df = pd.concat(dfs, ignore_index=True)
    # attach location_id via ihme_loc_id
    df = df.merge(locsdf[['ihme_loc_id', 'location_id']])
    df = df.rename(columns={'year': 'year_id'})
    df = df.rename(columns={'draw_1000': 'draw_0'})
    df = df[['location_id', 'year_id'] + dw.DRAW_COLS]
    df = add_england_aggregate(df, locsdf)
    # standardize column structure again
    df['metric_id'] = 3
    df['measure_id'] = 19
    df['age_group_id'] = 22
    df['sex_id'] = 3
    df = df[dw.MEAN_PM25_GROUP_COLS + dw.DRAW_COLS]
    # interpolate pre-2010, keeping the observed 2010+ years as-is
    post2010df = df.query('year_id >= 2010')
    df = custom_interpolate(df)
    df = df.append(post2010df)
    # save (tolerate a concurrent mkdir of the same output dir)
    try:
        if not os.path.exists(dw.MEAN_PM25_OUTDIR):
            os.makedirs(dw.MEAN_PM25_OUTDIR)
    except OSError:
        pass
    df.to_hdf(dw.MEAN_PM25_OUTDIR + "/air_pm_draws_clean.h5",
              format="table",
              key="data",
              data_columns=['location_id', 'year_id'])
def df_is_square_on_indicator_location_year(df, return_fail_df=False):
    """Test that the dataframe has all combos of indicator-location-year.

    Builds the square frame of level-3 locations x status-1 indicators x
    five-year increments of 1990-2015, outer-merges against df, and
    reports any rows present on only one side (composite indicators
    1054/1055/1060 are allowed to be data-only).

    Args:
        df: DataFrame with indicator_id, location_id, year_id columns.
            NOTE: mutated in place (an 'in_data' flag column is added).
        return_fail_df: if True, return the mismatch frame instead of
            raising.

    Returns:
        True when square; otherwise the mismatch DataFrame (if
        return_fail_df) or raises ValueError.
    """
    # construct square dataframe that contains level 3 locations, all years
    # in five year increments of 1990-2015, and all status 1 indicators
    indic_table = _get_indicator_table()
    locs_table = qry.get_sdg_reporting_locations()
    sdg_locs = list(set(locs_table.location_id))
    locs = pd.DataFrame({'location_id': sdg_locs})
    stat1indic = indic_table.query(
        'indicator_status_id==1').indicator_id.unique()
    indicators = pd.DataFrame({'indicator_id': stat1indic})
    years5inc = range(1990, 2016, 5)
    years = pd.DataFrame({'year_id': years5inc})
    # cross join via a constant key
    for sdf in [locs, indicators, years]:
        sdf['key'] = 0
    square_df = locs.merge(indicators, on='key')
    square_df = square_df.merge(years, on='key')
    square_df = square_df.drop('key', axis=1)
    # store dataframe of missing values from the square dataframe
    df['in_data'] = 1
    square_df['in_square'] = 1
    mdf = square_df.merge(df, how='outer')
    # FIX: .ix is deprecated/removed in modern pandas; .loc is identical
    # for boolean-mask selection
    mdf = mdf.loc[(mdf['in_data'].isnull()) | (mdf['in_square'].isnull())]
    # composite indicators are allowed to appear in data only
    mdf = mdf.loc[~((mdf['in_data'] == 1) &
                    (mdf['indicator_id'].isin([1054, 1055, 1060])))]
    if len(mdf) > 0:
        # pretty print the missing stuff
        mdf = mdf[['location_id', 'year_id', 'indicator_id', 'in_data',
                   'in_square']]
        mdf = mdf.merge(locs_table[['location_id', 'location_name']])
        mdf = mdf.merge(indic_table[['indicator_id', 'indicator_short']])
        mdf = mdf[['location_name', 'indicator_short', 'year_id', 'in_data',
                   'in_square']]
        if return_fail_df:
            return mdf
        else:
            raise ValueError(
                "Mismatch in the following: \n{df}".format(df=mdf)
            )
    else:
        return True
def output_hrh_components_full_time_series(cov_version=dw.COV_VERS,
                                           sdg_version=dw.SDG_VERS):
    """Concatenate past and forecast HRH component draws and write a feather.

    Reads components 1457/1460/1463 from the past (covariate) and future
    (forecast) directories, keeps the reference scenario (0) for futures,
    attaches location level, tags indicator_id 1096, and writes the
    unscaled components feather for this sdg_version.

    Args:
        cov_version: covariate data version directory name.
        sdg_version: version stamp used in the output filename.
    """
    past_path = dw.INPUT_DATA_DIR + 'covariate' + '/' + cov_version
    future_path = dw.FORECAST_DATA_DIR + 'covariate' + '/' + cov_version
    dfs = []
    for component_id in [1457, 1460, 1463]:
        print("pulling {c}".format(c=component_id))
        df_past = pd.read_feather(past_path + '/' + str(component_id) +
                                  '.feather')
        df_past = df_past[['indicator_component_id', 'location_id',
                           'year_id'] + dw.DRAW_COLS]
        df_future = pd.read_feather(future_path + '/' + str(component_id) +
                                    '.feather')
        # keep only the reference scenario
        df_future = df_future.loc[df_future.scenario == 0, :]
        df_future['indicator_component_id'] = component_id
        df_future = df_future[
            ['indicator_component_id', 'location_id', 'year_id'] +
            dw.DRAW_COLS]
        dfs.extend([df_past, df_future])
    # FIX: py2-only print statements -> print function (the rest of this
    # function already uses the function form)
    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)
    # adding level column
    locsdf = qry.get_sdg_reporting_locations().loc[:, ['location_id',
                                                       'level']]
    df = df.merge(locsdf, on='location_id')
    print("outputting feather")
    df['indicator_id'] = 1096
    df = df[[
        'indicator_id', 'indicator_component_id', 'location_id', 'year_id',
        'level'
    ] + dw.DRAW_COLS]
    # feather requires string column names
    df.columns = df.columns.astype(str)
    df.to_feather(dw.INDICATOR_DATA_DIR +
                  'gbd2017/hrh_unscaled_components_v{v}.feather'.format(
                      v=sdg_version))
    print("done")
# Module setup. NOTE(review): 'USER' and 'FILEPATH' look like scrubbed
# placeholders for a real username and repo/data paths.
if getuser() == 'USER':
    SDG_REPO = 'FILEPATH'
sys.path.append(SDG_REPO)
import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

# path to the population file (scrubbed placeholder)
POP_FILE = 'FILEPATH'
# indicator components handled by this module
component_ids = [20, 23]
# id columns for past (GBD) data
group_cols_past = [
    'location_id', 'year_id', 'measure_id', 'metric_id', 'age_group_id',
    'sex_id'
]
# id columns for forecast data (scenario instead of measure/metric)
group_cols_future = [
    'location_id', 'year_id', 'age_group_id', 'sex_id', 'scenario'
]
# level-3 (national) SDG reporting locations
NATS = qry.get_sdg_reporting_locations(level_3=True)


def add_denom_rows(df, agg_var, agg_dim, agg_parent, agg_children=None):
    '''
    Create expanded demographics dataframe for denominators.

    Builds aggregate rows by summing agg_var over agg_dim: rows (optionally
    restricted to agg_children) are relabeled with agg_parent on agg_dim
    and summed within index_cols.

    NOTE(review): this function appears truncated in this view — agg_df is
    computed but never returned or combined with df; confirm against the
    full source.
    '''
    index_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'scenario'
    ]
    agg_df = df.copy()
    # optionally restrict to the children that make up the aggregate
    if agg_children is not None:
        agg_df = agg_df.loc[agg_df[agg_dim].isin(agg_children)]
    # relabel and collapse to the parent
    agg_df[agg_dim] = agg_parent
    agg_df = agg_df.groupby(index_cols, as_index=False)[agg_var].sum()
import pandas as pd
import sys
import os
from getpass import getuser
import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

# SDG reporting locations drive every location filter below
locsdf = qry.get_sdg_reporting_locations()

# load the met-need dataset, restricted to reporting years and locations
df = pd.read_csv(dw.MET_NEED_FILE)
df = df.loc[df['year_id'] >= 1990]
df = df.loc[df['location_id'].isin(list(locsdf['location_id']))]
# rate metric, proportion measure
df['metric_id'] = 2
df['measure_id'] = 18

# get age weights: derive from population for the mod_contra version,
# otherwise read the precomputed weights file
if 'mod_contra' in dw.MET_NEED_VERS:
    agesdf = qry.get_pops()
    agesdf = agesdf.loc[agesdf.age_group_id.isin(df.age_group_id.unique())]
    by_loc_year_sex = agesdf.groupby(
        ['location_id', 'year_id', 'sex_id'], as_index=False)
    agesdf['totpop'] = by_loc_year_sex['population'].transform('sum')
    # each age group's share of its location-year-sex population
    agesdf['weights'] = agesdf['population'] / agesdf['totpop']
else:
    agesdf = pd.read_csv(dw.MET_NEED_WEIGHTS_FILE)
    agesdf = agesdf.loc[agesdf['location_id'].isin(
        list(locsdf['location_id']))]
def fetch_indicators(sdg_version=dw.SDG_VERS, force_recompile=False):
    """Fetch all indicator data and save in shared scratch space.

    Combines past and forecast draws per indicator, floors draws at 1e-12,
    filters sub-national levels (keeping English regions), and caches the
    result as a feather keyed on sdg_version.

    Args:
        sdg_version: version stamp used in the cached filename.
        force_recompile: if True, rebuild even when the cache file exists.

    Returns:
        DataFrame of dw.INDICATOR_ID_COLS + ['level'] + dw.DRAW_COLS.
    """
    # write/read output here
    path = "{idd}/gbd2017/".format(idd=dw.INDICATOR_DATA_DIR)
    out_file = path + "all_indicators_unscaled_v{}_conflict_disaster_truncated.feather".format(
        sdg_version)
    if os.path.exists(out_file) and not force_recompile:
        print("reading from existing file")
        df = pd.read_feather(out_file)
    else:
        print("compiling {}".format(sdg_version))
        print(out_file)
        # get an input_file dict to determine where to read data for each
        input_file_dict = fetch_input_file_dict()
        forecast_file_dict = fetch_forecast_file_dict()
        # list of dataframes for fast concatenation
        dfs = []
        for indicator_id in input_file_dict.keys():
            # read indicator data file if both past and future exist
            if os.path.exists(
                    input_file_dict[indicator_id]) and os.path.exists(
                        forecast_file_dict[indicator_id]):
                print("\t{}".format(indicator_id))
                df = pd.read_feather(input_file_dict[indicator_id])
                if len(df.sex_id.unique()) > 1:
                    df = df.loc[df.sex_id == 3, :]
                df = df[['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                        dw.DRAW_COLS]
                futuredf = pd.read_feather(forecast_file_dict[indicator_id])
                if indicator_id == 1037:
                    futuredf['sex_id'] = 3
                    futuredf['age_group_id'] = 22
                    futuredf.rename(index=str,
                                    columns={
                                        'draw_1000': 'draw_0',
                                        'year': 'year_id',
                                        'iso3': 'ihme_loc_id'
                                    },
                                    inplace=True)
                    # NOTE(review): this expression has no effect — the
                    # result is never assigned. The year restriction for
                    # 1037 happens below; confirm whether an assignment
                    # was intended here.
                    futuredf[(futuredf.year_id >= 2018) &
                             (futuredf.year_id <= 2030)]
                    locs = qry.get_sdg_reporting_locations(level_3=True)
                    locs = locs[['location_id', 'ihme_loc_id']]
                    futuredf = futuredf.merge(locs, how='left')
                if len(futuredf.sex_id.unique()) > 1:
                    futuredf = futuredf.loc[futuredf.sex_id == 3, :]
                if 'scenario' in futuredf.columns:
                    futuredf = futuredf.loc[futuredf.scenario == 0, :]
                futuredf = futuredf[futuredf.year_id != 2017]
                futuredf = futuredf[
                    ['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                    dw.DRAW_COLS]
                if indicator_id in [1033, 1037]:
                    futuredf = futuredf.loc[futuredf.year_id.isin(
                        range(2018, 2031))]
                df = df.append(futuredf)
                # set indicator id to the key in the dict
                df['indicator_id'] = indicator_id
                # BUG FIX: original was `len(df.age_group_id.unique() > 1)`
                # — len() of a boolean array, so the condition was always
                # true; the parenthesis belongs around unique()
                if len(df.age_group_id.unique()) > 1:
                    df = df[df.age_group_id != 159]
                # should be one age and one sex
                assert len(df.sex_id.unique()) == 1, 'multiple sexes present'
                assert len(
                    df.age_group_id.unique()) == 1, 'multiple ages present'
                if indicator_id in [1019, 1031]:
                    print("truncating")
                    # clip draws to their own 2.5/97.5 percentiles per row
                    df['upper'] = df[dw.DRAW_COLS].quantile(q=0.975, axis=1)
                    df['lower'] = df[dw.DRAW_COLS].quantile(q=0.025, axis=1)
                    df[dw.DRAW_COLS] = df[dw.DRAW_COLS].clip(lower=df.lower,
                                                             upper=df.upper,
                                                             axis=0)
                df = df[dw.INDICATOR_ID_COLS + dw.DRAW_COLS]
                dfs.append(df)
        print("concatenating")
        df = pd.concat(dfs, ignore_index=True)
        # set floor of < 1e-12 (range works on both py2 and py3; xrange is
        # py2-only)
        for i in range(1000):
            df.loc[df['draw_' + str(i)] < 1e-12, 'draw_' + str(i)] = 1e-12
            df.loc[df['draw_' + str(i)].isnull(), 'draw_' + str(i)] = 1e-12
        # remove level 6 (UTLAs) and level 5 india urban/rural (but keep
        # English regions)
        locsdf = qry.get_sdg_reporting_locations(
        ).loc[:, ['location_id', 'level']]
        locsdf = locsdf.loc[locsdf['level'] != 6]
        locsdf = locsdf.loc[~(
            (locsdf['level'] == 5) &
            (~locsdf['location_id'].isin(range(4618, 4627))))]
        df = df.merge(locsdf, on='location_id')
        # write all the indicator data to the directory
        df = df[dw.INDICATOR_ID_COLS + ['level'] + dw.DRAW_COLS]
        df.columns = df.columns.astype(str)
        print("outputting feather")
        df.to_feather(out_file)
    return df
def fetch_indicators(sdg_version, force_recompile=False):
    """Fetch all indicator data, cache to the version directory, return it.

    Combines past and forecast draws per indicator, restricts to SDG
    reporting locations, floors draws at 1e-12, and caches the result (plus
    the input-file dict) under the version directory.

    Args:
        sdg_version: version directory name under dw.INDICATOR_DATA_DIR.
        force_recompile: if True, rebuild even when the cache file exists.

    Returns:
        DataFrame of dw.INDICATOR_ID_COLS + dw.DRAW_COLS.
    """
    # write/read output here
    version_dir = "{idd}/{v}".format(idd=dw.INDICATOR_DATA_DIR,
                                     v=sdg_version)
    out_file = version_dir + "/all_indicators.h5"
    if os.path.exists(out_file) and not force_recompile:
        # FIX: py2-only print statements -> print function throughout
        print("reading from existing file")
        df = pd.read_hdf(out_file)
    else:
        print("recompiling")
        # get an input_file dict to determine where to read data for each
        # indicator
        input_file_dict = fetch_input_file_dict()
        # list of dataframes for fast concatenation
        dfs = []
        for indicator_id in input_file_dict.keys():
            print("\t{}".format(indicator_id))
            # read indicator data file, falling back to csv
            if os.path.exists(input_file_dict[indicator_id]):
                df = pd.read_hdf(input_file_dict[indicator_id])
            else:
                df = pd.read_csv(input_file_dict[indicator_id].replace(
                    '.h5', '.csv'))
            df = df[['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                    dw.DRAW_COLS]
            # forecasts live in a parallel directory tree
            forecastpath = input_file_dict[indicator_id].replace(
                '/input_data/', '/forecasts/')
            futuredf = pd.read_hdf(forecastpath)
            futuredf = futuredf[
                ['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                dw.DRAW_COLS]
            df = df.append(futuredf)
            # for now, limit locations to admin0
            locsdf = qry.get_sdg_reporting_locations()
            df = df.loc[df.location_id.isin(locsdf.location_id.values)]
            assert set(df.location_id.values) == set(
                locsdf.location_id.values
            ), 'Missing location(s) for ' + str(indicator_id)
            # set indicator id to the key in the dict
            df['indicator_id'] = indicator_id
            # should be one age and one sex
            assert len(df.sex_id.unique()) == 1, 'multiple sexes present'
            assert len(df.age_group_id.unique()) == 1, 'multiple ages present'
            # if len(df.age_group_id.unique()) > 1:
            #     df = age_standardize(df, weights_dict[indicator_id])
            # keep & verify required columns
            df = df[dw.INDICATOR_ID_COLS + dw.DRAW_COLS]
            # append to dataframe list
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        # set floor of < 1e-12 (range works on both py2 and py3; xrange is
        # py2-only)
        for i in range(1000):
            df.loc[df['draw_' + str(i)] < 1e-12, 'draw_' + str(i)] = 1e-12
        # make version directory if it doesnt exist yet (likely)
        if not os.path.exists(version_dir):
            os.mkdir(version_dir)
        # save the input dictionary as a pickle for convenience
        pd.to_pickle(input_file_dict,
                     version_dir + "/input_file_dict.pickle")
        # write all the indicator data to the version directory
        df.to_hdf(version_dir + "/all_indicators.h5",
                  format="table",
                  key="data",
                  data_columns=dw.INDICATOR_ID_COLS)
    # return
    return df
def process_demo_draws(past_future, version=dw.DEMO_VERS):
    '''Process demographics draws for past or future data.

    Merges component draws with live births, keeps a sex-split copy,
    appends a both-sex aggregate, computes the global aggregate, and
    writes one feather per indicator (1040 for component 56, else 1041)
    plus a matching *_global feather.

    Args:
        past_future: 'past' or 'future'; selects the input directory.
        version: demographics data version.

    Raises:
        ValueError: if past_future is not 'past' or 'future'.
    '''
    if past_future == 'past':
        data_dir = dw.INPUT_DATA_DIR + 'demographics' + '/' + str(version)
    elif past_future == 'future':
        data_dir = dw.FORECAST_DATA_DIR + 'demographics' + '/' + str(version)
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')
    component_ids = dw.DEMO_COMPONENT_IDS
    dfs = []
    for component_id in component_ids:
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        dfs.append(df)
    print('concatenating')
    df = pd.concat(dfs, ignore_index=True)
    # temporary for gk: restrict to level-3 (national) locations
    locs = qry.get_sdg_reporting_locations(level_3=True)
    df = df[df.location_id.isin(locs.location_id)]
    # get live births
    births = load_births()
    if past_future == 'past':
        births = births[births.scenario == 0]
        births.drop('scenario', inplace=True, axis=1)
    df = df.merge(births, how='left')
    # Keep sex-split copy before aggregating
    df_sex_split = df.copy(deep=True)
    # aggregate sexes
    # BUG FIX: original was `df['sex_id'] == 3` — a comparison whose result
    # was discarded, so sex_id was never set before aggregation; assignment
    # was clearly intended
    df['sex_id'] = 3
    df = agg.age_sex_aggregate(df, dw.DEMO_GROUP_COLS, denominator='births')
    df = df.append(df_sex_split, ignore_index=True)
    # global aggregation
    df_global = agg.aggregate_locations_to_global(df, dw.DEMO_GROUP_COLS,
                                                  denominator='births')
    # output
    df = df[dw.DEMO_GROUP_COLS + dw.DRAW_COLS]
    df_global = df_global[dw.DEMO_GROUP_COLS + dw.DRAW_COLS]
    for component_id in component_ids:
        # component 56 maps to indicator 1040; everything else to 1041
        if component_id == 56:
            ind_id = '1040'
        else:
            ind_id = '1041'
        print("outputting " + ind_id)
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(data_dir + '/' + ind_id + '.feather')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(data_dir + '/' + ind_id + '_' + 'global' +
                                '.feather')
# keep only vital registration sources
# FIX: .ix is deprecated/removed in modern pandas; .loc is identical for
# boolean-mask selection
df = df.loc[df['source_type'].str.startswith("VR")]
df = df.query("~(ihme_loc_id=='ZAF' & source_type=='VR-SSA')")
# add China national estimate
df = df.append(assemble_china_df(), ignore_index=True)
# add India national estimate after removing other india
df = df.query("ihme_loc_id != 'IND'")
df = df.append(assemble_ind_df(), ignore_index=True)
# add England as copy of UK
df = add_england(df)
# create square dataset of all sdg reporting locations and years, and fill in
# any missing completeness
years = range(1990, 2016, 1)
years = pd.DataFrame({'year_id': years})
years['key'] = 0
locs = qry.get_sdg_reporting_locations()[['location_id']]
locs['key'] = 0
# cross join locations x years via the constant key
sqr_df = years.merge(locs, on='key')
sqr_df = sqr_df.drop('key', axis=1)
sqr_df['sex_id'] = 3
df = sqr_df.merge(df, how='left')
# any location-year absent from the data gets completeness 0
df[dw.COMPLETENESS_DATA_COL] = df[dw.COMPLETENESS_DATA_COL].fillna(0)
# make a fake draws dataframe by copying trunc_pred (completeness)
# into each draw, so there is no uncertainty
idx = df.index
draws = pd.DataFrame(index=df.index, columns=dw.DRAW_COLS)
draws = draws.apply(lambda x: x.fillna(df[dw.COMPLETENESS_DATA_COL]))
# add the draws to the dataframe, leveraging the shared index
df = pd.concat([df, draws], axis=1)