def predict_abc(interp, extrap, interp_index, extrap_index, weight,
                interp_weights, extrap_weights, cs, abc, verbose=True):
    """Fit weighted lagged-earnings regressions and project ABC earnings.

    For every age in 22-29 (rows of ``interp``) and 31-67 (rows of
    ``extrap``), and for each of the pooled / male / female groups,
    ``inc_labor<age>`` is regressed by WLS on the matching predictor list
    from ``cols`` plus the previous earnings variable (``inc_labor<age-1>``;
    age 31 uses ``inc_labor29``).  Residuals are resampled once per ABC
    individual, and the coefficients are chained forward over ``abc`` to
    produce projected earnings.

    Parameters
    ----------
    interp, extrap : DataFrame
        Auxiliary samples carrying ``black``, ``male``, the predictors and
        the ``inc_labor<age>`` columns.
    interp_index, extrap_index : list
        Row labels selected from ``interp`` / ``extrap``.  The second
        element of every ``extrap_index`` entry is used as the weight id.
    weight : str
        Suffix of the weight column.  ``'treat'`` / ``'control'`` also
        restrict the ABC rows that receive resampled errors to ``R == 1`` /
        ``R == 0``.
    interp_weights, extrap_weights : DataFrame
        Weight tables exposing ``id`` and ``draw`` after ``reset_index()``.
        They are not modified.
    cs : str
        Inserted into the weight column name
        ``'wtabc_allids_c' + cs + '_' + weight``.
    abc : DataFrame
        ABC individuals with ``male``, ``R`` and the predictor columns.
        An ``Intercept`` column of ones is written into it in place.
        Assumes the first index level holds the individual id - TODO confirm.
    verbose : bool
        Print one progress line per age.

    Returns
    -------
    tuple
        ``(params_interp, params_extrap, error_mat, projection_interp,
        projection_extrap)``, each a dict keyed by
        ``'pooled'``/``'male'``/``'female'``.
    """
    interp_ages = list(range(22, 30))
    extrap_ages = list(range(31, 68))
    ages = interp_ages + extrap_ages
    sexes = ['pooled', 'male', 'female']
    interp_predictors = list(cols.interp.predictors)
    extrap_predictors = list(cols.extrap.predictors)
    weight_type = 'wtabc_allids_c' + cs + '_' + weight
    weight_col = '_wls_weight'

    def _weights_for(weights, ids):
        # Reshape a copy so the caller's table survives and can be passed
        # to several calls.
        by_id = weights.reset_index()
        del by_id['draw']
        by_id = by_id.set_index('id', drop=True)
        return by_id.loc[ids, weight_type]

    def _split_by_sex(frame, index, weights):
        aux = frame.loc[index, :].copy()
        # Attach every row's weight BEFORE filtering so that the weight
        # stays with its observation through the black / sex subsets.
        # Relies on both selections returning one row per requested label
        # in the same order; a length mismatch raises here.
        aux[weight_col] = weights.values
        # keep only the black observations of the auxiliary sample
        aux = aux.loc[aux.black == 1]
        groups = {
            'pooled': aux,
            'male': aux.loc[aux.male == 1],
            'female': aux.loc[aux.male == 0],
        }
        # dmatrices drops incomplete rows; a plain 0..n-1 index lets the
        # surviving rows be matched back to their weights.
        return dict((sex, grp.reset_index(drop=True))
                    for sex, grp in groups.items())

    def _abc_by_sex():
        return {
            'pooled': abc,
            'male': abc.loc[abc.male == 1],
            'female': abc.loc[abc.male == 0],
        }

    def _empty_params(age_list, predictors):
        return pd.DataFrame(
            np.nan,
            index=pd.Index(age_list, name='age'),
            columns=['Intercept'] + predictors + ['y', 'rmse'])

    params_interp = dict((sex, _empty_params(interp_ages, interp_predictors))
                         for sex in sexes)
    params_extrap = dict((sex, _empty_params(extrap_ages, extrap_predictors))
                         for sex in sexes)
    error_columns = dict((sex, []) for sex in sexes)
    error_mat = {}

    # The auxiliary samples do not depend on age, so build them once.
    interp_samples = _split_by_sex(
        interp, interp_index, _weights_for(interp_weights, interp_index))
    extrap_samples = _split_by_sex(
        extrap, extrap_index,
        _weights_for(extrap_weights, [x[1] for x in extrap_index]))

    # ABC individuals that receive resampled errors.
    error_groups = _abc_by_sex()
    if weight in ('treat', 'control'):
        r_value = 1 if weight == 'treat' else 0
        error_groups = dict((sex, grp.loc[grp.R == r_value])
                            for sex, grp in error_groups.items())

    # obtain parameters for every age
    for age in ages:
        is_interp = age in interp_ages
        if is_interp:
            samples, base, target = (interp_samples, interp_predictors,
                                     params_interp)
            age_x = age - 1
        else:
            samples, base, target = (extrap_samples, extrap_predictors,
                                     params_extrap)
            # there is no age-30 model, so age 31 conditions on age 29
            age_x = 29 if age == 31 else age - 1
        lag = 'inc_labor{}'.format(age_x)
        predictors = base + [lag]
        fmla = '{} ~ {}'.format('inc_labor{}'.format(age),
                                ' + '.join(predictors))
        n_obs = 0

        for sex in sexes:
            data = samples[sex]
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)
            wls_weights = data.loc[exog.index, weight_col].dropna()
            exog = exog.loc[wls_weights.index, :]
            endog = endog.loc[wls_weights.index, :]

            # estimate coefficients; a failed fit is recorded as NaN
            try:
                fit = sm.WLS(endog, exog, weights=wls_weights).fit()
                params, resid = fit.params, fit.resid
                failed = False
            except Exception:
                failed = True
                params = pd.Series(np.nan,
                                   index=['Intercept'] + base + ['y'])
                resid = pd.Series(np.nan, index=endog.index)

            rmse = pd.Series(sqrt((resid * resid).mean(axis=0)),
                             index=['rmse'])
            params = pd.concat([params, rmse], axis=0).rename({lag: 'y'})
            target[sex].loc[age, :] = params

            # resample the errors, one draw per ABC individual
            people = error_groups[sex]
            if failed:
                draws = np.repeat(np.nan, len(people))
            else:
                draws = np.random.choice(resid.values, size=len(people))
            ehat = pd.DataFrame(
                {age: draws},
                index=pd.Index(people.index.get_level_values(0), name='id'))
            ehat.columns.name = 'age'
            error_columns[sex].append(ehat)
            n_obs = exog.shape[0]

        if verbose:
            print('Successful predictions, age {}, n={}'.format(age, n_obs))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in sexes:
        errors = pd.concat(error_columns[sex], axis=1)
        error_mat[sex] = pd.concat([errors, treat], axis=1, join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    # remove errors for ABC individuals for whom we do not predict earnings
    # (any predictor missing).  Computed from the full ABC sample and
    # addressed by explicit age labels rather than column positions.
    is_male = (abc.male == 1).values
    is_female = (abc.male == 0).values
    lacks_interp = abc.loc[:, interp_predictors].isnull().any(axis=1).values
    lacks_extrap = abc.loc[:, extrap_predictors].isnull().any(axis=1).values
    membership = [('male', is_male), ('female', is_female),
                  ('pooled', is_male | is_female)]
    for sex, in_group in membership:
        for lacks, age_cols in [(lacks_interp, interp_ages),
                                (lacks_extrap, extrap_ages)]:
            rows = error_mat[sex].index.isin(abc.index[in_group & lacks])
            if rows.any():
                error_mat[sex].loc[rows, age_cols] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    abc.loc[:, 'Intercept'] = 1
    for sex, abcd in _abc_by_sex().items():
        # reindex creates an all-NaN 'y' column when abc has none
        abcd_interp = abcd.reindex(
            columns=['Intercept'] + interp_predictors + ['y'])
        abcd_extrap = abcd.reindex(
            columns=['Intercept'] + extrap_predictors + ['y'])
        interp_parts = []
        extrap_parts = []
        interp_dot = None

        for age in ages:
            if age in interp_ages:
                if age == 22:
                    # no lagged projection exists yet
                    abcd_interp['y'] = 0
                coefs = pd.DataFrame(params_interp[sex].loc[age].drop('rmse'))
                interp_dot = abcd_interp.dot(coefs) + error_mat[sex][[age]]
                # feed this age's projection into the next age as the lag
                abcd_interp['y'] = interp_dot.iloc[:, 0]
                interp_parts.append(interp_dot)
            else:
                if age == 31:
                    # the lag coefficient is switched off at the first
                    # extrapolation age; this also zeroes it in the
                    # returned params_extrap
                    params_extrap[sex].loc[31, 'y'] = 0
                    abcd_extrap['y'] = interp_dot.iloc[:, 0]
                    abcd_extrap['y'] = abcd_extrap['y'].fillna(0)
                coefs = pd.DataFrame(params_extrap[sex].loc[age].drop('rmse'))
                extrap_dot = abcd_extrap.dot(coefs) + error_mat[sex][[age]]
                abcd_extrap['y'] = extrap_dot.iloc[:, 0]
                extrap_parts.append(extrap_dot)

        projection_interp[sex] = pd.concat(interp_parts, axis=1)
        projection_extrap[sex] = pd.concat(extrap_parts, axis=1)

    return (params_interp, params_extrap, error_mat, projection_interp,
            projection_extrap)
def predict_abc(extrap, extrap_index, abc, verbose=True):
    """Fit lagged-earnings OLS models by age and chain them over ABC rows.

    For every age in 21-67, ``inc_labor<age>`` is regressed by OLS on
    ``cols.extrap.predictors`` plus ``inc_labor<age-1>`` using the rows of
    ``extrap`` selected by ``extrap_index``.  The auxiliary sample is not
    split by sex here, so the same coefficients are stored for the pooled,
    male and female groups; only the resampled residuals differ.

    Each ABC individual is then projected forward from ``last_age`` to age
    67, starting from ``inc_labor_last`` and feeding every projection in
    as the next age's lag.

    Parameters
    ----------
    extrap : DataFrame
        Auxiliary sample with the predictors and ``inc_labor<age>`` columns.
    extrap_index : list
        Row labels selected from ``extrap``.
    abc : DataFrame
        ABC individuals with ``male_subject``, ``R``, ``last_age``,
        ``inc_labor_last`` and the predictor columns.  An ``Intercept``
        column of ones is written into it in place.  Assumes the first
        index level holds the individual id and that ``last_age`` is
        integer-valued where present - TODO confirm.
    verbose : bool
        Print one progress line per age.

    Returns
    -------
    tuple
        ``(params_extrap, error_mat, projection_extrap, abc)``; the first
        three are dicts keyed by ``'pooled'``/``'male'``/``'female'``.
        Each projection frame has one row per individual, columns 21-68
        for ages and column 69 holding the id, which is also the index.
    """
    ages = list(range(21, 68))
    sexes = ['pooled', 'male', 'female']
    predictors_base = list(cols.extrap.predictors)
    design_columns = ['Intercept'] + predictors_base + ['y']

    def _abc_by_sex():
        return {
            'pooled': abc,
            'male': abc.loc[abc.male_subject == 1],
            'female': abc.loc[abc.male_subject == 0],
        }

    params_extrap = dict(
        (sex, pd.DataFrame(np.nan, index=pd.Index(ages, name='age'),
                           columns=design_columns + ['rmse']))
        for sex in sexes)
    error_columns = dict((sex, []) for sex in sexes)
    error_mat = {}
    error_groups = _abc_by_sex()

    # Plain 0..n-1 index: the regression drops incomplete rows and the
    # original labels are not needed afterwards.
    data = extrap.loc[extrap_index, :].reset_index(drop=True)

    # obtain parameters for every age
    for age in ages:
        lag = 'inc_labor{}'.format(age - 1)
        predictors = predictors_base + [lag]
        fmla = '{} ~ {}'.format('inc_labor{}'.format(age),
                                ' + '.join(predictors))
        endog, exog = dmatrices(fmla, data, return_type='dataframe')
        exog = sm.add_constant(exog)

        # One fit per age: the data are identical for all three groups.
        try:
            fit = sm.OLS(endog, exog).fit()
            params, resid = fit.params, fit.resid
            failed = False
        except Exception:
            failed = True
            params = pd.Series(np.nan, index=design_columns)
            resid = pd.Series(np.nan, index=endog.index)

        rmse = pd.Series(sqrt((resid * resid).mean(axis=0)), index=['rmse'])
        params = pd.concat([params, rmse], axis=0).rename({lag: 'y'})

        for sex in sexes:
            params_extrap[sex].loc[age, :] = params

            # resample the errors, one draw per ABC individual
            people = error_groups[sex]
            if failed:
                draws = np.repeat(np.nan, len(people))
            else:
                draws = np.random.choice(resid.values, size=len(people))
            ehat = pd.DataFrame(
                {age: draws},
                index=pd.Index(people.index.get_level_values(0), name='id'))
            ehat.columns.name = 'age'
            error_columns[sex].append(ehat)

        if verbose:
            print('Successful predictions, age {}, n={}'.format(
                age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in sexes:
        errors = pd.concat(error_columns[sex], axis=1)
        error_mat[sex] = pd.concat([errors, treat], axis=1, join='inner')
        params_extrap[sex].columns.name = 'variable'

    # remove errors for ABC individuals for whom we do not predict earnings
    # (any extrapolation predictor missing), addressed by age label.
    # NOTE(review): every age in this variant is an extrapolation age, so
    # all age columns are blanked - confirm this is the intended range.
    is_male = (abc.male_subject == 1).values
    is_female = (abc.male_subject == 0).values
    lacks = abc.loc[:, predictors_base].isnull().any(axis=1).values
    membership = [('male', is_male), ('female', is_female),
                  ('pooled', is_male | is_female)]
    for sex, in_group in membership:
        rows = error_mat[sex].index.isin(abc.index[in_group & lacks])
        if rows.any():
            error_mat[sex].loc[rows, ages] = np.nan

    # predict earnings
    projection_extrap = {}
    abc.loc[:, 'Intercept'] = 1
    for sex, abcd in _abc_by_sex().items():
        # reindex creates an all-NaN 'y' column when abc has none
        design = abcd.reindex(columns=design_columns)
        paths = []

        for ident, person in abcd.iterrows():
            path = pd.Series(np.nan, index=pd.Index(range(21, 69), name='age'))
            last_age = person.loc['last_age']
            design.loc[ident, 'y'] = person.loc['inc_labor_last']
            # a missing last_age compares False and leaves the path empty
            if last_age > 20:
                # int(): row values may come back as floats, which range()
                # does not accept
                for age in range(int(last_age), 68):
                    coefs = params_extrap[sex].loc[age].drop('rmse')
                    value = (design.loc[ident, :].dot(coefs) +
                             error_mat[sex].loc[ident, age])
                    # feed this projection into the next age as the lag
                    design.loc[ident, 'y'] = value
                    path.loc[age] = value
            path.loc[69] = ident
            paths.append(path)

        if paths:
            projection = pd.DataFrame(paths)
        else:
            # empty group: keep the column layout instead of failing
            projection = pd.DataFrame(columns=list(range(21, 70)))
        projection.columns.name = 'age'
        projection.index = pd.Index(projection[69].values, name=69)
        projection_extrap[sex] = projection

    return params_extrap, error_mat, projection_extrap, abc
def predict_abc(interp, extrap, interp_index, extrap_index, abc,
                verbose=True):
    """Fit per-age OLS earnings models and project earnings for ABC rows.

    For every age in 22-29 (rows of ``interp``) and 31-67 (rows of
    ``extrap``), and for each of the pooled / male / female groups,
    ``inc_labor<age>`` is regressed by OLS on the matching predictor list
    from ``cols``.  Residuals are resampled once per ABC individual and
    added to the fitted values computed on ``abc``.

    Parameters
    ----------
    interp, extrap : DataFrame
        Auxiliary samples carrying ``male``, the predictors and the
        ``inc_labor<age>`` columns.
    interp_index, extrap_index : list
        Row labels selected from ``interp`` / ``extrap``.
    abc : DataFrame
        ABC individuals with ``male``, ``R`` and the predictor columns.
        An ``Intercept`` column of ones is written into it in place.
        Assumes the first index level holds the individual id - TODO confirm.
    verbose : bool
        Print one progress line per age.

    Returns
    -------
    tuple
        ``(params_interp, params_extrap, error_mat, projection_interp,
        projection_extrap)``, each a dict keyed by
        ``'pooled'``/``'male'``/``'female'``.
    """
    interp_ages = list(range(22, 30))
    extrap_ages = list(range(31, 68))
    ages = interp_ages + extrap_ages
    sexes = ['pooled', 'male', 'female']
    interp_predictors = list(cols.interp.predictors)
    extrap_predictors = list(cols.extrap.predictors)

    def _split_by_sex(frame, index):
        aux = frame.loc[index, :]
        groups = {
            'pooled': aux,
            'male': aux.loc[aux.male == 1],
            'female': aux.loc[aux.male == 0],
        }
        # Plain 0..n-1 index: the original labels are not needed once the
        # rows have been selected.
        return dict((sex, grp.reset_index(drop=True))
                    for sex, grp in groups.items())

    def _abc_by_sex():
        return {
            'pooled': abc,
            'male': abc.loc[abc.male == 1],
            'female': abc.loc[abc.male == 0],
        }

    def _empty_params(age_list, predictors):
        return pd.DataFrame(
            np.nan,
            index=pd.Index(age_list, name='age'),
            columns=['Intercept'] + predictors + ['rmse'])

    params_interp = dict((sex, _empty_params(interp_ages, interp_predictors))
                         for sex in sexes)
    params_extrap = dict((sex, _empty_params(extrap_ages, extrap_predictors))
                         for sex in sexes)
    error_columns = dict((sex, []) for sex in sexes)
    error_mat = {}

    # The auxiliary samples do not depend on age, so build them once.
    interp_samples = _split_by_sex(interp, interp_index)
    extrap_samples = _split_by_sex(extrap, extrap_index)
    error_groups = _abc_by_sex()

    # obtain parameters for every age
    for age in ages:
        if age in interp_ages:
            samples, predictors, target = (interp_samples, interp_predictors,
                                           params_interp)
        else:
            samples, predictors, target = (extrap_samples, extrap_predictors,
                                           params_extrap)
        fmla = '{} ~ {}'.format('inc_labor{}'.format(age),
                                ' + '.join(predictors))
        n_obs = 0

        for sex in sexes:
            endog, exog = dmatrices(fmla, samples[sex],
                                    return_type='dataframe')
            exog = sm.add_constant(exog)

            # estimate coefficients; a failed fit is recorded as NaN
            try:
                fit = sm.OLS(endog, exog).fit()
                params, resid = fit.params, fit.resid
                failed = False
            except Exception:
                failed = True
                params = pd.Series(np.nan, index=['Intercept'] + predictors)
                resid = pd.Series(np.nan, index=endog.index)

            rmse = pd.Series(sqrt((resid * resid).mean(axis=0)),
                             index=['rmse'])
            target[sex].loc[age, :] = pd.concat([params, rmse], axis=0)

            # resample the errors, one draw per ABC individual
            people = error_groups[sex]
            if failed:
                draws = np.repeat(np.nan, len(people))
            else:
                draws = np.random.choice(resid.values, size=len(people))
            ehat = pd.DataFrame(
                {age: draws},
                index=pd.Index(people.index.get_level_values(0), name='id'))
            ehat.columns.name = 'age'
            error_columns[sex].append(ehat)
            n_obs = exog.shape[0]

        if verbose:
            print('Successful predictions, age {}, n={}'.format(age, n_obs))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in sexes:
        errors = pd.concat(error_columns[sex], axis=1)
        error_mat[sex] = pd.concat([errors, treat], axis=1, join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    # remove errors for ABC individuals for whom we do not predict earnings
    # (any predictor missing).  Computed here from abc and addressed by
    # explicit age labels rather than column positions.
    is_male = (abc.male == 1).values
    is_female = (abc.male == 0).values
    lacks_interp = abc.loc[:, interp_predictors].isnull().any(axis=1).values
    lacks_extrap = abc.loc[:, extrap_predictors].isnull().any(axis=1).values
    membership = [('male', is_male), ('female', is_female),
                  ('pooled', is_male | is_female)]
    for sex, in_group in membership:
        for lacks, age_cols in [(lacks_interp, interp_ages),
                                (lacks_extrap, extrap_ages)]:
            rows = error_mat[sex].index.isin(abc.index[in_group & lacks])
            if rows.any():
                error_mat[sex].loc[rows, age_cols] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    abc.loc[:, 'Intercept'] = 1
    for sex, abcd in _abc_by_sex().items():
        abcd_interp = abcd.loc[:, ['Intercept'] + interp_predictors]
        abcd_extrap = abcd.loc[:, ['Intercept'] + extrap_predictors]
        errors = error_mat[sex].drop('R', axis=1)

        # perform projections using dot product, add back in the errors
        projection_interp[sex] = (
            abcd_interp.dot(params_interp[sex].drop('rmse', axis=1).T) +
            errors.loc[:, interp_ages])
        projection_extrap[sex] = (
            abcd_extrap.dot(params_extrap[sex].drop('rmse', axis=1).T) +
            errors.loc[:, extrap_ages])

    return (params_interp, params_extrap, error_mat, projection_interp,
            projection_extrap)
def predict_abc(interp, extrap, interp_index, extrap_index, weight,
                interp_weights, extrap_weights, cs, abc, verbose=True):
    """Fit weighted lagged-earnings regressions and project ABC earnings.

    For every age in 22-29 (rows of ``interp``) and 31-67 (rows of
    ``extrap``), and for each of the pooled / male / female groups,
    ``inc_labor<age>`` is regressed by WLS on the matching predictor list
    from ``cols`` plus the previous earnings variable (``inc_labor<age-1>``;
    age 31 uses ``inc_labor29``).  Residuals are resampled once per ABC
    individual, and the coefficients are chained forward over ``abc`` to
    produce projected earnings.

    Parameters
    ----------
    interp, extrap : DataFrame
        Auxiliary samples carrying ``black``, ``male``, the predictors and
        the ``inc_labor<age>`` columns.
    interp_index, extrap_index : list
        Row labels selected from ``interp`` / ``extrap``.  The second
        element of every ``extrap_index`` entry is used as the weight id.
    weight : str
        Suffix of the weight column.  ``'treat'`` / ``'control'`` also
        restrict the ABC rows that receive resampled errors to ``R == 1`` /
        ``R == 0``.
    interp_weights, extrap_weights : DataFrame
        Weight tables exposing ``id`` and ``draw`` after ``reset_index()``.
        They are not modified.
    cs : str
        Inserted into the weight column name
        ``'wtabc_allids_c' + cs + '_' + weight``.
    abc : DataFrame
        ABC individuals with ``male``, ``R`` and the predictor columns.
        An ``Intercept`` column of ones is written into it in place.
        Assumes the first index level holds the individual id - TODO confirm.
    verbose : bool
        Print one progress line per age.

    Returns
    -------
    tuple
        ``(params_interp, params_extrap, error_mat, projection_interp,
        projection_extrap)``, each a dict keyed by
        ``'pooled'``/``'male'``/``'female'``.
    """
    interp_ages = list(range(22, 30))
    extrap_ages = list(range(31, 68))
    ages = interp_ages + extrap_ages
    sexes = ['pooled', 'male', 'female']
    interp_predictors = list(cols.interp.predictors)
    extrap_predictors = list(cols.extrap.predictors)
    weight_type = 'wtabc_allids_c' + cs + '_' + weight
    weight_col = '_wls_weight'

    def _weights_for(weights, ids):
        # Reshape a copy so the caller's table survives and can be passed
        # to several calls.
        by_id = weights.reset_index()
        del by_id['draw']
        by_id = by_id.set_index('id', drop=True)
        return by_id.loc[ids, weight_type]

    def _split_by_sex(frame, index, weights):
        aux = frame.loc[index, :].copy()
        # Attach every row's weight BEFORE filtering so that the weight
        # stays with its observation through the black / sex subsets.
        # Relies on both selections returning one row per requested label
        # in the same order; a length mismatch raises here.
        aux[weight_col] = weights.values
        # keep only the black observations of the auxiliary sample
        aux = aux.loc[aux.black == 1]
        groups = {
            'pooled': aux,
            'male': aux.loc[aux.male == 1],
            'female': aux.loc[aux.male == 0],
        }
        # dmatrices drops incomplete rows; a plain 0..n-1 index lets the
        # surviving rows be matched back to their weights.
        return dict((sex, grp.reset_index(drop=True))
                    for sex, grp in groups.items())

    def _abc_by_sex():
        return {
            'pooled': abc,
            'male': abc.loc[abc.male == 1],
            'female': abc.loc[abc.male == 0],
        }

    def _empty_params(age_list, predictors):
        return pd.DataFrame(
            np.nan,
            index=pd.Index(age_list, name='age'),
            columns=['Intercept'] + predictors + ['y', 'rmse'])

    params_interp = dict((sex, _empty_params(interp_ages, interp_predictors))
                         for sex in sexes)
    params_extrap = dict((sex, _empty_params(extrap_ages, extrap_predictors))
                         for sex in sexes)
    error_columns = dict((sex, []) for sex in sexes)
    error_mat = {}

    # The auxiliary samples do not depend on age, so build them once.
    interp_samples = _split_by_sex(
        interp, interp_index, _weights_for(interp_weights, interp_index))
    extrap_samples = _split_by_sex(
        extrap, extrap_index,
        _weights_for(extrap_weights, [x[1] for x in extrap_index]))

    # ABC individuals that receive resampled errors.
    error_groups = _abc_by_sex()
    if weight in ('treat', 'control'):
        r_value = 1 if weight == 'treat' else 0
        error_groups = dict((sex, grp.loc[grp.R == r_value])
                            for sex, grp in error_groups.items())

    # obtain parameters for every age
    for age in ages:
        is_interp = age in interp_ages
        if is_interp:
            samples, base, target = (interp_samples, interp_predictors,
                                     params_interp)
            age_x = age - 1
        else:
            samples, base, target = (extrap_samples, extrap_predictors,
                                     params_extrap)
            # there is no age-30 model, so age 31 conditions on age 29
            age_x = 29 if age == 31 else age - 1
        lag = 'inc_labor{}'.format(age_x)
        predictors = base + [lag]
        fmla = '{} ~ {}'.format('inc_labor{}'.format(age),
                                ' + '.join(predictors))
        n_obs = 0

        for sex in sexes:
            data = samples[sex]
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)
            wls_weights = data.loc[exog.index, weight_col].dropna()
            exog = exog.loc[wls_weights.index, :]
            endog = endog.loc[wls_weights.index, :]

            # estimate coefficients; a failed fit is recorded as NaN
            try:
                fit = sm.WLS(endog, exog, weights=wls_weights).fit()
                params, resid = fit.params, fit.resid
                failed = False
            except Exception:
                failed = True
                params = pd.Series(np.nan,
                                   index=['Intercept'] + base + ['y'])
                resid = pd.Series(np.nan, index=endog.index)

            rmse = pd.Series(sqrt((resid * resid).mean(axis=0)),
                             index=['rmse'])
            params = pd.concat([params, rmse], axis=0).rename({lag: 'y'})
            target[sex].loc[age, :] = params

            # resample the errors, one draw per ABC individual
            people = error_groups[sex]
            if failed:
                draws = np.repeat(np.nan, len(people))
            else:
                draws = np.random.choice(resid.values, size=len(people))
            ehat = pd.DataFrame(
                {age: draws},
                index=pd.Index(people.index.get_level_values(0), name='id'))
            ehat.columns.name = 'age'
            error_columns[sex].append(ehat)
            n_obs = exog.shape[0]

        if verbose:
            print('Successful predictions, age {}, n={}'.format(age, n_obs))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in sexes:
        errors = pd.concat(error_columns[sex], axis=1)
        error_mat[sex] = pd.concat([errors, treat], axis=1, join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    # remove errors for ABC individuals for whom we do not predict earnings
    # (any predictor missing).  Computed from the full ABC sample and
    # addressed by explicit age labels rather than column positions.
    is_male = (abc.male == 1).values
    is_female = (abc.male == 0).values
    lacks_interp = abc.loc[:, interp_predictors].isnull().any(axis=1).values
    lacks_extrap = abc.loc[:, extrap_predictors].isnull().any(axis=1).values
    membership = [('male', is_male), ('female', is_female),
                  ('pooled', is_male | is_female)]
    for sex, in_group in membership:
        for lacks, age_cols in [(lacks_interp, interp_ages),
                                (lacks_extrap, extrap_ages)]:
            rows = error_mat[sex].index.isin(abc.index[in_group & lacks])
            if rows.any():
                error_mat[sex].loc[rows, age_cols] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    abc.loc[:, 'Intercept'] = 1
    for sex, abcd in _abc_by_sex().items():
        # reindex creates an all-NaN 'y' column when abc has none
        abcd_interp = abcd.reindex(
            columns=['Intercept'] + interp_predictors + ['y'])
        abcd_extrap = abcd.reindex(
            columns=['Intercept'] + extrap_predictors + ['y'])
        interp_parts = []
        extrap_parts = []
        interp_dot = None

        for age in ages:
            if age in interp_ages:
                if age == 22:
                    # no lagged projection exists yet
                    abcd_interp['y'] = 0
                coefs = pd.DataFrame(params_interp[sex].loc[age].drop('rmse'))
                interp_dot = abcd_interp.dot(coefs) + error_mat[sex][[age]]
                # feed this age's projection into the next age as the lag
                abcd_interp['y'] = interp_dot.iloc[:, 0]
                interp_parts.append(interp_dot)
            else:
                if age == 31:
                    # the lag coefficient is switched off at the first
                    # extrapolation age; this also zeroes it in the
                    # returned params_extrap
                    params_extrap[sex].loc[31, 'y'] = 0
                    abcd_extrap['y'] = interp_dot.iloc[:, 0]
                    abcd_extrap['y'] = abcd_extrap['y'].fillna(0)
                coefs = pd.DataFrame(params_extrap[sex].loc[age].drop('rmse'))
                extrap_dot = abcd_extrap.dot(coefs) + error_mat[sex][[age]]
                abcd_extrap['y'] = extrap_dot.iloc[:, 0]
                extrap_parts.append(extrap_dot)

        projection_interp[sex] = pd.concat(interp_parts, axis=1)
        projection_extrap[sex] = pd.concat(extrap_parts, axis=1)

    return (params_interp, params_extrap, error_mat, projection_interp,
            projection_extrap)