def experiment(c, a, y, t, mask, debug=False):
    '''
    Accept data columns from dataset wrapper function and run experiment
    '''
    obs_i = [i for i in range(len(mask)) if not mask[i]]

    # fit on the full data (oracle) and on the complete cases only (naive)
    full_pc = fit_bernoulli(c)
    full_pyac = fit_simple((c, a), y)
    cc_pyac = fit_simple((c[obs_i], a[obs_i]), y[obs_i])
    cc_pc = fit_bernoulli(c[obs_i])

    # multiple-imputation variants, each pooled over 20 imputed datasets
    textless_mi_pyac = textless_mi((c, a, y), mask, 20)
    mi_pyac = mi((c, a, y), t, mask, 20)
    bad_mi_pyac = bad_mis((c, a, y), t, mask, 20)

    # causal effect estimates via the g-formula
    oracle_err = gformula(full_pyac, full_pc)
    naive_err = gformula(cc_pyac, cc_pc)
    textless_err = gformula(textless_mi_pyac, full_pc)
    bad_mi_err = gformula(bad_mi_pyac, full_pc)
    mi_err = gformula(mi_pyac, full_pc)

    if debug:
        print("\tOracle: {:0.3f}".format(oracle_err))
        print("\tNaive: {:0.6f}".format(naive_err))
        print("\tTextless: {:0.6f}".format(textless_err))
        print("\tbad m.i.: {:0.6f}".format(bad_mi_err))
        print("\tm.i.: {:0.6f}".format(mi_err))

    # squared error of each estimator's effect relative to the oracle
    return [(x - oracle_err)**2
            for x in (naive_err, textless_err, bad_mi_err, mi_err)]
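
# A hedged sketch of the backdoor-adjustment ("g-formula") quantity that the
# gformula() calls above estimate, under the assumption that `pyac` maps
# (c, a) pairs to P(Y=1 | A=a, C=c) and `pc` is P(C=1) for a binary confounder.
# The real `gformula` in this module may use different container conventions;
# the name `gformula_sketch` and the dict layout are illustrative only.
def gformula_sketch(pyac, pc):
    # E[Y | do(A=a)] = sum_c P(Y=1 | A=a, C=c) * P(C=c)
    effect = 0.0
    for c_val, p_c in ((1, pc), (0, 1.0 - pc)):
        effect += (pyac[(c_val, 1)] - pyac[(c_val, 0)]) * p_c
    return effect  # average causal effect E[Y | do(A=1)] - E[Y | do(A=0)]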
def mi(truth, t, mask, k):
    '''
    Correct multiple imputation implementation
    '''
    c, a, y = truth
    obs_i = [i for i in range(len(mask)) if not mask[i]]
    missing_i = [i for i in range(len(mask)) if mask[i]]

    # impute P(a | c, y, t) for the missing rows from the observed rows
    a_imputed = impute((c[obs_i], y[obs_i], t[obs_i]), a[obs_i],
                       (c[missing_i], y[missing_i], t[missing_i]))

    def get_imputed_values(imputed_probs):
        # draw one binary value per missing row from its imputed distribution
        vals = []
        for i in range(len(missing_i)):
            w = np.random.choice([0, 1], 1, p=imputed_probs[i])
            vals.append(w)
        return np.squeeze(np.array(vals))

    # refit the outcome model on k imputed datasets
    resamples = []
    for i in range(k):
        imp_a = a.copy()
        imp_a[missing_i] = get_imputed_values(a_imputed)
        imp_pyac = fit_simple((c, imp_a), y)
        resamples.append(imp_pyac)

    # pool by averaging the fitted parameters across the k imputations
    pyac = {key: 0 for key in resamples[0]}
    for key in pyac:
        for resample in resamples:
            pyac[key] += resample[key]
        pyac[key] = pyac[key] / k
    return pyac
def textless_mi(truth, mask, k):
    '''
    Multiple imputation without using text data.
    This corresponds to "no_text" in the paper, Section 5.3.2
    '''
    c, a, y = truth
    obs_i = [i for i in range(len(mask)) if not mask[i]]
    missing_i = [i for i in range(len(mask)) if mask[i]]

    # impute P(a | c, y) for the missing rows, without the text features t
    a_imputed = impute((c[obs_i], y[obs_i]), a[obs_i],
                       (c[missing_i], y[missing_i]))

    def get_imputed_values(imputed_probs):
        # draw one binary value per missing row from its imputed distribution
        vals = []
        for i in range(len(missing_i)):
            w = np.random.choice([0, 1], 1, p=imputed_probs[i])
            vals.append(w)
        return np.squeeze(np.array(vals))

    # refit the outcome model on k imputed datasets
    resamples = []
    for i in range(k):
        imp_a = a.copy()
        imp_a[missing_i] = get_imputed_values(a_imputed)
        imp_pyac = fit_simple((c, imp_a), y)
        resamples.append(imp_pyac)

    # pool by averaging the fitted parameters across the k imputations
    pyac = {key: 0 for key in resamples[0]}
    for key in pyac:
        for resample in resamples:
            pyac[key] += resample[key]
        pyac[key] = pyac[key] / k
    return pyac
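
# A hypothetical sketch of the `impute` interface that mi() and textless_mi()
# above rely on, assuming a scikit-learn logistic regression (the classifier
# named in train_adjust's docstring below). The real `impute` is defined
# elsewhere in this module; the function name, feature stacking, and classifier
# settings here are illustrative assumptions, not the repo's actual code.
def impute_logit_sketch(obs_features, obs_a, missing_features):
    from sklearn.linear_model import LogisticRegression

    def stack(features):
        # stack 1-D columns (c, y) and any 2-D text features (t) side by side
        cols = [np.asarray(f) for f in features]
        n_rows = cols[0].shape[0]
        return np.column_stack([col.reshape(n_rows, -1) for col in cols])

    clf = LogisticRegression().fit(stack(obs_features), obs_a)
    # one [P(a=0), P(a=1)] row per missing unit, matching the
    # np.random.choice([0, 1], p=...) draw in get_imputed_values
    return clf.predict_proba(stack(missing_features))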
def train_adjust(train, test, proxy_i=1, confound_i=(), debug=False):
    '''
    Given train and test data, train a logistic regression classifier to
    impute a proxy for the missing variables, then calculate each
    estimator's squared error against an oracle causal effect estimate.
    '''
    n = test.shape[0]

    # use half the train set for training, half for dev and error calculation
    num_train = train.shape[1] // 2

    truth = test[:3, :]
    new_dist, proxy = impute_and_correct(train, test, n, num_train,
                                         proxy_i, confound_i, debug)

    oracle_effect = gformula(dist_pyac(get_dist(truth)),
                             dist_pc(get_dist(truth)))

    # Instead of training our model for the mismeasurement, just report
    # the causal effect present in the training dataset
    naive_effect = gformula(
        fit_simple(np.transpose(train[:2, :]), train[2, :]),
        fit_bernoulli(train[0, :]))

    misspecified_effect = gformula(dist_pyac(get_dist(proxy)),
                                   dist_pc(get_dist(proxy)))
    corrected_effect = gformula(dist_pyac(new_dist), dist_pc(new_dist))

    if debug:
        print("True dist gives effect: {:0.3f}".format(oracle_effect))
        print("Naive approach gives effect: {:0.3f}".format(naive_effect))
        print("Misspecified dist gives effect: {:0.3f}".format(
            misspecified_effect))
        print("Corrected dist gives effect: {:0.3f}".format(corrected_effect))

    return [(x - oracle_effect)**2
            for x in (naive_effect, misspecified_effect, corrected_effect)]
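
# A minimal, illustrative smoke test for experiment(). It assumes the module's
# top-level imports (numpy as np) and helpers (fit_bernoulli, fit_simple,
# gformula, impute, bad_mis) are available, and that c, a, y are binary 1-D
# arrays, t is a row-indexable feature array standing in for text, and mask
# flags the rows of `a` that are missing. The synthetic data below are purely
# hypothetical and only sketch how the function might be driven.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    n = 500
    c = rng.binomial(1, 0.5, n)                              # binary confounder
    a = rng.binomial(1, 0.3 + 0.4 * c)                       # treatment depends on c
    y = rng.binomial(1, 0.2 + 0.3 * a + 0.2 * c)             # outcome depends on a, c
    t = rng.normal(loc=a[:, None], scale=1.0, size=(n, 5))   # stand-in "text" features
    mask = rng.binomial(1, 0.3, n).astype(bool)              # ~30% of a is missing

    sq_errs = experiment(c, a, y, t, mask, debug=True)
    print("Squared errors (naive, no_text, bad m.i., m.i.):", sq_errs)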