Exemplo n.º 1
0
def experiment(c, a, y, t, mask, debug=False):
    """
    Run every estimator on one dataset and score it against the oracle.

    Args:
        c, a, y: data columns. They are indexed with lists of integer
            positions below, so they must support that kind of indexing
            (presumably 1-D numpy arrays -- confirm with the caller).
        t: extra column forwarded only to ``mi`` and ``bad_mis``
            (presumably the text features -- confirm with the caller).
        mask: per-row flags; a truthy entry excludes that row from the
            complete-case fits.
        debug: when true, print every estimate.

    Returns:
        A list of squared differences from the oracle estimate, in the
        order: naive (complete-case), textless m.i., bad m.i., m.i.
    """
    # Rows whose mask flag is falsy make up the complete-case subset.
    kept = [row for row, flagged in enumerate(mask) if not flagged]

    # Model fits. The call order is deliberately left as-is: the imputation
    # routines draw from numpy's global RNG, so reordering them would change
    # the results obtained under a fixed seed.
    pc_all = fit_bernoulli(c)
    pyac_all = fit_simple((c, a), y)
    pyac_kept = fit_simple((c[kept], a[kept]), y[kept])
    pc_kept = fit_bernoulli(c[kept])
    pyac_textless = textless_mi((c, a, y), mask, 20)
    pyac_mi = mi((c, a, y), t, mask, 20)
    pyac_bad_mi = bad_mis((c, a, y), t, mask, 20)

    # One g-formula estimate per fitted model; only the naive estimate uses
    # the complete-case fit of c, all others use the fit on the full column.
    oracle = gformula(pyac_all, pc_all)
    naive = gformula(pyac_kept, pc_kept)
    textless = gformula(pyac_textless, pc_all)
    bad_imputed = gformula(pyac_bad_mi, pc_all)
    imputed = gformula(pyac_mi, pc_all)

    if debug:
        report = (
            ("\tOracle: {:0.3f}", oracle),
            ("\tNaive: {:0.6f}", naive),
            ("\tTextless: {:0.6f}", textless),
            ("\tbad m.i.: {:0.6f}", bad_imputed),
            ("\tm.i.: {:0.6f}", imputed),
        )
        for template, value in report:
            print(template.format(value))

    estimates = (naive, textless, bad_imputed, imputed)
    return [(estimate - oracle)**2 for estimate in estimates]
Exemplo n.º 2
0
def mi(truth, t, mask, k):
    """
    Multiple imputation of the masked entries of ``a`` using c, y and t.

    Args:
        truth: a ``(c, a, y)`` triple of data columns; each is indexed with
            lists of integer positions (presumably numpy arrays -- confirm).
        t: additional predictor column passed to ``impute`` alongside c and y.
        mask: per-row flags; a truthy entry marks ``a`` as missing there.
        k: number of imputed datasets to draw and fit.

    Returns:
        A dict with the keys of the first ``fit_simple`` result, each value
        being the mean of that entry over the ``k`` fits.
    """
    c, a, y = truth

    # Split row positions by mask flag: falsy -> observed, truthy -> missing.
    observed, missing = [], []
    for pos, flagged in enumerate(mask):
        (missing if flagged else observed).append(pos)

    # ``impute`` is fitted on the observed rows and queried on the missing
    # ones; each row of its result is used below as the probability vector
    # over the two values [0, 1].
    probs = impute((c[observed], y[observed], t[observed]), a[observed],
                   (c[missing], y[missing], t[missing]))

    fits = []
    for _ in range(k):
        filled = a.copy()
        # One independent draw per missing row, in row order.
        draws = [np.random.choice([0, 1], 1, p=probs[j])
                 for j in range(len(missing))]
        filled[missing] = np.squeeze(np.array(draws))
        fits.append(fit_simple((c, filled), y))

    # Average every fitted entry across the k imputed datasets.
    return {key: sum(fit[key] for fit in fits) / k for key in fits[0]}
Exemplo n.º 3
0
def textless_mi(truth, mask, k):
    """
    Multiple imputation of the masked entries of ``a`` without text data.

    Only c and y are given to ``impute`` as predictors. This corresponds to
    "no_text" in the paper, Section 5.3.2.

    Args:
        truth: a ``(c, a, y)`` triple of data columns; each is indexed with
            lists of integer positions (presumably numpy arrays -- confirm).
        mask: per-row flags; a truthy entry marks ``a`` as missing there.
        k: number of imputed datasets to draw and fit.

    Returns:
        A dict with the keys of the first ``fit_simple`` result, each value
        being the mean of that entry over the ``k`` fits.
    """
    c, a, y = truth

    obs_i = [i for i, masked in enumerate(mask) if not masked]
    missing_i = [i for i, masked in enumerate(mask) if masked]

    # ``impute`` is fitted on the observed rows and queried on the missing
    # ones; each row of its result is used below as the probability vector
    # over the two values [0, 1].
    a_imputed = impute((c[obs_i], y[obs_i]), a[obs_i],
                       (c[missing_i], y[missing_i]))

    resamples = []
    for _ in range(k):
        imp_a = a.copy()
        # One independent draw per missing row, in row order.
        draws = [np.random.choice([0, 1], 1, p=a_imputed[j])
                 for j in range(len(missing_i))]
        imp_a[missing_i] = np.squeeze(np.array(draws))
        resamples.append(fit_simple((c, imp_a), y))

    # Average every fitted entry across the k imputed datasets.
    return {key: sum(resample[key] for resample in resamples) / k
            for key in resamples[0]}
Exemplo n.º 4
0
def train_adjust(train, test, proxy_i=1, confound_i=(), debug=False):
    """
    Compare causal-effect estimates on ``test`` against an oracle estimate.

    The imputation/correction step itself is delegated to
    ``impute_and_correct`` (described upstream as training a logistic
    regression classifier to impute a proxy for the missing variables).

    Args:
        train, test: 2-D arrays; the first three rows of ``test`` are
            treated as the ground truth, and rows 0-2 of ``train`` feed the
            naive estimate.
        proxy_i, confound_i, debug: forwarded unchanged to
            ``impute_and_correct``; ``debug`` also enables the prints below.

    Returns:
        A list of squared differences from the oracle effect, in the order:
        naive, misspecified, corrected.
    """
    # NOTE(review): this reads axis 0 of ``test`` but axis 1 of ``train``;
    # confirm that ``impute_and_correct`` expects exactly these values.
    n = test.shape[0]

    # Half of the train set is for training, the other half for dev and
    # error calculation.
    num_train = train.shape[1] // 2

    truth = test[:3, :]
    new_dist, proxy = impute_and_correct(
        train, test, n, num_train, proxy_i, confound_i, debug)

    # Oracle: effect computed from the ground-truth rows.
    truth_pyac = dist_pyac(get_dist(truth))
    truth_pc = dist_pc(get_dist(truth))
    oracle_effect = gformula(truth_pyac, truth_pc)

    # Naive: skip modelling the mismeasurement and just report the causal
    # effect present in the training dataset.
    train_pyac = fit_simple(np.transpose(train[:2, :]), train[2, :])
    train_pc = fit_bernoulli(train[0, :])
    naive_effect = gformula(train_pyac, train_pc)

    # Misspecified: effect computed from the proxy data as-is.
    proxy_pyac = dist_pyac(get_dist(proxy))
    proxy_pc = dist_pc(get_dist(proxy))
    misspecified_effect = gformula(proxy_pyac, proxy_pc)

    # Corrected: effect computed from the distribution returned by
    # ``impute_and_correct``.
    corrected_effect = gformula(dist_pyac(new_dist), dist_pc(new_dist))

    if debug:
        report = (
            ("True dist gives effect: {:0.3f}", oracle_effect),
            ("Naive approach gives effect: {:0.3f}", naive_effect),
            ("Misspecified dist gives effect: {:0.3f}", misspecified_effect),
            ("corrected dist gives effect: {:0.3f}", corrected_effect),
        )
        for template, value in report:
            print(template.format(value))

    estimates = (naive_effect, misspecified_effect, corrected_effect)
    return [(effect - oracle_effect)**2 for effect in estimates]