def run_parameterized_estimators(
        df, df2=None,
        stopwords=None,
        vocab_size=2000,
        use_counts=False,
        threshold=0.8,
        only_zeros=True,
        inner_alpha='optimal',
        outer_alpha='optimal',
        g_weight=1, Q_weight=1, mlm_weight=1,
        run_cb=False):
    """Run all of the model-based ATE estimators on *df*.

    Estimators: (1) PU-classifier label expansion, (2) plain logistic-regression
    label expansion, and — when ``run_cb`` is True — (3) CausalBert adjustment
    using the proxy treatment and (4) CausalBert using the PU-expanded treatment.

    Args:
        df: DataFrame with columns ``text``, ``T_true``, ``T_proxy``,
            ``C_true``, ``Y_sim``.
        df2: Unused here; kept for interface compatibility.
        stopwords: Optional stopword list forwarded to prepare_covariates().
        vocab_size: Max vocabulary size for the bag-of-words covariates.
        use_counts: If True use raw counts instead of binary indicators.
        threshold: Classifier-probability cutoff for relabeling an example.
        only_zeros: If True, only expand labels among T_proxy == 0 examples.
        inner_alpha / outer_alpha: Regularization strengths for the PU
            classifier's inner/outer models (also used by the plain regression).
        g_weight, Q_weight, mlm_weight: Loss weights for CausalBert heads.
        run_cb: If True, also train the (expensive) CausalBert estimators.

    Returns:
        Tuple ``(ATE_pu, ATE_reg, ATE_cb_Tproxy, ATE_cb_Tplus)``; the last two
        are -1 when ``run_cb`` is False.
    """
    X, vocab, vectorizer = prepare_covariates(df, stopwords, vocab_size, use_counts)
    T_true = df['T_true'].to_numpy()
    T_proxy = df['T_proxy'].to_numpy()

    # PU classifier expansion.
    # NOTE: the original code reassigned only_zeros=True here, silently
    # overriding the caller's argument; that reassignment has been removed.
    pu = label_expansion.PUClassifier(
        inner_alpha=inner_alpha,
        outer_alpha=outer_alpha)
    pu.fit(X, T_proxy)
    T_plus_pu = label_expansion.expand_variable(pu, X, T_proxy,
        threshold=threshold,
        only_zeros=only_zeros)
    ATE_pu = util.ATE_adjusted(df.C_true, T_plus_pu, df.Y_sim)

    # Plain regression expansion.
    reg = SGDClassifier(loss="log", penalty="l2", alpha=outer_alpha)
    reg.fit(X, T_proxy)
    T_plus_reg = label_expansion.expand_variable(reg, X, T_proxy,
        threshold=threshold,
        only_zeros=only_zeros)
    ATE_reg = util.ATE_adjusted(df.C_true, T_plus_reg, df.Y_sim)

    if run_cb:
        # BUG FIX: previously referenced args.Q_weight / args.mlm_weight, but
        # no `args` exists in this scope (NameError). Use the parameters.
        cbw = CausalBert.CausalBertWrapper(g_weight=g_weight,
            Q_weight=Q_weight, mlm_weight=mlm_weight)
        cbw.train(df['text'], df.C_true, df.T_proxy, df.Y_sim, epochs=3)
        ATE_cb_Tproxy = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)

        # Second CausalBert run, this time with the PU-expanded treatment.
        cbw = CausalBert.CausalBertWrapper(g_weight=g_weight,
            Q_weight=Q_weight, mlm_weight=mlm_weight)
        cbw.train(df['text'], df.C_true, T_plus_pu, df.Y_sim, epochs=3)
        ATE_cb_Tplus = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)
    else:
        # Sentinel values when the expensive BERT estimators are skipped.
        ATE_cb_Tproxy, ATE_cb_Tplus = -1, -1

    return ATE_pu, ATE_reg, ATE_cb_Tproxy, ATE_cb_Tplus
def run_experiment(args):
    """Run one seeded experiment and collect every applicable ATE estimate.

    Seeds Python/NumPy/PyTorch RNGs from ``args.seed``, loads data via
    ``get_data(args)``, then computes unadjusted/adjusted/matrix estimates for
    the true treatment (if present), proxy-based estimates plus two label
    expansions (if a proxy is present), and optionally two CausalBert
    estimates when ``args.run_cb`` is set.

    Args:
        args: Namespace with at least ``seed``, ``ina``, ``outa``, ``thre``,
            ``g_weight``, ``Q_weight``, ``mlm_weight``, ``run_cb`` (plus
            whatever ``get_data`` needs).

    Returns:
        Dict mapping estimator name -> ATE estimate.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    df = get_data(args)

    ATE_estimates = []
    # Bound only when a proxy treatment exists; the CausalBert branch below
    # needs it, so we guard on this.
    T_plus_pu = None

    if 'T_true' in df:
        ATE_estimates.append(
            ('unadj_T', util.ATE_unadjusted(df.T_true, df.Y_sim)))
        ATE_estimates.append(
            ('ate_T', util.ATE_adjusted(df.C_true, df.T_true, df.Y_sim)))
        ATE_estimates.append(
            ('ate_matrix', util.ATE_matrix(df.T_true, df.T_proxy, df.C_true, df.Y_sim)))

    if 'T_proxy' in df:
        ATE_estimates.append(
            ('unadj_T_proxy', util.ATE_unadjusted(df.T_proxy, df.Y_sim)))
        ATE_estimates.append(
            ('ate_T_proxy', util.ATE_adjusted(df.C_true, df.T_proxy, df.Y_sim)))

        # Logistic-regression label expansion.
        ATE_T_plus_reg, T_plus_reg = run_label_expansion(df, args,
            inner_alpha=args.ina, outer_alpha=args.outa, threshold=args.thre)
        ATE_estimates.append(('ate_T_plus_reg', ATE_T_plus_reg))

        # Positive-unlabeled (one-class) label expansion.
        ATE_T_plus_pu, T_plus_pu = run_label_expansion(df, args,
            single_class=True,
            inner_alpha=args.ina, outer_alpha=args.outa, threshold=args.thre)
        ATE_estimates.append(('ate_T_plus_pu', ATE_T_plus_pu))

    # BUG FIX: this branch reads df.T_proxy and T_plus_pu, which only exist
    # when the data has a proxy treatment; previously a missing proxy with
    # run_cb set raised a NameError.
    if args.run_cb and T_plus_pu is not None:
        cbw = CausalBert.CausalBertWrapper(g_weight=args.g_weight,
            Q_weight=args.Q_weight, mlm_weight=args.mlm_weight)
        cbw.train(df['text'], df.C_true, df.T_proxy, df.Y_sim, epochs=3)
        ATE_cb_Tproxy = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)
        ATE_estimates.append(('ate_cb_T_proxy', ATE_cb_Tproxy))

        cbw = CausalBert.CausalBertWrapper(g_weight=args.g_weight,
            Q_weight=args.Q_weight, mlm_weight=args.mlm_weight)
        cbw.train(df['text'], df.C_true, T_plus_pu, df.Y_sim, epochs=3)
        ATE_cb_Tplus = cbw.ATE(df.C_true, df['text'], Y=df.Y_sim, platt_scaling=False)
        ATE_estimates.append(('ate_cb_T_plus', ATE_cb_Tplus))

    return dict(ATE_estimates)
def run_label_expansion(df, args, stopwords=None, use_counts=False,
        single_class=False, only_zeros=True, inner_alpha='optimal',
        outer_alpha='optimal', threshold=0.8):
    """Expand the proxy treatment labels with a classifier, then re-estimate.

    Fits either a PU (one-class) classifier or a plain logistic regression on
    bag-of-words covariates, relabels examples whose predicted probability
    exceeds ``threshold``, and returns the covariate-adjusted ATE computed
    with the expanded treatment.

    Args:
        df: DataFrame with ``T_proxy``, ``C_true``, ``Y_sim`` columns.
        args: Namespace providing ``vs`` (vocabulary size).
        stopwords: Optional stopword list for vectorization.
        use_counts: Use raw counts rather than binary indicators.
        single_class: If True use the PU classifier, else logistic regression.
        only_zeros: Restrict relabeling to T_proxy == 0 examples.
        inner_alpha / outer_alpha: Regularization strengths.
        threshold: Probability cutoff for flipping a label.

    Returns:
        Tuple ``(ATE_plus, T_plus)``: the adjusted ATE and the expanded labels.
    """
    covariates, _vocab, _vectorizer = prepare_covariates(
        df, stopwords, args.vs, use_counts)
    proxy_labels = df['T_proxy'].to_numpy()

    if single_class:
        # Positive-unlabeled (one-class) learning.
        model = label_expansion.PUClassifier(
            inner_alpha=inner_alpha,
            outer_alpha=outer_alpha)
    else:
        # Ordinary L2-regularized logistic regression.
        model = SGDClassifier(loss="log", penalty="l2", alpha=outer_alpha)

    model.fit(covariates, proxy_labels)
    T_plus = label_expansion.expand_variable(
        model, covariates, proxy_labels,
        threshold=threshold,
        only_zeros=only_zeros)

    ATE_plus = util.ATE_adjusted(df.C_true, T_plus, df.Y_sim)
    return ATE_plus, T_plus
# if i > 100: break data = (torch.tensor(out[x]) for x in ['W_ids', 'W_len', 'W_mask', 'C', 'T', 'Y']) data = TensorDataset(*data) sampler = RandomSampler( data) if sampler == 'random' else SequentialSampler(data) dataloader = DataLoader(data, sampler=sampler, batch_size=self.batch_size) # collate_fn=collate_CandT) return dataloader if __name__ == '__main__': import pandas as pd df = pd.read_csv(open('TEST_DF')) cb = CausalBertWrapper(batch_size=2, g_weight=0.0, Q_weight=0, mlm_weight=1) cb.train(df['review'], df.C_true, df.T_true, df.Y_sim) # print(cb.inference(df['review'], df.C_true, df.T_proxy)) # print(cb.ATE(df.C_true, df['review'], df.Y_sim, platt_scaling=True)) import util print(util.ATE_adjusted(df.C_true, df.T_true, df.Y_sim)) quit()