def _get_weights_ec(self, estimand):
    """Compute empirical-calibration weights for the requested estimand.

    Every unit starts with weight 1.0. The group(s) that have to be
    re-weighted for the estimand are then overwritten with the weights
    returned by ``ec.maybe_exact_calibrate`` (called with ``autoscale=True``):

    * ``'ATT'``: control units are calibrated to the treated covariates.
    * ``'ATC'``: treated units are calibrated to the control covariates.
    * ``'ATE'``: both groups are calibrated to the full-sample covariates.

    Any other value leaves every weight at 1.0.

    Args:
        estimand: Name of the estimand, compared against ``'ATT'``,
            ``'ATC'`` and ``'ATE'``.

    Returns:
        A float array with the same shape as the treatment indicator.
    """
    covariates = self._dataset.get_covariates(add_pscore=False)
    treatment = self._dataset.get_treatment()
    is_treated = treatment == 1
    is_control = treatment == 0
    treated_rows = covariates[is_treated]
    control_rows = covariates[is_control]

    def calibrate(source, target):
        # Only the weights are needed; the second return value is discarded.
        calibrated, _ = ec.maybe_exact_calibrate(
            covariates=source, target_covariates=target, autoscale=True)
        return calibrated

    weights = np.ones(treatment.shape, dtype=float)
    if estimand == 'ATT':
        weights[is_control] = calibrate(control_rows, treated_rows)
    elif estimand == 'ATC':
        weights[is_treated] = calibrate(treated_rows, control_rows)
    elif estimand == 'ATE':
        # Run both calibrations before writing any result back.
        treated_weights = calibrate(treated_rows, covariates)
        control_weights = calibrate(control_rows, covariates)
        weights[is_treated] = treated_weights
        weights[is_control] = control_weights
    return weights
def g_ec(wh, xmat, targets, options):
    """Return g, the ratio of calibrated weights to the input weights.

    Thin wrapper around ``ec.maybe_exact_calibrate``: the targets are turned
    into target means by dividing by the weight total, the calibration is run
    with ``wh`` as baseline weights, and the resulting weights are rescaled
    to the weight total and divided by ``wh``.

    Args:
        wh: Baseline weights. Exact zeros are replaced by ``SMALL_POSITIVE``
            before use.
        xmat: Covariate matrix, passed through as ``covariates``.
        targets: Target values; divided by the sum of the weights to obtain
            the target means handed to the calibration.
        options: Mapping providing the keys ``'target_weights'``,
            ``'autoscale'``, ``'objective'`` and ``'increment'``, each
            forwarded to ``ec.maybe_exact_calibrate`` under the same name.

    Returns:
        A 1-D float array ``g``; multiplying ``wh`` by ``g`` is intended to
        reproduce the targets.
    """
    # Zero weights would make the ratio at the end divide by zero.
    base = np.where(wh == 0, SMALL_POSITIVE, wh)
    total = base.sum()
    target_means = (targets / total).reshape((1, -1))

    # Original author's notes: autoscale "doesn't always seem to work well",
    # and QUADRATIC weights can often be zero.
    forwarded = {
        key: options[key]
        for key in ('target_weights', 'autoscale', 'objective', 'increment')
    }

    # The second return value (the l2 norm) is not used by this wrapper.
    calibrated, _l2_norm = ec.maybe_exact_calibrate(
        covariates=xmat,
        target_covariates=target_means,
        baseline_weights=base,
        **forwarded,
    )

    ratio = calibrated * total / base
    return np.array(ratio, dtype=float).reshape((-1,))
def gec(wh, xmat, targets,
        target_weights: np.ndarray = None,
        objective: ec.Objective = ec.Objective.ENTROPY,
        increment: float = 0.001):
    """Calibrate weights to targets and return the l2 norm and the ratio g.

    The targets are divided by the weight total to obtain target means,
    ``ec.maybe_exact_calibrate`` is run with ``wh`` as baseline weights and
    ``autoscale=True``, and the resulting weights are rescaled to the weight
    total and divided by ``wh``.

    Args:
        wh: Baseline weights. Exact zeros are replaced by ``SMALL_POSITIVE``
            before use.
        xmat: Covariate matrix, passed through as ``covariates``.
        targets: Target values; divided by the sum of the weights to obtain
            the target means handed to the calibration.
        target_weights: Accepted for interface compatibility but currently
            NOT forwarded to the calibration.
        objective: Calibration objective (``ec.Objective.ENTROPY`` or
            ``ec.Objective.QUADRATIC``).
        increment: Forwarded as ``increment``.

    Returns:
        A tuple ``(l2_norm, g)`` where ``g`` is a 1-D float array;
        multiplying ``wh`` by ``g`` is intended to reproduce the targets.
    """
    # Zero weights would make the ratio at the end divide by zero.
    base = np.where(wh == 0, SMALL_POSITIVE, wh)
    total = base.sum()
    target_means = (targets / total).reshape((1, -1))

    # Original author's notes: autoscale "doesn't always seem to work well",
    # and QUADRATIC weights can often be zero.
    calibrated, l2_norm = ec.maybe_exact_calibrate(
        covariates=xmat,
        target_covariates=target_means,
        baseline_weights=base,
        autoscale=True,
        objective=objective,
        increment=increment,
    )

    ratio = calibrated * total / base
    g = np.array(ratio, dtype=float).reshape((-1,))
    return l2_norm, g
def gec(wh, xmat, targets, options=None):
    """Run empirical calibration and return a timed result tuple.

    Args:
        wh: Input weights. Exact zeros are replaced by ``SMALL_POSITIVE`` and
            the weights are then replaced by their mean (see note below).
        xmat: Covariate matrix, passed through as ``covariates`` and used to
            compute the achieved targets.
        targets: Target values; divided by the weight total to obtain the
            target means handed to the calibration.
        options: Optional mapping of overrides applied on top of
            ``options_defaults``. An ``'objective'`` of ``'ENTROPY'`` or
            ``'QUADRATIC'`` is translated to the matching module constant.

    Returns:
        A ``Result`` named tuple with the fields ``elapsed_seconds``,
        ``wh_opt``, ``targets_opt``, ``g``, ``opts`` and ``l2_norm``.
    """
    start = timer()

    # Start from the defaults and overlay any user-supplied options.
    merged = options_defaults.copy()
    if options is not None:
        merged.update(options)

    # Translate the string spelling of the objective into its constant.
    if merged['objective'] == 'ENTROPY':
        merged['objective'] = ENTROPY
    elif merged['objective'] == 'QUADRATIC':
        merged['objective'] = QUADRATIC

    # Convert the dict to a named tuple for attribute-style access.
    opts = ut.dict_nt(merged)

    nonzero = np.where(wh == 0, SMALL_POSITIVE, wh)
    # NOTE(review): the weights are replaced by their mean and no
    # baseline_weights are passed below, so g is relative to uniform weights
    # rather than to the caller's wh -- confirm this is intended.
    uniform = np.full(nonzero.shape, nonzero.mean())
    total = uniform.sum()
    target_means = targets / total

    # Original author's notes: autoscale "doesn't always seem to work well",
    # and QUADRATIC weights can often be zero.
    calibrated, l2_norm = ec.maybe_exact_calibrate(
        covariates=xmat,
        target_covariates=target_means.reshape((1, -1)),
        autoscale=opts.autoscale,
        objective=opts.objective,
        increment=opts.increment,
    )

    # The (uniform) weights multiplied by g give the optimized weights.
    g = np.array(calibrated * total / uniform, dtype=float).reshape((-1,))
    wh_opt = g * uniform
    targets_opt = np.dot(xmat.T, wh_opt)

    stop = timer()

    # Bundle everything the caller may need into a named tuple.
    fields = ('elapsed_seconds', 'wh_opt', 'targets_opt', 'g', 'opts',
              'l2_norm')
    Result = namedtuple('Result', fields, defaults=(None,) * len(fields))
    return Result(
        elapsed_seconds=stop - start,
        wh_opt=wh_opt,
        targets_opt=targets_opt,
        g=g,
        opts=opts,
        l2_norm=l2_norm,
    )
def test_target_weights(self):
    """Duplicating target rows must match up-weighting those rows.

    Repeating the first 10 rows of ``self.target_covariates`` should be
    equivalent to giving those rows a target weight of 2 and every other row
    a target weight of 1.
    """
    n_targets = len(self.target_covariates)
    repeated_rows = [*range(10), *range(n_targets)]
    row_weights = [2] * 10 + [1] * (n_targets - 10)

    # Variant 1: physically duplicate the first 10 target rows.
    dup_weights, dup_l2 = ec.maybe_exact_calibrate(
        covariates=self.covariates,
        target_covariates=self.target_covariates[repeated_rows],
    )
    # Variant 2: keep the rows as they are and express the duplication
    # through target_weights.
    wtd_weights, wtd_l2 = ec.maybe_exact_calibrate(
        covariates=self.covariates,
        target_covariates=self.target_covariates,
        target_weights=row_weights,
    )

    self.assertAlmostEqual(dup_l2, wtd_l2)
    weight_gap = np.linalg.norm(dup_weights - wtd_weights)
    self.assertAlmostEqual(0.0, weight_gap)
def test_maybe_exact_calibrate(self, min_feasible_l2_norm,
                               mock_maybe_exact_calibrate):  # pylint: disable=unused-argument
    """Check the l2 norm reported by ``ec.maybe_exact_calibrate``.

    The value under test is stored on ``_mock_calibrate`` and the second
    element of the call's result is expected to equal it.
    ``mock_maybe_exact_calibrate`` is not used in the body; presumably it is
    injected by a patch decorator outside this view -- verify.
    """
    _mock_calibrate.min_feasible_l2_norm = min_feasible_l2_norm

    result = ec.maybe_exact_calibrate(
        covariates=None,
        target_covariates=None,
        target_weights=None,
        autoscale=None,
        objective=None,
        max_weight=None,
        increment=0.01,
    )

    self.assertEqual(result[1], min_feasible_l2_norm)
def test_from_formula(self, objective, target_weights):
    """The formula API must give the same results as the matrix API.

    Args:
        objective: Objective passed unchanged to both APIs.
        target_weights: Target weights passed unchanged to both APIs.
    """
    shared = dict(target_weights=target_weights, objective=objective)

    # Matrix-based API: design matrices are supplied directly.
    matrix_weights, matrix_l2 = ec.maybe_exact_calibrate(
        covariates=self.dmatrix,
        target_covariates=self.target_dmatrix,
        **shared,
    )

    # Formula-based API: takes a formula and the data frames instead.
    formula_weights, formula_l2 = ec.from_formula(
        formula="~ x + y",
        df=self.df,
        target_df=self.target_df,
        **shared,
    )

    np.testing.assert_almost_equal(matrix_weights, formula_weights, decimal=3)
    self.assertAlmostEqual(matrix_l2, formula_l2, places=2)
# %% package example
# Interactive walk-through of ec.maybe_exact_calibrate on the CVXR sample
# data. Download the two .rda files first, e.g. from a notebook:
# !wget -q https://github.com/anqif/CVXR/raw/master/data/dspop.rda
# !wget -q https://github.com/anqif/CVXR/raw/master/data/dssamp.rda
dspop = rdata.conversion.convert(
    rdata.parser.parse_file('dspop.rda'))['dspop']
dssamp = rdata.conversion.convert(
    rdata.parser.parse_file('dssamp.rda'))['dssamp']
type(dspop)  # pandas
dssamp

cols = ['sex', 'age']
weights, l2_norm = ec.maybe_exact_calibrate(
    covariates=dssamp[cols],  # sample, 100 rows
    target_covariates=dspop[cols],  # population, 1000 rows
    objective=ec.Objective.ENTROPY,
)
l2_norm

# weights is an array of length 100 whose sum is 1
weights.sum()

# Weight each sample row; the column sums are then the weighted sample means.
check = np.multiply(dssamp[cols], weights.reshape(-1, 1))
check.sum(axis=0)  # ok, this hits the means
dspop[cols].mean()

# So the weights make the weighted sample means equal the population means.
# For sums, therefore, we have:
dspop[cols].sum()
# NOTE(review): tmeans is not defined in this cell; presumably it comes from
# an earlier cell -- verify.
tmeans * dspop.shape[0]

dspop[cols].sum() / dspop.shape[0]  # this is what we should use as target