Пример #1
0
def test_weighted_variance():
    x = np.array([1, 2, 3, 4, 5])
    sample_weight_equal = np.ones(len(x))

    var_x = get_weighted_variance(x, sample_weight_equal)
    # should get the same variance with equal sample_weight
    assert var_x == x.var()

    x1 = np.array([1, 2, 3, 4, 4, 5, 5])
    sample_weight_equal = np.ones(len(x1))
    sample_weight = [1, 1, 1, 2, 2]
    var_x2 = get_weighted_variance(x, sample_weight)
    var_x1 = get_weighted_variance(x1, sample_weight_equal)

    # should get the same variance by duplicate the observation based on the sample weight
    assert var_x1 == var_x2
Пример #2
0
    def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
        """Fit the treatment effect and outcome models of the R learner.

        Args:
            X (np.matrix or np.array or pd.Dataframe): a feature matrix
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
                weight of each observation for `effect_learner`. If None, it assumes equal weight.
            verbose (bool, optional): whether to output progress logs
        """
        X, treatment, y = convert_pd_to_np(X, treatment, y)
        check_treatment_vector(treatment, self.control_name)
        if sample_weight is not None:
            assert len(sample_weight) == len(
                y
            ), "Data length must be equal for sample_weight and the input data"
            sample_weight = convert_pd_to_np(sample_weight)
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()

        if p is None:
            self._set_propensity_models(X=X, treatment=treatment, y=y)
            p = self.propensity
        else:
            p = self._format_p(p, self.t_groups)

        self._classes = {group: i for i, group in enumerate(self.t_groups)}
        self.models_tau = {group: deepcopy(self.model_tau) for group in self.t_groups}
        self.vars_c = {}
        self.vars_t = {}

        if verbose:
            logger.info("generating out-of-fold CV outcome estimates")
        yhat = cross_val_predict(self.model_mu, X, y, cv=self.cv, n_jobs=-1)

        for group in self.t_groups:
            mask = (treatment == group) | (treatment == self.control_name)
            treatment_filt = treatment[mask]
            X_filt = X[mask]
            y_filt = y[mask]
            yhat_filt = yhat[mask]
            p_filt = p[group][mask]
            w = (treatment_filt == group).astype(int)

            weight = (w - p_filt) ** 2
            diff_c = y_filt[w == 0] - yhat_filt[w == 0]
            diff_t = y_filt[w == 1] - yhat_filt[w == 1]
            if sample_weight is not None:
                sample_weight_filt = sample_weight[mask]
                sample_weight_filt_c = sample_weight_filt[w == 0]
                sample_weight_filt_t = sample_weight_filt[w == 1]
                self.vars_c[group] = get_weighted_variance(diff_c, sample_weight_filt_c)
                self.vars_t[group] = get_weighted_variance(diff_t, sample_weight_filt_t)
                weight *= sample_weight_filt  # update weight
            else:
                self.vars_c[group] = diff_c.var()
                self.vars_t[group] = diff_t.var()

            if verbose:
                logger.info(
                    "training the treatment effect model for {} with R-loss".format(
                        group
                    )
                )
            self.models_tau[group].fit(
                X_filt, (y_filt - yhat_filt) / (w - p_filt), sample_weight=weight
            )
Пример #3
0
    def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True):
        """Fit the treatment effect and outcome models of the R learner.

        Args:
            X (np.matrix or np.array or pd.Dataframe): a feature matrix
            y (np.array or pd.Series): an outcome vector
            p (np.ndarray or pd.Series or dict, optional): an array of propensity scores of float (0,1) in the
                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of
                float (0,1); if None will run ElasticNetPropensityModel() to generate the propensity scores.
            sample_weight (np.array or pd.Series, optional): an array of sample weights indicating the
                weight of each observation for `effect_learner`. If None, it assumes equal weight.
            verbose (bool, optional): whether to output progress logs
        """
        X, treatment, y = convert_pd_to_np(X, treatment, y)
        check_treatment_vector(treatment, self.control_name)
        # initialize equal sample weight if it's not provided, for simplicity purpose
        sample_weight = (
            convert_pd_to_np(sample_weight)
            if sample_weight is not None
            else convert_pd_to_np(np.ones(len(y)))
        )
        assert len(sample_weight) == len(
            y
        ), "Data length must be equal for sample_weight and the input data"
        self.t_groups = np.unique(treatment[treatment != self.control_name])
        self.t_groups.sort()

        if p is None:
            self._set_propensity_models(X=X, treatment=treatment, y=y)
            p = self.propensity
        else:
            p = self._format_p(p, self.t_groups)

        self._classes = {group: i for i, group in enumerate(self.t_groups)}
        self.models_tau = {group: deepcopy(self.model_tau) for group in self.t_groups}
        self.vars_c = {}
        self.vars_t = {}

        if verbose:
            logger.info("generating out-of-fold CV outcome estimates")
        yhat = cross_val_predict(self.model_mu, X, y, cv=self.cv, n_jobs=-1)

        for group in self.t_groups:
            treatment_mask = (treatment == group) | (treatment == self.control_name)
            treatment_filt = treatment[treatment_mask]
            w = (treatment_filt == group).astype(int)

            X_filt = X[treatment_mask]
            y_filt = y[treatment_mask]
            yhat_filt = yhat[treatment_mask]
            p_filt = p[group][treatment_mask]
            sample_weight_filt = sample_weight[treatment_mask]

            if verbose:
                logger.info(
                    "training the treatment effect model for {} with R-loss".format(
                        group
                    )
                )

            if self.early_stopping:
                (
                    X_train_filt,
                    X_test_filt,
                    y_train_filt,
                    y_test_filt,
                    yhat_train_filt,
                    yhat_test_filt,
                    w_train,
                    w_test,
                    p_train_filt,
                    p_test_filt,
                    sample_weight_train_filt,
                    sample_weight_test_filt,
                ) = train_test_split(
                    X_filt,
                    y_filt,
                    yhat_filt,
                    w,
                    p_filt,
                    sample_weight_filt,
                    test_size=self.test_size,
                    random_state=self.random_state,
                )

                weight = sample_weight_filt
                self.models_tau[group].fit(
                    X=X_train_filt,
                    y=(y_train_filt - yhat_train_filt) / (w_train - p_train_filt),
                    sample_weight=sample_weight_train_filt
                    * ((w_train - p_train_filt) ** 2),
                    eval_set=[
                        (
                            X_test_filt,
                            (y_test_filt - yhat_test_filt) / (w_test - p_test_filt),
                        )
                    ],
                    sample_weight_eval_set=[
                        sample_weight_test_filt * ((w_test - p_test_filt) ** 2)
                    ],
                    eval_metric=self.effect_learner_eval_metric,
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=verbose,
                )

            else:
                self.models_tau[group].fit(
                    X_filt,
                    (y_filt - yhat_filt) / (w - p_filt),
                    sample_weight=sample_weight_filt * ((w - p_filt) ** 2),
                    eval_metric=self.effect_learner_eval_metric,
                )

            diff_c = y_filt[w == 0] - yhat_filt[w == 0]
            diff_t = y_filt[w == 1] - yhat_filt[w == 1]
            sample_weight_filt_c = sample_weight_filt[w == 0]
            sample_weight_filt_t = sample_weight_filt[w == 1]
            self.vars_c[group] = get_weighted_variance(diff_c, sample_weight_filt_c)
            self.vars_t[group] = get_weighted_variance(diff_t, sample_weight_filt_t)