def fit(self, X, a, r, p):
    """
    Fits the Offset Tree estimator to partially-labeled data collected from a different policy.

    Creates ``self.nchoices - 1`` independent copies of ``self.base_algorithm``
    in ``self._oracles`` and dispatches one ``self._fit`` call per copy
    through joblib.

    Parameters
    ----------
    X : array (n_samples, n_features)
        Matrix of covariates for the available data.
    a : array (n_samples), int type
        Arms or actions that were chosen for each observation.
    r : array (n_samples), {0,1}
        Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
    p : array (n_samples)
        Scores or reward estimates that the data-collecting policy gave to
        the actions it chose. Multiplied by ``self.c`` and then bounded
        below by ``self.pmin`` whenever those attributes are not None.
    """
    X, a, r = _check_fit_input(X, a, r)
    p = _check_1d_inp(p)
    assert p.shape[0] == X.shape[0]

    # Optional rescaling, then lower clipping, of the logged scores.
    if self.c is not None:
        p = self.c * p
    if self.pmin is not None:
        p = np.clip(p, a_min=self.pmin, a_max=None)

    n_nodes = self.nchoices - 1
    self._oracles = [deepcopy(self.base_algorithm) for _ in range(n_nodes)]

    # The workers are required to share memory with this process
    # (presumably so that self._fit can update self._oracles in place).
    runner = Parallel(n_jobs=self.njobs, verbose=0, require="sharedmem")
    runner(
        delayed(self._fit)(node, X, a, r, p)
        for node in range(len(self._oracles))
    )
示例#2
0
    def fit(self, X, a, r, p):
        """
        Fits the Offset Tree estimator to partially-labeled data collected from a different policy.

        For every entry of ``self.tree.node_comparisons`` a binary problem is
        built from the observations whose chosen arm belongs to that entry,
        and one copy of ``self.base_algorithm`` is trained on it with
        importance weights. Nodes with no data, or with a single label only,
        get a constant/random stand-in predictor instead.

        Parameters
        ----------
        X : array (n_samples, n_features)
            Matrix of covariates for the available data.
        a : array (n_samples), int type
            Arms or actions that were chosen for each observation.
        r : array (n_samples), {0,1}
            Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
        p : array (n_samples)
            Scores or reward estimates that the data-collecting policy gave to
            the actions it chose. Multiplied by ``self.c`` and then bounded
            below by ``self.pmin`` whenever those attributes are not None.
        """
        X, a, r = _check_fit_input(X, a, r)
        p = _check_1d_inp(p)
        assert p.shape[0] == X.shape[0]

        if self.c is not None:
            p = self.c * p
        if self.pmin is not None:
            p = np.clip(p, a_min=self.pmin, a_max=None)

        n_nodes = self.nchoices - 1
        self._oracles = [deepcopy(self.base_algorithm) for _ in range(n_nodes)]

        for node in range(len(self._oracles)):
            comparison = self.tree.node_comparisons[node]

            # Keep only the rows whose logged arm is listed in slot 0 of
            # this node's comparison.
            in_node = np.in1d(a, comparison[0])
            X_sub = X[in_node, :]
            a_sub = a[in_node]
            r_sub = r[in_node]
            p_sub = p[in_node]

            rewarded = r_sub >= .5

            # Base label: 1 when the arm is listed in slot 2 of the
            # comparison; the label is inverted on rows with reward >= 0.5.
            in_slot_two = np.in1d(a_sub, comparison[2])
            labels = (in_slot_two != rewarded).astype('uint8')

            # Importance weight |r - 0.5| / p, rescaled so that the weights
            # add up to the number of rows in the node.
            weights = (.5 - r_sub) / p_sub
            weights[rewarded] = ((r_sub - .5) / p_sub)[rewarded]
            weights = weights * weights.shape[0] / np.sum(weights)

            n_obs = labels.shape[0]
            n_positive = labels.sum()
            if n_obs == 0:
                self._oracles[node] = _RandomPredictor()
            elif n_positive == n_obs:
                self._oracles[node] = _OnePredictor()
            elif n_positive == 0:
                self._oracles[node] = _ZeroPredictor()
            else:
                self._oracles[node].fit(X_sub, labels, sample_weight=weights)
示例#3
0
    def fit(self, X, a, r, p):
        """
        Fits the Doubly-Robust estimator to partially-labeled data collected from a different policy.

        Builds a per-arm cost matrix ``C`` from ``self.reward_estimator``,
        applies the importance-weighted correction to the entries of the arms
        that were actually chosen, and fits a cost-sensitive oracle
        (``RegressionOneVsRest`` when ``self.method == 'rovr'``, otherwise
        ``WeightedAllPairs``) on ``(X, C)``, stored in ``self.oracle``.

        Parameters
        ----------
        X : array (n_samples, n_features)
            Matrix of covariates for the available data.
        a : array (n_samples), int type
            Arms or actions that were chosen for each observation.
        r : array (n_samples), {0,1}
            Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
        p : array (n_samples)
            Scores or reward estimates that the data-collecting policy gave to
            the actions it chose. Multiplied by ``self.c`` and then bounded
            below by ``self.pmin`` whenever those attributes are not None.

        Raises
        ------
        ValueError
            If ``self.reward_estimator`` is neither an array nor an object
            exposing 'predict_proba_separate' or 'predict_proba'.
        """
        X, a, r = _check_fit_input(X, a, r)
        p = _check_1d_inp(p)
        assert p.shape[0] == X.shape[0]
        loss = -r

        if isinstance(self.reward_estimator, np.ndarray):
            # Work on a private copy: C is modified in place further down,
            # and the array stored on the estimator must stay intact so that
            # repeated calls to fit() start from the same estimates.
            C = self.reward_estimator.copy()
        elif 'predict_proba_separate' in dir(self.reward_estimator):
            C = -self.reward_estimator.predict_proba_separate(X)
        elif 'predict_proba' in dir(self.reward_estimator):
            # Plain classifier: fit one copy per arm on this same data.
            reward_estimator = SeparateClassifiers(self.reward_estimator,
                                                   self.nchoices)
            reward_estimator.fit(X, a, r)
            C = -reward_estimator.predict_proba_separate(X)
        else:
            raise ValueError(
                "Error: couldn't obtain reward estimates. Are you passing the right input to 'reward_estimator'?"
            )

        if self.handle_invalid:
            # Replace entries that are exactly 1 or exactly 0 with random
            # Beta(3,1) / Beta(1,3) draws.
            # NOTE(review): in the two model branches C holds *negated*
            # probabilities, so the `== 1` test presumably never matches
            # there and zeros are replaced by positive draws -- confirm the
            # intended sign convention before relying on this.
            is_one = C == 1
            C[is_one] = np.random.beta(3, 1, size=C.shape)[is_one]
            is_zero = C == 0
            C[is_zero] = np.random.beta(1, 3, size=C.shape)[is_zero]

        if self.c is not None:
            p = self.c * p
        if self.pmin is not None:
            p = np.clip(p, a_min=self.pmin, a_max=None)

        # Doubly-robust correction, applied only to the chosen arm's entry:
        # C[i, a_i] += (loss_i - C[i, a_i]) / p_i
        rows = np.arange(C.shape[0])
        C[rows, a] += (loss - C[rows, a]) / p.reshape(-1)

        if self.method == 'rovr':
            self.oracle = RegressionOneVsRest(self.base_algorithm)
        else:
            self.oracle = WeightedAllPairs(self.base_algorithm)
        self.oracle.fit(X, C)
def evaluateRejectionSampling(policy,
                              X,
                              a,
                              r,
                              online=False,
                              start_point_online='random',
                              batch_size=10):
    """
    Evaluate a policy using rejection sampling on test data.

    Only the observations on which the policy picks the same arm that was
    logged are kept; the estimate is the mean reward over those.

    Note
    ----
    In order for this method to be unbiased, the actions on the test sample must have been
    collected at random and not according to some other policy.

    Parameters
    ----------
    policy : obj
        Policy to be evaluated (already fitted to data). Must have a 'predict' method.
        If it is an online policy, it must also have a 'fit' method.
    X : array (n_samples, n_features)
        Matrix of covariates for the available data.
    a : array (n_samples), int type
        Arms or actions that were chosen for each observation.
    r : array (n_samples), {0,1}
        Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
    online : bool
        Whether this is an online policy to be evaluated by refitting it to the data
        as it makes choices on it.
    start_point_online : str 'random', int in [0, n_samples] or float in [0, 1]
        Point at which to start evaluating cases in the sample; rows before it
        are visited after the end of the sample is reached. A float is taken
        as a fraction of the sample and converted to ``int(fraction * n_samples)``.
        Only used when passing online=True.
    batch_size : int
        After how many accepted (matching) rounds to refit the policy being evaluated.
        Only used when passing online=True.

    Returns
    -------
    result : tuple (float, int)
        Estimated mean reward and number of observations taken.
        With online=False and no matching observation, the mean is taken over
        an empty selection (NumPy yields nan for that).

    Raises
    ------
    ValueError
        If 'start_point_online' has an unsupported type, if (with online=True)
        it lies outside its valid range, or if (with online=True) no
        observation matched the policy's choices.

    References
    ----------
    .. [1] Li, Lihong, et al. "A contextual-bandit approach to personalized news article recommendation."
           Proceedings of the 19th international conference on World wide web. ACM, 2010.
    """
    X, a, r = _check_fit_input(X, a, r)
    n_samples = X.shape[0]
    bad_start_msg = "'start_point_online' must be one of 'random', float [0,1] or int [0, sample_size]"

    if start_point_online == 'random':
        start_point_online = np.random.randint(n_samples)
    elif not isinstance(start_point_online,
                        (int, float, np.integer, np.floating)):
        raise ValueError(bad_start_msg)

    if not online:
        pred = policy.predict(X)
        match = pred == a
        return (np.mean(r[match]), match.sum())

    # Turn the starting point into a valid row index. range() only accepts
    # integers, so a fractional start has to be converted explicitly, and an
    # out-of-range start would either re-visit rows or index past the end.
    if isinstance(start_point_online, (float, np.floating)):
        if not 0 <= start_point_online <= 1:
            raise ValueError(bad_start_msg)
        start = int(start_point_online * n_samples)
    else:
        start = int(start_point_online)
        if not 0 <= start <= n_samples:
            raise ValueError(bad_start_msg)

    cum_r = 0
    cum_n = 0
    ix_chosen = []
    # Initial fit on zero rows, before the policy has seen any observation.
    policy.fit(X[:0, :], a[:0], r[:0])

    # Walk from the starting point to the end, then wrap around to the rows
    # that precede the starting point.
    for segment in (range(start, n_samples), range(start)):
        for i in segment:
            obs = X[i, :].reshape(1, -1)
            would_choose = policy.predict(obs)[0]
            if would_choose != a[i]:
                continue
            cum_r += r[i]
            cum_n += 1
            ix_chosen.append(i)
            # Refit on every accepted row so far, once per full batch.
            if (cum_n % batch_size) == 0:
                ix_fit = np.array(ix_chosen)
                policy.fit(X[ix_fit, :], a[ix_fit], r[ix_fit])

    if cum_n == 0:
        raise ValueError(
            "Rejection sampling couldn't obtain any matching samples.")
    return (cum_r / cum_n, cum_n)
def evaluateDoublyRobust(pred,
                         X,
                         a,
                         r,
                         p,
                         reward_estimator,
                         nchoices=None,
                         handle_invalid=True,
                         c=None,
                         pmin=1e-5):
    """
    Doubly-Robust Policy Evaluation

    Evaluates rewards of arm choices of a policy from data collected by another policy.

    Note
    ----
    This method requires to form reward estimates of the arms that were chosen and of the arms
    that the policy to be evaluated would choose. In order to do so, you can either provide
    estimates as an array (see Parameters), or pass a model.

    One method to obtain reward estimates is to fit a model to both the training and test data
    and use its predictions as reward estimates. You can do so by passing an object of class
    `contextualbandits.online.SeparateClassifiers` which should be already fitted.

    Another method is to fit a model to the test data, in which case you can pass a classifier
    with a 'predict_proba' method here, which will be fit to the same test data passed to this
    function to obtain reward estimates.

    The last two options can suffer from invalid predictions if there are some arms for which every time
    they were chosen they resulted in a reward, or never resulted in a reward. In such cases,
    this function includes the option to impute the "predictions" for them (which would otherwise
    always be exactly zero or one regardless of the context) by replacing them with random
    numbers ~Beta(3,1) or ~Beta(1,3) for the cases of always good and always bad.

    This is just a wild idea though, and doesn't guarantee reasonable results in such situation.

    Note that, if you are using the 'SeparateClassifiers' class from the online module in this
    same package, it comes with a method 'predict_proba_separate' that can be used to get reward
    estimates. It still can suffer from the same problem of always-one and always-zero predictions though.

    Parameters
    ----------
    pred : array (n_samples,)
        Arms that would be chosen by the policy to evaluate.
    X : array (n_samples, n_features)
        Matrix of covariates for the available data.
    a : array (n_samples), int type
        Arms or actions that were chosen for each observation.
    r : array (n_samples), {0,1}
        Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
    p : array (n_samples)
        Scores or reward estimates from the policy that generated the data for the actions
        that were chosen by it.
    reward_estimator : obj or array (n_samples, 2)
        One of the following:
            * An array with the first column corresponding to the reward estimates for the action chosen
              by the new policy, and the second column corresponding to the reward estimates for the
              action chosen in the data (see Note for details). The array is not modified.
            * An already-fit object of class 'contextualbandits.online.SeparateClassifiers', which will
              be used to make predictions on the actions chosen and the actions that the new
              policy would choose.
            * A classifier with a 'predict_proba' method, which will be fit to the same test data
              passed here in order to obtain reward estimates (see Note for details).
    nchoices : int
        Number of arms/labels to choose from.
        Only used when passing a classifier object to 'reward_estimator'.
    handle_invalid : bool
        Whether to replace 0/1 estimated rewards with randomly-generated numbers (see Note)
    c : None or float
        Constant by which to multiply all scores from the exploration policy.
    pmin : None or float
        Lower bound for the scores from the exploration policy: after the
        optional multiplication by 'c', scores below 'pmin' are raised to 'pmin'.

    Returns
    -------
    est : float
        Mean over all observations of the doubly-robust reward estimate.

    Raises
    ------
    ValueError
        If 'reward_estimator' is neither an array nor an object exposing
        'predict_proba_separate' or 'predict_proba'.

    References
    ----------
    .. [1] Dudík, Miroslav, John Langford, and Lihong Li. "Doubly robust policy evaluation and learning."
           arXiv preprint arXiv:1103.4601 (2011).
    """
    X, a, r = _check_fit_input(X, a, r)
    p = _check_1d_inp(p)
    pred = _check_1d_inp(pred)
    assert p.shape[0] == X.shape[0]
    assert pred.shape[0] == X.shape[0]
    # Plain and NumPy integers are accepted too, not only floats.
    numeric_types = (int, float, np.integer, np.floating)
    if c is not None:
        assert isinstance(c, numeric_types)
    if pmin is not None:
        assert isinstance(pmin, numeric_types)

    if isinstance(reward_estimator, np.ndarray):
        assert reward_estimator.shape[1] == 2
        assert reward_estimator.shape[0] == X.shape[0]
        # Column slices are views of the caller's array; copy them because
        # both vectors are modified in place below.
        rhat_new = reward_estimator[:, 0].copy()
        rhat_old = reward_estimator[:, 1].copy()
    elif ('predict_proba_separate' in dir(reward_estimator)
          or 'predict_proba' in dir(reward_estimator)):
        if 'predict_proba_separate' not in dir(reward_estimator):
            # Plain classifier: fit one copy per arm on this same data.
            reward_estimator = SeparateClassifiers(reward_estimator, nchoices)
            reward_estimator.fit(X, a, r)
        rhat = reward_estimator.predict_proba_separate(X)
        rows = np.arange(rhat.shape[0])
        rhat_new = rhat[rows, pred]
        rhat_old = rhat[rows, a]
    else:
        error_msg = "'reward_estimator' must be either an array, a classifier with "
        error_msg += "'predict_proba', or a 'SeparateClassifiers' object."
        raise ValueError(error_msg)

    if handle_invalid:
        # Estimates that are exactly 1 / exactly 0 are replaced with random
        # Beta(3,1) / Beta(1,3) draws (new-policy estimates first, then the
        # logged-action estimates).
        for estimates in (rhat_new, rhat_old):
            is_one = estimates == 1
            estimates[is_one] = np.random.beta(
                3, 1, size=estimates.shape)[is_one]
            is_zero = estimates == 0
            estimates[is_zero] = np.random.beta(
                1, 3, size=estimates.shape)[is_zero]

    if c is not None:
        p = c * p
    if pmin is not None:
        p = np.clip(p, a_min=pmin, a_max=None)

    # Start from the model-based estimate for the new policy's arm and add
    # the importance-weighted residual wherever it agrees with the logged arm.
    actions_matching = pred == a
    out = rhat_new
    out[actions_matching] += (r[actions_matching] - rhat_old[actions_matching]
                              ) / p[actions_matching].reshape(-1)

    return np.mean(out)