Example #1
class CorralLearner(Learner):
    """A meta-learner that takes a collection of learners and determines
    which is best in an environment.
    
    This is an implementation of the Agarwal et al. (2017) Corral algorithm
    and requires that the reward is always in [0,1].

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. 
        "Corralling a band of bandit algorithms." In Conference on Learning 
        Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self, 
        learners: Sequence[Learner], 
        eta     : float = 0.075,
        T       : float = math.inf, 
        mode    : Literal["importance","rejection","off-policy"] = "importance",
        seed    : int = 1) -> None:
        """Instantiate a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral shifts weight towards the best base learner.
            T: The number of interactions expected during the learning process. A small T will cause
                the learning rate to shrink towards 0 quickly while a large value for T will cause the
                learning rate to shrink towards 0 slowly. A value of inf means that the learning rate
                will remain constant.
            mode: Determines the method with which feedback is provided to the base learners. The 
                original paper used importance sampling. We also support `off-policy` and `rejection`.
            seed: A seed for random number generation in order to get repeatable results.
        """
        if mode not in ["importance", "off-policy", "rejection"]:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        self._base_learners = [ SafeLearner(learner) for learner in learners]

        M = len(self._base_learners)

        self._T     = T
        self._gamma = 1/T                          # uniform mixing rate for p_bar (0 when T is inf)
        self._beta  = 1/math.exp(1/math.log(T))    # multiplier applied to a base learner's eta when its 1/p_bar exceeds rho

        self._eta_init = eta
        self._etas     = [ eta ] * M        # per-learner learning rates
        self._rhos     = [ float(2*M) ] * M # per-learner thresholds on 1/p_bar
        self._ps       = [ 1/M ] * M        # log-barrier OMD weights
        self._p_bars   = [ 1/M ] * M        # smoothed weights used to mix and sample base learners

        self._mode = mode

        self._random_pick   = CobaRandom(seed)                              # samples base learners' actions in "importance" mode
        self._random_reject = CobaRandom(CobaRandom(seed).randint(0,10000)) # draws the accept/reject value in "rejection" mode

    @property
    def params(self) -> Dict[str, Any]:
        return { "family": "corral", "eta": self._eta_init, "mode":self._mode, "T": self._T, "B": [ str(b) for b in self._base_learners ], "seed":self._random_pick._seed }

    def predict(self, context: Context, actions: Sequence[Action]) -> Tuple[Probs, Info]:

        base_predicts = [ base_algorithm.predict(context, actions) for base_algorithm in self._base_learners ]
        base_predicts, base_infos = zip(*base_predicts)

        if self._mode in ["importance"]:
            base_actions = [ self._random_pick.choice(actions, predict) for predict in base_predicts              ]
            base_probs   = [ predict[actions.index(action)] for action,predict in zip(base_actions,base_predicts) ]

            predict = [ sum([p_b*int(a==b_a) for p_b,b_a in zip(self._p_bars, base_actions)]) for a in actions ]
            info    = (base_actions, base_probs, base_infos, base_predicts, actions, predict)

        if self._mode in ["off-policy", "rejection"]:
            predict = [ sum([p_b*b_p[i] for p_b,b_p in zip(self._p_bars, base_predicts)]) for i in range(len(actions)) ]
            info    = (None, None, base_infos, base_predicts, actions, predict)

        return (predict, info)

    def learn(self, context: Context, action: Action, reward: float, probability:float, info: Info) -> None:

        assert 0 <= reward <= 1, "This Corral implementation assumes a reward in [0,1]"

        base_actions = info[0]
        base_probs   = info[1]
        base_infos   = info[2]
        base_preds   = info[3]
        actions      = info[4]
        predict      = info[5]

        if self._mode == "importance":
            # This is what is in the original paper. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a reward estimator with higher variance and no bias (aka, importance sampling)
            #   > It is "on-policy" with respect to base learner's prediction distributions
            # The reward, R, supplied to the base learners satisfies E[R|context,A] = E[reward|context,A]
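            # For example (illustrative numbers): with reward=0.8 and probability=0.25 for the
            # played action, a base learner whose sampled action A matches the played action is
            # given R = 0.8/0.25 = 3.2 and every other base learner is given R = 0. Averaged over
            # Corral's play distribution this importance weighting is unbiased for the true reward.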
            for learner, A, P, base_info in zip(self._base_learners, base_actions, base_probs, base_infos):
                R = reward * int(A==action)/probability
                learner.learn(context, A, R, P, base_info)

        if self._mode == "off-policy":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward)
            #   > It is "off-policy" (i.e., base learners receive action feedback distributed differently from their predicts).
            for learner, base_info in zip(self._base_learners, base_infos):
                learner.learn(context, action, reward, probability, base_info)

        if self._mode == "rejection":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It doesn't necessarily provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward) when it does provide feedback
            #   > It is "on-policy" (i.e., the action feedback base learners receive is distributed identically to their predicts).
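            # For example (illustrative numbers): if a base learner's PMF over two actions is
            # f = [0.7, 0.3] while Corral's mixture PMF is g = [0.5, 0.5], then M = 1.4 and the
            # played action is accepted with probability f(a)/(M*g(a)), i.e. 1.0 when f(a)=0.7
            # and roughly 0.43 when f(a)=0.3, so accepted actions are distributed according to f.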
            p = self._random_reject.random() # a single uniform draw shared across every base learner's rejection test
            for learner, base_info, base_predict in zip(self._base_learners, base_infos, base_preds):
                f = lambda a: base_predict[actions.index(a)] #the PMF we want
                g = lambda a: predict[actions.index(a)]      #the PMF we have
                
                M = max([f(A)/g(A) for A in actions if g(A) > 0])
                if p <= f(action)/(M*g(action)):
                    learner.learn(context, action, reward, f(action), base_info)

        # Instant loss is an unbiased estimate of E[loss|learner] for this iteration.
        # Our estimate differs from the original Corral paper because we have access to the
        # action probabilities of the base learners while the Corral paper did not assume
        # access to this information. This information allows for a loss estimator with the same
        # expectation as the original Corral paper's estimator but with a lower variance.
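        # For example (illustrative numbers): with three base learners whose predicted probabilities
        # for the played action are [0.2, 0.5, 0.3] and uniform p_bars, the mixture probability is
        # (0.2+0.5+0.3)/3 = 1/3, so a loss of 0.4 yields instant losses of [0.24, 0.60, 0.36].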

        loss = 1-reward

        picked_index = actions.index(action)
        instant_loss = [ loss * base_pred[picked_index]/probability for base_pred in base_preds ]
        self._ps     = CorralLearner._log_barrier_omd(self._ps, instant_loss, self._etas)
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta
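        # The loop above is Corral's learning rate schedule: whenever a base learner's sampling
        # probability falls low enough that 1/p_bar exceeds its threshold rho, the threshold is
        # doubled and that learner's eta is rescaled by beta.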

        base_predict_data = { f"predict_{i}": base_preds[i][picked_index] for i in range(len(self._base_learners)) }
        base_pbar_data    = { f"pbar_{i}"   : self._p_bars[i]             for i in range(len(self._base_learners)) }
        predict_data      = { "predict"     : probability, **base_predict_data, **base_pbar_data }

        InteractionContext.learner_info.update({**predict_data, **base_predict_data, **base_pbar_data})

    @staticmethod
    def _log_barrier_omd(ps, losses, etas) -> Sequence[float]:

        f  = lambda l: float(sum( [ 1/((1/p) + eta*(loss-l)) for p, eta, loss in zip(ps, etas, losses)]))
        df = lambda l: float(sum( [ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(ps, etas, losses)]))
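        # f(l) is sum_i 1/((1/p_i) + eta_i*(loss_i - l)); we search for the lambda that makes
        # f(lambda) == 1 so that the updated weights at the end form a probability distribution.
        # df is f's derivative and is kept only for reference since binary search is used below.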

        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(ps, etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def binary_search(l,r) -> Optional[float]:
            # in theory the sign-change check in find_root_of_1 guarantees a root exists in (l, r)
            while True:

                x = (l+r)/2
                y = f(x)

                if round(y,precision) == 1:
                    return x

                if y < 1:
                    l = x

                if y > 1:
                    r = x

        def find_root_of_1():
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                
                if (f(l_brack+.00001)-1) * (f(r_brack-.00001)-1) >= 0:
                    continue
                else:
                    # we use binary search because Newton's
                    # method can overshoot our objective
                    return binary_search(l_brack, r_brack)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss),precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss),precision) == 1:
            lmbda = max_loss
        else:
            lmbda = find_root_of_1()

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral OMD {ps}, {etas}, {losses}')

        new_ps = [ 1/((1/p) + eta*(loss-lmbda)) for p, eta, loss in zip(ps, etas, losses)]

        assert round(sum(new_ps),precision) == 1, "An invalid update was made by the log barrier in Corral"

        return new_ps
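
A hedged usage sketch for CorralLearner. The base learner names below are assumptions about what coba's learner module provides; substitute any Learner implementations available in your installation.

# from coba.learners import EpsilonBanditLearner, UcbBanditLearner  # assumed imports
#
# corral = CorralLearner(
#     learners = [EpsilonBanditLearner(), UcbBanditLearner()],
#     eta      = 0.075,
#     T        = 4000,            # expected number of interactions
#     mode     = "importance",    # the feedback scheme used in the original paper
#     seed     = 1,
# )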
Example #2
    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 n_neighborhoods: int = 10,
                 seed: int = 1) -> None:
        """Instantiate a NeighborsSyntheticSimulation.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            n_neighborhoods: The number of neighborhoods the simulation should have.
            seed: The random number seed used to generate all contexts and action rewards.
        """

        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, n_neighborhoods, seed)

        self._n_interactions = n_interactions
        self._n_actions = n_actions
        self._n_context_feats = n_context_features
        self._n_action_feats = n_action_features
        self._n_neighborhoods = n_neighborhoods
        self._seed = seed

        rng = CobaRandom(self._seed)

        def context_gen():
            return tuple(rng.gausses(n_context_features, 0, 1)) if n_context_features else None

        def actions_gen():
            if not n_action_features:
                return OneHotEncoder().fit_encodes(range(n_actions))
            else:
                return [ tuple(rng.gausses(n_action_features, 0, 1)) for _ in range(n_actions) ]

        contexts               = list(set([context_gen() for _ in range(self._n_neighborhoods)]))
        context_actions        = { c: actions_gen() for c in contexts }
        context_action_rewards = { (c,a): rng.random() for c in contexts for a in context_actions[c] }

        context_iter = iter(islice(cycle(contexts), n_interactions))
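        # contexts are served in a fixed round-robin order, so each neighborhood's context
        # (and therefore its fixed action set and rewards) recurs throughout the simulation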

        def context(index: int):
            return next(context_iter)

        def actions(index: int, context: Tuple[float, ...]):
            return context_actions[context]

        def reward(index: int, context: Tuple[float, ...], action: Tuple[int, ...]):
            return context_action_rewards[(context, action)]

        super().__init__(self._n_interactions, context, actions, reward)
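
A hedged usage sketch for the constructor above. The `read()` iteration call is an assumption about the coba simulation interface this class inherits from.

# sim = NeighborsSyntheticSimulation(n_interactions=1000, n_actions=5,
#                                    n_context_features=4, n_action_features=0,
#                                    n_neighborhoods=20, seed=10)
# for interaction in sim.read():  # assumed iteration API
#     ...                         # each interaction has a context, its neighborhood's actions, and rewards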
Example #3
    def __init__(self,
                 n_interactions: int = 500,
                 n_actions: int = 10,
                 n_features: int = 10,
                 context_features: bool = True,
                 action_features: bool = True,
                 sparse: bool = False,
                 seed: int = 1) -> None:

        self._n_bandits = n_actions
        self._n_features = n_features
        self._context_features = context_features
        self._action_features = action_features
        self._seed = seed

        r = CobaRandom(seed)

        context: Callable[[int], Context]
        actions: Callable[[int, Context], Sequence[Action]]
        rewards: Callable[[int, Context, Action], float]

        sparsify  = lambda x: (tuple(range(len(x))), tuple(x)) if sparse else tuple(x)
        unsparse  = lambda x: x[1] if sparse else x
        normalize = lambda X: [ x/sum(X) for x in X ]
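        # e.g., with sparse=True, sparsify((.2,.5,.3)) == ((0,1,2), (.2,.5,.3)) and unsparse
        # recovers (.2,.5,.3); with sparse=False both leave the dense tuple unchanged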

        if not context_features and not action_features:

            means = [ m/n_actions + 1/(2*n_actions) for m in r.randoms(n_actions) ]

            actions_features = []
            for i in range(n_actions):
                action = [0] * n_actions
                action[i] = 1
                actions_features.append(tuple(action))

            context = lambda i: None
            actions = lambda i, c: sparsify(actions_features)
            rewards = lambda i, c, a: means[unsparse(a).index(1)] + (r.random()-.5)/n_actions

        if context_features and not action_features:
            # normalizing the thetas makes sure our reward stays in [0,1]
            bandit_thetas = [ r.randoms(n_features) for _ in range(n_actions) ]
            theta_totals  = [ sum(theta) for theta in bandit_thetas ]
            bandit_thetas = [ [ t/norm for t in theta ] for theta, norm in zip(bandit_thetas, theta_totals) ]

            actions_features = []
            for i in range(n_actions):
                action = [0] * n_actions
                action[i] = 1
                actions_features.append(tuple(action))

            context = lambda i: sparsify(r.randoms(n_features))
            actions = lambda i, c: [sparsify(af) for af in actions_features]
            rewards = lambda i, c, a: sum([ cc*t for cc,t in zip(unsparse(c), bandit_thetas[unsparse(a).index(1)]) ])

        if not context_features and action_features:

            theta = r.randoms(n_features)

            context = lambda i: None
            actions = lambda i, c: [ sparsify(normalize(r.randoms(n_features))) for _ in range(r.randint(2, 10)) ]
            rewards = lambda i, c, a: float(sum([ cc*t for cc,t in zip(theta, unsparse(a)) ]))

        if context_features and action_features:

            context = lambda i: sparsify(r.randoms(n_features))
            actions = lambda i, c: [ sparsify(normalize(r.randoms(n_features))) for _ in range(r.randint(2, 10)) ]
            rewards = lambda i, c, a: sum([ cc*t for cc,t in zip(unsparse(c), unsparse(a)) ])

        super().__init__(n_interactions, context, actions, rewards)
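
A standalone numeric check (illustrative values only, not taken from the simulation) of the reward used when both context_features and action_features are True: the reward is the inner product of the raw context vector with the normalized action vector, which keeps it in [0,1].

context_vec = (0.2, 0.7, 0.1)   # stand-in for r.randoms(3)
action_vec  = (0.5, 0.3, 0.2)   # stand-in for normalize(r.randoms(3)); sums to 1
reward_val  = sum(c*a for c, a in zip(context_vec, action_vec))
assert abs(reward_val - 0.33) < 1e-9   # 0.2*0.5 + 0.7*0.3 + 0.1*0.2 = 0.33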