Example #1
    def pick_action(self, context):
        # context holds one context vector per arm
        self.context = context
        # predicted mean reward of each arm from its fitted parameters
        self.means = np.array([
            self._compute_mean(self.theta_hat[i], context[i])
            for i in range(self.k_arms)
        ])
        # play the arm with the highest predicted mean, breaking ties at random
        return random_argmax(self.means)
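
All of the examples in this listing call a `random_argmax` helper that is not shown. A minimal sketch of one plausible implementation, assuming NumPy and an argmax that breaks ties uniformly at random:

    import numpy as np

    def random_argmax(values):
        values = np.asarray(values)
        # indices of all entries tied for the maximum
        candidates = np.flatnonzero(values == values.max())
        # break ties uniformly at random instead of always taking the first index
        return np.random.choice(candidates)

Random tie-breaking mainly matters in the early rounds, when several arms still share the same estimate or bound.
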
Example #2
    def observe_reward(self, arm_idx, reward):
        self.arms_data[arm_idx].append(reward)
        n = len(self.arms_data[arm_idx])
        # incremental mean update: new_mean = reward / n + (n - 1) * old_mean / n
        self.estimated_means[arm_idx] = (
            reward / n + (n - 1) * self.estimated_means[arm_idx] / n
        )
        self.pulls[arm_idx] += 1
        # track the current greedy arm, breaking ties at random
        self.best_arm = random_argmax(self.estimated_means)
        self._t += 1
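
The assignment above is the standard incremental mean: after n observations, reward / n + (n - 1) * old_mean / n equals the plain average of all n rewards. A small standalone check of that recursion against np.mean:

    import numpy as np

    rewards = [0.0, 1.0, 1.0, 0.0, 1.0]
    running_mean = 0.0
    for n, reward in enumerate(rewards, start=1):
        running_mean = reward / n + (n - 1) * running_mean / n
    assert np.isclose(running_mean, np.mean(rewards))
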
Example #3
    def pick_action(self):
        # initialization phase: try each arm once, then switch to UCB selection
        if self._t > self.k_arms:
            # upper confidence bound for every arm
            self._UCBs = [
                self._bound_function(arm_idx) for arm_idx in self._arm_idxs
            ]
            arm_idx = random_argmax(self._UCBs)
        else:
            # still in the round-robin initialization phase
            arm_idx = next(self._try_each_arm)
        return arm_idx
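
`self._try_each_arm` and `self._arm_idxs` are not defined in these excerpts; the iterator only needs to hand out each arm index once during the initialization phase. A minimal standalone sketch under that assumption (plain variables stand in for the attributes):

    # hypothetical setup, e.g. inside __init__ (not shown in the excerpts)
    k_arms = 5                          # stand-in for self.k_arms
    arm_idxs = list(range(k_arms))      # stand-in for self._arm_idxs
    try_each_arm = iter(arm_idxs)       # stand-in for self._try_each_arm

    # during the first k_arms steps, next(...) walks through the arms in order
    first_choices = [next(try_each_arm) for _ in range(k_arms)]
    assert first_choices == [0, 1, 2, 3, 4]
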
Example #4
    def pick_action(self):
        # initialization phase: try each arm once, then switch to UCB selection
        if self._t > self.k_arms:
            # variance-style spread term plus an exploration radius, in the spirit of UCB1-Tuned
            self.Vs = (self.sq_sums - self.estimated_means**2
                       + self.radius(self._t, self.pulls))
            self._UCBs = self.estimated_means + np.sqrt(
                (np.log(self._t) / self.pulls) * self.Vs)
            arm_idx = random_argmax(self._UCBs)
        else:
            # still in the round-robin initialization phase
            arm_idx = next(self._try_each_arm)
        return arm_idx
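
The variant above scales the confidence width by a variance-style term, in the spirit of UCB1-Tuned, assuming `self.sq_sums` holds the running mean of squared rewards per arm. A standalone sketch of the same bound computed directly from raw reward lists, with an assumed sqrt(2 ln t / n) radius standing in for `self.radius`:

    import numpy as np

    def tuned_ucbs(rewards_per_arm, t):
        # empirical first and second moments per arm, from the raw reward lists
        means = np.array([np.mean(r) for r in rewards_per_arm])
        second_moments = np.array([np.mean(np.square(r)) for r in rewards_per_arm])
        pulls = np.array([len(r) for r in rewards_per_arm], dtype=float)
        # assumed exploration radius sqrt(2 ln t / n), standing in for self.radius
        radius = np.sqrt(2.0 * np.log(t) / pulls)
        # variance-style spread plus radius, then the usual UCB width scaled by it
        Vs = second_moments - means**2 + radius
        return means + np.sqrt((np.log(t) / pulls) * Vs)

    print(tuned_ucbs([[0.0, 1.0, 1.0], [1.0, 1.0], [0.0]], t=6))
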
Example #5
    def pick_action(self):
        # initialization phase: try each arm once, then switch to UCB selection
        if self._t > self.k_arms:
            # upper confidence bound: empirical mean plus an exploration radius
            self._UCBs = self.estimated_means + self.radius(
                self._t, self.pulls)
            arm_idx = random_argmax(self._UCBs)
            if self.keep_history:
                # snapshot the bounds, means and pull counts for later inspection
                self.UCB_history.append(self._UCBs.copy())
                self.means_history.append(self.estimated_means.copy())
                self.pulls_history.append(self.pulls.copy())
        else:
            arm_idx = next(self._try_each_arm)
        return arm_idx
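
`self.radius` is not shown in these excerpts. A minimal sketch assuming the usual Hoeffding-style UCB1 bonus, sqrt(2 ln t / n), applied element-wise over the pull counts:

    import numpy as np

    def radius(t, pulls):
        # exploration bonus sqrt(2 * ln(t) / n_i) for each arm, vectorized over pulls
        return np.sqrt(2.0 * np.log(t) / np.asarray(pulls, dtype=float))

    # example: at step t=10, an arm pulled 3 times gets a larger bonus than one pulled 7 times
    print(radius(10, [3, 7]))

With this choice the bonus shrinks as an arm accumulates pulls and grows only slowly with t, which is what drives the exploration.
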
Example #6
    def generate_context(self):
        """Generate a context vector of indicators and compute the current
        true reward probability of every arm.
        """
        context = []
        # sample 0/1 indicator features for this round
        context_vector = bernuolli(self.context_options)
        if self.add_bias:
            context_vector = np.append([1], context_vector)
        # every arm sees the same context vector this round
        for i in range(self.k_arms):
            context.append(context_vector)
        # pull all arms to generate the current means and rewards and to find the optimal arm;
        # the agent/policy/algorithm observes only the context, never the means
        self.current_rewards = [
            arm.pull(ctx) for arm, ctx in zip(self.arms, context)
        ]
        self.current_means = [arm.get_current_mean() for arm in self.arms]
        self.current_optimal_arm = random_argmax(self.current_means)
        self.current_optimal_mean = self.current_means[self.current_optimal_arm]

        return context
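
The arm objects (`arm.pull`, `arm.get_current_mean`) are not part of these excerpts. A hypothetical contextual Bernoulli arm, assuming the reward probability is a sigmoid of theta . context; the real arm class may well use a different link:

    import numpy as np

    class ContextualBernoulliArm:
        """Hypothetical arm: reward probability is a sigmoid of theta . context."""

        def __init__(self, theta):
            self.theta = np.asarray(theta, dtype=float)
            self._current_mean = None

        def pull(self, context_vector):
            # compute the current reward probability from the context ...
            self._current_mean = 1.0 / (1.0 + np.exp(-self.theta @ np.asarray(context_vector)))
            # ... and draw a Bernoulli reward with that probability
            return float(np.random.rand() < self._current_mean)

        def get_current_mean(self):
            return self._current_mean

    arm = ContextualBernoulliArm(theta=[0.2, -0.5, 1.0])
    reward = arm.pull([1, 0, 1])    # bias term plus two indicator features
    print(reward, arm.get_current_mean())
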
Example #7
    def pick_action(self):
        if self.keep_history:
            # snapshot the prior parameters, means and pull counts for later inspection
            self.prior_data_history.append(self.prior_data.copy())
            self.means_history.append(self.estimated_means.copy())
            self.pulls_history.append(self.pulls.copy())
        # draw one sample per arm and play the arm with the largest draw
        return random_argmax(self._sample_from_arms())
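
`self._sample_from_arms` is not shown. If `prior_data` holds Beta posterior parameters for Bernoulli arms, a Thompson-sampling-style draw might look like the following sketch; the (alpha, beta) layout is an assumption:

    import numpy as np

    def sample_from_arms(prior_data):
        # prior_data: one (alpha, beta) pair per arm, e.g. successes + 1, failures + 1
        return np.array([np.random.beta(alpha, beta) for alpha, beta in prior_data])

    # example: three arms, the second has the most evidence of a high reward rate
    draws = sample_from_arms([(1, 1), (8, 2), (2, 8)])
    print(draws)
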