def analyse_suboptimal_arm_pulls(self):
    """Compute per-arm gaps (deltas) and the theoretical upper bound on the
    number of pulls of each sub-optimal arm at every time step.

    Fills ``self.best_arm``, ``self.deltas`` and
    ``self.theoretical_bounds_arm_pulls`` (a (T+1, K) matrix whose entry
    (t, i) bounds the expected pulls of arm i after t rounds).
    """
    self.best_arm = mh.get_maximum_index(self.true_means)
    best_mean = self.true_means[self.best_arm]

    # Gap of each arm w.r.t. the best arm (zero for the best arm itself).
    for arm in range(self.K):
        self.deltas[arm] = best_mean - self.true_means[arm]

    # Instance-dependent 1/delta^2 terms used by the log-scaled bound.
    inv_sq_gaps = np.array(
        mh.get_instance_dependent_square_inverses(self.deltas, self.best_arm))

    # Additive constant: 1 + pi^2 / 3.
    addi_constant = rvh.func_of_pi(add=1, power=2, mult=1 / 3)

    log_times = rvh.natural_logarithm(np.arange(self.T + 1))

    # Outer product log(t) * (1/delta_i^2) gives the (T+1, K) bound matrix.
    self.theoretical_bounds_arm_pulls = (
        np.outer(log_times, inv_sq_gaps) + addi_constant)
def test_library_random_variables(self):
    """Smoke-test the random-variable helpers in ``rvh``.

    Exercises the uniform and Bernoulli samplers with both positional and
    keyword arguments. The return values are not asserted on; the test
    passes as long as every call completes without raising.
    """
    # Uniform distribution: 10 samples from U(0, 1).
    _ = rvh.get_uniform_sample(0, 1, 10)
    # Bernoulli distribution: single draw, then a batch of 10 draws.
    _ = rvh.get_bernoulli_sample(0.8)
    _ = rvh.get_bernoulli_sample(p=0.5, size=10)
    # NOTE(review): removed dead local `a = 5` (leftover breakpoint anchor).
def play_arms(self):
    """Run the doubling-epoch UCB strategy and return the reward history.

    Round t = 0 pulls every arm once; each subsequent epoch t refreshes the
    UCB estimates and pulls the arm with the highest UCB up to 2**t times,
    stopping once the total pull budget ``self.N`` is exhausted.

    Returns:
        list: rewards observed, with a leading sentinel 0 at index 0.
    """
    rewards = [0]  # index 0 is a placeholder for "time 0".
    pulls_done = 0

    # t = 0: sample each of the K arms exactly once.
    for arm in range(self.K):
        rewards.append(super().pull_arm(arm))
        pulls_done += 1

    # Epochs t = 1 .. ceil(log2(N)): commit to the current UCB leader.
    for epoch in range(1, rvh.ceiled_log_base_2(self.N) + 1):
        self.revise_ucbs(pulls_done)
        leader = mh.get_maximum_index(self.upper_confidence_bound)
        budget = 2 ** epoch  # pulls allotted to this epoch
        while budget > 0 and pulls_done < self.N:
            rewards.append(super().pull_arm(leader))
            pulls_done += 1
            budget -= 1

    return rewards
def get_arms(arm_count, tape_size):
    """Create ``arm_count`` Bernoulli arms with uniformly drawn means.

    Args:
        arm_count: number of arms to create.
        tape_size: length of each arm's pre-generated reward tape.

    Returns:
        tuple: (true_means, arms) — the sampled means and the Arm objects.
    """
    true_means = rvh.get_uniform_sample(0, 1, arm_count)
    arms = [Arm(mu, size=tape_size) for mu in true_means]
    return true_means, arms
def analyse_common_stats(self):
    """Compute per-arm gaps and the theoretical regret bound of UCB1.

    Fills ``self.best_arm``, ``self.deltas``,
    ``self.cum_regret_theo_bound`` (the c1*log(t) + c2 bound evaluated at
    every t in 0..T) and ``self.cum_optimal_reward`` (t times the best mean).
    """
    self.best_arm = mh.get_maximum_index(self.true_means)
    best_mean = self.true_means[self.best_arm]

    # Gap of each arm w.r.t. the best arm (zero for the best arm itself).
    for arm in range(self.K):
        self.deltas[arm] = best_mean - self.true_means[arm]

    # Instance-dependent sums feeding the multiplicative/additive constants.
    sum_del_inv, sum_del = mh.get_instance_dependent_values(
        self.best_arm, self.deltas)
    mult_c, addi_c = mh.get_theoretical_constants(sum_del_inv, sum_del)

    steps = np.arange(self.T + 1)
    self.cum_regret_theo_bound = (
        mult_c * rvh.natural_logarithm(steps) + addi_c)
    self.cum_optimal_reward = best_mean * steps
def get_theoretical_constants(sum_del_inv, sum_del):
    """Return the (multiplicative, additive) constants of the UCB1 bound.

    The multiplicative constant is 2 * sum(1/delta_i); the additive one is
    (1 + pi^2/3) * sum(delta_i).
    """
    additive = sum_del * rvh.func_of_pi(add=1, power=2, mult=1 / 3)
    return 2 * sum_del_inv, additive
def __init__(self, mean, size=10**7):
    """Create an arm with the given true mean.

    Args:
        mean: Bernoulli success probability of this arm's rewards.
        size: length of the pre-generated reward tape (default 10**7).
    """
    self._mean = mean
    # Rewards are drawn up front onto a fixed tape; pulls consume it
    # sequentially starting from index 0.
    self._tape_index = 0
    self._tape = rvh.get_bernoulli_sample(p=mean, size=size)