def play(self):
    for run_counter in tqdm(range(0, self.runs)):
        self.multi_armed_bandit = MultiArmedBandit(
            arms=self.arms, stationary=self.stationary)
        self.pulls = self.init_pulls()
        self.value_estimates = self.init_value_estimates()
        self._run(run_counter=run_counter)
def test_calculate_regret():
    multi_armed_bandit = MultiArmedBandit(2)
    result = multi_armed_bandit.calculate_regret([0, 1], 1)
    assert result == 0
    result = multi_armed_bandit.calculate_regret([0, 1], 0)
    assert result == 1
def main():
    reward_distribution = [0.3, 0.5, 0.8]
    my_bandit = MultiArmedBandit(reward_probability_list=reward_distribution)
    tot_arms = 3
    tot_episodes = 2000
    tot_steps = 1000
    print_every_episodes = 100
    cumulated_reward_list = list()
    average_utility_array = np.zeros(tot_arms)
    print("Starting random agent...")
    for episode in range(tot_episodes):
        cumulated_reward = 0
        reward_counter_array = np.zeros(tot_arms)
        action_counter_array = np.full(tot_arms, 1.0e-5)
        for step in range(tot_steps):
            action = np.random.randint(low=0, high=tot_arms)
            reward = my_bandit.step(action)
            reward_counter_array[action] += reward
            action_counter_array[action] += 1
            cumulated_reward += reward
        # Append the cumulated reward for this episode in a list
        cumulated_reward_list.append(cumulated_reward)
        utility_array = np.true_divide(reward_counter_array, action_counter_array)
        average_utility_array += utility_array
        if episode % print_every_episodes == 0:
            print("Episode: " + str(episode))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Utility distribution: " + str(utility_array))
            print("Utility RMSE: " + str(return_rmse(utility_array, reward_distribution)))
            print("")
    # Print the average cumulated reward for all the episodes
    print("")
    print("Average Cumulated Reward: " + str(np.mean(cumulated_reward_list)))
    print("Std Cumulated Reward: " + str(np.std(cumulated_reward_list)))
    print("Average utility distribution: " + str(average_utility_array / tot_episodes))
    print("Average utility RMSE: " + str(
        return_rmse(average_utility_array / tot_episodes, reward_distribution)))
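# The main() functions in this section rely on a Bernoulli bandit environment
# and a return_rmse() helper that are not shown here. The following is a
# minimal, hypothetical sketch consistent with how they are used (step()
# returning a 0/1 reward, RMSE between estimated and true utilities); the
# original implementations may differ.
import numpy as np


class MultiArmedBandit:
    """Bernoulli bandit: each arm pays 1 with its own success probability."""

    def __init__(self, reward_probability_list):
        self.reward_probability_list = reward_probability_list

    def step(self, action):
        # Draw a 0/1 reward for the selected arm.
        return np.random.binomial(1, self.reward_probability_list[action])


def return_rmse(predictions, targets):
    """Root mean squared error between estimated and true arm utilities."""
    return np.sqrt(np.mean((np.asarray(predictions) - np.asarray(targets)) ** 2))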
def test_init(mock_random):
    mock_random.side_effect = [0.2, 0.3]
    multi_armed_bandit = MultiArmedBandit(2)
    assert multi_armed_bandit.best_arm == 1
    mock_random.side_effect = [0.2, 0.1]
    multi_armed_bandit = MultiArmedBandit(2)
    assert multi_armed_bandit.best_arm == 0
def test_pull_not_optimal(mock_binomial, mock_random):
    mock_random.side_effect = [0.2, 0.3]
    mock_binomial.side_effect = [1, 0]
    multi_armed_bandit = MultiArmedBandit(2)
    result, regret, optimal = multi_armed_bandit.pull(0)
    assert result == 1
    assert regret == 0
    assert optimal == 0
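# A minimal sketch of the bandit environment the tests above appear to assume
# (randomly drawn arm probabilities, pull() returning reward, regret and an
# optimality flag). The internals are an illustrative guess based only on the
# asserted behaviour, not the original class.
import numpy as np


class MultiArmedBandit:
    def __init__(self, arms):
        # One success probability per arm, drawn at random; the best arm is the argmax.
        self.probabilities = [np.random.uniform(0, 1) for _ in range(arms)]
        self.best_arm = int(np.argmax(self.probabilities))

    def calculate_regret(self, rewards, arm):
        # Per-pull regret: best reward available this round minus the reward obtained.
        return max(rewards) - rewards[arm]

    def pull(self, arm):
        # Sample a Bernoulli reward for every arm and report the chosen arm's
        # reward, the per-pull regret and whether the choice was optimal.
        rewards = [np.random.binomial(1, p) for p in self.probabilities]
        regret = self.calculate_regret(rewards, arm)
        optimal = 1 if arm == self.best_arm else 0
        return rewards[arm], regret, optimal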
def main():
    reward_distribution = [0.3, 0.5, 0.8]
    my_bandit = MultiArmedBandit(reward_probability_list=reward_distribution)
    tot_arms = 3
    tot_episodes = 2000
    tot_steps = 1000
    print_every_episodes = 100
    cumulated_reward_list = list()
    average_utility_array = np.zeros(tot_arms)
    print("Starting UCB agent...")
    for episode in range(tot_episodes):
        cumulated_reward = 0
        arms_counter_array = np.ones(tot_arms)
        arms_values_array = np.zeros(tot_arms)
        success_counter_array = np.ones(tot_arms)
        failure_counter_array = np.ones(tot_arms)
        # success_counter_array = [10, 10, 10]
        # failure_counter_array = [10, 10, 10]
        action_counter_array = np.full(tot_arms, 1.0e-5)
        for step in range(tot_steps):
            action = return_ucb_action(arms_counter_array, arms_values_array)
            reward = my_bandit.step(action)
            arms_counter_array[action] += 1
            # Incremental mean update of the pulled arm's estimated value
            n = arms_counter_array[action]
            arms_values_array[action] = ((n - 1) / n) * arms_values_array[action] + (1 / n) * reward
            if reward == 1:
                success_counter_array[action] += 1
            elif reward == 0:
                failure_counter_array[action] += 1
            else:
                raise Exception("Wrong value returned as Reward...")
            action_counter_array[action] += 1
            cumulated_reward += reward
        # Append the cumulated reward for this episode in a list
        cumulated_reward_list.append(cumulated_reward)
        utility_array = np.true_divide(
            np.add(success_counter_array, [-1, -1, -1]), action_counter_array)
        average_utility_array += utility_array
        if episode % print_every_episodes == 0:
            print("Episode: " + str(episode))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Success counter: " + str(np.add(success_counter_array, [-1, -1, -1])))
            print("Failure counter: " + str(np.add(failure_counter_array, [-1, -1, -1])))
            print("Utility distribution: " + str(utility_array))
            print("Utility RMSE: " + str(return_rmse(utility_array, reward_distribution)))
            print("")
    # Print the average cumulated reward for all the episodes
    print("Average cumulated reward: " + str(np.mean(cumulated_reward_list)))
    print("Std Cumulated Reward: " + str(np.std(cumulated_reward_list)))
    print("Average utility distribution: " + str(average_utility_array / tot_episodes))
    print("Average utility RMSE: " + str(
        return_rmse(average_utility_array / tot_episodes, reward_distribution)))
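# A minimal, hypothetical sketch of the return_ucb_action() helper used above,
# following the standard UCB1 rule (value estimate plus an exploration bonus);
# the original implementation may differ.
import numpy as np


def return_ucb_action(counter_array, value_array):
    """Pick the arm maximising value estimate + UCB1 exploration bonus."""
    total_pulls = np.sum(counter_array)
    exploration_bonus = np.sqrt(2.0 * np.log(total_pulls) / counter_array)
    return int(np.argmax(value_array + exploration_bonus))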
def test_mab_run_many_arms(self):
    mab = MultiArmedBandit(k=100, t=10**6)
    mab.run_bandit_algorithm()
    # With many arms, we don't analyse pulls of sub-optimal arms.
    mab.analyse_regret()
    mab.print_performance()
    mab.plot_regret()
    mab.ph.show_plots()
def main():
    reward_distribution = [0.45, 0.5, 0.55]
    my_bandit = MultiArmedBandit(reward_probability_list=reward_distribution)
    # Set epsilon to 1 so the agent starts with purely random choices
    epsilon_start = 1
    epsilon_stop = 0.001
    tot_arms = 3
    tot_episodes = 2000
    tot_steps = 1000
    print_every_episodes = 100
    cumulated_reward_list = list()
    average_utility_array = np.zeros(tot_arms)
    epsilon_array = np.linspace(epsilon_start, epsilon_stop, num=100)
    print("Starting epsilon-decreasing agent...")
    for episode in range(tot_episodes):
        cumulated_reward = 0
        reward_counter_array = np.ones(tot_arms)
        action_counter_array = np.ones(tot_arms)
        for step in range(tot_steps):
            curr_reward_counter_array = np.true_divide(reward_counter_array, action_counter_array)
            epsilon = math.exp(-0.05 * step)
            # if step < 100:
            #     epsilon = epsilon_array[step]
            action = return_epsilon_greedy_action(epsilon, curr_reward_counter_array)
            reward = my_bandit.step(action)
            reward_counter_array[action] += reward
            action_counter_array[action] += 1
            cumulated_reward += reward
        # Append the cumulated reward for this episode in a list
        cumulated_reward_list.append(cumulated_reward)
        utility_array = np.true_divide(reward_counter_array - 1, action_counter_array - 1)
        average_utility_array += utility_array
        if episode % print_every_episodes == 0:
            print("Episode: " + str(episode))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Reward counter: " + str(reward_counter_array - 1))
            print("Utility distribution: " + str(utility_array))
            print("Utility RMSE: " + str(return_rmse(utility_array, reward_distribution)))
            print("")
    # Print the average cumulated reward for all the episodes
    print("Average cumulated reward: " + str(np.mean(cumulated_reward_list)))
    print("Std Cumulated Reward: " + str(np.std(cumulated_reward_list)))
    print("Average utility distribution: " + str(average_utility_array / tot_episodes))
    print("Average utility RMSE: " + str(
        return_rmse(average_utility_array / tot_episodes, reward_distribution)))
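# A minimal, hypothetical sketch of the return_epsilon_greedy_action() helper
# used above: explore with probability epsilon, otherwise act greedily. The
# original implementation may differ (e.g. in tie-breaking).
import numpy as np


def return_epsilon_greedy_action(epsilon, value_array):
    """With probability epsilon pick a random arm, otherwise the greedy one."""
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(low=0, high=value_array.shape[0])
    return int(np.argmax(value_array))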
def main():
    reward_distribution = [0.3, 0.5, 0.8]
    my_bandit = MultiArmedBandit(reward_probability_list=reward_distribution)
    temperature_start = 0.1
    temperature_stop = 0.0001
    epsilon = 0.1
    tot_arms = 3
    tot_episodes = 2000
    tot_steps = 1000
    print_every_episodes = 100
    cumulated_reward_list = list()
    average_utility_array = np.zeros(tot_arms)
    temperature_array = np.linspace(temperature_start, temperature_stop, num=tot_steps)
    print("Starting Boltzmann agent...")
    for episode in range(tot_episodes):
        cumulated_reward = 0
        reward_counter_array = np.zeros(tot_arms)
        action_counter_array = np.full(tot_arms, 1.0e-5)
        for step in range(tot_steps):
            temperature = temperature_array[step]
            action = return_boltzmann_action(
                temperature, np.true_divide(reward_counter_array, action_counter_array))
            reward = my_bandit.step(action)
            reward_counter_array[action] += reward
            action_counter_array[action] += 1
            cumulated_reward += reward
        # Append the cumulated reward for this episode in a list
        cumulated_reward_list.append(cumulated_reward)
        utility_array = np.true_divide(reward_counter_array, action_counter_array)
        average_utility_array += utility_array
        if episode % print_every_episodes == 0:
            print("Episode: " + str(episode))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Reward counter: " + str(reward_counter_array))
            print("Utility distribution: " + str(utility_array))
            print("Utility RMSE: " + str(return_rmse(utility_array, reward_distribution)))
            print("")
    # Print the average cumulated reward for all the episodes
    print("Average cumulated reward: " + str(np.mean(cumulated_reward_list)))
    print("Std Cumulated Reward: " + str(np.std(cumulated_reward_list)))
    print("Average utility distribution: " + str(average_utility_array / tot_episodes))
    print("Average utility RMSE: " + str(
        return_rmse(average_utility_array / tot_episodes, reward_distribution)))
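# A minimal, hypothetical sketch of the return_boltzmann_action() helper used
# above: softmax (Boltzmann) action selection over the value estimates, with
# the temperature controlling how greedy the distribution is. The original
# implementation may differ in numerical details.
import numpy as np


def return_boltzmann_action(temperature, value_array):
    """Sample an arm from a softmax distribution over the value estimates."""
    preferences = value_array / temperature
    preferences = preferences - np.max(preferences)  # improve numerical stability
    probabilities = np.exp(preferences) / np.sum(np.exp(preferences))
    return np.random.choice(value_array.shape[0], p=probabilities)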
def main():
    reward_distribution = [0.3, 0.5, 0.8]
    my_bandit = MultiArmedBandit(reward_probability_list=reward_distribution)
    tot_arms = 3
    tot_episodes = 2000
    tot_steps = 1000
    print_every_episodes = 100
    cumulated_reward_list = list()
    average_utility_array = np.zeros(tot_arms)
    print("Starting greedy agent...")
    for episode in range(tot_episodes):
        cumulated_reward = 0
        reward_counter_array = np.zeros(tot_arms)
        action_counter_array = np.full(tot_arms, 1.0e-5)
        for step in range(tot_steps):
            if step < tot_arms:
                action = step  # press all the arms first
            else:
                action = return_greedy_action(
                    np.true_divide(reward_counter_array, action_counter_array))
            reward = my_bandit.step(action)
            reward_counter_array[action] += reward
            action_counter_array[action] += 1
            cumulated_reward += reward
        # Append the cumulated reward for this episode in a list
        cumulated_reward_list.append(cumulated_reward)
        utility_array = np.true_divide(reward_counter_array, action_counter_array)
        average_utility_array += utility_array
        if episode % print_every_episodes == 0:
            print("Episode: " + str(episode))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Reward counter: " + str(reward_counter_array))
            print("Utility distribution: " + str(utility_array))
            print("Utility RMSE: " + str(return_rmse(utility_array, reward_distribution)))
            print("")
    # Print the average cumulated reward for all the episodes
    print("Average cumulated reward: " + str(np.mean(cumulated_reward_list)))
    print("Std Cumulated Reward: " + str(np.std(cumulated_reward_list)))
    print("Average utility distribution: " + str(average_utility_array / tot_episodes))
    print("Average utility RMSE: " + str(
        return_rmse(average_utility_array / tot_episodes, reward_distribution)))
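# A minimal, hypothetical sketch of the return_greedy_action() helper used
# above: always exploit the arm with the highest estimated utility.
import numpy as np


def return_greedy_action(value_array):
    """Pick the arm with the highest estimated value."""
    return int(np.argmax(value_array))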
def main():
    reward_distribution = [0.3, 0.5, 0.8]
    my_bandit = MultiArmedBandit(reward_probability_list=reward_distribution)
    tot_arms = 3
    tot_episodes = 2000
    tot_steps = 1000
    print_every_episodes = 100
    cumulated_reward_list = list()
    average_utility_array = np.zeros(tot_arms)
    print("Starting Thompson agent...")
    for episode in range(tot_episodes):
        cumulated_reward = 0
        success_counter_array = np.ones(tot_arms)
        failure_counter_array = np.ones(tot_arms)
        action_counter_array = np.full(tot_arms, 1.0e-5)
        for step in range(tot_steps):
            action = return_thompson_action(success_counter_array, failure_counter_array)
            reward = my_bandit.step(action)
            if reward == 1:
                success_counter_array[action] += 1
            elif reward == 0:
                failure_counter_array[action] += 1
            else:
                raise Exception("Wrong value returned as Reward...")
            action_counter_array[action] += 1
            cumulated_reward += reward
        # Append the cumulated reward for this episode in a list
        cumulated_reward_list.append(cumulated_reward)
        utility_array = np.true_divide(success_counter_array, action_counter_array)
        average_utility_array += utility_array
        if episode % print_every_episodes == 0:
            print("Episode: " + str(episode))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Success counter: " + str(success_counter_array))
            print("Failure counter: " + str(failure_counter_array))
            print("Utility distribution: " + str(utility_array))
            print("Utility RMSE: " + str(return_rmse(utility_array, reward_distribution)))
            print("")
    # Print the average cumulated reward for all the episodes
    print("Average cumulated reward: " + str(np.mean(cumulated_reward_list)))
    print("Std Cumulated Reward: " + str(np.std(cumulated_reward_list)))
    print("Average utility distribution: " + str(average_utility_array / tot_episodes))
    print("Average utility RMSE: " + str(
        return_rmse(average_utility_array / tot_episodes, reward_distribution)))
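# A minimal, hypothetical sketch of the return_thompson_action() helper used
# above: Thompson sampling with a Beta posterior per Bernoulli arm. The
# original implementation may differ.
import numpy as np


def return_thompson_action(success_counter_array, failure_counter_array):
    """Sample one value per arm from its Beta posterior and pick the argmax."""
    samples = np.random.beta(success_counter_array, failure_counter_array)
    return int(np.argmax(samples))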
def run(self):
    for i in range(self.total_experiment_count):
        self.logger.debug("MAB instance {0}".format(i))
        mab = MultiArmedBandit(k=self.arms_count_k, t=self.arm_pulls_n)
        mab.set_algorithms(self.algorithms_to_run)
        mab.run_bandit_algorithm()
        mab.analyse_regret()
        self.update_average_regrets(mab)
        self.current_experiment_count += 1
def test_mab_run(self):
    mab = MultiArmedBandit(k=5, t=10**4)
    algorithms_to_run = [("UCB1", UCB1),
                         ("UCB-Inc", UCBIncremental),
                         ("UCB-Doub-TR", UCBDoubling, mh.ucb_doubling_radius),
                         ("UCB-Doub", UCBDoubling)]
    mab.set_algorithms(algorithms_to_run)
    mab.run_bandit_algorithm()
    mab.analyse_regret()
    mab.analyse_suboptimal_arm_pulls()
    # mab.print_performance()
    mab.plot_suboptimal_arm()
    mab.plot_regret()
    mab.ph.show_plots()
class Agent:
    def __init__(self, arms: int, runs: int, steps: int, exploration_rate: float,
                 initial_values: float, stationary: bool = True):
        self.arms = arms
        self.runs = runs
        self.steps = steps
        self.initial_values = initial_values
        self.stationary = stationary
        self.multi_armed_bandit = None
        self.average_regret = 0
        self.average_reward = 0
        self.maximum_average_reward = 0
        self.exploration_rate = exploration_rate
        self.pulls = self.init_pulls()
        self.value_estimates = self.init_value_estimates()
        self.action_optimal = np.zeros((runs, steps))
        self.rewards = np.zeros((runs, steps))

    def play(self):
        for run_counter in tqdm(range(0, self.runs)):
            self.multi_armed_bandit = MultiArmedBandit(
                arms=self.arms, stationary=self.stationary)
            self.pulls = self.init_pulls()
            self.value_estimates = self.init_value_estimates()
            self._run(run_counter=run_counter)

    def _run(self, run_counter: int):
        # self.print_bandit_info()
        for step_counter in range(0, self.steps):
            arm = self._choose_arm()
            reward, regret, optimal = self.multi_armed_bandit.pull(arm)
            self.update_value_estimate(reward=reward, arm=arm, pull=step_counter + 1)
            self.action_optimal[run_counter][step_counter] = optimal
            self.rewards[run_counter][step_counter] = reward
            # self.exploration_rate = 1 / math.log(i + 0.00001)
            # print([f'{value:1.3f}' for value in self.value_estimates])
            # print('-----------------------------------------------------------')

    def _choose_arm(self) -> int:
        if np.random.uniform(0, 1) <= self.exploration_rate:
            return np.random.randint(0, self.arms)
        else:
            return int(np.argmax(self.value_estimates))
        # return np.random.choice(np.arange(0, self.arms),
        #                         p=self.value_estimates / np.sum(self.value_estimates))

    def print_bandit_info(self):
        print([
            f'{bandit.probability:1.3f}'
            for bandit in self.multi_armed_bandit.bandits
        ])

    def update_value_estimate(self, reward: int, arm: int, pull: int):
        self.pulls[arm] = self.pulls[arm] + 1
        self.value_estimates[arm] = self.value_estimates[arm] + (
            reward - self.value_estimates[arm]) / self.pulls[arm]

    def init_value_estimates(self) -> np.ndarray:
        return np.full((self.arms, ), self.initial_values)

    def init_pulls(self) -> dict:
        return {i: 0 for i in range(0, self.arms)}
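# A hypothetical usage example of the Agent class above (epsilon-greedy on a
# stationary bandit); the argument values are illustrative and not taken from
# the original code.
if __name__ == '__main__':
    agent = Agent(arms=10, runs=2000, steps=1000,
                  exploration_rate=0.1, initial_values=0.0)
    agent.play()
    # Per-step fraction of optimal pulls and mean reward, averaged over runs.
    print(agent.action_optimal.mean(axis=0))
    print(agent.rewards.mean(axis=0))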