def greedy_policy() -> None:
    """Simulate an agent that follows ``GreedyPolicy`` and plot the outcome.

    Takes no arguments; it reads the module-level names ``epsilons``, ``n``,
    ``num_sessions`` and ``num_trials``. One environment is built and run per
    entry of ``epsilons``, even though ``GreedyPolicy()`` is constructed
    without any epsilon value. Two figures are shown: the reward averaged
    over sessions, and the session-averaged ``num_best`` values, both
    plotted against the trial index.
    """
    # Agent chooses by the greedy policy
    # Result buffers are indexed [run, session, trial].
    rewards = np.zeros((len(epsilons), num_sessions, num_trials))
    num_best = np.zeros((len(epsilons), num_sessions, num_trials))
    for i in range(len(epsilons)):
        policy = GreedyPolicy()
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        # env.run() yields two arrays that fill one [session, trial] slice each.
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # NOTE(review): the original indentation was lost, so it is unclear whether
    # the plotting below belongs inside the loop above. It is placed after the
    # loop here, which means `i` is the index of the last run (and is undefined
    # if `epsilons` is empty) -- confirm against the original file.
    # Average over the session axis to get one value per trial.
    ave_reward = rewards[i, :, :].mean(axis=0)
    plt.plot(ave_reward)
    plt.title("Average Reward")
    plt.xlabel('Trial')
    plt.ylabel('Reward')
    plt.show()

    ave_percent_best = num_best[i, :, :].mean(axis=0)
    plt.plot(ave_percent_best)
    plt.title("Average Percent Best Option")
    plt.xlabel('Trial')
    plt.ylabel('Percent Best Option')
    plt.show()
def run_bandit(epsilon, n, num_trials, num_sessions):
    """Run one epsilon-greedy experiment on an ``n``-armed bandit and plot it.

    Builds an ``EpsilonGreedyPolicy(epsilon)``, a ``GaussianBandit(n)`` and an
    ``Agent``, runs them inside an ``Environment`` for ``num_sessions``
    sessions of ``num_trials`` trials, then shows two figures: one drawn by
    ``plot_ave_reward`` from the rewards and one drawn by
    ``plot_percent_best_action`` from the best-action record.
    """
    # Runs the bandit for a single epsilon, n
    exploration_policy = EpsilonGreedyPolicy(epsilon)
    arms = GaussianBandit(n)
    learner = Agent(n, exploration_policy, num_trials)
    simulation = Environment(arms, learner, num_trials, num_sessions)
    reward_history, best_choice_history = simulation.run()

    # Each plotting helper gets its own figure window.
    figures = (
        (plot_ave_reward, reward_history),
        (plot_percent_best_action, best_choice_history),
    )
    for draw, history in figures:
        draw(history)
        plt.show()
def compare_epsilons(n, epsilons):
    """Compare epsilon-greedy agents over several epsilon values for one ``n``.

    For every value in ``epsilons`` a fresh policy, bandit, agent and
    environment are created and run. The session and trial counts come from
    the module-level names ``num_sessions`` and ``num_trials``. Two figures
    are shown (session-averaged reward per trial, and session-averaged
    ``num_best`` per trial, one coloured line per epsilon). Finally the
    function prints the shape of the per-session cumulative-reward buffer,
    the shape of the summary table, and the table itself.

    @param n: Number of bandit arms, passed to ``GaussianBandit`` and ``Agent``.
    @param epsilons: Sequence of epsilon values to compare.
    """
    # Compare various values of epsilon for a fixed n
    # maximizer: epsilon = 1, complete exploration
    # satisficer: epsilon = 0, complete exploitation
    num_eps = len(epsilons)
    rewards = np.zeros((num_eps, num_sessions, num_trials))
    num_best = np.zeros((num_eps, num_sessions, num_trials))
    ave_reward = np.zeros((num_eps, num_trials))
    cum_reward = np.zeros(num_sessions)
    # One row per epsilon: [epsilon, mean cumulative reward over sessions].
    ave_cum_reward = np.zeros((num_eps, 2))

    for i, eps in enumerate(epsilons):
        policy = EpsilonGreedyPolicy(eps)
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # rcParams are read when a legend is created, so the font size has to be
    # set before the first plt.legend() call; setting it afterwards left the
    # first figure's legend at the default size.
    plt.rc('legend', fontsize='x-small')

    # One colour per epsilon, shared by both figures.
    colors = cm.rainbow(np.linspace(0, 1, num_eps))

    # Compare average reward across values of epsilon
    for i, eps in enumerate(epsilons):
        ave_reward[i, :] = rewards[i, :, :].mean(axis=0)
        plt.plot(ave_reward[i, :], label="Epsilon:" + str(eps), c=colors[i])
    plt.title("Average Reward" + ", n: " + str(n))
    plt.xlabel('Trial')
    plt.ylabel('Reward')
    plt.legend(loc="upper left")
    plt.show()

    # Compare how often the best option was chosen across values of epsilon
    for i, eps in enumerate(epsilons):
        ave_percent_best = num_best[i, :, :].mean(axis=0)
        plt.plot(ave_percent_best, label="Epsilon:" + str(eps), c=colors[i])
    plt.title("Average Percent Best Option" + ", n: " + str(n))
    plt.xlabel('Trial')
    plt.ylabel('Percent Best Option')
    plt.legend(loc="upper left")
    plt.show()

    for i, eps in enumerate(epsilons):
        # Total reward of each session: sum over the trial axis.
        cum_reward[:] = rewards[i, :, :].sum(axis=1)
        ave_cum_reward[i, :] = [eps, np.mean(cum_reward)]

    print(np.shape(cum_reward))
    print(np.shape(ave_cum_reward))
    print(ave_cum_reward)
def __init__(self, n: int, mu: float, std: float, noise: float) -> None:
    """
    Builds an environment holding ``n`` Gaussian bandit arms.

    Each arm is a ``GaussianBandit`` whose first argument is drawn from a
    normal distribution with mean ``mu`` and standard deviation ``std``,
    using a newly created NumPy default random generator.

    @param n: The number of bandit arms.
    @param mu: The mean of the bandits' true rewards.
    @param std: The standard deviation of the bandits' true rewards.
    @param noise: The standard deviation of the Gaussian noise around rewards.
    """
    super(GaussBanditEnvironment, self).__init__()
    self._mu, self._std, self._noise = mu, std, noise
    self._rng = np.random.default_rng()

    # Draw one true reward per arm, in order, and wrap it in a bandit.
    arms = []
    for _ in range(n):
        true_reward = self._rng.normal(self._mu, self._std)
        arms.append(GaussianBandit(true_reward, self._noise))
    self._bandits = arms
def compare_n(n_list):
    """Compare epsilon-greedy agents across several numbers of bandit arms.

    For every value in ``n_list`` a fresh policy, bandit, agent and
    environment are created and run. The exploration rate and the run sizes
    come from the module-level names ``epsilon``, ``num_sessions`` and
    ``num_trials``. Two figures are shown (session-averaged reward per trial,
    and session-averaged ``num_best`` per trial, one coloured line per ``n``).
    Finally the function prints the shape of the per-session
    cumulative-reward buffer, the shape of the summary table, and the table
    itself.

    @param n_list: Sequence of arm counts to compare.
    """
    # Compare across values of n
    num_settings = len(n_list)
    rewards = np.zeros((num_settings, num_sessions, num_trials))
    num_best = np.zeros((num_settings, num_sessions, num_trials))
    cum_reward = np.zeros(num_sessions)
    # One row per setting: [n, mean cumulative reward over sessions].
    ave_cum_reward = np.zeros((num_settings, 2))

    for i, n_arms in enumerate(n_list):
        policy = EpsilonGreedyPolicy(epsilon)
        bandit = GaussianBandit(n_arms)
        agent = Agent(n_arms, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # One colour per value of n, shared by both figures.
    colors = cm.rainbow(np.linspace(0, 1, num_settings))

    # Compare average reward across values of n
    for i, n_arms in enumerate(n_list):
        ave_reward = rewards[i, :, :].mean(axis=0)
        plt.plot(ave_reward, label="n:" + str(n_arms), c=colors[i])
    plt.title("Average Reward")
    plt.xlabel('Trial')
    plt.ylabel('Reward')
    plt.legend(loc="upper left")
    plt.show()

    # Compare how often the best option was chosen across values of n
    for i, n_arms in enumerate(n_list):
        ave_percent_best = num_best[i, :, :].mean(axis=0)
        plt.plot(ave_percent_best, label="n:" + str(n_arms), c=colors[i])
    plt.title("Average Percent Best Option")
    plt.xlabel('Trial')
    plt.ylabel('Percent Best Option')
    plt.legend(loc="upper left")
    plt.show()

    for i, n_arms in enumerate(n_list):
        # Total reward of each session: sum over the trial axis.
        cum_reward[:] = rewards[i, :, :].sum(axis=1)
        ave_cum_reward[i, :] = [n_arms, np.mean(cum_reward)]

    print(np.shape(cum_reward))
    print(np.shape(ave_cum_reward))
    print(ave_cum_reward)
def reset(self) -> None:
    """
    Replaces every arm with a newly drawn one, keeping the number of arms.

    Each new ``GaussianBandit`` gets a first argument sampled from a normal
    distribution with mean ``self._mu`` and standard deviation ``self._std``,
    and ``self._noise`` as its second argument.
    """
    arm_count = len(self._bandits)
    fresh_arms = []
    for _ in range(arm_count):
        true_reward = self._rng.normal(self._mu, self._std)
        fresh_arms.append(GaussianBandit(true_reward, self._noise))
    self._bandits = fresh_arms
# NOTE(review): the statements down to the "# plt.show()" marker look like the
# tail of a plotting helper whose `def` line is outside this view (`ax1` and
# `alpha` are not defined here). Their original indentation is unknown --
# confirm against the full file before relying on this layout.
ax1.grid()
# One legend entry per value in `alpha`.
ax1.legend(['alpha = {}'.format(i) for i in alpha], loc='upper right')
ax1.set_title('Average reward vs. alpha (learning rate)')
ax1.set_xlabel('Iterations')
ax1.set_ylabel('Average reward')
# plt.show()


# Script entry point: runs the three experiments below and shows all figures
# with a single plt.show() at the end.
if __name__ == "__main__":
    NUM_ARMS = 5       # passed as num_arms to both bandits
    SIG = 5.0          # passed as sig to both bandits
    AMP = 2.0          # passed as change_amp to the unstable bandit
    INTERVAL = 5000    # passed as change_interval to the unstable bandit
    EPOCH = 100000     # passed as n_epoch to test_unstable_bandit
    SEED = 2020        # passed as seed to both bandits

    toy_bandit = GaussianBandit(num_arms=NUM_ARMS, sig=SIG, seed=SEED)
    print(toy_bandit.centers)

    print("Testing epsilon greedy method ... ")
    test_epsilon_greedy(toy_bandit)

    print("Testing UCB method ... ")
    test_ucb_select(toy_bandit)

    # Values handed to test_unstable_bandit as its `alpha` argument.
    ALPHA = [0, 0.05, 0.1, 0.2, 1.0]
    # ALPHA = [0.1]
    print("Testing different alpha (learning rate) in unstable bandit ... ")
    unstable_bandit = UnstableGaussianBandit(num_arms=NUM_ARMS, sig=SIG, change_interval=INTERVAL, change_amp=AMP, seed=SEED)
    test_unstable_bandit(unstable_bandit, alpha=ALPHA, n_epoch=EPOCH)

    plt.show()
regret[i][:temp, :] = reg_i[:temp, :] else: temp = 0 start_it = np.min((start_it, temp)) if start_it == (n_runs + 1): start_it = 0 if start_it == 0: regret = {i: np.zeros((n_runs, H * T)) for i in range(len(policies))} else: start_it = 0 # switch_sequences = np.random.randint(0, M, (n_runs-start_it, H)).astype('int').reshape((n_runs-start_it, H)) switch_sequences = np.array([[(i + j) % M for j in range(H)] for i in range(n_runs)]) # switch_sequences = np.zeros((n_runs, H)).astype('int') bandits = [GaussianBandit(mu) for mu in models] datasets = [build_dataset(bandits, T, H, seq) for seq in switch_sequences] agent = {i: SwitchingAgent(bandits, pi, T) for i, pi in enumerate(policies)} FPR = np.zeros((n_runs, T - K)) TPR = np.zeros((n_runs, T - K)) NEG = np.zeros((n_runs, T - K)) for it in tqdm(range(start_it, n_runs)): data = datasets[it - start_it] switch_seq = switch_sequences[it - start_it] for i, pi in enumerate(policies): agent[i].run(data, switch_seq) regret[i][it] = agent[i].regret if pi.__str__() == 'KLUCB-RB': FPR[it] = agent[i].fp_rate TPR[it] = agent[i].tp_rate
# NOTE(review): the statements down to the "# plt.show()" marker look like the
# tail of a drawing helper whose `def` line is outside this view (`fig`, `ax1`,
# `ls`, `cntr` and `num_arms` are not defined here). The original indentation
# was lost; each `for` loop is assumed to contain only its plot call --
# confirm against the full file.
ax2 = fig.add_subplot(212)
fig.subplots_adjust(wspace=None, hspace=0.3)

# One line per arm on the upper axes.
for i in range(num_arms):
    ax1.plot(ls[i])
ax1.grid()
ax1.set_title('Rewards of Each Arm')
ax1.set_xlabel('Iterations')
ax1.set_ylabel('Reward')

# reward mean change curve
for i in range(num_arms):
    ax2.plot(cntr[i])
ax2.grid()
ax2.set_title('Mean of Each Arm of Gaussian Bandit (showing unstability)')
ax2.set_xlabel('Iterations')
ax2.set_ylabel('Current Mean of Reward')
# plt.show()


# Script entry point: a quick manual check of both bandit classes.
if __name__ == "__main__":
    # test normal Gaussian Bandit
    toy_bandit = GaussianBandit(num_arms=8)
    print(toy_bandit.centers)
    # Print one reward each for arguments 1 and 2.
    print(toy_bandit.get_reward(1))
    print(toy_bandit.get_reward(2))

    # test unstable Gaussian Bandit
    toy_bandit = UnstableGaussianBandit(num_arms=3, sig=2.0, change_interval=100)
    draw_unstable_bandit(toy_bandit)

    plt.show()
# aver_reward_list.append(toy_bandit.centers[0]) for each_arm in range(num_arms): if each_arm == arm_id: act_selection_aver[each_arm, i] = act_selection_aver[each_arm, max(i-1, 0)] + 1 else: act_selection_aver[each_arm, i] = act_selection_aver[each_arm, max(i-1, 0)] act_selection_aver = act_selection_aver / (np.array([[range(n_epoch)] * num_arms])[0,:,:] + 1) return q_list, aver_reward_list, act_selection_aver if __name__ == "__main__": NUM_ARMS = 5 SIG = 1.0 toy_bandit = GaussianBandit(num_arms=NUM_ARMS, sig=SIG) print(toy_bandit.centers) q_list, aver_reward_list, act_selection_aver \ = bandit_algorithm(toy_bandit, n_epoch=200, warm_up=True, epsilon=0.1) fig = plt.figure(figsize=(10,10)) ax1 = fig.add_subplot(211) ax2 = fig.add_subplot(212) fig.subplots_adjust(wspace=None, hspace=0.3) ax1.plot(aver_reward_list) ax1.grid() ax1.set_title('Average reward vs. Iter') ax1.set_xlabel('Iterations') ax1.set_ylabel('Average reward') for each_arm in range(NUM_ARMS): ax2.plot(act_selection_aver[each_arm]) ax2.grid()