import numpy as np
import matplotlib.pyplot as plt


def run_games(game_length, left_arm_mean, left_arm_std, n_players,
              right_arm_mean, right_arm_std, use_asrn,
              learning_rate=0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99):
    all_rewards = []
    all_goods = []
    all_losses = []
    all_q_tables = []
    # Q-values of a fully trained agent: the expected discounted return of repeatedly pulling each arm.
    trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)]
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean, right_arm_mean=right_arm_mean,
                                             left_arm_std=left_arm_std, right_arm_std=right_arm_std)
        ## giving the real mean as initialization(!)
        left_initial_mean = trained_agent_q_values[0]
        right_initial_mean = trained_agent_q_values[1]
        q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate, gamma, epsilon, epsilon_decay)
        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        q_table = []
        if use_asrn:
            asrn = BinsASRN(0, learning_period=game_length / 10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            # A "good" step: the agent currently values the right arm higher than the left one.
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good
            q_table.append([q_learning.right_mean, q_learning.left_mean])
            reward = two_armed_bandit.pull(right)
            rewards[i] = reward
            if use_asrn:
                # Compute the would-be Q-value update and let ASRN replace the reward
                # with a (possibly noised) version before the agent learns from it.
                if right:
                    updated_right_mean = ((1 - q_learning.learning_rate) * q_learning.right_mean
                                          + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean))
                    reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward)
                else:
                    updated_left_mean = ((1 - q_learning.learning_rate) * q_learning.left_mean
                                         + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean))
                    reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward)
            loss = q_learning.update(right, reward)
            losses[i] = loss
        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        all_q_tables.append(q_table)
    return all_q_tables, all_rewards, all_goods, np.asarray(all_losses)
# Extended variant of run_games: adds optional random Q-value initialization
# (random_init) and a per-player debug plot of the two Q-value traces (debug).
def run_games(game_length, left_arm_mean, left_arm_std, n_players,
              right_arm_mean, right_arm_std, use_asrn,
              learning_rate=0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99,
              debug=False, random_init=False):
    all_rewards = []
    all_goods = []
    all_losses = []
    trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)]
    mx = np.max(trained_agent_q_values)
    mn = np.min(trained_agent_q_values)
    avg = 0
    std = mx - mn
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean, right_arm_mean=right_arm_mean,
                                             left_arm_std=left_arm_std, right_arm_std=right_arm_std)
        if random_init:
            # The random draw only decides which arm starts with the higher initial Q-value (+1 vs -1).
            left_initial_mean = np.random.normal(avg, std)
            right_initial_mean = np.random.normal(avg, std)
            if left_initial_mean < right_initial_mean:
                left_initial_mean = -1
                right_initial_mean = 1
            else:
                left_initial_mean = 1
                right_initial_mean = -1
        else:
            ## giving the real mean as initialization(!)
            left_initial_mean = trained_agent_q_values[0]
            right_initial_mean = trained_agent_q_values[1]
        q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate, gamma, epsilon, epsilon_decay)
        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        debug_data = []
        if use_asrn:
            asrn = BinsASRN(0, learning_period=game_length / 10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good
            if debug:
                debug_data.append([right, q_learning.right_mean, q_learning.left_mean])
            reward = two_armed_bandit.pull(right)
            rewards[i] = reward
            if use_asrn:
                if right:
                    updated_right_mean = ((1 - q_learning.learning_rate) * q_learning.right_mean
                                          + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean))
                    reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward)
                else:
                    updated_left_mean = ((1 - q_learning.learning_rate) * q_learning.left_mean
                                         + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean))
                    reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward)
            loss = q_learning.update(right, reward)
            losses[i] = loss
        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        if debug:
            # Plot this player's Q-value traces for the right (green) and left (red) arms.
            debug_data = np.asarray(debug_data)[:, 1:]
            plt.plot(debug_data[:, 0], '-g')
            plt.plot(debug_data[:, 1], '-r')
            plt.legend(['Q r', 'Q l'])
            plt.show()
    return np.asarray(all_rewards), np.asarray(all_goods), np.asarray(all_losses)
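# Example usage -- a minimal sketch, assuming BrokenArmedBandit, QLearning and
# BinsASRN are defined elsewhere in this codebase; the bandit parameters and
# player count below are illustrative values, not the original experiment settings.
if __name__ == '__main__':
    rewards, goods, losses = run_games(
        game_length=5000,        # steps per player
        left_arm_mean=1.0,       # illustrative arm reward distributions
        left_arm_std=1.0,
        n_players=20,            # independent agents to average over
        right_arm_mean=0.5,
        right_arm_std=4.0,
        use_asrn=True)           # route rewards through the ASRN noise scheme
    # Fraction of steps on which the right arm's Q-value exceeded the left arm's.
    print('fraction of steps preferring the right arm:', goods.mean())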