Example #1
# Requires numpy; BrokenArmedBandit, QLearning, and BinsASRN are assumed to be
# importable from the accompanying project modules.
import numpy as np

def run_games(game_length, left_arm_mean, left_arm_std, n_players, right_arm_mean, right_arm_std, use_asrn, learning_rate=0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99):
    all_rewards = []
    all_goods = []
    all_losses = []
    all_q_tables = []
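    # Q-values a fully trained agent converges to for each arm:
    # the discounted sum of that arm's mean reward, mean / (1 - gamma)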
    trained_agent_q_values = [left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)]
 
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean, right_arm_mean=right_arm_mean, left_arm_std=left_arm_std, right_arm_std=right_arm_std)

        ## initialise with the true (converged) Q-values(!)
        left_initial_mean = trained_agent_q_values[0]
        right_initial_mean = trained_agent_q_values[1]

        q_learning = QLearning(left_initial_mean, right_initial_mean, learning_rate, gamma, epsilon, epsilon_decay)

        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        q_table = []

        if use_asrn:
            # let ASRN calibrate over the first tenth of the game
            asrn = BinsASRN(0, learning_period=game_length // 10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good

            q_table.append([q_learning.right_mean, q_learning.left_mean])
            
            reward = two_armed_bandit.pull(right)
            rewards[i] = reward

            if use_asrn:
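                # compute the Q-value the chosen arm would move to after this
                # reward, then let asrn.noise() replace the raw reward with a
                # noised version based on the current and prospective values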
                if right:
                    updated_right_mean = (1 - q_learning.learning_rate) * q_learning.right_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.right_mean)
                    reward = asrn.noise(q_learning.right_mean, updated_right_mean, reward)
                else:
                    updated_left_mean = (1 - q_learning.learning_rate) * q_learning.left_mean + q_learning.learning_rate * (reward + q_learning.gamma * q_learning.left_mean)
                    reward = asrn.noise(q_learning.left_mean, updated_left_mean, reward)

            loss = q_learning.update(right, reward)
            losses[i] = loss

        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        all_q_tables.append(q_table)

    return all_q_tables, all_rewards, all_goods, np.asarray(all_losses)
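
A minimal sketch of how this version might be called; the bandit parameters below are illustrative, not taken from the original experiment:

# 50 players, a 1000-step game, and a noisier (but higher-mean) right arm
all_q_tables, all_rewards, all_goods, all_losses = run_games(
    game_length=1000,
    left_arm_mean=1.0,
    left_arm_std=1.0,
    n_players=50,
    right_arm_mean=2.0,
    right_arm_std=10.0,
    use_asrn=True)
print(all_losses.shape)  # (50, 1000, 1)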

Example #2
# Requires numpy and matplotlib (for the debug plots); BrokenArmedBandit,
# QLearning, and BinsASRN are again assumed to come from the project modules.
import numpy as np
import matplotlib.pyplot as plt

def run_games(game_length,
              left_arm_mean,
              left_arm_std,
              n_players,
              right_arm_mean,
              right_arm_std,
              use_asrn,
              learning_rate=0.01,
              gamma=0.95,
              epsilon=1.0,
              epsilon_decay=0.99,
              debug=False,
              random_init=False):
    all_rewards = []
    all_goods = []
    all_losses = []
    trained_agent_q_values = [
        left_arm_mean / (1 - gamma), right_arm_mean / (1 - gamma)
    ]
    mx = np.max(trained_agent_q_values)
    mn = np.min(trained_agent_q_values)
    avg = 0
    std = mx - mn
    for j in range(n_players):
        two_armed_bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean,
                                             right_arm_mean=right_arm_mean,
                                             left_arm_std=left_arm_std,
                                             right_arm_std=right_arm_std)

        if random_init:
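            # random_init seeds the Q-values with an arbitrary +1 / -1 ordering:
            # only which arm starts "ahead" is randomised, not the magnitude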
            left_initial_mean = np.random.normal(avg, std)
            right_initial_mean = np.random.normal(avg, std)
            if left_initial_mean < right_initial_mean:
                left_initial_mean = -1
                right_initial_mean = 1
            else:
                left_initial_mean = 1
                right_initial_mean = -1
        else:
            ## initialise with the true (converged) Q-values(!)
            left_initial_mean = trained_agent_q_values[0]
            right_initial_mean = trained_agent_q_values[1]

        q_learning = QLearning(left_initial_mean, right_initial_mean,
                               learning_rate, gamma, epsilon, epsilon_decay)
        rewards = np.zeros((game_length, 1))
        goods = np.zeros((game_length, 1))
        losses = np.zeros((game_length, 1))
        debug_data = []

        if use_asrn:
            # let ASRN calibrate over the first tenth of the game
            asrn = BinsASRN(0, learning_period=game_length // 10)
        for i in range(game_length):
            right, reward_estimation = q_learning.choose()
            good = q_learning.right_mean > q_learning.left_mean
            goods[i] = good
            if debug:
                debug_data.append(
                    [right, q_learning.right_mean, q_learning.left_mean])
            reward = two_armed_bandit.pull(right)
            rewards[i] = reward

            if use_asrn:
                if right:
                    updated_right_mean = (
                        1 - q_learning.learning_rate
                    ) * q_learning.right_mean + q_learning.learning_rate * (
                        reward + q_learning.gamma * q_learning.right_mean)
                    reward = asrn.noise(q_learning.right_mean,
                                        updated_right_mean, reward)
                else:
                    updated_left_mean = (
                        1 - q_learning.learning_rate
                    ) * q_learning.left_mean + q_learning.learning_rate * (
                        reward + q_learning.gamma * q_learning.left_mean)
                    reward = asrn.noise(q_learning.left_mean,
                                        updated_left_mean, reward)

            loss = q_learning.update(right, reward)
            losses[i] = loss

        all_rewards.append(rewards)
        all_goods.append(goods)
        all_losses.append(losses)
        if debug:
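            # plot how this player's two Q-value estimates evolved over the game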
            debug_data = np.asarray(debug_data)[:, 1:]
            plt.plot(debug_data[:, 0], '-g')
            plt.plot(debug_data[:, 1], '-r')
            plt.legend(['Q r', 'Q l'])
            plt.show()

    return (np.asarray(all_rewards), np.asarray(all_goods),
            np.asarray(all_losses))
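
A similar illustrative call for this extended version, exercising the added random_init and debug switches (parameter values are again assumptions, not from the source):

# 10 players with randomised Q-value initialisation, ASRN enabled, and a
# per-player debug plot of the two Q-value estimates
rewards, goods, losses = run_games(
    game_length=1000,
    left_arm_mean=1.0,
    left_arm_std=1.0,
    n_players=10,
    right_arm_mean=2.0,
    right_arm_std=10.0,
    use_asrn=True,
    debug=True,
    random_init=True)
print(rewards.shape, goods.shape, losses.shape)  # each (10, 1000, 1)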