Example #1
def main():
    # arm1 = NormalArm(0.2, 1)
    # arm2 = NormalArm(0.3, 1)
    # arm3 = NormalArm(0.4, 1)
    # arm4 = NormalArm(0.5, 1)
    # arm5 = NormalArm(0.6, 1)
    # arm6 = NormalArm(0.7, 1)
    # arm7 = NormalArm(0.6, 1)
    # arm8 = NormalArm(0.5, 1)
    # arm9 = NormalArm(0.4, 1)
    # arm10 = NormalArm(0.1, 1)

    arm1 = BernoulliArm(0.2)
    arm2 = BernoulliArm(0.5)
    arm3 = BernoulliArm(0.9)
    arm4 = BernoulliArm(0.4)
    arm5 = BernoulliArm(0.4)
    arm6 = BernoulliArm(0.3)
    arm7 = BernoulliArm(0.2)
    arm8 = BernoulliArm(0.1)
    arms = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8]

    max_mu = max([arm.mu for arm in arms])
    n_arms = len(arms)
    print(f'Optimal arm: #{argmax([arm.mu for arm in arms]) + 1}')
    param_dict = {"epsilon": 0.5, "sigma": 1.96, "tau": 0.2, "gamma": 0.2}
    algo_epsilon = EpsilonGreedy(n_arms, param_dict)
    algo_anneal_epsilon = AnnealingEpsilonGreedy(n_arms, param_dict)
    algo_ucb1 = UCB1(n_arms, param_dict)
    algo_ucb_bayesian = UCB_Bayesian(n_arms, param_dict)  # 95% confident
    algo_softmax = Softmax(n_arms, param_dict)
    algo_anneal_softmax = AnnealingSoftmax(n_arms, param_dict)
    algo_exp3 = Exp3(n_arms, param_dict)
    algo_thompson = ThompsonSampling(n_arms, param_dict)

    algorithms = [algo_ucb1]
    algorithm_rewards = []  # 2D list[algo][t]: running avg. rewards for each algo at time-step t
    algorithm_cum_rewards = []  # 2D list[algo][t]: cumulative rewards for each algo at time-step t
    algorithm_arm_selections = []  # 2D list[algo][t]: arm selections for each algo at time-step t

    # experiment settings
    timesteps = 1000  # number of time-steps (T)
    total_iteration = 1  # number of independent runs (outer loop)
    algorithm_timestep_reward_stacked = np.zeros(timesteps, dtype=int)  # reward summed per time-step across runs

    for algo in algorithms:
        print(algo.get_name())
        avg_rewards, cum_rewards, arm_selections = [0], [0], []

        for i in range(total_iteration):
            arm_selections = []  # keep only the current iteration's selections
            # TODO: reinitialize this DYNAMICALLY!!
            algo = UCB1(n_arms, param_dict)  # reinitialize algorithm (clear previous memory)
            for t in range(timesteps):  # t is 0-based
                chosen_arm = algo.select_arm()
                arm_selections.append(chosen_arm + 1)  # convert 0-based index to 1-based
                reward = arms[chosen_arm].draw_reward()
                algorithm_timestep_reward_stacked[t] += reward  # accumulated across total_iteration runs
                new_avg = (avg_rewards[-1] * t + reward) / (t + 1)  # incremental running avg. over t + 1 rewards
                avg_rewards.append(new_avg)
                cum_rewards.append(new_avg * (t + 1))  # cumulative reward = running avg. * number of pulls
                algo.update(chosen_arm, reward)


        algorithm_rewards.append(avg_rewards)
        algorithm_cum_rewards.append(cum_rewards)
        algorithm_arm_selections.append(arm_selections)

        # Average the stacked per-time-step rewards over all iterations
        # (squash total_iteration x timesteps down to 1 x timesteps)
        average_reward_in_each_round = np.zeros(timesteps, dtype=float)
        for t in range(timesteps):
            average_reward_in_each_round[t] = algorithm_timestep_reward_stacked[t] / total_iteration

        cumulative_optimal_reward = 0.0
        cumulative_reward = 0.0
        x_axis = np.zeros(timesteps, dtype=int)
        regrets = np.zeros(timesteps, dtype=float)  # regret for each round

        # print(average_reward_in_each_round)

        for t in range(timesteps):
            x_axis[t] = t
            cumulative_optimal_reward += max_mu
            cumulative_reward += average_reward_in_each_round[t]
            # print(f"{cumulative_optimal_reward} \t {cumulative_reward}")
            regrets[t] = cumulative_optimal_reward - cumulative_reward

        plot_regret(x_axis, regrets, cumulative_optimal_reward,
                    cumulative_reward, average_reward_in_each_round, timesteps,
                    algo.get_name())
        print(
            f"The average regret for {algo.get_name()} is {cumulative_optimal_reward - cumulative_reward}"
        )

    max_cum_reward = max(
        [algorithm_cum_rewards[i][-1] for i in range(len(algorithms))])
    for i in range(len(algorithms)):
        print(
            f"{algorithms[i].get_name()}: {algorithm_cum_rewards[i][-1]:.2f}")

    plot_graph(timesteps, arms, algorithms, algorithm_rewards,
               algorithm_cum_rewards, algorithm_arm_selections, max_mu,
               max_cum_reward)
    plot_cum_rewards(algorithms, algorithm_cum_rewards, timesteps,
                     max_cum_reward)
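Note: the main() examples rely on arm classes and helpers (BernoulliArm, argmax, the algorithm classes, and the plotting functions) that are defined elsewhere in the project and are not shown here. Purely for orientation, below is a minimal sketch of a Bernoulli arm and an argmax helper consistent with how they are used above (arm.mu, arm.draw_reward()); the actual implementations in the source project may differ.

import random
import numpy as np


class BernoulliArm:
    """Arm that pays reward 1 with probability mu and 0 otherwise."""

    def __init__(self, mu):
        self.mu = mu  # expected reward; the examples read this as arm.mu

    def draw_reward(self):
        # Bernoulli draw: 1 with probability mu, else 0
        return 1 if random.random() < self.mu else 0


def argmax(values):
    # index of the largest value, used above to report the optimal arm
    return int(np.argmax(values))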
Example #2
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from algorithms.softmax import Softmax
from arms.bernoulli import BernoulliArm
from tests.test_framework import test_algorithm

algo = Softmax(0.1, 5)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
arms = [BernoulliArm(mu) for mu in means]  # build a list (not a lazy map object) so arms can be indexed
times, chosen_arms, rewards, cumulative_rewards = test_algorithm(algo, arms, 500)

# Accuracy of the Softmax algorithm (probability of selecting the best arm)
best_arms = [0.0 for _ in range(len(times))]
for t in times:
    if chosen_arms[t-1] == 4:
        if t == 1:
            best_arms[t-1] = 1.0
        else:
            best_arms[t-1] = 1.0 * (best_arms[t-2] * (t-1) + 1) / t
    else:
        if t == 1:
            best_arms[t-1] = 0.0
        else:
            best_arms[t-1] = 1.0 * best_arms[t-2] * (t-1) / t
plt.subplot(221)
plt.plot(times, best_arms)
plt.grid()

# Performance of the Softmax algorithm (running average reward)
average_rewards = [0.0 for _ in range(len(times))]
for t in times:
    # Reconstructed loop body: the same running-average update as the accuracy plot above
    if t == 1:
        average_rewards[t-1] = rewards[t-1]
    else:
        average_rewards[t-1] = (average_rewards[t-2] * (t-1) + rewards[t-1]) / t
plt.subplot(222)
plt.plot(times, average_rewards)
plt.grid()
plt.show()
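Example #2 also depends on tests.test_framework.test_algorithm, which is not shown. Judging only from how it is called and unpacked above (test_algorithm(algo, arms, 500) returning times, chosen_arms, rewards, cumulative_rewards), a plausible minimal sketch could look like the following; the actual framework may differ, e.g. it may reset the algorithm with initialize() or average over several simulations, and the arm's reward method may be named draw() or draw_reward() depending on the arm class.

def test_algorithm(algo, arms, horizon):
    # Run a single bandit simulation for `horizon` steps and record the trajectory.
    times, chosen_arms, rewards, cumulative_rewards = [], [], [], []
    total = 0.0
    for t in range(1, horizon + 1):
        arm = algo.select_arm()
        reward = arms[arm].draw()  # assumes the arm class exposes draw(); adjust if it is draw_reward()
        algo.update(arm, reward)
        total += reward
        times.append(t)
        chosen_arms.append(arm)
        rewards.append(reward)
        cumulative_rewards.append(total)
    return times, chosen_arms, rewards, cumulative_rewards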
Example #3
def main():
    # arm1 = NormalArm(0.2, 1)
    # arm2 = NormalArm(0.3, 1)
    # arm3 = NormalArm(0.4, 1)
    # arm4 = NormalArm(0.5, 1)
    # arm5 = NormalArm(0.6, 1)
    # arm6 = NormalArm(0.7, 1)
    # arm7 = NormalArm(0.6, 1)
    # arm8 = NormalArm(0.5, 1)
    # arm9 = NormalArm(0.4, 1)
    # arm10 = NormalArm(0.1, 1)
    arm1 = BernoulliArm(0.2)
    arm2 = BernoulliArm(0.5)
    arm3 = BernoulliArm(0.9)
    arm4 = BernoulliArm(0.4)
    arm5 = BernoulliArm(0.4)
    arm6 = BernoulliArm(0.3)
    arm7 = BernoulliArm(0.2)
    arm8 = BernoulliArm(0.1)
    arms = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8]
    n_arms = len(arms)
    print(f'Optimal arm: #{argmax([arm.mu for arm in arms]) + 1}')
    change_of_distribution = False

    algo_epsilon = EpsilonGreedy(0.05, n_arms)
    algo_anneal_epsilon = AnnealingEpsilonGreedy(n_arms)
    algo_ucb1 = UCB1(n_arms)
    algo_ucb_bayesian = UCB_Bayesian(1.96, n_arms)  # 95% confident
    algo_softmax = Softmax(.2, n_arms)
    algo_anneal_softmax = AnnealingSoftmax(n_arms)
    algo_exp3 = Exp3(.2, n_arms)
    algo_thompson = ThompsonSampling(n_arms)

    algorithms = [algo_epsilon, algo_ucb1, algo_thompson]
    algorithm_rewards = []  # 2D list[algo][t]: running avg. rewards for each algo at time-step t
    algorithm_cum_rewards = []  # 2D list[algo][t]: cumulative rewards for each algo at time-step t
    algorithm_arm_selections = []  # 2D list[algo][t]: arm selections for each algo at time-step t

    timesteps = 5000  # number of time-steps

    for algo in algorithms:
        avg_rewards, cum_rewards, arm_selections = [0], [0], []

        for t in range(1, timesteps):
            if change_of_distribution and t == timesteps // 2:  # change the reward distributions at half-time
                arms = change_distribution()
                print(f'Optimal arm: {argmax([arm.mu for arm in arms]) + 1}')
                algo.initialize(len(arms))
            chosen_arm = algo.select_arm()
            arm_selections.append(chosen_arm + 1)  # convert 0-based index to 1-based
            reward = arms[chosen_arm].draw_reward()
            new_avg = (avg_rewards[-1] * (t - 1) + reward) / t  # new running avg. (t is 1-based here)
            avg_rewards.append(new_avg)
            cum_rewards.append(new_avg * t)  # cumulative reward so far
            algo.update(chosen_arm, reward)
        algorithm_rewards.append(avg_rewards)
        algorithm_cum_rewards.append(cum_rewards)
        algorithm_arm_selections.append(arm_selections)

    max_mu = max([arm.mu for arm in arms])
    max_cum_reward = max(
        [algorithm_cum_rewards[i][-1] for i in range(len(algorithms))])
    for i in range(len(algorithms)):
        print(algorithms[i].get_name() + ":",
              f'{algorithm_cum_rewards[i][-1]:.2f}')

    plot_graph(timesteps, arms, algorithms, algorithm_rewards,
               algorithm_cum_rewards, algorithm_arm_selections, max_mu,
               max_cum_reward, change_of_distribution)
    plot_cum_rewards(algorithms, algorithm_cum_rewards, timesteps,
                     max_cum_reward)
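Both main() examples drive their bandit algorithms through the same small interface: select_arm(), update(chosen_arm, reward), initialize(n_arms), and get_name(). As a rough orientation, here is a minimal UCB1 sketch that matches the constructor form used in Example #3 (UCB1(n_arms)); the classes in the source project may differ in detail, e.g. Example #1 passes a param_dict instead.

import math


class UCB1:
    """Minimal UCB1: try every arm once, then pick argmax of mean + sqrt(2 ln n / n_i)."""

    def __init__(self, n_arms):
        self.initialize(n_arms)

    def initialize(self, n_arms):
        self.counts = [0] * n_arms    # number of pulls per arm
        self.values = [0.0] * n_arms  # empirical mean reward per arm

    def get_name(self):
        return "UCB1"

    def select_arm(self):
        # First, play any arm that has not been tried yet
        for arm, count in enumerate(self.counts):
            if count == 0:
                return arm
        total_counts = sum(self.counts)
        ucb_values = [
            self.values[arm] + math.sqrt(2 * math.log(total_counts) / self.counts[arm])
            for arm in range(len(self.counts))
        ]
        return ucb_values.index(max(ucb_values))

    def update(self, chosen_arm, reward):
        # Incrementally update the empirical mean of the chosen arm
        self.counts[chosen_arm] += 1
        n = self.counts[chosen_arm]
        self.values[chosen_arm] += (reward - self.values[chosen_arm]) / n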