Python GaussianBandit примеры использования

Язык программирования: Python

Пространство имен/Пакет: bandit

Класс/Тип: GaussianBandit

Примеров на hotexamples.com: 10

Python GaussianBandit - 10 примеров найдено. Это лучшие примеры Python-кода для bandit.GaussianBandit, полученные из open-source-проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

GaussianBandit(10)

get_reward(1)

Основные методы

GaussianBandit (10)

get_reward (1)

Пример #1

Показать файл

Файл: run_and_plot.py Проект: erinbugbee/statistics_thesis

def greedy_policy():
    """Run the greedy agent once per entry of ``epsilons`` and plot one run.

    Reads the module-level ``epsilons``, ``n``, ``num_trials`` and
    ``num_sessions``. Every loop pass builds a fresh policy, bandit, agent
    and environment and stores the two arrays returned by ``env.run()``.
    Two figures are then shown one after the other: the reward and the
    best-option indicator, each averaged over the session axis.

    NOTE(review): only the slice written by the final loop pass is plotted,
    because the loop index is reused after the loop has finished -- confirm
    that this is intended. With an empty ``epsilons`` the index is never
    bound and the first plot raises.
    """

    def _show_curve(curve, heading, y_label):
        # One figure: draw, label, display -- in exactly this order.
        plt.plot(curve)
        plt.title(heading)
        plt.xlabel('Trial')
        plt.ylabel(y_label)
        plt.show()

    result_shape = (len(epsilons), num_sessions, num_trials)
    rewards = np.zeros(result_shape)
    num_best = np.zeros(result_shape)

    # The epsilon value itself is unused; it only sets the number of repeats.
    for i, _ in enumerate(epsilons):
        greedy = GreedyPolicy()
        arms = GaussianBandit(n)
        learner = Agent(n, greedy, num_trials)
        env = Environment(arms, learner, num_trials, num_sessions)
        run_rewards, run_best = env.run()
        rewards[i, :, :] = run_rewards
        num_best[i, :, :] = run_best

    # ``i`` still holds the index of the last pass here.
    _show_curve(rewards[i, :, :].mean(axis=0), "Average Reward", 'Reward')
    _show_curve(
        num_best[i, :, :].mean(axis=0),
        "Average Percent Best Option",
        'Percent Best Option',
    )

Пример #2

Показать файл

Файл: run_and_plot.py Проект: erinbugbee/statistics_thesis

def run_bandit(epsilon, n, num_trials, num_sessions):
    """Run one epsilon-greedy experiment and show its two summary plots.

    @param epsilon: value handed to ``EpsilonGreedyPolicy``.
    @param n: value handed to both ``GaussianBandit`` and ``Agent``.
    @param num_trials: handed to ``Agent`` and ``Environment``.
    @param num_sessions: handed to ``Environment``.
    """
    # Objects are created in the same order as before (policy, bandit,
    # agent, environment) in case any constructor draws random numbers.
    eps_policy = EpsilonGreedyPolicy(epsilon)
    arms = GaussianBandit(n)
    learner = Agent(n, eps_policy, num_trials)
    experiment = Environment(arms, learner, num_trials, num_sessions)

    reward_log, best_log = experiment.run()

    # First figure: what ``plot_ave_reward`` draws from the rewards.
    plot_ave_reward(reward_log)
    plt.show()

    # Second figure: what ``plot_percent_best_action`` draws from the
    # best-option record.
    plot_percent_best_action(best_log)
    plt.show()

Пример #3

Показать файл

Файл: run_and_plot.py Проект: erinbugbee/statistics_thesis

def compare_epsilons(n, epsilons):
    """Run one epsilon-greedy experiment per epsilon and compare the results.

    Uses the module-level ``num_sessions`` and ``num_trials``. For every
    entry of ``epsilons`` a fresh policy, bandit, agent and environment are
    built and run. Two figures are shown, each with one curve per epsilon
    (session-averaged reward per trial, then session-averaged best-option
    indicator per trial). Finally the mean over sessions of the summed
    rewards is printed for every epsilon.

    Author's note on the extremes: maximizer -- epsilon = 1, complete
    exploration; satisficer -- epsilon = 0, complete exploitation.

    @param n: handed to ``GaussianBandit`` and ``Agent``; shown in the titles.
    @param epsilons: sequence of values, one ``EpsilonGreedyPolicy`` each.
    """
    n_eps = len(epsilons)
    rewards = np.zeros((n_eps, num_sessions, num_trials))
    num_best = np.zeros((n_eps, num_sessions, num_trials))
    ave_reward = np.zeros((n_eps, num_trials))
    cum_reward = np.zeros(num_sessions)
    ave_cum_reward = np.zeros((n_eps, 2))

    # Collect the raw results of every experiment.
    for idx, eps in enumerate(epsilons):
        policy = EpsilonGreedyPolicy(eps)
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        run_rewards, run_best = env.run()
        rewards[idx, :, :] = run_rewards
        num_best[idx, :, :] = run_best

    # Figure 1: average reward per trial, one rainbow colour per epsilon.
    reward_colors = cm.rainbow(np.linspace(0, 1, n_eps))
    for idx, (eps, shade) in enumerate(zip(epsilons, reward_colors)):
        ave_reward[idx, :] = rewards[idx, :, :].mean(axis=0)
        plt.plot(ave_reward[idx, :], label="Epsilon:" + str(eps), c=shade)
        # The decoration calls stay inside the loop so the sequence of
        # pyplot calls is the same as before.
        plt.title("Average Reward" + ", n: " + str(n))
        plt.xlabel('Trial')
        plt.ylabel('Reward')
        plt.legend(loc="upper left")
        plt.rc('legend', fontsize='x-small')
    plt.show()

    # Figure 2: average best-option indicator per trial, same colouring.
    best_colors = cm.rainbow(np.linspace(0, 1, n_eps))
    for idx, (eps, shade) in enumerate(zip(epsilons, best_colors)):
        ave_percent_best = num_best[idx, :, :].mean(axis=0)
        plt.plot(ave_percent_best, label="Epsilon:" + str(eps), c=shade)
        plt.title("Average Percent Best Option" + ", n: " + str(n))
        plt.xlabel('Trial')
        plt.ylabel('Percent Best Option')
        plt.legend(loc="upper left")
        plt.rc('legend', fontsize='x-small')
    plt.show()

    # Table rows: (epsilon, mean over sessions of the per-session reward sum).
    for idx, eps in enumerate(epsilons):
        cum_reward[:] = [
            rewards[idx, session, :].sum() for session in range(num_sessions)
        ]
        ave_cum_reward[idx, :] = [eps, np.mean(cum_reward)]
    print(np.shape(cum_reward))
    print(np.shape(ave_cum_reward))
    print(ave_cum_reward)

Пример #4

Показать файл

    def __init__(self, n: int, mu: float, std: float, noise: float) -> None:
        """
        Build an environment holding ``n`` Gaussian bandit arms.

        Each arm is created as ``GaussianBandit(center, noise)``, where
        ``center`` is a single ``normal(mu, std)`` draw from a generator
        created here with ``np.random.default_rng()`` (no seed is passed).

        @param n: The number of bandit arms.
        @param mu: The mean of the bandits' true rewards.
        @param std: The standard deviation of the bandits' true rewards.
        @param noise: The standard deviation of the Gaussian noise around rewards.
        """
        super(GaussBanditEnvironment, self).__init__()
        self._mu, self._std, self._noise = mu, std, noise
        self._rng = np.random.default_rng()

        # One draw from the generator per arm, in arm order.
        arms = []
        for _ in range(n):
            center = self._rng.normal(self._mu, self._std)
            arms.append(GaussianBandit(center, self._noise))
        self._bandits = arms

Пример #5

Показать файл

Файл: run_and_plot.py Проект: erinbugbee/statistics_thesis

def compare_n(n_list):
    """Run one epsilon-greedy experiment per arm count and compare the results.

    Uses the module-level ``epsilon``, ``num_sessions`` and ``num_trials``.
    For every entry of ``n_list`` a fresh policy, bandit, agent and
    environment are built and run. Two figures are shown, each with one
    curve per entry (session-averaged reward per trial, then
    session-averaged best-option indicator per trial). Finally the mean over
    sessions of the summed rewards is printed for every entry.

    @param n_list: sequence of values, each handed to ``GaussianBandit`` and
        ``Agent``.
    """
    n_cases = len(n_list)
    rewards = np.zeros((n_cases, num_sessions, num_trials))
    num_best = np.zeros((n_cases, num_sessions, num_trials))
    cum_reward = np.zeros(num_sessions)
    ave_cum_reward = np.zeros((n_cases, 2))

    # Collect the raw results of every experiment.
    for idx, arm_count in enumerate(n_list):
        policy = EpsilonGreedyPolicy(epsilon)
        bandit = GaussianBandit(arm_count)
        agent = Agent(arm_count, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        run_rewards, run_best = env.run()
        rewards[idx, :, :] = run_rewards
        num_best[idx, :, :] = run_best

    # Figure 1: average reward per trial, one rainbow colour per value of n.
    reward_colors = cm.rainbow(np.linspace(0, 1, n_cases))
    for idx, (arm_count, shade) in enumerate(zip(n_list, reward_colors)):
        ave_reward = rewards[idx, :, :].mean(axis=0)
        plt.plot(ave_reward, label="n:" + str(arm_count), c=shade)
        # The decoration calls stay inside the loop so the sequence of
        # pyplot calls is the same as before.
        plt.title("Average Reward")
        plt.xlabel('Trial')
        plt.ylabel('Reward')
        plt.legend(loc="upper left")
    plt.show()

    # Figure 2: average best-option indicator per trial, same colouring.
    best_colors = cm.rainbow(np.linspace(0, 1, n_cases))
    for idx, (arm_count, shade) in enumerate(zip(n_list, best_colors)):
        ave_percent_best = num_best[idx, :, :].mean(axis=0)
        plt.plot(ave_percent_best, label="n:" + str(arm_count), c=shade)
        plt.title("Average Percent Best Option")
        plt.xlabel('Trial')
        plt.ylabel('Percent Best Option')
        plt.legend(loc="upper left")
    plt.show()

    # Table rows: (n, mean over sessions of the per-session reward sum).
    for idx, arm_count in enumerate(n_list):
        cum_reward[:] = [
            rewards[idx, session, :].sum() for session in range(num_sessions)
        ]
        ave_cum_reward[idx, :] = [arm_count, np.mean(cum_reward)]
    print(np.shape(cum_reward))
    print(np.shape(ave_cum_reward))
    print(ave_cum_reward)

Пример #6

Показать файл

 def reset(self) -> None:
     """
     Replace every arm with a newly drawn one; the number of arms is kept.

     Each new arm is ``GaussianBandit(center, self._noise)`` with ``center``
     drawn from ``self._rng.normal(self._mu, self._std)``.
     """
     arm_count = len(self._bandits)
     fresh_arms = []
     for _ in range(arm_count):
         center = self._rng.normal(self._mu, self._std)
         fresh_arms.append(GaussianBandit(center, self._noise))
     # Swap in the new list only once all arms were built.
     self._bandits = fresh_arms

Пример #7

Показать файл

Файл: experiments.py Проект: jzsherlock4869/Reinforcement-Learning-An-Introduction-Sutton-Barto-Code-Implementations

    ax1.grid()
    ax1.legend(['alpha = {}'.format(i) for i in alpha], loc='upper right')
    ax1.set_title('Average reward vs. alpha (learning rate)')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Average reward')
    # plt.show()


# Script entry point: exercise a stationary and an unstable Gaussian bandit.
if __name__ == "__main__":

    # Experiment settings; the trailing notes name the keyword each feeds.
    NUM_ARMS = 5  # num_arms of both bandits
    SIG = 5.0  # sig of both bandits
    AMP = 2.0  # change_amp of the unstable bandit
    INTERVAL = 5000  # change_interval of the unstable bandit
    EPOCH = 100000  # n_epoch of the unstable-bandit test
    SEED = 2020  # seed of both bandits

    # Stationary bandit: print its `centers`, then run both tests on the
    # same instance.
    toy_bandit = GaussianBandit(num_arms=NUM_ARMS, sig=SIG, seed=SEED)
    print(toy_bandit.centers)
    print("Testing epsilon greedy method ... ")
    test_epsilon_greedy(toy_bandit)
    print("Testing UCB method ... ")
    test_ucb_select(toy_bandit)

    # Learning rates passed as `alpha` to the unstable-bandit test.
    ALPHA = [0, 0.05, 0.1, 0.2, 1.0]
    # ALPHA = [0.1]
    print("Testing different alpha (learning rate) in unstable bandit ... ")
    unstable_bandit = UnstableGaussianBandit(num_arms=NUM_ARMS, sig=SIG, change_interval=INTERVAL, change_amp=AMP, seed=SEED)
    test_unstable_bandit(unstable_bandit, alpha=ALPHA, n_epoch=EPOCH)
    # Single display call at the very end.
    # NOTE(review): presumably the test functions only build their figures
    # and leave showing them to this call -- confirm.
    plt.show()

Пример #8

Показать файл

            regret[i][:temp, :] = reg_i[:temp, :]
        else:
            temp = 0
        start_it = np.min((start_it, temp))
    if start_it == (n_runs + 1):
        start_it = 0
    if start_it == 0:
        regret = {i: np.zeros((n_runs, H * T)) for i in range(len(policies))}
else:
    start_it = 0

# Disabled alternative: a random model index for every (run, step) pair.
# switch_sequences = np.random.randint(0, M, (n_runs-start_it, H)).astype('int').reshape((n_runs-start_it, H))
# Active schedule: row i is [(i + j) % M for j in range(H)], i.e. it starts
# at i % M and advances by one (modulo M) at every step.
switch_sequences = np.array([[(i + j) % M for j in range(H)]
                             for i in range(n_runs)])
# Disabled alternative: index 0 at every step of every run.
# switch_sequences = np.zeros((n_runs, H)).astype('int')
# One bandit per entry of `models`.
bandits = [GaussianBandit(mu) for mu in models]
# One dataset per row of the schedule.
datasets = [build_dataset(bandits, T, H, seq) for seq in switch_sequences]
# One agent per policy, keyed by the policy's position in `policies`.
agent = {i: SwitchingAgent(bandits, pi, T) for i, pi in enumerate(policies)}
# Per-run buffers of width T - K. Only FPR and TPR are written below;
# NEG is not touched in this part of the script.
FPR = np.zeros((n_runs, T - K))
TPR = np.zeros((n_runs, T - K))
NEG = np.zeros((n_runs, T - K))

# Execute runs start_it .. n_runs - 1.
# NOTE(review): the schedule and datasets hold n_runs rows but are indexed
# with `it - start_it`, so when start_it > 0 run `it` uses row
# `it - start_it` rather than row `it` -- confirm this is intended.
for it in tqdm(range(start_it, n_runs)):
    data = datasets[it - start_it]
    switch_seq = switch_sequences[it - start_it]
    for i, pi in enumerate(policies):
        agent[i].run(data, switch_seq)
        # Store this run's regret in row `it` of the policy's table.
        regret[i][it] = agent[i].regret
        # The rate statistics are copied only for the policy whose string
        # form is 'KLUCB-RB'.
        if pi.__str__() == 'KLUCB-RB':
            FPR[it] = agent[i].fp_rate
            TPR[it] = agent[i].tp_rate

Пример #9

Показать файл

Файл: test_bandit.py Проект: jzsherlock4869/Reinforcement-Learning-An-Introduction-Sutton-Barto-Code-Implementations

    ax2 = fig.add_subplot(212)
    fig.subplots_adjust(wspace=None, hspace=0.3)
    for i in range(num_arms):
        ax1.plot(ls[i])
    ax1.grid()
    ax1.set_title('Rewards of Each Arm')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Reward')
    # reward mean change curve
    for i in range(num_arms):
        ax2.plot(cntr[i])
    ax2.grid()
    ax2.set_title('Mean of Each Arm of Gaussian Bandit (showing unstability)')
    ax2.set_xlabel('Iterations')
    ax2.set_ylabel('Current Mean of Reward')
    # plt.show()


# Script entry point: quick manual check of both bandit classes.
if __name__ == "__main__":

    # test normal Gaussian Bandit
    # Print the bandit's `centers`, then one reward each for arguments 1 and 2.
    toy_bandit = GaussianBandit(num_arms=8)
    print(toy_bandit.centers)
    print(toy_bandit.get_reward(1))
    print(toy_bandit.get_reward(2))

    # test unstable Gaussian Bandit
    # `toy_bandit` is rebound to a new unstable bandit, which is handed to
    # the drawing helper before the figures are displayed.
    toy_bandit = UnstableGaussianBandit(num_arms=3, sig=2.0, change_interval=100)
    draw_unstable_bandit(toy_bandit)
    plt.show()

Пример #10

Показать файл

        # aver_reward_list.append(toy_bandit.centers[0])
        for each_arm in range(num_arms):
            if each_arm == arm_id:
                act_selection_aver[each_arm, i] = act_selection_aver[each_arm, max(i-1, 0)] + 1
            else:
                act_selection_aver[each_arm, i] = act_selection_aver[each_arm, max(i-1, 0)]
    act_selection_aver = act_selection_aver / (np.array([[range(n_epoch)] * num_arms])[0,:,:] + 1)

    return q_list, aver_reward_list, act_selection_aver


if __name__ == "__main__":

    NUM_ARMS = 5
    SIG = 1.0
    toy_bandit = GaussianBandit(num_arms=NUM_ARMS, sig=SIG)
    print(toy_bandit.centers)
    q_list, aver_reward_list, act_selection_aver \
                    = bandit_algorithm(toy_bandit, n_epoch=200, warm_up=True, epsilon=0.1)
    fig = plt.figure(figsize=(10,10))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    fig.subplots_adjust(wspace=None, hspace=0.3)
    ax1.plot(aver_reward_list)
    ax1.grid()
    ax1.set_title('Average reward vs. Iter')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Average reward')
    for each_arm in range(NUM_ARMS):
        ax2.plot(act_selection_aver[each_arm])
    ax2.grid()