Example #1
    mp.set_start_method('spawn')

    print('Stationary greedy started...')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:

        def func(x):
            return np.array(pool.map(EpsGreedy(eps=x).rews_opts_stat, args))

        result = [func(eps) for eps in epsilons]
        # get 3 (2000, 2, 1000)-shaped arrays, axis=1 stands for rewards and optimals

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # get the average rewards
    rewards = [pair[:, 0, :].mean(axis=0) for pair in result]
    # get the percentage of the optimal actions
    optimals = [Bandit.percent(pair[:, 1, :]) for pair in result]

    # plotting
    colors = ('green', 'blue', 'red')
    labels = (r'$\varepsilon=0$ (greedy)', r'$\varepsilon=0.1$',
              r'$\varepsilon=0.01$')

    Bandit.plot(rewards, labels, 'Average reward', colors=colors)
    Bandit.plot(optimals, labels, '% Optimal action', colors=colors)

    plt.show()
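A note on what this snippet assumes: `EpsGreedy`, `Bandit`, `args`, and `epsilons` come from code not shown on this page, and with the 'spawn' start method the block must run under an `if __name__ == '__main__':` guard so the workers can re-import the module. Below is a minimal, hypothetical skeleton of the interface the `pool.map` call relies on; the agent internals are an assumption (a standard sample-average epsilon-greedy run on a stationary 10-armed testbed), not the page's actual class:

import numpy as np

class EpsGreedy:
    # Hypothetical skeleton, not the real class behind this example.
    # pool.map needs a picklable callable under 'spawn', which is why a
    # bound method of a module-level class works here.
    def __init__(self, eps, k=10):
        self.eps = eps
        self.k = k

    def rews_opts_stat(self, steps):
        # One run on a stationary k-armed testbed: returns a (2, steps)
        # array of rewards and optimal-action flags, matching the
        # (runs, 2, steps) shape noted in the comment above.
        rng = np.random.default_rng()
        q_true = rng.normal(size=self.k)      # true action values
        q_est = np.zeros(self.k)              # sample-average estimates
        counts = np.zeros(self.k)
        best = q_true.argmax()
        rewards = np.empty(steps)
        optimals = np.empty(steps)
        for t in range(steps):
            if rng.random() < self.eps:
                a = int(rng.integers(self.k))  # explore
            else:
                a = int(q_est.argmax())        # exploit
            r = rng.normal(q_true[a])
            counts[a] += 1
            q_est[a] += (r - q_est[a]) / counts[a]
            rewards[t], optimals[t] = r, float(a == best)
        return np.stack([rewards, optimals])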
Example #2
            print(f'done in {round(t2 - t1, 3)} sec')

        t3 = time.perf_counter()
        print(f'Overall execution time {round(t3 - t0, 3)} sec')

    # plotting
    # labels and colors
    labels = (r'$\varepsilon$-greedy, $\varepsilon$', 'constant step\n'
              r'$\varepsilon$-greedy $\alpha=0.1$, $\varepsilon$',
              r'gradient bandit, $\alpha$', r'UCB, $c$', 'optimistic greedy\n'
              r'$\alpha=0.1, Q_0$')
    ylabel = 'Average reward over\n last 100 000 steps'
    xlabel = r'$\varepsilon, \alpha, c, Q_0$'
    colors = ('red', 'purple', 'green', 'blue', 'black')

    # x-axis values matching each algorithm's parameter slice
    x = [
        list(range(10)[start:stop]) for (start, stop) in param_slices.values()
    ]
    # plots
    ax = Bandit.plot(rewards.values(),
                     labels,
                     ylabel,
                     datax=x,
                     xlabel=xlabel,
                     colors=colors,
                     fig_size=(15, 8))
    plt.xticks(range(10), x_ticks)

    plt.show()
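This excerpt starts mid-loop, so `rewards`, `param_slices`, and `x_ticks` are built by earlier code not shown here; the plot appears to reproduce the classic parameter study where each algorithm is swept over its own contiguous slice of a shared ten-point parameter grid. A hypothetical setup consistent with the slicing logic above (all keys and slice bounds below are assumptions, shaped only to satisfy the code):

# ten parameter settings 2^-7 .. 2^2, shared by all algorithms
x_ticks = [f'$2^{{{p}}}$' for p in range(-7, 3)]
# (start, stop) index pairs into that grid, one per algorithm
param_slices = {
    'eps greedy': (0, 6),
    'const step': (0, 6),
    'gradient': (2, 9),
    'ucb': (3, 10),
    'optimistic': (5, 10),
}
# rewards would be a dict with the same keys, each value an array of
# average rewards, one entry per parameter setting in its slice.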
Example #3
    runs = int(2e3)  # the number of different bandit experiments
    steps = int(1e3)  # number of learning iterations in a single experiment
    args = [steps] * runs

    # comment this line out on Windows or OS X, where 'spawn' is already the default start method
    mp.set_start_method('spawn')

    print('Upper confidence bound started...')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:
        ucb = np.array(pool.map(UCB(c=2).rewards_stat, args))
        greedy = np.array(pool.map(EpsGreedy(eps=0.1).rewards_stat, args))

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # get the averages
    ucb = ucb.mean(axis=0)
    greedy = greedy.mean(axis=0)

    # plot
    labels = (r'UCB, $c=2$', r'$\varepsilon$-greedy, $\varepsilon=0.1$')
    Bandit.plot((ucb, greedy),
                labels,
                'Average reward',
                colors=('blue', 'grey'))

    plt.show()
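`UCB(c=2).rewards_stat` is defined elsewhere; a plausible reading of the action-selection rule it implements is the standard UCB1-style bound, sketched below with every name assumed rather than taken from this page:

import numpy as np

def ucb_action(q_est, counts, t, c=2.0):
    # Hypothetical helper: pick the arm maximising the upper confidence
    # bound Q(a) + c * sqrt(ln t / N(a)), where t is the 1-based step
    # count; arms that have never been tried are selected first.
    untried = np.flatnonzero(counts == 0)
    if untried.size:
        return int(untried[0])
    return int(np.argmax(q_est + c * np.sqrt(np.log(t) / counts)))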
Example #4
        bl01 = np.array(
            pool.map(
                GradientBaseline(true_value=4, alpha=0.1).optimals_stat, args))
        bl04 = np.array(
            pool.map(
                GradientBaseline(true_value=4, alpha=0.4).optimals_stat, args))
        no_bl01 = np.array(
            pool.map(
                GradientNoBaseline(true_value=4, alpha=0.1).optimals_stat,
                args))
        no_bl04 = np.array(
            pool.map(
                GradientNoBaseline(true_value=4, alpha=0.4).optimals_stat,
                args))

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    result = [bl01, bl04, no_bl01, no_bl04]
    # get percentages
    result = [Bandit.percent(i) for i in result]

    # plotting
    labels = (r'with baseline, $\alpha=0.1$', r'with baseline, $\alpha=0.4$',
              r'without baseline, $\alpha=0.1$',
              r'without baseline, $\alpha=0.4$')
    colors = ('blue', 'cornflowerblue', 'sienna', 'tan')

    Bandit.plot(result, labels, '% Optimal action', colors=colors)
    plt.show()
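`GradientBaseline` and `GradientNoBaseline` are not shown on this page; the update they presumably apply is the gradient-bandit preference step, where the baseline is the running average reward when enabled and zero otherwise. The `true_value=4` argument suggests the testbed's reward mean is shifted upward, which is exactly the setting where the baseline's presence or absence matters most. A sketch of one such update, with every name below an assumption:

import numpy as np

def gradient_update(h, action, reward, avg_reward, alpha=0.1, baseline=True):
    # Hypothetical preference update: h <- h + alpha*(R - b)*(1{a=A} - pi),
    # with b the running average reward (baseline) or 0 (no baseline).
    pi = np.exp(h - h.max())
    pi /= pi.sum()                       # softmax action probabilities
    b = avg_reward if baseline else 0.0
    grad = -pi
    grad[action] += 1.0                  # indicator minus probability
    return h + alpha * (reward - b) * grad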