Example #1
def fig2_3():
    steps = int(1e3)
    runs = int(2e3)

    # comment out this line on Windows or macOS ('spawn' is already the default there)
    mp.set_start_method('spawn')

    print('Optimistic vs realistic started...')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:
        real = np.array(pool.starmap(realistic, [(steps, 0.1, 0.1)] * runs))
        opt = np.array(pool.starmap(optimistic, [(steps, 0.1)] * runs))

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # percentage of optimal actions
    real = percent(real)
    opt = percent(opt)

    # plotting
    labels = ('Realistic, greedy' '\n' r'$Q_1=0, \varepsilon=0$',
              r'Optimistic, $\varepsilon$-greedy' '\n' r'$Q_1=5, \varepsilon=0.1$')

    plot((real, opt),
         labels,
         '% Optimal action',
         colors=('grey', 'dodgerblue'))

    plt.show()
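This and the following examples call worker functions (realistic, optimistic, grad_bline, ucb, ...) and a percent aggregator that are defined elsewhere in the repository. As a rough orientation only, here is a minimal sketch of what optimistic and percent could look like on the standard 10-armed Gaussian testbed; the signatures are guessed from the starmap calls above, and the k and q_init defaults are assumptions:

import numpy as np

def optimistic(steps, alpha, k=10, q_init=5.0):
    """Greedy agent with optimistic initial estimates (sketch, not the repo's code).

    Returns a 0/1 array flagging the steps on which the optimal arm was chosen.
    """
    q_true = np.random.normal(0.0, 1.0, k)       # true action values of the testbed
    best = np.argmax(q_true)                     # index of the optimal arm
    q_est = np.full(k, q_init)                   # optimistic estimates, Q_1 = 5
    hits = np.zeros(steps)
    for t in range(steps):
        a = np.argmax(q_est)                     # purely greedy selection
        reward = np.random.normal(q_true[a], 1.0)
        q_est[a] += alpha * (reward - q_est[a])  # constant step-size update
        hits[t] = (a == best)
    return hits

def percent(flags):
    """Average 0/1 optimal-action flags over runs and convert to percent."""
    return flags.mean(axis=0) * 100

A realistic worker would differ only in starting from Q_1 = 0 and exploring with probability epsilon.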
Example #2
def fig2_5():
    runs = int(2e3)
    steps = int(1e3)

    # comment out this line on Windows or macOS ('spawn' is already the default there)
    mp.set_start_method('spawn')

    print('Started gradient bandit...')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:
        bl01 = np.array(pool.starmap(grad_bline, [(steps, 0.1)] * runs))
        bl04 = np.array(pool.starmap(grad_bline, [(steps, 0.4)] * runs))
        no_bl01 = np.array(pool.starmap(grad_no_bline, [(steps, 0.1)] * runs))
        no_bl04 = np.array(pool.starmap(grad_no_bline, [(steps, 0.4)] * runs))

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    result = [bl01, bl04, no_bl01, no_bl04]
    # get percentages
    result = [percent(i) for i in result]

    # plotting
    labels = (r'with baseline, $\alpha=0.1$', r'with baseline, $\alpha=0.4$',
              r'without baseline, $\alpha=0.1$',
              r'without baseline, $\alpha=0.4$')
    colors = ('blue', 'cornflowerblue', 'sienna', 'tan')

    plot(result, labels, '% Optimal action', colors=colors)
    plt.show()
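For reference, this is the gradient-bandit update that grad_bline presumably implements: a sketch under the same testbed assumptions as above. The +4 reward shift follows the textbook's version of this figure and may differ from the repository's code.

import numpy as np

def grad_bline(steps, alpha, k=10):
    """Gradient bandit with an average-reward baseline (illustrative sketch only).

    Returns a 0/1 array flagging optimal-action choices, one entry per step.
    """
    q_true = np.random.normal(4.0, 1.0, k)         # testbed shifted to mean +4
    best = np.argmax(q_true)
    h = np.zeros(k)                                # action preferences H(a)
    baseline = 0.0                                 # running average reward
    hits = np.zeros(steps)
    for t in range(steps):
        pi = np.exp(h - h.max())
        pi /= pi.sum()                             # softmax over preferences
        a = np.random.choice(k, p=pi)
        reward = np.random.normal(q_true[a], 1.0)
        baseline += (reward - baseline) / (t + 1)  # incremental mean of rewards
        indicator = np.zeros(k)
        indicator[a] = 1.0
        h += alpha * (reward - baseline) * (indicator - pi)  # preference update
        hits[t] = (a == best)
    return hits

A grad_no_bline worker would be the same with the baseline fixed at zero, which is exactly the comparison the figure makes.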
Example #3
def fig2_4():
    runs = int(2e3)  # the number of different bandit experiments
    steps = int(1e3)  # number of learning iterations in a single experiment

    # comment out this line on Windows or macOS ('spawn' is already the default there)
    mp.set_start_method('spawn')

    print('Start upper confidence bound...')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:
        rewards_ucb = np.array(pool.starmap(ucb, [(steps, 2)] * runs))
        rewards_greedy = np.array(
            pool.starmap(eps_greedy, [(steps, 0.1)] * runs))

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # get the averages
    rewards_ucb = rewards_ucb.mean(axis=0)
    rewards_greedy = rewards_greedy.mean(axis=0)

    # plot
    labels = (r'UCB, $c=2$', r'$\varepsilon$-greedy, $\varepsilon=0.1$')
    plot((rewards_ucb, rewards_greedy),
         labels,
         'Average reward',
         colors=('blue', 'grey'))

    plt.show()
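The ucb worker is not shown on this page either; below is a self-contained sketch of upper-confidence-bound action selection that would produce per-step rewards compatible with the averaging above (all defaults are assumptions, not the repository's code):

import numpy as np

def ucb(steps, c, k=10):
    """UCB action selection on a 10-armed Gaussian testbed (sketch only).

    Returns the reward obtained at every step.
    """
    q_true = np.random.normal(0.0, 1.0, k)
    q_est = np.zeros(k)
    counts = np.zeros(k)
    rewards = np.zeros(steps)
    for t in range(steps):
        with np.errstate(divide='ignore', invalid='ignore'):
            bonus = c * np.sqrt(np.log(t + 1) / counts)
        # untried arms get infinite priority, the rest their exploration bonus
        a = np.argmax(np.where(counts == 0, np.inf, q_est + bonus))
        reward = np.random.normal(q_true[a], 1.0)
        counts[a] += 1
        q_est[a] += (reward - q_est[a]) / counts[a]   # sample-average update
        rewards[t] = reward
    return rewards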
Example #4
                print(f'{x}', end=' ')
                # mean reward across all runs
                arr = np.array(pool.starmap(locals()[method],
                                            [(steps, param)] * runs)).mean(axis=0)
                # overall mean reward for the last 100 000 steps
                rewards[method].append(arr[100000:].mean())

            t2 = time.perf_counter()
            print(f'done in {round(t2 - t1, 3)} sec')

        t3 = time.perf_counter()
        print(f'Overall execution time {round(t3 - t0, 3)} sec')

    # plotting
    # labels and colors
    labels = (r'$\varepsilon$-greedy, $\varepsilon$',
              'constant step\n' r'$\varepsilon$-greedy $\alpha=0.1$, $\varepsilon$',
              r'gradient bandit, $\alpha$',
              r'UCB, $c$',
              'optimistic greedy\n' r'$\alpha=0.1, Q_0$')
    ylabel = 'Average reward over\n last 100 000 steps'
    xlabel = r'$\varepsilon, \alpha, c, Q_0$'
    colors = ('red', 'purple', 'green', 'blue', 'black')

    # x axis values to correspond with parameter slices
    x = [list(range(10)[start:stop]) for (start, stop) in param_slices.values()]
    # plots
    ax = plot(rewards.values(), labels, ylabel, datax=x, xlabel=xlabel,
              colors=colors, fig_size=(15, 8))
    plt.xticks(range(10), x_ticks)

    plt.show()
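The excerpt above relies on a param_slices mapping and x_ticks labels built earlier in the script and not shown here. Purely as an illustration of the shape such data could take (the names are reused, but the values and slice bounds below are guesses, not the original ones), the log-2 spaced tick labels of the textbook's parameter study could be generated like this:

from fractions import Fraction

# ten candidate parameter values 2**-7 ... 2**2, shown on the x axis as fractions
powers = range(-7, 3)
x_ticks = [str(Fraction(2) ** p) for p in powers]   # '1/128', '1/64', ..., '2', '4'
# each method is swept over its own sub-range of those ten positions
param_slices = {'eps_greedy': (0, 6), 'ucb': (3, 10)}   # illustrative bounds only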
Example #5
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:
        sample_av = np.array(
            pool.starmap(sample_average, [(steps, 0.1)] * runs))
        const_step = np.array(
            pool.starmap(constant_step, [(steps, 0.1, 0.1)] * runs))
        # each pooled result has shape (2000, 2, 1000); axis 1 separates rewards from optimal-action flags

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # reshape the arrays to distinguish rewards and optimal actions
    sample_av = np.transpose(sample_av, (1, 0, 2))
    const_step = np.transpose(const_step, (1, 0, 2))

    # get average rewards
    rewards = (sample_av[0].mean(axis=0), const_step[0].mean(axis=0))

    # get optimal action percentage
    optimals = (percent(sample_av[1]), percent(const_step[1]))

    # plot
    labels = ('Sample average' '\n' r'$\varepsilon=0.1$',
              'Constant step-size' '\n' r'$\varepsilon=0.1, \alpha=0.1$')
    plot(rewards, labels, 'Average reward')
    plot(optimals, labels, '% Optimal action')

    plt.show()
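Finally, a sketch of the kind of worker this example expects: each call returns a (2, steps) array with rewards in row 0 and optimal-action flags in row 1, which is what gives the pooled (2000, 2, 1000) shape noted above. The random-walk testbed and all defaults here are assumptions; sample_average would be identical except that it divides by the visit count instead of using a fixed alpha.

import numpy as np

def constant_step(steps, eps, alpha, k=10):
    """eps-greedy agent with a constant step size on a drifting testbed (sketch)."""
    q_true = np.zeros(k)                   # all arms start equal, then drift
    q_est = np.zeros(k)
    out = np.zeros((2, steps))             # row 0: rewards, row 1: optimal flags
    for t in range(steps):
        if np.random.rand() < eps:
            a = np.random.randint(k)       # explore
        else:
            a = np.argmax(q_est)           # exploit
        reward = np.random.normal(q_true[a], 1.0)
        q_est[a] += alpha * (reward - q_est[a])        # constant-alpha update
        out[0, t] = reward
        out[1, t] = (a == np.argmax(q_true))
        q_true += np.random.normal(0.0, 0.01, k)       # random-walk drift
    return out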