Example #1
File: ex2_5.py  Project: boldyshev/sutton
    # comment out this line on Windows or macOS, where 'spawn' is already the default start method
    mp.set_start_method('spawn')

    print('Start exercise 2.5... ')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:
        sample_av = np.array(pool.map(EpsGreedy(eps=0.1).rews_opts_nonstat, args))
        const_step = np.array(pool.map(EpsGreedyConstant(eps=0.1, alpha=0.1).rews_opts_nonstat, args))
        # each result is a (2000, 2, 1000)-shaped array: along axis 1, index 0 holds rewards and index 1 holds optimal-action flags

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # get average rewards
    rewards = (sample_av[:, 0, :].mean(axis=0),
               const_step[:, 0, :].mean(axis=0))

    # get optimal action percentage
    optimals = (Bandit.percent(sample_av[:, 1, :]),
                Bandit.percent(const_step[:, 1, :]))

    # plot
    labels = ('Sample average\n' r'$\varepsilon=0.1$',
              'Constant step-size\n' r'$\varepsilon=0.1, \alpha=0.1$')
    Bandit.plot(rewards, labels, 'Average reward')
    Bandit.plot(optimals, labels, '% Optimal action')

    plt.show()
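The excerpt above is a fragment of a larger script, so it relies on setup that is not shown. A rough sketch of that context follows; the import path for the bandit classes and the construction of `args` are guesses, not the repository's actual code.

import time
import multiprocessing as mp

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical import path; the real module layout in boldyshev/sutton may differ.
from bandit import Bandit, EpsGreedy, EpsGreedyConstant

# `args` holds one element per independent run. The (2000, 2, 1000) shape noted in
# the excerpt suggests 2000 runs of 1000 steps each (assumed values); what each
# worker actually receives is project-specific.
args = [1000] * 2000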
Example #2
    mp.set_start_method('spawn')

    print('Stationary greedy started...')
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool:

        def func(x):
            return np.array(pool.map(EpsGreedy(eps=x).rews_opts_stat, args))

        result = [func(eps) for eps in epsilons]
        # three (2000, 2, 1000)-shaped arrays, one per epsilon; along axis 1, index 0 holds rewards and index 1 holds optimal-action flags

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    # get the average rewards
    rewards = [pair[:, 0, :].mean(axis=0) for pair in result]
    # get the percentage of the optimal actions
    optimals = [Bandit.percent(pair[:, 1, :]) for pair in result]

    # plotting
    colors = ('green', 'blue', 'red')
    labels = (r'$\varepsilon=0$ (greedy)', r'$\varepsilon=0.1$',
              r'$\varepsilon=0.01$')

    Bandit.plot(rewards, labels, 'Average reward', colors=colors)
    Bandit.plot(optimals, labels, '% Optimal action', colors=colors)

    plt.show()
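The `epsilons` sequence mapped over in this excerpt is defined elsewhere in the script; judging from the plot labels, it is presumably something like the line below (inferred from the labels, not copied from the repository).

epsilons = (0, 0.1, 0.01)   # greedy, eps=0.1, eps=0.01 -- matches the labels above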
Example #3
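This excerpt begins inside a multiprocessing pool, so its opening lines are missing. They presumably follow the same pattern as Examples #1 and #2; the sketch below is an assumed reconstruction, not the repository's exact code.

    mp.set_start_method('spawn')

    print('Gradient bandit started...')  # assumed message; the original text is not shown
    t1 = time.perf_counter()

    with mp.Pool(mp.cpu_count()) as pool: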
        bl01 = np.array(
            pool.map(
                GradientBaseline(true_value=4, alpha=0.1).optimals_stat, args))
        bl04 = np.array(
            pool.map(
                GradientBaseline(true_value=4, alpha=0.4).optimals_stat, args))
        no_bl01 = np.array(
            pool.map(
                GradientNoBaseline(true_value=4, alpha=0.1).optimals_stat,
                args))
        no_bl04 = np.array(
            pool.map(
                GradientNoBaseline(true_value=4, alpha=0.4).optimals_stat,
                args))

    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')

    result = [bl01, bl04, no_bl01, no_bl04]
    # get percentages
    result = [Bandit.percent(i) for i in result]

    # plotting
    labels = (r'with baseline, $\alpha=0.1$', r'with baseline, $\alpha=0.4$',
              r'without baseline, $\alpha=0.1$',
              r'without baseline, $\alpha=0.4$')
    colors = ('blue', 'cornflowerblue', 'sienna', 'tan')

    Bandit.plot(result, labels, '% Optimal action', colors=colors)
    plt.show()