# 'spawn' is already the default start method on Windows and macOS;
# comment the next line out when running there.
mp.set_start_method('spawn')

print('Start exercise 2.5... ')
start = time.perf_counter()
with mp.Pool(mp.cpu_count()) as pool:
    # Run both agents over every task in parallel; each result is a
    # (2000, 2, 1000) array whose axis=1 holds (rewards, optimals).
    sample_av = np.array(
        pool.map(EpsGreedy(eps=0.1).rews_opts_nonstat, args))
    const_step = np.array(
        pool.map(EpsGreedyConstant(eps=0.1, alpha=0.1).rews_opts_nonstat, args))
stop = time.perf_counter()
print(f'Done in {round(stop - start, 3)} sec')

# Average reward at each time step, across all runs.
rewards = (sample_av[:, 0, :].mean(axis=0),
           const_step[:, 0, :].mean(axis=0))
# Percentage of runs that chose the optimal action at each step.
optimals = (Bandit.percent(sample_av[:, 1, :]),
            Bandit.percent(const_step[:, 1, :]))

# Plot both curves for the two step-size strategies.
labels = ('Sample average\n' r'$\varepsilon=0.1$',
          'Constant step-size\n' r'$\varepsilon=0.1, \alpha=0.1$')
Bandit.plot(rewards, labels, 'Average reward')
Bandit.plot(optimals, labels, '% Optimal action')
plt.show()
# 'spawn' is already the default start method on Windows and macOS;
# comment the next line out when running there.
mp.set_start_method('spawn')

print('Stationary greedy started...')
t1 = time.perf_counter()
with mp.Pool(mp.cpu_count()) as pool:
    # One (2000, 2, 1000) array per epsilon; axis=1 holds (rewards, optimals).
    # (Direct comprehension replaces the previous one-shot closure `func`.)
    result = [
        np.array(pool.map(EpsGreedy(eps=eps).rews_opts_stat, args))
        for eps in epsilons
    ]
t2 = time.perf_counter()
print(f'Done in {round(t2 - t1, 3)} sec')

# Average reward at each time step, across all runs.
rewards = [pair[:, 0, :].mean(axis=0) for pair in result]
# Percentage of runs that chose the optimal action at each step.
optimals = [Bandit.percent(pair[:, 1, :]) for pair in result]

# Plot one curve per epsilon.  (Removed dead `colors` tuple: it was
# never passed to Bandit.plot in this script.)
labels = (r'$\varepsilon=0$ (greedy)',
          r'$\varepsilon=0.1$',
          r'$\varepsilon=0.01$')
Bandit.plot(rewards, labels, 'Average reward')
Bandit.plot(optimals, labels, '% Optimal action')
plt.show()
# Run the four gradient-bandit configurations (with/without baseline,
# alpha in {0.1, 0.4}) over all tasks in parallel.
# NOTE(review): `pool`, `t1` and the enclosing `with mp.Pool(...)` appear
# earlier in the file — confirm against the preceding chunk.
runs = [
    np.array(pool.map(agent(true_value=4, alpha=alpha).optimals_stat, args))
    for agent, alpha in ((GradientBaseline, 0.1),
                         (GradientBaseline, 0.4),
                         (GradientNoBaseline, 0.1),
                         (GradientNoBaseline, 0.4))
]
t2 = time.perf_counter()
print(f'Done in {round(t2 - t1, 3)} sec')

# Turn each run's optimal-action record into a per-step percentage.
result = [Bandit.percent(i) for i in runs]

# One curve per configuration, in the same order as `runs`.
labels = (r'with baseline, $\alpha=0.1$',
          r'with baseline, $\alpha=0.4$',
          r'without baseline, $\alpha=0.1$',
          r'without baseline, $\alpha=0.4$')
colors = ('blue', 'cornflowerblue', 'sienna', 'tan')
Bandit.plot(result, labels, '% Optimal action', colors=colors)
plt.show()