def grid_sampling(theta, cm, K, Ke, N, epsilon):
    # Cross-entropy-style update: sample K parameter vectors around theta,
    # evaluate each over N episodes, and refit the mean and covariance to the
    # Ke elite samples.
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            grid = Grid()
            grid.pi_params = theta_list[x].reshape(23, 4)
            grid.softmax()
            epi = GridEpisode(grid)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))
    # keep the Ke samples with the highest average return
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    theta_final = np.zeros(92)
    cm_final = epsilon * np.identity(92)
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    J_final /= Ke
    return theta_final, cm_final, J_final

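# Usage sketch (not part of the original experiments): an outer cross-entropy-method
# loop that repeatedly calls grid_sampling and feeds the elite mean and covariance
# back in. All hyperparameter values here (K, Ke, N, epsilon, iteration count) are
# illustrative assumptions.
def cem_grid_demo(iterations=20, K=50, Ke=10, N=20, epsilon=2.0):
    theta = np.ones(92) * 0.25          # initial policy parameters (23 states x 4 actions)
    cm = epsilon * np.identity(92)      # initial sampling covariance
    best_j = -np.inf
    for it in range(iterations):
        theta, cm, j = grid_sampling(theta, cm, K, Ke, N, epsilon)
        best_j = max(best_j, j)
        print('iteration ', it, ', elite mean return: ', j)
    return theta, best_j
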
def grid_evaluate_table(table, N):
    # Average return over N episodes for a policy given as an already-shaped
    # (23, 4) parameter table (see grid_evaluate below for the flat-vector variant).
    avg_reward = 0
    for i in range(N):
        g = Grid()
        g.pi_params = table
        g.softmax()
        epi = GridEpisode(g)
        avg_reward += epi.run_all_steps()
    return avg_reward / N

def multi_grid_episode(table, l):
    # Worker for concurrent evaluation: runs one episode per element of l with the
    # given parameter table and pushes each return onto the module-level queue grid_q.
    for i in l:
        grid = Grid()
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        grid_q.put(epi.run_all_steps())
    return 0

def grid_evaluate(t, N):
    # Average return over N episodes for a policy given as a flat parameter vector,
    # reshaped here into the (23, 4) tabular form the Grid policy expects.
    reward_l = []
    table = t.reshape(23, 4)
    for i in range(N):
        grid = Grid()
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        reward_l.append(epi.run_all_steps())
    return sum(reward_l) / N

def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)
    # theta is a tabular representation of the policy (23 states x 4 actions)
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # generate one episode, until s is the terminal absorbing state
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1
        # REINFORCE update: for each visited (s, a), move the logit of the taken
        # action in the direction of the discounted return G_t, weighted by gamma^t
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]), actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma
        # evaluate the updated policy for one bounded episode
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards

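# Usage sketch (not part of the original experiments): run REINFORCE with an
# illustrative step size and a decaying schedule passed as eps; both numeric
# choices are assumptions.
def reinforce_demo(epoch=100):
    schedule = lambda x: max(0.1, 1.0 - 0.01 * x)
    rewards = reinforce_grid(0.05, schedule, epoch=epoch)
    print('mean return over the last 10 episodes: ', np.mean(rewards[-10:]))
    return rewards
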
def td_grid(lrs):
    tabular = np.zeros(23 * 4)
    grid = Grid()
    grid.pi_params = tabular.reshape(23, 4)
    grid.softmax()
    print('gridworld td')
    alpha_result = []
    for alpha in lrs:
        estimated_v = np.zeros(23)
        print('alpha = ', alpha)
        # update the value estimates with TD(0) for 100 episodes
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                estimated_v[i] += alpha * (r + grid.gamma * estimated_v[new_i] - estimated_v[i])
                s = new_s
                count += 1
        # measure the squared TD error over another 100 episodes
        td_list = []
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                td_list.append((r + grid.gamma * estimated_v[new_i] - estimated_v[i]) ** 2)
                s = new_s
                count += 1
            td_list.append(0)
        print('square td = ', np.mean(np.array(td_list)))
        alpha_result.append(np.mean(np.array(td_list)))
        print('##########################')
    return alpha_result

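# Usage sketch (assumption): sweep a few candidate step sizes with td_grid and report
# the one with the smallest mean squared TD error under the uniform random policy.
# The candidate values below are illustrative, not taken from the original experiments.
def td_alpha_sweep():
    alphas = [0.5, 0.25, 0.1, 0.05, 0.01]
    errors = td_grid(alphas)
    best = alphas[int(np.argmin(errors))]
    print('smallest mean squared TD error: ', min(errors), ' at alpha = ', best)
    return best
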
def sarsa_grid(lr, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    q = np.zeros((23, 4))
    for x in range(epoch):
        s = grid.d_zero()
        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = pe.softmax(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            # choose new_a from new_s using policy derived from q
            pi_temp = pe.softmax(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)]
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
            a = new_a
        # using q function to estimate the reward and add it to estimated_reward
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
    return estimated_rewards

def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    q = np.zeros((23, 4))
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    for x in range(epoch):
        s = grid.d_zero()
        while s != [5, 5]:
            # Choose a from s using a policy derived from q;
            pi_temp = pe.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)])
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
        grid.pi_params = pe.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards

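# Usage sketch (assumption): run SARSA and Q-learning with the same step size and the
# same decaying epsilon schedule so their learning curves can be compared episode by
# episode. The step size and schedule below are illustrative choices.
def compare_sarsa_qlearning(epoch=100):
    schedule = lambda x: 0.05 + 0.95 * (0.99 ** x)
    sarsa_curve = sarsa_grid(0.1, schedule, epoch=epoch)
    q_curve = qlearning_grid(0.1, schedule, epoch=epoch)
    print('final SARSA return: ', sarsa_curve[-1], ', final Q-learning return: ', q_curve[-1])
    return sarsa_curve, q_curve
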
def cartpole_evaluate(table, N):
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N


tic = time.time()
theta = np.ones(92) * 0.25
theta_f = grid_param_sampling(theta, 0.5, 200)
grid = Grid()
grid.pi_params = theta_f.reshape(23, 4)
grid.softmax()
episode = GridEpisode(grid)
print('optimized reward: ', episode.run_all_steps())
print('optimized theta: ', theta_f.reshape(23, 4))

# theta = np.ones(8) * 0.25
# theta_f = cartpole_sampling(theta, 0.5, 500)
# cartpole = CartPole()
# cartpole.pi_params = theta_f.reshape(4, 2)
# episode = CartPoleEpisode(cartpole)
# print('optimized reward: ', episode.run_all_steps())
# print('optimized theta: ', theta_f.reshape(4, 2))

def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    # Initialize tabular-q arbitrarily
    q = np.zeros((23, 4))
    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        # e ← 0
        e = np.zeros((23, 4))
        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            # choose new_a from new_s using policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # e ← γλe + ∂qw(s,a)/∂qw;
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # δ ← r + γqw(s′,a′) − qw(s,a);
            delta = (r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)]
                     - q[grid.get_index(s), actions.index(a)])
            # w ← w + αδe;
            q += lr * delta * e
            s = new_s
            a = new_a
        # using q function to estimate the reward and add it to estimated_rewards
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
    return estimated_rewards

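# Usage sketch (assumption): sweep a few eligibility-trace decay values for SARSA(λ)
# under a shared epsilon schedule; lambda = 0 should behave roughly like one-step SARSA.
# All numeric values here are illustrative.
def sarsa_lambda_sweep(epoch=100):
    schedule = lambda x: 0.05 + 0.95 * (0.99 ** x)
    curves = {}
    for lam in (0.0, 0.5, 0.9):
        curves[lam] = sarsa_lambda_grid(0.1, lam, schedule, epoch=epoch)
        print('lambda = ', lam, ', mean return over last 10 episodes: ',
              np.mean(curves[lam][-10:]))
    return curves
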
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)
    # Initialize tabular-v arbitrarily
    v = np.zeros(23)
    # theta is a tabular representation of the policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            # a ∼ π(s, ·);
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            # One-step (TD(0)) critic update: δ ← r + γ v(s′) − v(s)
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[grid.get_index(s)]
            v[grid.get_index(s)] += lr * delta
            # Actor update: push the logit of the taken action in the direction of δ
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            s = new_s
            count += 1
        # evaluate the current policy for one bounded episode
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards

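# Usage sketch (assumption): run the one-step actor-critic with an illustrative step size
# and softmax schedule and print a few checkpoints of the learning curve.
def actor_critic_demo(epoch=100):
    schedule = lambda x: max(0.1, 1.0 - 0.01 * x)
    rewards = actor_critic_grid(0.1, schedule, epoch=epoch)
    for checkpoint in (0, epoch // 2, epoch - 1):
        print('episode ', checkpoint, ', estimated return: ', rewards[checkpoint])
    return rewards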