import numpy as np

# Grid, GridEpisode, and the policy helpers used below (pe.softmax,
# estimation.softmax, estimation.epsilon_greedy) are assumed to be imported
# from this project's own modules.


def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    """Tabular Q-learning on the 23-state Gridworld.

    lr is the step size, eps(x) gives the exploration parameter for episode x,
    and the learned policy is evaluated each episode with a GridEpisode capped
    at `searchbound` steps. Returns the per-episode evaluation returns.
    """
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    # initialize the tabular q-function arbitrarily (here: all zeros)
    q = np.zeros((23, 4))
    for x in range(epoch):
        s = grid.d_zero()
        # for each time step, until s is the terminal absorbing state
        while s != [5, 5]:
            # choose a from s using the softmax policy derived from q
            pi_temp = pe.softmax(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # take action a and observe r and s'
            new_s, r = grid.P_and_R(s, a)
            si = grid.get_index(s)
            ai = actions.index(a)
            # q(s,a) <- q(s,a) + alpha * (r + gamma * max_a' q(s',a') - q(s,a))
            q[si, ai] += lr * (r + grid.gamma * np.max(q[grid.get_index(new_s)]) - q[si, ai])
            s = new_s
        # evaluate the policy derived from q and record its return for this episode
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
    return estimated_rewards
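# A minimal, self-contained sketch of the tabular Q-learning update used above,
# on a hypothetical 3-state / 2-action table. The states, transition, reward,
# step size, and discount here are illustrative assumptions, not the
# Gridworld's actual dynamics:
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
def _qlearning_update_sketch():
    q_toy = np.zeros((3, 2))            # Q-table: 3 states x 2 actions
    s_idx, a_idx, new_s_idx = 0, 1, 2   # one observed transition (s, a) -> s'
    r, alpha, gamma = 1.0, 0.1, 0.9     # assumed reward, step size, discount
    td_target = r + gamma * np.max(q_toy[new_s_idx])
    q_toy[s_idx, a_idx] += alpha * (td_target - q_toy[s_idx, a_idx])
    return q_toy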
def td_grid(lrs):
    """TD(0) policy evaluation on the Gridworld under a uniform softmax policy.

    For each step size alpha in `lrs`, runs 100 episodes of TD(0) updates and
    then estimates the mean squared TD error over another 100 episodes.
    Returns one mean squared TD error per alpha.
    """
    tabular = np.zeros(23 * 4)
    grid = Grid()
    grid.pi_params = tabular.reshape(23, 4)
    grid.softmax()
    print('gridworld td')
    alpha_result = []
    for alpha in lrs:
        estimated_v = np.zeros(23)
        print('alpha = ', alpha)
        # phase 1: update the tabular value estimate over 100 episodes
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                # v(s) <- v(s) + alpha * (r + gamma * v(s') - v(s))
                estimated_v[i] += alpha * (r + grid.gamma * estimated_v[new_i] - estimated_v[i])
                s = new_s
                count += 1
        # phase 2: estimate the mean squared TD error over another 100 episodes
        td_list = []
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                td_list.append((r + grid.gamma * estimated_v[new_i] - estimated_v[i]) ** 2)
                s = new_s
                count += 1
            # the TD error at the terminal absorbing state is zero
            td_list.append(0)
        mean_sq_td = np.mean(np.array(td_list))
        print('square td = ', mean_sq_td)
        alpha_result.append(mean_sq_td)
        print('##########################')
    return alpha_result
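# A hedged usage sketch for the step-size sweep above; the candidate alphas are
# an assumption chosen only for illustration, not the values used in the
# original experiments.
def _td_sweep_sketch():
    alphas = [0.5, 0.1, 0.05, 0.01, 0.005]   # assumed candidate step sizes
    msqtd = td_grid(alphas)                   # mean squared TD error per alpha
    best = alphas[int(np.argmin(msqtd))]      # smaller squared TD error ~ better value fit
    return best, msqtd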
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    """Tabular SARSA(lambda) with accumulating traces on the Gridworld.

    lr is the step size, l is the trace-decay parameter lambda, and eps(x)
    gives the epsilon for the epsilon-greedy policy in episode x. Returns the
    per-episode evaluation returns.
    """
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    # initialize tabular q arbitrarily
    q = np.zeros((23, 4))
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        # e <- 0
        e = np.zeros((23, 4))
        # choose a from s using the epsilon-greedy policy derived from q
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        # for each time step, until s is the terminal absorbing state
        while s != [5, 5]:
            # take action a and observe r and s'
            new_s, r = grid.P_and_R(s, a)
            # choose a' from s' using the policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # e <- gamma * lambda * e, then accumulate credit at (s, a)
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # delta <- r + gamma * q(s', a') - q(s, a)
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] - q[grid.get_index(s), actions.index(a)]
            # q <- q + alpha * delta * e
            q += lr * delta * e
            s = new_s
            a = new_a
        # evaluate the policy derived from q and record its return for this episode
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
    return estimated_rewards
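# A minimal sketch of the accumulating eligibility-trace bookkeeping used above,
# on a toy 3-state / 2-action table. The trace decay, discount, and visit
# sequence below are illustrative assumptions:
def _trace_update_sketch():
    e_toy = np.zeros((3, 2))
    lam, gamma = 0.8, 0.9                              # assumed lambda and gamma
    for (s_idx, a_idx) in [(0, 1), (1, 0), (0, 1)]:    # assumed short visit sequence
        e_toy = gamma * lam * e_toy                    # decay all traces: e <- gamma*lambda*e
        e_toy[s_idx, a_idx] += 1.0                     # accumulate credit at the visited (s, a)
    return e_toy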
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    """REINFORCE-style Monte Carlo policy-gradient updates on the Gridworld.

    theta holds tabular action preferences for a softmax policy; after each
    episode the preference of every visited (s, a) pair is increased by
    lr * gamma^t * G_t. Returns the per-episode evaluation returns.
    """
    estimated_rewards = np.zeros(epoch)
    # theta parameterizes the softmax policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # roll out one episode under the current policy, until the terminal
        # absorbing state or the step cap is reached
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1
        # policy update: for every visited (s_t, a_t), increase its preference
        # by lr * gamma^t * G_t, where G_t is the discounted return from step t
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]), actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma
        # evaluate the updated softmax policy and record its return
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
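# The double loop above recomputes each return from scratch (O(T^2) per episode).
# Below is a sketch of an equivalent single backward pass over the reward
# history; it is an optional refactoring suggestion, not part of the original
# implementation:
def _discounted_returns_sketch(hist_r, gamma):
    g = 0.0
    returns = [0.0] * len(hist_r)
    for t in reversed(range(len(hist_r))):
        g = hist_r[t] + gamma * g   # G_t = r_t + gamma * G_{t+1}
        returns[t] = g
    return returns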
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    """One-step actor-critic on the Gridworld.

    The critic is a tabular state-value function updated with TD(0); the actor
    is a tabular softmax policy whose preferences are nudged by the same TD
    error. Returns the per-episode evaluation returns.
    """
    estimated_rewards = np.zeros(epoch)
    # initialize the tabular critic v arbitrarily
    v = np.zeros(23)
    # theta parameterizes the softmax policy (the actor)
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state or the step cap
        while s != [5, 5] and count < 1000:
            # a ~ pi(s, .)
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # take action a and observe r and s'
            new_s, r = grid.P_and_R(s, a)
            # critic: one-step TD error delta = r + gamma * v(s') - v(s)
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[grid.get_index(s)]
            # critic update: v(s) <- v(s) + alpha * delta
            v[grid.get_index(s)] += lr * delta
            # actor update: nudge the preference of (s, a) by the same TD error
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            s = new_s
            count += 1
        # evaluate the current softmax policy and record its return
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
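# A hedged driver sketch showing how these routines might be invoked together.
# Every hyperparameter below (learning rates, epsilon schedule, the shortened
# 20-episode smoke run, the alpha grid) is an assumption for illustration, not
# the configuration used to produce any reported results.
if __name__ == '__main__':
    def eps_schedule(ep):
        # assumed exploration / temperature decay schedule
        return max(0.05, 0.95 ** ep)

    qlearning_grid(lr=0.1, eps=eps_schedule, epoch=20, searchbound=400)
    sarsa_lambda_grid(lr=0.1, l=0.8, eps=eps_schedule, epoch=20, searchbound=400)
    reinforce_grid(lr=0.05, eps=eps_schedule, epoch=20, searchbound=400)
    actor_critic_grid(lr=0.1, eps=eps_schedule, epoch=20, searchbound=400)
    td_grid([0.5, 0.1, 0.05, 0.01])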