import numpy as np

# Grid, GridEpisode, and estimation are project-local modules (gridworld MDP,
# episode runner, and softmax utilities); their import paths are assumed to be
# provided elsewhere in the repository.


def td_grid(lrs):
    """Tabular TD(0) value estimation on the 23-state gridworld, one run per learning rate in lrs."""
    tabular = np.zeros(23 * 4)
    grid = Grid()
    grid.pi_params = tabular.reshape(23, 4)
    grid.softmax()
    print('gridworld td')
    alpha_result = []
    for alpha in lrs:
        estimated_v = np.zeros(23)
        print('alpha = ', alpha)
        # Phase 1: update the value estimates with TD(0) for 100 episodes.
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                # TD(0) update: V(s) += alpha * (r + gamma * V(s') - V(s))
                estimated_v[i] += alpha * (r + grid.gamma * estimated_v[new_i] - estimated_v[i])
                s = new_s
                count += 1
        # Phase 2: run another 100 episodes and record the squared TD errors.
        td_list = []
        for x in range(100):
            s = grid.d_zero()
            count = 0
            while s != [5, 5] and count < 500:
                a = grid.pi(s)
                new_s, r = grid.P_and_R(s, a)
                i = grid.get_index(s)
                new_i = grid.get_index(new_s)
                td_list.append((r + grid.gamma * estimated_v[new_i] - estimated_v[i]) ** 2)
                s = new_s
                count += 1
            # The TD error at the terminal state is taken to be zero.
            td_list.append(0)
        print('square td = ', np.mean(np.array(td_list)))
        alpha_result.append(np.mean(np.array(td_list)))
        print('##########################')
    return alpha_result
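# Usage sketch (assumed driver, not part of the original experiments): sweep a few
# learning rates with td_grid and report the one with the smallest mean squared TD
# error. The alpha values below are illustrative placeholders.
def demo_td_sweep():
    lrs = [0.5, 0.1, 0.05, 0.01]
    squared_td = td_grid(lrs)
    best_alpha = lrs[int(np.argmin(squared_td))]
    print('smallest mean squared TD error at alpha =', best_alpha)
    return best_alpha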
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    """REINFORCE (Monte Carlo policy gradient) with a tabular softmax policy on the gridworld."""
    estimated_rewards = np.zeros(epoch)
    # theta is a tabular representation of the policy (one row per state, one column per action)
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # for each episode:
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # for each time step, until s is the terminal absorbing state:
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1
        # Monte Carlo policy-gradient update: for each visited (s_t, a_t), compute the
        # discounted return G_t and increase the preference of the taken action.
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]), actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma
        # Evaluate the updated policy for this episode.
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
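# Usage sketch (assumed, not from the original code): reinforce_grid expects eps to be
# a callable mapping the episode index to a softmax temperature; the decaying schedule
# and hyperparameter values below are illustrative assumptions.
def demo_reinforce():
    def eps(episode):
        # assumed temperature schedule: decay toward a small floor
        return max(1.0 / (episode + 1), 0.05)
    rewards = reinforce_grid(lr=0.01, eps=eps, epoch=100, searchbound=400)
    print('final-episode estimated reward:', rewards[-1])
    return rewards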
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    """One-step actor-critic with a tabular state-value critic and tabular softmax policy on the gridworld."""
    estimated_rewards = np.zeros(epoch)
    # Initialize the tabular critic v arbitrarily (here, to zeros).
    v = np.zeros(23)
    # theta is a tabular representation of the policy (one row per state, one column per action)
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # for each episode:
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state:
        while s != [5, 5] and count < 1000:
            # a ~ pi(s, .)
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Critic: one-step TD error, delta = r + gamma * v(s') - v(s).
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[grid.get_index(s)]
            # Critic update: v(s) += lr * delta.
            v[grid.get_index(s)] += lr * delta
            # Actor update: increase the preference of the taken action by lr * delta.
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            s = new_s
            count += 1
        # Evaluate the updated policy for this episode.
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
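# Usage sketch (assumed): compare the per-episode estimated rewards of REINFORCE and
# the actor-critic variant above. matplotlib and the hyperparameter values are
# assumptions, not part of the original experiments.
def demo_compare(epoch=100):
    import matplotlib.pyplot as plt

    def eps(episode):
        # assumed temperature schedule: decay toward a small floor
        return max(1.0 / (episode + 1), 0.05)

    reinforce_curve = reinforce_grid(lr=0.01, eps=eps, epoch=epoch)
    actor_critic_curve = actor_critic_grid(lr=0.01, eps=eps, epoch=epoch)
    plt.plot(reinforce_curve, label='REINFORCE')
    plt.plot(actor_critic_curve, label='actor-critic')
    plt.xlabel('episode')
    plt.ylabel('estimated reward')
    plt.legend()
    plt.show()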