def problemA(): """ Have the agent uniformly randomly select actions. Run 10,000 episodes. Report the mean, standard deviation, maximum, and minimum of the observed discounted returns. """ # setting random seed for reproducibility print("Problem A") env = Gridworld() discounted_returns = [] for episode in range(10000): # print (episode) discounted_return = 0.0 while not env.isEnd: state = env.state action = np.random.choice([0, 1, 2, 3]) # print (state, action) actual_action, new_state, reward = env.step(action) # print (actual_action, new_state, reward) discounted_return += reward # print (t) env.reset() # print(time_step) discounted_returns.append(discounted_return) print("Mean ", np.mean(discounted_returns)) print("Std Dev ", np.std(discounted_returns)) print("Max ", np.max(discounted_returns)) print("Min ", np.min(discounted_returns)) return discounted_returns """
def problemC(): print("PROBLEM C...") policy = np.array([ 3, 3, 3, 3, 1, 0, 3, 3, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 3, 3, 4 ]) episodes = 10000 arr = np.zeros(episodes) G = Gridworld() G.gamma = 0.9 for e in range(episodes): G.timestep = 0 # print("episode %d" % (e+1)) while (not G.isEnd): # print(G.currentState) G.step(G.stoch_action(policy[G.state])) arr[e] = G.reward G.reset() # arr[e] = disc_returns opt_disc_returns = np.amax(arr) opt_episode = np.argmax(arr) + 1 mean = np.mean(arr) variance = np.var(arr) std_dev = np.std(arr) min = np.amin(arr) print("Highest observed discounted returns is %f achieved in" " episode number %d" % (opt_disc_returns, opt_episode)) print("The mean of discounted returns is %f, variance is %f" " and standard deviation is %f" % (mean, variance, std_dev)) print("Max is %f and min is %f" % (opt_disc_returns, min)) # print(np.argmin(arr) + 1) return arr
class Evaluate:
    def __init__(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []

    @property
    def batchReturn(self) -> list:
        return self._G

    def __call__(self, theta: np.array, numEpisodes: int):
        self.policy.parameters = theta
        maxSteps = 200  # cap episode length; timed-out episodes score -50
        for episode in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            t = 0
            while not self.environment.isEnd:
                if t >= maxSteps:
                    G_episode = -50
                    break
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                G_episode += (self.environment.gamma ** t) * reward
                t += 1
            self._G.append(G_episode)
        return np.mean(self._G)

    def reset(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []
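# A minimal usage sketch for Evaluate, assuming TabularSoftmax stores a flat
# parameter vector of length 25 * 4 (one entry per state-action pair). The
# random theta here is a placeholder, not a trained policy.
def demoEvaluate(numEpisodes: int = 50) -> float:
    evaluate = Evaluate()
    theta = np.random.randn(25 * 4)  # candidate parameters for the softmax policy
    return evaluate(theta, numEpisodes)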
def problemA():
    print("PROBLEM A...")
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):
        G.timeStep = 0
        while not G.isEnd:
            G.step(G.action)  # G.action samples a uniformly random action
        arr[e] = G.reward  # this Gridworld accumulates the discounted return in G.reward
        G.reset()
    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    min_return = np.amin(arr)
    print("Highest observed discounted return is %f, achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of the discounted returns is %f, the variance is %f,"
          " and the standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_return))
    return arr
def problemE():
    env = Gridworld(startState=18)
    env.gamma = 0.9
    episode = 0
    hit = 0
    total_try = 100000
    while episode < total_try:
        episode += 1
        env.reset()       # back to startState = 18
        env.timeStep = 8  # condition on being in state 18 at time step 8
        while env.timeStep < 19:
            act = env.action
            state, reward, isEnd = env.step(act)
            if isEnd:
                break
        if env.currentState == 21:
            hit += 1
    print('P is {}'.format(hit / total_try))
def problemE(num_iters):
    agent = Agent()
    gridworld = Gridworld()
    count = 0
    for i in range(num_iters):
        gridworld.reset()
        gridworld.state = 19  # the state above the goal (S_8 = 19 in this numbering)
        for t in range(8, 19):  # the draw at t = 18 is action A_18
            action = agent.act()
            gridworld.step(action)
            if gridworld.isEnd:
                break  # stop if the episode terminates before t = 19
        if gridworld.state == 22:
            count += 1
    print('P(S_19 = 22 | S_8 = 19) = ', count / num_iters)
def runEnvironment(getAction, numeps=10000):
    returns = np.zeros(numeps)
    grid = Gridworld()
    for ep in range(numeps):
        grid.reset()
        step = 0
        g = 0
        while not grid.isEnd:
            s, r, e = grid.step(getAction(grid.state))
            g += (grid.gamma ** step) * r
            step += 1
        returns[ep] = g
    print("Average: {}\nStandard Deviation: {}\nMin: {}\nMax: {}".format(
        np.mean(returns), np.std(returns), np.min(returns), np.max(returns)))
    return returns
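# Usage sketch (assumes actions are the integers 0-3, as in the other
# snippets in this file): passing a uniform-random action selector
# reproduces Problem A, and indexing into a fixed policy array would
# reproduce Problem B.
def runRandomPolicy(numeps: int = 10000) -> np.ndarray:
    return runEnvironment(lambda s: np.random.choice(4), numeps)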
def run_gridworld_episode(p):
    environment = Gridworld()
    policy = TabularSoftmax(25, 4)
    policy.parameters = p
    is_end = False
    discounted_return = 0
    t = 0
    while not is_end:
        action = policy.samplAction(environment.state)
        new_state, reward, is_end = environment.step(action)
        discounted_return += (environment.gamma ** t) * reward
        t += 1
        if t > 200:
            discounted_return = -50  # timed-out episodes score a flat -50
            break
    environment.reset()
    return discounted_return
def problemE(): """ Using simulations, empirically estimate the probability that S_19=21 given that S_8=18 (the state above the goal) when running the uniform random policy. Describe how you estimated this quantity (there is not a typo in this problem, nor an oversight) NOTE: State 18 is state 19 in this gridworld implementation and state 21 is 22. """ print("\nProblem E") env = Gridworld() success = 0 N = 100000 for trial in range(N): env.reset() env._state = 19 step = 0 while not env.isEnd: env.step(np.random.choice(range(4))) step += 1 if step == 11: break if env._state == 22: success += 1 p = success / N eps = np.sqrt((1 / (2 * N)) * np.log(2 / 0.05)) # Hoeffding's inequality print( "Pr(S_19=s_22 | S_8=s_18)={0:.5f} empirically and is in ({1:.5f},{2:.5f}) with 95% confidence using Hoeffding's inequality".format( p, p - eps, p + eps))
def runEnvironment_gridworld(policy, numeps=10000):
    returns = np.zeros(numeps)
    grid = Gridworld()
    for ep in range(numeps):
        grid.reset()
        step = 0
        g = 0
        while not grid.isEnd:
            action = policy.samplAction(grid.state)
            s, r, e = grid.step(action)
            g += (grid.gamma ** step) * r
            step += 1
            if step > 200:
                g = -50  # timed-out episodes score a flat -50
                break
        returns[ep] = g
    return returns
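# Usage sketch (hedged: the random `theta` below is a placeholder, not a
# trained parameter vector). The helper accepts any policy object exposing
# a samplAction(state) method, such as the TabularSoftmax used throughout.
def runSoftmaxPolicy(theta, numeps: int = 10000) -> np.ndarray:
    policy = TabularSoftmax(25, 4)
    policy.parameters = theta
    return runEnvironment_gridworld(policy, numeps)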
def problemA(): """ Have the agent uniformly randomly select actions. Run 10,000 episodes. Report the mean, standard deviation, maximum, and minimum of the observed discounted returns. """ grid_world = Gridworld() rewards = [] for episod in range(10000): is_end = False grid_world.reset() r = 0 while ~is_end: action = np.random.randint(4) r_, is_end = grid_world.step(action) r += r_ rewards.append(r) print(episod, r, is_end) rewards = np.array(rewards) print(rewards.mean(), rewards.std(), rewards.max(), rewards.min())
def problemE(): """ Have the agent uniformly randomly select actions. Run 10,000 episodes. Report the mean, standard deviation, maximum, and minimum of the observed discounted returns. """ # setting random seed for reproducibility print("Problem E") start_time = time.time() env = Gridworld(startState=19) num_episodes = 1000000 count_s19_22_given_s8_19 = 0 for episode in range(num_episodes): # print (episode) time_step = 0 while (not env.isEnd) and time_step < 12: state = env.state if time_step == 11 and state == 22: count_s19_22_given_s8_19 += 1 action = np.random.choice([0, 1, 2, 3]) env.step(action) time_step += 1 # print (t) env.reset() print(count_s19_22_given_s8_19) Pr_s19_22_given_s8_19 = (count_s19_22_given_s8_19 * 1.0) / num_episodes end_time = time.time() print("Estimate of Pr(S_8=19 | S_19 = 22) = ", Pr_s19_22_given_s8_19) print("Execution time = ", end_time - start_time) """
def problemA(num_iters): """ Have the agent uniformly randomly select actions. Run 10,000 episodes. Report the mean, standard deviation, maximum, and minimum of the observed discounted returns. """ agent = Agent() discounted_returns = [] gridworld = Gridworld() for i in range(num_iters): reward = 0 time = 0 while True: action = agent.act() gridworld.step(action) reward += gridworld.reward * (gridworld.gamma**time) if gridworld.isEnd: break time += 1 discounted_returns.append(reward) gridworld.reset() print('Mean = ', st.mean(discounted_returns)) print('Standard deviation = ', st.stdev(discounted_returns)) print('Max = ', max(discounted_returns)) print('Min = ', min(discounted_returns)) return discounted_returns
def problemC(num_iters): """ Find an optimal policy (you may do this any way you choose, including by reasoning through the problem yourself). Report the optimal policy here. Comment on whether it is unique """ agent = Agent() discounted_returns = [] gridworld = Gridworld() print("acting optimally") for i in range(num_iters): reward = 0 time = 0 while True: action = agent.actOptimally(gridworld.state) gridworld.step(action) reward += gridworld.reward * (gridworld.gamma**time) if gridworld.isEnd: break time += 1 discounted_returns.append(reward) gridworld.reset() print('Mean = ', st.mean(discounted_returns)) print('Standard deviation = ', st.stdev(discounted_returns)) print('Max = ', max(discounted_returns)) print('Min = ', min(discounted_returns)) return discounted_returns
def problemB(): """ Run the optimal policy that you found for 10,000 episodes. Repor the mean, standard deviation, maximum, and minimum of the observed discounted returns """ print("Problem B") optimal_policy_actions = [ 1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, -1, 2, 2, 0, 3, -1, 1, 2, 0, 3, 1, 1, -1 ] env = Gridworld() discounted_returns = [] for t in range(10000): # print (t) discounted_return = 0.0 while not env.isEnd: state = env.state action = optimal_policy_actions[state] # print (state, action) actual_action, new_state, reward = env.step(action) # print (actual_action, new_state, reward) discounted_return += reward discounted_returns.append(discounted_return) env.reset() print("Mean ", np.mean(discounted_returns)) print("Std Dev ", np.std(discounted_returns)) print("Max ", np.max(discounted_returns)) print("Min ", np.min(discounted_returns)) return discounted_returns # plt.hist(sorted(discounted_returns), density = True, cumulative=True, label='CDF', # histtype='step', alpha=0.8, color='k') # plt.show() """
def problemE(): print("PROBLEM E...") episodes = 10000 count = 0 G = Gridworld(startState=19) G.gamma = 0.9 for e in range(episodes): G.timeStep = 0 while ((G.timeStep < 11) and (not G.isEnd)): G.step(G.action) if G.state == 22: count = count + 1 G.reset() print("The empirical probability of S19 = 21 given S8 = 18 is %f" % (count / episodes))
def __call__(self, parameters: np.array, numEpisodes: int):
    G = []
    policy = TabularSoftmax(25, 4)
    policy.parameters = parameters
    env = Gridworld()
    for ep in range(numEpisodes):
        env.reset()
        Gi = 0
        timeStep = 0
        while not env.isEnd:
            state = env.state
            action = policy.samplAction(state)
            _, next_state, reward = env.step(action)
            Gi += reward  # undiscounted return, with a -50 timeout penalty below
            timeStep += 1
            if timeStep == 200:
                Gi += -50
                break
        G.append(Gi)
        self.curTrialReturns.append(Gi)
    print("Mean Return ", np.mean(G))
    return np.mean(G)
def problemA(): """ Have the agent uniformly randomly select actions. Run 10,000 episodes. Report the mean, standard deviation, maximum, and minimum of the observed discounted returns. """ time_list = [] reward_list = [] env = Gridworld() env.gamma = 0.9 episode = 0 while episode <= 10000: episode += 1 print('Episode {}'.format(episode)) step = 0 totalReward = 0 reached = False while True: step += 1 act = env.action state, reward, isEnd = Gridworld.step(env, act) # reward_list.append(reward) totalReward += reward if isEnd: reached = True print('Steps take: {}\tTotal reward: {:.4f}'.format( step, totalReward)) break if not reached: episode -= 1 continue Gridworld.reset(env) reward_list.append(totalReward) print('finished') reward_array = np.array(reward_list) mean = reward_array.mean() std = reward_array.std() max = reward_array.max() min = reward_array.min() print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format( mean, std, max, min)) print('Num of reward: {}'.format(len(reward_list))) with open('./resultA.json', 'w') as file: json.dump(reward_list, file)
def problem1(): """ Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular softmax policy. Search the space of hyperparameters for hyperparameters that work well. Report how you searched the hyperparameters, what hyperparameters you found worked best, and present a learning curve plot using these hyperparameters, as described in class. This plot may be over any number of episodes, but should show convergence to a nearly optimal policy. The plot should average over at least 500 trials and should include standard error or standard deviation error bars. Say which error bar variant you used. """ #TODO popSize = 10 #10 numElite = 5 #5 epsilon = 4.0 #4.0 sigma = 1.0 #1.0 numEpisodes = 20 #50 numTrials = 5 #5 numIterations = 50 #200 returns = np.zeros((numTrials, numEpisodes * numIterations)) for trial in range(numTrials): np.random.seed(np.random.randint(10000)) gridworld = Gridworld() tabular_softmax = TabularSoftmax(25, 4) theta = np.random.randn(tabular_softmax.parameters.shape[0]) count = 0 def evaluateFunction(theta, numEpisodes): nonlocal count
def problem3(): """ Repeat the previous question, but using the GA (as described earlier in this assignment) on the More-Watery 687-Gridworld domain. Report the same quantities. """ #TODO populationSize = 40 # 40 numElite = 20 # 20 numEpisodes = 20 # 20 numTrials = 50 #50 numIterations = 100 # 100 Kp = 30 # 30 alpha = 3.0 # 3.0 returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize)) for trial in range(numTrials): np.random.seed(np.random.randint(10000)) gridworld = Gridworld() tabular_softmax = TabularSoftmax(25, 4) theta = np.random.randn(tabular_softmax.parameters.shape[0]) count = 0 def evaluateFunction(theta, numEpisodes): nonlocal count expected_reward = 0 numTimeSteps = 10000 tabular_softmax.parameters = theta for episode in range(numEpisodes): state = gridworld.state G = 0 discount = 1 for t in range(numTimeSteps): action = tabular_softmax.samplAction(state); nextstate, reward, end = gridworld.step(action) G += (discount) * reward discount *= gridworld.gamma if end == True: break elif t == 200: G = -50 break state = nextstate expected_reward += G returns[trial][count] = G gridworld.reset() count += 1 return expected_reward / numEpisodes def initPopulation(populationSize : int) -> np.ndarray: return np.random.randn(populationSize, tabular_softmax.parameters.shape[0]) agent = GA(populationSize, evaluateFunction, initPopulation, numElite, numEpisodes, Kp, alpha) for iteration in range(numIterations): print("Trial: %d" % (trial, )) print("Iteration: %d" % (iteration, )) p = agent.train() print(returns[trial][iteration * numEpisodes * populationSize : count]) print(np.mean(returns[trial][iteration * numEpisodes * populationSize : count])) l = [[0 for i in range(5)] for j in range(5)] for i in range(25): k = tabular_softmax.getActionProbabilities(i) # print(k) r = np.argmax(k) if(r == 0): l[i//5][i % 5] = '↑' elif(r == 1): l[i//5][i % 5] = '↓' elif(r == 2): l[i//5][i % 5] = '←' elif(r == 3): l[i//5][i % 5] = '→' for i in range(5): print(l[i]) print(p) plot(returns, 'More-Watery 687-Gridworld domain Genetic Algorithm (standard deviation error bars) - 50 trials', 3)
def problem2(): """ Repeat the previous question, but using first-choice hill-climbing on the More-Watery 687-Gridworld domain. Report the same quantities. """ #TODO sigma = 1.0 #1.0 numEpisodes = 200 #200 numTrials = 50 #50 numIterations = 200 #200 returns = np.zeros((numTrials, numEpisodes * numIterations)) for trial in range(numTrials): np.random.seed(np.random.randint(10000)) gridworld = Gridworld() tabular_softmax = TabularSoftmax(25, 4) theta = np.random.randn(tabular_softmax.parameters.shape[0]) count = -1 def evaluateFunction(theta, numEpisodes): nonlocal count expected_reward = 0 numTimeSteps = 10000 tabular_softmax.parameters = theta for episode in range(numEpisodes): state = gridworld.state G = 0 discount = 1 for t in range(numTimeSteps): action = tabular_softmax.samplAction(state); nextstate, reward, end = gridworld.step(action) G += (discount) * reward discount *= gridworld.gamma if end == True: break elif t == 100: break state = nextstate expected_reward += G if(count != -1): returns[trial][count] = G count += 1 gridworld.reset() return expected_reward / numEpisodes agent = FCHC(theta, sigma, evaluateFunction, numEpisodes) count = 0 for iteration in range(numIterations): print("Trial: %d" % (trial, )) print("Iteration: %d" % (iteration, )) p = agent.train() print(returns[trial][iteration * numEpisodes : count]) print(np.mean(returns[trial][iteration * numEpisodes : count])) l = [[0 for i in range(5)] for j in range(5)] for i in range(25): k = tabular_softmax.getActionProbabilities(i) print(k) r = np.argmax(k) if(r == 0): l[i//5][i % 5] = '↑' elif(r == 1): l[i//5][i % 5] = '↓' elif(r == 2): l[i//5][i % 5] = '←' elif(r == 3): l[i//5][i % 5] = '→' for i in range(5): print(l[i]) print(p) plot(returns, 'More-Watery 687-Gridworld domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 3)
def problemB(): """ Run the optimal policy that you found for 10,000 episodes. Repor the mean, standard deviation, maximum, and minimum of the observed discounted returns """ # if on the upper edge, move right; if on right edge, move down; # else, move right or down env = Gridworld() env.gamma = 0.9 episode = 0 reward_list = [] # obstacles = [12, 17] # waterStates = [6, 18, 22] # upperBounds = [0, 1, 2, 3, 4] # rightBounds = [4, 9, 14, 19, 24] while episode < 10000: episode += 1 print('Episode {}'.format(episode)) step = 0 totalReward = 0 reached = False while step < 10000: step += 1 if env.currentState in env.rightBounds: act = 2 # Move down elif env.currentState in env.upperBounds: act = 3 # Move right else: if random.random() < 0.5: act = 3 else: act = 2 # secure = False # while not secure: # if act == 2: # nextState = env.currentState + 5 # if nextState in env.waterStates or nextState in env.obstacles: # act = 3 # else: # secure = True # else: # nextState = env.currentState + 1 # if nextState in env.waterStates or nextState in env.obstacles: # act = 2 # else: # secure = True state, reward, isEnd = Gridworld.step(env, act) totalReward += reward if isEnd: reached = True print('Steps take: {}\tTotal reward: {:.4f}'.format( step, totalReward)) break if not reached: episode -= 1 continue Gridworld.reset(env) reward_list.append(totalReward) print('finished') reward_array = np.array(reward_list) mean = reward_array.mean() std = reward_array.std() max = reward_array.max() min = reward_array.min() print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format( mean, std, max, min)) print('Num of reward: {}'.format(len(reward_list))) with open('./resultB.json', 'w') as file: json.dump(reward_list, file)