def problem2(para: dict, trails: int = 50):
    """
    Repeat the previous question, but using first-choice hill-climbing on the
    More-Watery 687-Gridworld domain. Report the same quantities.
    """
    sigma = para['sigma']
    numEpisodes = para['numEpisodes']
    mean_return_log = []
    print('sigma:{}\tnumEpisodes:{}\t'.format(sigma, numEpisodes))

    def evaluate(theta, numEpisodes):
        eva_policy = TabularSoftmax(25, 4)
        eva_policy.parameters = theta
        returns = runEnvironment_gridworld(eva_policy, numEpisodes)
        mean_return = np.mean(returns)
        mean_return_log.append(mean_return)
        # print(mean_return)
        return mean_return

    policy = TabularSoftmax(25, 4)
    agent = FCHC(theta=policy.parameters, sigma=sigma, numEpisodes=numEpisodes,
                 evaluationFunction=evaluate)
    for i in range(trails):
        policy.parameters = agent.train()
        print('Iteration {} finished'.format(i))
    return mean_return_log
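# Example invocation of the FCHC variant above. The hyperparameter values are
# illustrative assumptions (they mirror the fixed values used in the later
# problem2() variant in this file), not tuned values reported for the assignment.
fchc_para = {'sigma': 1.0, 'numEpisodes': 200}
fchc_log = problem2(fchc_para, trails=50)  # one mean return logged per evaluate() call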
def problem2(config, iterations: int = 1000):
    """
    Repeat the previous question, but using first-choice hill-climbing on the
    More-Watery 687-Gridworld domain. Report the same quantities.
    """
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)
        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = FCHC(agent_policy.parameters, sigma=config[0],
                 evaluationFunction=evaluate, numEpisodes=config[1])
    bar = range(iterations)
    for i in bar:
        agent_policy.parameters = agent.train()
        # bar.set_description("Average return: {}".format(evaluate(agent_policy.parameters, 5)))
    return np.array(all_returns)
def problem3(config, iterations: int = 200):
    """
    Repeat the previous question, but using the GA (as described earlier in
    this assignment) on the More-Watery 687-Gridworld domain. Report the same
    quantities.
    """
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)
        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = GA(populationSize=config[0], evaluationFunction=evaluate,
               initPopulationFunction=init_gridworld_population,
               numElite=config[1], numEpisodes=config[2], alpha=config[3],
               parent_frac=config[4])
    bar = range(iterations)
    for i in bar:
        agent_policy.parameters = agent.train()
        # bar.set_description("Average return: {}".format(evaluate(agent_policy.parameters, 5)))
    return np.array(all_returns)
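# init_gridworld_population is referenced above but not defined in this file.
# A minimal sketch, assuming (as in the initPopulation helper of the other GA
# variant below) that each individual is a flat parameter vector for the
# 25-state, 4-action tabular softmax policy:
def init_gridworld_population(populationSize: int) -> np.ndarray:
    # One row per individual, one column per policy parameter (25 * 4 = 100).
    return np.random.randn(populationSize, 25 * 4)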
def run_gridworld_episode(p):
    environment = Gridworld()
    policy = TabularSoftmax(25, 4)
    policy.parameters = p
    is_end = False
    discounted_return = 0
    t = 0
    while not is_end:
        action = policy.samplAction(environment.state)
        new_state, reward, is_end = environment.step(action)
        discounted_return += (environment.gamma**t) * reward
        t += 1
        if t > 200:
            discounted_return = -50
            break
    environment.reset()
    return discounted_return
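# Quick usage sketch for run_gridworld_episode: one episode under a randomly
# initialised policy. The 100-dimensional parameter vector matches the
# 25-state, 4-action TabularSoftmax used throughout this file.
random_theta = np.random.randn(25 * 4)
print(run_gridworld_episode(random_theta))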
def problem1(para: dict, trails: int = 50):
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.
    """
    sigma = para['sigma']
    popSize = para['popSize']
    numElite = para['numElite']
    numEpisodes = para['numEpisodes']
    epsilon = para['epsilon']
    mean_return_log = []
    print('sigma:{}\tpopSize:{}\tnumElite:{}\tnumEpisodes:{}\tepsilon:{}'.format(
        sigma, popSize, numElite, numEpisodes, epsilon))

    def evaluate(theta, numEpisodes):
        eva_policy = TabularSoftmax(25, 4)
        eva_policy.parameters = theta
        returns = runEnvironment_gridworld(eva_policy, numEpisodes)
        mean_return = np.mean(returns)
        mean_return_log.append(mean_return)
        # print(mean_return)
        return mean_return

    policy = TabularSoftmax(25, 4)
    agent = CEM(theta=policy.parameters, sigma=sigma, popSize=popSize,
                numElite=numElite, numEpisodes=numEpisodes,
                evaluationFunction=evaluate, epsilon=epsilon)
    for i in range(trails):
        policy.parameters = agent.train()
        print('Iteration {} finished'.format(i))
    return mean_return_log
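# The docstring above asks for a learning curve averaged over at least 500
# trials with standard error or standard deviation error bars. A minimal
# sketch of how the mean_return_log values returned by problem1 could be
# aggregated and plotted with standard-error bars; the hyperparameter values
# and trial count are illustrative assumptions, and the log length is assumed
# constant across trials (one entry per evaluate() call).
import matplotlib.pyplot as plt

def plot_cem_learning_curve(numTrials: int = 500, trails: int = 50):
    para = {'sigma': 1.0, 'popSize': 10, 'numElite': 5,
            'numEpisodes': 20, 'epsilon': 4.0}
    logs = np.array([problem1(para, trails) for _ in range(numTrials)])
    mean = logs.mean(axis=0)
    std_err = logs.std(axis=0) / np.sqrt(numTrials)  # standard error of the mean
    x = np.arange(len(mean))
    plt.errorbar(x, mean, yerr=std_err)
    plt.xlabel('Evaluation (each point averages numEpisodes episodes)')
    plt.ylabel('Mean discounted return')
    plt.title('CEM on More-Watery 687-Gridworld (standard error bars)')
    plt.show()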
class Evaluate:
    def __init__(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []

    @property
    def batchReturn(self) -> list:
        return self._G

    def __call__(self, theta: np.array, numEpisodes: int):
        # print("Evaluating Gridworld")
        # self._G = []  # reset G at every call
        # environment = Gridworld()
        # policy = TabularSoftmax(25,4)
        self.policy.parameters = theta
        # print("numEpisodes", numEpisodes)
        Count = 200
        for episode in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            counter = 0
            ctr = 0
            while not self.environment.isEnd:
                if counter >= Count:
                    G_episode = -50
                    break
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                G_episode += (self.environment.gamma**ctr) * reward
                # G_episode += reward
                counter += 1
                ctr += 1
            # self.returns.append(Gi)
            self._G.append(G_episode)
            # if (episode % 50 == 0):
            #     print(G_episode)
        # print("Mean Return ", np.mean(G))
        return np.mean(self._G)

    def reset(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []
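# Usage sketch for the Evaluate callable above, assuming the CEM agent used
# elsewhere in this file accepts any callable of the form f(theta, numEpisodes)
# as its evaluationFunction. The hyperparameter values are illustrative
# assumptions borrowed from the fixed-value problem1() variant below.
evaluator = Evaluate()
policy = TabularSoftmax(25, 4)
cem_agent = CEM(theta=policy.parameters, sigma=1.0, popSize=10, numElite=5,
                numEpisodes=20, evaluationFunction=evaluator, epsilon=4.0)
for _ in range(50):
    policy.parameters = cem_agent.train()
evaluator.reset()  # clear stored returns between trials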
def __call__(self, parameters: np.array, numEpisodes: int):
    # print("Evaluating Gridworld")
    G = []
    policy = TabularSoftmax(25, 4)
    policy.parameters = parameters
    env = Gridworld()
    for ep in range(numEpisodes):
        # print("Episode ", ep)
        env.reset()
        Gi = 0
        timeStep = 0
        while not env.isEnd:
            state = env.state
            action = policy.samplAction(state)
            _, reward, _ = env.step(action)  # step returns (nextState, reward, isEnd)
            Gi += reward
            timeStep += 1
            if timeStep == 200:
                Gi += -50
                break
        G.append(Gi)
        self.curTrialReturns.append(Gi)
    print("Mean Return ", np.mean(G))
    return np.mean(G)
def problem1(config, iterations: int = 200):
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.
    """
    all_returns = []

    def evaluate(p, episodes):
        returns = []
        for i in range(episodes):
            r = run_gridworld_episode(p)
            returns.append(r)
            all_returns.append(r)
        return np.mean(returns)

    agent_policy = TabularSoftmax(25, 4)
    agent = CEM(agent_policy.parameters, sigma=config[0], popSize=config[1],
                numElite=config[2], numEpisodes=config[3],
                evaluationFunction=evaluate, epsilon=config[4])
    bar = range(iterations)
    for i in bar:
        agent_policy.parameters = agent.train()
        # bar.set_description("Average return: {}".format(evaluate(agent_policy.parameters, 5)))
    return np.array(all_returns)
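# Example config for the CEM variant above: the positional indexing in the
# constructor call is [sigma, popSize, numElite, numEpisodes, epsilon]. The
# values are illustrative assumptions taken from the fixed-value variant below.
cem_config = [1.0, 10, 5, 20, 4.0]
cem_returns = problem1(cem_config, iterations=200)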
def problem1():
    """
    Apply the CEM algorithm to the More-Watery 687-Gridworld. Use a tabular
    softmax policy. Search the space of hyperparameters for hyperparameters
    that work well. Report how you searched the hyperparameters, what
    hyperparameters you found worked best, and present a learning curve plot
    using these hyperparameters, as described in class. This plot may be over
    any number of episodes, but should show convergence to a nearly optimal
    policy. The plot should average over at least 500 trials and should
    include standard error or standard deviation error bars. Say which error
    bar variant you used.
    """
    # TODO
    popSize = 10        # 10
    numElite = 5        # 5
    epsilon = 4.0       # 4.0
    sigma = 1.0         # 1.0
    numEpisodes = 20    # 50
    numTrials = 5       # 5
    numIterations = 50  # 200
    returns = np.zeros((numTrials, numEpisodes * numIterations))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = 0

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
def problem3():
    """
    Repeat the previous question, but using the GA (as described earlier in
    this assignment) on the More-Watery 687-Gridworld domain. Report the same
    quantities.
    """
    # TODO
    populationSize = 40  # 40
    numElite = 20        # 20
    numEpisodes = 20     # 20
    numTrials = 50       # 50
    numIterations = 100  # 100
    Kp = 30              # 30
    alpha = 3.0          # 3.0
    returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = 0

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            expected_reward = 0
            numTimeSteps = 10000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += (discount) * reward
                    discount *= gridworld.gamma
                    if end == True:
                        break
                    elif t == 200:
                        G = -50
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                gridworld.reset()
                count += 1
            return expected_reward / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            return np.random.randn(populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(populationSize, evaluateFunction, initPopulation, numElite,
                   numEpisodes, Kp, alpha)
        for iteration in range(numIterations):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * populationSize : count])
            print(np.mean(returns[trial][iteration * numEpisodes * populationSize : count]))
        # Print the greedy action in each grid cell as an arrow.
        l = [[0 for i in range(5)] for j in range(5)]
        for i in range(25):
            k = tabular_softmax.getActionProbabilities(i)
            # print(k)
            r = np.argmax(k)
            if (r == 0):
                l[i//5][i % 5] = '↑'
            elif (r == 1):
                l[i//5][i % 5] = '↓'
            elif (r == 2):
                l[i//5][i % 5] = '←'
            elif (r == 3):
                l[i//5][i % 5] = '→'
        for i in range(5):
            print(l[i])
        print(p)
    plot(returns, 'More-Watery 687-Gridworld domain Genetic Algorithm (standard deviation error bars) - 50 trials', 3)
def problem2():
    """
    Repeat the previous question, but using first-choice hill-climbing on the
    More-Watery 687-Gridworld domain. Report the same quantities.
    """
    # TODO
    sigma = 1.0          # 1.0
    numEpisodes = 200    # 200
    numTrials = 50       # 50
    numIterations = 200  # 200
    returns = np.zeros((numTrials, numEpisodes * numIterations))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = -1

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            expected_reward = 0
            numTimeSteps = 10000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = gridworld.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = gridworld.step(action)
                    G += (discount) * reward
                    discount *= gridworld.gamma
                    if end == True:
                        break
                    elif t == 100:
                        break
                    state = nextstate
                expected_reward += G
                if (count != -1):
                    returns[trial][count] = G
                    count += 1
                gridworld.reset()
            return expected_reward / numEpisodes

        agent = FCHC(theta, sigma, evaluateFunction, numEpisodes)
        count = 0
        for iteration in range(numIterations):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes : count])
            print(np.mean(returns[trial][iteration * numEpisodes : count]))
        # Print the greedy action in each grid cell as an arrow.
        l = [[0 for i in range(5)] for j in range(5)]
        for i in range(25):
            k = tabular_softmax.getActionProbabilities(i)
            print(k)
            r = np.argmax(k)
            if (r == 0):
                l[i//5][i % 5] = '↑'
            elif (r == 1):
                l[i//5][i % 5] = '↓'
            elif (r == 2):
                l[i//5][i % 5] = '←'
            elif (r == 3):
                l[i//5][i % 5] = '→'
        for i in range(5):
            print(l[i])
        print(p)
    plot(returns, 'More-Watery 687-Gridworld domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 3)
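# The plot helper called in problem2() and problem3() above is not defined in
# this file. A minimal sketch, assuming it takes the
# (numTrials x numEpisodes*numIterations) returns array, a title, and a figure
# index (the meaning of the third argument is an assumption), and draws the
# mean return per episode with standard-deviation error bars, as the titles
# above indicate:
import matplotlib.pyplot as plt

def plot(returns, title, fig_num):
    plt.figure(fig_num)
    mean = returns.mean(axis=0)  # average over trials
    std = returns.std(axis=0)    # standard-deviation error bars
    x = np.arange(returns.shape[1])
    plt.plot(x, mean)
    plt.fill_between(x, mean - std, mean + std, alpha=0.3)
    plt.xlabel('Episode')
    plt.ylabel('Return')
    plt.title(title)
    plt.show()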