def problem3(config, iterations: int = 200):
    """
    Train the GA on the More-Watery 687-Gridworld and log every episode return.

    Args:
        config: indexable hyper-parameter container, read positionally as
            (populationSize, numElite, numEpisodes, alpha, parent_frac).
        iterations: number of times ``agent.train()`` is called.

    Returns:
        np.ndarray with the return of every episode executed by the
        evaluation callback, in execution order.
    """
    episode_log = []

    def evaluate(p, episodes):
        # Roll out `episodes` episodes for candidate `p` and remember each
        # individual return before handing the mean back to the GA.
        batch = [run_gridworld_episode(p) for _ in range(episodes)]
        episode_log.extend(batch)
        return np.mean(batch)

    agent_policy = TabularSoftmax(25, 4)
    agent = GA(
        populationSize=config[0],
        evaluationFunction=evaluate,
        initPopulationFunction=init_gridworld_population,
        numElite=config[1],
        numEpisodes=config[2],
        alpha=config[3],
        parent_frac=config[4],
    )

    for _ in range(iterations):
        # Whatever train() returns is stored as the policy's parameters.
        agent_policy.parameters = agent.train()

    return np.array(episode_log)
def problem6():
    """
    Run the GA on the cart-pole domain and plot its learning curve.

    A single GA agent is trained for a fixed number of generations per trial;
    after each trial the evaluator is told the trial ended and the agent is
    reset. The parameter-vector length handed to ``GAInit`` is
    ``numActions * (k + 1) ** m``.
    """
    print("Problem 6")

    # Environment parameters.
    state_terms = 4
    action_count = 2
    order = 3

    # Policy-search budget.
    trial_count = 50
    generation_count = 20
    population_size = 20
    ga_settings = dict(numElite=10, numTruncate=5, alpha=0.1, numEpisodes=10)

    evaluator = CartPoleEvaluation(k=order)
    init_fn = GAInit(action_count * np.power(order + 1, state_terms))
    agent = GA(population_size, evaluator, init_fn, **ga_settings)

    for trial in range(trial_count):
        print("Trial ", trial)
        for gen in range(generation_count):
            print("Generation ", gen)
            agent.train()
        evaluator.endTrial()
        agent.reset()
        # NOTE(review): the plot call is kept per trial because its file name
        # is formatted with the trial index - confirm this nesting is intended.
        evaluator.plot('learningCurve_cartpole_GA_{}.png'.format(trial),
                       "Learning Curve - Cartpole with GA Agent")
def problem3():
    """
    Run the GA on the More-Watery 687-Gridworld and plot its learning curve.

    A single GA agent is trained for a fixed number of generations per trial;
    after each trial the evaluator is told the trial ended and the agent is
    reset. The parameter-vector length handed to ``GAInit`` is
    ``num_states * num_actions``.
    """
    print("Problem 3")

    # Environment parameters.
    state_count = 25
    action_count = 4

    # Policy-search budget.
    trial_count = 50
    generation_count = 100
    population_size = 30
    ga_settings = dict(numElite=20, numTruncate=5, alpha=1.25, numEpisodes=20)

    evaluator = GridworldEvaluation()
    init_fn = GAInit(state_count * action_count)
    agent = GA(population_size, evaluator, init_fn, **ga_settings)

    for trial in range(trial_count):
        print("Trial ", trial)
        for gen in range(generation_count):
            print("Generation ", gen)
            agent.train()
        evaluator.endTrial()
        agent.reset()
        # NOTE(review): the plot call is kept per trial because its file name
        # is formatted with the trial index - confirm this nesting is intended.
        evaluator.plot('learningCurve_gridworld_GA_{}.png'.format(trial),
                       "Learning Curve - Gridworld with GA Agent")
def problem3(para: dict, trails: int = 50):
    """
    Train the GA on the More-Watery 687-Gridworld and log mean returns.

    Args:
        para: hyper-parameters; the keys 'popSize', 'numElite',
            'numEpisodes' and 'alpha' are read.
        trails: number of times ``agent.train()`` is called.

    Returns:
        list with one mean return per call of the evaluation callback,
        in call order.
    """
    popSize, numElite, numEpisodes, alpha = (
        para[key] for key in ('popSize', 'numElite', 'numEpisodes', 'alpha')
    )
    print('popSize:{}\tnumElite:{}\tnumEpisodes:{}\talpha:{}'.format(
        popSize, numElite, numEpisodes, alpha))

    mean_return_log = []

    def evaluate(theta, numEpisodes):
        # A fresh policy is built for every candidate parameter vector.
        candidate = TabularSoftmax(25, 4)
        candidate.parameters = theta
        averaged = np.mean(
            runEnvironment_gridworld(candidate, numEpisodes), axis=0)
        mean_return_log.append(averaged)
        return averaged

    def initPopulation(popSize: int):
        # Standard-normal initial population, one row per individual.
        return np.random.normal(0, 1, (popSize, 25 * 4))

    agent = GA(
        populationSize=popSize,
        numElite=numElite,
        numEpisodes=numEpisodes,
        evaluationFunction=evaluate,
        alpha=alpha,
        initPopulationFunction=initPopulation,
    )

    for step in range(trails):
        agent.train()
        print('Episode {} finished'.format(step))

    return mean_return_log
def problem6(para: dict, trails: int = 50):
    """
    Train the GA on the cart-pole domain and log mean returns.

    The policy is a ``LinearSoftmax(4, 2, 2)`` and each individual of the
    initial population has ``2 * 81`` parameters.

    Args:
        para: hyper-parameters; the keys 'popSize', 'numElite',
            'numEpisodes' and 'alpha' are read.
        trails: number of times ``agent.train()`` is called.

    Returns:
        list with one mean return per call of the evaluation callback,
        in call order.
    """
    popSize, numElite, numEpisodes, alpha = (
        para[key] for key in ('popSize', 'numElite', 'numEpisodes', 'alpha')
    )
    print('popSize:{}\tnumElite:{}\tnumEpisodes:{}\talpha:{}'.format(
        popSize, numElite, numEpisodes, alpha))

    mean_return_log = []

    def evaluate(theta, numEpisodes):
        # A fresh policy is built for every candidate parameter vector.
        candidate = LinearSoftmax(4, 2, 2)
        candidate.parameters = theta
        averaged = np.mean(
            runEnvironment_carpole(candidate, numEpisodes), axis=0)
        mean_return_log.append(averaged)
        return averaged

    def initPopulation(popSize: int):
        # Standard-normal initial population, one row per individual.
        return np.random.normal(0, 1, (popSize, 2 * 81))

    agent = GA(
        populationSize=popSize,
        numElite=numElite,
        numEpisodes=numEpisodes,
        evaluationFunction=evaluate,
        alpha=alpha,
        initPopulationFunction=initPopulation,
    )

    for step in range(trails):
        agent.train()
        print('Episode {} finished'.format(step))

    return mean_return_log
def problem6(config, iterations: int = 25):
    """
    Train the GA on the cart-pole domain and log every episode return.

    Args:
        config: indexable hyper-parameter container, read positionally as
            (basis, populationSize, numElite, numEpisodes, alpha,
            parent_frac). A basis of 2 selects
            ``init_cartpole_population_2``; any other value selects
            ``init_cartpole_population_3``.
        iterations: number of times ``agent.train()`` is called.

    Returns:
        np.ndarray with the return of every episode executed by the
        evaluation callback, in execution order.
    """
    episode_log = []

    def evaluate(p, episodes):
        # Roll out `episodes` episodes for candidate `p` and remember each
        # individual return before handing the mean back to the GA.
        batch = [run_cartpole_episode(p, config[0]) for _ in range(episodes)]
        episode_log.extend(batch)
        return np.mean(batch)

    agent_policy = LinearApproximation(
        state_dim=4, num_actions=2, basis=config[0])

    # The two original branches differed only in the population initialiser.
    if config[0] == 2:
        init_population = init_cartpole_population_2
    else:
        init_population = init_cartpole_population_3

    agent = GA(
        populationSize=config[1],
        evaluationFunction=evaluate,
        initPopulationFunction=init_population,
        numElite=config[2],
        numEpisodes=config[3],
        alpha=config[4],
        parent_frac=config[5],
    )

    for _ in range(iterations):
        # Whatever train() returns is stored as the policy's parameters.
        agent_policy.parameters = agent.train()

    return np.array(episode_log)
def problem6():
    """
    Run the GA on the cart-pole domain for several trials and plot the returns.

    Each trial reseeds NumPy's global RNG, builds a fresh environment, policy
    and GA agent, and trains for a fixed number of iterations. Every episode
    return seen by the evaluation callback is written into one row of a
    (trials, episodes * iterations * population) array, which is handed to
    ``plot`` once all trials are done.
    """
    pop_size = 20
    elite_count = 5
    episodes_per_eval = 5
    trial_count = 50
    iteration_count = 20
    kp = 10
    step_size = 2.5
    order = 2

    returns = np.zeros(
        (trial_count, episodes_per_eval * iteration_count * pop_size))

    for trial in range(trial_count):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(order, 2)
        # This draw is never read afterwards; it is kept because it still
        # advances the global random stream.
        np.random.randn(tabular_softmax.parameters.shape[0])
        cursor = 0  # next free column in this trial's row of `returns`

        def run_episode():
            # One rollout of at most 1000 steps; returns the discounted return.
            state = cartpole.state
            G, discount = 0, 1
            for _ in range(1000):
                action = tabular_softmax.samplAction(state)
                nextstate, reward, end = cartpole.step(action)
                G += discount * reward
                discount *= cartpole.gamma
                if end == True:
                    break
                state = nextstate
            return G

        def evaluateFunction(theta, numEpisodes):
            nonlocal cursor
            tabular_softmax.parameters = theta
            total = 0
            for _ in range(numEpisodes):
                episode_return = run_episode()
                total += episode_return
                returns[trial][cursor] = episode_return
                cartpole.reset()
                cursor += 1
            return total / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            # Standard-normal population, one row per individual.
            return np.random.randn(
                populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(pop_size, evaluateFunction, initPopulation,
                   elite_count, episodes_per_eval, kp, step_size)

        for iteration in range(iteration_count):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            first = iteration * episodes_per_eval * pop_size
            print(returns[trial][first:cursor])
            print(first)
            print(cursor)

        # NOTE(review): indentation was lost in the source; printing the
        # trained parameters once per trial is assumed - confirm.
        print(p)

    plot(returns, 'Cartpole domain Genetic Algorithm (standard deviation error bars) - 50 trials', 1000)
def problem3():
    """
    Run the GA on the More-Watery 687-Gridworld for several trials and plot.

    Each trial reseeds NumPy's global RNG, builds a fresh environment, policy
    and GA agent, and trains for a fixed number of iterations. Every episode
    return seen by the evaluation callback is written into one row of a
    (trials, episodes * iterations * population) array, which is handed to
    ``plot`` once all trials are done. An episode still running at step
    index 200 is cut off and its return is set to -50.
    """
    pop_size = 40
    elite_count = 20
    episodes_per_eval = 20
    trial_count = 50
    iteration_count = 100
    kp = 30
    step_size = 3.0

    # Arrow shown for each greedy action index in the 5x5 policy printout.
    arrows = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    returns = np.zeros(
        (trial_count, episodes_per_eval * iteration_count * pop_size))

    for trial in range(trial_count):
        np.random.seed(np.random.randint(10000))
        gridworld = Gridworld()
        tabular_softmax = TabularSoftmax(25, 4)
        # This draw is never read afterwards; it is kept because it still
        # advances the global random stream.
        np.random.randn(tabular_softmax.parameters.shape[0])
        cursor = 0  # next free column in this trial's row of `returns`

        def run_episode():
            # One rollout; returns the discounted return, or -50 on cut-off.
            state = gridworld.state
            G, discount = 0, 1
            for t in range(10000):
                action = tabular_softmax.samplAction(state)
                nextstate, reward, end = gridworld.step(action)
                G += discount * reward
                discount *= gridworld.gamma
                if end == True:
                    break
                if t == 200:
                    G = -50
                    break
                state = nextstate
            return G

        def evaluateFunction(theta, numEpisodes):
            nonlocal cursor
            tabular_softmax.parameters = theta
            total = 0
            for _ in range(numEpisodes):
                episode_return = run_episode()
                total += episode_return
                returns[trial][cursor] = episode_return
                gridworld.reset()
                cursor += 1
            return total / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            # Standard-normal population, one row per individual.
            return np.random.randn(
                populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(pop_size, evaluateFunction, initPopulation,
                   elite_count, episodes_per_eval, kp, step_size)

        for iteration in range(iteration_count):
            print("Trial: %d" % (trial, ))
            print("Iteration: %d" % (iteration, ))
            p = agent.train()
            first = iteration * episodes_per_eval * pop_size
            print(returns[trial][first:cursor])
            print(np.mean(returns[trial][first:cursor]))

        # NOTE(review): indentation was lost in the source; the policy
        # printout and print(p) are assumed to run once per trial - confirm.
        # The grid reflects whatever parameters the policy object currently
        # holds, i.e. the last vector passed to evaluateFunction.
        grid = [[0 for _ in range(5)] for _ in range(5)]
        for cell in range(25):
            best = np.argmax(tabular_softmax.getActionProbabilities(cell))
            if best in arrows:
                grid[cell // 5][cell % 5] = arrows[best]
        for row in grid:
            print(row)

        print(p)

    plot(returns, 'More-Watery 687-Gridworld domain Genetic Algorithm (standard deviation error bars) - 50 trials', 3)
def problem6():
    """
    Run the GA on the cart-pole domain, then plot and save the learning curve.

    For every trial the GA is reset and trained for a fixed number of
    iterations. After each ``ga.train()`` call the contents of
    ``evaluate.batchReturn`` are copied into the matching slice of one row of
    a (numTrials, numIterations * numEpisodes * popSize) array. The
    per-episode mean and standard deviation across trials are drawn as an
    error-bar plot saved under ``images/``; the raw, mean and std arrays are
    saved under ``data/``.

    The policy parameter vector has ``2 * fourier_param**4`` entries and every
    individual of the initial population is the all-zero vector.

    Both output folders are created if missing, and the arrays are written
    before the plot window is shown, so a missing folder or an interrupted
    viewer cannot discard the results of the training run.
    """
    import os  # function-scope import: only used to create output folders

    print("ga-cartpole-softmax_theta_phi")

    # Create the output folders before training so a path problem surfaces
    # immediately instead of after the whole experiment has run.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    fourier_param = 4

    def initPopFn(pop_size):
        # All individuals start at the zero parameter vector.
        return np.zeros((pop_size, 2 * fourier_param**4))

    # Kept from the original: one environment step whose result is discarded
    # (presumably a smoke test of the environment - confirm before removing).
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    numTrials = 10
    numIterations = 100
    batch_size = numEpisodes * popSize  # episodes recorded per ga.train()
    total_episodes = numIterations * batch_size
    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        ga.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart ga: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            ga.train()
            batch_start = i * batch_size
            batch_end = batch_start + batch_size
            # Assumes batchReturn holds exactly the returns of this train()
            # call; a length mismatch raises on assignment.
            results[trial, batch_start:batch_end] = np.array(
                evaluate.batchReturn)

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)), fontsize=15,
             backgroundcolor='w')

    # File name encodes the run configuration and the wall-clock time.
    plt_name = "ga_cartpole"
    plt_name += ("_numTrials_" + str(numTrials) + "_numIter_"
                 + str(numIterations) + "_popSize_" + str(popSize))
    plt_name += datetime.now().strftime("_t_%H_%M")
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    plt.ylim(average_results.min(), average_results.max())
    plt.savefig(plt_path, dpi=200)

    # Persist the numbers before plt.show(), which may block until the
    # window is closed.
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    plt.show()
def problem3():
    """
    Run the GA on the More-Watery 687-Gridworld, then plot and save results.

    For every trial the GA is reset and trained for a fixed number of
    iterations. After each ``ga.train()`` call the contents of
    ``evaluate.batchReturn`` are copied into the matching slice of one row of
    a (numTrials, numIterations * numEpisodes * popSize) array. The
    per-episode mean and standard deviation across trials are drawn as an
    error-bar plot saved under ``images/``; the raw, mean and std arrays are
    saved under ``data/``.

    The initial population is drawn from a 100-dimensional normal
    distribution with zero mean and identity covariance.

    Both output folders are created if missing, and the arrays are written
    before the plot window is shown, so a missing folder or an interrupted
    viewer cannot discard the results of the training run.
    """
    import os  # function-scope import: only used to create output folders

    def initPopFn(pop_size):
        # One row per individual, sampled from N(0, I) in 100 dimensions.
        theta_zeros = np.zeros(100)
        return np.random.multivariate_normal(
            theta_zeros, np.identity(len(theta_zeros)), size=pop_size)

    print("ga-gridworld-tabular_softmax")

    # Create the output folders before training so a path problem surfaces
    # immediately instead of after the whole experiment has run.
    os.makedirs("images", exist_ok=True)
    os.makedirs("data", exist_ok=True)

    popSize = 10
    evaluate = Evaluate()
    numElite = 4
    numEpisodes = 10
    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    numTrials = 20
    numIterations = 25
    batch_size = numEpisodes * popSize  # episodes recorded per ga.train()
    total_episodes = numIterations * batch_size
    results = np.zeros((numTrials, total_episodes))

    for trial in range(numTrials):
        ga.reset()
        for i in range(numIterations):
            if i % 5 == 0:
                print("ga: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            ga.train()
            batch_start = i * batch_size
            batch_end = batch_start + batch_size
            # Assumes batchReturn holds exactly the returns of this train()
            # call; a length mismatch raises on assignment.
            results[trial, batch_start:batch_end] = np.array(
                evaluate.batchReturn)

    average_results = np.mean(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)

    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)), fontsize=15,
             backgroundcolor='w')

    # File name encodes the run configuration and the wall-clock time.
    plt_name = "ga_gridworld"
    plt_name += "_numTrials_" + str(numTrials) + "_numIter_" + str(
        numIterations)
    plt_name += datetime.now().strftime("_%H_%M")
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"

    plt.ylim(average_results.min(), average_results.max())
    plt.savefig(plt_path, dpi=200)

    # Persist the numbers before plt.show(), which may block until the
    # window is closed.
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)

    plt.show()