def __call__(self, parameters: np.ndarray, numEpisodes: int):
    G = []
    policy = SoftmaxWithLFA(4, 2, self.k)
    policy.parameters = parameters
    env = Cartpole()
    for ep in range(numEpisodes):
        env.reset()
        Gi = 0
        while not env.isEnd:
            state = env.state
            action = policy.samplAction(state)
            next_state, reward, _ = env.step(action)
            Gi += reward
        G.append(Gi)
        # Append to the shared list under the lock so that concurrent
        # evaluations do not interleave their writes.
        with self.lock:
            self.curTrialReturns.append(Gi)
    print("Mean Return ", np.mean(G))
    return np.mean(G)
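# SoftmaxWithLFA is used above but not defined in this section. Below is a
# minimal sketch of what such a policy might look like, assuming a coupled
# Fourier basis of order k over a 4-dimensional state normalized to [0, 1];
# the constructor signature and the parameters/samplAction interface come
# from the snippet above, everything else is an assumption.
import itertools
import numpy as np

class SoftmaxWithLFASketch:
    def __init__(self, numStates: int, numActions: int, k: int):
        self.numActions = numActions
        # All integer coefficient vectors c in {0, ..., k}^numStates.
        self.C = np.array(list(itertools.product(range(k + 1), repeat=numStates)))
        self.parameters = np.zeros(numActions * len(self.C))

    def phi(self, state: np.ndarray) -> np.ndarray:
        # Fourier features cos(pi * c . s); assumes s is normalized to [0, 1].
        return np.cos(np.pi * self.C @ state)

    def samplAction(self, state: np.ndarray) -> int:
        theta = self.parameters.reshape(self.numActions, -1)
        prefs = theta @ self.phi(state)
        prefs -= prefs.max()  # subtract max for numerical stability
        probs = np.exp(prefs) / np.exp(prefs).sum()
        return np.random.choice(self.numActions, p=probs)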
def runEpisode(self, policy):
    # Roll out one episode in a fresh environment and return its
    # (undiscounted) return.
    env = Cartpole()
    Gi = 0
    while not env.isEnd:
        state = env.state
        action = policy.samplAction(state)
        next_state, reward, _ = env.step(action)
        Gi += reward
    return Gi
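# Episodes are independent, so this evaluation parallelizes naturally. A
# minimal sketch with multiprocessing.Pool follows; it assumes the policy
# object is picklable and that runEpisode can be resolved at module level
# (bound methods of objects holding locks generally cannot be pickled).
from multiprocessing import Pool
import numpy as np

def evaluateParallel(runEpisode, policy, numEpisodes: int) -> float:
    with Pool() as pool:
        # One rollout per episode, distributed across worker processes.
        G = pool.map(runEpisode, [policy] * numEpisodes)
    return float(np.mean(G))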
class GenEpCartpole:
    def __init__(self, numStates: int, numActions: int, k: int):
        self._numStates = numStates
        self._numActions = numActions
        self._k = k
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(numStates, numActions, k)

    def __call__(self, theta: np.ndarray, numEpisodes: int):
        # Generate a batch of episodes D; each history H stores the feature
        # vectors, actions, and rewards of one episode.
        self.policy.parameters = theta
        D = []
        for episode in range(numEpisodes):
            self.environment.reset()
            H = {}
            S, A, R = [], [], []
            while not self.environment.isEnd:
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                S.append(self.policy.phiS(state))
                A.append(action)
                R.append(reward)
            H['S'] = np.array(S)
            H['A'] = np.array(A)
            H['R'] = np.array(R)
            D.append(H)
        return D

    def reset(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(self._numStates, self._numActions, self._k)
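# Example use of the histories returned by GenEpCartpole: the per-episode
# undiscounted return is just the sum of each H['R']. The generator instance
# and the zero parameter vector below are hypothetical placeholders.
import numpy as np

gen = GenEpCartpole(4, 2, k=2)
D = gen(np.zeros(gen.policy.parameters.shape), numEpisodes=10)
episodeReturns = [H['R'].sum() for H in D]
print("mean return:", np.mean(episodeReturns))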
def run_cartpole_episode(p, basis):
    # Roll out one episode with a linear softmax policy built on the given
    # basis and return the discounted episode return.
    environment = Cartpole()
    policy = LinearApproximation(state_dim=4, num_actions=2, basis=basis)
    policy.parameters = p
    is_end = False
    discounted_return = 0
    t = 0
    while not is_end:
        action = policy.samplAction(environment.state)
        new_state, reward, is_end = environment.step(action)
        discounted_return += (environment.gamma ** t) * reward
        t += 1
    environment.reset()
    return discounted_return
def runHistory(getAction, numeps=10000):
    # Collect raw (state, action, reward, state, ...) histories for numeps
    # episodes under the policy induced by getAction.
    histories = []
    cartPole = Cartpole()
    for ep in range(numeps):
        history = []
        cartPole.reset()
        history.append(cartPole.state)
        while not cartPole.isEnd:
            action = getAction(cartPole.state)
            s, r, e = cartPole.step(action)
            history.append(action)
            history.append(r)
            history.append(s)
        histories.append(history)
    return histories
def runEnvironment_carpole(policy, numeps=10000):
    # Estimate per-episode discounted returns of a policy over numeps
    # episodes. Episodes are cut off after 200 steps and assigned a
    # penalty return of -50.
    returns = np.zeros(numeps)
    env = Cartpole()
    for ep in range(numeps):
        env.reset()
        step = 0
        g = 0
        while not env.isEnd:
            action = policy.samplAction(env.state)
            s, r, e = env.step(action)
            g += (env.gamma ** step) * r
            step += 1
            if step > 200:
                g = -50
                break
        returns[ep] = g
    return returns
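# Sketch: aggregate the Monte Carlo returns from runEnvironment_carpole into
# a point estimate with a standard-error bar. The policy object is assumed
# to expose samplAction as in the snippets above; summarizeReturns itself is
# hypothetical.
import numpy as np

def summarizeReturns(policy, numeps=1000):
    returns = runEnvironment_carpole(policy, numeps)
    mean = returns.mean()
    stderr = returns.std(ddof=1) / np.sqrt(numeps)
    print(f"J(pi) ~= {mean:.2f} +/- {stderr:.2f} (1 s.e.)")
    return mean, stderr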
class EvaluateCartpole:
    def __init__(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(4, 2)
        self._G = []

    @property
    def batchReturn(self) -> list:
        # All episode returns accumulated since the last reset.
        return self._G

    def __call__(self, theta: np.ndarray, numEpisodes: int):
        self.policy.parameters = theta
        for episode in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            while not self.environment.isEnd:
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                G_episode += reward
            self._G.append(G_episode)
        # Average only over this call's episodes; self._G keeps the full
        # running history for batchReturn.
        return np.mean(self._G[-numEpisodes:])

    def reset(self):
        self.environment = Cartpole()
        self.policy = SoftmaxThetaPhi(4, 2)
        self._G = []
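# Example use of EvaluateCartpole as a black-box objective: evaluate one
# parameter vector and read back the per-episode returns. The zero vector is
# only a placeholder; theta's size must match the SoftmaxThetaPhi(4, 2)
# parameterization, which is defined elsewhere.
import numpy as np

evaluate = EvaluateCartpole()
meanReturn = evaluate(np.zeros(evaluate.policy.parameters.shape), numEpisodes=5)
print(meanReturn, evaluate.batchReturn)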
def problem6():
    """
    Repeat the previous question, but using the GA (as described earlier in
    this homework) on the cart-pole domain. Report the same quantities and
    how the policy was parameterized.
    """
    populationSize = 20
    numElite = 5
    numEpisodes = 5
    numTrials = 50
    numIterations = 20
    Kp = 10
    alpha = 2.5
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations * populationSize))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        count = 0

        def evaluateFunction(theta, numEpisodes):
            # Estimate J(theta) as the average discounted return over
            # numEpisodes episodes; each episode return is also logged
            # into the returns table for plotting.
            nonlocal count
            expected_reward = 0
            numTimeSteps = 1000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1
            return expected_reward / numEpisodes

        def initPopulation(populationSize: int) -> np.ndarray:
            return np.random.randn(populationSize, tabular_softmax.parameters.shape[0])

        agent = GA(populationSize, evaluateFunction, initPopulation,
                   numElite, numEpisodes, Kp, alpha)
        for iteration in range(numIterations):
            print("Trial: %d" % (trial,))
            print("Iteration: %d" % (iteration,))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * populationSize : count])
        print(p)
    plot(returns, 'Cartpole domain Genetic Algorithm (standard deviation error bars) - 50 trials', 1000)
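# GA is defined elsewhere in the homework; only its call sites appear in this
# section. The following is a minimal sketch consistent with the constructor
# used above (populationSize, evaluateFunction, initPopulation, numElite,
# numEpisodes, Kp, alpha). The truncation selection plus Gaussian mutation
# scheme is an assumption, not the homework's definitive implementation.
import numpy as np

class GASketch:
    def __init__(self, populationSize, evaluate, initPopulation,
                 numElite, numEpisodes, Kp=10, alpha=2.5):
        self.evaluate = evaluate
        self.numElite = numElite
        self.numEpisodes = numEpisodes
        self.Kp = Kp        # number of parents used to produce children
        self.alpha = alpha  # mutation scale
        self.population = initPopulation(populationSize)

    def train(self):
        # Rank the population by estimated return.
        fitness = np.array([self.evaluate(p, self.numEpisodes) for p in self.population])
        order = np.argsort(fitness)[::-1]
        elites = self.population[order[:self.numElite]]
        parents = self.population[order[:self.Kp]]
        # Children are Gaussian mutations of randomly chosen parents;
        # elites survive unchanged.
        numChildren = len(self.population) - self.numElite
        idx = np.random.randint(self.Kp, size=numChildren)
        children = parents[idx] + self.alpha * np.random.randn(numChildren, self.population.shape[1])
        self.population = np.vstack([elites, children])
        return elites[0]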
def problem5():
    """
    Repeat the previous question, but using first-choice hill-climbing (as
    described in class) on the cart-pole domain. Report the same quantities
    and how the policy was parameterized.
    """
    sigma = 1.0
    numEpisodes = 150
    numTrials = 50
    numIterations = 75
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        # count == -1 is a sentinel: episodes run during agent construction
        # (its initial evaluation of theta) are not logged.
        count = -1

        def evaluateFunction(theta, numEpisodes):
            nonlocal count
            expected_reward = 0
            numTimeSteps = 1000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                if count != -1:
                    returns[trial][count] = G
                    count += 1
                cartpole.reset()
            return expected_reward / numEpisodes

        agent = FCHC(theta, sigma, evaluateFunction, numEpisodes)
        count = 0
        for iteration in range(numIterations):
            print("Trial: %d" % (trial,))
            print("Iteration: %d" % (iteration,))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes : count])
            print(np.mean(returns[trial][iteration * numEpisodes : count]))
        print(p)
    plot(returns, 'Cartpole domain First Choice Hill Climbing (standard deviation error bars) - 50 trials', 1000)
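# FCHC is defined elsewhere; a minimal sketch matching the constructor used
# above follows. First-choice hill climbing samples a candidate from an
# isotropic Gaussian around the current best and accepts it only if its
# estimated return improves. The evaluation in __init__ mirrors the
# count == -1 sentinel above (the constructor's episodes are unlogged).
import numpy as np

class FCHCSketch:
    def __init__(self, theta, sigma, evaluate, numEpisodes):
        self.theta = theta
        self.sigma = sigma
        self.evaluate = evaluate
        self.numEpisodes = numEpisodes
        self.bestJ = evaluate(theta, numEpisodes)  # initial evaluation

    def train(self):
        candidate = self.theta + self.sigma * np.random.randn(*self.theta.shape)
        J = self.evaluate(candidate, self.numEpisodes)
        if J > self.bestJ:
            self.theta, self.bestJ = candidate, J
        return self.theta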
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you
    cannot directly apply a tabular softmax policy. It is up to you to
    create a representation for the policy for this problem. Consider using
    the softmax action selection with linear function approximation as
    described in the notes. Report the same quantities, as well as how you
    parameterized the policy.
    """
    popSize = 10
    numElite = 5
    epsilon = 4.0
    sigma = 1.0
    numEpisodes = 20
    numTrials = 5
    numIterations = 40
    k = 2

    returns = np.zeros((numTrials, numEpisodes * numIterations * popSize))
    for trial in range(numTrials):
        np.random.seed(np.random.randint(10000))
        cartpole = Cartpole()
        tabular_softmax = TabularSoftmaxContinuous(k, 2)
        theta = np.random.randn(tabular_softmax.parameters.shape[0])
        count = 0

        def evaluateFunction(theta, numEpisodes):
            # Average discounted return of theta; each episode return is
            # also logged into the returns table for plotting.
            nonlocal count
            expected_reward = 0
            numTimeSteps = 1000
            tabular_softmax.parameters = theta
            for episode in range(numEpisodes):
                state = cartpole.state
                G = 0
                discount = 1
                for t in range(numTimeSteps):
                    action = tabular_softmax.samplAction(state)
                    nextstate, reward, end = cartpole.step(action)
                    G += discount * reward
                    discount *= cartpole.gamma
                    if end:
                        break
                    state = nextstate
                expected_reward += G
                returns[trial][count] = G
                cartpole.reset()
                count += 1
            return expected_reward / numEpisodes

        agent = CEM(theta, sigma, popSize, numElite, numEpisodes,
                    evaluateFunction, epsilon)
        for iteration in range(numIterations):
            print("Trial: %d" % (trial,))
            print("Iteration: %d" % (iteration,))
            p = agent.train()
            print(returns[trial][iteration * numEpisodes * popSize : count])
        print(p)
    plot(returns, 'Cartpole domain Cross Entropy Method (standard deviation error bars) - 5 trials', 1000)
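# CEM is defined elsewhere; a minimal sketch matching the constructor used
# above follows. The diagonal initialization and the epsilon-regularized
# covariance refit are assumptions consistent with the course notes, not the
# homework's definitive implementation.
import numpy as np

class CEMSketch:
    def __init__(self, theta, sigma, popSize, numElite, numEpisodes,
                 evaluate, epsilon):
        self.theta = theta
        self.Sigma = sigma * np.eye(theta.size)
        self.popSize = popSize
        self.numElite = numElite
        self.numEpisodes = numEpisodes
        self.evaluate = evaluate
        self.epsilon = epsilon

    def train(self):
        # Sample candidates around the current mean and rank them by return.
        candidates = np.random.multivariate_normal(self.theta, self.Sigma, self.popSize)
        J = np.array([self.evaluate(c, self.numEpisodes) for c in candidates])
        elite = candidates[np.argsort(J)[::-1][:self.numElite]]
        # Refit the mean and the (regularized) covariance to the elite set.
        self.theta = elite.mean(axis=0)
        diff = elite - self.theta
        self.Sigma = (self.epsilon * np.eye(self.theta.size) + diff.T @ diff) \
            / (self.epsilon + self.numElite)
        return self.theta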
def problem6():
    """
    Repeat the previous question, but using the GA (as described earlier in
    this homework) on the cart-pole domain. Report the same quantities and
    how the policy was parameterized.
    """
    print("ga-cartpole-softmax_theta_phi")

    # Warm-up call on the dynamics.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4

    def initPopFn(pop_size):
        # One parameter row per population member: 2 actions times
        # fourier_param**4 Fourier features of the 4-dimensional state.
        return np.zeros((pop_size, 2 * fourier_param**4))

    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    ga = GA(popSize, evaluate, initPopFn, numElite, numEpisodes)

    numTrials = 10
    numIterations = 100
    total_episodes = numIterations * numEpisodes * popSize
    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        ga.reset()
        evaluate.reset()  # clear the accumulated returns for the new trial
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart ga: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            ga.train()
            batch_start = i * numEpisodes * popSize
            batch_end = (i + 1) * numEpisodes * popSize
            # batchReturn accumulates across iterations, so keep only the
            # slice produced by this iteration's batch.
            results[trial, batch_start:batch_end] = \
                np.array(evaluate.batchReturn[batch_start:batch_end])

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)
    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)),
             fontsize=15, backgroundcolor='w')

    plt_name = "ga_cartpole"
    now = datetime.now()
    param_string = "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    dt_string = now.strftime("_t_%H_%M")
    plt_name += param_string + dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)
def problem4():
    """
    Repeat the previous question, but using the cross-entropy method on the
    cart-pole domain. Notice that the state is not discrete, and so you
    cannot directly apply a tabular softmax policy. It is up to you to
    create a representation for the policy for this problem. Consider using
    the softmax action selection with linear function approximation as
    described in the notes. Report the same quantities, as well as how you
    parameterized the policy.
    """
    print("cem-cartpole-softmax_theta_phi")

    # Warm-up call on the dynamics.
    state = np.array([0, 0, 0, 0])
    env = Cartpole()
    env.nextState(state, 0)

    fourier_param = 4
    theta = np.zeros(2 * fourier_param**4)
    sigma = 1
    popSize = 10
    numElite = 3
    numEpisodes = 5
    evaluate = EvaluateCartpole()
    epsilon = 0.005
    cem = CEM(theta, sigma, popSize, numElite, numEpisodes, evaluate, epsilon)

    numTrials = 10
    numIterations = 100
    total_episodes = numIterations * numEpisodes * popSize
    results = np.zeros((numTrials, total_episodes))
    for trial in range(numTrials):
        cem.reset()
        evaluate.reset()  # clear the accumulated returns for the new trial
        for i in range(numIterations):
            if i % 5 == 0:
                print("cart cem: ", "trial: ", trial, "/", numTrials,
                      " iteration: ", i, "/", numIterations)
            cem.train()
            batch_start = i * numEpisodes * popSize
            batch_end = (i + 1) * numEpisodes * popSize
            # batchReturn accumulates across iterations, so keep only the
            # slice produced by this iteration's batch.
            results[trial, batch_start:batch_end] = \
                np.array(evaluate.batchReturn[batch_start:batch_end])

    average_results = np.average(results, axis=0)
    std_results = np.std(results, axis=0)
    maximumEpisodes = average_results.shape[0]
    max_avg = np.max(average_results)
    plt.errorbar(np.arange(maximumEpisodes), average_results, std_results,
                 marker='.', ecolor='aqua')
    plt.grid(True)
    plt.axhline(max_avg)
    plt.text(0, max_avg, "max: " + str(round(max_avg, 2)),
             fontsize=15, backgroundcolor='w')

    plt_name = "cem_cartpole"
    now = datetime.now()
    param_string = "_numTrials_" + str(numTrials) + "_numIter_" \
        + str(numIterations) + "_popSize_" + str(popSize)
    dt_string = now.strftime("_t_%H_%M")
    plt_name += param_string + dt_string
    print("plt_name=", plt_name)
    plt_path = "images/" + plt_name + ".png"
    plt.savefig(plt_path, dpi=200)
    plt.show()
    np.save("data/" + "results_" + plt_name, results)
    np.save("data/" + "average_results_" + plt_name, average_results)
    np.save("data/" + "std_results_" + plt_name, std_results)
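# Why theta above has 2 * fourier_param**4 entries: the policy is a softmax
# over 2 actions, each scored by a linear function of fourier_param**4
# coupled Fourier features of the 4-dimensional cart-pole state. A sketch of
# that feature count, assuming one cosine feature per coefficient vector
# with entries in {0, ..., fourier_param - 1}; the helper below is
# hypothetical.
import itertools

def numFourierFeatures(order: int, stateDims: int = 4) -> int:
    # One feature cos(pi * c . s) per coefficient vector c.
    return len(list(itertools.product(range(order), repeat=stateDims)))

assert numFourierFeatures(4) == 4**4  # 256 features per action, 512 parameters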