def main():
    env = gym.make('FrozenLake-v0')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)

    # Exercise 5 specific: load the Q-table trained in exercise 3
    ex3qtab = qTable(env.observation_space.n, env.action_space.n)
    with open("ex3qtable.json", 'r') as f:
        ex3qtab.table = json.loads(f.read())

    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(8000):
        observation = env.reset()
        accumulatedReward = 0
        for t in range(10000):
            # Render environment
            # env.render()

            # Select action
            action = epsilonGreedy(epsilon, env, observation, qtab)

            # Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward

            # Update Q (exercise 5 specific): the target action is chosen by
            # the fixed exercise-3 table, but its value is read from qtab
            oldQ = qtab.getQ(prevObs, action)
            currQ = qtab.getQ(observation, ex3qtab.getMaxQAction(observation))
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * currQ - oldQ)
            qtab.setQ(prevObs, action, newQ)

            # Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                break

        # Decrease exploration rate; ends up at epsilon ~ 0.002 after 8000 episodes
        epsilon *= 0.9995

        windowAvg = 0
        for i in rewardWindow:
            windowAvg += i
        rew.append(windowAvg / 100)
        print(i_episode, " ", windowAvg, end='\r')
        if windowAvg >= 78:
            break

    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Frozen Lake on policy')
    plt.grid(True)
    plt.savefig("op.png")
    plt.show()
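# The listings in this file all assume the same module-level preamble:
# gym/json/matplotlib imports, the LEARNING_RATE and DISCOUNT constants, and
# the qTable/epsilonGreedy helpers. None of these appear in the source, so
# the following is a minimal sketch reconstructed from how they are called;
# the constant values and the list-of-lists table layout are assumptions
# (a list of lists round-trips cleanly through json.dumps/json.loads).
import random

import gym
import json
import matplotlib.pyplot as plt

LEARNING_RATE = 0.1  # assumed value, not shown in the source
DISCOUNT = 0.95      # assumed value, not shown in the source

class qTable:
    def __init__(self, nStates, nActions):
        # One row of action values per state, initialised to zero
        self.table = [[0.0 for _ in range(nActions)] for _ in range(nStates)]

    def getQ(self, state, action):
        return self.table[state][action]

    def setQ(self, state, action, value):
        self.table[state][action] = value

    def getMaxQ(self, state):
        return max(self.table[state])

    def getMaxQAction(self, state):
        row = self.table[state]
        return row.index(max(row))

def epsilonGreedy(epsilon, env, observation, qtab):
    # Explore with probability epsilon, otherwise act greedily on qtab
    if random.random() < epsilon:
        return env.action_space.sample()
    return qtab.getMaxQAction(observation)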
def main():
    env = gym.make('FrozenLake-v0')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(8000):
        observation = env.reset()
        accumulatedReward = 0
        for t in range(10000):
            # Render environment
            # env.render()

            # Select action
            action = epsilonGreedy(epsilon, env, observation, qtab)

            # Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward

            # Update Q
            oldQ = qtab.getQ(prevObs, action)
            maxCurrQ = qtab.getMaxQ(observation)
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * maxCurrQ - oldQ)
            qtab.setQ(prevObs, action, newQ)

            # Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                break

        # Decrease exploration rate; ends up at epsilon ~ 0.002 after 8000 episodes
        epsilon *= 0.9995

        windowAvg = 0
        for i in rewardWindow:
            windowAvg += i
        print(i_episode, " ", windowAvg, end='\r')
        rew.append(windowAvg / 100)
        if windowAvg >= 78:
            break

    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Frozen Lake Q learning')
    plt.grid(True)
    plt.savefig("qlrn.png")
    plt.show()

    # Export the learned Q-table to JSON (loaded again in exercise 5)
    with open("ex3qtable.json", 'w') as f:
        f.write(json.dumps(qtab.table))
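# For reference, the inner-loop update above is the standard tabular
# Q-learning rule, with alpha = LEARNING_RATE and gamma = DISCOUNT:
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# Because the target uses max_a' regardless of which action the epsilon-greedy
# behaviour policy actually takes next, this is an off-policy method.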
def main():
    env = gym.make('FrozenLake-v0')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(8000):
        observation = env.reset()
        # SARSA selects the first action before entering the step loop
        action = epsilonGreedy(epsilon, env, observation, qtab)
        accumulatedReward = 0
        for t in range(100):
            # Render environment
            # env.render()

            # Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward

            # Select the next action with the same epsilon-greedy policy
            prevAct = action
            action = epsilonGreedy(epsilon, env, observation, qtab)

            # Update Q using the action actually selected (on-policy target)
            oldQ = qtab.getQ(prevObs, prevAct)
            actQ = qtab.getQ(observation, action)
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * actQ - oldQ)
            qtab.setQ(prevObs, prevAct, newQ)

            # Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                break

        # Decrease exploration rate
        epsilon *= 0.9995

        windowAvg = 0
        for i in rewardWindow:
            windowAvg += i
        rew.append(windowAvg / 100)
        print(i_episode, " ", windowAvg, end='\r')
        if windowAvg >= 78:
            break

    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Frozen Lake SARSA')
    plt.grid(True)
    plt.savefig("sarsa.png")
    plt.show()
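# Unlike Q-learning, SARSA bootstraps from the action the behaviour policy
# actually picks next, which makes it on-policy:
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
#
# where a' is drawn from the same epsilon-greedy policy that is being learned.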
def main():
    env = gym.make('Taxi-v1')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(80000):
        observation = env.reset()
        accumulatedReward = 0
        for t in range(10000):
            # Render environment (left disabled: rendering every step of
            # 80000 training episodes is extremely slow)
            # env.render()

            # Select action
            action = epsilonGreedy(epsilon, env, observation, qtab)

            # Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward

            # Update Q
            oldQ = qtab.getQ(prevObs, action)
            maxCurrQ = qtab.getMaxQ(observation)
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * maxCurrQ - oldQ)
            qtab.setQ(prevObs, action, newQ)

            # Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                # Plot the per-episode reward rather than the window average
                rew.append(accumulatedReward)
                break

        # Decrease exploration rate
        epsilon *= 0.9995

        windowAvg = 0
        for i in rewardWindow:
            windowAvg += i
        print(i_episode, " ", windowAvg, end='\r')
        if windowAvg >= 970:
            break

    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Taxi Q learning')
    plt.grid(True)
    plt.savefig("qlrn.png")
    plt.show()
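# A small helper one might add after training to watch the learned policy;
# it is not part of the original scripts. The name `evaluate` and its
# signature are hypothetical, but it only uses the qTable API assumed above.
def evaluate(env, qtab, episodes=5, render=True):
    for _ in range(episodes):
        observation = env.reset()
        done = False
        total = 0
        while not done:
            if render:
                env.render()
            # Pure exploitation: always take the greedy action
            action = qtab.getMaxQAction(observation)
            observation, reward, done, info = env.step(action)
            total += reward
        print("episode reward:", total)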