def frozen_ql_experiment(env_name, new_lake):
    """Run three Q-learning experiments on a FrozenLake-style gym env.

    Experiment 1: alpha=0.5, epsilon=0.1
    Experiment 2: alpha=0.9, epsilon=0.1
    Experiment 3: alpha=0.5, epsilon=0.9 (smaller per-episode step cap)

    For each experiment a fresh QLearner is trained for 10000 episodes,
    then two plots are saved: chunked average reward vs iterations, and
    cumulative computation time vs episodes.

    Args:
        env_name: gym environment id passed to ``gym.make``.
        new_lake: lake map description forwarded as ``desc=`` to ``gym.make``.

    Side effects: writes Monitor output to "q_learning/" and six PNGs
    under ./plots/frozen_lake_experiment/.
    """
    np.random.seed(0)
    # Rewards are scaled x100 and clipped to [-100, 100] by the wrapper.
    min_r, max_r = -100.0, 100.0
    problem = MyWrapper.TransformReward(
        gym.make(env_name, desc=new_lake),
        lambda r: np.clip(r * 100.0, min_r, max_r))
    problem.seed(0)
    problem.reset()
    env = MyWrapper.Monitor(problem, "q_learning/", force=True)

    num_of_states = env.observation_space.n
    num_of_action = env.action_space.n

    alpha = [0.5, 0.9]  # learning rates tried
    gamma = 0.99        # discount factor
    episodes = 10000
    rar = [0.1, 0.9]    # epsilon (random action rate) values tried
    radr = 0.99         # random action decay rate

    def make_learner(learning_rate, epsilon):
        # One fresh learner per experiment so Q-tables don't leak across runs.
        return QLearner(
            num_actions=num_of_action,
            num_states=num_of_states,
            alpha=learning_rate,
            gamma=gamma,
            rar=epsilon,
            radr=radr,
        )

    # --- Experiment 1: alpha=0.5, epsilon=0.1 (plots 1 and 2) ------------
    rewards_list, iterations_list, time_list = _run_ql_training(
        env, make_learner(alpha[0], rar[0]), episodes, max_steps=10000000)
    _plot_avg_reward(
        rewards_list, episodes,
        "Average Rewards vs Iterations (learning rate: 0.5, Epsilon: 0.1)",
        "./plots/frozen_lake_experiment/frozen_qlearner_reward_vs_iterations.png",
        "Average Reward")
    _plot_time(
        time_list, episodes,
        "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.1)",
        "./plots/frozen_lake_experiment/computation_time_vs_episodes.png")

    # --- Experiment 2: alpha=0.9, epsilon=0.1 (plots 3 and 4) ------------
    rewards_list, iterations_list, time_list = _run_ql_training(
        env, make_learner(alpha[1], rar[0]), episodes, max_steps=10000000)
    _plot_avg_reward(
        rewards_list, episodes,
        "Reward vs Iteration (Learning Rate: 0.9, Epsilon:0.1)",
        "./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_alpha0.9.png",
        "Average Rewards")
    _plot_time(
        time_list, episodes,
        "Computation time vs episodes (learning rate: 0.9, Epsilon: 0.1)",
        "./plots/frozen_lake_experiment/computation_time_vs_episodes_alpha0.9.png")

    # --- Experiment 3: alpha=0.5, epsilon=0.9 (plots 5 and 6) ------------
    # NOTE: this run keeps the original, much smaller step cap (10000).
    rewards_list, iterations_list, time_list = _run_ql_training(
        env, make_learner(alpha[0], rar[1]), episodes, max_steps=10000)
    _plot_avg_reward(
        rewards_list, episodes,
        "Reward vs Iteration (Learning Rate: 0.5, Epsilon:0.9)",
        "./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_epsilon0.9.png",
        "Average Rewards")
    _plot_time(
        time_list, episodes,
        "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.9)",
        "./plots/frozen_lake_experiment/computation_time_vs_episodes_epsilon0.9.png")

    # BUG FIX: the original called env.close() right after experiment 1 and
    # then kept stepping the closed/disabled Monitor through experiments 2
    # and 3 (and never closed it at the end). Close exactly once, after all
    # experiments have finished.
    env.close()


def _chunk_list(l, n):
    """Yield successive n-sized chunks of list ``l``."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


def _run_ql_training(env, qlearner, episodes, max_steps):
    """Train ``qlearner`` on ``env`` for ``episodes`` episodes.

    Args:
        env: gym environment (old step API: obs, reward, done, info).
        qlearner: a QLearner instance; its ``s``/``a`` attributes hold the
            current state/action and ``q_table`` is updated via ``query``.
        episodes: number of episodes to run.
        max_steps: hard cap on steps per episode.

    Returns:
        (rewards_list, iterations_list, time_list) where rewards_list[k] is
        the total reward of episode k, iterations_list[k] the last step index
        reached, and time_list[k] the CUMULATIVE wall-clock time in
        milliseconds through episode k.
    """
    rewards_list = []
    iterations_list = []
    time_list = []
    cumulative_ms = 0
    for _episode in range(episodes):
        start_time = time.time()
        qlearner.s = env.reset()  # current state lives on the learner
        done = False
        total_reward = 0
        for i in range(max_steps):
            if done:
                break
            # Either a random action or the best action for the current
            # state, chosen epsilon-greedily from the current Q-table.
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)
            next_state, reward, done, info = env.step(action)
            qlearner.a = action
            total_reward += reward
            # query() updates Q[s, a] from (next_state, reward); with the
            # False flag it does not advance the learner's internal s/a,
            # so we advance the state ourselves below.
            temp_action = qlearner.query(next_state, reward, False)
            qlearner.s = next_state
        cumulative_ms += (time.time() - start_time) * 1000
        time_list.append(cumulative_ms)
        rewards_list.append(total_reward)
        iterations_list.append(i)  # step index reached in this episode
    return rewards_list, iterations_list, time_list


def _plot_avg_reward(rewards_list, episodes, title, path, ylabel):
    """Plot per-chunk (episodes/50 chunks) average reward and save to ``path``."""
    episode_size = int(episodes / 50)
    segments = list(_chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]
    plt.title(title)
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel(ylabel)
    plt.savefig(path)
    plt.close()
    plt.figure()  # fresh figure for the next plot (matches original flow)


def _plot_time(time_list, episodes, title, path):
    """Plot cumulative computation time per episode and save to ``path``."""
    plt.title(title)
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (mili seconds)")
    plt.savefig(path)
    plt.close()
    plt.figure()  # fresh figure for the next plot (matches original flow)