            next_states = np.array([each[3] for each in batch])

            # Train network
            target_Qs = sess.run(mainQN.output,
                                 feed_dict={mainQN.inputs_: next_states})

            # Set target_Qs to 0 for states where episode ends
            episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
            target_Qs[episode_ends] = (0, 0)

            targets = rewards + gamma * np.max(target_Qs, axis=1)

            loss, _ = sess.run(
                [mainQN.loss, mainQN.opt],
                feed_dict={
                    mainQN.inputs_: states,
                    mainQN.targetQs_: targets,
                    mainQN.actions_: actions
                })

        utility_list.append(utility)
        if len(utility_list) == avg_window:
            curve.add_sample(['utility', 'epsilon'], episode,
                             [np.mean(utility_list), epsilon])
            utility_list = []

    saver.save(
        sess, f'checkpoints/{training_name}_{gym_env_name[-2:]}/cartpole.ckpt')

curve.save_plot(f'./curves/q_deep_{training_name}_{gym_env_name[-2:]}.png')
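For context, this training step assumes that `batch` was sampled from a replay memory earlier in the loop and that `states`, `actions`, and `rewards` were unpacked from it in the same way as `next_states`. A minimal sketch of that unpacking, assuming each replay entry is a `(state, action, reward, next_state)` tuple; the `memory.sample()` helper is hypothetical and only stands in for however the original loop draws the batch:

# Sketch only: `memory` is a hypothetical replay buffer; the original loop is
# assumed to produce `batch` in the same (state, action, reward, next_state) layout.
batch = memory.sample(batch_size)
states = np.array([each[0] for each in batch])       # s_t
actions = np.array([each[1] for each in batch])      # a_t
rewards = np.array([each[2] for each in batch])      # r_t
next_states = np.array([each[3] for each in batch])  # s_{t+1}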
def q_learning(self,
               gamma=0.9,
               alpha=0.3,
               episodes=100,
               max_steps=50,
               fps=30,
               epsilon_0=-1.0,
               plot=False):
    '''
    Q-learning with interactive rendering.
    Keys:
        q: change view (state values or q values)
        s: change speed (slow or fast)
        e: toggle exploration on/off
    '''
    if plot:
        l_curve = LearningCurve(min_y=-1.5, max_y=1.5)

    self.state_values, self.state_q_values = self.init_values()
    flag_q = False
    flag_fast = False
    flag_exit = False
    flag_explore = True
    episode = 0

    while True:  # for episode in range(episodes):
        # Exploration schedule: fixed epsilon_0 if given, otherwise exponential decay.
        if flag_explore:
            if epsilon_0 >= 0.0:
                epsilon = epsilon_0
            else:
                epsilon = np.exp(-episode / (episodes / 5))
        else:
            epsilon = 0.0

        state = random.choice(self.states)
        done = False
        explore = False
        action = ''
        utility = 0.0
        reward = 0.0

        for step in range(max_steps):  # while(True):
            # Render the grid, the agent and the current metrics.
            self.r_draw_background()
            if not flag_q:
                self.r_draw_values()
            else:
                self.r_draw_q_values()
            self.r_draw_agent(state)
            self.r_draw_reward(reward, utility, done)
            # self.r_draw_rl_metrics(f'{episode+1}/{episodes}', epsilon, action, explore)
            self.r_draw_rl_metrics(episode + 1, epsilon, action, explore)
            pygame.display.flip()

            # Handle keyboard input; tick_key also limits the frame rate.
            if flag_fast:
                key = self.tick_key(fps)
            else:
                key = self.tick_key(1)
            if key == pygame.K_q:
                flag_q = not flag_q
            elif key == pygame.K_s:
                flag_fast = not flag_fast
            elif key == pygame.K_e:
                flag_explore = not flag_explore
            elif key == pygame.K_ESCAPE:
                flag_exit = True
                break

            if done:
                break

            # Epsilon-greedy action selection.
            if np.random.uniform() < epsilon:
                explore = True
                action = random.choice(self.allowed_actions[state])
            else:
                explore = False
                action = self.policy[state]

            new_state, reward, _, done = self.step(state, action)

            # Q-learning update: Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * sample.
            if done:
                sample = reward
            else:
                sample = reward + gamma * self.max_val(
                    self.state_q_values[new_state])
            self.state_q_values[state][action] = (
                1 - alpha) * self.state_q_values[state][action] + alpha * sample

            # Keep the greedy policy and state values in sync with the Q values.
            self.policy[state], self.state_values[state] = self.key_max(
                self.state_q_values[state])

            utility += (gamma**step) * reward
            state = new_state

        if plot:
            l_curve.add_sample(episode, utility)
        if flag_exit:
            break
        episode += 1
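The update inside the step loop is the standard tabular Q-learning rule. A minimal standalone sketch of that same rule, detached from the pygame class; the names here (`q_update`, the toy state and action labels) are illustrative and not part of the original code:

# Illustrative sketch of the tabular update used above; `q` is a plain nested
# dict, not the class's state_q_values structure.
def q_update(q, state, action, reward, new_state, done, alpha=0.3, gamma=0.9):
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
    target = reward if done else reward + gamma * max(q[new_state].values())
    q[state][action] = (1 - alpha) * q[state][action] + alpha * target
    return q[state][action]

# Example: two states with two actions each, all Q values starting at zero.
q = {s: {'left': 0.0, 'right': 0.0} for s in ('s0', 's1')}
q_update(q, 's0', 'right', reward=1.0, new_state='s1', done=False)  # -> 0.3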