Example #1
                # Next states from the sampled replay batch
                next_states = np.array([each[3] for each in batch])

                # Q-values of the next states from the online network,
                # used below to build the training targets
                target_Qs = sess.run(mainQN.output,
                                     feed_dict={mainQN.inputs_: next_states})

                # Set target_Qs to 0 for states where the episode ended
                # (the buffer stores an all-zero next_state for terminal steps)
                episode_ends = (next_states == np.zeros(
                    states[0].shape)).all(axis=1)
                target_Qs[episode_ends] = (0, 0)  # one zero per action

                # Bellman targets: r + gamma * max_a Q(s', a)
                targets = rewards + gamma * np.max(target_Qs, axis=1)

                # One gradient step of the network towards the targets
                loss, _ = sess.run(
                    [mainQN.loss, mainQN.opt],
                    feed_dict={
                        mainQN.inputs_: states,
                        mainQN.targetQs_: targets,
                        mainQN.actions_: actions
                    })

        # Log the utility averaged over the last `avg_window` episodes
        utility_list.append(utility)
        if len(utility_list) == avg_window:
            curve.add_sample(['utility', 'epsilon'], episode,
                             [np.mean(utility_list), epsilon])
            utility_list = []

    # Save the trained weights and the learning-curve plot
    saver.save(
        sess, f'checkpoints/{training_name}_{gym_env_name[-2:]}/cartpole.ckpt')
    curve.save_plot(f'./curves/q_deep_{training_name}_{gym_env_name[-2:]}.png')
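The two `sess.run` calls above implement the standard one-step Q-learning target with terminal masking. Below is a minimal NumPy sketch of just that target computation, with a made-up batch of four transitions and two actions standing in for the network output:

import numpy as np

gamma = 0.99
# hypothetical batch: rewards, terminal flags and Q(s', .) from the network
rewards = np.array([1.0, 1.0, 1.0, 1.0])
episode_ends = np.array([False, False, True, False])
target_Qs = np.array([[0.5, 0.7],
                      [0.2, 0.1],
                      [0.9, 0.4],   # terminal transition, will be zeroed
                      [0.3, 0.6]])

target_Qs[episode_ends] = (0, 0)   # no future value after a terminal step
targets = rewards + gamma * np.max(target_Qs, axis=1)
print(targets)                     # [1.693 1.198 1.    1.594]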
Example #2
    def q_learning(self,
                   gamma=0.9,
                   alpha=0.3,
                   episodes=100,
                   max_steps=50,
                   fps=30,
                   epsilon_0=-1.0,
                   plot=False):
        ''' Tabular Q-learning with interactive rendering.

        Keys while running:
        q:   toggle view (state values or Q-values)
        s:   toggle speed (slow or fast)
        e:   toggle exploration on/off
        ESC: stop training

        epsilon_0 < 0 selects the exponentially decaying epsilon schedule.
        '''

        if plot:
            l_curve = LearningCurve(min_y=-1.5, max_y=1.5)

        self.state_values, self.state_q_values = self.init_values()
        flag_q = False
        flag_fast = False
        flag_exit = False
        flag_explore = True

        episode = 0
        while True:
            # Exploration rate: fixed if epsilon_0 >= 0, otherwise decaying
            # exponentially over the course of `episodes` episodes
            if flag_explore:
                if epsilon_0 >= 0.0:
                    epsilon = epsilon_0
                else:
                    epsilon = np.exp(-episode / (episodes / 5))
            else:
                epsilon = 0.0
            # Each episode starts from a random state
            state = random.choice(self.states)
            done = False
            explore = False
            action = ''
            utility = 0.0
            reward = 0.0
            for step in range(max_steps):
                # Render the grid, the current value estimates and RL metrics
                self.r_draw_background()
                if not flag_q:
                    self.r_draw_values()
                else:
                    self.r_draw_q_values()
                self.r_draw_agent(state)
                self.r_draw_reward(reward, utility, done)
                self.r_draw_rl_metrics(episode + 1, epsilon, action, explore)
                pygame.display.flip()
                if flag_fast:
                    key = self.tick_key(fps)
                else:
                    key = self.tick_key(1)
                if key == pygame.K_q:
                    flag_q = not flag_q
                elif key == pygame.K_s:
                    flag_fast = not flag_fast
                elif key == pygame.K_e:
                    flag_explore = not flag_explore
                elif key == pygame.K_ESCAPE:
                    flag_exit = True
                    break
                if done:
                    break
                # Epsilon-greedy action selection
                if np.random.uniform() < epsilon:
                    explore = True
                    action = random.choice(self.allowed_actions[state])
                else:
                    explore = False
                    action = self.policy[state]
                new_state, reward, _, done = self.step(state, action)
                # TD target: just the reward on terminal steps,
                # otherwise r + gamma * max_a Q(s', a)
                if done:
                    sample = reward
                else:
                    sample = reward + gamma * self.max_val(
                        self.state_q_values[new_state])
                # Q update with learning rate alpha
                self.state_q_values[state][action] = (
                    1 - alpha
                ) * self.state_q_values[state][action] + alpha * sample
                # Greedy policy improvement for the visited state
                self.policy[state], self.state_values[state] = self.key_max(
                    self.state_q_values[state])
                utility += (gamma**step) * reward  # discounted episode return
                state = new_state
            if plot:
                l_curve.add_sample(episode, utility)
            if flag_exit:
                break
            episode += 1
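The decaying schedule `epsilon = np.exp(-episode / (episodes / 5))` used above drops exploration by roughly five e-folds over the configured number of episodes. A small standalone sketch (the sampled episode numbers are made up) shows how fast it falls with the default `episodes=100`:

import numpy as np

episodes = 100   # default of q_learning()
for episode in (0, 10, 20, 50, 100):
    epsilon = np.exp(-episode / (episodes / 5))
    print(episode, round(epsilon, 3))
# 0 1.0 / 10 0.607 / 20 0.368 / 50 0.082 / 100 0.007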