def simple_replay_train(DQN, train_batch):
    x_stack = np.empty(0).reshape(0, DQN.input_size)
    y_stack = np.empty(0).reshape(0, DQN.output_size)

    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)

        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))

        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])

    return DQN.update(x_stack, y_stack)
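
As a rough usage sketch (not part of the original example), simple_replay_train would typically be fed minibatches sampled from a replay memory; the global discount factor dis, the deque, and the helper below are assumptions for illustration only.

import random
from collections import deque

dis = 0.9                            # assumed value for the global discount factor used above
replay_memory = deque(maxlen=50000)  # filled with (state, action, reward, next_state, done) tuples

def train_from_memory(mainDQN, batch_size=10, num_batches=50):
    # Hypothetical helper: mainDQN is assumed to expose predict() and update(),
    # exactly as simple_replay_train uses them above.
    if len(replay_memory) < batch_size:
        return None
    loss = None
    for _ in range(num_batches):
        minibatch = random.sample(replay_memory, batch_size)
        loss = simple_replay_train(mainDQN, minibatch)
    return loss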
Example #2
class Agent:
    """
    Class representing a learning agent acting in an environment.
    """
    def __init__(self,
                 buffer_size,
                 batch_size,
                 alpha,
                 gamma,
                 epsilon,
                 epsilon_min,
                 epsilon_decay,
                 lr,
                 game="CartPole-v1",
                 mean_bound=5,
                 reward_bound=495.0,
                 sync_model=1000,
                 save_model=10):
        """
        Constructor of the agent class.
            - game="CartPole-v1" : Name of the game environment
            - mean_bound=5 : Number of last acquired rewards considered for mean reward
            - reward_bound=495.0 : Mean reward threshold at which the environment counts as solved
            - sync_model=1000 : Interval for synchronizing model and target model
            - save_model=10 : Interval for saving model

            - buffer_size : Replay buffer size of the DQN model
            - batch_size : Batch size of the DQN model
            - alpha : Learning rate for Q-Learning
            - gamma : Discount factor for Q-Learning
            - epsilon : Threshold for taking a random action
            - epsilon_min : Minimal value allowed for epsilon
            - epsilon_decay : Decay rate for epsilon
            - lr : Learning rate for the DQN model
        """

        # Environment variables
        self.game = game
        self.env = gym.make(self.game)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n

        # Agent variables
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.mean_bound = mean_bound
        self.reward_bound = reward_bound

        # DQN variables
        self.lr = lr
        self.model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model.update(self.model)
        self.sync_model = sync_model
        self.save_model = save_model

        # File paths
        dirname = os.path.dirname(__file__)
        self.path_model = os.path.join(dirname, "../models/dqn.h5")
        self.path_plot = os.path.join(dirname, "../plots/dqn.png")

        # Load model, if it already exists
        try:
            self.model.load(self.path_model)
            self.target_model.update(self.model)
        except:
            print("Model does not exist! Create new model...")

    def reduce_epsilon(self):
        """
        Decays epsilon by the configured decay rate, but never below the minimal value epsilon_min.
        """

        epsilon = self.epsilon * self.epsilon_decay

        if epsilon >= self.epsilon_min:
            self.epsilon = epsilon
        else:
            self.epsilon = self.epsilon_min

    def get_action(self, state):
        """
        Returns an action for a given state, based on the current policy.
            - state : Current state of the agent
        """

        if np.random.random() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(self.model.predict(state))

        return action

    def train(self, num_episodes, report_interval):
        """
        Trains the DQN model for a given number of episodes. Report information is printed every report_interval episodes.
            - num_episodes : Number of episodes to train
            - report_interval : Interval for outputting report information of training
        """

        step = 0
        total_rewards = []

        for episode in range(1, num_episodes + 1):
            if episode % self.save_model == 0:
                self.model.save(self.path_model)

            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                step += 1

                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape((1, self.num_states))

                # Penalize the agent if the pole could not be balanced until the end of the episode
                if done and reward < 499.0:
                    reward = -100.0

                self.buffer.remember(state, action, reward, next_state, done)
                self.replay()
                self.reduce_epsilon()

                state = next_state
                total_reward += reward

                if step % self.sync_model == 0:
                    self.target_model.update(self.model)

                if done:
                    total_reward += 100.0
                    total_rewards.append(total_reward)
                    mean_reward = np.mean(total_rewards[-self.mean_bound:])

                    if episode % report_interval == 0:
                        print(f"Episode: {episode}/{num_episodes}"
                              f"\tStep: {step}"
                              f"\tMemory Size: {len(self.memory)}"
                              f"\tEpsilon: {self.epsilon : .3f}"
                              f"\tReward: {total_reward}"
                              f"\tLast 5 Mean: {mean_reward : .2f}")

                        self.plot_rewards(total_rewards)

                    if mean_reward > self.reward_bound:
                        self.model.save(self.path_model)
                        return

                    break

        self.model.save(self.path_model)

    def replay(self):
        """
        Samples training data from the replay buffer and fits the DQN model.
        """

        sample_size, states, actions, rewards, next_states, dones = self.buffer.sample()

        q_values = self.model.predict(states)
        next_q_values = self.target_model.predict(next_states)

        for i in range(sample_size):
            action = actions[i]
            done = dones[i]

            if done:
                q_target = rewards[i]
            else:
                q_target = rewards[i] + self.gamma * np.max(next_q_values[i])

            q_values[i][action] = (1 - self.alpha) * \
                q_values[i][action] + self.alpha * q_target

        self.model.fit(states, q_values)

    def play(self, num_episodes):
        """
        Renders the trained agent for a given number of episodes.
            - num_episodes : Number of episodes to render
        """

        self.epsilon = self.epsilon_min

        for episode in range(1, num_episodes + 1):
            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape((1, self.num_states))
                state = next_state
                total_reward += reward

                if done:
                    print(f"Episode: {episode}/{num_episodes}"
                          f"\tTotal Reward: {total_reward : .2f}")

                    break

    def plot_rewards(self, total_rewards):
        """
        Plots the rewards the agent has acquired during training.
            - total_rewards : Rewards the agent has gained per episode
        """

        x = np.arange(len(total_rewards))
        y = total_rewards

        slope, intercept, _, _, _ = linregress(x, y)

        plt.plot(x, y, linewidth=0.8)
        plt.plot(x, slope * x + intercept, color="red", linestyle="-.")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title("DQN-Learning")
        plt.savefig(self.path_plot)
        plt.close()  # start a fresh figure on the next call
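
A hypothetical driver for the Agent class above; the hyperparameter values are placeholders for illustration, not the original project's settings.

if __name__ == "__main__":
    # Illustrative values only; tune buffer size, batch size, alpha, gamma,
    # the epsilon schedule and the learning rate for your own runs.
    agent = Agent(buffer_size=100000,
                  batch_size=64,
                  alpha=0.2,
                  gamma=0.95,
                  epsilon=1.0,
                  epsilon_min=0.01,
                  epsilon_decay=0.999,
                  lr=0.001)
    agent.train(num_episodes=1000, report_interval=10)
    agent.play(num_episodes=5)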
                #     The -abs term encourages the agent not to move the cart;
                #     it earns a high reward only by staying near the center.
                # 2.  r2 is derived from the pole angle: the smaller the angle
                #     between the pole and the vertical (i.e. keeping the pole
                #     upright), the higher the reward.
                # Subtracting 0.5 at the end shifts the reward range to roughly [-1, 1].
                # ----------------------------------------------------
                x, x_dot, theta, theta_dot = s_
                r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
                r2 = (env.theta_threshold_radians -
                      abs(theta)) / env.theta_threshold_radians - 0.5
                r = r1 + r2

            # Store the transition and update the network
            net.store_path(s, a, r, s_)
            net.update()

            # Check whether the episode has finished (and decay epsilon)
            if finish:
                print("Episode: %d \t Total reward: %d \t Eps: %f" %
                      (i, total_reward, net.epsilon))
                reward_list.append(total_reward)
                if net.epsilon > 0.01:
                    net.episodeDecay()
                break

            # The next state becomes the current state for the next step
            s = s_
    net.save()
    plt.plot(range(len(reward_list)), reward_list, '-')
    plt.show()
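
The shaped reward used in the fragment above can be pulled out into a small helper; this is only a sketch that assumes a standard gym CartPole environment (the thresholds live on the unwrapped env).

def shaped_reward(env, observation):
    # Reward rises as the cart stays near the center (r1) and the pole stays
    # close to vertical (r2); the offsets shift the sum toward the range [-1, 1].
    x, x_dot, theta, theta_dot = observation
    base = env.unwrapped
    r1 = (base.x_threshold - abs(x)) / base.x_threshold - 0.8
    r2 = (base.theta_threshold_radians - abs(theta)) / base.theta_threshold_radians - 0.5
    return r1 + r2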
Example #4
File: agent.py  Project: KokoMind/DQN-TF
class Agent:
    """Our Wasted Agent :P """
    def __init__(self, sess, config, environment, evaluation_enviroment):
        # Get the session, config, environment, and create a replay memory
        self.sess = sess
        self.config = config
        self.environment = environment
        self.evaluation_enviroment = evaluation_enviroment

        if config.prm:
            self.memory = PrioritizedExperienceReplay(sess, config)
        else:
            self.memory = ReplayMemory(config.state_shape, config.rep_max_size)

        self.init_dirs()

        self.init_cur_epsiode()
        self.init_global_step()
        self.init_epsilon()
        self.init_summaries()

        # Initialize the DQN graph, which contains the two networks: target and Q
        self.estimator = DQN(sess, config, self.environment.n_actions)

        # To initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please Set proper mode for training or playing")

    def load(self):
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

    def save(self):
        self.saver.save(self.sess, self.checkpoint_dir,
                        self.global_step_tensor)

    def init_dirs(self):
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir,
                                        "summaries/")

    def init_cur_epsiode(self):
        """Create cur episode tensor to totally save the process of the training"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1,
                                                  trainable=False,
                                                  name='cur_epsiode')
            self.cur_epsiode_input = tf.placeholder('int32',
                                                    None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_epsiode_input)

    def init_global_step(self):
        """Create a global step variable to be a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0,
                                                  trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32',
                                                    None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_epsilon(self):
        """Create an epsilon variable"""
        with tf.variable_scope('epsilon'):
            self.epsilon_tensor = tf.Variable(self.config.initial_epsilon,
                                              trainable=False,
                                              name='epsilon')
            self.epsilon_input = tf.placeholder('float32',
                                                None,
                                                name='epsilon_input')
            self.epsilon_assign_op = self.epsilon_tensor.assign(
                self.epsilon_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = [
                'episode.total_reward', 'episode.length',
                'evaluation.total_reward', 'evaluation.length', 'epsilon'
            ]
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def init_replay_memory(self):
        # Populate the replay memory with initial experience
        print("initializing replay memory...")

        state = self.environment.reset()
        for i in itertools.count():
            action = self.take_action(state)
            next_state, reward, done = self.observe_and_save(
                state, self.environment.valid_actions[action])
            if done:
                if self.config.prm:
                    if i >= self.config.prm_init_size:
                        break
                else:
                    if i >= self.config.replay_memory_init_size:
                        break
                state = self.environment.reset()
            else:
                state = next_state
        print("finished initializing replay memory")

    def policy_fn(self, fn_type, estimator, n_actions):
        """Function that contain definitions to various number of policy functions and choose between them"""
        def epsilon_greedy(sess, observation, epsilon):
            actions = np.ones(n_actions, dtype=float) * epsilon / n_actions
            q_values = estimator.predict(np.expand_dims(observation, 0))[0]
            best_action = np.argmax(q_values)
            actions[best_action] += (1.0 - epsilon)
            return actions

        def greedy(sess, observation):
            q_values = estimator.predict(np.expand_dims(observation, 0),
                                         type="target")[0]
            best_action = np.argmax(q_values)
            return best_action

        if fn_type == 'epsilon_greedy':
            return epsilon_greedy
        elif fn_type == 'greedy':
            return greedy
        else:
            raise Exception("Please Select a proper policy function")

    def take_action(self, state):
        """Take the action based on the policy function"""
        action_probs = self.policy(self.sess, state,
                                   self.epsilon_tensor.eval(self.sess))
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe_and_save(self, state, action):
        """Function that observe the new state , reward and save it in the memory"""
        next_state, reward, done = self.environment.step(action)
        self.memory.push(state, next_state, action, reward, done)
        return next_state, reward, done

    def update_target_network(self):
        """Update Target network By copying paramter between the two networks in DQN"""
        self.estimator.update_target_network()

    def add_summary(self, summaries_dict, step):
        """Add the summaries to tensorboard"""
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in summaries_dict.items()
            })
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def train_episodic(self):
        """Train the agent in episodic techniques"""

        # Initialize the epsilon decay step, the policy function, and the replay memory
        self.epsilon_step = (
            self.config.initial_epsilon -
            self.config.final_epsilon) / self.config.exploration_steps
        self.policy = self.policy_fn(self.config.policy_fn, self.estimator,
                                     self.environment.n_actions)
        self.init_replay_memory()

        for cur_episode in range(
                self.cur_episode_tensor.eval(self.sess) + 1,
                self.config.num_episodes, 1):

            # Save the current checkpoint
            self.save()

            # Update the Cur Episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={
                    self.cur_epsiode_input:
                    self.cur_episode_tensor.eval(self.sess) + 1
                })

            # Evaluate now to see how the agent behaves
            if cur_episode % self.config.evaluate_every == 0:
                self.evaluate(cur_episode // self.config.evaluate_every)

            state = self.environment.reset()
            total_reward = 0

            # Take steps in the environment until the terminal state of the episode
            for t in itertools.count():

                # Update the Global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={
                        self.global_step_input:
                        self.global_step_tensor.eval(self.sess) + 1
                    })

                # time to update the target estimator
                if self.global_step_tensor.eval(
                        self.sess
                ) % self.config.update_target_estimator_every == 0:
                    self.update_target_network()

                # Calculate the epsilon for this time step,
                # then take an action, observe, and save the transition
                self.epsilon_assign_op.eval(
                    {
                        self.epsilon_input:
                        max(
                            self.config.final_epsilon,
                            self.epsilon_tensor.eval(self.sess) -
                            self.epsilon_step)
                    }, self.sess)
                action = self.take_action(state)
                next_state, reward, done = self.observe_and_save(
                    state, self.environment.valid_actions[action])

                # Sample a minibatch from the replay memory
                if self.config.prm:
                    indices_batch, weights_batch, state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.sample(
                    )
                else:
                    state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.get_batch(
                        self.config.batch_size)

                # Calculate the targets, then compute the loss
                q_values_next = self.estimator.predict(next_state_batch,
                                                       type="target")
                targets_batch = reward_batch + np.invert(done_batch).astype(
                    np.float32) * self.config.discount_factor * np.amax(
                        q_values_next, axis=1)

                if self.config.prm:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch, weights_batch)
                else:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch)

                total_reward += reward

                if done:  # If terminal state, exit the episode
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t,
                        'epsilon': self.epsilon_tensor.eval(self.sess)
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break

                state = next_state

        print("Training Finished")

    def train_continous(self):
        # TODO implement on global step only
        pass

    def play(self, n_episode=10):
        """Function that play greedily on the policy learnt"""
        # Play Greedily
        self.policy = self.policy_fn('greedy', self.estimator,
                                     self.environment.n_actions)

        for cur_episode in range(n_episode):

            state = self.environment.reset()
            total_reward = 0

            for t in itertools.count():

                best_action = self.policy(self.sess, state)
                next_state, reward, done = self.environment.step(
                    self.environment.valid_actions[best_action])

                total_reward += reward

                if done:
                    print("Total Reward in Epsiode " + str(cur_episode) +
                          " = " + str(total_reward))
                    print("Total Length in Epsiode " + str(cur_episode) +
                          " = " + str(t))
                    break

                state = next_state

    def evaluate(self, local_step):

        print('evaluation #{0}'.format(local_step))

        policy = self.policy_fn('greedy', self.estimator,
                                self.evaluation_enviroment.n_actions)

        for cur_episode in range(self.config.evaluation_episodes):

            state = self.evaluation_enviroment.reset()
            total_reward = 0

            for t in itertools.count():

                best_action = policy(self.sess, state)
                next_state, reward, done = self.evaluation_enviroment.step(
                    self.evaluation_enviroment.valid_actions[best_action])

                total_reward += reward

                if done:
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'evaluation.total_reward': total_reward,
                        'evaluation.length': t
                    }
                    self.add_summary(summaries_dict,
                                     local_step * 5 + cur_episode)
                    break

                state = next_state

        print('Finished evaluation #{0}'.format(local_step))
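
For completeness, a hypothetical entry point for this agent could look roughly like the sketch below; the config object and the environment factory are assumed stand-ins for whatever KokoMind/DQN-TF actually provides, and only attributes referenced in the class above are relied on.

import tensorflow as tf  # assumes TensorFlow 1.x, matching the tf.Session/tf.placeholder API above

def main(config, make_env):
    # make_env is assumed to build an environment exposing reset(), step(),
    # valid_actions and n_actions, as used by the Agent class above.
    env = make_env(config)
    eval_env = make_env(config)
    with tf.Session() as sess:
        agent = Agent(sess, config, env, eval_env)
        if config.is_train:
            agent.train_episodic()
        elif config.is_play:
            agent.play(n_episode=10)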