import numpy as np
import torch


class Agent:
    """PPO agent with GAE for continuous-action Gym environments.

    Relies on the Actor and Critic classes defined elsewhere in this code.
    """

    def __init__(self, env, gamma, gae_lambda, batch_size, lr_rate,
                 ratio_clipping, epochs):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.batch_size = batch_size
        self.epochs = epochs
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], ratio_clipping)
        self.critic = Critic(self.state_dim, lr_rate[1])
        self.save_epi_reward = []

    def gae_target(self, rewards, v_values, next_v_value, done):
        """Compute GAE advantages and the corresponding n-step TD targets."""
        n_step_targets = torch.zeros_like(rewards)
        gae = torch.zeros_like(rewards)
        gae_cumulative = 0.
        forward_val = 0.
        if not done:
            # Bootstrap from the value of the state following the batch
            forward_val = next_v_value
        for k in reversed(range(0, len(rewards))):
            delta = rewards[k] + self.gamma * forward_val - v_values[k]
            gae_cumulative = self.gamma * self.gae_lambda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]
        return gae, n_step_targets

    def unpack_batch(self, batch):
        """Concatenate a list of (1, dim) tensors into one (batch, dim) tensor."""
        return torch.cat(batch, dim=0)

    def train(self, max_episode_num, save_path, save_names):
        batch_state, batch_action, batch_reward = [], [], []
        batch_log_old_policy_pdf = []

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            while not done:
                # self.env.render()
                mu_old, std_old, action = self.actor.get_policy_action(state)
                action = np.array([action.item()])
                mu_old = np.array([mu_old.item()])
                std_old = np.array([std_old.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                # Log-density of the action under the old Gaussian policy,
                # needed later for the PPO probability ratio
                var_old = std_old**2
                log_old_policy_pdf = -0.5 * (action - mu_old)**2 / var_old \
                    - 0.5 * np.log(var_old * 2 * np.pi)
                log_old_policy_pdf = np.sum(log_old_policy_pdf)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                log_old_policy_pdf = torch.FloatTensor([log_old_policy_pdf])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                log_old_policy_pdf = log_old_policy_pdf.view(1, 1)

                batch_state.append(state)
                batch_action.append(action)
                # Rescale the reward (Pendulum rewards lie roughly in [-16, 0])
                batch_reward.append((reward + 8) / 8)
                batch_log_old_policy_pdf.append(log_old_policy_pdf)

                # Keep collecting transitions until a full batch is available
                if len(batch_state) < self.batch_size:
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                rewards = self.unpack_batch(batch_reward)
                log_old_policy_pdfs = self.unpack_batch(batch_log_old_policy_pdf)
                batch_state, batch_action, batch_reward = [], [], []
                batch_log_old_policy_pdf = []

                v_values = self.critic.get_value(states)
                next_v_value = self.critic.get_value(next_state)
                gaes, y_i = self.gae_target(rewards, v_values, next_v_value, done)

                # Several epochs of clipped-surrogate updates on the same batch
                for _ in range(self.epochs):
                    self.actor.update(states, actions, gaes, log_old_policy_pdfs)
                    self.critic.update(states, y_i)

                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())
            # The [-20:] slice also handles the first 20 episodes correctly
            print('Episode:', episode + 1, 'Time:', time,
                  'Reward (avg of recent 20):',
                  np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
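A minimal driver for the PyTorch PPO agent above; this sketch is not part of the original code. The environment name, hyperparameter values, and file names are illustrative assumptions; only the Agent constructor and train() signature come from the listing. Pendulum-v0 is assumed because the (reward + 8) / 8 rescaling and the old-style reset()/step() API fit that task.

import gym

env = gym.make('Pendulum-v0')  # old Gym API: reset() -> state, step() -> 4-tuple
agent = Agent(env,
              gamma=0.95,             # discount factor
              gae_lambda=0.9,         # GAE smoothing parameter
              batch_size=32,          # transitions collected per update
              lr_rate=(1e-4, 1e-3),   # (actor, critic) learning rates
              ratio_clipping=0.2,     # PPO clip range
              epochs=10)              # update epochs per batch
agent.train(max_episode_num=1000,
            save_path='./save_weights/',
            save_names=('ppo_actor.pt', 'ppo_critic.pt'))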
import itertools
import os

import numpy as np
import tensorflow as tf  # TensorFlow 1.x graph-mode API


class Agent:

    def __init__(self, sess, config, environment):
        # Store the session, config, and environment
        self.sess = sess
        self.config = config
        self.environment = environment

        self.init_dirs()
        self.init_cur_episode()
        self.init_global_step()
        self.init_summaries()

        # Build the graph, which contains two networks: Actor and Critic
        self.actor = Actor(sess, self.environment.n_actions,
                           self.environment.state_shape, config)
        self.critic = Critic(sess, self.environment.state_shape, config)

        # Initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please set a proper mode for training or playing")

    def load(self):
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

    def save(self):
        self.saver.save(self.sess, self.checkpoint_dir, self.global_step_tensor)

    def init_dirs(self):
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir,
                                        "summaries/")

    def init_cur_episode(self):
        """Create a current-episode tensor so training progress survives restarts"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1, trainable=False,
                                                  name='cur_episode')
            self.cur_episode_input = tf.placeholder('int32', None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_episode_input)

    def init_global_step(self):
        """Create a global step variable as a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0, trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32', None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = [
                'episode.total_reward', 'episode.length',
                'evaluation.total_reward', 'evaluation.length', 'epsilon'
            ]
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def add_summary(self, summaries_dict, step):
        """Write the given scalar summaries to TensorBoard"""
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in summaries_dict.items()
            })
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def take_action(self, state):
        """Sample an action from the actor's policy probabilities"""
        action_probs = self.actor.predict(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe(self, action):
        """Step the environment and return the new state, reward, and done flag"""
        return self.environment.step(action)

    def train_episodic(self):
        """Train the agent episode by episode"""
        for cur_episode in range(
                self.cur_episode_tensor.eval(self.sess) + 1,
                self.config.num_episodes, 1):

            # Save the current checkpoint
            self.save()

            # Update the current-episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={
                    self.cur_episode_input:
                    self.cur_episode_tensor.eval(self.sess) + 1
                })

            state = self.environment.reset()
            total_reward = 0

            # Take steps in the environment until the terminal state of the episode
            for t in itertools.count():
                # Update the global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={
                        self.global_step_input:
                        self.global_step_tensor.eval(self.sess) + 1
                    })

                # Take an action
                action = self.take_action(state)
                next_state, reward, done = self.observe(
                    self.environment.valid_actions[action])

                # Calculate the TD target and TD error
                value_next = self.critic.predict(next_state)
                td_target = reward + self.config.discount_factor * value_next
                td_error = td_target - self.critic.predict(state)

                # Update the critic towards the TD target
                self.critic.update(state, td_target)

                # Update the actor, using the TD error as the advantage estimate
                # TODO: research the best advantage estimate
                self.actor.update(state, action, td_error)

                total_reward += reward

                if done:
                    # Terminal state: log episode summaries and exit
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break

                state = next_state

        print("Training Finished")
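A sketch of how this TensorFlow 1.x agent might be driven. Config and DiscreteEnv below are hypothetical stand-ins, not part of the original code: only the attributes the Agent actually reads (experiment_dir, the mode flags, num_episodes, discount_factor, n_actions, state_shape, valid_actions, reset(), step()) are assumed, and their values are illustrative.

import tensorflow as tf

class Config:
    # Hypothetical config; attribute names match what Agent reads above
    experiment_dir = './experiments/actor_critic'
    is_train = True
    cont_training = False
    is_play = False
    num_episodes = 5000
    discount_factor = 0.99

with tf.Session() as sess:
    environment = DiscreteEnv()  # hypothetical wrapper exposing n_actions,
                                 # state_shape, valid_actions, reset(), step()
    agent = Agent(sess, Config(), environment)
    agent.train_episodic()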