Example #1
import numpy as np
import tensorflow as tf

# Config, Scaler, Actor, Critic, TrainingHook and EvalHook come from the
# surrounding project and are not imported in this snippet.


class Agent:
    """PPO-style actor-critic agent exposed through a tf.estimator model_fn;
    the rollout/update loop itself is driven by TrainingHook and EvalHook."""

    def __init__(self, env):
        self.env = env
        self.sess = None
        self._config_initialize()

    def _config_initialize(self):
        """initialize env config"""
        Config.data.action_dim = self.env.action_space.shape[0]
        Config.data.action_bound = self.env.action_space.high
        Config.data.state_dim = self.env.observation_space.shape[0]

        self.scaler = Scaler(Config.data.state_dim)

    def model_fn(self, mode, features, labels):
        """Estimator model_fn; `features` and `labels` are unused because the
        rollouts and updates are driven by the session hooks."""
        self.mode = mode
        self.loss, self.train_op = None, None
        self.training_hooks, self.evaluation_hooks = None, None
        self._build_graph()

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=self.loss,
            train_op=self.train_op,
            training_hooks=self.training_hooks,
            evaluation_hooks=self.evaluation_hooks)

    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()

        if self.mode == tf.estimator.ModeKeys.TRAIN:
            # The "loss" is just a placeholder for the average episode reward
            # (fed at run time), and train_op only advances the global step;
            # the actual actor/critic updates are run by TrainingHook.
            ave_ep_reward = tf.placeholder(tf.float32, name='ave_ep_reward')
            tf.summary.scalar('ave_ep_reward', ave_ep_reward)
            self.loss = ave_ep_reward
            global_step = tf.train.get_global_step()
            self.train_op = tf.assign_add(global_step, 1)
            self.training_hooks = [TrainingHook(self)]
        else:
            # Evaluation needs a scalar loss; the rollout itself runs in EvalHook.
            self.loss = tf.constant(1)
            self.evaluation_hooks = [EvalHook(self)]

    def init_scaler(self, init_episode=5):
        """Warm up the observation scaler with a few rollouts before training."""
        for e in range(init_episode):
            self.run_episode()

    def choose_action(self, observation):
        feed_dict = {self.actor.states: [observation]}
        action = self.sess.run(self.actor.sample, feed_dict)
        return action

    def eval(self, animate=False):
        observation = self.env.reset()
        ep_reward = 0
        count = 0
        done = False
        while not done:
            if animate:
                self.env.render()
            action = self.choose_action(self.scaler.normalize(observation))
            next_observation, reward, done, info = self.env.step(action)
            ep_reward += reward
            observation = next_observation

            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break

        return ep_reward

    def run_episode(self, animate=False):
        observation = self.env.reset()
        states, actions, rewards, unscaled_states = [], [], [], []
        done = False
        count = 0

        while not done:
            if animate:
                self.env.render()
            unscaled_states.append(observation)
            observation = self.scaler.normalize(observation)
            states.append(observation)
            action = self.choose_action(observation)
            actions.append(action)
            next_observation, reward, done, info = self.env.step(action)
            rewards.append(reward)
            observation = next_observation

            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break

        self.scaler.update(np.array(unscaled_states))

        return states, actions, rewards, next_observation, done

    def cal_target_v(self, done, next_observation, rewards):
        """Compute discounted return targets, bootstrapping from the critic's
        value of `next_observation` when the episode did not terminate."""
        if done:
            next_value = 0
        else:
            next_value = self.sess.run(
                self.critic.value,
                {self.critic.states: [self.scaler.normalize(next_observation)]})[0, 0]
        target_v = []
        for reward in rewards[::-1]:
            next_value = reward + Config.train.reward_decay * next_value
            target_v.append([next_value])
        target_v.reverse()
        return target_v
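
    # For intuition: with an assumed reward_decay of 0.9, rewards [1.0, 1.0, 1.0]
    # and a bootstrap value of 0, the recursion above produces
    # target_v = [[2.71], [1.9], [1.0]]  (1 + 0.9*1.9, 1 + 0.9*1.0, 1 + 0.9*0).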

    def update_actor(self, states, actions, target_v):
        """Update the policy: advantages are the return targets minus the
        critic's value estimates; the old policy's mean/stddev are recorded so
        the surrogate ratio and KL terms use a fixed reference policy."""
        feed_dict = {self.critic.states: states, self.actor.states: states}
        value, old_mean, old_stdd = self.sess.run(
            [self.critic.value, self.actor.mean, self.actor.stdd], feed_dict)
        advantage = np.array(target_v) - value
        feed_dict = {
            self.actor.states: states,
            self.actor.advantage: advantage,
            self.actor.old_mean: old_mean,
            self.actor.old_stdd: old_stdd,
            self.actor.actions: actions
        }
        if Config.train.surrogate_clip:
            # Clipped-surrogate variant: simply run several update epochs.
            for e in range(Config.train.actor_train_episode):
                self.sess.run(self.actor.train_op, feed_dict)
        else:
            # Adaptive KL-penalty variant: stop early if the policy moves too
            # far, then adjust the penalty coefficient for the next update.
            for e in range(Config.train.actor_train_episode):
                _, kl = self.sess.run([self.actor.train_op, self.actor.kl],
                                      feed_dict)
                if kl > Config.train.kl_target * 4:
                    break

            if kl > Config.train.kl_target * Config.train.kl_target_beta:
                Config.train.kl_loss_lam *= Config.train.kl_lam_alpha
            elif kl < Config.train.kl_target / Config.train.kl_target_beta:
                Config.train.kl_loss_lam /= Config.train.kl_lam_alpha

    def update_critic(self, states, target_v):
        feed_dict = {
            self.critic.states: states,
            self.critic.target_v: target_v
        }
        for e in range(Config.train.critic_train_episode):
            self.sess.run(self.critic.train_op, feed_dict)
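
A minimal wiring sketch for the class above. The environment id, model_dir, the
dummy input_fn and the step count below are assumptions for illustration, not
the original project's entry point:

import gym
import tensorflow as tf

env = gym.make('Pendulum-v0')            # assumed continuous-control environment
agent = Agent(env)                       # the project's Config is assumed to be loaded already

def dummy_input_fn():
    # The rollouts are driven by the session hooks, so the input pipeline only
    # needs to yield something for the Estimator to iterate over.
    return tf.constant(0.0), tf.constant(0.0)

estimator = tf.estimator.Estimator(model_fn=agent.model_fn, model_dir='logs/agent')
estimator.train(input_fn=dummy_input_fn, max_steps=1000)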
Example #2
import numpy as np
import tensorflow as tf

# Config, Scaler, Actor and Critic come from the surrounding project and are
# not imported in this snippet.


class Agent:
    """A3C-style agent: the 'chief' holds the shared parameters; each 'worker'
    pushes its gradients to the chief and pulls the updated parameters back."""

    def __init__(self, env, name, chief=None):
        assert name == 'chief' or 'worker' in name
        if 'worker' in name:
            assert chief is not None
            self.chief = chief
        else:
            self.scaler = Scaler(Config.data.state_dim)
        self.name = name
        self.env = env
        self.sess = None
        self.coord = None

        with tf.variable_scope(name):
            self._build_graph()

    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()
        if 'worker' in self.name:
            self._build_update_op()

    def _build_update_op(self):
        # Named op so a worker can advance the shared global step by fetching
        # '<worker name>/global_step_add:0' in update_chief.
        global_step = tf.train.get_global_step()
        tf.assign_add(global_step, 1, name='global_step_add')

        with tf.variable_scope('sync'):
            with tf.variable_scope('pull'):  # copy the chief's parameters into this worker
                pull_a_params_op = [
                    actor_param.assign(chief_param)
                    for actor_param, chief_param in zip(
                        self.actor.params, self.chief.actor.params)
                ]
                pull_c_params_op = [
                    critic_param.assign(chief_param)
                    for critic_param, chief_param in zip(
                        self.critic.params, self.chief.critic.params)
                ]
                self.pull_op = tf.group(pull_a_params_op + pull_c_params_op)

            with tf.variable_scope('push'):  # apply this worker's gradients to the chief's parameters
                update_a_op = self.chief.actor.optimizer.apply_gradients(
                    zip(self.actor.grads, self.chief.actor.params))
                update_c_op = self.chief.critic.optimizer.apply_gradients(
                    zip(self.critic.grads, self.chief.critic.params))
                self.update_op = tf.group([update_a_op, update_c_op])

    def init_scaler(self, init_episode=5):
        """Warm up the chief's observation scaler with a few rollouts."""
        for e in range(init_episode):
            observation = self.env.reset()
            states = []
            done = False
            count = 0
            while not done:
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                observation = next_observation

                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
            self.scaler.update(np.array(states))

    def update_chief(self, states, actions, target_v):
        """Apply this worker's gradients to the chief's parameters and advance
        the shared global step."""
        feed_dict = {self.critic.states: states}
        value = self.sess.run(self.critic.value, feed_dict)
        td_error = np.array(target_v) - value
        feed_dict = {
            self.critic.states: states,
            self.critic.target_v: target_v,
            self.actor.states: states,
            self.actor.actions: actions,
            self.actor.td_error: td_error
        }
        self.sess.run([
            self.critic.loss, self.update_op, self.name + '/global_step_add:0'
        ], feed_dict)

    def pull_params(self):
        self.sess.run(self.pull_op)

    def cal_target_v(self, done, next_observation, rewards):
        """Compute discounted return targets, bootstrapping from the critic's
        value of `next_observation` when the episode did not terminate."""
        if done:
            next_value = 0
        else:
            next_value = self.sess.run(
                self.critic.value,
                {self.critic.states: [self.chief.scaler.normalize(next_observation)]})[0, 0]
        target_v = []
        for reward in rewards[::-1]:
            next_value = reward + Config.train.reward_decay * next_value
            target_v.append([next_value])
        target_v.reverse()
        return target_v

    def choose_action(self, observation):
        if Config.data.action_type == 'discrete':
            policy = self.sess.run(self.actor.policy,
                                   {self.actor.states: [observation]})[0]
            action = np.random.choice(range(Config.data.action_num), p=policy)
        else:
            action = self.sess.run(self.actor.sample,
                                   {self.actor.states: [observation]})
        return action

    def eval(self, animate=False):
        assert self.name == 'chief'
        observation = self.env.reset()
        ep_reward = 0
        count = 0
        done = False
        while not done:
            if animate:
                self.env.render()
            action = self.choose_action(self.scaler.normalize(observation))
            next_observation, reward, done, info = self.env.step(action)
            ep_reward += reward
            observation = next_observation

            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break
        return ep_reward

    def work(self):
        """Worker loop: collect transitions and, every Config.train.update_n_iter
        steps or at episode end, push an update to the chief and re-sync."""
        total_step = 0
        states, actions, rewards, unscaled_states = [], [], [], []
        self.pull_params()

        while not self.coord.should_stop():
            observation = self.env.reset()
            ep_reward = 0
            done = False
            count = 0
            while not done:
                unscaled_states.append(observation)
                observation = self.chief.scaler.normalize(observation)
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                total_step += 1
                ep_reward += reward

                actions.append(action)
                rewards.append(reward)
                if total_step % Config.train.update_n_iter == 0 or done:
                    target_v = self.cal_target_v(done, next_observation,
                                                 rewards)
                    self.update_chief(states, actions, target_v)
                    self.chief.scaler.update(np.array(unscaled_states))
                    states, actions, rewards, unscaled_states = [], [], [], []
                    self.pull_params()

                observation = next_observation

                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
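
A minimal launch sketch for the chief/worker setup above (the environment id,
worker count and threading layout are assumptions; the original project's
runner, hooks and Config loading are not shown):

import threading

import gym
import tensorflow as tf

N_WORKERS = 4
# The project's Config (Config.data.*, Config.train.*) is assumed to be
# populated before the agents are built.
envs = [gym.make('CartPole-v0') for _ in range(N_WORKERS + 1)]

tf.train.get_or_create_global_step()
chief = Agent(envs[0], 'chief')
workers = [Agent(envs[i + 1], 'worker_%d' % i, chief=chief) for i in range(N_WORKERS)]

sess = tf.Session()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()

chief.sess = sess
chief.init_scaler()

threads = []
for worker in workers:
    worker.sess, worker.coord = sess, coord
    thread = threading.Thread(target=worker.work)
    thread.start()
    threads.append(thread)

# Something (a step limit, a hook, signal handling) must eventually call
# coord.request_stop(); otherwise the workers loop forever.
coord.join(threads)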