def load_low_level_policy(policy_path=None, name=None):
    with tf_utils.get_default_session().as_default():
        with tf.variable_scope(name, reuse=False):
            snapshot = joblib.load(policy_path)

    policy = snapshot["policy"]
    return policy
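A minimal usage sketch for the loader above, assuming a default TensorFlow session already exists; the checkpoint path, scope name, and observation size are placeholders, and `get_action` follows the (action, info) convention used by the policies further down this page.

import numpy as np

# Hypothetical call: path, scope name, and observation size are placeholders.
low_level_policy = load_low_level_policy(
    policy_path="data/low_level_policy.pkl",
    name="low_level_policy",
)

obs = np.zeros(11)  # must match the observation dimension the policy was trained on
action, _ = low_level_policy.get_action(obs)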
Example #2
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, component weights, and covariances.
        """

        feeds = {self._observations_ph: batch['observations']}
        if self.todropoutpi:
            feeds[self.dropoutpi_placeholder] = 1.0
        if self.batchnormpi:
            feeds[self.isbnpitrainmode] = False

        sess = tf_utils.get_default_session()
        mu, log_sig, log_pi, reg_loss_t = sess.run((
            self.distribution.mu_t,
            self.distribution.log_sig_t,
            self.distribution.log_p_t,
            self.distribution._reg_loss_t,
        ), feeds)

        logger.record_tabular('policy-mus-mean', np.mean(mu))
        logger.record_tabular('policy-mus-min', np.min(mu))
        logger.record_tabular('policy-mus-max', np.max(mu))
        logger.record_tabular('policy-mus-std', np.std(mu))
        logger.record_tabular('log-sigs-mean', np.mean(log_sig))
        logger.record_tabular('log-sigs-min', np.min(log_sig))
        logger.record_tabular('log-sigs-max', np.max(log_sig))
        logger.record_tabular('log-sigs-std', np.std(log_sig))
        logger.record_tabular('log-pi-mean', np.mean(log_pi))
        logger.record_tabular('log-pi-max', np.max(log_pi))
        logger.record_tabular('log-pi-min', np.min(log_pi))
        logger.record_tabular('log-pi-std', np.std(log_pi))
        logger.record_tabular('mu-sig-output-reg', reg_loss_t)
Example #3
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, component weights, and covariances.
        """

        feeds = {self._observations_ph: batch['observations']}
        sess = tf_utils.get_default_session()
        mus, log_sigs, log_ws = sess.run(
            (
                self.distribution.mus_t,
                self.distribution.log_sigs_t,
                self.distribution.log_ws_t,
            ),
            feeds
        )

        logger.record_tabular('gmm-mus-mean', np.mean(mus))
        logger.record_tabular('gmm-mus-min', np.min(mus))
        logger.record_tabular('gmm-mus-max', np.max(mus))
        logger.record_tabular('gmm-mus-std', np.std(mus))
        logger.record_tabular('gmm-log-w-mean', np.mean(log_ws))
        logger.record_tabular('gmm-log-w-min', np.min(log_ws))
        logger.record_tabular('gmm-log-w-max', np.max(log_ws))
        logger.record_tabular('gmm-log-w-std', np.std(log_ws))
        logger.record_tabular('gmm-log-sigs-mean', np.mean(log_sigs))
        logger.record_tabular('gmm-log-sigs-min', np.min(log_sigs))
        logger.record_tabular('gmm-log-sigs-max', np.max(log_sigs))
        logger.record_tabular('gmm-log-sigs-std', np.std(log_sigs))
    def pis_for(self, obs):
        feeds = {self._observations_ph: obs}
        sess = tf_utils.get_default_session()
        x_t = sess.run(
            (
                self.distribution.x_t,
            ),
            feeds
        )
        return x_t

    def log_pis_for(self, obs):
        feeds = {self._observations_ph: obs}
        sess = tf_utils.get_default_session()
        log_pi = sess.run(
            (
                self.distribution.log_p_t,
            ),
            feeds
        )
        return log_pi
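Note that both helpers above pass a one-element tuple of fetches to `sess.run`, so each returns a one-element tuple rather than a bare array; a caller would unpack accordingly. A hypothetical sketch (`policy` and `observations` are placeholders):

# Because the fetches are wrapped in a 1-tuple, the results are 1-tuples too.
(x_t,) = policy.pis_for(observations)
(log_pi,) = policy.log_pis_for(observations)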
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, component weights, and covariances.
        """

        feeds = {self._observations_ph: batch['observations']}
        sess = tf_utils.get_default_session()
        probs = sess.run(self.distribution.p_all, feeds)

        logger.record_tabular('policy-prob-sum', np.mean(np.sum(probs, 1)))
Example #7
    def __init__(
            self,
            sampler,
            n_epochs=1000,
            n_train_repeat=1,
            n_initial_exploration_steps=10000,
            epoch_length=1000,
            eval_n_episodes=10,
            eval_deterministic=True,
            eval_render=False,
            control_interval=1,
            expert_path="dataset/hopper.npz",
            max_bc_iter=int(1e5),
    ):
        """
        Args:
            n_epochs (`int`): Number of epochs to run the training for.
            n_train_repeat (`int`): Number of times to repeat the training
                for a single time step.
            n_initial_exploration_steps (`int`): Number of initial steps
                during which actions are drawn from a separate exploration
                policy.
            epoch_length (`int`): Epoch length.
            eval_n_episodes (`int`): Number of rollouts to evaluate.
            eval_deterministic (`bool`): Whether or not to run the policy in
                deterministic mode when evaluating the policy.
            eval_render (`bool`): Whether or not to render the evaluation
                environment.
        """
        self.sampler = sampler

        self._n_epochs = int(n_epochs)
        self._n_train_repeat = n_train_repeat
        self._epoch_length = epoch_length
        self._n_initial_exploration_steps = n_initial_exploration_steps
        self._control_interval = control_interval

        self._eval_n_episodes = eval_n_episodes
        self._eval_deterministic = eval_deterministic
        self._eval_render = eval_render

        self._expert_path = expert_path
        self._max_bc_iter = max_bc_iter

        self._sess = tf_utils.get_default_session()

        self._env = None
        self._policy = None
        self._pool = None
Example #8
File: base.py Project: vlad17/sac
    def __init__(
        self,
        batch_size=64,
        n_epochs=1000,
        n_train_repeat=1,
        epoch_length=1000,
        min_pool_size=10000,
        max_path_length=1000,
        eval_n_episodes=10,
        eval_deterministic=True,
        eval_render=False,
    ):
        """
        Args:
            batch_size (`int`): Size of the sample batch to be used
                for training.
            n_epochs (`int`): Number of epochs to run the training for.
            n_train_repeat (`int`): Number of times to repeat the training
                for a single time step.
            epoch_length (`int`): Epoch length.
            min_pool_size (`int`): Minimum size of the sample pool before
                running training.
            max_path_length (`int`): Number of timesteps before resetting
                environment and policy, and the number of paths used for
                evaluation rollout.
            eval_n_episodes (`int`): Number of rollouts to evaluate.
            eval_deterministic (`bool`): Whether or not to run the policy in
                deterministic mode when evaluating the policy.
            eval_render (`bool`): Whether or not to render the evaluation
                environment.
        """
        self._batch_size = batch_size
        self._n_epochs = n_epochs
        self._n_train_repeat = n_train_repeat
        self._epoch_length = epoch_length
        self._min_pool_size = min_pool_size
        self._max_path_length = max_path_length

        self._eval_n_episodes = eval_n_episodes
        self._eval_deterministic = eval_deterministic
        self._eval_render = eval_render

        self._sess = tf_utils.get_default_session()

        self._env = None
        self._policy = None
        self._pool = None
Example #9
File: base.py Project: rcorona/sac
    def __init__(self,
                 sampler,
                 n_epochs=1000,
                 n_train_repeat=1,
                 n_initial_exploration_steps=10000,
                 epoch_length=1000,
                 eval_n_episodes=10,
                 eval_deterministic=True,
                 eval_render=False,
                 control_interval=1,
                 gpu_fraction=1.0):
        """
        Args:
            n_epochs (`int`): Number of epochs to run the training for.
            n_train_repeat (`int`): Number of times to repeat the training
                for a single time step.
            n_initial_exploration_steps (`int`): Number of initial steps
                during which actions are drawn from a separate exploration
                policy.
            epoch_length (`int`): Epoch length.
            eval_n_episodes (`int`): Number of rollouts to evaluate.
            eval_deterministic (`bool`): Whether or not to run the policy in
                deterministic mode when evaluating the policy.
            eval_render (`bool`): Whether or not to render the evaluation
                environment.
        """
        self.sampler = sampler

        self._n_epochs = int(n_epochs)
        self._n_train_repeat = n_train_repeat
        self._epoch_length = epoch_length
        self._n_initial_exploration_steps = n_initial_exploration_steps
        self._control_interval = control_interval

        self._eval_n_episodes = eval_n_episodes
        self._eval_deterministic = eval_deterministic
        self._eval_render = eval_render

        # Hack to get GPU fraction for parallelization.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        config = tf.ConfigProto(gpu_options=gpu_options)
        self._sess = tf_utils.get_default_session(config=config)

        self._env = None
        self._policy = None
        self._pool = None
Example #10
    def __init__(
        self,
        sampler,
        n_epochs=1000,
        n_train_repeat=1,
        epoch_length=2000,
        eval_n_episodes=10,
        eval_n_frequency=1,
        eval_deterministic=True,
        eval_render=False,
        control_interval=1,
    ):
        """
        Args:
            n_epochs (`int`): Number of epochs to run the training for.
            n_train_repeat (`int`): Number of times to repeat the training
                for a single time step.
            epoch_length (`int`): Epoch length.
            eval_n_episodes (`int`): Number of rollouts to evaluate.
            eval_deterministic (`bool`): Whether or not to run the policy in
                deterministic mode when evaluating the policy.
            eval_render (`bool`): Whether or not to render the evaluation
                environment.
        """
        self.sampler = sampler

        self._n_epochs = n_epochs
        self._n_train_repeat = n_train_repeat
        self._epoch_length = epoch_length
        self._control_interval = control_interval

        self._eval_n_episodes = eval_n_episodes
        self._eval_n_frequency = eval_n_frequency
        self._eval_deterministic = eval_deterministic
        self._eval_render = eval_render

        self._sess = tf_utils.get_default_session()

        self._env = None
        self._policy = None
        self._pool = None

        self.log_writer = None
Example #11
    def log_diagnostics(self, iteration, batch):
        """Record diagnostic information to the logger.

        Records the mean, min, max, and standard deviation of the GMM
        means, component weights, and covariances.
        """

        sess = tf_utils.get_default_session()
        feed = {
            self._observations_ph: batch["observations"],
            self.sub_level_actions: batch["sub_level_actions"],
            self.sub_level_entropies: batch["sub_level_probs"]
        }
        log_pi = sess.run(self.log_pi, feed)

        logger.record_tabular('log-pi-mean', np.mean(log_pi))
        logger.record_tabular('log-pi-max', np.max(log_pi))
        logger.record_tabular('log-pi-min', np.min(log_pi))
        logger.record_tabular('log-pi-std', np.std(log_pi))
Example #12
    def eval(self, *inputs):
        feeds = {pl: val for pl, val in zip(self._input_pls, inputs)}

        return tf_utils.get_default_session().run(self._output_t, feeds)
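A hedged sketch of calling `eval`: inputs are fed positionally in the order of `self._input_pls` and evaluated in the default session. Here `qf` stands for a Q-function built with observation and action placeholders, and the batch shapes are placeholders.

import numpy as np

observations = np.zeros((32, 11))  # placeholder batch of observations
actions = np.zeros((32, 3))        # placeholder batch of actions

# Feeds the two arrays to the first two input placeholders and returns the
# evaluated output tensor as a NumPy array.
q_values = qf.eval(observations, actions)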
Example #13
    def _train(self, env, policy, pool):
        """When training our policy expects an augmented observation."""
        self._init_training(env, policy, pool)

        with self._sess.as_default():
            env._wrapped_env.env.initialize(seed_task=SEED_TASK)
            observation = env.reset()
            policy.reset()
            log_p_z_episode = []  # Store log_p_z for this episode
            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            self.prev_n_episodes = 0

            if self._learn_p_z:
                log_p_z_list = [
                    deque(maxlen=self._max_path_length)
                    for _ in range(self._num_skills)
                ]

            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):

                path_length_list = []
                z = self._sample_z()
                aug_obs = utils.concat_obs_z(observation, z, self._num_skills)

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(aug_obs)

                    if self._learn_p_z:
                        (obs, _) = utils.split_aug_obs(aug_obs,
                                                       self._num_skills)
                        feed_dict = {
                            self._discriminator._obs_pl: obs[None],
                            self._discriminator._action_pl: action[None]
                        }
                        logits = tf_utils.get_default_session().run(
                            self._discriminator._output_t, feed_dict)[0]
                        log_p_z = np.log(utils._softmax(logits)[z])
                        if self._learn_p_z:
                            log_p_z_list[z].append(log_p_z)

                    next_ob, reward, terminal, info = env.step(action)
                    aug_next_ob = utils.concat_obs_z(next_ob, z,
                                                     self._num_skills)
                    path_length += 1
                    path_return += reward

                    self._pool.add_sample(
                        aug_obs,
                        action,
                        reward,
                        terminal,
                        aug_next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        path_length_list.append(path_length)

                        # print("\n===RESET", epoch, n_episodes, "===", self._epoch_length, path_length, "===",
                        #     # env._wrapped_env.env.nstep_internal,
                        #     datetime.datetime.now())

                        env._wrapped_env.env.initialize(seed_task=SEED_TASK)
                        observation = env.reset()
                        policy.reset()
                        log_p_z_episode = []
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                        # Log progress every 10 epochs (at episode boundaries).
                        if not epoch % 10:
                            logger.log("Epoch: {:4} | Episodes: {}".format(
                                epoch, n_episodes),
                                       with_prefix=False)

                        if not n_episodes % self.eval_freq or \
                               n_episodes >= EPISODE_LIMIT or \
                               epoch >= self._n_epochs:
                            # is_final = epoch >= self._n_epochs \
                            #            or n_episodes >= EPISODE_LIMIT
                            self.sample_skills_to_bd(n_epoch=epoch,
                                                     n_episodes=n_episodes)
                            # Make snapshot
                            params = self.get_snapshot(epoch)
                            logger.save_itr_params(epoch, params)

                            gt.stamp('behaviours')

                    else:
                        aug_obs = aug_next_ob

                    gt.stamp('sample')

                    if self._pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self._pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                    # Stop sampling once the episode limit is reached.
                    if n_episodes >= EPISODE_LIMIT:
                        break

                else:
                    # The inner loop finished without breaking; go on to the
                    # next epoch.
                    continue
                # The inner loop hit the episode limit, so stop the epoch
                # loop as well.
                break

                if self._learn_p_z:
                    print('learning p(z)')
                    for z in range(self._num_skills):
                        if log_p_z_list[z]:
                            print(
                                '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d'
                                % (z, np.min(
                                    log_p_z_list[z]), np.max(log_p_z_list[z]),
                                   np.mean(
                                       log_p_z_list[z]), len(log_p_z_list[z])))
                    log_p_z = [
                        np.mean(log_p_z)
                        if log_p_z else np.log(1.0 / self._num_skills)
                        for log_p_z in log_p_z_list
                    ]
                    print('log_p_z: %s' % log_p_z)
                    self._p_z = utils._softmax(log_p_z)

            logger.push_prefix('Epoch #%d | ' % epoch)
            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self._pool.size)
            logger.record_tabular('path-length', np.mean(path_length_list))

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

            env.terminate()
Example #14
    def eval(self, *inputs):
        feeds = {pl: val for pl, val in zip(self._input_pls, inputs)}
        if self.todropoutvf:
            feeds[self.dropoutvf_placeholder] = self.dropoutvf_keep_prob
        return tf_utils.get_default_session().run(self._output_t, feeds)
Example #15
File: diayn.py Project: sumitsk/sac
    def _train(self, env, policy, pool):
        """When training our policy expects an augmented observation."""
        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()
            log_p_z_episode = []  # Store log_p_z for this episode
            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0

            if self._learn_p_z:
                log_p_z_list = [
                    deque(maxlen=self._max_path_length)
                    for _ in range(self._num_skills)
                ]

            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                path_length_list = []
                z = self._sample_z()
                aug_obs = utils.concat_obs_z(observation,
                                             z,
                                             self._num_skills,
                                             concat_type=self.concat_type)

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(aug_obs)

                    if self._learn_p_z:
                        (obs, _) = utils.split_aug_obs(aug_obs,
                                                       self._num_skills)
                        feed_dict = {
                            self._discriminator._obs_pl: obs[None],
                            self._discriminator._action_pl: action[None]
                        }
                        logits = tf_utils.get_default_session().run(
                            self._discriminator._output_t, feed_dict)[0]
                        log_p_z = np.log(utils._softmax(logits)[z])
                        if self._learn_p_z:
                            log_p_z_list[z].append(log_p_z)

                    next_ob, reward, terminal, info = env.step(action)
                    aug_next_ob = utils.concat_obs_z(
                        next_ob,
                        z,
                        self._num_skills,
                        concat_type=self.concat_type)
                    path_length += 1
                    path_return += reward

                    self._pool.add_sample(
                        aug_obs,
                        action,
                        reward,
                        terminal,
                        aug_next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        path_length_list.append(path_length)
                        observation = env.reset()
                        policy.reset()
                        log_p_z_episode = []
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        aug_obs = aug_next_ob
                    gt.stamp('sample')

                    if self._pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self._pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                if self._learn_p_z:
                    print('learning p(z)')
                    for z in range(self._num_skills):
                        if log_p_z_list[z]:
                            print(
                                '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d'
                                % (z, np.min(
                                    log_p_z_list[z]), np.max(log_p_z_list[z]),
                                   np.mean(
                                       log_p_z_list[z]), len(log_p_z_list[z])))
                    log_p_z = [
                        np.mean(log_p_z)
                        if log_p_z else np.log(1.0 / self._num_skills)
                        for log_p_z in log_p_z_list
                    ]
                    print('log_p_z: %s' % log_p_z)
                    self._p_z = utils._softmax(log_p_z)

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self._pool.size)
                logger.record_tabular('path-length', np.mean(path_length_list))

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()
Example #16
    def train(self):
        """
        CG: the function that conducts ensemble training.
        :return: None.
        """
        # Set up parameters for the training process.
        self._n_epochs = self._base_ac_params['n_epochs']
        self._epoch_length = self._base_ac_params['epoch_length']
        self._n_train_repeat = self._base_ac_params['n_train_repeat']
        self._n_initial_exploration_steps = self._base_ac_params[
            'n_initial_exploration_steps']
        self._eval_render = self._base_ac_params['eval_render']
        self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
        self._eval_deterministic = self._base_ac_params['eval_deterministic']

        # Set up the evaluation environment.
        if self._eval_n_episodes > 0:
            with tf.variable_scope("low_level_policy", reuse=True):
                self._eval_env = deep_clone(self._env)

        # Set up the tensor flow session.
        self._sess = tf_utils.get_default_session()

        # Import required libraries for training.
        import random
        import math
        import operator
        import numpy as np

        # Initialize the sampler.
        alg_ins = random.choice(self._alg_instances)
        self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

        # Perform the training/evaluation process.
        num_episode = 0.
        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    isEpisodeEnd = self._sampler.sample()

                    # If an episode has ended, update the performance statistics of the
                    # current AC instance and select another AC instance for the next
                    # episode of exploration.
                    if isEpisodeEnd:
                        num_episode = num_episode + 1.
                        alg_ins[1] = 0.9 * alg_ins[
                            1] + 0.1 * self._sampler._last_path_return
                        alg_ins[2] = alg_ins[2] + 1.

                        if self._use_ucb:
                            # Select an algorithm instance based on UCB.
                            selected = False
                            for ains in self._alg_instances:
                                if ains[2] < 1.:
                                    alg_ins = ains
                                    selected = True
                                    break
                                else:
                                    ains[3] = ains[1] + math.sqrt(
                                        2.0 * math.log(num_episode) / ains[2])

                            if not selected:
                                alg_ins = max(self._alg_instances,
                                              key=operator.itemgetter(3))

                        else:
                            # Select an algorithm instance uniformly at random.
                            alg_ins = random.choice(self._alg_instances)
                            self._sampler.set_policy(alg_ins[0].policy)

                    if not self._sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    # Perform training over all AC instances.
                    for i in range(self._n_train_repeat):
                        batch = self._sampler.random_batch()
                        for ains in self._alg_instances:
                            ains[0]._do_training(iteration=t +
                                                 epoch * self._epoch_length,
                                                 batch=batch)
                    gt.stamp('train')

                # Perform evaluation after one full epoch of training is completed.
                if self._eval_n_episodes < 1:
                    continue

                if self._evaluation_strategy == 'ensemble':
                    # Use a whole ensemble of AC instances for evaluation.
                    paths = rollouts(self._eval_env, self,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)

                elif self._evaluation_strategy == 'best-policy':
                    # Choose the AC instance with the highest observed performance so far for evaluation.
                    eval_alg_ins = max(self._alg_instances,
                                       key=operator.itemgetter(1))
                    with eval_alg_ins[0].policy.deterministic(
                            self._eval_deterministic):
                        paths = rollouts(self._eval_env,
                                         eval_alg_ins[0].policy,
                                         self._sampler._max_path_length,
                                         self._eval_n_episodes)

                else:
                    paths = None

                if paths is not None:
                    total_returns = [path['rewards'].sum() for path in paths]
                    episode_lengths = [len(p['rewards']) for p in paths]
                    logger.record_tabular('return-average',
                                          np.mean(total_returns))
                    logger.record_tabular('return-min', np.min(total_returns))
                    logger.record_tabular('return-max', np.max(total_returns))
                    logger.record_tabular('return-std', np.std(total_returns))
                    logger.record_tabular('episode-length-avg',
                                          np.mean(episode_lengths))
                    logger.record_tabular('episode-length-min',
                                          np.min(episode_lengths))
                    logger.record_tabular('episode-length-max',
                                          np.max(episode_lengths))
                    logger.record_tabular('episode-length-std',
                                          np.std(episode_lengths))

                    self._eval_env.log_diagnostics(paths)
                    if self._eval_render:
                        self._eval_env.render(paths)

                # Produce log info after each episode of training and evaluation.
                times_itrs = gt.get_times().stamps.itrs
                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self._sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            # Terminate the sampler after the training process is completed.
            self._sampler.terminate()
Example #17
    def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
    ):
        """
        CG: the constructor.
        :param environment_name: the name of the environment as a string.
        :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
        :param lr: the learning rate to be used in the ensemble.
        :param scale_reward: the reward scaling factor.
        :param scale_entropy: the entropy scaling factor.
        :param discount: the reward discount factor.
        :param tau: the target value function updating factor.
        :param max_replay_buffer_size: the maximum size of the replay buffer.
        :param sampler_params: extra parameter settings for the random sampler.
        :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
        :param value_func_layer_size: the number of neurons of each hidden layer of the value network. 
        :param policy_func_layers_number: the number of hidden layers for the policy network.
        :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
        :param base_ac_alg_params: base parameters for the AC algorithm.
        :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
        :param use_ucb: an indicator regarding the use of ucb for selecting AC instances in the ensemble for exploration.
        :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
        """
        # Set up the environment.
        self._environment_name = environment_name
        self._env = GymEnv(self._environment_name)

        # Set up the algorithm parameters.
        self._algorithm_name = algorithm_name
        self._lr = lr
        self._scale_reward = scale_reward
        self._scale_entropy = scale_entropy
        self._discount = discount
        self._tau = tau
        self._use_ucb = use_ucb
        self._evaluation_strategy = evaluation_strategy

        # Set up the replay buffer.
        self._max_replay_buffer_size = max_replay_buffer_size
        self._pool = SimpleReplayBuffer(
            env_spec=self._env.spec,
            max_replay_buffer_size=self._max_replay_buffer_size)

        # Set up the environment sampler.
        self._sampler_params = sampler_params
        self._sampler = SimpleSampler(**self._sampler_params)

        # Set up the required number of AC instances in the ensemble. Each AC instance has its own value network and policy network.
        self._alg_instances = []
        self._base_ac_params = base_ac_alg_params
        self._base_alg_params = dict(self._base_ac_params,
                                     sampler=self._sampler)
        for id, q_val in enumerate(q_param_list):
            # Set up the value function network for an AC instance.
            qf1 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf1')
            qf2 = NNQFunction(env_spec=self._env.spec,
                              hidden_layer_sizes=tuple([
                                  value_func_layer_size
                                  for _ in range(value_func_layers_number)
                              ]),
                              name=str(id) + 'qf2')
            vf = NNVFunction(env_spec=self._env.spec,
                             hidden_layer_sizes=tuple([
                                 value_func_layer_size
                                 for _ in range(value_func_layers_number)
                             ]),
                             name=str(id) + 'vf')

            # Set up the policy network for an AC instance.
            policy = GaussianPolicy(
                env_spec=self._env.spec,
                hidden_layer_sizes=tuple([
                    policy_func_layer_size
                    for _ in range(policy_func_layers_number)
                ]),
                squash=True,
                reparameterize=False,
                reg=1.e-3,
                name=str(id) + 'gaussian_policy')
            initial_exploration_policy = policy

            # Set up an AC instance.
            if self._algorithm_name == 'sac':
                algorithm = SACV1(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                )
            elif self._algorithm_name == 'tac':
                algorithm = TAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    tsallisQ=q_val,
                )
            elif self._algorithm_name == 'rac':
                algorithm = RAC(
                    base_kwargs=self._base_alg_params,
                    env=self._env,
                    policy=policy,
                    initial_exploration_policy=initial_exploration_policy,
                    pool=self._pool,
                    qf1=qf1,
                    qf2=qf2,
                    vf=vf,
                    lr=self._lr,
                    scale_reward=self._scale_reward,
                    scale_entropy=self._scale_entropy,
                    discount=self._discount,
                    tau=self._tau,
                    reparameterize=False,
                    target_update_interval=1,
                    action_prior='uniform',
                    save_full_state=False,
                    renyiQ=q_val,
                )
            else:
                raise NotImplementedError

            # Initialize the AC instance.
            # algorithm._sess.run(tf.global_variables_initializer())

            # Put the initialized AC instance into the algorithm instance list.
            # Each element of the algorithm instance list is made up of
            #           the algorithm instance,
            #           the moving average performance of the instance,
            #           the number of times the instance has been used for exploration previously, and
            #           the UCB bound.
            self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

        # Set up the ensemble Q-function for action selection.
        self._Q_ensemble = NNQFunction(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                value_func_layer_size for _ in range(value_func_layers_number)
            ]),
            name='ensqf')

        # ========================================================================
        # Set up the training target for the ensemble Q-function for action selection.
        # ========================================================================
        # Create the observation placeholder.
        self._observations_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='obv_ens',
        )

        # Create the next observation placeholder.
        self._observations_ens_next_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.observation_space.flat_dim),
            name='next_obv_ens',
        )

        # Create a list of next action placeholders.
        self._acts_next_phs = []
        for i in range(len(q_param_list)):
            act_ens_ph = tf.placeholder(
                tf.float32,
                shape=(None, self._env.spec.action_space.flat_dim),
                name=str(i) + '_next_act_ens',
            )
            self._acts_next_phs.append(act_ens_ph)

        # Create the observed action placeholder.
        self._obv_act_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name='act_obv_ens',
        )

        # Create the reward placeholder.
        self._rewards_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='rew_ens',
        )

        # Create the terminal placeholder.
        self._terminals_ph = tf.placeholder(
            tf.float32,
            shape=(None, ),
            name='ter_ens',
        )

        # Determine the target Q-value for next step.
        self._q_ens_targets = []
        for act_next_ph in self._acts_next_phs:
            qt = self._Q_ensemble.get_output_for(
                self._observations_ens_next_ph, act_next_ph, reuse=True)
            self._q_ens_targets.append(qt)

        for i, q_t in enumerate(self._q_ens_targets):
            if i == 0:
                self._q_ens_next = q_t
            else:
                self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
                # self._q_ens_next = self._q_ens_next + q_t
        # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

        # Determine the Q-loss.
        self._q_train = self._Q_ensemble.get_output_for(
            self._observations_ens_ph, self._obv_act_ph, reuse=True)
        self._q_ens_loss = 0.5 * tf.reduce_mean(
            (self._q_train -
             tf.stop_gradient(self._scale_reward * self._rewards_ph +
                              (1 - self._terminals_ph) * self._discount *
                              self._q_ens_next))**2)

        # Determine the Q-training operator.
        self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
            loss=self._q_ens_loss,
            var_list=self._Q_ensemble.get_params_internal())

        # Set up the tensor flow session.
        self._sess = tf_utils.get_default_session()
        self._sess.run(tf.global_variables_initializer())
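The constructor above only defines the ensemble Q-function's loss and training operator; the update step itself is not shown. A minimal sketch of such a step, assuming a replay batch and one candidate next action per AC instance (all batch arrays below are placeholders):

# Hypothetical update of the ensemble Q-function; `self` is the ensemble object.
feed = {
    self._observations_ens_ph: batch_obs,            # shape (N, obs_dim)
    self._obv_act_ph: batch_actions,                 # shape (N, act_dim)
    self._rewards_ph: batch_rewards,                 # shape (N,)
    self._terminals_ph: batch_terminals,             # shape (N,)
    self._observations_ens_next_ph: batch_next_obs,  # shape (N, obs_dim)
}
# One next-action candidate per AC instance, matching self._acts_next_phs.
for ph, next_acts in zip(self._acts_next_phs, per_instance_next_actions):
    feed[ph] = next_acts

ens_loss, _ = self._sess.run(
    (self._q_ens_loss, self._q_ens_train_operator), feed)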