Example #1
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(
                    tf.float32, [None])
                self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                self.model = step_model = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      n_batch_step,
                                                      reuse=False)
                self.model2 = train_model = self.policy(self.sess,
                                                        self.observation_space,
                                                        self.action_space,
                                                        self.n_envs,
                                                        self.n_steps,
                                                        n_batch_train,
                                                        reuse=True)

                self.action_ph = action_ph = train_model.pdtype.sample_placeholder(
                    [None])

                logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=train_model.policy, labels=action_ph)
                self.logits = train_model.policy

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * logpac)
                self.entropy = entropy = tf.reduce_mean(
                    calc_entropy(train_model.policy))
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn),
                                             rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                sample_net = train_model.value_fn + tf.random_normal(
                    tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net),
                           2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                self.params = params = find_trainable_variables("model")

                self.grads_check = tf.gradients(train_loss, params)

                with tf.device('/gpu:0'):
                    self.optim = optim = kfac.KfacOptimizer(
                        learning_rate=pg_lr_ph,
                        clip_kl=self.kfac_clip,
                        momentum=0.9,
                        kfac_update=1,
                        epsilon=0.01,
                        stats_decay=0.99,
                        async_eigen_decomp=1,  # renamed from 'async', a reserved word since Python 3.7
                        cold_iter=10,
                        max_grad_norm=self.max_grad_norm,
                        verbose=self.verbose)

                    optim.compute_and_apply_stats(self.joint_fisher,
                                                  var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)
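
For reference, the training loss assembled above can be checked with a framework-free numpy sketch; the coefficients and batch values below are made up, and calc_entropy is assumed to be the categorical entropy of the softmax over train_model.policy.

import numpy as np

# Made-up coefficients and a toy batch, standing in for the model config and
# the advs_ph / rewards_ph / action_ph placeholders above.
ent_coef, vf_coef = 0.01, 0.25
logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2, 0.3]])            # train_model.policy
actions = np.array([0, 2])                      # action_ph
advs = np.array([1.5, -0.5])                    # advs_ph
rewards = np.array([1.0, 0.2])                  # rewards_ph (discounted returns)
values = np.array([0.8, 0.4])                   # tf.squeeze(train_model.value_fn)

# sparse_softmax_cross_entropy_with_logits is the negative log-probability of
# the chosen action under softmax(logits).
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
logpac = -log_probs[np.arange(len(actions)), actions]

probs = np.exp(log_probs)
entropy = -(probs * log_probs).sum(axis=1).mean()   # calc_entropy(...), assumed categorical

pg_loss = (advs * logpac).mean() - ent_coef * entropy
vf_loss = ((values - rewards) ** 2).mean()          # mse(tf.squeeze(value_fn), rewards_ph)
train_loss = pg_loss + vf_coef * vf_loss
print(pg_loss, vf_loss, train_loss)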

Example #2
def learn(env,
          policy,
          value_fn,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount value
    :param lam: (float) the GAE (Generalized Advantage Estimation) factor, trading off bias and variance in the advantage estimates
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) whether to render the environment
    :param callback: (function) called every step, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence used to adapt the step size
    """
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_eigen_decomp=1,
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(
            queue_runner.create_threads(tf.get_default_session(),
                                        coord=coord,
                                        start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular(
            "EpLenMean", np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback is not None:
            # Only stop training if return value is False, not when it is None. This is for backwards
            # compatibility with callbacks that have no return statement.
            if callback(locals(), globals()) is False:
                break
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
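
The advantage estimation in the batch loop above is GAE(lambda): TD residuals are discounted backwards with gamma * lam. A standalone numpy sketch, assuming common.discount is the usual reverse discounted cumulative sum:

import numpy as np

# Reverse discounted cumulative sum: y[t] = x[t] + g * y[t + 1]
# (the assumed behaviour of common.discount above).
def discount(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

def gae_advantages(rewards, vpred, terminated, gamma=0.99, lam=0.97):
    # Bootstrap with the last value prediction unless the episode terminated,
    # mirroring the np.append(...) call in the loop above.
    vpred = np.append(vpred, 0.0 if terminated else vpred[-1])
    delta = rewards + gamma * vpred[1:] - vpred[:-1]
    return discount(delta, gamma * lam)

rewards = np.array([1.0, 0.0, 1.0])
vpred = np.array([0.5, 0.4, 0.6])
print(gae_advantages(rewards, vpred, terminated=True))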
Example #3
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            if isinstance(self.action_space, Box):
                raise NotImplementedError(
                    "WIP: ACKTR does not support Continuous actions yet.")

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                self.model = step_model = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      n_batch_step,
                                                      reuse=False,
                                                      **self.policy_kwargs)

                self.params = params = tf_util.get_trainable_vars("model")

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    self.model2 = train_model = self.policy(
                        self.sess,
                        self.observation_space,
                        self.action_space,
                        self.n_envs,
                        self.n_steps,
                        n_batch_train,
                        reuse=True,
                        **self.policy_kwargs)

                with tf.variable_scope(
                        "loss",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.placeholder(
                        tf.float32, [None])
                    self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])
                    self.action_ph = action_ph = train_model.pdtype.sample_placeholder(
                        [None])

                    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=train_model.policy, labels=action_ph)
                    self.logits = train_model.policy

                    # training loss
                    pg_loss = tf.reduce_mean(advs_ph * logpac)
                    self.entropy = entropy = tf.reduce_mean(
                        calc_entropy(train_model.policy))
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(
                        tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                    sample_net = train_model.value_fn + tf.random_normal(
                        tf.shape(train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                        tf.pow(
                            train_model.value_fn -
                            tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', pg_loss)
                    tf.summary.scalar('policy_gradient_fisher_loss',
                                      pg_fisher_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('value_function_fisher_loss',
                                      vf_fisher_loss)
                    tf.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(train_loss, params)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.pg_lr_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate', self.pg_lr_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                with tf.variable_scope(
                        "kfac",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        self.optim = optim = kfac.KfacOptimizer(
                            learning_rate=pg_lr_ph,
                            clip_kl=self.kfac_clip,
                            momentum=0.9,
                            kfac_update=1,
                            epsilon=0.01,
                            stats_decay=0.99,
                            async_eigen_decomp=self.async_eigen_decomp,
                            cold_iter=10,
                            max_grad_norm=self.max_grad_norm,
                            verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher,
                                                      var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
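
Both setup_model variants build the policy network twice: a one-step step_model for acting and an n_steps train_model for the loss, sharing weights through variable reuse. A toy TF1 sketch of that pattern (layer sizes here are arbitrary):

import tensorflow as tf

# The same network is built twice inside one variable scope, first with
# reuse=False (one step per env, for acting) and then with reuse=True
# (n_envs * n_steps rows, for the loss), so both graphs share one set of weights.
def build_policy(obs, reuse):
    with tf.variable_scope("model", reuse=reuse):
        hidden = tf.layers.dense(obs, 64, activation=tf.nn.relu, name="fc1")
        return tf.layers.dense(hidden, 4, name="logits")

step_obs = tf.placeholder(tf.float32, [None, 8])    # n_batch_step rows
train_obs = tf.placeholder(tf.float32, [None, 8])   # n_batch_train rows
step_logits = build_policy(step_obs, reuse=False)
train_logits = build_policy(train_obs, reuse=True)

# Both models refer to the same variables under the "model/" scope:
print(sorted({v.name for v in tf.trainable_variables() if v.name.startswith("model/")}))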
    def __init__(self, ob_dim, ac_dim, verbose=1):
        """
        Create an MLP model for the value function

        :param ob_dim: (int) observation dimension
        :param ac_dim: (int) action dimension
        :param verbose: (int) verbosity level
        """
        obs_ph = tf.placeholder(tf.float32,
                                shape=[None, ob_dim * 2 + ac_dim * 2 + 2
                                       ])  # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        layer_1 = tf.nn.elu(
            dense(obs_ph,
                  64,
                  "h1",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        layer_2 = tf.nn.elu(
            dense(layer_1,
                  64,
                  "h2",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        vpred_n = dense(layer_2,
                        1,
                        "hfinal",
                        weight_init=tf_util.normc_initializer(1.0),
                        bias_init=0,
                        weight_loss_dict=wd_dict)[:, 0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = tf.reduce_mean(
            tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))

        self._predict = tf_util.function([obs_ph], vpred_n)

        optim = kfac.KfacOptimizer(learning_rate=0.001,
                                   cold_lr=0.001 * (1 - 0.9),
                                   momentum=0.9,
                                   clip_kl=0.3,
                                   epsilon=0.1,
                                   stats_decay=0.95,
                                   async_eigen_decomp=1,  # renamed from 'async' (reserved word in Python 3.7+)
                                   kfac_update=2,
                                   cold_iter=50,
                                   weight_decay_dict=wd_dict,
                                   max_grad_norm=None,
                                   verbose=verbose)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss,
                                                  loss_sampled,
                                                  var_list=vf_var_list)
        self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
        tf_util.initialize()  # Initialize uninitialized TF variables
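
The var-name filters in learn() ("pi" in var.name) and in this constructor ("vf" in var.name) suggest that the policy and value function are created under "pi" and "vf" variable scopes before learn() is called. A rough wiring sketch; the class names, environment id, and hyperparameters below are assumptions for illustration:

import gym
import tensorflow as tf

env = gym.make("Pendulum-v0")                # any continuous-action Gym env
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]

with tf.Session():
    with tf.variable_scope("vf"):
        value_fn = NeuralNetValueFunction(ob_dim, ac_dim)   # assumed name for the class defined above
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)          # assumed policy class exposing update_info / compute_kl

    learn(env, policy=policy, value_fn=value_fn,
          gamma=0.99, lam=0.97,
          timesteps_per_batch=2500, num_timesteps=int(1e6))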