Пример #1
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                "instance of common.policies.ActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.compat.v1.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.compat.v1.placeholder(tf.float32, [None],
                                                            name="advs_ph")
                    self.rewards_ph = tf.compat.v1.placeholder(
                        tf.float32, [None], name="rewards_ph")
                    self.learning_rate_ph = tf.compat.v1.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.actions_ph)
                    self.entropy = tf.reduce_mean(
                        input_tensor=train_model.proba_distribution.entropy())
                    self.pg_loss = tf.reduce_mean(input_tensor=self.advs_ph *
                                                  neglogpac)
                    self.vf_loss = mse(tf.squeeze(train_model.value_flat),
                                       self.rewards_ph)
                    # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                    # and https://github.com/dennybritz/reinforcement-learning/issues/34
                    # suggest to add an entropy component in order to improve exploration.
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.compat.v1.summary.scalar('entropy_loss', self.entropy)
                    tf.compat.v1.summary.scalar('policy_gradient_loss',
                                                self.pg_loss)
                    tf.compat.v1.summary.scalar('value_function_loss',
                                                self.vf_loss)
                    tf.compat.v1.summary.scalar('loss', loss)

                    self.params = tf_util.get_trainable_vars("model")
                    grads = tf.gradients(ys=loss, xs=self.params)
                    if self.max_grad_norm is not None:
                        grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar(
                        'discounted_rewards',
                        tf.reduce_mean(input_tensor=self.rewards_ph))
                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.learning_rate_ph))
                    tf.compat.v1.summary.scalar(
                        'advantage', tf.reduce_mean(input_tensor=self.advs_ph))
                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards',
                                                       self.rewards_ph)
                        tf.compat.v1.summary.histogram('learning_rate',
                                                       self.learning_rate_ph)
                        tf.compat.v1.summary.histogram('advantage',
                                                       self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation',
                                                       train_model.obs_ph)
                        else:
                            tf.compat.v1.summary.histogram(
                                'observation', train_model.obs_ph)

                trainer = tf.compat.v1.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha,
                    epsilon=self.epsilon,
                    momentum=self.momentum)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.compat.v1.global_variables_initializer().run(
                    session=self.sess)

                self.summary = tf.compat.v1.summary.merge_all()
Пример #2
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.n_batch = self.n_envs * self.n_runs * self.n_steps

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\
                        "the number of environments run in parallel should be a multiple of nminibatches."
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_batch // self.nminibatches

                act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs)
                with tf.compat.v1.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")):
                    train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs * self.n_runs // self.nminibatches, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs)

                self.observation_ph = tf.compat.v1.placeholder(shape=(None,) + self.observation_space.shape, dtype=self.observation_space.dtype, name='obs')
                self.processed_obs = tf.cast(self.observation_ph, tf.float32)

                self.observation_next_ph = tf.compat.v1.placeholder(shape=(None,) + self.observation_space.shape, dtype=self.observation_space.dtype, name='obs_next')
                self.processed_obs_next = tf.cast(self.observation_next_ph, tf.float32)

                with tf.compat.v1.variable_scope("obs_encoded", reuse=tf.compat.v1.AUTO_REUSE):
                    self.obs_encoded = obs_autoencoder(self.processed_obs, self.observation_space)
                    self.obs_next_encoded = obs_autoencoder(self.processed_obs_next, self.observation_space)
                #self.obs_encoded = self.processed_obs
                #self.obs_next_encoded = self.processed_obs_next

                self.act_hat = inverse_model(self.obs_encoded, self.obs_next_encoded, self.action_space)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    self.action_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")

                    self.processed_act = tf.cast(tf.one_hot(self.action_ph, self.action_space.n), tf.float32)
                    self.obs_next_hat = forward_model(self.obs_encoded, self.processed_act, self.observation_space)

                    self.advs_ph = tf.compat.v1.placeholder(tf.float32, [None], name="advs_ph")
                    self.rewards_ph = tf.compat.v1.placeholder(tf.float32, [None], name="rewards_ph")

                    self.true_rewards_ph = tf.compat.v1.placeholder(tf.float32, [None], name="true_rewards_ph")

                    self.old_neglog_pac_ph = tf.compat.v1.placeholder(tf.float32, [None], name="old_neglog_pac_ph")
                    self.old_vpred_ph = tf.compat.v1.placeholder(tf.float32, [None], name="old_vpred_ph")
                    self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, [], name="learning_rate_ph")
                    self.clip_range_ph = tf.compat.v1.placeholder(tf.float32, [], name="clip_range_ph")
                    neglogpac = train_model.proba_distribution.neglogp(self.action_ph)
                    self.entropy = tf.reduce_mean(input_tensor=train_model.proba_distribution.entropy())
                    vpred = train_model.value_flat

                    # Value function clipping: not present in the original PPO
                    if self.cliprange_vf is None:
                        # Default behavior (legacy from OpenAI baselines):
                        # use the same clipping as for the policy
                        self.clip_range_vf_ph = self.clip_range_ph
                        self.cliprange_vf = self.cliprange
                    elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0:
                        # Original PPO implementation: no value function clipping
                        self.clip_range_vf_ph = None
                    else:
                        # Last possible behavior: clipping range
                        # specific to the value function
                        self.clip_range_vf_ph = tf.compat.v1.placeholder(tf.float32, [], name="clip_range_vf_ph")

                    if self.clip_range_vf_ph is None:
                        # No clipping
                        vpred_clipped = train_model.value_flat
                    else:
                        # Clip the different between old and new value
                        # NOTE: this depends on the reward scaling
                        vpred_clipped = self.old_vpred_ph + tf.clip_by_value(train_model.value_flat - self.old_vpred_ph, - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(vpred - self.rewards_ph)
                    vf_losses2 = tf.square(vpred_clipped - self.rewards_ph)
                    self.vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))

                    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                    pg_losses = -self.advs_ph * ratio
                    pg_losses2 = -self.advs_ph * tf.clip_by_value(ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph)
                    self.pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
                    self.approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - self.old_neglog_pac_ph))
                    self.clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph), tf.float32))

                    self.params = tf.compat.v1.trainable_variables()
                    weight_params = [v for v in self.params if '/b' not in v.name]
                    l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])

                    self.frw_loss = 0.5 * tf.reduce_sum(tf.math.square(self.obs_next_encoded - self.obs_next_hat))

                    #self.inv_loss = - tf.reduce_sum(self.processed_act * tf.math.log(self.act_hat + tf.keras.backend.epsilon()))

                    self.inv_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.act_hat, labels=tf.cast(self.action_ph, tf.int64)))

                    self.int_loss = self.beta * self.frw_loss + (1.0 - self.beta) * self.inv_loss
                    loss = self.lmd * (self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef) + self.int_loss

                    self.int_reward = self.eta * self.frw_loss

                    tf.compat.v1.summary.scalar('entropy_loss', self.entropy)
                    tf.compat.v1.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.compat.v1.summary.scalar('value_function_loss', self.vf_loss)
                    tf.compat.v1.summary.scalar('intrinsic_loss', self.int_loss)
                    tf.compat.v1.summary.scalar('approximate_kullback-leibler', self.approxkl)
                    tf.compat.v1.summary.scalar('clip_factor', self.clipfrac)
                    tf.compat.v1.summary.scalar('loss', loss)

                    for var in self.params:
                        print(var.name, var)

                    with tf.compat.v1.variable_scope('model'):
                        if self.full_tensorboard_log:
                            for var in self.params:
                                tf.compat.v1.summary.histogram(var.name, var)
                    grads = tf.gradients(ys=loss, xs=self.params)
                    if self.max_grad_norm is not None:
                        grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))
                    for gr in grads:
                        print(gr)
                trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph, epsilon=1e-5)
                self._train = trainer.apply_gradients(grads)

                self.loss_names = ['policy_loss', 'value_loss', 'int_loss', 'policy_entropy', 'approxkl', 'clipfrac']

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar('true_rewards', tf.reduce_mean(input_tensor=self.true_rewards_ph))
                    tf.compat.v1.summary.scalar('discounted_rewards', tf.reduce_mean(input_tensor=self.rewards_ph))
                    tf.compat.v1.summary.scalar('learning_rate', tf.reduce_mean(input_tensor=self.learning_rate_ph))
                    tf.compat.v1.summary.scalar('advantage', tf.reduce_mean(input_tensor=self.advs_ph))
                    tf.compat.v1.summary.scalar('clip_range', tf.reduce_mean(input_tensor=self.clip_range_ph))
                    if self.clip_range_vf_ph is not None:
                        tf.compat.v1.summary.scalar('clip_range_vf', tf.reduce_mean(input_tensor=self.clip_range_vf_ph))

                    tf.compat.v1.summary.scalar('old_neglog_action_probability', tf.reduce_mean(input_tensor=self.old_neglog_pac_ph))
                    tf.compat.v1.summary.scalar('old_value_pred', tf.reduce_mean(input_tensor=self.old_vpred_ph))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards', self.rewards_ph)
                        tf.compat.v1.summary.histogram('learning_rate', self.learning_rate_ph)
                        tf.compat.v1.summary.histogram('advantage', self.advs_ph)
                        tf.compat.v1.summary.histogram('clip_range', self.clip_range_ph)
                        tf.compat.v1.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph)
                        tf.compat.v1.summary.histogram('old_value_pred', self.old_vpred_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.compat.v1.summary.histogram('observation', train_model.obs_ph)

                self.train_model = train_model
                self.act_model = act_model
                self.step = act_model.step
                self.proba_step = act_model.proba_step
                self.value = act_model.value
                self.initial_state = act_model.initial_state
                tf.compat.v1.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

                self.summary = tf.compat.v1.summary.merge_all()
Пример #3
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            # Enable continuous actions tricks (normalized advantage)
            self.continuous_actions = isinstance(self.action_space, Box)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                self.params = params = tf_util.get_trainable_vars("model")

                with tf.compat.v1.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.compat.v1.variable_scope(
                        "loss",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.compat.v1.placeholder(
                        tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.compat.v1.placeholder(
                        tf.float32, [None])
                    self.learning_rate_ph = learning_rate_ph = tf.compat.v1.placeholder(
                        tf.float32, [])
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None])

                    neg_log_prob = train_model.proba_distribution.neglogp(
                        self.actions_ph)

                    # training loss
                    pg_loss = tf.reduce_mean(input_tensor=advs_ph *
                                             neg_log_prob)
                    self.entropy = entropy = tf.reduce_mean(
                        input_tensor=train_model.proba_distribution.entropy())
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(
                        tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(
                        input_tensor=neg_log_prob)
                    sample_net = train_model.value_fn + tf.random.normal(
                        tf.shape(input=train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                        input_tensor=tf.pow(
                            train_model.value_fn -
                            tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.compat.v1.summary.scalar('entropy_loss', self.entropy)
                    tf.compat.v1.summary.scalar('policy_gradient_loss',
                                                pg_loss)
                    tf.compat.v1.summary.scalar('policy_gradient_fisher_loss',
                                                pg_fisher_loss)
                    tf.compat.v1.summary.scalar('value_function_loss',
                                                self.vf_loss)
                    tf.compat.v1.summary.scalar('value_function_fisher_loss',
                                                vf_fisher_loss)
                    tf.compat.v1.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(ys=train_loss, xs=params)

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar(
                        'discounted_rewards',
                        tf.reduce_mean(input_tensor=self.rewards_ph))
                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.learning_rate_ph))
                    tf.compat.v1.summary.scalar(
                        'advantage', tf.reduce_mean(input_tensor=self.advs_ph))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards',
                                                       self.rewards_ph)
                        tf.compat.v1.summary.histogram('learning_rate',
                                                       self.learning_rate_ph)
                        tf.compat.v1.summary.histogram('advantage',
                                                       self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation',
                                                       train_model.obs_ph)
                        else:
                            tf.compat.v1.summary.histogram(
                                'observation', train_model.obs_ph)

                with tf.compat.v1.variable_scope(
                        "kfac",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        self.optim = optim = kfac.KfacOptimizer(
                            learning_rate=learning_rate_ph,
                            clip_kl=self.kfac_clip,
                            momentum=0.9,
                            kfac_update=self.kfac_update,
                            epsilon=0.01,
                            stats_decay=0.99,
                            async_eigen_decomp=self.async_eigen_decomp,
                            cold_iter=10,
                            max_grad_norm=self.max_grad_norm,
                            verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher,
                                                      var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.compat.v1.global_variables_initializer().run(
                    session=self.sess)

                self.summary = tf.compat.v1.summary.merge_all()
Пример #4
0
    def setup_model(self):
        # prevent import loops
        from reinforcement_learning.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.compat.v1.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(input_tensor=kloldnew)
                    meanent = tf.reduce_mean(input_tensor=ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(input_tensor=tf.square(self.policy_pi.value_flat - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                                   old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(input_tensor=ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(ys=dist, xs=var_list)
                    flat_tangent = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(input_tensor=grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    # Fisher vector products
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.compat.v1.summary.scalar('entropy_loss', meanent)
                    tf.compat.v1.summary.scalar('policy_gradient_loss', optimgain)
                    tf.compat.v1.summary.scalar('value_function_loss', surrgain)
                    tf.compat.v1.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.compat.v1.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.compat.v1.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg],
                                                        fvp)
                    self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                                                                  tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.compat.v1.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar('discounted_rewards', tf.reduce_mean(input_tensor=ret))
                    tf.compat.v1.summary.scalar('learning_rate', tf.reduce_mean(input_tensor=self.vf_stepsize))
                    tf.compat.v1.summary.scalar('advantage', tf.reduce_mean(input_tensor=atarg))
                    tf.compat.v1.summary.scalar('kl_clip_range', tf.reduce_mean(input_tensor=self.max_kl))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards', ret)
                        tf.compat.v1.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.compat.v1.summary.histogram('advantage', atarg)
                        tf.compat.v1.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation', observation)
                        else:
                            tf.compat.v1.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.compat.v1.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Пример #5
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)

                with tf.compat.v1.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess,
                                                     self.observation_space,
                                                     self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.compat.v1.placeholder(
                        tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.compat.v1.placeholder(tf.float32,
                                                               shape=(None, 1),
                                                               name='rewards')
                    self.actions_ph = tf.compat.v1.placeholder(
                        tf.float32,
                        shape=(None, ) + self.action_space.shape,
                        name='actions')
                    self.learning_rate_ph = tf.compat.v1.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.compat.v1.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(
                        input_tensor=self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        self.actions_ph,
                        create_qf=True,
                        create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph,
                        policy_out,
                        create_qf=True,
                        create_vf=False,
                        reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(
                            self.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef,
                                  str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.compat.v1.get_variable(
                            'log_ent_coef',
                            dtype=tf.float32,
                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.compat.v1.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(
                        self.processed_next_obs_ph,
                        create_qf=False,
                        create_vf=True)
                    self.value_target = value_target

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * self.value_target)

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup -
                                                                  qf1)**2)
                    qf2_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup -
                                                                  qf2)**2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            input_tensor=self.log_ent_coef *
                            tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.compat.v1.train.AdamOptimizer(
                            learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(
                        input_tensor=self.ent_coef * logp_pi - qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss

                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi -
                                                self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean(
                        input_tensor=(value_fn - v_backup)**2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.compat.v1.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss,
                        var_list=tf_util.get_trainable_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.compat.v1.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    values_params = tf_util.get_trainable_vars(
                        'model/values_fn')

                    source_params = tf_util.get_trainable_vars(
                        "model/values_fn")
                    target_params = tf_util.get_trainable_vars(
                        "target/values_fn")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.compat.v1.assign(target, (1 - self.tau) * target +
                                            self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.compat.v1.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(
                            values_losses, var_list=values_params)

                        self.infos_names = [
                            'policy_loss', 'qf1_loss', 'qf2_loss',
                            'value_loss', 'entropy'
                        ]
                        # All ops to call during one training step
                        self.step_ops = [
                            policy_loss, qf1_loss, qf2_loss, value_loss, qf1,
                            qf2, value_fn, logp_pi, self.entropy,
                            policy_train_op, train_values_op
                        ]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(
                                    ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += [
                                    'ent_coef_loss', 'ent_coef'
                                ]
                                self.step_ops += [
                                    ent_coef_op, ent_coef_loss, self.ent_coef
                                ]

                    # Monitor losses and entropy in tensorboard
                    tf.compat.v1.summary.scalar('policy_loss', policy_loss)
                    tf.compat.v1.summary.scalar('qf1_loss', qf1_loss)
                    tf.compat.v1.summary.scalar('qf2_loss', qf2_loss)
                    tf.compat.v1.summary.scalar('value_loss', value_loss)
                    tf.compat.v1.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.compat.v1.summary.scalar('ent_coef_loss',
                                                    ent_coef_loss)
                        tf.compat.v1.summary.scalar('ent_coef', self.ent_coef)

                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = tf_util.get_trainable_vars("model")
                self.target_params = tf_util.get_trainable_vars(
                    "target/values_fn")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.compat.v1.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.compat.v1.summary.merge_all()