Example #1
    def apply(
        self,
        x,
        action_dim,
        max_action,
        key=None,
        MPO=False,
        sample=False,
        log_sig_min=-20,
        log_sig_max=2,
    ):
        x = nn.Dense(x, features=200)
        x = nn.LayerNorm(x)
        x = nn.tanh(x)
        x = nn.Dense(x, features=200)
        x = nn.elu(x)
        x = nn.Dense(x, features=2 * action_dim)

        mu, log_sig = jnp.split(x, 2, axis=-1)
        log_sig = nn.softplus(log_sig)
        log_sig = jnp.clip(log_sig, log_sig_min, log_sig_max)

        if MPO:
            return mu, log_sig

        if not sample:
            return max_action * nn.tanh(mu), log_sig
        else:
            # Reparameterized sample from N(mu, exp(log_sig)^2), squashed by tanh.
            pi = mu + random.normal(key, mu.shape) * jnp.exp(log_sig)
            log_pi = gaussian_likelihood(pi, mu, log_sig)
            pi = nn.tanh(pi)
            # Change-of-variables correction to the log-density for the tanh squashing.
            log_pi -= jnp.sum(jnp.log(nn.relu(1 - pi ** 2) + 1e-6), axis=1)
            return max_action * pi, log_pi
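
All of the examples on this page call a `gaussian_likelihood` helper that is not shown. For the JAX code in Example #1, a minimal sketch of such a helper could look like the following: the standard diagonal-Gaussian log-density summed over the action dimension, with the third argument taken to be the log standard deviation (as in the call above). The exact signature and the 1e-6 stabilizer are assumptions, not the original implementation.

import jax.numpy as jnp

def gaussian_likelihood(sample, mu, log_sig):
    # Log-density of a diagonal Gaussian N(mu, exp(log_sig)^2),
    # summed over the last (action) axis.
    # The 1e-6 term is a numerical-stability assumption.
    pre_sum = -0.5 * (
        ((sample - mu) / (jnp.exp(log_sig) + 1e-6)) ** 2
        + 2.0 * log_sig
        + jnp.log(2.0 * jnp.pi)
    )
    return jnp.sum(pre_sum, axis=-1)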
Example #2
    def _create_continuous_trainer(self, optimizer=tf.train.AdamOptimizer()):
        """
        Creates a function for vanilla policy training with a continuous action space
        """
        self.act_holders = tf.placeholder(
            tf.float32, shape=[None, self.out_op.shape[1].value])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])

        self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value),
                               dtype=tf.float32)
        self.out_act = self.out_op + tf.random_normal(
            tf.shape(self.out_op), dtype=tf.float32) * self.std

        self.log_probs = gaussian_likelihood(self.act_holders, self.out_op,
                                             self.std)

        self.loss = -tf.reduce_mean(self.log_probs * self.reward_holders)

        self.optimizer = optimizer
        self.update = self.optimizer.minimize(self.loss)

        update_func = lambda train_data: self.sess.run(
            self.update,
            feed_dict={
                self.in_op: reshape_train_var(train_data[:, 0]),
                self.act_holders: reshape_train_var(train_data[:, 1]),
                self.reward_holders: train_data[:, 2]
            })

        self.sess.run(tf.global_variables_initializer())

        return update_func
Example #3
    def train(self, state, action, reward, next_state, done):
        # Targets are computed outside the GradientTape, so no gradients flow
        # through the target critics.
        next_mu, next_log_std = self.actor(next_state)
        next_action = tf.random.normal(tf.shape(next_mu), next_mu,
                                       tf.math.exp(next_log_std))
        next_log_prob = gaussian_likelihood(next_action, next_mu, next_log_std)
        next_target_q1 = self.target_critic1(next_state, next_action)
        next_target_q2 = self.target_critic2(next_state, next_action)
        min_next_target_q = tf.math.minimum(
            next_target_q1, next_target_q2) - self.alpha * next_log_prob
        target_q = reward + (1. - done) * self.gamma * min_next_target_q
        with tf.GradientTape(persistent=True) as tape:
            # Critic
            q1 = self.critic1(state, action)
            q2 = self.critic2(state, action)

            critic1_loss = tf.reduce_mean(tf.keras.losses.mse(target_q, q1))
            critic2_loss = tf.reduce_mean(tf.keras.losses.mse(target_q, q2))

            # Actor
            min_q = tf.math.minimum(q1, q2)
            mu, log_std = self.actor(state)
            log_prob = gaussian_likelihood(action, mu, log_std)
            actor_loss = tf.reduce_mean(self.alpha * log_prob - min_q)

            # Alpha
            alpha_loss = -tf.reduce_mean(self.log_alpha * log_prob)

        critic1_grads = tape.gradient(critic1_loss,
                                      self.critic1.trainable_weights)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_grads, self.critic1.trainable_weights))

        critic2_grads = tape.gradient(critic2_loss,
                                      self.critic2.trainable_weights)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_grads, self.critic2.trainable_weights))

        actor_grads = tape.gradient(actor_loss, self.actor.trainable_weights)
        self.actor_optimizer.apply_gradients(
            zip(actor_grads, self.actor.trainable_weights))

        if self.use_dynamic_alpha:
            alpha_grad = tape.gradient(alpha_loss, self.log_alpha)
            self.alpha_optimizer.apply_gradients([(alpha_grad, self.log_alpha)])

        return actor_loss, critic1_loss, critic2_loss, alpha_loss
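
The TensorFlow examples assume the same kind of helper. A hedged TF2 sketch matching the call in Example #3 (sample, mean, log-std) is shown below; note that Examples #2 and #6 pass a standard deviation rather than its log as the third argument, so the convention of the original helper may differ.

import numpy as np
import tensorflow as tf

def gaussian_likelihood(x, mu, log_std):
    # Diagonal-Gaussian log-density, summed over the action axis.
    # Signature (sample, mean, log_std) is assumed from Example #3.
    pre_sum = -0.5 * (
        ((x - mu) / (tf.exp(log_std) + 1e-8)) ** 2
        + 2.0 * log_std
        + np.log(2.0 * np.pi)
    )
    return tf.reduce_sum(pre_sum, axis=-1)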
Example #4
    def loss_fn(mlo, slo, actor):
        mu, log_sig = actor(state, MPO=True)
        sig = jnp.exp(log_sig)
        target_mu, target_log_sig = actor_target(state, MPO=True)
        target_sig = jnp.exp(target_log_sig)

        actor_log_prob = gaussian_likelihood(sampled_actions, target_mu, sig)
        actor_log_prob += gaussian_likelihood(sampled_actions, mu, target_sig)
        actor_log_prob = actor_log_prob.transpose((0, 1))

        mu, target_mu = nn.tanh(mu), nn.tanh(target_mu)

        reg_mu = eps_mu - kl_mvg_diag(target_mu, target_sig, mu, target_sig).mean()
        reg_sig = eps_sig - kl_mvg_diag(target_mu, target_sig, target_mu, sig).mean()

        mlo = lagrange_step(mlo, reg_mu)
        slo = lagrange_step(slo, reg_sig)

        actor_loss = -(actor_log_prob[:, None] * weights).sum(axis=1).mean()
        actor_loss -= mu_lagrange_optimizer.target() * reg_mu
        actor_loss -= sig_lagrange_optimizer.target() * reg_sig
        return actor_loss.mean(), (mlo, slo)
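
Example #4 additionally relies on a `kl_mvg_diag` helper for the KL divergence between two diagonal Gaussians. A sketch of the standard closed form is given below; the argument order (mu_1, sig_1, mu_2, sig_2) is inferred from the call sites above, and the helper in the original code may reduce over a different axis.

import jax.numpy as jnp

def kl_mvg_diag(mu_1, sig_1, mu_2, sig_2):
    # KL( N(mu_1, diag(sig_1^2)) || N(mu_2, diag(sig_2^2)) ),
    # summed over the action dimension (standard closed form).
    var_1, var_2 = sig_1 ** 2, sig_2 ** 2
    kl = (
        jnp.log(sig_2 / sig_1)
        + (var_1 + (mu_1 - mu_2) ** 2) / (2.0 * var_2)
        - 0.5
    )
    return jnp.sum(kl, axis=-1)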
Example #5
                                      kernel_initializer=initializationHidden,
                                      name="fc{}".format(i + 2))(curNode)

        actionMeanOp = tf.layers.Dense(
            outputLength,
            kernel_initializer=initializationFinalPolicy,
            name="outputA")(curNode)
        actionLogStdOp = tf.get_variable(
            name="ActionsLogStdDetachedTrainable",
            initializer=-0.3 * np.ones((1, outputLength), dtype=np.float32),
            trainable=True)
        actionStdOp = tf.math.exp(actionLogStdOp)
        actionFinalOp = actionMeanOp + tf.random_normal(
            tf.shape(actionMeanOp)) * actionStdOp
        sampledLogProbsOp = utils.gaussian_likelihood(actionFinalOp,
                                                      actionMeanOp,
                                                      actionLogStdOp)
        logProbWithCurrParamsOp = utils.gaussian_likelihood(
            aPh, actionMeanOp, actionLogStdOp)

    # Definition of the losses to optimize
    ratio = tf.exp(logProbWithCurrParamsOp - logProbSampPh)
    Lloss = -tf.reduce_mean(
        ratio * advPh)  # negative sign because we maximize the surrogate objective

    if args.val_eps > 0:
        vLossUnclipped = (vfOutputOp - totalEstimatedDiscountedRewardPh)**2
        vClipped = VPrevPh + tf.clip_by_value(vfOutputOp - VPrevPh,
                                              -args.val_eps, args.val_eps)
        vLossClipped = (vClipped - totalEstimatedDiscountedRewardPh)**2
        vLossMax = tf.maximum(vLossClipped, vLossUnclipped)
Example #6
    def _create_continuous_trainer(self):
        """
        Creates a function for vanilla policy training with a continuous action space
        """
        # First pass: ops run once per batch to collect the old log-probs and advantages

        self.act_holders = tf.placeholder(
            tf.float32, shape=[None, self.out_op.shape[1].value])
        self.reward_holders = tf.placeholder(tf.float32, shape=[None])

        self.std = tf.Variable(0.5 * np.ones(shape=self.out_op.shape[1].value),
                               dtype=tf.float32)
        self.out_act = self.out_op + tf.random_normal(
            tf.shape(self.out_op), dtype=tf.float32) * self.std

        self.log_probs = gaussian_likelihood(self.act_holders, self.out_op,
                                             self.std)

        self.advantages = self.reward_holders - tf.squeeze(self.value_out_op)

        # Second pass: ops for the clipped PPO updates, fed with the old log-probs and advantages

        self.advatange_holders = tf.placeholder(dtype=tf.float32,
                                                shape=self.advantages.shape)
        self.old_prob_holders = tf.placeholder(dtype=tf.float32,
                                               shape=self.log_probs.shape)

        self.policy_ratio = tf.exp(self.log_probs - self.old_prob_holders)
        self.clipped_ratio = tf.clip_by_value(self.policy_ratio,
                                              1 - self.clip_val,
                                              1 + self.clip_val)

        self.min_loss = tf.minimum(self.policy_ratio * self.advatange_holders,
                                   self.clipped_ratio * self.advatange_holders)

        self.optimizer = tf.train.AdamOptimizer()

        # Actor update

        self.kl_divergence = tf.reduce_mean(self.old_prob_holders -
                                            self.log_probs)
        self.actor_loss = -tf.reduce_mean(self.min_loss)
        self.actor_update = self.optimizer.minimize(self.actor_loss)

        # Value update

        self.value_loss = tf.reduce_mean(
            tf.square(self.reward_holders - tf.squeeze(self.value_out_op)))
        self.value_update = self.optimizer.minimize(self.value_loss)

        # Combined update

        self.entropy = -0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e * self.std))
        self.combined_loss = (self.actor_loss + self.v_coef * self.value_loss +
                              self.entropy_coef * self.entropy)
        self.combined_update = self.optimizer.minimize(self.combined_loss)

        def update_func(train_data):
            self.old_probs, self.old_advantages = self.sess.run(
                [self.log_probs, self.advantages],
                feed_dict={
                    self.in_op: reshape_train_var(train_data[:, 0]),
                    self.act_holders: reshape_train_var(train_data[:, 1]),
                    self.reward_holders: train_data[:, 2]
                })

            for i in range(self.ppo_iters):
                kl_div, _ = self.sess.run(
                    [self.kl_divergence, self.combined_update],
                    feed_dict={
                        self.in_op: reshape_train_var(train_data[:, 0]),
                        self.act_holders: reshape_train_var(train_data[:, 1]),
                        self.reward_holders: train_data[:, 2],
                        self.old_prob_holders: self.old_probs,
                        self.advatange_holders: self.old_advantages
                    })
                if kl_div > 1.5 * self.target_kl:
                    break

            return kl_div, self.sess.run(self.entropy)

        self.sess.run(tf.global_variables_initializer())

        return update_func
Example #7
    def _createDefault(self):
        with tf.variable_scope("PolicyNetworkContinuous{}".format(
                self.suffix)):

            if not self.orthogonalInitializtion:
                curNode = tf.layers.Dense(
                    self.hiddenLayers[0],
                    self.hiddenLayerActivations[0],
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="fc1")(self.input)
                #curNode = tf.contrib.layers.layer_norm(curNode)
                for i, l in enumerate(self.hiddenLayers[1:]):
                    curNode = tf.layers.Dense(
                        l,
                        self.hiddenLayerActivations[i + 1],
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer(),
                        name="fc{}".format(i + 2))(curNode)
                    #curNode = tf.contrib.layers.layer_norm(curNode)
                self.actionMean = tf.layers.Dense(
                    self.outputLength,
                    self.hiddenLayerActivations[-1],
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="ActionsMean")(curNode)
            else:
                curNode = tf.layers.Dense(
                    self.hiddenLayers[0],
                    self.hiddenLayerActivations[0],
                    kernel_initializer=tf.orthogonal_initializer(
                        self.orthogonalInitializtion[0]),
                    name="fc1")(self.input)
                #curNode = tf.contrib.layers.layer_norm(curNode)
                for i, l in enumerate(self.hiddenLayers[1:]):
                    curNode = tf.layers.Dense(
                        l,
                        self.hiddenLayerActivations[i + 1],
                        kernel_initializer=tf.orthogonal_initializer(
                            self.orthogonalInitializtion[i + 1]),
                        name="fc{}".format(i + 2))(curNode)
                    #curNode = tf.contrib.layers.layer_norm(curNode)
                self.actionMean = tf.layers.Dense(
                    self.outputLength,
                    self.hiddenLayerActivations[-1],
                    kernel_initializer=tf.orthogonal_initializer(
                        self.orthogonalInitializtion[-1]),
                    name="ActionsMean")(curNode)

            if (self.actionMeanScale is not None):
                assert (self.actionMeanScale.shape == (1, self.outputLength))
                self.actionMean = self.actionMean * self.actionMeanScale

            # Logic for the noise that is added to the action mean
            if self.logStdInit is not None:
                assert (self.logStdInit.shape == (1, self.outputLength))
                self.actionLogStd = tf.get_variable(
                    name="ActionsLogStdDetached{}Trainable".format(
                        "" if self.logStdTrainable else "Non"),
                    initializer=self.logStdInit,
                    trainable=self.logStdTrainable)
            else:
                if not self.orthogonalInitializtion:
                    self.actionLogStd = tf.layers.Dense(
                        self.outputLength,
                        kernel_initializer=tf.contrib.layers.
                        xavier_initializer(),
                        name="ActionsLogStd")(curNode)
                else:
                    self.actionLogStd = tf.layers.Dense(
                        self.outputLength,
                        kernel_initializer=tf.orthogonal_initializer(
                            self.orthogonalInitializtion[-1]),
                        name="ActionsLogStd")(curNode)

            if self.clipLogStd is not None:
                self.actionLogStd = tf.clip_by_value(
                    self.actionLogStd,
                    self.clipLogStd[0],
                    self.clipLogStd[1],
                    name="ClipedActionsLogStd")

            # Here we actually add the noise
            if self.actionLogStd is not None:
                self.actionStd = tf.math.exp(self.actionLogStd)
                self.actionRaw = self.actionMean + tf.random_normal(
                    tf.shape(self.actionMean)) * self.actionStd
            else:
                self.actionRaw = self.actionMean

            # Action clipping
            if self.actionClip is not None:
                assert (self.actionClip.shape == (2, self.outputLength))
                self.actionFinal = tf.clip_by_value(self.actionRaw,
                                                    self.actionClip[0, :],
                                                    self.actionClip[1, :])
            else:
                self.actionFinal = self.actionRaw

            # If noise is added to the action mean, define the ops for the action log-probabilities
            if self.actionLogStd is not None:
                self.sampledLogProbs = utils.gaussian_likelihood(
                    self.actionFinal, self.actionMean, self.actionLogStd)
                self.logProbWithCurrParams = utils.gaussian_likelihood(
                    self.actions, self.actionMean, self.actionLogStd
                )  # joint log-probability (all Gaussian action components) of the action given the observation, both fed via placeholders