Example #1
        def HVP(v):
            # Hessian-vector product of the mean KL with the vector v,
            # built with nested tapes; damping * v is added at the end.
            with tf.GradientTape() as outer:
                with tf.GradientTape() as inner:
                    p = self.policy(sta)
                    d = tf.reduce_mean(
                        self.probability(tf.stop_gradient(p)).kl(
                            self.probability(p)))
                    g = flatten(
                        inner.gradient(d, self.policy.trainable_variables))

                x = tf.reduce_sum(g * v)
                g = flatten(outer.gradient(x, self.policy.trainable_variables))

            return g + v * damping
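The snippets on this page call a flatten helper that is not reproduced here. A minimal sketch of what such a helper could look like, assuming it simply concatenates a list of per-variable tensors into one flat vector with differentiable ops:

import tensorflow as tf

def flatten(tensors):
    # Concatenate a list of tensors (e.g. per-variable gradients or
    # trainable variables) into a single rank-1 vector; tf.reshape and
    # tf.concat are differentiable, so gradients can flow through it.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)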
Example #2
        def G():
            # Flat gradient of the (negative) surrogate objective
            # mean(exp(logπ_new - logπ_old) * advantage).
            with tf.GradientTape() as tape:
                nps = self.p(obs)

                nps_in_log = self.loglikelihood(acs, nps)
                ops_in_log = self.loglikelihood(acs, ops)

                x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)

            g = flatten(tape.gradient(x, self.p.trainable_variables))

            return g
Example #3
        def G():
            with tf.GradientTape() as tape:
                nps = self.policy(sta)

                ops_in_log = self.probability(ops).loglikelihood(act)
                nps_in_log = self.probability(nps).loglikelihood(act)

                x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)

            g = flatten(tape.gradient(x, self.policy.trainable_variables))

            return g
Example #4
        def HVP(v):
            # Hessian-vector product of the mean KL with v; the damping
            # term s = damping * v is added to the result.
            s = damping * v

            with tf.GradientTape() as outer:
                with tf.GradientTape() as inner:
                    p = self.p(obs)
                    d = tf.reduce_mean(self.kl(tf.stop_gradient(p), p))
                    g = inner.gradient(d, self.p.trainable_variables)

                v = reshape(g, v, None)
                x = tf.reduce_sum(
                    list(tf.reduce_sum(a * b) for (a, b) in zip(g, v)))
                g = flatten(outer.gradient(x, self.p.trainable_variables))

                return g + s
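Examples #4 and #5 also lean on a reshape helper with the call pattern reshape(reference, flat, assign); its definition is not shown. A plausible sketch, assuming it splits a flat vector into tensors shaped like the reference list and optionally writes them back into the variables (Example #6 appears to pass the arguments in the opposite order):

import tensorflow as tf

def reshape(reference, flat, assign=None):
    # Split the flat vector into chunks shaped like each tensor in
    # `reference`; if `assign` is truthy, write each chunk back into
    # the corresponding variable as well.
    chunks, offset = [], 0
    for ref in reference:
        size = int(tf.size(ref))
        chunk = tf.reshape(flat[offset:offset + size], tf.shape(ref))
        if assign:
            ref.assign(chunk)
        chunks.append(chunk)
        offset += size
    return chunks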
Example #5
    def calculate(self, obs, acs, ads, ops, δ=0.01, damping=0.001):
        # One TRPO-style update: surrogate gradient G, diagnostic loss L,
        # KL Hessian-vector product HVP, then conjugate gradient and a
        # backtracking line search.
        def G():
            with tf.GradientTape() as tape:
                nps = self.p(obs)

                nps_in_log = self.loglikelihood(acs, nps)
                ops_in_log = self.loglikelihood(acs, ops)

                x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)

            g = flatten(tape.gradient(x, self.p.trainable_variables))

            return g

        def L():
            # Diagnostic loss: surrogate term plus mean KL and entropy.
            nps = self.p(obs)

            nps_in_log = self.loglikelihood(acs, nps)
            ops_in_log = self.loglikelihood(acs, ops)

            x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)

            d = tf.reduce_mean(self.kl(ops, nps))
            e = tf.reduce_mean(self.entropy(nps))

            print('my losses: ', x, d, e)
            return x + d + e

        def HVP(v):
            s = damping * v

            with tf.GradientTape() as outer:
                with tf.GradientTape() as inner:
                    p = self.p(obs)
                    d = tf.reduce_mean(self.kl(tf.stop_gradient(p), p))
                    g = inner.gradient(d, self.p.trainable_variables)

                v = reshape(g, v, None)
                x = tf.reduce_sum(
                    list(tf.reduce_sum(a * b) for (a, b) in zip(g, v)))
                g = flatten(outer.gradient(x, self.p.trainable_variables))

                return g + s

        g = G()
        d = CG(HVP, -g)  # conjugate gradient: solve H d ≈ -g

        d1 = d[None, :]
        d2 = HVP(d)[:, None]

        d3 = d1 @ d2  # the scalar d^T H d as a 1x1 matrix

        β = np.sqrt(2 * δ / d3[0, 0])  # max step for the KL bound δ (tf.sqrt would also work)

        s = d * β  # full step in flat parameter space
        e = tf.experimental.numpy.dot(-g, s)  # expected improvement of the surrogate

        print('β', β)

        def η(θ):
            # Surrogate loss evaluated at the flat parameter vector θ,
            # which is first written back into the policy variables.
            reshape(self.p.trainable_variables, θ, True)
            nps = self.p(obs)

            nps_in_log = self.loglikelihood(acs, nps)
            ops_in_log = self.loglikelihood(acs, ops)

            x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)
            return x

        theta = flatten(self.p.trainable_variables)
        theta = bls(η, theta, s, e)[1]  # backtracking line search along s

        # reshape(self.p.trainable_variables, theta)
        print('after', η(theta))

        return G, L, HVP
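CG(HVP, -g) above (and cg in Example #6) refers to a conjugate-gradient solver defined elsewhere in these projects. A minimal sketch of such a solver, assuming it only needs the Hessian-vector-product callback and returns the flat direction x with H x ≈ b:

import tensorflow as tf

def CG(hvp, b, iters=10, tol=1e-8):
    # Plain conjugate gradient: approximately solve H x = b using only
    # Hessian-vector products hvp(p) ≈ H p.
    x = tf.zeros_like(b)
    r = tf.identity(b)          # residual b - H x (x starts at zero)
    p = tf.identity(b)          # current search direction
    r_dot = tf.reduce_sum(r * r)
    for _ in range(iters):
        hp = hvp(p)
        alpha = r_dot / tf.reduce_sum(p * hp)
        x = x + alpha * p
        r = r - alpha * hp
        new_r_dot = tf.reduce_sum(r * r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x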
Example #6
    def calculate(self, sta, act, adv, ops, δ=0.01, damping=0.001):
        # Same TRPO-style update as Example #5, written against a
        # policy/probability wrapper API.
        def G():
            with tf.GradientTape() as tape:
                nps = self.policy(sta)

                ops_in_log = self.probability(ops).loglikelihood(act)
                nps_in_log = self.probability(nps).loglikelihood(act)

                x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)

            g = flatten(tape.gradient(x, self.policy.trainable_variables))

            return g

        def L():
            nps = self.policy(sta)

            ops_in_log = self.probability(ops).loglikelihood(act)
            nps_in_log = self.probability(nps).loglikelihood(act)

            x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)

            d = tf.reduce_mean(self.probability(ops).kl(self.probability(nps)))
            e = tf.reduce_mean(self.probability(nps).entropy())

            print('my losses', x, d, e)
            return x + d + e

        def HVP(v):
            with tf.GradientTape() as outer:
                with tf.GradientTape() as inner:
                    p = self.policy(sta)
                    d = tf.reduce_mean(
                        self.probability(tf.stop_gradient(p)).kl(
                            self.probability(p)))
                    g = flatten(
                        inner.gradient(d, self.policy.trainable_variables))

                x = tf.reduce_sum(g * v)
                g = flatten(outer.gradient(x, self.policy.trainable_variables))

            return g + v * damping

        # With G, L and HVP in place, compute the actual update step.
        g = G()
        d = cg(HVP, -g)  # conjugate gradient: solve H d ≈ -g

        β = np.sqrt(2 * δ / np.abs(np.dot(d, HVP(d))))  # max step for the KL bound δ
        s = d * β
        e = np.dot(s, -g)  # expected improvement of the surrogate

        def η(theta):
            reshape(theta, self.policy.trainable_variables)

            nps = self.policy(sta)

            ops_in_log = self.probability(ops).loglikelihood(act)
            nps_in_log = self.probability(nps).loglikelihood(act)

            x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)

            return x

        theta = flatten(self.policy.trainable_variables)
        theta = bls(η, theta, s, δ, lambda arg: np.dot(arg, HVP(arg)), e)

        return G, L, HVP
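Both calculate variants end with a backtracking line search, bls, whose implementation is likewise not shown. The sketch below follows the call in Example #5 (bls(η, θ, s, e) returning a (success, new_θ) pair); the Example #6 variant takes extra arguments (δ and a curvature callback), so this is only an approximation of the idea:

def bls(loss_fn, theta, step, expected_improve,
        max_backtracks=10, accept_ratio=0.1):
    # Backtracking line search: shrink the step by halves until the
    # actual decrease of loss_fn is a reasonable fraction of the
    # expected (linearised) improvement.
    before = float(loss_fn(theta))
    for k in range(max_backtracks):
        frac = 0.5 ** k
        candidate = theta + frac * step
        improve = before - float(loss_fn(candidate))
        if improve > 0 and improve / (frac * float(expected_improve)) > accept_ratio:
            return True, candidate
    return False, theta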