def HVP(v):
    # Hessian-vector product of the mean KL(stop_grad(p) || p) with respect
    # to the policy parameters, via nested gradient tapes: the inner tape
    # yields the KL gradient g, the outer tape differentiates g·v.
    # NOTE(review): relies on closure variables `sta`, `damping` and helpers
    # `flatten`, `self.policy`, `self.probability` from an enclosing scope —
    # TODO confirm.
    with tf.GradientTape() as outer:
        with tf.GradientTape() as inner:
            p = self.policy(sta)
            # stop_gradient pins the "old" distribution so only the second
            # argument of the KL contributes to the gradient.
            d = tf.reduce_mean(
                self.probability(tf.stop_gradient(p)).kl(
                    self.probability(p)))
        # inner.gradient is taken inside the outer tape so that g (and g·v)
        # are recorded for the second differentiation.
        g = flatten(
            inner.gradient(d, self.policy.trainable_variables))
        x = tf.reduce_sum(g * v)
    g = flatten(outer.gradient(x, self.policy.trainable_variables))
    # Damping improves conditioning for the conjugate-gradient solver.
    return g + v * damping
def G():
    """Flattened gradient of the negated importance-sampled surrogate.

    Computes -E[exp(log p_new - log p_old) * advantage] under the current
    parameters of ``self.p`` and returns its gradient as one flat vector.
    Uses closure variables ``obs``, ``acs``, ``ads``, ``ops``.
    """
    with tf.GradientTape() as tape:
        new_params = self.p(obs)
        # log importance ratio between new and old policies
        log_ratio = (self.loglikelihood(acs, new_params)
                     - self.loglikelihood(acs, ops))
        loss = -tf.reduce_mean(tf.exp(log_ratio) * ads)
    return flatten(tape.gradient(loss, self.p.trainable_variables))
def G():
    """Flattened policy gradient of the negated surrogate objective.

    Evaluates -E[exp(log pi_new - log pi_old) * adv] for the current
    ``self.policy`` on ``sta`` and returns its gradient flattened into a
    single vector. Uses closure variables ``sta``, ``act``, ``adv``, ``ops``.
    """
    with tf.GradientTape() as tape:
        new_p = self.policy(sta)
        old_log = self.probability(ops).loglikelihood(act)
        new_log = self.probability(new_p).loglikelihood(act)
        surrogate = -tf.reduce_mean(tf.exp(new_log - old_log) * adv)
    grads = tape.gradient(surrogate, self.policy.trainable_variables)
    return flatten(grads)
def HVP(v):
    # Damped Hessian-vector product of the mean KL(stop_grad(p), p) w.r.t.
    # self.p's variables. NOTE(review): depends on closure variables `obs`,
    # `damping` and helpers `reshape`/`flatten` defined elsewhere — TODO
    # confirm their contracts.
    # The damping term must be captured BEFORE `v` is rebound below.
    s = damping * v
    with tf.GradientTape() as outer:
        with tf.GradientTape() as inner:
            p = self.p(obs)
            d = tf.reduce_mean(self.kl(tf.stop_gradient(p), p))
        # Per-variable KL gradients, recorded by the outer tape.
        g = inner.gradient(d, self.p.trainable_variables)
        # Rebind v: split the flat vector into chunks shaped like g —
        # presumably what reshape(g, v, None) does; verify against helper.
        v = reshape(g, v, None)
        # Dot product g·v accumulated variable-by-variable.
        x = tf.reduce_sum(
            list(tf.reduce_sum(a * b) for (a, b) in zip(g, v)))
    g = flatten(outer.gradient(x, self.p.trainable_variables))
    return g + s
def calculate(self, obs, acs, ads, ops, δ=0.01, damping=0.001):
    """Run one trust-region policy update on self.p and return the helpers.

    Args:
        obs: batch of observations fed to self.p.
        acs: actions taken (used for log-likelihoods).
        ads: advantage estimates weighting the surrogate objective.
        ops: old policy outputs (pre-update distribution parameters).
        δ: trust-region size (maximum KL step).
        damping: scalar damping added to the Hessian-vector product.

    Returns:
        (G, L, HVP): gradient, diagnostic-loss, and Hessian-vector-product
        closures; the parameter update itself happens as a side effect.
    """
    def G():
        # Flattened gradient of the negated importance-sampled surrogate
        # -E[exp(log p_new - log p_old) * ads] w.r.t. self.p's variables.
        with tf.GradientTape() as tape:
            nps = self.p(obs)
            nps_in_log = self.loglikelihood(acs, nps)
            ops_in_log = self.loglikelihood(acs, ops)
            x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)
        g = flatten(tape.gradient(x, self.p.trainable_variables))
        return g

    def L():
        # Diagnostic loss: surrogate + mean KL(old, new) + mean entropy.
        nps = self.p(obs)
        nps_in_log = self.loglikelihood(acs, nps)
        ops_in_log = self.loglikelihood(acs, ops)
        x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)
        d = tf.reduce_mean(self.kl(ops, nps))
        e = tf.reduce_mean(self.entropy(nps))
        print('my losses: ', x, d, e)
        return x + d + e

    def HVP(v):
        # Damped Hessian-vector product of the mean KL(stop_grad(p), p).
        # The damping term must be captured before `v` is rebound below.
        s = damping * v
        with tf.GradientTape() as outer:
            with tf.GradientTape() as inner:
                p = self.p(obs)
                d = tf.reduce_mean(self.kl(tf.stop_gradient(p), p))
            g = inner.gradient(d, self.p.trainable_variables)
            # split the flat v into per-variable chunks shaped like g
            v = reshape(g, v, None)
            x = tf.reduce_sum(
                list(tf.reduce_sum(a * b) for (a, b) in zip(g, v)))
        g = flatten(outer.gradient(x, self.p.trainable_variables))
        return g + s

    g = G()
    # Conjugate gradient solves H d = -g (natural-gradient direction).
    d = CG(HVP, -g)
    d1 = d[None, :]
    d2 = HVP(d)[:, None]
    # d3 is the 1x1 matrix holding d^T H d.
    d3 = d1 @ d2
    # NOTE(review): unlike the sibling implementation there is no np.abs()
    # guard here — a slightly negative d^T H d from numerical error would
    # make sqrt return NaN; confirm.
    β = np.sqrt(2 * δ / d3[0, 0])  # sqrt() tf.
    s = d * β
    # expected improvement of the surrogate along the step
    e = tf.experimental.numpy.dot(-g, s)
    print('β', β)

    def η(θ):
        # Surrogate loss at flat parameter vector θ. The reshape(..., True)
        # call writes θ back into self.p's variables as a side effect.
        reshape(self.p.trainable_variables, θ, True)
        nps = self.p(obs)
        nps_in_log = self.loglikelihood(acs, nps)
        ops_in_log = self.loglikelihood(acs, ops)
        x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * ads)
        return x

    theta = flatten(self.p.trainable_variables)
    # Backtracking line search; element [1] is presumably the accepted θ.
    theta = bls(η, theta, s, e)[1]
    # $reshape( self.p.trainable_variables, theta)
    # NOTE(review): the explicit write-back above is commented out; the
    # parameters are instead set by the η(theta) side effect on the next
    # line — confirm this is intentional.
    print('after', η(theta))
    return G, L, HVP
def calculate(self, sta, act, adv, ops, δ=0.01, damping=0.001):
    """Perform one trust-region policy update and return the helper closures.

    Args:
        sta: batch of states fed to self.policy.
        act: actions taken (used for log-likelihoods).
        adv: advantage estimates weighting the surrogate objective.
        ops: old policy outputs (pre-update distribution parameters).
        δ: trust-region size (maximum KL step), default 0.01.
        damping: scalar added to the Hessian-vector product for CG stability.

    Returns:
        (G, L, HVP): gradient, diagnostic-loss, and Hessian-vector-product
        closures; the parameter update happens as a side effect via bls/η.
    """
    def G():
        # Flattened gradient of the negated importance-sampled surrogate
        # -E[exp(log pi_new - log pi_old) * adv] w.r.t. policy variables.
        with tf.GradientTape() as tape:
            nps = self.policy(sta)
            ops_in_log = self.probability(ops).loglikelihood(act)
            nps_in_log = self.probability(nps).loglikelihood(act)
            x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)
        g = flatten(tape.gradient(x, self.policy.trainable_variables))
        return g

    def L():
        # Diagnostic loss: surrogate + mean KL(old, new) + mean entropy.
        nps = self.policy(sta)
        ops_in_log = self.probability(ops).loglikelihood(act)
        nps_in_log = self.probability(nps).loglikelihood(act)
        x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)
        d = tf.reduce_mean(self.probability(ops).kl(self.probability(nps)))
        e = tf.reduce_mean(self.probability(nps).entropy())
        print('my losses', x, d, e)
        return x + d + e

    def HVP(v):
        # Damped Hessian-vector product of mean KL(stop_grad(p) || p):
        # the inner tape builds the KL gradient g, the outer tape
        # differentiates g·v a second time.
        with tf.GradientTape() as outer:
            with tf.GradientTape() as inner:
                p = self.policy(sta)
                d = tf.reduce_mean(
                    self.probability(tf.stop_gradient(p)).kl(
                        self.probability(p)))
            g = flatten(
                inner.gradient(d, self.policy.trainable_variables))
            x = tf.reduce_sum(g * v)
        g = flatten(outer.gradient(x, self.policy.trainable_variables))
        return g + v * damping

    # BUG FIX: the original had `return G, L, HVP` right here, which made
    # the entire natural-gradient step and line search below unreachable
    # dead code (the sibling implementation runs the update first).
    g = G()
    # Conjugate gradient solves H d = -g for the natural-gradient direction.
    d = cg(HVP, -g)
    # Scale the step so the quadratic KL estimate matches the trust region δ;
    # np.abs guards against a slightly negative d^T H d from numerical error.
    β = np.sqrt(2 * δ / np.abs(np.dot(d, HVP(d))))
    s = d * β
    # expected improvement of the surrogate along the step
    e = np.dot(s, -g)

    def η(theta):
        # Surrogate loss at flat parameter vector `theta`: writes theta back
        # into the policy variables, then re-evaluates the surrogate.
        reshape(theta, self.policy.trainable_variables)
        nps = self.policy(sta)
        ops_in_log = self.probability(ops).loglikelihood(act)
        nps_in_log = self.probability(nps).loglikelihood(act)
        x = -tf.reduce_mean(tf.exp(nps_in_log - ops_in_log) * adv)
        return x

    theta = flatten(self.policy.trainable_variables)
    # Backtracking line search along s, with the KL constraint callable and
    # the expected improvement e as acceptance criteria.
    theta = bls(η, theta, s, δ, lambda arg: np.dot(arg, HVP(arg)), e)
    return G, L, HVP