def test_op_2(self):
  rnd = np.random.RandomState(0)
  with tf.Graph().as_default():
    logits = tf.constant(rnd.uniform(0.0, 1.0, [2, 2]), dtype=tf.float32)
    logits_v = tf.constant(rnd.uniform(0.0, 1.0, [2, 2]), dtype=tf.float32)
    r = tf.Variable(0.0, dtype=tf.float32)
    logits = logits_v * r + logits
    t = tf.constant([1, 0])
    t = tf.one_hot(t, 2, dtype=tf.float32)
    # loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    #     logits=logits, labels=y)
    y = tf.nn.softmax(logits)
    # Cross-entropy-style loss; the sign is irrelevant for the gradient
    # comparison below.
    loss = t * tf.log(y + 1e-5)
    loss = tf.reduce_sum(loss)
    # Collect the softmax nodes for inspection (not used in the assertion).
    g = tf.get_default_graph()
    node_list = g.as_graph_def().node
    inspect = set(["Softmax", "SoftmaxGrad"])
    node_list = list(filter(lambda x: x.op in inspect, node_list))
    grad_fw = forward_gradients(loss, r, gate_gradients=True)
    grad_bk = tf.gradients(loss, r)[0]
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      grad_bk_val = sess.run(grad_bk)
      grad_fw_val = sess.run(grad_fw)
      np.testing.assert_allclose(grad_fw_val, grad_bk_val, rtol=1e-5)
def test_convnet(self):
  with tf.Graph().as_default():
    # Define model.
    r = tf.Variable(1.0)
    x = tf.constant(
        np.random.uniform(-1.0, 1.0, [1, 5, 5, 2]), dtype=tf.float32)
    w = tf.constant(
        np.random.uniform(-1.0, 1.0, [2, 2, 2, 3]), dtype=tf.float32)
    h = tf.nn.conv2d(r + x, r * w, [1, 1, 1, 1], "SAME")
    h = tf.nn.max_pool(h, [1, 3, 3, 1], [1, 2, 2, 1], "SAME")
    h = tf.nn.relu(h)

    # First branch.
    w2 = tf.constant(
        np.random.uniform(-1.0, 1.0, [27, 1]), dtype=tf.float32)
    h2 = tf.matmul(tf.reshape(h, [1, -1]), w2)
    y2 = tf.nn.tanh(h2)
    y2 = tf.reduce_sum(y2)

    # Second branch.
    w3 = tf.constant(
        np.random.uniform(-1.0, 1.0, [27, 1]), dtype=tf.float32)
    h3 = tf.matmul(tf.reshape(h, [1, -1]), w3)
    y3 = tf.nn.sigmoid(h3)
    y3 = tf.reduce_sum(y3)

    # Take gradients of a list of ys wrt. the scalar r.
    # Returns [grad_y2_r, grad_y3_r].
    grad_fw = forward_gradients([y2, y3], r, gate_gradients=True)
    # Reverse-mode implementation from TensorFlow.
    grad_bk = [tf.gradients(y2, r)[0], tf.gradients(y3, r)[0]]

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      grad_fw_val = sess.run(grad_fw)
      grad_bk_val = sess.run(grad_bk)
      np.testing.assert_allclose(grad_fw_val, grad_bk_val, rtol=1e-5)
def make_unit_graph(self, x, y, rnd=None, dtype=tf.float32):
  """Makes a computation graph that computes (J^T r)^T v and r^T (J v)."""
  if rnd is None:
    rnd = np.random.RandomState(0)
  x_shape = [int(ss) for ss in x.get_shape()]
  v = self.get_random_tensor(x_shape, rnd=rnd)
  y_shape = [int(ss) for ss in y.get_shape()]
  r = self.get_random_tensor(y_shape, rnd=rnd)
  # Reverse mode: J^T r, then inner product with v.
  jt_r = tf.gradients(y, [x], r, gate_gradients=True)
  jt_r_t_v = self.inner_prod(jt_r, [v])
  # Forward mode: J v, then inner product with r.
  j_v = forward_gradients(y, [x], [v], gate_gradients=True)
  r_t_j_v = tf.reduce_sum(r * j_v)
  return jt_r_t_v, r_t_j_v
def test_manual(self):
  with tf.Graph().as_default(), tf.device("/cpu:0"):
    with self.test_session() as sess:
      x_val = np.random.uniform(0, 1)
      x = tf.constant(x_val)
      y = tf.tanh(x)
      dy_dx = forward_gradients(y, x, gate_gradients=True)
      dy_dx_tf = sess.run(dy_dx)
      # Central finite difference.
      eps = 1e-5
      x_val = x_val - eps
      y_val_1 = np.tanh(x_val)
      x_val = x_val + 2 * eps
      y_val_2 = np.tanh(x_val)
      dy_dx_fd = (y_val_2 - y_val_1) / (2 * eps)
      np.testing.assert_allclose(dy_dx_tf, dy_dx_fd, rtol=1e-5)
def test_grad_graph(self):
  with tf.Graph().as_default():
    # Dummy variable.
    r = tf.Variable(1.0)
    # Input.
    x = tf.constant(
        np.random.uniform(-1.0, 1.0, [1, 5, 5, 2]), dtype=tf.float32,
        name="x")
    # First convolution.
    v = tf.constant(
        np.random.uniform(-1.0, 1.0, [2, 2, 2, 3]), dtype=tf.float32,
        name="v")
    w = tf.constant(
        np.random.uniform(-1.0, 1.0, [2, 2, 2, 3]), dtype=tf.float32,
        name="w")
    wv = w + r * v
    h = tf.nn.conv2d(x, wv, [1, 1, 1, 1], "SAME")
    h = tf.nn.max_pool(h, [1, 3, 3, 1], [1, 2, 2, 1], "SAME")
    h = tf.nn.relu(h)
    # Second convolution.
    v_ = tf.constant(
        np.random.uniform(-1.0, 1.0, [2, 2, 3, 3]), dtype=tf.float32,
        name="v_")
    w_ = tf.constant(
        np.random.uniform(-1.0, 1.0, [2, 2, 3, 3]), dtype=tf.float32,
        name="w_")
    w_v = w_ + r * v_
    h = tf.nn.conv2d(h, w_v, [1, 1, 1, 1], "SAME")
    # Fully connected.
    w2 = tf.constant(
        np.random.uniform(-1.0, 1.0, [27, 1]), dtype=tf.float32, name="w2")
    h2 = tf.matmul(tf.reshape(h, [1, -1]), w2)
    y2 = tf.nn.sigmoid(h2)
    y2 = tf.reduce_sum(y2)
    # Run forward mode on top of the reverse-mode gradient graph.
    grad_bk = tf.gradients(y2, [w, w_], gate_gradients=True)
    grad_fw = forward_gradients(grad_bk, r, gate_gradients=True)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(grad_fw)
def fisher_vec_fw(ys, xs, vs):
  """Implements the Fisher vector product using forward then reverse AD.

  Args:
    ys: Loss function or output variables.
    xs: Weights, list of tensors.
    vs: List of tensors to multiply, for each weight tensor.

  Returns:
    J'Jv: Fisher vector product.
  """
  # Validate the input.
  if type(xs) == list:
    if len(vs) != len(xs):
      raise ValueError("xs and vs must have the same length.")
  # Jv: forward mode.
  jv = forward_gradients(ys, xs, vs, gate_gradients=True)
  # J'(Jv): reverse mode.
  jjv = tf.gradients(ys, xs, jv, gate_gradients=True)
  return jjv
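# Usage sketch for fisher_vec_fw (illustrative; the helper name is
# hypothetical): for a linear map y = X w, the Jacobian wrt. w is X up to
# reshaping, so J'Jv reduces to X'Xv, which can be checked against NumPy
# directly. Relies on the same TF 1.x graph/session API and module-level
# imports used throughout this file.
def _example_fisher_vec_fw():
  rnd = np.random.RandomState(0)
  with tf.Graph().as_default():
    x_val = rnd.randn(4, 3).astype(np.float32)
    x = tf.constant(x_val)
    w = tf.Variable(tf.zeros([3, 1]))
    v_val = np.ones([3, 1], dtype=np.float32)
    v = tf.constant(v_val)
    y = tf.matmul(x, w)  # Jacobian of y wrt. w is X (up to reshaping).
    jjv = fisher_vec_fw(y, [w], [v])
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      # Expect X'Xv.
      np.testing.assert_allclose(
          sess.run(jjv)[0], x_val.T.dot(x_val).dot(v_val), rtol=1e-4)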
def hessian_vec_fw(ys, xs, vs, grads=None):
  """Implements the Hessian vector product using forward-on-backward AD.

  Args:
    ys: Loss function.
    xs: Weights, list of tensors.
    vs: List of tensors to multiply, for each weight tensor.
    grads: Optional, precomputed gradients of ys wrt. xs.

  Returns:
    Hv: Hessian vector product, same size, same shape as xs.
  """
  # Validate the input.
  if type(xs) == list:
    if len(vs) != len(xs):
      raise ValueError("xs and vs must have the same length.")
  if grads is None:
    grads = tf.gradients(ys, xs, gate_gradients=True)
  return forward_gradients(grads, xs, vs, gate_gradients=True)
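# Usage sketch for hessian_vec_fw (illustrative; the helper name is
# hypothetical): for the quadratic loss sum(x^2) the Hessian is 2I, so the
# Hessian-vector product must equal 2v exactly. Written in the TF 1.x style
# of the tests above.
def _example_hessian_vec_fw():
  with tf.Graph().as_default():
    x = tf.Variable([1.0, 2.0, 3.0])
    v = tf.constant([0.5, -1.0, 2.0])
    loss = tf.reduce_sum(tf.square(x))
    hv = hessian_vec_fw(loss, [x], [v])
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      # Expect 2 * v.
      np.testing.assert_allclose(sess.run(hv)[0], [1.0, -2.0, 4.0], rtol=1e-5)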
def fisher_vec_z(ys, xs, vs):
  """Implements JJ'v, where v is in the output space.

  Args:
    ys: Loss function or output variables.
    xs: Weights, list of tensors.
    vs: List of tensors to multiply, one for each output tensor.

  Returns:
    JJ'v: Fisher vector product in the output space.
  """
  # Validate the input.
  if type(ys) == list:
    if len(vs) != len(ys):
      raise ValueError("ys and vs must have the same length.")
  # J'v: reverse mode.
  jv = tf.gradients(ys, xs, vs, gate_gradients=True)
  # J(J'v): forward mode.
  jjv = forward_gradients(ys, xs, jv, gate_gradients=True)
  return jjv
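# Usage sketch for fisher_vec_z (illustrative; the helper name is
# hypothetical): the multiplier v now lives in the output space, so it takes
# the shape of y rather than w. For y = X w, JJ'v reduces to XX'v. Same
# assumptions as the sketch above.
def _example_fisher_vec_z():
  rnd = np.random.RandomState(0)
  with tf.Graph().as_default():
    x_val = rnd.randn(4, 3).astype(np.float32)
    x = tf.constant(x_val)
    w = tf.Variable(tf.zeros([3, 1]))
    v_val = np.ones([4, 1], dtype=np.float32)  # Output-space shape.
    v = tf.constant(v_val)
    y = tf.matmul(x, w)
    jjv = fisher_vec_z(y, [w], [v])
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      # Expect XX'v.
      np.testing.assert_allclose(
          sess.run(jjv), x_val.dot(x_val.T).dot(v_val), rtol=1e-4)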
def gauss_newton_vec_z(ys, zs, xs, vs):
  """Implements HJJ'v, where v is in the output space.

  Args:
    ys: Loss function or output variables.
    zs: Before output layer (input to softmax).
    xs: Weights, list of tensors.
    vs: List of tensors to multiply, one for each output tensor.

  Returns:
    HJJ'v: Gauss-Newton vector product in the output space.

  See the usage sketch after gauss_newton_vec below.
  """
  # Validate the input.
  if type(zs) == list:
    if len(vs) != len(zs):
      raise ValueError("zs and vs must have the same length.")
  grads_z = tf.gradients(ys, zs, gate_gradients=True)
  # J'v: reverse mode.
  jv = tf.gradients(zs, xs, vs, gate_gradients=True)
  # H(JJ'v): forward mode on the gradient wrt. z.
  hjjv = forward_gradients(grads_z, xs, jv, gate_gradients=True)
  return hjjv
def gauss_newton_vec(ys, zs, xs, vs):
  """Implements the Gauss-Newton vector product.

  Args:
    ys: Loss function.
    zs: Before output layer (input to softmax).
    xs: Weights, list of tensors.
    vs: List of perturbation vectors, one for each weight tensor.

  Returns:
    J'HJv: Gauss-Newton vector product.
    HJv: Intermediate product, useful for reuse.
  """
  # Validate the input.
  if type(xs) == list:
    if len(vs) != len(xs):
      raise ValueError("xs and vs must have the same length.")
  grads_z = tf.gradients(ys, zs, gate_gradients=True)
  # HJv: forward mode on the gradient wrt. z.
  hjv = forward_gradients(grads_z, xs, vs, gate_gradients=True)
  # J'(HJv): reverse mode.
  jhjv = tf.gradients(zs, xs, hjv, gate_gradients=True)
  return jhjv, hjv
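# Usage sketch for the two Gauss-Newton products above (illustrative; the
# helper name is hypothetical): when the pre-activation z = X w is linear in
# the weights, the Gauss-Newton matrix J'HJ coincides with the full Hessian
# of the loss, so gauss_newton_vec should agree with hessian_vec_fw. A
# sigmoid-based loss is used because test_grad_graph already exercises
# forward mode through the sigmoid gradient ops; this is a sketch, not a
# guarantee that every op combination is supported.
def _example_gauss_newton_vec():
  rnd = np.random.RandomState(0)
  with tf.Graph().as_default():
    x = tf.constant(rnd.randn(4, 3).astype(np.float32))
    w = tf.Variable(rnd.randn(3, 2).astype(np.float32))
    v = tf.constant(rnd.randn(3, 2).astype(np.float32))
    z = tf.matmul(x, w)  # Linear in w, so J'HJ equals the Hessian.
    loss = tf.reduce_sum(tf.nn.sigmoid(z))
    jhjv, _ = gauss_newton_vec(loss, z, [w], [v])
    hv = hessian_vec_fw(loss, [w], [v])
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      jhjv_val, hv_val = sess.run([jhjv, hv])
      np.testing.assert_allclose(jhjv_val[0], hv_val[0], rtol=1e-4)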
def test_forward_mode_cnn(self):
  """Test r^T (J v) = (J^T r)^T v."""
  rnd = np.random.RandomState(0)
  dtype = tf.float32
  # float64 on CPU would be preferable for finite difference checking, but
  # tf.nn.conv2d and tf.nn.max_pool do not support float64.
  # with tf.Graph().as_default(), tf.device("/cpu:0"):
  with tf.Graph().as_default():
    # Input.
    x = tf.constant(
        rnd.uniform(-1.0, 1.0, [2, 5, 5, 2]), dtype=dtype, name="x")
    # First convolution.
    v = tf.constant(
        rnd.uniform(-1.0, 1.0, [2, 2, 2, 3]), dtype=dtype, name="v")
    w = tf.constant(
        rnd.uniform(-1.0, 1.0, [2, 2, 2, 3]), dtype=dtype, name="w")
    h = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME")
    h = tf.nn.max_pool(h, [1, 3, 3, 1], [1, 2, 2, 1], "SAME")
    h = tf.nn.relu(h)
    # Second convolution.
    v_ = tf.constant(
        rnd.uniform(-0.1, 0.1, [2, 2, 3, 3]), dtype=dtype, name="v_")
    w_ = tf.constant(
        rnd.uniform(-1.0, 1.0, [2, 2, 3, 3]), dtype=dtype, name="w_")
    h = tf.nn.conv2d(h, w_, [1, 1, 1, 1], "SAME")
    h = tf.nn.sigmoid(h)
    # Fully connected.
    dim = 27
    v2 = tf.constant(
        rnd.uniform(-0.1, 0.1, [dim, 2]), dtype=dtype, name="v2")
    w2 = tf.constant(
        rnd.uniform(-1.0, 1.0, [dim, 2]), dtype=dtype, name="w2")
    h = tf.reshape(h, [-1, dim])
    y = tf.matmul(h, w2)
    r = tf.constant(rnd.uniform(-1.0, 1.0, [2, 2]), dtype=dtype, name="r")
    w_list = [w, w_, w2]
    v_list = [v, v_, v2]
    # Inner product of two lists of tensors.
    inner_prod = lambda xlist, ylist: tf.reduce_sum(
        tf.stack([tf.reduce_sum(x * y) for x, y in zip(xlist, ylist)]))
    # J^T r
    jt_r = tf.gradients(y, w_list, r, gate_gradients=True)
    # (J^T r)^T v
    jt_r_t_v = inner_prod(jt_r, v_list)
    # J v
    j_v = forward_gradients(y, w_list, v_list, gate_gradients=True)
    # r^T (J v)
    r_t_j_v = tf.reduce_sum(r * j_v)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      bk_val, fw_val = sess.run([jt_r_t_v, r_t_j_v])
      np.testing.assert_allclose(bk_val, fw_val, rtol=1e-5)