def prepGeometricTensor(state, vrs):
    """Return per-parameter complex Jacobians of ``state`` and the parameter count.

    The real and imaginary parts of ``state`` are differentiated separately
    with ``jacobian`` and recombined with ``tf.complex``.

    Args:
        state: Complex tensor to differentiate.
        vrs: Sequence of variables; its length selects the code path.

    Returns:
        ``(jac, nparams)`` where ``jac`` is a list of ``nparams`` complex
        tensors and ``nparams`` is whatever ``prep_variables(vrs)`` returned.
    """
    nparams = prep_variables(vrs)
    real_part = tf.math.real(state)
    imag_part = tf.math.imag(state)
    real_jac = jacobian(real_part, vrs)
    imag_jac = jacobian(imag_part, vrs)

    if len(vrs) != 1:
        # Several variables: pair up the i-th real and imaginary Jacobians.
        combined = [
            tf.complex(real_jac[idx], imag_jac[idx]) for idx in range(nparams)
        ]
        return combined, nparams

    # Single variable: reshape to state.shape + [nparams], cut the trailing
    # axis into nparams slices and give each slice the shape of ``state``.
    stacked = tf.reshape(tf.complex(real_jac, imag_jac),
                         (state.shape + [nparams]))
    slices = tf.split(stacked, nparams, axis=-1)
    return [tf.reshape(piece, state.shape) for piece in slices], nparams
def test_jacobian_fixed_shape(self):
    """Check both jacobian implementations against per-element gradients.

    ``y = x^T x`` for a random 2x2 ``x``; the reference stacks the gradient of
    each of the four entries of ``y`` with respect to ``x``.
    """
    x = random_ops.random_uniform([2, 2])
    y = math_ops.matmul(x, x, transpose_a=True)
    jacobian_pfor = gradients.jacobian(y, x, use_pfor=True)
    jacobian_while = gradients.jacobian(y, x, use_pfor=False)

    # Row-major walk over the 2x2 output, one gradient op per entry.
    reference_rows = [
        [gradient_ops.gradients(y[row][col], x)[0] for col in range(2)]
        for row in range(2)
    ]
    answer = ops.convert_to_tensor(reference_rows)

    for candidate in (jacobian_pfor, jacobian_while):
        self.run_and_assert_equal(answer, candidate)
def create_lstm_hessian(batch_size, state_size, steps):
    """Build second-order derivative ops of the LSTM output w.r.t. its weights.

    Returns:
        ``(pfor_hessians, while_hessians)``: two lists with one entry per
        first-order Jacobian, computed with ``use_pfor=True`` and
        ``use_pfor=False`` respectively.
    """
    _, output = lstm_model_fn(batch_size, state_size, steps)
    weights = variables.trainable_variables()
    pfor_jacobians = gradients.jacobian(output, weights, use_pfor=True)

    pfor_hessians = []
    for first_order in pfor_jacobians:
        pfor_hessians.append(
            gradients.jacobian(first_order, weights, use_pfor=True))

    # TODO(agarwal): using two nested while_loop doesn't seem to work here.
    # Hence the pfor first-order Jacobians are reused as the starting point
    # for the while_loop-based second derivative.
    while_jacobians = pfor_jacobians
    while_hessians = []
    for first_order in while_jacobians:
        while_hessians.append(
            gradients.jacobian(first_order, weights, use_pfor=False))

    return pfor_hessians, while_hessians
def create_lstm_hessian(batch_size, state_size, steps):
    """Build second-order derivative ops of the LSTM output w.r.t. its weights.

    Returns:
        ``(pfor_hessians, while_hessians)``: two lists with one entry per
        first-order Jacobian, computed with ``use_pfor=True`` and
        ``use_pfor=False`` respectively.
    """
    _, output = lstm_model_fn(batch_size, state_size, steps)
    weights = variables.trainable_variables()
    pfor_jacobians = gradients.jacobian(output, weights, use_pfor=True)

    pfor_hessians = []
    for first_order in pfor_jacobians:
        pfor_hessians.append(
            gradients.jacobian(first_order, weights, use_pfor=True))

    # TODO(agarwal): using two nested while_loop doesn't seem to work here.
    # Hence the pfor first-order Jacobians are reused as the starting point
    # for the while_loop-based second derivative.
    while_jacobians = pfor_jacobians
    while_hessians = []
    for first_order in while_jacobians:
        while_hessians.append(
            gradients.jacobian(first_order, weights, use_pfor=False))

    return pfor_hessians, while_hessians
def _build_hessian_op_grid(self, y, wrt_x1, wrt_x2):
    """Build a list of ops, each computing one horizontal slab of the Hessian.

    The gradient of ``y`` w.r.t. ``wrt_x1`` is flattened to length
    ``self._full_height`` and cut into ``self._grid_height`` consecutive
    chunks of at most ``self._block_height`` entries. Each chunk is
    differentiated w.r.t. ``wrt_x2`` and reshaped to
    ``(chunk_length, self._full_width)``.

    Returns:
        List of ``self._grid_height`` tensors, in row order.
    """
    # DEBUG: self.monolith_hessian = jacobian(tf.gradients(y, wrt_x1)[0], wrt_x2, use_pfor=False)
    # DEBUG: self.monolith_hessian = tf.reshape(self.monolith_hessian, shape=(self._full_height, self._full_width))

    # Full gradient w.r.t. x1, flattened; sliced block by block below.
    flat_gradient = tf.reshape(tf.gradients(y, wrt_x1)[0],
                               shape=(self._full_height, ))

    n_blocks = self._grid_height
    rows_per_block = self._block_height
    total_rows = self._full_height

    op_grid = []
    for block_index in range(n_blocks):
        # Row range covered by this block; the last block may be shorter.
        row_start = block_index * rows_per_block
        row_stop = min(total_rows, (block_index + 1) * rows_per_block)
        gradient_rows = flat_gradient[row_start:row_stop]

        # TODO: it is not clear whether to use pfor or not (since it is still experimental - maybe we can test).
        # See https://github.com/tensorflow/tensorflow/issues/675#issuecomment-404665051
        slab = jacobian(gradient_rows, wrt_x2, use_pfor=False)
        slab = tf.reshape(slab,
                          shape=(gradient_rows.shape[0], self._full_width))
        op_grid.append(slab)
    return op_grid
def caljacob(self, z):
    """Evaluate the Jacobian of ``self.x_hat`` w.r.t. ``self.z`` at ``z``.

    Runs in ``self.sess`` with ``self.is_training`` fed as ``False``.
    """
    # NOTE(review): a new jacobian op is created on every call; if this is
    # called repeatedly the graph presumably keeps growing - confirm.
    jacobian_op = jacobian(self.x_hat, self.z)
    feeds = {self.z: z, self.is_training: False}
    return self.sess.run(jacobian_op, feed_dict=feeds)
def inverse(self, z):
    """Apply the inverse transformation to the packed argument ``z``.

    ``z`` is split into ``(q, p)`` with ``extract_q_p``; ``q`` is mapped
    through ``self._f.inverse`` and ``p`` is contracted with the Jacobian of
    ``self._f`` evaluated at the mapped point.

    Args:
        z: Packed input accepted by ``extract_q_p``.

    Returns:
        The result of ``join_q_p`` on the transformed pair.
    """
    # Fix: the body previously read the undefined name ``x`` instead of the
    # parameter ``z``.
    q, p = extract_q_p(z)
    q_prime = self._f.inverse(q)
    df = tf_gradients_ops.jacobian(self._f(q_prime), q_prime, use_pfor=True)
    # Contract Jacobian axes 4..7 with axes 0..3 of p
    # (assumes q and p each have four axes - TODO confirm).
    return join_q_p(q_prime,
                    tf.tensordot(df, p, [[4, 5, 6, 7], [0, 1, 2, 3]]))
def run_pyket(args):
    """Build the complex 1-D convolutional model, train it and time the run.

    A short 5-step fit is executed before the timer starts; only the second
    fit of ``args.num_of_iterations`` steps is timed.

    Args:
        args: Namespace providing ``input_size``, ``kernel_size``, ``depth``,
            ``width``, ``fast_jacobian``, ``no_pfor``,
            ``use_stochastic_reconfiguration``, ``learning_rate``,
            ``use_iterative``, ``use_cholesky``, ``batch_size``,
            ``pyket_num_of_chains`` and ``num_of_iterations``.

    Returns:
        Wall-clock seconds spent in the timed ``fit_generator`` call.
    """
    hilbert_state_shape = (args.input_size, 1)
    padding = ((0, args.kernel_size - 1), )

    inputs = Input(shape=hilbert_state_shape, dtype='int8')
    hidden = ToComplex128()(inputs)
    for _ in range(args.depth):
        hidden = PeriodicPadding(padding)(hidden)
        hidden = ComplexConv1D(args.width,
                               args.kernel_size,
                               use_bias=False,
                               dtype=tf.complex128)(hidden)
        hidden = Activation(lncosh)(hidden)
    hidden = Flatten()(hidden)
    predictions = Lambda(
        lambda y: tf.reduce_sum(y, axis=1, keepdims=True))(hidden)
    model = Model(inputs=inputs, outputs=predictions)

    # The Jacobian callable is defined in both cases, even though only the
    # stochastic-reconfiguration optimizer consumes it.
    if args.fast_jacobian:

        def predictions_jacobian(x):
            return get_predictions_jacobian(keras_model=model)
    else:

        def predictions_jacobian(x):
            return gradients.jacobian(tf.real(model.output),
                                      x,
                                      use_pfor=not args.no_pfor)

    if args.use_stochastic_reconfiguration:
        optimizer = ComplexValuesStochasticReconfiguration(
            model,
            predictions_jacobian,
            lr=args.learning_rate,
            diag_shift=10.0,
            iterative_solver=args.use_iterative,
            use_cholesky=args.use_cholesky,
            iterative_solver_max_iterations=None)
        extra_compile_args = {'metrics': optimizer.metrics}
    else:
        optimizer = SGD(lr=args.learning_rate)
        extra_compile_args = {}
    model.compile(optimizer=optimizer,
                  loss=loss_for_energy_minimization,
                  **extra_compile_args)
    model.summary()

    operator = Heisenberg(hilbert_state_shape=hilbert_state_shape, pbc=True)
    sampler = MetropolisHastingsHamiltonian(
        model,
        args.batch_size,
        operator,
        num_of_chains=args.pyket_num_of_chains,
        unused_sampels=numpy.prod(hilbert_state_shape))
    variational_monte_carlo = VariationalMonteCarlo(model, operator, sampler)

    shared_fit_args = dict(epochs=1, max_queue_size=0, workers=0)
    # Untimed 5-step run first.
    model.fit_generator(variational_monte_carlo.to_generator(),
                        steps_per_epoch=5,
                        **shared_fit_args)

    started_at = time.time()
    model.fit_generator(variational_monte_carlo.to_generator(),
                        steps_per_epoch=args.num_of_iterations,
                        **shared_fit_args)
    finished_at = time.time()
    return finished_at - started_at
def call(self, x):
    """Apply the forward transformation to the packed argument ``x``.

    ``x`` is split into ``(q, p)``; ``q`` is mapped through ``self._f`` and
    ``p`` is contracted with the Jacobian of ``self._f.inverse`` evaluated at
    the mapped point.

    Returns:
        The result of ``join_q_p`` on the transformed pair.
    """
    q, p = extract_q_p(x)
    q_prime = self._f(q)
    # Df(q)^{-1} = D(f^{-1}( q_prime ))
    df_inverse = tf_gradients_ops.jacobian(self._f.inverse(q_prime),
                                           q_prime,
                                           use_pfor=True)
    # Contract Jacobian axes 4..7 with axes 0..3 of p
    # (assumes q and p each have four axes - TODO confirm).
    p_prime = tf.tensordot(df_inverse, p, [[4, 5, 6, 7], [0, 1, 2, 3]])
    return join_q_p(q_prime, p_prime)
def test_jacobian_unknown_shape(self):
    """Check both jacobian implementations when ``x`` has no static shape.

    ``x`` is a ``[None, None]`` placeholder fed with a 2x2 matrix; the
    reference stacks the gradient of each of the four entries of ``x^T x``.
    """
    with self.test_session() as sess:
        x = array_ops.placeholder(dtypes.float32, shape=[None, None])
        y = math_ops.matmul(x, x, transpose_a=True)
        jacobian_pfor = gradients.jacobian(y, x, use_pfor=True)
        jacobian_while = gradients.jacobian(y, x, use_pfor=False)

        # Row-major walk over the 2x2 output, one gradient op per entry.
        reference_rows = [
            [gradient_ops.gradients(y[row][col], x)[0] for col in range(2)]
            for row in range(2)
        ]
        answer = ops.convert_to_tensor(reference_rows)

        fetches = [answer, jacobian_pfor, jacobian_while]
        ans, pfor_value, while_value = sess.run(
            fetches, feed_dict={x: [[1, 2], [3, 4]]})
        self.assertAllClose(ans, pfor_value)
        self.assertAllClose(ans, while_value)
def test_jacobian_scan_shape(self):
    """The Jacobian of a ``scan`` output has static shape y.shape + x.shape."""
    # Shape x: [3, 4]
    x = random_ops.random_uniform([3, 4])
    elems = random_ops.random_uniform([6])

    def accumulate(a, e):
        return a + e

    # Shape y: [6, 3, 4]
    y = functional_ops.scan(accumulate, elems, initializer=x)
    jacobian = gradients.jacobian(y, x)
    self.assertAllEqual([6, 3, 4, 3, 4], jacobian.shape.as_list())
def test_jacobian_scan_shape(self):
    """The Jacobian of a ``scan`` output has static shape y.shape + x.shape."""
    # Shape x: [3, 4]
    x = random_ops.random_uniform([3, 4])
    elems = random_ops.random_uniform([6])

    def accumulate(a, e):
        return a + e

    # Shape y: [6, 3, 4]
    y = functional_ops.scan(accumulate, elems, initializer=x)
    jacobian = gradients.jacobian(y, x)
    self.assertAllEqual([6, 3, 4, 3, 4], jacobian.shape.as_list())
def mixed_partials(self, conf):
    """Return mixed partial derivatives, one tensor per entry of ``self.params``.

    The gradient returned by ``self.gradients(conf)`` is differentiated with
    respect to every parameter, and each Jacobian is transposed so that the
    parameter axes come first and the two leading (gradient) axes come last.

    Args:
        conf: Configuration passed straight to ``self.gradients``.

    Returns:
        List of tensors in the same order as the Jacobians returned for
        ``self.params``.

    Raises:
        NotImplementedError: If a Jacobian has fewer than two axes.
    """
    # optimized version to speed things up a little bit.
    grads = self.gradients(conf)
    reverse_shaped = jacobian(grads, self.params, use_pfor=False)

    properly_shaped = []
    for p in reverse_shaped:
        rank = len(p.get_shape())
        if rank < 2:
            # Assumes the gradient being differentiated is rank 2 - TODO
            # confirm; anything smaller cannot be rearranged by this scheme.
            raise NotImplementedError(
                "Jacobians with fewer than 2 axes are not supported")
        if rank == 2:
            # No parameter axes (scalar parameter): nothing to move.
            # Previously such entries were silently dropped, which left the
            # result misaligned with self.params.
            properly_shaped.append(p)
            continue
        # Move the two leading axes behind the parameter axes:
        # rank 3 -> (2, 0, 1), rank 4 -> (2, 3, 0, 1), and so on.
        perm = tuple(range(2, rank)) + (0, 1)
        properly_shaped.append(tf.transpose(p, perm=perm))
    return properly_shaped
def test_jacobian_while_loop_shape(self):
    """The Jacobian of a sliced ``while_loop`` output has a static shape."""
    # Shape x: [3, 4]
    x = random_ops.random_uniform([3, 4])

    def keep_going(i, a):
        return i > 5.

    def step(i, a):
        return (i + 1, a + i)

    _, y = tf_control_flow_ops.while_loop(keep_going, step,
                                          (constant_op.constant(0.), x))
    # Shape y: [2, 3]
    y = y[:2, :3]
    jacobian = gradients.jacobian(y, x)
    self.assertAllEqual([2, 3, 3, 4], jacobian.shape.as_list())
def test_jacobian_while_loop_shape(self):
    """The Jacobian of a sliced ``while_loop`` output has a static shape."""
    # Shape x: [3, 4]
    x = random_ops.random_uniform([3, 4])

    def keep_going(i, a):
        return i > 5.

    def step(i, a):
        return (i + 1, a + i)

    _, y = tf_control_flow_ops.while_loop(keep_going, step,
                                          (constant_op.constant(0.), x))
    # Shape y: [2, 3]
    y = y[:2, :3]
    jacobian = gradients.jacobian(y, x)
    self.assertAllEqual([2, 3, 3, 4], jacobian.shape.as_list())
def test_sorting():
    """Smoke-test the differentiable sort/argsort ops and their Jacobian.

    For each listed max function a random integer vector of length 8 is
    sorted; the outputs and the Jacobian are evaluated in a fresh session.
    Nothing is asserted beyond the graph running without error.
    """
    # convert to TF tensors
    dtype = tf.float64
    tf_matrices = bitonic_matrices(8)
    # NOTE(review): max_fn is never passed to diff_sort/diff_argsort, so all
    # three iterations exercise the same code path - confirm intent.
    for max_fn in (softmax, smoothmax, softmax_smooth):
        sample = to_tf(np.random.randint(-200, 200, 8), dtype=dtype)
        sorted_flat = tf.reshape(diff_sort(tf_matrices, sample), (-1,))
        ranks = diff_argsort(tf_matrices, sample)
        arg_order = diff_argsort(tf_matrices, sample, transpose=True)
        sort_jacobian = tf.squeeze(jacobian(sorted_flat, sample))
        # compute output and gradient
        with tf.Session() as s:
            s.run((sorted_flat, sort_jacobian, ranks, arg_order))
def mixed_partials(self, conf):
    """
    Return the mixed partial derivatives evaluated at the input geometry.

    Parameters
    ----------
    conf: tf.placeholder
        An N x 3 configuration placeholder

    Returns
    -------
    list of tf.Tensor
        One tensor per parameter, laid out as [(p_shape), N, 3], in the
        order of the Jacobians returned for ``self.params``.

    Raises
    ------
    NotImplementedError
        If a Jacobian has a rank other than 2, 3 or 4.
    """
    # (ytz): The order of differentiation matters for the implementation.
    # The tensorflow jacobian expects a single fixed-size tensor as output
    # but accepts a variable list of tensors as inputs, so the coordinate
    # derivative (fixed N x 3) is differentiated with respect to the
    # parameters (a list of tensors of varying sizes), not the reverse.

    # optimized version to speed things up a little bit.
    grads = self.gradients(conf)

    # taken from tf src gradients_impl.py _IndexedSlicesToTensor
    if isinstance(grads, tf.IndexedSlices):
        grads = tf.unsorted_segment_sum(grads.values, grads.indices,
                                        grads.dense_shape[0])

    reverse_shaped = jacobian(grads, self.params, use_pfor=False)
    if isinstance(reverse_shaped, tf.Tensor):
        # A lone tensor is wrapped so the loop below handles both cases.
        reverse_shaped = [reverse_shaped]

    # Rank -> permutation that moves the two leading axes to the end.
    permutations = {3: (2, 0, 1), 4: (2, 3, 0, 1)}

    properly_shaped = []
    for p in reverse_shaped:
        rank = len(p.get_shape())
        if rank == 2:
            # No parameter axes: already in the target layout.
            properly_shaped.append(p)
        elif rank in permutations:
            properly_shaped.append(tf.transpose(p, perm=permutations[rank]))
        else:
            # should be easy to support, just add perm=(2,3,...,0,1)
            raise NotImplementedError("Shapes > 4 not supported")
    return properly_shaped
def jacobians(ys, xs, parallel_iterations=None):
    """Compute the Jacobians of `ys` with respect to `xs` using pfor.

    Args:
      ys: tf.Tensor to differentiate.
      xs: tf.Tensor. The variables with respect to which the Jacobian is
        computed.
      parallel_iterations: Number of iterations to run in parallel; forwarded
        unchanged to the underlying call. Trades memory for speed: with None
        the computation is fully parallel but needs the most memory.

    Returns:
      A tf.Tensor of Jacobians.
    """
    pfor_options = {
        "use_pfor": True,
        "parallel_iterations": parallel_iterations,
    }
    return pfor_gradients.jacobian(ys, xs, **pfor_options)
def create_fc_per_eg_jacobians(batch_size, activation_size, num_layers):
    """Build batch and per-example Jacobian ops for a fully connected model.

    Returns:
        ``(jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while)``: the
        Jacobian of the whole-batch output w.r.t. the trainable variables,
        and the per-example Jacobians built with ``pfor`` and with
        ``for_loop`` respectively.
    """
    model = FullyConnectedModel(activation_size=activation_size,
                                num_layers=num_layers)
    inp = random_ops.random_normal([batch_size, activation_size])
    batch_output = model(inp)
    jacobians = gradients.jacobian(batch_output,
                                   variables.trainable_variables())

    def loop_fn(i, use_pfor):
        # Select example i, keep a leading batch axis of size 1, and
        # differentiate the flattened model output for that example.
        example = array_ops.gather(inp, i)
        example = array_ops.expand_dims(example, 0)
        flat_output = array_ops.reshape(model(example), [-1])
        return gradients.jacobian(flat_output,
                                  variables.trainable_variables(),
                                  use_pfor=use_pfor)

    pfor_body = functools.partial(loop_fn, use_pfor=True)
    per_eg_jacobians_pfor = control_flow_ops.pfor(pfor_body, batch_size)

    while_body = functools.partial(loop_fn, use_pfor=False)
    # One float32 output per trainable variable.
    output_dtypes = [dtypes.float32] * len(variables.trainable_variables())
    per_eg_jacobians_while = control_flow_ops.for_loop(
        while_body, output_dtypes, batch_size)

    return jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while
def __init__(self, model_class, dataset, params):
    """
    Create the ops needed for deepfool in the tensorflow graph.

    Args:
        model_class: The class of the model to construct. Expects subclass of BasicModel
        dataset: The dataset to use. Only its ``shape`` and ``num_classes`` are read
        params: Additional keyword parameters passed to the model init
    """
    self.image = tf.placeholder(dtype=tf.float32, shape=dataset.shape)

    # The model receives the single image with a leading batch axis added.
    batched_image = tf.expand_dims(self.image, axis=0)
    self.model = model_class(batched_image,
                             trainable=False,
                             num_classes=dataset.num_classes,
                             **params)

    # Logits of the only batch entry, and their static length.
    self.logits = self.model.logits[0]
    logits_shape = self.logits.shape.as_list()
    self.num_classes = logits_shape[0]

    self.logits_grad = jacobian(self.logits, self.image)
def create_fc_per_eg_jacobians(batch_size, activation_size, num_layers):
    """Build batch and per-example Jacobian ops for a fully connected model.

    Returns:
        ``(jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while)``: the
        Jacobian of the whole-batch output w.r.t. the trainable variables,
        and the per-example Jacobians built with ``pfor`` and with
        ``for_loop`` respectively.
    """
    model = FullyConnectedModel(activation_size=activation_size,
                                num_layers=num_layers)
    inp = random_ops.random_normal([batch_size, activation_size])
    batch_output = model(inp)
    jacobians = gradients.jacobian(batch_output,
                                   variables.trainable_variables())

    def loop_fn(i, use_pfor):
        # Select example i, keep a leading batch axis of size 1, and
        # differentiate the flattened model output for that example.
        example = array_ops.gather(inp, i)
        example = array_ops.expand_dims(example, 0)
        flat_output = array_ops.reshape(model(example), [-1])
        return gradients.jacobian(flat_output,
                                  variables.trainable_variables(),
                                  use_pfor=use_pfor)

    pfor_body = functools.partial(loop_fn, use_pfor=True)
    per_eg_jacobians_pfor = control_flow_ops.pfor(pfor_body, batch_size)

    while_body = functools.partial(loop_fn, use_pfor=False)
    # One float32 output per trainable variable.
    output_dtypes = [dtypes.float32] * len(variables.trainable_variables())
    per_eg_jacobians_while = control_flow_ops.for_loop(
        while_body, output_dtypes, batch_size)

    return jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while
def list_jacobian(outputs, inputs):
    """
    Jacobians of ``outputs`` w.r.t. each input, with input axes moved first.

    Parameters
    ----------
    outputs: tf.Tensor
        Passed through ``densify`` before differentiation.

    inputs: list of tf.Tensor

    Returns
    -------
    list of jacobians
        One transposed tensor per input, e.g.
        [
            tf.Tensor(p0_d0,p0_d1,...,N,3),
            tf.Tensor(p1_d0,p1_d2,...,N,3),
            ...
        ]
    """
    # Extends tensorflow's jacobian with support for sparse gradients and
    # reorders the axes of every per-input result.
    # taken from tf src gradients_impl.py _IndexedSlicesToTensor
    outputs = densify(outputs)
    n_out_dims = len(outputs.get_shape().as_list())
    trailing_axes = list(range(n_out_dims))  # e.g. [0, 1]

    per_input_jacobians = jacobian(outputs, inputs, use_pfor=False)

    def input_axes_first(inp, jac):
        # Axes n_out_dims.. (the input axes) go first, then the output axes.
        n_in_dims = len(inp.get_shape().as_list())
        leading_axes = list(range(n_out_dims, n_out_dims + n_in_dims))
        return tf.transpose(jac, perm=leading_axes + trailing_axes)

    return [
        input_axes_first(inp, jac)
        for inp, jac in zip(inputs, per_input_jacobians)
    ]
def test_equal_to_builtin_jacobian(model_builder, batch_size):
    """Compare ``gradient_per_example`` with tensorflow's built-in jacobian.

    Both are evaluated on one random batch for the model returned by
    ``model_builder`` and must agree within ``rtol=1e-3``.
    """
    with DEFAULT_TF_GRAPH.as_default():
        keras_model = model_builder()
        keras_model.summary()

        custom_t = gradient_per_example(tf.real(keras_model.output),
                                        keras_model)
        builtin_t = gradients.jacobian(tf.real(keras_model.output),
                                       keras_model.weights,
                                       use_pfor=False)
        print(custom_t)
        print(builtin_t)

        custom_fn = K.function(inputs=[keras_model.input], outputs=custom_t)
        builtin_fn = K.function(inputs=[keras_model.input],
                                outputs=builtin_t)

        # Random batch with the model's input shape after the batch axis.
        batch_shape = (batch_size, ) + K.int_shape(keras_model.input)[1:]
        batch = np.random.rand(*batch_shape)

        custom_vals = custom_fn([batch])
        builtin_vals = builtin_fn([batch])

        matches = []
        for ours, theirs in zip(custom_vals, builtin_vals):
            matches.append(np.allclose(ours, theirs, rtol=1e-3))
        assert np.all(matches)
# compute_task_jacobian(self, policy_loss, policy_loss_quad, tangents)
#
# Purpose: builds the graph ops relating the policy gradient to the task
# parameters and returns ``(task_jacobian, task_hvp)``.
#
# What the body does, in order:
#   * Reads the parameters of self.warmup_policy1/2/3 and builds two
#     ``nn.utils.quadgrad_vec_prod`` products of ``policy_loss_quad`` with
#     ``tangents``: one with AAx=False (stored on self.op_task_hvp_Ax) and one
#     with AAx=True (``task_hvp``, the second return value). When
#     self.meanAAx is set, ``task_hvp`` is divided by the float32 parameter
#     count of self.warmup_policy.
#   * Stores three ``nn.utils.hessian_vec_prod`` ops on self: op_quad_hvp
#     (self.quad_loss), op_hessian_hvp (policy_loss) and op_hvp (self.mean_kl),
#     all w.r.t. self.warmup_policy.parameters() and ``tangents``.
#   * Flattens tf.gradients(policy_loss, policy_params) into self.pg_flat,
#     flattens the policy parameters into self.p_flat, and computes
#     ``task_jacobian = jacobian(pg_flat, task_params, use_pfor=False)``,
#     reshaped to (-1, self.task.n_dim).
#   * When self.AAx is set, stacks one quadgrad_vec_prod per task dimension
#     (column i of task_jacobian as the vector) into self.op_ATb.
#   * Builds self.jacobian_op by differentiating each component of the
#     flattened task gradient (self.task_gradients) w.r.t. the policy
#     parameters and stacking the results along axis 0.
#
# The print(...) calls only dump tensors/shapes at graph-construction time.
#
# NOTE(review): ``nn`` here exposes utils.quadgrad_vec_prod etc., so it is
# presumably a project-local module rather than tf.nn - confirm.
# NOTE(review): this definition is stored collapsed onto two physical lines,
# so the original indentation is lost. In particular it cannot be told from
# the text alone where the ``if self.meanAAx:`` body ends, or whether the
# ``jacobian_op`` section sits inside ``if self.AAx:``. Restore the layout
# from version control before changing the logic.
def compute_task_jacobian(self, policy_loss, policy_loss_quad, tangents): # compute hvp params1 = self.warmup_policy1.parameters() params2 = self.warmup_policy2.parameters() params3 = self.warmup_policy3.parameters() self.op_task_hvp_Ax = nn.utils.quadgrad_vec_prod(policy_loss_quad, params1, params2, params3, tangents, AAx=False) task_hvp = nn.utils.quadgrad_vec_prod(policy_loss_quad, params1, params2, params3, tangents, AAx=True) if self.meanAAx: nparam = nn.utils.n_parameters_int( self.warmup_policy.parameters()).astype(np.float32) print("meanAAx nparam:", nparam) print(type(nparam)) task_hvp = task_hvp / nparam print("task_hvp:", task_hvp) print("task_hvp_Ax:", self.op_task_hvp_Ax) self.op_quad_hvp = nn.utils.hessian_vec_prod( self.quad_loss, self.warmup_policy.parameters(), tangents) self.op_hessian_hvp = nn.utils.hessian_vec_prod( policy_loss, self.warmup_policy.parameters(), tangents) self.op_hvp = nn.utils.hessian_vec_prod( self.mean_kl, self.warmup_policy.parameters(), tangents) # compute jacobian task_params = self.task.parameters() policy_params = self.warmup_policy.parameters() policy_gradient_flat = nn.utils.parameters_to_vector( tf.gradients(policy_loss, policy_params)) self.pg_flat = policy_gradient_flat self.p_flat = nn.utils.parameters_to_vector( self.warmup_policy.parameters()) print("pg_flat:", self.pg_flat) print("p_flat:", self.p_flat) task_jacobian = jacobian(policy_gradient_flat, task_params, use_pfor=False) print(task_jacobian.shape) task_jacobian = tf.reshape(task_jacobian, (-1, self.task.n_dim)) if self.AAx: ATb = [] for i in range(self.task.n_dim): ATb_i = nn.utils.quadgrad_vec_prod(policy_loss_quad, params1, params2, params3, task_jacobian[:, i], AAx=False) if self.meanAAx: ATb_i = ATb_i / nn.utils.n_parameters_int( self.warmup_policy.parameters()).astype(np.float32) ATb.append(ATb_i) self.op_ATb = tf.stack(ATb, axis=0) #n_dim x |theta| print(self.op_ATb.shape) #exit(0) #############################################################
jacobian_op = [] task_gradients = nn.utils.parameters_to_vector( tf.gradients(policy_loss, task_params)) self.task_gradients = task_gradients for i in range(self.task.n_dim): b = nn.utils.parameters_to_vector( tf.gradients(task_gradients[i], policy_params)) jacobian_op.append(b) self.jacobian_op = tf.stack(jacobian_op, axis=0) print("jacobian_op:", self.jacobian_op) print("task_gradients:", self.task_gradients) print("task_jacobian:", task_jacobian) ############################################################# return task_jacobian, task_hvp
def test_jacobian_parallel_iterations(self):
    """The Jacobian must not depend on the ``parallel_iterations`` setting."""
    x = constant_op.constant([[1., 2], [3, 4]])
    y = math_ops.matmul(x, x)
    with_two = gradients.jacobian(y, x, parallel_iterations=2)
    with_three = gradients.jacobian(y, x, parallel_iterations=3)
    self.assertAllClose(with_two, with_three)
def jacobian(x):
    """Jacobian of the real part of ``x`` w.r.t. the enclosing ``params``.

    ``params`` and ``self`` are taken from the surrounding scope.
    """
    real_part = tensorflow.math.real(x)
    return gradients.jacobian(real_part, params, use_pfor=self.use_pfor)
def test_indexed_slice(self):
    """pfor and while_loop Jacobians agree for an embedding lookup."""
    inp = random_ops.random_uniform([3, 2])
    output = nn.embedding_lookup(inp, [0, 2])
    pfor_jacobian, while_jacobian = [
        gradients.jacobian(output, inp, use_pfor=flag)
        for flag in (True, False)
    ]
    self.run_and_assert_equal(while_jacobian, pfor_jacobian)
def test_jacobian_parallel_iterations(self):
    """The Jacobian must not depend on the ``parallel_iterations`` setting."""
    x = constant_op.constant([[1., 2], [3, 4]])
    y = math_ops.matmul(x, x)
    with_two = gradients.jacobian(y, x, parallel_iterations=2)
    with_three = gradients.jacobian(y, x, parallel_iterations=3)
    self.assertAllClose(with_two, with_three)
def CTC_Loss():
    """Compare several TF1 CTC loss calls on a fixed two-sample batch.

    Builds four loss ops (``tf.nn.ctc_loss`` with and without
    ``ctc_merge_repeated=False``, ``tf.nn.ctc_loss_v2`` with dense and with
    sparse labels), the gradient of the second loss, a softmax over the
    logits and the Jacobian of a softmax w.r.t. one logit row. Everything is
    evaluated in a session and the four loss values are printed. Returns
    nothing.
    """
    # Author's notes (translated):
    # ctc_loss v1 takes a sparse matrix, so if the ground-truth label contains
    # character 0, the loss for character 0 cannot be computed.
    # v2 gives the same result when it is fed a sparse tensor.
    # This is because index 0 ends up being treated as padding.
    # Therefore index 0 must not be assigned a meaningful character.
    # In v2 the labels should be passed dense rather than sparse.
    batch_size = 2
    output_T = 5
    target_T = 3  # target length; the model's output length out_T is longer than the target
    num_class = 4  # 0, 1, 2 are characters and the last index, 3, is the blank

    # The first two assignments are overwritten by the fixed array below; they
    # are kept so the NumPy global RNG is advanced exactly as before.
    x = np.arange(40).reshape(batch_size, output_T,
                              num_class).astype(np.float32)
    x = np.random.randn(batch_size, output_T, num_class)
    x = np.array([[[0.74273746, 0.07847633, -0.89669566, 0.87111101],
                   [0.35377891, 0.87161664, 0.45004634, -0.01664156],
                   [-0.4019564, 0.59862392, -0.90470981, -0.16236736],
                   [0.28194173, 0.82136263, 0.06700599, -0.43223688],
                   [0.1487472, 1.04652007, -0.51399114, -0.4759599]],
                  [[-0.53616811, -2.025543, -0.06641838, -1.88901458],
                   [-0.75484499, 0.24393693, -0.08489008, -1.79244747],
                   [0.36912486, 0.93965647, 0.42183299, 0.89334628],
                   [-0.6257366, -2.25099419, -0.59857886, 0.35591563],
                   [0.72191422, 0.37786281, 1.70582983,
                    0.90937337]]]).astype(np.float32)
    xx = tf.convert_to_tensor(x)
    xx = tf.Variable(xx)
    # Swap the first two axes before handing the values to the loss ops.
    logits = tf.transpose(xx, [1, 0, 2])

    # Random labels are drawn and then replaced by fixed ones (same RNG note).
    yy = np.random.randint(0, num_class - 1,
                           size=(batch_size,
                                 target_T))  # low=0, high=3 ==> 0,1,2
    yy = np.array([[1, 2, 2], [1, 0, 1]]).astype(np.int32)
    #yy = np.array([[1, 2, 2,0,0,0],[1,0,2,0,0,0]]).astype(np.int32)  # trailing zeros are treated as padding; zeros in the middle are treated as characters

    # Sparse labels keep only the non-zero entries of yy.
    zero = tf.constant(0, dtype=tf.int32)
    where = tf.not_equal(yy, zero)
    indices = tf.where(where)
    values = tf.gather_nd(yy, indices)
    targets = tf.SparseTensor(indices, values, yy.shape)

    # preprocess_collapse_repeated=False ---> labels may contain repeated characters, so naturally False
    # ctc_merge_repeated=False ---> repeated characters predicted by the model are not merged, which goes against the purpose of the CTC loss.
    loss0 = tf.nn.ctc_loss(labels=targets,
                           inputs=logits,
                           sequence_length=[output_T] * batch_size,
                           ctc_merge_repeated=False)  # this loss0 is meaningless
    loss1 = tf.nn.ctc_loss(labels=targets,
                           inputs=logits,
                           sequence_length=[output_T] * batch_size)
    loss2 = tf.nn.ctc_loss_v2(labels=yy,
                              logits=logits,
                              label_length=[target_T] * batch_size,
                              logit_length=[output_T] * batch_size,
                              logits_time_major=True,
                              blank_index=num_class - 1)
    # Passing a sparse tensor as labels gives the same result as v1.
    loss3 = tf.nn.ctc_loss_v2(labels=targets,
                              logits=logits,
                              label_length=[3, 3],
                              logit_length=[output_T] * batch_size,
                              logits_time_major=True,
                              blank_index=num_class - 1)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=1)
    gradient = optimizer.compute_gradients(loss1)
    prob = tf.nn.softmax(xx, axis=-1)

    # Use jacobian to get the derivative of the softmax values w.r.t. the logits.
    a = xx[0, 1]
    b = tf.nn.softmax(a)
    grad = jacobian(b, a)
    # To turn the derivative w.r.t. the logits into a derivative w.r.t. the
    # softmax one would multiply by the inverse of grad.
    # The inverse of grad does not exist.

    # Fix: the session was previously never closed; the context manager
    # releases it once the values have been fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        l0 = sess.run(loss0)
        l1 = sess.run(loss1)
        l2 = sess.run(loss2)
        l3 = sess.run(loss3)
        print('loss: ', l0, l1, l2, l3)
        # Fetched for inspection only; the values are not used further.
        g = sess.run(gradient[0][0])
        p = sess.run(prob)
        gg = sess.run(grad)
# model_l1_l2_func(nm_set_points, n_in, nn_1, opt_obj, **kwargs_vals)
#
# Purpose: builds a TF1 graph for a regularised least-squares fit and runs a
# damped second-order parameter update loop in the session passed as
# kwargs_vals['sess'].
#
# Inputs read from kwargs_vals: 'neurons_cnt', 'initializer', 'sizes',
# 'shapes', 'xtr', 'ytr', 'sess', 'lambda_param', 'lambda_param2'.
#
# Graph construction:
#   * ``p`` is the trainable parameter vector; ``p_store`` with the
#     save_params_p / restore_params_p assign ops is a backup copy.
#   * func_mse_l2(...) supplies the model output, placeholders and the two
#     penalty terms; the loss is
#       sum((y_labeled - y_hat_model)^2) + lambda_param * all_reg0
#                                        + lambda_param2 * l2_norm_val.
#   * ``hess_approx_flag`` is hard-coded to False, so the Jacobian-based
#     branch appears to be dead code and jtj / jtr come from
#     tf.hessians(loss_val, p) and -tf.gradients(loss_val, [p]).
#   * The step is dx = inv(jtj_store + mu * I) @ jtr_store, applied through
#     opt_obj.apply_gradients([(-dx, p)]). ``mu`` is the damping placeholder
#     (marked "LM parameter" in the code), starting at 0.01.
#
# Optimisation loop: runs while mu > 1e-6 and step < 500, with up to 400 inner
# attempts per step. A lower loss divides mu by 10 and accepts the attempt; a
# higher loss multiplies mu by 10 and restores the parameters. Accepted
# parameter vectors are collected in ``mat_values``; once three are available,
# entries whose sign flipped between consecutive accepted vectors are zeroed.
#
# After the loop: parameters with |p| < 0.01 are zeroed, func_compute_cond and
# func_collect_allparams are applied, predictions are produced with
# func_pred_new, and the function returns
#   (restore_params_p, p_new, y_model, current_loss, non_zero).
#
# NOTE(review): this definition is stored collapsed onto four physical lines,
# so the original indentation (and therefore the exact nesting of the
# if/else/for bodies) cannot be recovered from the text alone. Restore the
# layout from version control before changing the logic.
# NOTE(review): ``np.where(p == 0)`` compares the tf.Variable ``p`` rather
# than the fetched array ``p0`` - presumably a typo, confirm.
# NOTE(review): the ``p.assign(p0)`` calls build assign ops whose results are
# never passed to sess.run; in graph mode (placeholders and a session are
# used here) they presumably have no effect - confirm.
# NOTE(review): the bare ``except:`` wraps graph-construction calls only, and
# its fallback calls np.identity(input_mat.shape) with a shape instead of an
# int - looks unreachable or broken, confirm.
# NOTE(review): ``np.int`` was removed in NumPy 1.24; this code needs an older
# NumPy or a switch to ``int``.
def model_l1_l2_func(nm_set_points, n_in, nn_1, opt_obj, **kwargs_vals): hess_approx_flag = False neurons_cnt_x1, initializer = kwargs_vals['neurons_cnt'], kwargs_vals[ 'initializer'] wb_sizes_classif, wb_shapes = kwargs_vals['sizes'], kwargs_vals['shapes'] x_trained = kwargs_vals['xtr'] y_trained = kwargs_vals['ytr'] sess_values = kwargs_vals['sess'] neurons_cnt = kwargs_vals['neurons_cnt'] # pcsv = np.genfromtxt('results_paramsparse.csv', delimiter='\t') # p = tf.Variable(pcsv, dtype=tf.float64) p = tf.Variable(initializer([neurons_cnt], dtype=tf.float64)) p_store = tf.Variable(tf.zeros([neurons_cnt_x1], dtype=tf.float64)) save_params_p = tf.assign(p_store, p) restore_params_p = tf.assign(p, p_store) I_mat = tf.eye(neurons_cnt_x1, dtype=tf.float64) shaped_new = np.int(wb_sizes_classif[0]) + np.int(wb_sizes_classif[1]) # l2_norm_val, all_reg0 = func_structured_l2pen(p, wb_sizes_classif, wb_shapes) lambda_param = kwargs_vals['lambda_param'] lambda_param2 = kwargs_vals['lambda_param2'] # all_reg_0 = tf.reduce_sum(tf.abs(lasso_p)) # l2 structured norm loss function y_hat_model, y_hat_model_flat_x, y_labeled, x_in, r, l2_norm_val, all_reg0, l2_p, lassop = func_mse_l2( n_in, p, nn_1, kwargs_vals) r1 = y_labeled - y_hat_model loss_val = tf.reduce_sum( tf.square(r1)) + lambda_param * all_reg0 + lambda_param2 * l2_norm_val mu = tf.placeholder(tf.float64, shape=[1]) # LM parameter # initialized store for all params, grad and hessian to be trained feed_dict = {x_in: x_trained, y_labeled: y_trained} if hess_approx_flag: jcb = jacobian(y_hat_model, p) grads = tf.stack( [tf.gradients(yi, p)[0] for yi in tf.unstack(y_hat_model, axis=1)], axis=1) print(grads.shape) # g_vals = sess_values.run(grads, feed_dict=feed_dict) t_jcb = tf.matmul(tf.transpose(jcb), jcb) j1 = jacobian_mse(y_hat_model, p, nm_set_points, wb_sizes_classif, wb_shapes) jt = tf.transpose(j1) partitioned = tf.dynamic_partition(j1, nm_set_points, 1, name='dynamic_unstack') print(len(partitioned)) l2_grad = 
tf.gradients(l2_norm_val, l2_p)[0] dxdt = tf.expand_dims(tf.gradients(all_reg0, lassop)[0], 1) hess_l2_ps = tf.hessians(l2_norm_val, l2_p)[0] print('The shape is;', j1.shape) jtj1 = tf.matmul(jt, j1) jtr1 = 2 * tf.matmul(jt, r1) l2grad = tf.expand_dims(l2_grad, 1) s_l2grad = tf.matmul(l2grad, tf.transpose(l2grad)) # compute gradient of l2 params reshaped_gradl2 = jtr1[0:shaped_new] reshaped_l20 = reshaped_gradl2 + lambda_param2 * l2grad # l2_p_grads, 1) # build another hessian jt_hess = jt[0:shaped_new] + lambda_param2 * l2grad # l2_p_grads, 1) jt_hess_end = tf.concat([jt_hess, jt[shaped_new:, :]], axis=0) j1_t = tf.transpose(jt_hess_end) # calculate gradient for lasso params group reshaped_gradl1 = jtr1[shaped_new:] reshaped_gradl0 = reshaped_gradl1 + lambda_param * dxdt # tf.expand_dims(dxdt, 1) #tf.sign(lasso_p), 1) # Assemble the lasso group jtj = tf.matmul(jt_hess_end, j1_t) jtr = tf.concat([reshaped_l20, reshaped_gradl0], axis=0) jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1)) # The other hess using hessian for in --> hid1 hess_part2 = jtj1[0:shaped_new, 0:shaped_new] + s_l2grad #hess_l2_ps# + h_mat_l2 hess_partsconc = tf.concat( [hess_part2, jtj1[0:shaped_new, shaped_new:]], axis=1) jtj3 = tf.concat([hess_partsconc, jtj1[shaped_new:, :]], axis=0) # remove it else: # remove it # stop_grads = tf.where(tf.math.equal(p, 0)) jtj = tf.squeeze(tf.hessians(loss_val, p)[0]) jtr = -tf.gradients(loss_val, [p])[ 0] # , stop_gradients=stop_grads, unconnected_gradients='zero')[0] jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1)) # jtj = hessian_multivar(loss_val, [p]) jtj_store = tf.Variable( tf.zeros((neurons_cnt_x1, neurons_cnt_x1), dtype=tf.float64)) jtr_store = tf.Variable(tf.zeros((neurons_cnt_x1, 1), dtype=tf.float64)) save_jtj_jtr = [tf.assign(jtj_store, jtj), tf.assign(jtr_store, jtr)] input_mat = jtj_store + tf.multiply(mu, I_mat) try: dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store) except: c = tf.constant(1, dtype=tf.float64) input_mat += 
np.identity(input_mat.shape) * c dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store) dx = tf.squeeze(dx) lm = opt_obj.apply_gradients([(-dx, p)]) # p2 = p.assign(p + dx) sess_values = kwargs_vals['sess'] feed_dict[mu] = np.array([0.01], dtype=np.float64) i_cnt = 0 step = 0 mat_values = [] sess_values.run(tf.global_variables_initializer()) current_loss = sess_values.run(loss_val, feed_dict) while feed_dict[mu] > 1e-6 and step < 500: p0 = sess_values.run(p) p_0_indices = np.where(p == 0) p0[p_0_indices] = 0.0 step += 1 sess_values.run(save_params_p) sess_values.run(restore_params_p) if math.log(step, 2).is_integer(): print('step', 'mu: ', 'current loss: ') print(step, feed_dict[mu][0], current_loss) success = False sess_values.run(jtj_store, feed_dict) sess_values.run(jtr_store, feed_dict) sess_values.run(save_jtj_jtr, feed_dict) for _ in range(400): # p0 equals session object with run of p2 and feed dict sess_values.run(jtj_store, feed_dict) sess_values.run(jtr_store, feed_dict) sess_values.run(save_jtj_jtr, feed_dict) sess_values.run(lm, feed_dict) p0 = sess_values.run(p) p0[np.where(p0 == 0)] = 0 values_vec = np.where(p0 == 0.0) p0[values_vec] = 0.0 new_loss = sess_values.run(loss_val, feed_dict) # sess_values.run(save_jtj_jtr, feed_dict) if new_loss < current_loss: # divide parameters to 2 groups: 1 for l1 and the other for structured l2 # shaped_new = np.int(wb_sizes_classif[0]) + np.int(wb_sizes_classif[1]) lasso_p0 = p0[shaped_new:] in2_hidden_params = p0[0:shaped_new] # mat_values.append(lasso_p0) mat_values.append(p0) i_cnt += 1 if len(mat_values) == 3: sgn1 = mat_values[0] * mat_values[1] sgn2 = mat_values[1] * mat_values[2] # send the parameters to compute the values of structured penalty after # checking if parameters are locally close to zero px = mat_values[2] osc_vec0 = np.where((sgn1 < 0.0) & (sgn2 < 0.0)) px[osc_vec0] = 0.0 # join both sets of parameter lists here px0 = tf.concat([in2_hidden_params, px], 0) if lambda_param2 > 0.0 and 
np.mod(step, 5) == 0: px0 = sess_values.run(px0) new_all_params, ws_bs_in1_hid1, condvec = func_compute_cond( px0, lambda_param2, kwargs_vals) else: new_all_params = np.array(sess_values.run(px0)) p0 = func_collect_allparams(new_all_params, wb_sizes_classif, wb_shapes) p.assign(p0) mat_values = [] # mat_values = [px] else: p.assign(p0) # sess_values.run(jtj_store, feed_dict) # sess_values.run(jtr_store, feed_dict) # sess_values.run(save_jtj_jtr, feed_dict) # sess_values.run(save_params_p) feed_dict[mu] /= 10 current_loss = new_loss success = True break else: feed_dict[mu] *= 10 p.assign(p0) # sess_values.run(save_params_p) sess_values.run(restore_params_p) # sess_values.run(save_jtj_jtr, feed_dict) # sess_values.run(save_params_p) if not success: print('Failed to improve') break p_new = sess_values.run(restore_params_p) abs_p = np.abs(p_new) idx_absp = np.where(abs_p < 0.01) p_new[idx_absp] = 0.0 new_all_params, ws_bs_in1_hid1, condvec = func_compute_cond( p_new, lambda_param2, kwargs_vals) p_new = func_collect_allparams(p_new, wb_sizes_classif, wb_shapes) # p_new[osc_vec0]=0.0 non_zero = np.count_nonzero(p_new) y_predict, x_inputs = func_pred_new(n_in, nn_1, p_new, **kwargs_vals) inw_hid1 = tf.reshape(p_new[0:shaped_new], shape=(wb_shapes[0][0] + wb_shapes[1][0], wb_shapes[0][1])) feed_dict2 = {x_inputs: x_trained} print('ENDED ON STEP: ', ' FINAL LOSS:') print(step, current_loss) print('Input -> hidden layer 1 Parameters: ') print(sess_values.run(inw_hid1)) # cv.close() y_model = sess_values.run(y_predict, feed_dict2) return restore_params_p, p_new, y_model, current_loss, non_zero
def loop_fn(i, use_pfor):
    """Jacobian of the model output for example ``i`` w.r.t. all variables.

    ``inp`` and ``model`` are taken from the surrounding scope.
    """
    # Select example i and keep a leading batch axis of size 1.
    example = array_ops.gather(inp, i)
    example = array_ops.expand_dims(example, 0)
    flat_output = array_ops.reshape(model(example), [-1])
    return gradients.jacobian(flat_output,
                              variables.trainable_variables(),
                              use_pfor=use_pfor)
def func_classifier_l2l1(xtest1, ytest, kwargs1, kwargspred, **kwargs):
    """Fit classifier parameters with a damped-Newton loop and L1 + L2 penalties.

    The function builds a TF1-style graph (placeholders + ``Session.run``)
    around a single flat parameter vector ``params0`` and minimises::

        loss + lambda_param * sum(|lasso_p|) + lambda_param2 * l2_norm_val

    where ``loss`` and ``l2_norm_val`` come from ``func_cross_entropy_loss``
    and ``lasso_p`` is the tail of ``params0`` after the first
    ``wb_sizes[0] + wb_sizes[1]`` entries.

    Each step solves ``(H + mu * I) dx = -grad`` and hands ``-dx`` to
    ``opt_obj.apply_gradients``.  ``mu`` starts at 0.1, is divided by 10 after
    an improving step and multiplied by 10 after a failing one.  The loop stops
    when ``mu <= 1e-10``, after 200 outer steps, or when 400 consecutive tries
    fail to reduce the loss.

    Args:
        xtest1: Test inputs.  Only used for an initial feed dict that is later
            replaced by the one returned from ``predclassif``.
        ytest: Test labels, used the same way as ``xtest1``.
        kwargs1: Mapping with keys ``'initializer'`` and ``'nclasses'``; also
            forwarded to ``func_cross_entropy_loss``.
        kwargspred: Mapping with keys ``'wb_shapes'``, ``'wb_sizes'``,
            ``'hidden'``, ``'activation'``, ``'xydat'``, ``'xydatrain'``,
            ``'xtr'``, ``'ytr'``, ``'sess'``, ``'neurons_cnt'`` and
            ``'opt_obj'``; also forwarded to ``func_compute_cond``.
        **kwargs: Must contain ``'mu'``, ``'mu_inc'`` and ``'mu_dec'``.  The
            values are read but not used by the current implementation.

    Returns:
        Tuple ``(p_new, correct_predictions, correct_predictions_train)``:
        the thresholded parameter vector and the evaluated
        ``y_hat_classif_logits`` for the ``xydat`` and ``xydatrain`` inputs.
    """
    hess_approx_flag = False
    initializer = kwargs1['initializer']
    # Read for their KeyError side effect only (same as the original unpacking);
    # the damping schedule below is hard-coded.
    _ = kwargs['mu'], kwargs['mu_inc'], kwargs['mu_dec']
    wb_shapes = kwargspred['wb_shapes']
    wb_sizes_classif = kwargspred['wb_sizes']
    hidden = kwargspred['hidden']
    activation = kwargspred['activation']
    xydat = kwargspred['xydat']
    ydatrain = kwargspred['xydatrain']
    x_in, nclasses = kwargspred['xtr'], kwargs1['nclasses']
    y_labeled = kwargspred['ytr']
    nm_set_points = x_in.shape[0]
    sess, neurons_cnt_x1 = kwargspred['sess'], kwargspred['neurons_cnt']
    opt_obj = kwargspred['opt_obj']

    params0 = tf.Variable(initializer([neurons_cnt_x1], dtype=tf.float64))
    loss, x, y, y_hat_model, l2_norm_val = func_cross_entropy_loss(
        wb_sizes_classif, params0, kwargs1)
    feed_dict = {x: x_in, y: y_labeled}
    feed_dict2 = {x: xtest1, y: ytest}

    # check paper and add selected features
    # add correlation for Park data set
    # tuning parameters
    lambda_param = 0.008
    lambda_param2 = 0.4

    # Damping factor of the LM-style step, fed through ``feed_dict``.
    mu = tf.placeholder(tf.float64, shape=[1])

    # Backup copy of the parameters plus ops to save / roll back.
    p_store = tf.Variable(tf.zeros([neurons_cnt_x1], dtype=tf.float64))
    save_params_p = tf.compat.v1.assign(p_store, params0)
    restore_params_p = tf.compat.v1.assign(params0, p_store)
    I_mat = tf.eye(neurons_cnt_x1, dtype=tf.float64)

    # ``np.int`` was removed in NumPy 1.24; the builtin is the replacement.
    shaped_new = int(wb_sizes_classif[0]) + int(wb_sizes_classif[1])
    lasso_p = params0[shaped_new:]
    l2_p = params0[0:shaped_new]
    print(lasso_p)
    all_reg0 = tf.reduce_sum(tf.abs(lasso_p))
    loss_val = loss + lambda_param * all_reg0 + lambda_param2 * l2_norm_val

    if hess_approx_flag:
        # NOTE(review): this branch is unreachable while ``hess_approx_flag``
        # is hard-coded to False.  It references ``r1``, which is not defined
        # in this function, so it would raise NameError if enabled.
        jcb = jacobian(y_hat_model, params0)
        t_jcb = tf.matmul(tf.transpose(jcb), jcb)
        j1 = jacobian_mse(y_hat_model, params0, nm_set_points,
                          wb_sizes_classif, wb_shapes)
        jt = tf.transpose(j1)
        partitioned = tf.dynamic_partition(j1, nm_set_points, 1,
                                           name='dynamic_unstack')
        print(len(partitioned))
        l2_grad = tf.gradients(l2_norm_val, l2_p)[0]
        dxdt = tf.expand_dims(tf.gradients(all_reg0, lasso_p)[0], 1)
        hess_l2_ps = tf.hessians(l2_norm_val, l2_p)[0]
        print('The shape is;', j1.shape)
        jtj1 = tf.matmul(jt, j1)
        jtr1 = 2 * tf.matmul(jt, r1)
        l2grad = tf.expand_dims(l2_grad, 1)
        s_l2grad = tf.matmul(l2grad, tf.transpose(l2grad))
        # compute gradient of l2 params
        reshaped_gradl2 = jtr1[0:shaped_new]
        reshaped_l20 = reshaped_gradl2 + lambda_param2 * l2grad
        # build another hessian
        jt_hess = jt[0:shaped_new] + lambda_param2 * l2grad
        jt_hess_end = tf.concat([jt_hess, jt[shaped_new:, :]], axis=0)
        j1_t = tf.transpose(jt_hess_end)
        # calculate gradient for lasso params group
        reshaped_gradl1 = jtr1[shaped_new:]
        reshaped_gradl0 = reshaped_gradl1 + lambda_param * dxdt
        # Assemble the lasso group
        jtj = tf.matmul(jt_hess_end, j1_t)
        jtr = tf.concat([reshaped_l20, reshaped_gradl0], axis=0)
        jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1))
        # The other hess using hessian for in --> hid1
        hess_part2 = jtj1[0:shaped_new, 0:shaped_new] + s_l2grad
        hess_partsconc = tf.concat(
            [hess_part2, jtj1[0:shaped_new, shaped_new:]], axis=1)
        jtj3 = tf.concat([hess_partsconc, jtj1[shaped_new:, :]], axis=0)
    else:
        # Exact Hessian and negated gradient of the penalised loss.
        jtj = tf.hessians(loss_val, params0)[0]
        jtr = -tf.gradients(loss_val, params0)[0]
        jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1))

    # Snapshots of the Hessian / gradient so the step uses the values taken at
    # the time ``save_jtj_jtr`` was last run.
    jtj_store = tf.Variable(
        tf.zeros((neurons_cnt_x1, neurons_cnt_x1), dtype=tf.float64))
    jtr_store = tf.Variable(tf.zeros((neurons_cnt_x1, 1), dtype=tf.float64))
    save_jtj_jtr = [tf.assign(jtj_store, jtj), tf.assign(jtr_store, jtr)]

    input_mat = jtj_store + tf.multiply(mu, I_mat)
    try:
        dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store)
    except (ValueError, TypeError, tf.errors.OpError):
        # Graph-construction failure only; a singular matrix surfaces later,
        # inside ``Session.run``.  The previous fallback passed a shape to
        # ``np.identity`` and therefore always raised itself.
        c = tf.constant(0.1, dtype=tf.float64)
        input_mat = input_mat + c * I_mat
        dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store)
    dx = tf.squeeze(dx)
    # presumably ``opt_obj`` is plain gradient descent with unit learning rate,
    # so that this adds ``dx`` to ``params0`` -- TODO confirm against caller.
    lm = opt_obj.apply_gradients([(-dx, params0)])

    sess_values = kwargspred['sess']
    feed_dict[mu] = np.array([0.1], dtype=np.float64)
    step = 0
    mat_values = []
    sess_values.run(tf.global_variables_initializer())
    current_loss = sess_values.run(loss_val, feed_dict)

    while feed_dict[mu] > 1e-10 and step < 200:
        p0 = sess_values.run(params0)
        # Compare the fetched array, not the graph variable.
        values_vec = np.where(p0 == 0)
        p0[values_vec] = 0.0
        step += 1
        sess.run(save_params_p)
        if math.log(step, 2).is_integer():
            print('step', 'mu: ', 'current loss: ')
            print(step, feed_dict[mu][0], current_loss)
        success = False
        sess_values.run(jtj_store, feed_dict)
        sess_values.run(p_store)
        for _ in range(400):
            sess_values.run(save_jtj_jtr, feed_dict)
            sess_values.run(jtj_store, feed_dict)
            sess_values.run(lm, feed_dict)
            p0 = sess_values.run(params0)
            values_vec = np.where(p0 == 0.0)
            p0[values_vec] = 0.0
            new_loss = sess_values.run(loss_val, feed_dict)
            if new_loss < current_loss:
                # divide parameters to 2 groups: 1 for l1 and the other for
                # structured l2
                lasso_p0 = p0[shaped_new:]
                in2_hidden_params = p0[0:shaped_new]
                mat_values.append(lasso_p0)
                if len(mat_values) == 3:
                    # A lasso parameter whose sign flipped on each of the last
                    # two accepted steps is treated as oscillating around zero
                    # and is clamped to zero.
                    sgn1 = mat_values[0] * mat_values[1]
                    sgn2 = mat_values[1] * mat_values[2]
                    px = mat_values[2]
                    values_vec = np.where((sgn1 < 0) & (sgn2 < 0))
                    px[values_vec] = 0.0
                    print(len(mat_values))
                    # Join both parameter groups.  Done in NumPy: the former
                    # ``np.concatenate(l2_params_set, new_p0)`` used undefined
                    # names, and ``tf.concat`` added a graph node per step.
                    px0 = np.concatenate([in2_hidden_params, px], 0)
                    if lambda_param2 > 0.0 and np.mod(step, 2) == 0:
                        new_all_params, ws_bs_in1_hid1, _ = func_compute_cond(
                            px0, lambda_param2, kwargspred)
                    else:
                        new_all_params = np.array(px0)
                    p_values_send = func_collect_allparams(
                        new_all_params, wb_sizes_classif, wb_shapes)
                    print(p_values_send.shape)
                    # NOTE(review): in graph mode this only builds an assign op
                    # that is never run, so ``params0`` is not modified here.
                    # Presumably a write-back was intended -- confirm.
                    params0.assign(p_values_send)
                    mat_values = [px]
                else:
                    # NOTE(review): un-run assign op, see above.
                    params0.assign(p0)
                feed_dict[mu] /= 10
                current_loss = new_loss
                success = True
                break
            else:
                feed_dict[mu] *= 10
                # NOTE(review): un-run assign op, see above.
                params0.assign(p0)
                # Roll back to the parameters saved at the start of this step.
                sess_values.run(restore_params_p)
        if not success:
            print('Failed to improve')
            break

    p_new = sess_values.run(restore_params_p)
    abs_p = np.abs(p_new)
    idx_absp = np.where(abs_p < 0.1)
    p_new[idx_absp] = 0.0
    # NOTE(review): ``values_vec`` may hold indices computed on the lasso
    # sub-vector (oscillation check) yet is applied to the full vector here;
    # kept as in the original -- confirm the intended offset.
    p_new[values_vec] = 0.0

    correct_prediction, feed_dict2, y_hat_classif_logits = predclassif(
        wb_sizes_classif, xydat, hidden, p_new, activation, wb_shapes,
        nclasses)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print('ENDED ON STEP: ')
    print(step)
    print(' FINAL LOSS:')
    print(current_loss)
    print('Parameters: ')
    print(sess_values.run(restore_params_p))
    print('Parameters: ')
    print(p_new)
    print("Accuracy:", sess.run(accuracy, feed_dict2))
    correct_predictions = sess.run(y_hat_classif_logits, feed_dict2)

    correct_prediction, feed_dict21, y_hat_classif_logits = predclassif(
        wb_sizes_classif, ydatrain, hidden, p_new, activation, wb_shapes,
        nclasses)
    correct_predictions_train = sess.run(y_hat_classif_logits, feed_dict21)
    return p_new, correct_predictions, correct_predictions_train
def loop_fn(i, use_pfor):
    """Compute the Jacobian of the flattened logits of image ``i``.

    ``images``, ``model`` and ``training`` are free names resolved from the
    surrounding scope.

    Args:
        i: Index of the image to gather from ``images``.
        use_pfor: Forwarded to ``gradients.jacobian`` as ``use_pfor``.

    Returns:
        The result of ``gradients.jacobian`` for the flattened logits with
        respect to ``variables.trainable_variables()``.
    """
    selected = array_ops.gather(images, i)
    raw_logits = model(selected, training=training)
    # Flatten to rank 1 so the Jacobian is taken over a vector.
    flat_logits = array_ops.reshape(raw_logits, [-1])
    # Collected after the forward pass, matching the original evaluation order.
    weights = variables.trainable_variables()
    return gradients.jacobian(flat_logits, weights, use_pfor=use_pfor)
def empirical_NTK(model, train_images):
    """Compute the empirical neural tangent kernel of ``model`` with MPI.

    The trainable weight tensors ("layers") are split across MPI ranks.  Each
    rank builds the Jacobian of ``model.output`` with respect to its own
    layers, accumulates ``J @ J.T`` over input blocks of 25 examples, and the
    per-rank partial kernels are summed onto rank 0 with ``comm.Reduce``.

    Fixes relative to the previous version:

    * Per-layer Jacobians are concatenated along the parameter axis
      (``axis=1``).  Concatenating along axis 0 broke whenever a rank owned
      more than one layer.
    * Only blocks with ``j2 >= j1`` are computed, so every off-diagonal block
      is now mirrored into the lower triangle.  Previously the final
      ``(K + K.T) / 2`` halved all off-diagonal blocks.
    * Ranks that own no layer (more ranks than layers) contribute a zero
      matrix instead of raising NameError before the collective ``Reduce``,
      which would hang the remaining ranks.

    Args:
        model: Keras model; it is (re)compiled here with an identity loss.
        train_images: Inputs supporting ``len``, slicing and ``.shape``
            (presumably a NumPy array -- TODO confirm).

    Returns:
        On rank 0, the ``(m, m)`` kernel summed over all ranks and
        symmetrised.  On other ranks, that rank's local partial kernel.
        NOTE(review): the collapsed source does not show whether the original
        ``return`` sat inside the ``rank == 0`` branch; confirm with callers.
    """
    from mpi4py import MPI
    import tensorflow.keras.backend as K

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print(rank)

    model.compile("sgd", loss=lambda target, pred: pred)

    all_weights = list(model.trainable_weights)
    num_layers = len(all_weights)

    # Assign layers to ranks: an equal contiguous share each, with the
    # remainder layers handed out one per rank starting from rank 0.
    num_chunks = min(size, num_layers)
    chunks = []
    if rank < num_chunks:
        layers_per_chunk = num_layers // num_chunks
        chunks = list(
            range(rank * layers_per_chunk, (rank + 1) * layers_per_chunk))
        if rank < num_layers % num_chunks:
            chunks.append(num_chunks * layers_per_chunk + rank)

    m = len(train_images)
    Y = np.zeros((m, 1))
    NTK = np.zeros((m, m))

    if chunks:
        grads = jacobian(model.output, [all_weights[k] for k in chunks])
        f = K.function(model._feed_inputs, grads)

        def get_weight_grad(model, inputs, outputs):
            """Return the (batch, n_params) Jacobian for this rank's layers."""
            x, _, _ = model._standardize_user_data(inputs, outputs)
            batch_size = inputs.shape[0]
            output_grad = f(x)
            print(output_grad[0].shape)
            # One row per example; layers are stacked along the parameter axis.
            return np.concatenate(
                [g.reshape((batch_size, -1)) for g in output_grad], axis=1)

        # Inputs are processed in blocks on both kernel axes: fewer, larger
        # NumPy calls are cheaper than many small Python-level iterations.
        chunk_size = 25
        num_blocks = -(-m // chunk_size)  # ceiling division
        for j1 in range(num_blocks):
            start1 = j1 * chunk_size
            stop1 = min(start1 + chunk_size, m)
            print("chunk", j1, "out of", num_blocks, flush=True)
            jac1 = get_weight_grad(model, train_images[start1:stop1],
                                   Y[start1:stop1])
            print(jac1.shape)
            for j2 in range(j1, num_blocks):
                start2 = j2 * chunk_size
                stop2 = min(start2 + chunk_size, m)
                print(j1, j2)
                if j2 == j1:
                    # Diagonal block: reuse the Jacobian already computed.
                    jac2 = jac1
                else:
                    jac2 = get_weight_grad(model, train_images[start2:stop2],
                                           Y[start2:stop2])
                block = np.matmul(jac1, jac2.T)
                NTK[start1:stop1, start2:stop2] += block
                if j2 != j1:
                    # Mirror into the lower triangle (the kernel is symmetric).
                    NTK[start2:stop2, start1:stop1] += block.T

    # Collective call: every rank must reach it, including idle ones.
    ntk_recv = None
    if rank == 0:
        ntk_recv = np.zeros_like(NTK)
    comm.Reduce(NTK, ntk_recv, op=MPI.SUM, root=0)
    if rank == 0:
        # Removes floating-point asymmetry only; both triangles are filled.
        NTK = (ntk_recv + ntk_recv.T) / 2
    return NTK
def getGrad2(image):
    """Return ``jacobian(concepts, x)`` for the given image.

    ``model`` and ``jacobian`` are free names resolved from the surrounding
    scope.  ``model.helper(image)`` is expected to return three values; the
    first two (``x`` and ``concepts``) are passed to ``jacobian``.

    Args:
        image: Input forwarded unchanged to ``model.helper``.

    Returns:
        Whatever ``jacobian(concepts, x)`` returns.
    """
    # The tape object itself is never queried; the context is kept open around
    # both calls.  NOTE(review): the collapsed source does not show where the
    # ``with`` body ended -- confirm.
    with tf.GradientTape(persistent=True) as tape:
        helper_outputs = model.helper(image)
        x, concepts, _ = helper_outputs
        return jacobian(concepts, x)
def loop_fn(i, use_pfor):
    """Jacobian of the model logits for the ``i``-th image.

    Uses ``images``, ``model`` and ``training`` from the surrounding scope.

    Args:
        i: Index into ``images``.
        use_pfor: Passed through to ``gradients.jacobian``.

    Returns:
        ``gradients.jacobian`` of the rank-1 logits with respect to
        ``variables.trainable_variables()``.
    """
    picked_image = array_ops.gather(images, i)
    # Run the forward pass, then flatten the logits to a vector.
    logits_vec = array_ops.reshape(
        model(picked_image, training=training), [-1])
    # The variable list is read after the forward pass, as before.
    trainables = variables.trainable_variables()
    return gradients.jacobian(logits_vec, trainables, use_pfor=use_pfor)
def loop_fn(i, use_pfor):
    """Jacobian of the model output for the ``i``-th row of ``inp``.

    Uses ``inp`` and ``model`` from the surrounding scope.

    Args:
        i: Index into ``inp``.
        use_pfor: Passed through to ``gradients.jacobian``.

    Returns:
        ``gradients.jacobian`` of the rank-1 model output with respect to
        ``variables.trainable_variables()``.
    """
    row = array_ops.gather(inp, i)
    # Add a leading axis of size 1 before calling the model.
    row_batch = array_ops.expand_dims(row, 0)
    output_vec = array_ops.reshape(model(row_batch), [-1])
    # The variable list is read after the forward pass, as before.
    trainables = variables.trainable_variables()
    return gradients.jacobian(output_vec, trainables, use_pfor=use_pfor)