def test_secondary_gradient(self):
    def P(state):
        p = tf.multiply(state, 2)
        p = tf.add(p, 10)
        return p

    def Q(action):
        return tf.multiply(action, 3)

    state = tf.constant(1.)
    with tfe.GradientTape() as tape:
        tape.watch(state)
        a = P(state)  # 12
        print("a", a)
        with tfe.GradientTape() as dqda_tape:
            dqda_tape.watch(a)
            q = Q(a)
            print("Q", q)
        dqda = dqda_tape.gradient(q, a)
        print("DQDA", dqda)
        loss = -dqda * a
        print("Loss:", loss)
    grads = tape.gradient(loss, state)
    print(grads)
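# Worked values for test_secondary_gradient above (derived by hand, not part
# of the original test): P(1) = 1*2 + 10 = 12, Q(a) = 3a, so dq/da = 3,
# loss = -3 * 12 = -36, and d(loss)/d(state) = -dqda * dP/d(state) = -3 * 2 = -6.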
def train_vanilla_ddpg_policy(policy_func, q_func, model, optimizer, state):
    with tfe.GradientTape() as tape:
        action = policy_func(state, training=False)
        with tfe.GradientTape() as dqda_tape:
            dqda_tape.watch(action)
            q = q_func([state, action])
        dqda = dqda_tape.gradient(q, action)
        loss = -dqda * action
    grads = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables),
                              global_step=tf.train.get_or_create_global_step())
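# A minimal usage sketch for train_vanilla_ddpg_policy above. The actor/critic
# networks (`policy_net`, `q_net`) and the tensor shapes are assumptions made
# for illustration only; they are not part of the original code.
def _example_ddpg_policy_update():
    policy_net = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # hypothetical actor
    q_net = tf.keras.layers.Dense(1)                              # hypothetical critic head
    optimizer = tf.train.AdamOptimizer(1e-3)
    state = tf.random_normal([8, 4])  # batch of 8 states, 4 features each
    policy_func = lambda s, training: policy_net(s, training=training)
    q_func = lambda sa: q_net(tf.concat(sa, axis=-1))  # Q([state, action])
    train_vanilla_ddpg_policy(policy_func, q_func, policy_net, optimizer, state)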
def train_one_step(model, images, all_labels, optimizer):
    with tfe.GradientTape() as tape:
        logits = model(np.array(images), training=True, visualize=0)
        if params.method == 'pred_matrix':
            labels = np.array(all_labels)
            labels.shape = (len(all_labels), labels.shape[1] * labels.shape[2])
            loss_nbr = tf.losses.sigmoid_cross_entropy(labels, logits)
            loss_dst = 0
        elif params.method == 'est_dist_ths':
            labels = np.array([l[0] for l in all_labels])
            loss_nbr = tf.losses.sigmoid_cross_entropy(np.array(labels), logits[:, :4])
            dist_est_lbls = np.array([l[1] for l in all_labels])
            dist_est_prd = logits[:, 4:]
            loss_dst_upper = tf.reduce_mean(
                tf.nn.relu(-(dist_est_lbls[:, 1:] - dist_est_prd)))
            loss_dst_lower = tf.reduce_mean(
                tf.nn.relu((dist_est_lbls[:, :1] - dist_est_prd)))
            loss_dst = loss_dst_upper + loss_dst_lower
            tf.contrib.summary.scalar('loss_dst', loss_dst)
        else:
            # Default case: only the neighbour loss is used, no distance loss.
            labels = np.array(all_labels)
            loss_nbr = tf.losses.sigmoid_cross_entropy(labels, logits[:, :4])
            loss_dst = 0
        loss = loss_dst + loss_nbr
    grads = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables),
                              tf.train.get_or_create_global_step())
    return loss
def train(model, optimizer, dataset, log_interval=None):
    """Trains model on `dataset` using `optimizer`."""
    global_step = tf.train.get_or_create_global_step()
    start = time.time()
    for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
        with tf.contrib.summary.record_summaries_every_n_global_steps(10):
            # Record the operations used to compute the loss given the input,
            # so that the gradient of the loss with respect to the variables
            # can be computed.
            with tfe.GradientTape() as tape:
                logits = model(images, training=True)
                loss_value = loss(logits, labels)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar('accuracy', compute_accuracy(logits, labels))
            grads = tape.gradient(loss_value, model.variables)
            optimizer.apply_gradients(zip(grads, model.variables),
                                      global_step=global_step)
            if log_interval and batch % log_interval == 0:
                rate = log_interval / (time.time() - start)
                print('Step #%d\tLoss: %.6f (%d steps/sec)' % (batch, loss_value, rate))
                start = time.time()
def compute_grads(self, imgs, labels):
    with tfe.GradientTape() as tape:
        logits, _ = self.model(imgs)
        loss = self.compute_loss(labels, logits)
    return tape.gradient(loss, self.model.variables), loss
def train_batch(self, labels, premise, premise_transition, hypothesis,
                hypothesis_transition):
    """Train model on batch of data.

    Args:
      labels: The truth labels, with shape (batch_size,).
      premise: The word indices of the premise sentences, with shape
        (max_prem_seq_len, batch_size).
      premise_transition: The transitions for the premise sentences, with shape
        (max_prem_seq_len * 2 - 3, batch_size).
      hypothesis: The word indices of the hypothesis sentences, with shape
        (max_hypo_seq_len, batch_size).
      hypothesis_transition: The transitions for the hypothesis sentences, with
        shape (max_hypo_seq_len * 2 - 3, batch_size).

    Returns:
      1. loss value as a scalar `Tensor`.
      2. logits as a dense `Tensor` of shape (batch_size, d_out), where d_out
         is the output dimension size of the SNLIClassifier.
    """
    with tfe.GradientTape() as tape:
        tape.watch(self._model.variables)
        logits = self._model(premise, premise_transition, hypothesis,
                             hypothesis_transition, training=True)
        loss = self.loss(labels, logits)
    gradients = tape.gradient(loss, self._model.variables)
    self._optimizer.apply_gradients(zip(gradients, self._model.variables),
                                    global_step=tf.train.get_global_step())
    return loss, logits
def train(self, dataset):
    '''Trains the model for one epoch.'''
    epoch_loss = tf.constant(0.)
    for idx_batch, data in enumerate(tfe.Iterator(dataset)):
        with tfe.GradientTape() as tape:
            # forward pass
            predictions = self.forward(data[0])
            # reverse x & y axis
            predictions = tf.concat([predictions[..., 1::-1],
                                     predictions[..., 3:1:-1],
                                     predictions[..., 4:]], axis=-1)
            # compute loss
            loss = self.get_loss(predictions, data[1])
        # backward pass (compute gradients)
        gradients = tape.gradient(loss, self.variables)
        # update parameters
        self.optimizer.apply_gradients(
            zip(gradients, self.variables),
            global_step=tf.train.get_or_create_global_step()
        )
        epoch_loss += loss
        print('Batch:', idx_batch, '| Loss =', loss.numpy(), '\t', end='\r')
    return (epoch_loss / (idx_batch + 1)).numpy()
def training_loop(model, num_iterations=8000):
    optimizer = tf.train.AdamOptimizer()
    dataset = dataset_from_stage('train')
    data_iterator = tfe.Iterator(dataset)
    val_dataset = dataset_from_stage('valid')
    val_iterator = tfe.Iterator(val_dataset)
    for i in range(num_iterations):
        x, y = next(data_iterator)
        with tfe.GradientTape() as tape:
            loss, _ = predict(model, x, y)
        grads = tape.gradient(loss, model.get_variables())
        optimizer.apply_gradients(zip(grads, model.get_variables()))
        if i % 200 == 0:
            xval, yval = next(val_iterator)
            val_loss, _ = predict(model, xval, yval, accuracy=True)
            print("Validation accuracy: {:.4f}".format(val_loss))
            print("Current loss: {:.4f}".format(loss))
            model.save(itn=i)
    return model
def grad(points, vr_points):
    with tfe.GradientTape() as tape:
        if modified_loss:
            loss_value, loss_no_rglrz = charmer_distance_mod(points, vr_points)
        else:
            loss_value, loss_no_rglrz = charmer_distance(points, vr_points)
    loss_grad, = tape.gradient(loss_value, [vr_points])
    return loss_grad, loss_value, loss_no_rglrz
def grads_fn(self, input_data, target):
    """Dynamically computes the gradients of the loss value with respect to
    the parameters of the model, in each forward pass.
    """
    with tfe.GradientTape() as tape:
        loss = self.loss_fn(input_data, target)
    return tape.gradient(loss, self.variables)
def grad_G(self, Z, training):
    """Calculate the gradient of the batch for the generator.

    Args:
      Z: noise vector
    """
    with tfe.GradientTape() as tape:
        loss_val = self.loss_G(Z, training)
    return tape.gradient(loss_val, self.generator.variables), loss_val
def grads_fn(self, batch_state, batch_next_state, batch_reward, batch_action):
    """Dynamically computes the gradients of the loss value with respect to
    the parameters of the model, in each forward pass.
    """
    with tfe.GradientTape() as tape:
        loss = self.loss_fn(batch_state, batch_next_state, batch_reward,
                            batch_action)
    return tape.gradient(loss, self.variables)
def grads_fn_q_one_hot(self, states, q_target, actions):
    """Dynamically computes the gradients of the loss value with respect to
    the parameters of the model, in each forward pass.
    """
    with tfe.GradientTape() as tape:
        loss = self.loss_q_one_hot(states, q_target, actions)
    return tape.gradient(loss, self.variables)
def grad(self, X):
    """Calculate the gradient of the batch.

    Args:
      X: input tensor
    """
    with tfe.GradientTape() as tape:
        loss_val, recon_loss, kl_loss = self.loss(X)
    return tape.gradient(loss_val, self.variables), loss_val, recon_loss, kl_loss
def grad(self, positive, negative, depth):
    with tfe.GradientTape() as tape:
        loss_value = self.loss(positive, negative, depth)
        weight_decay = 0.0  # This will cause local minimum?
        regularization = 0
        for weights in self.__embeddings.variables:
            regularization += tf.nn.l2_loss(weights) * weight_decay
        loss_value += regularization / len(self.__embeddings.variables)
    return tape.gradient(loss_value, list(self.__embeddings.variables))
def main(argv):
    train_data, valid_data, cvae = load_all_data_and_build_model()
    times = {}
    global loss_times
    loss_times = {}
    optimizer = tf.train.AdamOptimizer()

    # training cycle
    node_count = []
    node_depth = []
    node_arity = []
    for i in range(FLAGS.benchmark_runs):
        with Measure('data', times):
            train_data_iter = train_data.iter(0, FLAGS.ignore_leaves)
            xs, ys = next(train_data_iter)
            node_count.append(
                list(map(lambda t: t.calculate_node_count(), xs + ys)))
            node_depth.append(
                list(map(lambda t: t.calculate_max_depth(), xs + ys)))
            node_arity.extend(arities(xs + ys))

        with tfe.GradientTape() as tape:
            with Measure('compute', times):
                kld_all, recons = cvae.get_loss_components_trees(
                    xs, ys, FLAGS.n_sample)
            with Measure('loss', times):
                struct_loss_all, val_loss_all = recons.reconstruction_loss()
                loss = tf.reduce_sum(kld_all) + tf.reduce_sum(
                    struct_loss_all + val_loss_all)

        with Measure('grad', times):
            grad = tape.gradient(loss, cvae.variables)

        with Measure('apply', times):
            optimizer.apply_gradients(
                zip(grad, cvae.variables),
                global_step=tf.train.get_or_create_global_step())

    # Printing output
    print("#ALL")
    tot_avg, tot_sum = Measure.print_times(times)
    print("\nNodes: {0:.1f} ({1:.1f})".format(np.mean(node_count), np.std(node_count)))
    print("Depths: {0:.1f} ({1:.1f})".format(np.mean(node_depth), np.std(node_depth)))
    print("Arities: {0:.1f} ({1:.1f})".format(np.mean(node_arity), np.std(node_arity)))
    print((np.sum(node_count) / tot_sum),
          FLAGS.batch_size * FLAGS.benchmark_runs / tot_sum)
def train_vanilla_pg_value(value_func, model, optimizer, inputs, targets):
    with tfe.GradientTape() as tape:
        v = value_func(inputs, training=True)
        loss = tf.losses.mean_squared_error(targets, v)
    grads = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables),
                              global_step=tf.train.get_or_create_global_step())
def learn_03():
    w = tfe.Variable([[1.0]])
    with tfe.GradientTape() as tape:
        loss = w * w
    grad = tape.gradient(loss, [w])
    print(grad)
    # => [<tf.Tensor: id=31, shape=(1, 1), dtype=float32, numpy=array([[2.]], dtype=float32)>]
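# A companion sketch to learn_03 (an addition, not from the original code):
# when differentiating with respect to a plain constant tensor instead of a
# tfe.Variable, the tensor must be watched explicitly or gradient() returns None.
def learn_03b():
    x = tf.constant(3.0)
    with tfe.GradientTape() as tape:
        tape.watch(x)            # constants are not watched automatically
        y = x * x
    dy_dx = tape.gradient(y, x)  # d(x^2)/dx = 2x = 6.0
    print(dy_dx)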
def grad_D(self, Z, real, training):
    """Calculate the gradient of the batch for the discriminator.

    Args:
      Z: noise vector
      real: real image
    """
    with tfe.GradientTape() as tape:
        loss_val = self.loss_D(Z, real, training)
    return tape.gradient(loss_val, self.discriminator.variables), loss_val
def train_one_step(model, images, labels, optimizer):
    with tfe.GradientTape() as tape:
        logits = model(images, training=True)
        loss = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels)
        tf.contrib.summary.scalar(name='loss', tensor=loss)
    grads = tape.gradient(loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables))
def grads_fn(self, X, y, seq_length, is_training):
    """Dynamically computes the gradients of the loss value with respect to
    the parameters of the model, in each forward pass.
    Apart from the arguments passed to loss_fn, this function is identical
    across all the models.
    """
    with tfe.GradientTape() as tape:
        loss = self.loss_fn(X, y, seq_length, is_training)
    return tape.gradient(loss, self.variables)
def train(num_episodes=1000, save_every=100, checkpoint_dir="checkpoints",
          tensorboard_dir="tensorboard", tboard_every=10, find_target_prop=0):
    pol = Policy()
    writer = tf.contrib.summary.create_file_writer(tensorboard_dir)
    for j in range(1, num_episodes + 1):
        random_secret = random.randint(0, config.max_guesses - 1)
        e = Episode(pol, random_secret, find_target_prop, True)
        history = e.generate()
        print("Episode:{}, length: {}".format(j, len(history)))
        G = -1
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=config.reinforce_alpha * G)
        for i in reversed(range(1, len(history))):
            history_so_far = history[:i]
            next_action, _ = history[i]
            with tfe.GradientTape() as tape:
                action_logits = pol(history_so_far, with_softmax=False)
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(tf.convert_to_tensor([next_action]),
                                      config.max_guesses),
                    logits=action_logits)
            grads = tape.gradient(loss, pol.variables)
            optimizer.apply_gradients(zip(grads, pol.variables))
            G -= 1
            # Hack: reset the optimizer's private learning-rate attributes so the
            # updated return G takes effect. Should be able to pass a callable as
            # learning_rate instead, see
            # https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer#args
            optimizer._learning_rate = G * config.reinforce_alpha
            optimizer._learning_rate_tensor = None
            sys.stdout.write("{}/{}\r".format(len(history) - i, len(history)))
        if j % save_every == 0 or j == num_episodes:
            saver = tfe.Saver(pol.named_variables)
            save_path = os.path.join(
                checkpoint_dir,
                "episode{}".format(str(j).zfill(len(str(num_episodes)))))
            saver.save(save_path)
        if j % tboard_every == 0:
            with writer.as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar('total_return',
                                              tf.convert_to_tensor([G]), step=j)
    return pol
def grads_fn(self, input_data, target):
    '''Computes the gradients for each pass; note the return statement is
    aligned with (i.e. outside) the `with` block.

    :param input_data:
    :param target:
    :return:
    '''
    with tfe.GradientTape() as tape:
        loss = self.loss_fn(input_data, target)
    return tape.gradient(loss, self.variables)
def grad(self, X, y, training):
    """Calculate the gradient of the batch.

    Args:
      X: input tensor
      y: target label (class number)
      training: whether to apply dropout or not
    """
    with tfe.GradientTape() as tape:
        loss_value, _ = self.loss(X, y, training)
    return tape.gradient(loss_value, self.variables), loss_value
def grad_both(self, Z, real, training):
    """Calculate the gradient of the batch for both generator and discriminator.

    Args:
      Z: noise vector
      real: real image
    """
    with tfe.GradientTape(persistent=True) as tape:
        loss_G = self.loss_G(Z, training)
        loss_D = self.loss_D(Z, real, training)
    return (tape.gradient(loss_G, self.generator.variables),
            tape.gradient(loss_D, self.discriminator.variables),
            loss_G, loss_D)
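# Note on grad_both above: a non-persistent GradientTape may only be queried
# once, so computing generator and discriminator gradients from the same tape
# requires persistent=True. Minimal sketch with illustrative values (an
# addition, not from the original code):
def _persistent_tape_example():
    x = tf.constant(2.0)
    with tfe.GradientTape(persistent=True) as tape:
        tape.watch(x)
        y1 = x * x
        y2 = x * x * x
    g1 = tape.gradient(y1, x)  # 2x = 4.0
    g2 = tape.gradient(y2, x)  # 3x^2 = 12.0; legal only because persistent=True
    del tape                   # release tape resources once done
    print(g1, g2)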
def fisher_vector_product(v, model, states, logits_old):
    with tfe.GradientTape() as t1:
        with tfe.GradientTape() as t2:
            t1.watch(model.variables)
            t2.watch(model.variables)
            loss = model.test(states, logits_old, v)
        grads = t2.gradient(loss, model.variables)
        grads_flat = tf.concat(
            [tf.reshape(grad_, [-1]) for grad_ in grads if grad_ is not None],
            axis=0)
        grads_v = tf.reduce_sum(grads_flat * v)
    grads_grads_v = t1.gradient(grads_v, model.variables)
    return np.array(
        tf.concat([
            tf.reshape(grad_, [-1])
            for grad_ in grads_grads_v if grad_ is not None
        ], axis=0))
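# A minimal sketch of the nested-tape pattern used in fisher_vector_product
# above (assumed example values, not from the original code): the outer tape
# records the inner tape's gradient computation, which yields a second derivative.
def _nested_tape_example():
    x = tf.constant(1.0)
    with tfe.GradientTape() as outer:
        outer.watch(x)
        with tfe.GradientTape() as inner:
            inner.watch(x)
            y = x * x * x
        dy_dx = inner.gradient(y, x)   # 3x^2 = 3.0
    d2y_dx2 = outer.gradient(dy_dx, x)  # 6x = 6.0
    print(dy_dx, d2y_dx2)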
def grad(self):
    with tfe.GradientTape() as tape:
        loss_value = self.loss(-1)
        weight_decay = 0.0
        regularization = 0
        for weights in self.__all_variables():
            weights = tf.nn.softmax(weights)
            regularization += tf.reduce_sum(tf.sqrt(weights)) * weight_decay
        loss_value += regularization / len(self.__all_variables())
    return tape.gradient(loss_value, self.__all_variables())
def grad_actor(self, X):
    """Get the gradient of a training batch.

    Args:
      X: input features batch, shape of (batch_size, input_shape)

    Returns:
      (gradient of actor variables, loss of batch)
    """
    with tfe.GradientTape() as tape:
        loss_val = self.loss_actor(X)
    return tape.gradient(loss_val, self.actor_active.variables), loss_val
def _train_eager_one_epoch(self):
    for (batch, (features, labels)) in enumerate(tfe.Iterator(self.data['train'])):
        with tfe.GradientTape() as tape:
            logits = self.model(features, training=True)
            train_loss = self.loss(labels, logits)
            train_accuracy = self.accuracy(labels, logits)
        grads = tape.gradient(train_loss, self.model.variables)
        self.optimizer.apply_gradients(zip(grads, self.model.variables),
                                       global_step=self.step_counter)
        summary.scalar('loss', train_loss)
        summary.scalar('accuracy', train_accuracy)
def train_one_epoch(model, loss, optimizer, dataset, log_interval=None):
    tf.train.get_or_create_global_step()
    for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
        with tf.contrib.summary.record_summaries_every_n_global_steps(10):
            with tfe.GradientTape() as tape:
                prediction = model(images, training=True)
                loss_value = loss(prediction, labels)
                tf.contrib.summary.scalar('loss', loss_value)
            grads = tape.gradient(loss_value, model.variables)
            optimizer.apply_gradients(zip(grads, model.variables))
            if log_interval and batch % log_interval == 0:
                print('Batch #%d\tLoss: %.6f' % (batch, loss_value))