def testSecondGradient(self):
    """Numerically checks the derivative of the cross-entropy gradient.

    Sums the cross entropy of a 12-element float64 labels/logits pair,
    differentiates it once with respect to the logits, and runs the gradient
    checker on that first derivative. Also asserts that a `BatchMatMul` op is
    present in the session graph.
    """
    label_values = [
        0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0,
        0.5 / 3
    ]
    logit_values = [
        0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4
    ]
    with self.test_session() as sess:
        labels = constant_op.constant(
            label_values, shape=[12], dtype=dtypes.float64, name="l")
        logits = constant_op.constant(
            logit_values, shape=[12], dtype=dtypes.float64, name="f")
        xent = nn_ops.softmax_cross_entropy_with_logits(
            labels=labels, logits=logits, name="xent")
        total = math_ops.reduce_sum(xent)
        first_grad = gradients_impl.gradients(total, [logits])[0]
        err = gradient_checker.compute_gradient_error(
            logits, [12], first_grad, [12])

        # Check that the second derivative is calculated: the xent gradient
        # implementation shows up as a `BatchMatMul` op in the graph.
        op_names = []
        for graph_op in sess.graph.get_operations():
            if graph_op.op_def:
                op_names.append(graph_op.op_def.name)
        self.assertIn("BatchMatMul", op_names)

        print("cross entropy hessian err = ", err)
        self.assertLess(err, 5e-8)
def testGradient(self):
    """Numerically checks the first derivative of softmax cross entropy.

    Uses a [3, 4] float64 labels/logits pair and additionally asserts that no
    `BatchMatMul` op was added to the graph.
    """
    label_values = [
        0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5
    ]
    logit_values = [
        0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4
    ]
    with self.test_session() as sess:
        labels = constant_op.constant(
            label_values, shape=[3, 4], dtype=dtypes.float64, name="l")
        logits = constant_op.constant(
            logit_values, shape=[3, 4], dtype=dtypes.float64, name="f")
        xent = nn_ops.softmax_cross_entropy_with_logits(
            labels=labels, logits=logits, name="xent")
        err = gradient_checker.compute_gradient_error(
            logits, [3, 4], xent, [3])

        # No extra computation should be performed: when only the first
        # derivative is requested the second derivative must not be built,
        # so there is no `BatchMatMul` op in the graph.
        op_names = []
        for graph_op in sess.graph.get_operations():
            if graph_op.op_def:
                op_names.append(graph_op.op_def.name)
        self.assertNotIn("BatchMatMul", op_names)

        print("cross entropy gradient err = ", err)
        self.assertLess(err, 5e-8)
def benchmarkSingleClass(self):
    """Benchmarks softmax cross entropy on a fixed single-class input.

    Iterates over every (m, n, p, use_gpu) combination, skips those where
    int(p * n) is zero, and reports wall time plus a throughput figure
    derived from m * n.
    """
    row_counts = [128]
    col_counts = [10, 100, 1000, 10000, 100000]
    fractions = [0.001, 0.01, 0.5, 0.99, 1.0]
    gpu_flags = [False]
    for m, n, p, use_gpu in itertools.product(
            row_counts, col_counts, fractions, gpu_flags):
        k = int(p * n)
        if k == 0:
            continue
        name = "single_class_m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
        device = "/%s:0" % ("gpu" if use_gpu else "cpu")
        with ops.Graph().as_default():
            with ops.device(device):
                # NOTE(review): the benchmarked tensors are fixed 3x1
                # constants; m, n and k only feed the benchmark name and the
                # throughput figure below. Confirm that this is intended.
                labels = constant_op.constant(
                    [[1.], [-1.], [0.]], dtype=dtypes.float32)
                logits = constant_op.constant(
                    [[-1.], [0.], [1.]], dtype=dtypes.float32)
                op = nn_ops.softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)
            with session.Session() as sess:
                r = self.run_op_benchmark(
                    sess, op, min_iters=100, name=name)
                wall_time = r["wall_time"]
                gb_processed_input = m * n / 1.0e9
                throughput = gb_processed_input / wall_time
                print("Benchmark: %s \t wall_time: %0.03g s \t "
                      "Throughput: %0.03g GB/s" % (name, wall_time,
                                                   throughput))
                sys.stdout.flush()
def _entropy(self):
    """Returns the entropy as the cross entropy of softmax(logits) with logits.

    The logits are flattened to `[-1, num_classes]`, the softmax of those
    logits is used as the label distribution, and the per-row loss is
    reshaped back to the batch shape.
    """
    logits_2d = array_ops.reshape(
        self.logits, array_ops.pack([-1, self.num_classes]))
    histogram_2d = nn_ops.softmax(logits_2d)
    # Keyword arguments keep the call valid under both the old
    # (logits, labels) positional signature and newer releases that only
    # accept named arguments.
    ret = array_ops.reshape(
        nn_ops.softmax_cross_entropy_with_logits(
            logits=logits_2d, labels=histogram_2d),
        self.batch_shape())
    ret.set_shape(self.get_batch_shape())
    return ret
def testZeroDimension(self):
    """Checks that an input with a zero-sized leading dimension matches NumPy."""
    empty_shape = [0, 2, 4]
    features = np.zeros(empty_shape).astype(np.float32)
    labels = np.zeros(empty_shape).astype(np.float32)
    expected_loss, _ = self._npXent(features, labels)
    with self.test_session(use_gpu=True) as sess:
        xent = nn_ops.softmax_cross_entropy_with_logits(
            labels=labels, logits=features)
        actual_loss = sess.run(xent)
    self.assertAllEqual(expected_loss, actual_loss)
def _testXentWrapper(self, np_features, np_labels, dim=-1, use_gpu=False):
    """Compares the TF cross-entropy loss against the NumPy reference.

    Args:
      np_features: logits passed to both implementations.
      np_labels: labels passed to both implementations.
      dim: forwarded as `dim` to both implementations.
      use_gpu: forwarded to `test_session`.
    """
    expected_loss, _ = self._npXent(np_features, np_labels, dim=dim)
    with self.test_session(use_gpu=use_gpu) as sess:
        xent = nn_ops.softmax_cross_entropy_with_logits(
            labels=np_labels, logits=np_features, dim=dim)
        actual_loss = sess.run(xent)
        print("np_loss:", expected_loss)
        print("tf_loss:", actual_loss)
        self.assertAllCloseAccordingToType(expected_loss, actual_loss)
def _entropy(self):
    """Returns the entropy as the cross entropy of softmax(logits) with logits.

    Logits that are not already rank 2 are flattened to `[-1, num_classes]`;
    the per-row loss is reshaped back to the batch shape.
    """
    if self.logits.get_shape().ndims == 2:
        logits_2d = self.logits
    else:
        logits_2d = array_ops.reshape(self.logits, [-1, self.num_classes])
    histogram_2d = nn_ops.softmax(logits_2d)
    # Named arguments: valid under the old positional (logits, labels)
    # signature and required by releases that reject positional use.
    ret = array_ops.reshape(
        nn_ops.softmax_cross_entropy_with_logits(
            logits=logits_2d, labels=histogram_2d),
        self.batch_shape())
    ret.set_shape(self.get_batch_shape())
    return ret
def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled,
                         num_classes, num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         name="sampled_softmax_loss"):
    """Computes and returns the sampled softmax training loss.

    This is a faster way to train a softmax classifier over a huge number of
    classes.

    This operation is for training only.  It is generally an underestimate of
    the full softmax loss.

    At inference time, you can compute full softmax probabilities with the
    expression `tf.nn.softmax(tf.matmul(inputs, weights) + biases)`.

    See our [Candidate Sampling Algorithms
    Reference](http://www.tensorflow.org/extras/candidate_sampling.pdf)

    Also see Section 3 of http://arxiv.org/abs/1412.2007 for the math.

    Args:
      weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
          objects whose concatenation along dimension 0 has shape
          [num_classes, dim].  The (possibly-sharded) class embeddings.
      biases: A `Tensor` of shape `[num_classes]`.  The class biases.
      inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
          activations of the input network.
      labels: A `Tensor` of type `int64` and shape `[batch_size,
          num_true]`. The target classes.  Note that this format differs from
          the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
      num_sampled: An `int`.  The number of classes to randomly sample per
          batch.
      num_classes: An `int`. The number of possible classes.
      num_true: An `int`.  The number of target classes per training example.
      sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
          `sampled_expected_count`) returned by a `*_candidate_sampler`
          function. (if None, we default to `log_uniform_candidate_sampler`)
      remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
          where a sampled class equals one of the target classes.  Default is
          True.
      name: A name for the operation (optional).

    Returns:
      A `batch_size` 1-D tensor of per-example sampled softmax losses.
    """
    logits, labels = _compute_sampled_logits(
        weights, biases, inputs, labels, num_sampled, num_classes,
        num_true=num_true,
        sampled_values=sampled_values,
        subtract_log_q=True,
        remove_accidental_hits=remove_accidental_hits,
        name=name)
    # Named arguments avoid any dependence on the positional order, which
    # newer releases of the op refuse to accept.
    sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    # sampled_losses is a [batch_size] tensor.
    return sampled_losses
def entropy(self, name="sample"):
    """Returns the entropy, computed inside this distribution's name scope.

    Args:
      name: name of the inner op scope.

    Returns:
      The cross entropy between softmax(logits) and the logits, reshaped to
      the batch shape.
    """
    with ops.name_scope(self.name):
        with ops.op_scope([], name):
            logits_2d = array_ops.reshape(
                self.logits, array_ops.pack([-1, self.num_classes]))
            histogram_2d = nn_ops.softmax(logits_2d)
            # Named arguments work under both the old positional
            # (logits, labels) signature and newer named-only releases.
            ret = array_ops.reshape(
                nn_ops.softmax_cross_entropy_with_logits(
                    logits=logits_2d, labels=histogram_2d),
                self.batch_shape())
            ret.set_shape(self.get_batch_shape())
            return ret
def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
    """Weighted cross-entropy loss for a sequence of logits (per example).

    Args:
      logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
      targets: list of 1D batch-sized int32 Tensors of the same length as
        logits.
      weights: list of 1D batch-sized float-Tensors of the same length as
        logits.
      num_decoder_symbols: integer, number of decoder symbols (output classes).
      average_across_timesteps: If set, divide the returned cost by the total
        label weight.
      softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
        to be used instead of the standard softmax (the default if this is
        None).
      name: optional name for this operation, default:
        "sequence_loss_by_example".

    Returns:
      1D batch-sized float Tensor: the log-perplexity for each sequence.

    Raises:
      ValueError: if len(logits) is different from len(targets) or
        len(weights).
    """
    if len(targets) != len(logits) or len(weights) != len(logits):
        raise ValueError("Lengths of logits, weights, and targets must be the same "
                         "%d, %d, %d." % (len(logits), len(weights), len(targets)))
    with ops.op_scope(logits + targets + weights, name,
                      "sequence_loss_by_example"):
        batch_size = array_ops.shape(targets[0])[0]
        log_perp_list = []
        length = batch_size * num_decoder_symbols
        # The three lists were checked above to have equal length.
        for i, (step_logits, step_targets, step_weights) in enumerate(
                zip(logits, targets, weights)):
            if softmax_loss_function is None:
                # TODO(lukaszkaiser): There is no SparseCrossEntropy in
                # TensorFlow, so we need to first cast targets into a dense
                # representation, and as SparseToDense does not accept batched
                # inputs, we need to do this by re-indexing and re-sizing.
                # When TensorFlow adds SparseCrossEntropy, rewrite this method.
                indices = step_targets + num_decoder_symbols * math_ops.range(
                    batch_size)
                with ops.device("/cpu:0"):  # Sparse-to-dense must be on CPU for now.
                    dense = sparse_ops.sparse_to_dense(
                        indices, array_ops.expand_dims(length, 0), 1.0, 0.0)
                target = array_ops.reshape(dense, [-1, num_decoder_symbols])
                # Named arguments: independent of the positional order and
                # accepted by releases that reject positional use.
                crossent = nn_ops.softmax_cross_entropy_with_logits(
                    logits=step_logits, labels=target,
                    name="SequenceLoss/CrossEntropy{0}".format(i))
            else:
                crossent = softmax_loss_function(step_logits, step_targets)
            log_perp_list.append(crossent * step_weights)
        log_perps = math_ops.add_n(log_perp_list)
        if average_across_timesteps:
            total_size = math_ops.add_n(weights)
            total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
            log_perps /= total_size
    return log_perps
def _entropy(self):
    """Returns the cross entropy of softmax(logits) against the logits.

    Logits that are not already rank 2 are flattened to `[-1, event_size]`
    first; the result is reshaped to the batch shape.
    """
    logits_2d = self.logits
    if self.logits.get_shape().ndims != 2:
        logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
    probs_2d = nn_ops.softmax(logits_2d)
    flat_entropy = nn_ops.softmax_cross_entropy_with_logits(
        labels=probs_2d, logits=logits_2d)
    entropy = array_ops.reshape(flat_entropy, self.batch_shape_tensor())
    # Attach the static batch shape that the dynamic reshape loses.
    entropy.set_shape(self.batch_shape)
    return entropy
def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
    """Builds a one-hidden-layer network and returns its cross-entropy cost.

    `label_data`, `batch` and `classes` are read from the enclosing scope.
    """
    hidden_pre_activation = nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias)
    features = nn_ops.relu(hidden_pre_activation, name="features")
    logits = nn_ops.xw_plus_b(
        features, softmax_weight, softmax_bias, name="logits")
    labels = constant_op.constant(
        label_data.tolist(),
        shape=[batch, classes],
        dtype=dtypes.float64,
        name="labels")
    return nn_ops.softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name="cost")
def testGradient(self):
    """Numerically checks the gradient of softmax cross entropy w.r.t. logits."""
    with self.test_session():
        l = constant_op.constant(
            [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
            shape=[3, 4], dtype=dtypes.float64, name="l")
        f = constant_op.constant(
            [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
            shape=[3, 4], dtype=dtypes.float64, name="f")
        # Named arguments: `f` holds the logits and `l` the labels. The
        # positional form relies on the old (logits, labels) order and is
        # rejected by releases that require named arguments.
        x = nn_ops.softmax_cross_entropy_with_logits(
            logits=f, labels=l, name="xent")
        err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
        print("cross entropy gradient err = ", err)
        self.assertLess(err, 5e-8)
def _sparse_vs_dense_xent_benchmark_dense(labels, logits):
    """Dense cross-entropy baseline: densifies sparse labels, then runs xent.

    Args:
      labels: class indices, one per row of `logits` (they are offset by
        row * num_entries and scattered into a flat one-hot vector).
      logits: rank-2 logits; its two dimensions are read as batch size and
        number of entries.

    Returns:
      A `(crossent_sum, grads)` tuple: the summed loss and its gradient with
      respect to the logits.
    """
    labels = array_ops.identity(labels)
    logits = array_ops.identity(logits)
    with ops_lib.device("/cpu:0"):  # Sparse-to-dense must be on CPU
        batch_size = array_ops.shape(logits)[0]
        num_entries = array_ops.shape(logits)[1]
        length = batch_size * num_entries
        labels += num_entries * math_ops.range(batch_size)
        target = sparse_ops.sparse_to_dense(
            labels, array_ops.stack([length]), 1.0, 0.0)
    target = array_ops.reshape(target, array_ops.stack([-1, num_entries]))
    # The op must be called with named arguments; the positional form raises
    # on releases that enforce this.
    crossent = nn_ops.softmax_cross_entropy_with_logits(
        logits=logits, labels=target, name="SequenceLoss/CrossEntropy")
    crossent_sum = math_ops.reduce_sum(crossent)
    grads = gradients_impl.gradients([crossent_sum], [logits])[0]

    return (crossent_sum, grads)
def _log_prob(self, x):
    """Returns the log-probability of `x` as a negated softmax cross entropy.

    `x` and the logits are broadcast against each other when their static
    shapes are not both fully defined and equal.
    """
    x = self._assert_valid_sample(x)
    logits = self.logits

    # Broadcast logits or x if need be.
    needs_broadcast = (not x.get_shape().is_fully_defined() or
                       not logits.get_shape().is_fully_defined() or
                       x.get_shape() != logits.get_shape())
    if needs_broadcast:
        logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
        x = array_ops.ones_like(logits, dtype=x.dtype) * x

    # Shape of the logits with the trailing axis reduced away; this is the
    # shape the flat result is restored to below.
    result_shape = array_ops.shape(math_ops.reduce_sum(logits, -1))
    flat_logits = array_ops.reshape(logits, [-1, self.event_size])
    flat_x = array_ops.reshape(x, [-1, self.event_size])
    flat_log_prob = -nn_ops.softmax_cross_entropy_with_logits(
        labels=flat_x, logits=flat_logits)
    # Reshape back to user-supplied batch and sample dims prior to 2D reshape.
    return array_ops.reshape(flat_log_prob, result_shape)
def testSecondGradient(self):
    """Asserts that requesting the Hessian of the loss raises a LookupError."""
    label_values = [
        0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5
    ]
    logit_values = [
        0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4
    ]
    with self.test_session():
        labels = constant_op.constant(
            label_values, shape=[12], dtype=dtypes.float64, name="l")
        logits = constant_op.constant(
            logit_values, shape=[12], dtype=dtypes.float64, name="f")
        xent = nn_ops.softmax_cross_entropy_with_logits(
            labels=labels, logits=logits, name="xent")
        mean_loss = math_ops.reduce_mean(xent)

        # Taking the second gradient should fail, since it is not
        # yet supported.
        with self.assertRaisesRegexp(LookupError, "explicitly disabled"):
            _ = gradients_impl.hessians(mean_loss, [logits])
def _log_prob(self, x):
    """Returns the log-probability of `x` as a negated softmax cross entropy.

    `x` and the logits are broadcast against each other when their static
    shapes are not both fully defined and equal. The result has the shape of
    the (broadcast) logits without the trailing class axis.
    """
    x = ops.convert_to_tensor(x, name="x")
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
            not logits.get_shape().is_fully_defined() or
            x.get_shape() != logits.get_shape()):
        logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
        x = array_ops.ones_like(logits, dtype=x.dtype) * x

    # The cross entropy yields one value per row of the 2-D logits, so the
    # result must be restored to the logits shape *minus* the class axis.
    # Reshaping to the full logits shape only succeeds when there is a
    # single class.
    batch_shape = array_ops.shape(logits)[:-1]
    if logits.get_shape().ndims == 2:
        logits_2d = logits
        x_2d = x
    else:
        logits_2d = array_ops.reshape(logits, [-1, self.num_classes])
        x_2d = array_ops.reshape(x, [-1, self.num_classes])
    # Named arguments: valid for the old positional (logits, labels)
    # signature and for releases that reject positional use.
    ret = -nn_ops.softmax_cross_entropy_with_logits(
        logits=logits_2d, labels=x_2d)
    ret = array_ops.reshape(ret, batch_shape)
    return ret
def _log_prob(self, x):
    """Returns the log-probability of `x` as a negated softmax cross entropy.

    `x` and the logits are broadcast against each other when their static
    shapes are not both fully defined and equal. The result has the shape of
    the (broadcast) logits without the trailing event axis.
    """
    x = self._assert_valid_sample(x)
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
            not logits.get_shape().is_fully_defined() or
            x.get_shape() != logits.get_shape()):
        logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
        x = array_ops.ones_like(logits, dtype=x.dtype) * x

    # The cross entropy yields one value per row of the 2-D logits, so the
    # result must be restored to the logits shape *minus* the event axis.
    # Reshaping to the full logits shape only succeeds when event_size is 1.
    batch_shape = array_ops.shape(logits)[:-1]
    if logits.get_shape().ndims == 2:
        logits_2d = logits
        x_2d = x
    else:
        logits_2d = array_ops.reshape(logits, [-1, self.event_size])
        x_2d = array_ops.reshape(x, [-1, self.event_size])
    ret = -nn_ops.softmax_cross_entropy_with_logits(labels=x_2d,
                                                    logits=logits_2d)
    ret = array_ops.reshape(ret, batch_shape)
    return ret
def first(x):
    """Returns the cross entropy of `x` against a [[0.0]] label, summed over axis 0."""
    zero_label = tensor.Tensor([[0.0]])
    xent = nn_ops.softmax_cross_entropy_with_logits(
        labels=zero_label, logits=x)
    reduction_axes = tensor.Tensor([0])
    return math_ops.reduce_sum(xent, reduction_axes)
def loss(x, l):
    """Returns the softmax cross entropy of logits `x` and labels `l`, averaged over axis 0."""
    xent = nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l)
    reduction_axes = tensor.Tensor([0])
    return math_ops.reduce_mean(xent, reduction_axes)
def _BuildAndTestMiniMNIST(self, param_index, tag):
    """Builds a tiny two-layer classifier and gradient-checks one parameter.

    Args:
      param_index: index into [inp, hidden_weight, hidden_bias,
        softmax_weight, softmax_bias] selecting the tensor to check.
      tag: label used only in the log message.

    Returns:
      The gradient error reported by the gradient checker.
    """
    # Fix seed to avoid occasional flakiness
    np.random.seed(6)

    # Hyperparameters
    batch, inputs, num_features, classes = 3, 16, 32, 10

    # Define the parameters. The draw order below determines the data under
    # the fixed seed, so it must not be rearranged.
    inp_data = np.random.random_sample(inputs * batch)
    hidden_weight_data = (
        np.random.randn(inputs * num_features) / np.sqrt(inputs))
    hidden_bias_data = np.random.random_sample(num_features)
    sm_weight_data = (
        np.random.randn(num_features * classes) / np.sqrt(num_features))
    sm_bias_data = np.random.random_sample(classes)

    # special care for labels since they need to be normalized per batch
    label_data = np.random.random(batch * classes).reshape((batch, classes))
    row_sums = label_data.sum(axis=1)
    label_data /= row_sums[:, None]

    def _float64_constant(data, shape, name):
        return constant_op.constant(
            data.tolist(), shape=shape, dtype=dtypes.float64, name=name)

    with self.session(use_gpu=True):
        # We treat the inputs as "parameters" here
        inp = _float64_constant(inp_data, [batch, inputs], "inp")
        hidden_weight = _float64_constant(
            hidden_weight_data, [inputs, num_features], "hidden_weight")
        hidden_bias = _float64_constant(
            hidden_bias_data, [num_features], "hidden_bias")
        softmax_weight = _float64_constant(
            sm_weight_data, [num_features, classes], "softmax_weight")
        softmax_bias = _float64_constant(
            sm_bias_data, [classes], "softmax_bias")

        # Pair every parameter with its shape so that we can test them one
        # at a time.
        params_and_sizes = [
            (inp, [batch, inputs]),
            (hidden_weight, [inputs, num_features]),
            (hidden_bias, [num_features]),
            (softmax_weight, [num_features, classes]),
            (softmax_bias, [classes]),
        ]
        param, param_size = params_and_sizes[param_index]

        # Now, Building MNIST
        hidden = nn_ops.relu(
            nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias),
            name="features")
        logits = nn_ops.xw_plus_b(
            hidden, softmax_weight, softmax_bias, name="logits")
        labels = _float64_constant(label_data, [batch, classes], "labels")
        cost = nn_ops.softmax_cross_entropy_with_logits(
            labels=labels, logits=logits, name="cost")

        # Test the gradients.
        err = gradient_checker.compute_gradient_error(
            param, param_size, cost, [batch], delta=1e-5)

    tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
    return err
def first(x):
    """Returns the cross entropy of `x` against a [[0.0]] label, summed over axis 0."""
    zero_label = constant_op.constant([[0.0]])
    xent = nn_ops.softmax_cross_entropy_with_logits(
        labels=zero_label, logits=x)
    reduction_axes = constant_op.constant([0])
    return math_ops.reduce_sum(xent, reduction_axes)
def loss(x, l):
    """Returns the softmax cross entropy of logits `x` and labels `l`, averaged over axis 0."""
    xent = nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l)
    reduction_axes = constant_op.constant([0])
    return math_ops.reduce_mean(xent, reduction_axes)
def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled,
                         num_classes, num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         name="sampled_softmax_loss"):
    """Computes and returns the sampled softmax training loss.

    This is a faster way to train a softmax classifier over a huge number of
    classes.

    This operation is for training only.  It is generally an underestimate of
    the full softmax loss.

    At inference time, you can compute full softmax probabilities with the
    expression `tf.nn.softmax(tf.matmul(inputs, weights) + biases)`.

    See our [Candidate Sampling Algorithms
    Reference](../../extras/candidate_sampling.pdf)

    Also see Section 3 of http://arxiv.org/abs/1412.2007 for the math.

    Args:
      weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
          objects whose concatenation along dimension 0 has shape
          [num_classes, dim].  The (possibly-sharded) class embeddings.
      biases: A `Tensor` of shape `[num_classes]`.  The class biases.
      inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
          activations of the input network.
      labels: A `Tensor` of type `int64` and shape `[batch_size,
          num_true]`. The target classes.  Note that this format differs from
          the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
      num_sampled: An `int`.  The number of classes to randomly sample per
          batch.
      num_classes: An `int`. The number of possible classes.
      num_true: An `int`.  The number of target classes per training example.
      sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
          `sampled_expected_count`) returned by a `*_candidate_sampler`
          function. (if None, we default to `log_uniform_candidate_sampler`)
      remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
          where a sampled class equals one of the target classes.  Default is
          True.
      name: A name for the operation (optional).

    Returns:
      A `batch_size` 1-D tensor of per-example sampled softmax losses.
    """
    logits, labels = _compute_sampled_logits(
        weights, biases, inputs, labels, num_sampled, num_classes,
        num_true=num_true,
        sampled_values=sampled_values,
        subtract_log_q=True,
        remove_accidental_hits=remove_accidental_hits,
        name=name)
    # Named arguments avoid any dependence on the positional order, which
    # newer releases of the op refuse to accept.
    sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    # sampled_losses is a [batch_size] tensor.
    return sampled_losses
def first(x):
    """Sums, over axis 0, the cross entropy of logits `x` with a [[0.0]] label."""
    labels = tensor.Tensor([[0.0]])
    per_example = nn_ops.softmax_cross_entropy_with_logits(
        labels=labels, logits=x)
    axes = tensor.Tensor([0])
    summed = math_ops.reduce_sum(per_example, axes)
    return summed
def cost_function(labels, logits, num_classes):
    """Returns the per-example softmax cross entropy of `logits` and `labels`.

    `num_classes` is accepted but not used by this implementation.
    """
    return nn_ops.softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
def loss(x, l):
    """Averages, over axis 0, the softmax cross entropy of logits `x` and labels `l`."""
    per_example = nn_ops.softmax_cross_entropy_with_logits(
        logits=x, labels=l)
    axes = tensor.Tensor([0])
    mean_loss = math_ops.reduce_mean(per_example, axes)
    return mean_loss
def test():
    """Trains a tiny sampled-softmax embedding model for 100 steps.

    Builds the graph, then repeatedly feeds one fixed batch and prints the
    loss and the sampled candidate values after every step.
    """
    model = Model()

    vocabulary_size = 10
    embedding_size = 2
    num_sampled = 3  # Number of negative examples to sample.
    num_true = 1

    graph = tf.Graph()

    with graph.as_default():
        # Input data.
        train_dataset = tf.placeholder(tf.int32)
        train_labels = tf.placeholder(tf.int32, shape=[None, num_true])

        # Variables.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

        # Model.
        # Look up embeddings for inputs.
        embed = tf.nn.embedding_lookup(embeddings, train_dataset)
        # Compute the softmax loss, using a sample of the negative labels
        # each time.
        logits, labels = yba_sampled_softmax(model=model,
                                             weights=embeddings,
                                             inputs=embed,
                                             labels=train_labels,
                                             num_sampled=num_sampled,
                                             num_classes=vocabulary_size,
                                             num_true=num_true)
        loss = tf.reduce_mean(
            nn_ops.softmax_cross_entropy_with_logits(labels=labels,
                                                     logits=logits))

        # Optimizer.
        # Note: The optimizer will optimize the softmax_weights AND the
        # embeddings. This is because the embeddings are defined as a
        # variable quantity and the optimizer's `minimize` method will by
        # default modify all variable quantities that contribute to the
        # tensor it is passed.
        # See docs on `tf.train.Optimizer.minimize()` for more details.
        optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # The same batch is fed on every step, so build the feed once.
    batch_data = np.array([7, 7, 7, 7, 7])
    batch_labels = np.array([[1], [2], [3], [4], [5]])

    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        print('Initialized')
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        fetches = [
            optimizer, loss, model.logits, model.labels, model.sampled_values
        ]
        for _ in range(100):
            # model.logits / model.labels are still fetched, as before, but
            # their values are not needed here.
            _, l, _, _, sampled_values = session.run(
                fetches, feed_dict=feed_dict)
            print('loss', l)
            print('sampled_values', sampled_values)
def first(x):
    """Sums, over axis 0, the cross entropy of logits `x` with a [[0.0]] label."""
    labels = constant_op.constant([[0.0]])
    per_example = nn_ops.softmax_cross_entropy_with_logits(
        labels=labels, logits=x)
    axes = constant_op.constant([0])
    summed = math_ops.reduce_sum(per_example, axes)
    return summed
def sequence_loss_by_example(logits,
                             targets,
                             weights,
                             num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None,
                             name=None):
    """Weighted cross-entropy loss for a sequence of logits (per example).

    Args:
      logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
      targets: list of 1D batch-sized int32 Tensors of the same length as
        logits.
      weights: list of 1D batch-sized float-Tensors of the same length as
        logits.
      num_decoder_symbols: integer, number of decoder symbols (output classes).
      average_across_timesteps: If set, divide the returned cost by the total
        label weight.
      softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
        to be used instead of the standard softmax (the default if this is
        None).
      name: optional name for this operation, default:
        "sequence_loss_by_example".

    Returns:
      1D batch-sized float Tensor: the log-perplexity for each sequence.

    Raises:
      ValueError: if len(logits) is different from len(targets) or
        len(weights).
    """
    num_steps = len(logits)
    if len(targets) != num_steps or len(weights) != num_steps:
        raise ValueError(
            "Lengths of logits, weights, and targets must be the same "
            "%d, %d, %d." % (len(logits), len(weights), len(targets)))

    with ops.name_scope(name, "sequence_loss_by_example",
                        logits + targets + weights):
        batch_size = array_ops.shape(targets[0])[0]
        length = batch_size * num_decoder_symbols
        log_perp_list = []
        for i, (step_logits, step_targets, step_weights) in enumerate(
                zip(logits, targets, weights)):
            if softmax_loss_function is not None:
                crossent = softmax_loss_function(step_logits, step_targets)
            else:
                # TODO(lukaszkaiser): There is no SparseCrossEntropy in
                # TensorFlow, so we need to first cast targets into a dense
                # representation, and as SparseToDense does not accept batched
                # inputs, we need to do this by re-indexing and re-sizing.
                # When TensorFlow adds SparseCrossEntropy, rewrite this method.
                row_offsets = num_decoder_symbols * math_ops.range(batch_size)
                indices = step_targets + row_offsets
                with ops.device("/cpu:0"):  # Sparse-to-dense must be on CPU for now.
                    dense = sparse_ops.sparse_to_dense(
                        indices, array_ops.expand_dims(length, 0), 1.0, 0.0)
                target = array_ops.reshape(dense, [-1, num_decoder_symbols])
                crossent = nn_ops.softmax_cross_entropy_with_logits(
                    logits=step_logits,
                    labels=target,
                    name="SequenceLoss/CrossEntropy{0}".format(i))
            log_perp_list.append(crossent * step_weights)

        log_perps = math_ops.add_n(log_perp_list)
        if average_across_timesteps:
            total_size = math_ops.add_n(weights)
            total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
            log_perps /= total_size
    return log_perps
def loss(x, l):
    """Averages, over axis 0, the softmax cross entropy of logits `x` and labels `l`."""
    per_example = nn_ops.softmax_cross_entropy_with_logits(
        logits=x, labels=l)
    axes = constant_op.constant([0])
    mean_loss = math_ops.reduce_mean(per_example, axes)
    return mean_loss
def _BuildAndTestMiniMNIST(self, param_index, tag):
    """Gradient-checks one parameter of a small relu + softmax network.

    Args:
      param_index: position of the parameter to check within
        [inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias].
      tag: text inserted into the log message.

    Returns:
      The error computed by `gradient_checker.compute_gradient_error`.
    """
    # Fix seed to avoid occasional flakiness
    np.random.seed(6)

    # Hyperparameters
    batch = 3
    inputs = 16
    hidden_units = 32
    classes = 10

    # Define the parameters. Keep this draw order: with the fixed seed it
    # decides which values each tensor receives.
    inp_values = np.random.random_sample(inputs * batch)
    hidden_weight_values = np.random.randn(inputs * hidden_units)
    hidden_weight_values = hidden_weight_values / np.sqrt(inputs)
    hidden_bias_values = np.random.random_sample(hidden_units)
    sm_weight_values = np.random.randn(hidden_units * classes)
    sm_weight_values = sm_weight_values / np.sqrt(hidden_units)
    sm_bias_values = np.random.random_sample(classes)

    # special care for labels since they need to be normalized per batch
    label_values = np.random.random(batch * classes).reshape(
        (batch, classes))
    label_values /= label_values.sum(axis=1)[:, None]

    # (values, shape, name) for each parameter, in param_index order.
    param_specs = [
        (inp_values, [batch, inputs], "inp"),
        (hidden_weight_values, [inputs, hidden_units], "hidden_weight"),
        (hidden_bias_values, [hidden_units], "hidden_bias"),
        (sm_weight_values, [hidden_units, classes], "softmax_weight"),
        (sm_bias_values, [classes], "softmax_bias"),
    ]

    with self.test_session(use_gpu=True):
        # We treat the inputs as "parameters" here
        all_params = [
            constant_op.constant(values.tolist(),
                                 shape=shape,
                                 dtype=dtypes.float64,
                                 name=const_name)
            for values, shape, const_name in param_specs
        ]
        param_sizes = [shape for _, shape, _ in param_specs]
        (inp, hidden_weight, hidden_bias, softmax_weight,
         softmax_bias) = all_params

        # Now, Building MNIST
        hidden = nn_ops.relu(nn_ops.xw_plus_b(inp, hidden_weight,
                                              hidden_bias),
                             name="features")
        logits = nn_ops.xw_plus_b(hidden,
                                  softmax_weight,
                                  softmax_bias,
                                  name="logits")
        labels = constant_op.constant(label_values.tolist(),
                                      shape=[batch, classes],
                                      dtype=dtypes.float64,
                                      name="labels")
        cost = nn_ops.softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=logits,
                                                        name="cost")

        # Test the gradients.
        err = gradient_checker.compute_gradient_error(
            all_params[param_index],
            param_sizes[param_index],
            cost, [batch],
            delta=1e-5)

    tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
    return err
def __init__(self, sess, config, data_feed, log_dir):
    """Builds the graph of a joint dialog-act / sentiment sequence classifier.

    An embedded token sequence is run through an RNN; the output at the last
    valid step of each sequence feeds two heads built with `self.fnn`. The
    loss is the batch-averaged sum of both cross-entropy terms.

    Args:
      sess: session whose graph is passed to the summary writer.
      config: hyperparameter object; reads `init_lr`, `lr_decay`,
        `embed_size`, `cell_type`, `cell_size`, `keep_prob`, `num_layer`,
        `op` and `grad_clip`.
      data_feed: provides `vocab`, `feature_size`, `DA_ID` and `SENTI_ID`.
      log_dir: directory for training summaries, or None to skip creating a
        summary writer.

    Raises:
      ValueError: if `config.cell_type` is not "gru", "lstm" or "rnn".
    """
    vocab_size = len(data_feed.vocab)
    self.data_feed = data_feed

    with tf.name_scope("io"):
        self.inputs = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None),
                                     name="input_seq")
        self.input_lens = tf.placeholder(dtype=tf.int32,
                                         shape=(None, ),
                                         name="seq_len")
        self.da_labels = tf.placeholder(dtype=tf.int32,
                                        shape=(None, ),
                                        name="dialog_acts")
        self.senti_labels = tf.placeholder(
            dtype=tf.float32,
            shape=(None, data_feed.feature_size[data_feed.SENTI_ID]),
            name="sentiments")

    self.learning_rate = tf.Variable(float(config.init_lr), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * config.lr_decay)

    max_sent_len = array_ops.shape(self.inputs)[1]
    batch_size = array_ops.shape(self.inputs)[0]

    with variable_scope.variable_scope("word-embedding"):
        embedding = tf.get_variable("embedding",
                                    [vocab_size, config.embed_size],
                                    dtype=tf.float32)
        input_embedding = embedding_ops.embedding_lookup(
            embedding,
            tf.squeeze(tf.reshape(self.inputs, [-1, 1]), squeeze_dims=[1]))
        input_embedding = tf.reshape(input_embedding,
                                     [-1, max_sent_len, config.embed_size])

    with variable_scope.variable_scope("rnn"):
        if config.cell_type == "gru":
            cell = rnn_cell.GRUCell(config.cell_size)
        elif config.cell_type == "lstm":
            cell = rnn_cell.LSTMCell(config.cell_size,
                                     use_peepholes=False,
                                     forget_bias=1.0)
        elif config.cell_type == "rnn":
            cell = rnn_cell.BasicRNNCell(config.cell_size)
        else:
            raise ValueError("unknown RNN type")

        if config.keep_prob < 1.0:
            cell = rnn_cell.DropoutWrapper(
                cell,
                output_keep_prob=config.keep_prob,
                input_keep_prob=config.keep_prob)

        if config.num_layer > 1:
            cell = rnn_cell.MultiRNNCell([cell] * config.num_layer,
                                         state_is_tuple=True)

        # and enc_last_state will be same as the true last state
        outputs, _ = tf.nn.dynamic_rnn(
            cell,
            input_embedding,
            dtype=tf.float32,
            sequence_length=self.input_lens,
        )
        # get the TRUE last outputs: a one-hot mask over the time axis
        # selects step (length - 1) of every sequence.
        last_outputs = tf.reduce_sum(
            tf.mul(
                outputs,
                tf.expand_dims(
                    tf.one_hot(self.input_lens - 1, max_sent_len), -1)), 1)

    self.dialog_acts = self.fnn(
        last_outputs, data_feed.feature_size[data_feed.DA_ID], [100],
        "dialog_act_fnn")
    self.sentiments = self.fnn(
        last_outputs, data_feed.feature_size[data_feed.SENTI_ID], [100],
        "setiment_fnn")

    # Named arguments keep both losses correct regardless of the positional
    # order of the ops, which newer releases refuse to accept.
    da_loss = tf.reduce_sum(
        nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=self.dialog_acts, labels=self.da_labels))
    senti_loss = tf.reduce_sum(
        nn_ops.softmax_cross_entropy_with_logits(
            logits=self.sentiments, labels=self.senti_labels))
    self.loss = da_loss + senti_loss
    self.loss /= tf.to_float(batch_size)

    tf.scalar_summary("entropy_loss", self.loss)
    self.summary_op = tf.merge_all_summaries()

    # weight decay
    tvars = tf.trainable_variables()
    for v in tvars:
        print("Trainable %s" % v.name)

    # optimization
    if config.op == "adam":
        print("Use Adam")
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
    elif config.op == "rmsprop":
        print("Use RMSProp")
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
    else:
        print("Use SGD")
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)

    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      config.grad_clip)
    self.train_ops = optimizer.apply_gradients(zip(grads, tvars))
    self.saver = tf.train.Saver(tf.all_variables(),
                                write_version=tf.train.SaverDef.V2)

    if log_dir is not None:
        train_log_dir = os.path.join(log_dir, "train")
        print("Save summary to %s" % log_dir)
        self.train_summary_writer = tf.train.SummaryWriter(
            train_log_dir, sess.graph)