예제 #1
0
  def testBasicMemory(self):
    """Checks the memory report produced for a small add_n graph."""
    with test_util.device(use_gpu=False):
      lhs = constant_op.constant(10, name="a")
      rhs = constant_op.constant(20, name="b")
      partial_sum = math_ops.add_n([lhs, rhs], name="c")
      total = math_ops.add_n([rhs, partial_sum], name="d")
      # Register the final sum in the TRAIN_OP collection of the graph.
      ops.get_collection_ref(ops.GraphKeys.TRAIN_OP).append(total)
      mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())

    report = cost_analyzer.GenerateMemoryReport(mg)

    # Dump the report so that a failing run is easier to diagnose.
    print("{}".format(report))

    # The report must state the peak usage of the CPU device ...
    peak_usage_line = (
        "Peak usage for device /job:localhost/replica:0/task:0/device:CPU:0: "
        "16 bytes")
    self.assertTrue(peak_usage_line in report)

    # ... and list every tensor of the graph with its size.
    per_tensor_lines = (
        "  a:0 uses 4 bytes",
        "  b:0 uses 4 bytes",
        "  c:0 uses 4 bytes",
        "  d:0 uses 4 bytes",
    )
    for tensor_line in per_tensor_lines:
      self.assertTrue(tensor_line in report)
예제 #2
0
def sequence_loss_by_example(logits, targets, weights, average_across_time=True, scope=None):
  """Computes a weighted cross-entropy loss summed over a sequence.

  For every step the sparse softmax cross-entropy between the step's logits
  and targets is multiplied by the step's weight; the per-step results are
  then added together.

  Args:
    logits: list of logits tensors, one per step.
    targets: list of label tensors, same length as `logits`.
    weights: list of weight tensors, same length as `logits`.
    average_across_time: if True, the summed loss is divided by the sum of
      `weights` (plus 1e-12 so that all-zero weights do not divide by zero).
    scope: optional variable scope name; defaults to
      "sequence_loss_by_example".

  Returns:
    The summed (and, if requested, weight-normalised) loss tensor.

  Raises:
    ValueError: if `logits`, `targets` and `weights` differ in length.
  """
  if len(logits) != len(targets) or len(weights) != len(logits):
    # The three lengths must be passed as one tuple: a bare
    # `"..." % len(logits), ...` formats only the first value and fails with
    # TypeError before the ValueError can be raised.
    raise ValueError("Lengths of logits, weights and target must be same "
                     "%d, %d, %d" % (len(logits), len(weights), len(targets)))

  with tf.variable_scope(scope or "sequence_loss_by_example"):
    sequence_loss_list = []
    for logit, target, weight in zip(logits, targets, weights):
      # Keyword arguments work with both the old positional signature and
      # the newer one that requires named `labels`/`logits`.
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logit, labels=target)
      sequence_loss_list.append(loss * weight)
    sequence_loss = math_ops.add_n(sequence_loss_list)
    if average_across_time:
      total_weight = math_ops.add_n(weights) + 1e-12
      final_loss = sequence_loss / total_weight
    else:
      final_loss = sequence_loss
    return final_loss
예제 #3
0
  def testAddN(self):
    """Checks add_n over Optional variants, with and without a value."""
    device_names = ["/cpu:0"]
    if test_util.is_gpu_available():
      device_names.append("/gpu:0")

    for device_name in device_names:
      with ops.device(device_name):
        # Case 1: both optionals carry a value, so the sum has one too.
        first = optional_ops.Optional.from_value((1.0, 2.0))
        second = optional_ops.Optional.from_value((3.0, 4.0))

        summed_variant = math_ops.add_n(
            [first._variant_tensor, second._variant_tensor])
        summed = optional_ops._OptionalImpl(summed_variant,
                                            first.value_structure)
        self.assertAllEqual(self.evaluate(summed.get_value()), (4.0, 6.0))

        # Case 2: both optionals are empty, so the sum is empty as well.
        empty_first = optional_ops.Optional.none_from_structure(
            first.value_structure)
        empty_second = optional_ops.Optional.none_from_structure(
            second.value_structure)
        summed_variant = math_ops.add_n(
            [empty_first._variant_tensor, empty_second._variant_tensor])
        summed = optional_ops._OptionalImpl(summed_variant,
                                            empty_first.value_structure)
        self.assertFalse(self.evaluate(summed.has_value()))
예제 #4
0
def _MultiDeviceAddN(tensor_list, gradient_uid):
  """Sums tensors that may be placed on several devices.

  The tensors are grouped by their `device` attribute. Each group is summed
  inside a colocation scope tied to the op of the group's first tensor, and
  the per-device partial sums are finally added together.

  Args:
    tensor_list: tensors to add; each must expose `.device` and `.op`.
    gradient_uid: forwarded unchanged to `ops._colocate_with_for_gradient`.

  Returns:
    The result of `math_ops.add_n` over the per-device partial sums.
  """
  # Basic function structure comes from control_flow_ops.group().
  by_device = collections.defaultdict(list)
  for t in tensor_list:
    by_device[t.device].append(t)

  def _sort_key(device):
    # A device of None sorts like the empty string.
    return device if device is not None else ""

  # TODO(sjhwang): Create hierarchical aggregation tree as pbar's suggestion.
  # E.g., aggregate per GPU, then per task, and so on.
  partial_sums = []
  for device in sorted(six.iterkeys(by_device), key=_sort_key):
    group = by_device[device]
    colocation_scope = ops._colocate_with_for_gradient(  # pylint: disable=protected-access
        group[0].op,
        gradient_uid,
        ignore_existing=True)
    with colocation_scope:
      partial_sums.append(math_ops.add_n(group))

  # Combine the partial sums gathered from the individual devices.
  return math_ops.add_n(partial_sums)
예제 #5
0
  def testSimpleSwap(self):
    """Verifies that a `_swap_to_host` annotation yields swap nodes."""
    lhs = constant_op.constant(10, name='a')
    rhs = constant_op.constant(20, name='b')
    partial_sum = math_ops.add_n([lhs, rhs], name='c')
    total = math_ops.add_n([rhs, partial_sum], name='d')
    ops.get_collection_ref(ops.GraphKeys.TRAIN_OP).append(total)

    # Annotate node 'd' with `_swap_to_host` set to index 0.
    total.op.node_def.attr['_swap_to_host'].i = 0

    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())

    config = rewriter_config_pb2.RewriterConfig(
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimized = tf_optimizer.OptimizeGraph(config, mg)

    # The four original nodes plus one swap-in / swap-out pair.
    self.assertEqual(len(optimized.node), 6)
    node_names = [node.name for node in optimized.node]
    self.assertItemsEqual(node_names, [
        'a',
        'b',
        'c',
        'd',
        'swap_in_d_0',
        'swap_out_d_0',
    ])

    # Expected leading inputs, keyed by node name.
    expected_inputs = {
        'swap_in_d_0': ('swap_out_d_0', '^b'),
        'swap_out_d_0': ('b',),
        'd': ('swap_in_d_0', 'c'),
    }
    for node in optimized.node:
      for position, expected in enumerate(
          expected_inputs.get(node.name, ())):
        self.assertEqual(expected, node.input[position])
예제 #6
0
def MMIloss(logits, targets, weights, lam, gam,
            average_across_timesteps=True,
            softmax_loss_function=None, name=None):
  """Weighted sequence loss with a diversity and a length term.

  lam is the lambda value (diversity penalty) and gam is the gamma value
  (length penalty); see section 4.5.1 of Li et al.

  Args:
    logits: list of logits tensors, one per step.
    targets: list of target tensors, same length as `logits`.
    weights: list of weight tensors, same length as `logits`.
    lam: multiplier applied to `lm_perps` before it is subtracted.
    gam: multiplier applied to `len(targets)` before it is added.
    average_across_timesteps: if True, divide the summed loss by the summed
      weights (plus 1e-12).
    softmax_loss_function: optional callable `(logit, target) -> loss` used
      instead of the sparse softmax cross-entropy.
    name: optional op scope name; defaults to "sequence_loss_by_example".

  Returns:
    `log_perps - lam * lm_perps + gam * len(targets)`.

  Raises:
    ValueError: if `logits`, `targets` and `weights` differ in length.
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the same "
                     "%d, %d, %d." % (len(logits), len(weights), len(targets)))

  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        # Flatten the targets before the sparse cross-entropy is computed.
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logit, target)
      else:
        crossent = softmax_loss_function(logit, target)
      log_perp_list.append(crossent * weight)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size

    # NOTE(review): `lm_perps` is not defined inside this function. It must be
    # provided by the enclosing module, otherwise this line raises NameError
    # - confirm where it is supposed to come from.
    final_perps = log_perps - (lam) * lm_perps + (gam) * len(targets)
    return final_perps
예제 #7
0
  def testSimpleSwap(self):
    """Verifies swap nodes are added for an annotated variable graph."""
    var_a = variables.Variable(10, name='a')
    var_b = variables.Variable(20, name='b')
    partial_sum = math_ops.add_n([var_a, var_b], name='c')
    total = math_ops.add_n([var_b, partial_sum], name='d')
    ops.get_collection_ref(ops.GraphKeys.TRAIN_OP).append(total)

    # Annotate node 'd' with `_swap_to_host` set to index 0.
    total.op.node_def.attr['_swap_to_host'].i = 0

    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
    original_size = len(mg.graph_def.node)

    config = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimized = tf_optimizer.OptimizeGraph(config, mg)

    # Exactly one swap-in / swap-out pair is added to the original graph.
    self.assertEqual(len(optimized.node), original_size + 2)
    optimized_names = set([node.name for node in optimized.node])
    required_names = set(
        ['a', 'b', 'c', 'd', 'swap_in_d_0', 'swap_out_d_0'])
    self.assertTrue(optimized_names > required_names)

    # Expected leading inputs, keyed by node name.
    expected_inputs = {
        'swap_in_d_0': ('swap_out_d_0', '^b/read'),
        'swap_out_d_0': ('b/read',),
        'd': ('swap_in_d_0', 'c'),
    }
    for node in optimized.node:
      for position, expected in enumerate(
          expected_inputs.get(node.name, ())):
        self.assertEqual(expected, node.input[position])
예제 #8
0
def surrogate_loss(sample_losses,
                   stochastic_tensors=None,
                   name="SurrogateLoss"):
  """Builds the surrogate loss of a stochastic graph.

  Every `StochasticTensor` upstream of `sample_losses` is asked for its loss
  term via its `loss` method, which receives the losses it influenced.

  Note that `surrogate_loss` currently does not work with `StochasticTensor`s
  created inside `while_loop`s or other control structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample), i.e. have a
      dimensionality of 1 or greater. All losses should share one shape.
    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
      If None, all `StochasticTensor`s in the graph upstream of the `Tensor`s
      in `sample_losses` are used.
    name: the name with which to prepend created ops.

  Returns:
    `Tensor` loss: the sum of `sample_losses` and of the loss terms returned
    by the `StochasticTensor`s.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if one of its
      elements is not a `Tensor`.
    ValueError: if a loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
  with ops.op_scope(sample_losses, name):
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")

    # Validate each loss and detach it from the gradient computation.
    detached_losses = []
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      rank = loss.get_shape().ndims
      if rank is None or rank < 1:
        raise ValueError("loss must have dimensionality 1 or greater: %s" %
                         loss)
      detached_losses.append(array_ops.stop_gradient(loss))

    dependencies = _stochastic_dependencies_map(
        detached_losses, stochastic_tensors=stochastic_tensors)
    if not dependencies:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return math_ops.add_n(sample_losses)

    # Start from the plain losses and add one surrogate term per stochastic
    # node that returns one.
    all_terms = [ops.convert_to_tensor(loss) for loss in sample_losses]
    candidate_terms = (
        node.loss(list(influenced)) for node, influenced in dependencies.items())
    all_terms.extend(term for term in candidate_terms if term is not None)

    return math_ops.add_n(all_terms)
예제 #9
0
 def MYsequence_loss_by_example(logits, targets, weights,
                               average_across_timesteps=True,
                               softmax_loss_function=None, name=None):
   """Weighted cross-entropy loss over a sequence, with debug output.

   Args:
     logits: list of logits tensors, one per step.
     targets: list of target tensors, same length as `logits`.
     weights: list of weight tensors, same length as `logits`.
     average_across_timesteps: if True, divide the summed loss by the summed
       weights (plus 1e-12).
     softmax_loss_function: optional callable `(logit, target) -> loss` used
       instead of the sparse softmax cross-entropy.
     name: optional op scope name; defaults to "sequence_loss_by_example".

   Returns:
     The summed (and, if requested, weight-normalised) loss.

   Raises:
     ValueError: if `logits`, `targets` and `weights` differ in length.
   """
   if len(targets) != len(logits) or len(weights) != len(logits):
     raise ValueError("Lengths of logits, weights, and targets must be the same "
                      "%d, %d, %d." % (len(logits), len(weights), len(targets)))
   with ops.op_scope(logits + targets + weights, name,
                     "sequence_loss_by_example"):
     log_perp_list = []
     for logit, target, weight in zip(logits, targets, weights):
       if softmax_loss_function is None:
         # TODO(irving,ebrevdo): This reshape is needed because
         # sequence_loss_by_example is called with scalars sometimes, which
         # violates our general scalar strictness policy.
         target = array_ops.reshape(target, [-1])
         crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
             logit, target)
       else:
         crossent = softmax_loss_function(logit, target)
       # Debug output. The call form parses on Python 2 and Python 3 alike,
       # whereas the former `print x, y` statement is a SyntaxError on 3.
       print(crossent, weight)
       log_perp_list.append(crossent * weight)
       print(log_perp_list)
     log_perps = math_ops.add_n(log_perp_list)
     if average_across_timesteps:
       total_size = math_ops.add_n(weights)
       total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
       log_perps /= total_size
   return log_perps
예제 #10
0
  def _full_batch_training_op(self, inputs, cluster_idx_list, cluster_centers):
    """Builds the op that recomputes the cluster centers from a full batch.

    For every input the rows are summed per cluster and counted per cluster;
    the new centers are the total sums divided by the total counts (plus a
    small epsilon), optionally l2-normalized.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element of a
        vector belongs to one input row and gives the id of the cluster that
        the row is assigned to.
      cluster_centers: Tensor Ref of cluster centers.

    Returns:
      An op that assigns the recomputed centers to `cluster_centers`.
    """
    per_input_sums = []
    per_input_counts = []
    eps = constant_op.constant(1e-6, dtype=inputs[0].dtype)
    for points, assignments in zip(inputs, cluster_idx_list):
      with ops.colocate_with(points):
        per_input_sums.append(
            math_ops.unsorted_segment_sum(points, assignments,
                                          self._num_clusters))
        # A column of ones, one per input row, summed per cluster gives the
        # number of rows assigned to each cluster.
        num_rows = array_ops.reshape(array_ops.shape(points)[0], [-1])
        ones_column = array_ops.reshape(array_ops.ones(num_rows), [-1, 1])
        per_input_counts.append(
            math_ops.unsorted_segment_sum(ones_column, assignments,
                                          self._num_clusters))
    with ops.colocate_with(cluster_centers):
      total_sums = math_ops.add_n(per_input_sums)
      total_counts = math_ops.cast(
          math_ops.add_n(per_input_counts), per_input_sums[0].dtype)
      updated_centers = total_sums / (total_counts + eps)
      if self._clusters_l2_normalized():
        updated_centers = nn_impl.l2_normalize(updated_centers, dim=1)
    return state_ops.assign(cluster_centers, updated_centers)
예제 #11
0
 def testIndexedSlices(self):
   """Checks that add_n returns the dense sum of IndexedSlices inputs."""
   slice_values = array_ops.constant([1, 2], shape=[1, 2])
   slice_indices = array_ops.constant([1])
   slice_shape = array_ops.constant([2, 2])
   slices = ops.IndexedSlices(slice_values, slice_indices, slice_shape)
   expected_dense = np.array([[0, 0], [1, 2]])
   with self.test_session(use_gpu=True):
     # add_n currently always converts IndexedSlices to dense
     single_sum = math_ops.add_n([slices]).eval()
     self.assertAllEqual(expected_dense, single_sum)
     double_sum = math_ops.add_n([slices, slices]).eval()
     self.assertAllEqual(2 * expected_dense, double_sum)
예제 #12
0
 def testFloat(self):
   """Compares add_n with a Python sum for 1 to 9 random float inputs."""
   np.random.seed(12345)
   for num_inputs in range(1, 10):
     arrays = [
         np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(num_inputs)
     ]
     tensors = ops.convert_n_to_tensor(arrays)
     with self.test_session(use_gpu=True):
       # Sum of distinct inputs.
       self.assertAllClose(sum(arrays), math_ops.add_n(tensors).eval())
       # Sum of the same input repeated `num_inputs` times.
       repeated = [tensors[0]] * num_inputs
       self.assertAllClose(arrays[0] * num_inputs,
                           math_ops.add_n(repeated).eval())
예제 #13
0
  def testPartials(self):
    """Test that previously revealed a bug in buffer forwarding for AddN."""
    # 98 single-input sums, each equal to 1 ...
    partial_sums = [
        math_ops.add_n([constant_op.constant(1)]) for _ in range(98)
    ]
    # ... followed by one two-input sum equal to 2.
    partial_sums.append(
        math_ops.add_n([constant_op.constant(1), constant_op.constant(1)]))

    total = math_ops.add_n(partial_sums) + constant_op.constant(0)
    with self.test_session(use_gpu=True):
      self.assertAllEqual(total.eval(), 100)
예제 #14
0
def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
    targets: list of 1D batch-sized int32 Tensors of the same length as logits.
    weights: list of 1D batch-sized float-Tensors of the same length as logits.
    num_decoder_symbols: integer, number of decoder symbols (output classes).
    average_across_timesteps: If set, divide the returned cost by the total
      label weight.
    softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is None).
    name: optional name for this operation, default: "sequence_loss_by_example".

  Returns:
    1D batch-sized float Tensor: the log-perplexity for each sequence.

  Raises:
    ValueError: if len(logits) is different from len(targets) or len(weights).
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the same "
                     "%d, %d, %d." % (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    batch_size = array_ops.shape(targets[0])[0]
    weighted_losses = []
    flat_length = batch_size * num_decoder_symbols
    steps = enumerate(zip(logits, targets, weights))
    for i, (step_logits, step_targets, step_weights) in steps:
      if softmax_loss_function is not None:
        crossent = softmax_loss_function(step_logits, step_targets)
      else:
        # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so
        # we need to first cast targets into a dense representation, and as
        # SparseToDense does not accept batched inputs, we need to do this by
        # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy,
        # rewrite this method.
        flat_indices = (
            step_targets + num_decoder_symbols * math_ops.range(batch_size))
        with ops.device("/cpu:0"):  # Sparse-to-dense must be on CPU for now.
          flat_one_hot = sparse_ops.sparse_to_dense(
              flat_indices, array_ops.expand_dims(flat_length, 0), 1.0,
              0.0)
        one_hot = array_ops.reshape(flat_one_hot, [-1, num_decoder_symbols])
        crossent = nn_ops.softmax_cross_entropy_with_logits(
            step_logits, one_hot,
            name="SequenceLoss/CrossEntropy{0}".format(i))
      weighted_losses.append(crossent * step_weights)
    log_perps = math_ops.add_n(weighted_losses)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
예제 #15
0
 def _reduce(self, method_string, value, destinations):
   """Reduces a `MapOutput` to one value by "sum" or "mean".

   Values that are not a `values.MapOutput` are returned unchanged.
   `destinations` is not used by this implementation.
   """
   if not isinstance(value, values.MapOutput):
     return value
   parts = value.get()
   assert parts
   with ops.device(self._device):
     if method_string == "mean":
       return math_ops.add_n(parts) / len(parts)
     if method_string == "sum":
       return math_ops.add_n(parts)
     # Any other reduction method is unsupported.
     assert False
예제 #16
0
 def testInt(self):
   """Compares add_n with a Python sum for 1 to 9 random integer inputs."""
   np.random.seed(54321)
   for num_inputs in range(1, 10):
     arrays = []
     for _ in range(num_inputs):
       arrays.append(np.random.randint(-128, 128, (5, 4, 3, 2, 1)))
     tensors = ops.convert_n_to_tensor(arrays)
     with self.test_session(use_gpu=True):
       # Sum of distinct inputs.
       self.assertAllEqual(sum(arrays), math_ops.add_n(tensors).eval())
       # Sum of the same input repeated `num_inputs` times.
       repeated = [tensors[0]] * num_inputs
       self.assertAllEqual(arrays[0] * num_inputs,
                           math_ops.add_n(repeated).eval())
 def _reduce(self, aggregation, value, destinations):
   """Reduces a `MapOutput` to one value using SUM or MEAN aggregation.

   Values that are not a `values.MapOutput` are returned unchanged.
   `destinations` is not used by this implementation.
   """
   if not isinstance(value, values.MapOutput):
     return value
   parts = value.get()
   assert parts
   with ops.device(self._device):
     if aggregation == vs.VariableAggregation.SUM:
       reduced = math_ops.add_n(parts)
     elif aggregation == vs.VariableAggregation.MEAN:
       reduced = math_ops.add_n(parts) / len(parts)
     else:
       # Any other aggregation is unsupported.
       assert False
     return reduced
예제 #18
0
  def _define_maximization_operation(self, num_batches):
    """Builds the maximization-step update ops for alpha, means, covariances.

    Sets `self._alpha_op`, `self._means_op` and `self._train_ops`. An update
    is a real assign only when its key ('w', 'm' or 'c') is in
    `self._params`; alpha and means otherwise fall back to a no-op.

    Args:
      num_batches: divisor applied to the per-component point counts before
        the alpha update.
    """
    # TODO(xavigonzalvo): some of these operations could be moved to C++.
    # Compute the effective number of data points assigned to component k.
    with ops.control_dependencies(self._w):
      points_in_k = array_ops.squeeze(
          math_ops.add_n(self._points_in_k), axis=[0])
      # Update alpha.
      if 'w' in self._params:
        final_points_in_k = points_in_k / num_batches
        num_examples = math_ops.cast(math_ops.reduce_sum(final_points_in_k),
                                     dtypes.float32)
        # MEPS is defined outside this method; presumably a small epsilon
        # that keeps the denominator non-zero - TODO confirm.
        self._alpha_op = self._alpha.assign(final_points_in_k /
                                            (num_examples + MEPS))
      else:
        self._alpha_op = control_flow_ops.no_op()
      # The train ops start with the alpha update only; the covariance
      # assign is appended at the end when 'c' is trained.
      self._train_ops = [self._alpha_op]

      # Update means.
      points_in_k_expanded = array_ops.reshape(points_in_k,
                                               [self._num_classes, 1, 1])
      if 'm' in self._params:
        self._means_op = self._means.assign(
            math_ops.div(
                math_ops.add_n(self._w_mul_x), points_in_k_expanded + MEPS))
      else:
        self._means_op = control_flow_ops.no_op()
      # means are (num_classes x 1 x dims)

      # Update covariances.
      # The covariances are built after the means update has run.
      with ops.control_dependencies([self._means_op]):
        b = math_ops.add_n(self._w_mul_x2) / (points_in_k_expanded + MEPS)
        new_covs = []
        for k in range(self._num_classes):
          mean = self._means.value()[k, :, :]
          square_mean = math_ops.matmul(mean, mean, transpose_a=True)
          new_cov = b[k, :, :] - square_mean + self._min_var
          # NOTE(review): nothing is appended when `_covariance_type` matches
          # neither constant, which leaves `new_covs` empty for the concat
          # below.
          if self._covariance_type == FULL_COVARIANCE:
            new_covs.append(array_ops.expand_dims(new_cov, 0))
          elif self._covariance_type == DIAG_COVARIANCE:
            new_covs.append(
                array_ops.expand_dims(array_ops.diag_part(new_cov), 0))
        # `new_covs` is built regardless of 'c'; it is only assigned to
        # `self._covs` when 'c' is in the params.
        new_covs = array_ops.concat(new_covs, 0)
        if 'c' in self._params:
          # Train operations don't need to take care of the means
          # because covariances already depend on it.
          with ops.control_dependencies([self._means_op, new_covs]):
            self._train_ops.append(
                state_ops.assign(
                    self._covs, new_covs, validate_shape=False))
예제 #19
0
 def _testAllReduce(self, num_workers, num_gpus, shape, build_f):
   """Compares the all-reduce built by `build_f` with a plain add_n sum.

   Args:
     num_workers: number of workers; multiplied with `num_gpus` to get the
       number of input tensors.
     num_gpus: number of GPUs per worker.
     shape: passed to `self._buildInitialVars`.
     build_f: callable taking the input tensors and a unary op, returning the
       reduced output tensors.
   """
   num_devices = num_workers * num_gpus
   # Use local CPU as device for all inputs.
   dev_list = ["/replica:0/task:0/device:CPU:0"] * num_devices
   with self.cached_session():
     input_tensors = self._buildInitialVars(shape, dev_list)

     def _divide_by_num_devices(x):
       return math_ops.div(
           x, constant_op.constant(num_devices, dtype=types_pb2.DT_FLOAT))

     expected_sum = math_ops.add_n(input_tensors)
     expected_sum.op.run()
     reduced_tensors = build_f(input_tensors, _divide_by_num_devices)
     reduced_sum = math_ops.add_n(reduced_tensors)
     reduced_sum.op.run()
     self.assertAllClose(reduced_sum.eval(), self.evaluate(expected_sum))
예제 #20
0
 def _get_cross_tower(self):
   """Returns the sum of all components, or their mean for MEAN aggregation."""
   components = tuple(self._index.values())
   # TODO(josh11b): Use a strategy-specific method.
   summed = math_ops.add_n(components)
   wants_mean = self._aggregation == vs.VariableAggregation.MEAN
   return summed * (1./ len(components)) if wants_mean else summed
예제 #21
0
  def _reduce(self, aggregation, value, destinations):
    """Reduces `value` according to `aggregation`.

    Inside a TPU context the reduction is a cross-replica sum (scaled first
    for MEAN). Outside of it the values are added with add_n on the host,
    after checking that the single destination is the host CPU device.

    Raises:
      NotImplementedError: inside a TPU context, if `aggregation` is neither
        MEAN nor SUM.
      ValueError: outside a TPU context, if `destinations` does not resolve
        to exactly one device.
    """
    in_tpu_context = values._enclosing_tpu_context() is not None  # pylint: disable=protected-access
    if in_tpu_context:
      if aggregation == vs.VariableAggregation.MEAN:
        # TODO(jhseu):  Revisit once we support model-parallelism.
        value *= (1. / self.num_towers)
      elif aggregation != vs.VariableAggregation.SUM:
        raise NotImplementedError(
            "Currently only support sum & mean in TPUStrategy.")
      return tpu_ops.cross_replica_sum(value)

    # Validate that the destination is same as the host device
    # Note we don't do this when in replicate context as the reduction is
    # performed on the TPU device itself.
    devices = cross_tower_ops_lib.get_devices_from(destinations)
    if len(devices) != 1:
      raise ValueError('Multiple devices are not supported for TPUStrategy')
    assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
        self.get_host_cpu_device(0))

    if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
      return value[0]
    summed = math_ops.add_n(value)
    if aggregation == vs.VariableAggregation.MEAN:
      summed = summed * (1. / len(value))
    return summed
예제 #22
0
 def testAddN(self):
   """Checks that add_n over three tensor lists adds them element-wise."""
   source_values = ([1.0, 2.0], [3.0, 4.0], [5.0, 6.0])
   tensor_lists = tuple(
       list_ops.tensor_list_from_tensor(source, element_shape=[])
       for source in source_values)
   summed_list = math_ops.add_n(tensor_lists)
   stacked = list_ops.tensor_list_stack(
       summed_list, element_dtype=dtypes.float32)
   self.assertAllEqual(self.evaluate(stacked), [9., 12.])
def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
                                         check_inf_nan):
  """Combines the per-replica gradients of one shared variable.

  Note that this function provides a synchronization point across all
  replicas.

  Args:
    grad_and_vars: A list or tuple of (gradient, variable) tuples, one pair
      per replica, all belonging to the same shared variable.
    use_mean: if True, the mean of the gradients is taken, otherwise the sum.
    check_inf_nan: if True, the gradients are also checked for nans and infs.

  Returns:
    The tuple ((gradient, variable), has_nan_or_inf). The gradient is the sum
    or mean across replicas and the variable is taken from the first pair.
    has_nan_or_inf is None when `check_inf_nan` is False.
  """
  replica_grads = [gradient for gradient, _ in grad_and_vars]
  combined = math_ops.add_n(replica_grads)

  # A single gradient is already its own mean.
  num_grads = len(replica_grads)
  if use_mean and num_grads > 1:
    combined = array_ops.multiply(combined, 1.0 / num_grads)

  first_variable = grad_and_vars[0][1]
  has_nan_or_inf = None
  if check_inf_nan:
    all_finite = array_ops.reduce_all(array_ops.is_finite(replica_grads))
    has_nan_or_inf = array_ops.logical_not(all_finite)
  return (combined, first_variable), has_nan_or_inf
예제 #24
0
def sequence_classifier(decoding, labels, sampling_decoding=None, name=None):
  """Returns predictions and loss for sequence of predictions.

  Args:
    decoding: List of Tensors with predictions.
    labels: List of Tensors with labels.
    sampling_decoding: Optional, List of Tensor with predictions to be used
      in sampling. E.g. they shouldn't have dependency on outputs.
      If not provided, decoding is used.
    name: Operation name.

  Returns:
    Predictions and losses tensors.
  """
  with ops.op_scope([decoding, labels], name, "sequence_classifier"):
    step_losses = []
    step_predictions = []
    for step, step_logits in enumerate(decoding):
      step_xent = nn.softmax_cross_entropy_with_logits(
          step_logits, labels[step],
          name="sequence_loss/xent_raw{0}".format(step))
      step_losses.append(step_xent)
      # Predictions come from the sampling logits when those are supplied.
      prediction_source = (
          sampling_decoding[step] if sampling_decoding else step_logits)
      step_predictions.append(nn.softmax(prediction_source))
    xent = math_ops.add_n(step_losses, name="sequence_loss/xent")
    loss = math_ops.reduce_sum(xent, name="sequence_loss")
    return array_ops.expand_concat(1, step_predictions), loss
예제 #25
0
  def __init__(self, inputs, num_clusters, initial_clusters, distance_metric,
               random_seed, kmeans_plus_plus_num_retries, cluster_centers,
               cluster_centers_updated, cluster_centers_initialized):
    """Creates an op factory.

    Args:
      inputs: See KMeans constructor.
      num_clusters: An integer Tensor providing the number of clusters.
      initial_clusters: See KMeans constructor.
      distance_metric: See KMeans constructor.
      random_seed: See KMeans constructor.
      kmeans_plus_plus_num_retries: See KMeans constructor.
      cluster_centers: The TF variable holding the initial centers. It may
          already contain some centers when the op is executed.
      cluster_centers_updated: A second TF variable to hold a copy of the
          initial centers, used for full-batch mode. In mini-batch mode,
          cluster_centers_updated is the same variable as cluster_centers.
      cluster_centers_initialized: A boolean TF variable that will be set
          to true when all the initial centers have been chosen.
    """
    # Data and clustering configuration, stored as given.
    self._inputs = inputs
    self._num_clusters = num_clusters
    self._initial_clusters = initial_clusters
    self._distance_metric = distance_metric
    self._random_seed = random_seed
    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries

    # Variables that hold the cluster centers and their initialization state.
    self._cluster_centers = cluster_centers
    self._cluster_centers_updated = cluster_centers_updated
    self._cluster_centers_initialized = cluster_centers_initialized

    # Derived counts: centers present so far, centers still missing, and the
    # total number of rows over all inputs.
    self._num_selected = array_ops.shape(self._cluster_centers)[0]
    self._num_remaining = self._num_clusters - self._num_selected
    rows_per_input = [array_ops.shape(inp)[0] for inp in self._inputs]
    self._num_data = math_ops.add_n(rows_per_input)
예제 #26
0
  def test_distributive_property(self):
    """Verifies the distributive property of matrix multiplication."""
    with self.cached_session():
      params = constant_op.constant([.1, .2, .3])

      # Three single-value inputs and one input holding all three values.
      sp_a = sparse_tensor_lib.SparseTensor(
          values=["a"], indices=[[0, 0]], dense_shape=[3, 1])
      sp_b = sparse_tensor_lib.SparseTensor(
          values=["b"], indices=[[2, 0]], dense_shape=[3, 1])
      sp_c = sparse_tensor_lib.SparseTensor(
          values=["c"], indices=[[2, 0]], dense_shape=[3, 1])
      sp_all = sparse_tensor_lib.SparseTensor(
          values=["a", "b", "c"],
          indices=[[0, 0], [2, 0], [2, 1]],
          dense_shape=[3, 2])

      def _lookup(sp_values):
        return embedding_ops._sampled_scattered_embedding_lookup_sparse(
            params, sp_values, dimension=4, hash_key=self._hash_key)

      separate_results = [_lookup(sp_a), _lookup(sp_b), _lookup(sp_c)]
      combined_result = _lookup(sp_all)

      # The lookup of the combined input must equal the sum of the lookups.
      summed_result = math_ops.add_n(separate_results)
      self.assertAllClose(combined_result.eval(), summed_result.eval())
예제 #27
0
def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn,
                   method_string):
  """Reduces the values of `per_device_value` onto `reduce_to_device`.

  Args:
    per_device_value: Object whose `_index` mapping holds one entry per device.
      An entry that is a `value_lib.MapOutput` is first summed with `add_n`
      (and skipped entirely when its list is empty); any other entry is used
      as is.
    reduce_to_device: Device on which `accumulation_fn` is applied.
    accumulation_fn: Callable taking the list of per-device values and
      returning their accumulation.
    method_string: Either "sum" or "mean". For "mean" the accumulation is
      divided by the number of individual values, where every element of a
      `MapOutput` list counts separately.

  Returns:
    The reduced value.

  Raises:
    ValueError: If no values were collected, or if `method_string` is neither
      "sum" nor "mean".
  """
  collected = []
  num_values = 0
  for entry in per_device_value._index.values():  # pylint: disable=protected-access
    if not isinstance(entry, value_lib.MapOutput):
      num_values += 1
      collected.append(entry)
      continue
    mapped = entry.get()
    if not mapped:
      continue
    num_values += len(mapped)
    # Sum within each device before aggregating across devices.
    collected.append(math_ops.add_n(mapped))
  if not collected:
    raise ValueError("`per_device_value` must be non-empty")

  with ops.device(reduce_to_device):
    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
      if method_string not in ("sum", "mean"):
        raise ValueError("`method_string` must be 'sum' or 'mean'")
      reduced = accumulation_fn(collected)
      if method_string == "mean":
        reduced = reduced / num_values
  return reduced
예제 #28
0
  def testVariant(self):
    """Smoke test: `add_n` over scalar variant constants runs without error."""

    def make_variant_scalar(int_value):
      """Returns a scalar variant constant wrapping `int_value` as int32."""
      # Match registration in variant_op_registry.cc
      payload = tensor_pb2.VariantTensorDataProto(
          type_name=b"int",
          metadata=np.array(int_value, dtype=np.int32).tobytes())
      proto = tensor_pb2.TensorProto(
          dtype=dtypes.variant.as_datatype_enum,
          tensor_shape=tensor_shape.TensorShape([]).as_proto(),
          variant_val=[payload])
      return constant_op.constant(proto)

    # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
    # copying between CPU and GPU is supported.
    with self.session(use_gpu=False):
      three = make_variant_scalar(3)
      four = make_variant_scalar(4)
      five = make_variant_scalar(5)
      # 3 + 3 + 5 + 4 = 15.
      total = math_ops.add_n((three, three, five, four))

      # Smoke test -- ensure this executes without trouble.
      # Variant values cannot currently be fetched through session.run or
      # converted via ops.convert_to_tensor, so the result is only printed;
      # inspect the output to confirm that it equals 15.
      print_message = ("Variants stored an int: c(3), c(4), c(5), "
                       "add_n(c(3), c(3), c(5), c(4)): ")
      print_op = logging_ops.Print(
          total, [three, four, five, total], message=print_message).op
      print_op.run()
예제 #29
0
  def _reduce(self, aggregation, value, destinations):
    """Sums or averages `value`, across replicas or on the host.

    Args:
      aggregation: A `vs.VariableAggregation`; `MEAN` turns the sum into an
        average.
      value: Inside a `TPUReplicateContext`, the per-replica value to reduce;
        otherwise a sequence of values that is summed with `add_n`.
      destinations: Destination spec; outside the replicate context it must
        resolve to exactly one device, which must be the host.

    Returns:
      The reduced value.

    Raises:
      ValueError: If `destinations` does not resolve to exactly one device.
    """
    wants_mean = aggregation == vs.VariableAggregation.MEAN

    # If we're inside the ReplicateContext, reduction should be done using
    # CrossReplicaSum while outside we can directly use an add_n op.
    enclosing = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
    while enclosing:
      if isinstance(enclosing, tpu.TPUReplicateContext):
        if wants_mean:
          # TODO(jhseu):  Revisit once we support model-parallelism.
          value *= (1. / self.num_towers)
        return tpu_ops.cross_replica_sum(value)
      enclosing = enclosing.outer_context

    # Validate that the destination is same as the host device
    # Note we don't do this when in replicate context as the reduction is
    # performed on the TPU device itself.
    devices = cross_tower_ops_lib.get_devices_from(destinations)
    if len(devices) != 1:
      raise ValueError('Multiple devices are not supported for TPUStrategy')
    assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
        self._host)

    summed = math_ops.add_n(value)
    return summed * (1. / len(value)) if wants_mean else summed
예제 #30
0
  def approximate_duality_gap(self):
    """Adds ops that compute the approximate duality gap.

    The exported hash-table shards are each reduced along axis 0 in double
    precision and then added together. Entries 1, 2 and 3 of the summed
    vector are read as the primal loss, the dual loss and the total example
    weight respectively.

    Returns:
      An Operation that computes the approximate duality gap over all
      examples. The result is NaN when the total example weight is zero.
    """
    with name_scope('sdca/approximate_duality_gap'):
      _, shard_values = self._hashtable.export_sharded()
      per_shard_sums = []
      for shard in shard_values:
        with ops.device(shard.device):
          # to_double() allocates a large temporary for big tables, freed once
          # the sum finishes. Chaining each reduction on the previous ones
          # serializes them, which bounds peak memory when several large
          # tables live on one device.
          # Double precision is required for an accurate result.
          with ops.control_dependencies(per_shard_sums):
            shard_as_double = math_ops.to_double(shard)
            per_shard_sums.append(math_ops.reduce_sum(shard_as_double, 0))
      totals = math_ops.add_n(per_shard_sums)

      primal_loss, dual_loss, example_weights = totals[1], totals[2], totals[3]
      gap = primal_loss + dual_loss + self._l1_loss()
      gap = gap + (2.0 * self._l2_loss(self._symmetric_l2_regularization()))
      # Note: this is NaN if there are no weights or all weights are 0, e.g.
      # if no examples have been processed.
      return gap / example_weights
예제 #31
0
    def __call__(self,
                 y_true,
                 y_pred,
                 sample_weight=None,
                 regularization_losses=None):
        """Computes the overall loss and updates the loss metrics.

        Arguments:
          y_true: An arbitrary structure of Tensors representing the ground
            truth.
          y_pred: An arbitrary structure of Tensors representing a Model's
            outputs.
          sample_weight: An arbitrary structure of Tensors representing the
            per-sample loss weights. If one Tensor is passed, it is used for
            all losses. If multiple Tensors are passed, the structure should
            match `y_pred`.
          regularization_losses: Additional losses to be added to the total
            loss.

        Returns:
          The total loss: the `add_n` of every per-output loss plus the
          regularization loss, or a scalar zero tensor when there is nothing
          to sum.
        """
        y_true = map_to_output_names(y_pred, self._output_names, y_true)
        sample_weight = map_to_output_names(y_pred, self._output_names,
                                            sample_weight)

        if not self._built:
            self._build(y_pred)

        y_true = [] if y_true is None else nest.flatten(y_true)
        y_pred = nest.flatten(y_pred)
        num_outputs = len(y_pred)

        # TODO(omalleyt): Remove ambiguity here.
        # Broadcasting a single target over all outputs is what lets a
        # Functional Model with several outputs be trained with one loss and
        # one target. It is ambiguous, especially for subclassed models, and
        # should be reconsidered.
        if len(y_true) == 1 and num_outputs > 1:
            y_true = y_true * num_outputs

        sample_weight = nest.flatten(sample_weight)
        # A single sample-weight array is shared by all outputs.
        if len(sample_weight) == 1 and num_outputs > 1:
            sample_weight = sample_weight * num_outputs

        loss_values = []  # Feeds the gradient calculation.
        loss_metric_values = []  # Feeds the loss metric calculation.
        per_output = zip(y_true, y_pred, sample_weight, self._losses,
                         self._loss_weights, self._per_output_metrics)
        scaled_reductions = (losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                             losses_utils.ReductionV2.AUTO)
        for target, pred, weight, loss_obj, loss_weight, metric_obj in per_output:
            if loss_obj is None:  # An output is allowed to have no loss.
                continue

            target, pred, weight = match_dtype_and_rank(target, pred, weight)
            weight = apply_mask(pred, weight)

            loss_value = loss_obj(target, pred, sample_weight=weight)

            loss_metric_value = loss_value
            # Correct for the `Mean` loss metrics counting each replica as a
            # batch.
            if loss_obj.reduction == losses_utils.ReductionV2.SUM:
                strategy = ds_context.get_strategy()
                loss_metric_value *= strategy.num_replicas_in_sync
            if metric_obj is not None:
                metric_obj.update_state(loss_metric_value)

            if loss_weight is not None:
                loss_value *= loss_weight
                loss_metric_value *= loss_weight

            if loss_obj.reduction in scaled_reductions:
                loss_value = losses_utils.scale_loss_for_distribution(
                    loss_value)

            loss_values.append(loss_value)
            loss_metric_values.append(loss_metric_value)

        if regularization_losses:
            reg_loss = math_ops.add_n(regularization_losses)
            loss_metric_values.append(reg_loss)
            loss_values.append(
                losses_utils.scale_loss_for_distribution(reg_loss))

        if not loss_values:
            # Ok for a model to have no compiled loss.
            return array_ops.zeros(shape=())

        self._loss_metric.update_state(math_ops.add_n(loss_metric_values))
        return math_ops.add_n(loss_values)
예제 #32
0
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
    """Calculates the loss for a given model.

    Runs the model on `inputs`, computes one (optionally masked and
    sample-weighted) loss per non-None loss function, and accumulates the
    weighted output losses plus the model's own `losses` into a total.

    Arguments:
        model: The model on which metrics are being calculated.
        inputs: Either a dictionary of inputs to the model or a list of input
          arrays. A non-dict input of length 1 is unwrapped to its only
          element.
        targets: List of target arrays.
        output_loss_metrics: List of metrics that are used to aggregate the
          output loss values. Only read when the model has more than one
          output.
        sample_weights: Optional list of sample weight arrays.
        training: Whether the model should be run in inference or training
          mode. Only forwarded when `model._expects_training_arg` is true.

    Returns:
        A tuple `(outs, total_loss, output_losses, masks)`: the flattened
        model outputs, the total loss (including the model's `losses`, with
        masking and sample weighting applied), the per-output loss metric
        results (empty for single-output models), and the `_keras_mask` of
        each output (None where absent).
    """
    # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn.
    # Used to keep track of the total loss value (stateless).
    # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
    #                   loss_weight_2 * output_2_loss_fn(...) +
    #                   layer losses.
    total_loss = 0
    kwargs = {}
    if model._expects_training_arg:
        kwargs['training'] = training
    if len(inputs) == 1 and not isinstance(inputs, dict):
        inputs = inputs[0]

    # Allow mixed `NumPy` and `EagerTensor` input here.
    if any(
            isinstance(input_t, (np.ndarray, float, int))
            for input_t in nest.flatten(inputs)):
        inputs = nest.map_structure(ops.convert_to_tensor, inputs)

    outs = model(inputs, **kwargs)
    outs = nest.flatten(outs)
    masks = [getattr(t, '_keras_mask', None) for t in outs]
    targets = nest.flatten(targets)

    # Used to keep track of individual output losses.
    output_losses = []

    with backend.name_scope('loss'):
        # NOTE(review): `i` below enumerates this None-filtered list but is
        # also used to index `sample_weights`, `masks`, `targets`, `outs`,
        # `model.output_names` and `model._loss_weights_list`, which are not
        # filtered here. Presumably callers guarantee the indices line up
        # (e.g. no None loss functions) -- confirm.
        loss_fns = [
            loss_fn for loss_fn in model.loss_functions if loss_fn is not None
        ]
        for i, loss_fn in enumerate(loss_fns):
            weights = sample_weights[i] if sample_weights else None
            mask = masks[i]
            with backend.name_scope(model.output_names[i] + '_loss'):
                if mask is not None:
                    mask = math_ops.cast(mask, outs[i].dtype)
                    # Update weights with mask.
                    if weights is None:
                        weights = mask
                    else:
                        # Update dimensions of weights to match with mask if possible.
                        mask, _, weights = (
                            tf_losses_utils.squeeze_or_expand_dimensions(
                                mask, sample_weight=weights))
                        weights *= mask

                # Loss objects exposing `reduction` are evaluated per sample
                # via `call`, weighted, and then reduced explicitly here.
                if hasattr(loss_fn, 'reduction'):
                    per_sample_losses = loss_fn.call(targets[i], outs[i])
                    weighted_losses = losses_utils.compute_weighted_loss(
                        per_sample_losses,
                        sample_weight=weights,
                        reduction=losses_utils.ReductionV2.NONE)
                    loss_reduction = loss_fn.reduction

                    # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all
                    # compile use cases.
                    if loss_reduction == losses_utils.ReductionV2.AUTO:
                        loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE

                    # Compute the stateless loss value.
                    output_loss = losses_utils.reduce_weighted_loss(
                        weighted_losses, reduction=loss_reduction)
                else:
                    # Compute the stateless loss value for a custom loss class.
                    # Here we assume that the class takes care of loss reduction
                    # because if this class returns a vector value we cannot
                    # differentiate between use case where a custom optimizer
                    # expects a vector loss value vs unreduced per-sample loss value.
                    output_loss = loss_fn(targets[i],
                                          outs[i],
                                          sample_weight=weights)
                    loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE

            # If the number of outputs is 1 then we don't append the loss metric
            # associated with each model output. When there are multiple outputs
            # associated with a model, each output's loss is calculated and returned
            # as part of the loss_metrics.
            if len(model.outputs) > 1:
                # Keep track of the stateful output loss result.
                output_losses.append(output_loss_metrics[i](output_loss))

            # Scale output loss for distribution. For custom losses we assume
            # reduction was mean.
            if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
                output_loss = losses_utils.scale_loss_for_distribution(
                    output_loss)
            total_loss += model._loss_weights_list[i] * output_loss

        # Add regularization losses
        custom_losses = model.losses
        if custom_losses:
            total_loss += losses_utils.scale_loss_for_distribution(
                math_ops.add_n(custom_losses))

    return outs, total_loss, output_losses, masks
예제 #33
0
def _inner_product_list(list1, list2):
    """Returns the sum over paired elements of `reduce_sum(elt1 * elt2)`.

    Elements are paired with `zip`, so surplus elements of the longer list are
    ignored.
    """
    pairwise_sums = []
    for left, right in zip(list1, list2):
        pairwise_sums.append(math_ops.reduce_sum(left * right))
    return math_ops.add_n(pairwise_sums)
예제 #34
0
 def testExecuteIntAttr(self):
     """Adds the constants 3 and 4 with `add_n` and expects 7."""
     operands = [constant_op.constant(3), constant_op.constant(4)]
     self.assertAllEqual(7, math_ops.add_n(operands))
예제 #35
0
 def size(self, name=None):
     """Returns the sum of `size()` over the first `_num_shards` table shards.

     Args:
       name: Optional name scope for the created ops; defaults to
         'sharded_mutable_hash_table_size'.
     """
     with ops.name_scope(name, 'sharded_mutable_hash_table_size'):
         shard_sizes = []
         for shard_index in range(self._num_shards):
             shard_sizes.append(self._table_shards[shard_index].size())
         return math_ops.add_n(shard_sizes)
예제 #36
0
def _scaled_dot_product(scale, xs, ys, name=None):
    """Calculate a scaled, vector inner product between lists of Tensors.

    Args:
      scale: Factor applied to every `x` before it is multiplied by `y`.
      xs: List of tensors.
      ys: List of tensors, paired with `xs` via `zip`.
      name: Optional name for the resulting `add_n` op. Defaults to
        'scaled_dot_product'.

    Returns:
      The `add_n` of `(scale * x) * y` over all pairs for which both
      `_possibly_nonzero(x)` and `_possibly_nonzero(y)` hold.
    """
    # Honour the caller's `name`; it used to be accepted but ignored.
    if name is None:
        name = 'scaled_dot_product'
    terms = [(scale * x) * y for x, y in zip(xs, ys)
             if _possibly_nonzero(x) and _possibly_nonzero(y)]
    return math_ops.add_n(terms, name=name)
예제 #37
0
  def create_estimator_spec(
      self, features, mode, logits, labels=None, train_op_fn=None,
      regularization_losses=None):
    """Builds the `EstimatorSpec` for PREDICT, EVAL or TRAIN mode.

    Args:
      features: Input `dict` of `Tensor` or `SparseTensor` objects.
      mode: Estimator's `ModeKeys`.
      logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`.
        For many applications, the shape is `[batch_size, n_classes]`.
      labels: Labels with shape matching `logits`. Can be multi-hot `Tensor`
        with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
        `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
        `mode` equals `TRAIN` or `EVAL`.
      train_op_fn: Function that takes a scalar loss `Tensor` and returns
        `train_op`. Required in TRAIN mode.
      regularization_losses: A list of additional scalar losses to be added to
        the training loss, such as regularization losses. These losses are
        usually expressed as a batch average, so for best results users need to
        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
        avoid scaling errors.
    Returns:
      `EstimatorSpec`.
    Raises:
      ValueError: If `train_op_fn` is `None` in TRAIN mode.
    """
    with ops.name_scope(self._name, 'head'):
      logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access

      # Predictions are built for every mode.
      pred_keys = prediction_keys.PredictionKeys
      with ops.name_scope(None, 'predictions', (logits,)):
        probabilities = math_ops.sigmoid(logits, name=pred_keys.PROBABILITIES)
        predictions = {}
        predictions[pred_keys.LOGITS] = logits
        predictions[pred_keys.PROBABILITIES] = probabilities

      # Predict.
      if mode == model_fn.ModeKeys.PREDICT:
        classifier_output = head_lib._classification_output(  # pylint:disable=protected-access
            scores=probabilities, n_classes=self._n_classes,
            label_vocabulary=self._label_vocabulary)
        export_outputs = {
            _DEFAULT_SERVING_KEY: classifier_output,
            head_lib._CLASSIFY_SERVING_KEY: classifier_output,  # pylint:disable=protected-access
            head_lib._PREDICT_SERVING_KEY: (  # pylint:disable=protected-access
                export_output.PredictOutput(predictions)),
        }
        return model_fn.EstimatorSpec(
            mode=model_fn.ModeKeys.PREDICT,
            predictions=predictions,
            export_outputs=export_outputs)

      loss_parts = self.create_loss(
          features=features, mode=mode, logits=logits, labels=labels)
      training_loss, unreduced_loss, weights, processed_labels = loss_parts

      # Without regularization losses the training loss is used unchanged.
      regularization_loss = None
      regularized_training_loss = training_loss
      if regularization_losses:
        regularization_loss = math_ops.add_n(regularization_losses)
        regularized_training_loss = math_ops.add_n(
            [training_loss, regularization_loss])

      # Eval.
      if mode == model_fn.ModeKeys.EVAL:
        eval_metric_ops = self._eval_metric_ops(
            labels=processed_labels,
            probabilities=probabilities,
            weights=weights,
            unreduced_loss=unreduced_loss,
            regularization_loss=regularization_loss)
        return model_fn.EstimatorSpec(
            mode=model_fn.ModeKeys.EVAL,
            predictions=predictions,
            loss=regularized_training_loss,
            eval_metric_ops=eval_metric_ops)

      # Train.
      if train_op_fn is None:
        raise ValueError('train_op_fn can not be None.')
      # The mean loss is summarized only for SUM reduction, to preserve
      # backwards compatibility; other reductions skip the extra computation.
      mean_loss = None
      if self._loss_reduction == losses.Reduction.SUM:
        example_weight_sum = math_ops.reduce_sum(
            weights * array_ops.ones_like(unreduced_loss))
        mean_loss = training_loss / example_weight_sum

    with ops.name_scope(''):
      keys = metric_keys.MetricKeys
      summary.scalar(
          head_lib._summary_key(self._name, keys.LOSS),  # pylint:disable=protected-access
          regularized_training_loss)
      if mean_loss is not None:
        summary.scalar(
            head_lib._summary_key(self._name, keys.LOSS_MEAN),  # pylint:disable=protected-access
            mean_loss)
      if regularization_loss is not None:
        summary.scalar(
            head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
            regularization_loss)

    train_op = train_op_fn(regularized_training_loss)
    return model_fn.EstimatorSpec(
        mode=model_fn.ModeKeys.TRAIN,
        predictions=predictions,
        loss=regularized_training_loss,
        train_op=train_op)
def variational_beam_decoder_with_buckets(means,
                                          logvars,
                                          decoder_inputs,
                                          targets,
                                          weights,
                                          buckets,
                                          decoder,
                                          latent_dec,
                                          kl_f,
                                          sample,
                                          iaf=False,
                                          softmax_loss_function=None,
                                          per_example_loss=False,
                                          name=None):
    """Create a variational beam-search decoder with support for bucketing.

    For every bucket a latent vector is sampled, turned into the decoder's
    initial state, decoded, and scored against the targets.

    Args:
      means: Per-bucket inputs passed to `sample` as its first argument.
      logvars: Per-bucket inputs passed to `sample` as its second argument.
      decoder_inputs: List of decoder inputs; the first `bucket[1]` entries
        are used for each bucket.
      targets: List of targets, at least as long as the last bucket.
      weights: List of weights, at least as long as the last bucket.
      buckets: List of pairs; only element 1 of each pair is read here.
      decoder: Callable `(initial_state, inputs)` returning a 4-tuple whose
        first element is the bucket outputs.
      latent_dec: Callable mapping a latent vector to the decoder's initial
        state.
      kl_f: Callable applied to the KL cost to obtain the KL objective.
      sample: Callable `(mean, logvar)` returning `(latent_vector, kl_cost)`.
      iaf: Unused.
      softmax_loss_function: Forwarded to the sequence loss function.
      per_example_loss: If true, use `sequence_loss_by_example`, otherwise
        `sequence_loss`.
      name: Optional name scope; defaults to
        "variational_decoder_with_buckets".

    Returns:
      A tuple `(outputs, losses, KL_objs, KL_costs)` with one entry per
      bucket in every list.

    Raises:
      ValueError: If `targets` or `weights` is shorter than the last bucket.
    """
    if len(targets) < buckets[-1][1]:
        raise ValueError("Length of targets (%d) must be at least that of last "
                         "bucket (%d)." % (len(targets), buckets[-1][1]))
    if len(weights) < buckets[-1][1]:
        raise ValueError("Length of weights (%d) must be at least that of last "
                         "bucket (%d)." % (len(weights), buckets[-1][1]))

    all_inputs = decoder_inputs + targets + weights
    losses = []
    outputs = []
    KL_objs = []
    KL_costs = []
    loss_fn = sequence_loss_by_example if per_example_loss else sequence_loss
    with ops.name_scope(name, "variational_decoder_with_buckets", all_inputs):
        for j, bucket in enumerate(buckets):
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_len = bucket[1]
                latent_vector, kl_cost = sample(means[j], logvars[j])
                decoder_initial_state = latent_dec(latent_vector)

                # NOTE(review): the decoder's beam path and beam symbol are
                # discarded because the 4-tuple returned below has no slot
                # for them. The original appended them to lists (one of
                # which was never defined) that were never returned.
                bucket_outputs, _, _, _ = decoder(
                    decoder_initial_state, decoder_inputs[:bucket_len])
                outputs.append(bucket_outputs)

                # The epsilon guards the division against all-zero weights.
                total_size = math_ops.add_n(weights[:bucket_len])
                total_size += 1e-12
                # NOTE(review): assumes `kl_f` is a callable that maps the raw
                # KL cost to the KL training objective -- confirm against the
                # caller. `KL_objs` and `KL_costs` were returned but never
                # defined in the original.
                KL_objs.append(tf.reduce_mean(kl_f(kl_cost) / total_size))
                KL_costs.append(tf.reduce_mean(kl_cost / total_size))

                losses.append(
                    loss_fn(
                        outputs[-1],
                        targets[:bucket_len],
                        weights[:bucket_len],
                        softmax_loss_function=softmax_loss_function))

    return outputs, losses, KL_objs, KL_costs
    def __init__(self, config, name_scope, dtype=tf.float32):
        """Builds the per-bucket reward network, losses and training ops.

        For every bucket the graph encodes the query, projects the embedded
        answer tokens, runs a context RNN over the encoder state followed by
        the projected answers, scores every step with a small feed-forward
        stack ending in a sigmoid, and minimises the weighted sum of those
        per-step rewards with gradient descent.

        Args:
          config: Configuration object. This constructor reads `embed_dim`,
            `word_embedding`, `num_layers`, `vocab_size`, `buckets`,
            `learning_rate` and `keep_prob` from it. Each bucket is indexed as
            `bucket[0]` (number of query steps) and `bucket[1]` (number of
            answer steps).
          name_scope: Substring used to select, from `tf.global_variables()`,
            the variables handed to the saver.
          dtype: Dtype of the `learning_rate` variable. All other tensors in
            this graph are created as `tf.float32` or `tf.int32`.
        """
        with tf.device("/gpu:0"):
            emb_dim = config.embed_dim
            word_embedding = config.word_embedding
            num_layers = config.num_layers
            vocab_size = config.vocab_size
            buckets = config.buckets
            self.learning_rate = tf.Variable(float(config.learning_rate), trainable=False, dtype=dtype)
            self.global_step = tf.Variable(initial_value=0, trainable=False)

            # One placeholder per time step, sized for the largest (last)
            # bucket. `range` is used throughout: `xrange` does not exist on
            # Python 3.
            self.query = [
                tf.placeholder(dtype=tf.int32, shape=[None], name="query{0}".format(i))
                for i in range(buckets[-1][0])]
            self.answer = [
                tf.placeholder(dtype=tf.int32, shape=[None], name="answer{0}".format(i))
                for i in range(buckets[-1][1])]
            self.weight = [
                tf.placeholder(dtype=tf.float32, shape=[None], name="weight{0}".format(i))
                for i in range(buckets[-1][1])]

            self.traj_ip_weight = tf.placeholder(dtype=tf.float32, shape=[None], name="traj_weight")

            def create_rnn_cell():
                """Returns a GRU cell of size `emb_dim` with output dropout."""
                cell = tf.nn.rnn_cell.GRUCell(emb_dim)
                return tf.contrib.rnn.DropoutWrapper(
                    cell,
                    input_keep_prob=1.0,
                    output_keep_prob=config.keep_prob
                )

            encoder_mutil = tf.contrib.rnn.MultiRNNCell(
                [create_rnn_cell() for _ in range(num_layers)],
            )
            query_encoder_emb = EmbeddingWrapper_GPU(encoder_mutil, embedding_classes=vocab_size,
                                                     embedding_size=word_embedding)

            context_multi = tf.contrib.rnn.MultiRNNCell(
                [create_rnn_cell() for _ in range(1)],
            )

            self.b_query_state = []
            self.b_answer_state = []
            self.b_state = []
            self.b_reward = []
            self.b_loss = []
            self.b_train_op = []
            self.b_traj_reward = []
            for i, bucket in enumerate(buckets):
                state_list = []
                reward_list = []
                # Variables are created for the first bucket and reused by
                # all later ones.
                with tf.variable_scope(name_or_scope="Hier_RNN_encoder", reuse=True if i > 0 else None):
                    query_output, query_state = tf.contrib.rnn.static_rnn(query_encoder_emb,
                                                                          inputs=self.query[:bucket[0]],
                                                                          dtype=tf.float32)
                    self.b_query_state.append(query_state)

                # Embed the answer tokens with the encoder's embedding matrix.
                with tf.variable_scope("Hier_RNN_encoder/rnn/embedding_wrapper", reuse=True):
                    embed_in = tf.get_variable("embedding")
                    emb_answer = [
                        embedding_ops.embedding_lookup(embed_in, ix) for ix in self.answer[:bucket[1]]]

                with tf.variable_scope(name_or_scope="Hier_RNN_context", reuse=True if i > 0 else None) as var_scope:
                    # The last entry of the encoder's final state seeds the
                    # context sequence.
                    query_state_history = query_state[-1]
                    context_action_history = []
                    for j in range(0, bucket[1]):
                        if j > 0:
                            var_scope.reuse_variables()
                        action = emb_answer[j]
                        emb_proj_w = tf.get_variable("embd_project_w", [word_embedding, emb_dim], dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        emb_proj_b = tf.get_variable("embd_project_b", [emb_dim], dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        projected = tf.matmul(action, emb_proj_w) + emb_proj_b
                        context_action_history.append(projected)

                with tf.variable_scope(name_or_scope="Reward_concat_layer", reuse=True if i > 0 else None):
                    context_input = [query_state_history] + context_action_history
                    output, _ = tf.contrib.rnn.static_rnn(context_multi, context_input, dtype=tf.float32)
                    # output[j] is the context output produced before answer
                    # step j was consumed; pair it with that step's action.
                    for j in range(0, bucket[1]):
                        state_action_pair = [output[j], context_action_history[j]]
                        state_list.append(state_action_pair)
                self.b_state.append(state_list)

                with tf.variable_scope("Softmax_layer_and_output", reuse=True if i > 0 else None) as var_scope:
                    for j in range(0, bucket[1]):
                        if j > 0:
                            var_scope.reuse_variables()
                        softmax_w1_s = tf.get_variable("softmax_w1_s", [emb_dim, 100], dtype=tf.float32,
                                                       initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_b1_s = tf.get_variable("softmax_b1_s", 100, dtype=tf.float32,
                                                       initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_w1_a = tf.get_variable("softmax_w1_a", [emb_dim, 100], dtype=tf.float32,
                                                       initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_b1_a = tf.get_variable("softmax_b1_a", 100, dtype=tf.float32,
                                                       initializer=tf.random_normal_initializer(stddev=0.1))
                        s_1 = tf.matmul(state_list[j][0], softmax_w1_s) + softmax_b1_s
                        a_1 = tf.matmul(state_list[j][1], softmax_w1_a) + softmax_b1_a
                        s_a_1 = tf.concat([s_1, a_1], 1)

                        softmax_w3 = tf.get_variable("softmax_w3", [200, 100], dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_b3 = tf.get_variable("softmax_b3", 100, dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_w4 = tf.get_variable("softmax_w4", [100, 50], dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_b4 = tf.get_variable("softmax_b4", 50, dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_w5 = tf.get_variable("softmax_w5", [50, 1], dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))
                        softmax_b5 = tf.get_variable("softmax_b5", 1, dtype=tf.float32,
                                                     initializer=tf.random_normal_initializer(stddev=0.1))

                        logits_mid1 = tf.matmul(s_a_1, softmax_w3) + softmax_b3
                        logits_mid2 = tf.matmul(logits_mid1, softmax_w4) + softmax_b4
                        logits = tf.matmul(logits_mid2, softmax_w5) + softmax_b5

                        # Flatten the sigmoid score and scale it by this
                        # step's weight placeholder.
                        reward = tf.nn.sigmoid(logits)
                        reward = tf.reshape(reward, [-1])
                        reward = tf.multiply(reward, self.weight[j])
                        reward_list.append(reward)
                self.b_reward.append(reward_list)

                with tf.name_scope("loss"):
                    traj_reward = math_ops.add_n(reward_list)
                    loss = tf.multiply(traj_reward, self.traj_ip_weight)
                    # Despite its name this is a sum, not a mean.
                    mean_loss = tf.reduce_sum(loss)
                    self.b_loss.append(mean_loss)
                    self.b_traj_reward.append(traj_reward)

                with tf.name_scope("gradient_descent"):
                    optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
                    gradients, variables = zip(*optimizer.compute_gradients(mean_loss))
                    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                    train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=self.global_step)
                    self.b_train_op.append(train_op)

            all_variables = [v for v in tf.global_variables() if name_scope in v.name]
            self.saver = tf.train.Saver(all_variables)
예제 #40
0
def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    loop_state: An object for maintaining the state of the while loops in the
                graph. It is of type ControlFlowState. None if the graph
                contains no while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
      `None` selects `AggregationMethod.DEFAULT`.

  Returns:
    A list of gradients, one per each output of `op`. If the gradients
      for a particular output is a list, this function aggregates it
      before returning. The list obtained from `_GetGrads` is updated in
      place and returned; empty entries become `None`.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.

  """
  # The ABC aliases on `collections` (such as `collections.Sequence`) were
  # removed in Python 3.10, so resolve `Sequence` from `collections.abc` and
  # only fall back to the legacy location on interpreters without it.
  try:
    from collections import abc as collections_abc
  except ImportError:  # Python 2: the ABCs live directly on `collections`.
    import collections as collections_abc

  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  if aggregation_method not in [
      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  ]:
    raise ValueError("Invalid aggregation_method specified %s." %
                     aggregation_method)
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if loop_state:
      # With while-loop state present, an entry that is already a single
      # Tensor/IndexedSlices is only expected for a loop switch op and is
      # left untouched.
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert control_flow_ops.IsLoopSwitch(op)
        continue
    # Grads have to be Tensors or IndexedSlices
    if (isinstance(out_grad, collections_abc.Sequence) and not all(
        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad
        if g is not None)):
      raise TypeError("gradients have to be either all Tensors "
                      "or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if len(out_grad) < 2:
        # A single gradient needs no aggregation.
        used = "nop"
        out_grads[i] = out_grad[0]
      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
        tensor_shape = _AccumulatorShape(out_grad)
        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
          # The benefit of using AccumulateN is that its inputs can be combined
          # in any order and this can allow the expression to be evaluated with
          # a smaller memory footprint.  When used with gpu_allocator_retry,
          # it is possible to compute a sum of terms which are much larger than
          # total GPU memory.
          # AccumulateN can currently only be used if we know the shape for
          # an accumulator variable.  If this is not known, or if we only have
          # 2 grads then we fall through to the "tree" case below.
          used = "accumulate_n"
          out_grads[i] = math_ops.accumulate_n(out_grad)
        elif aggregation_method in [
            AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        ]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = _MultiDeviceAddN(out_grad)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
                     len(out_grad), tensor_shape, used)
      else:
        # At least one gradient is not a plain Tensor: convert everything to
        # IndexedSlices (dropping None entries) and concatenate.
        out_grad = math_ops._as_indexed_slices_list(
            [g for g in out_grad if g is not None])
        out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
        # Form IndexedSlices out of the concatenated values and
        # indices.
        out_grads[i] = ops.IndexedSlices(
            array_ops.concat([x.values for x in out_grad], 0),
            array_ops.concat([x.indices for x in out_grad], 0),
            out_grad[0].dense_shape)
    else:  # not out_grad
      # out_grads[i] is [], thus its aggregation is simply None.
      out_grads[i] = None
  return out_grads
예제 #41
0
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
    """Build a linear (weighted-sum) prediction from FeatureColumns.

    Each distinct feature column contributes one `[-1, num_outputs]` tensor;
    the contributions are summed with `add_n` and a bias vector is added.
    In classification problems the result plays the role of logits; in
    linear regression it is the prediction itself.

    Example:

      ```
      # Building model for training
      feature_columns = (
          real_valued_column("my_feature1"),
          ...
      )
      columns_to_tensor = tf.parse_example(...)
      logits = weighted_sum_from_feature_columns(
          columns_to_tensors=columns_to_tensor,
          feature_columns=feature_columns,
          num_outputs=1)
      loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                     logits=logits)
      ```

    Args:
      columns_to_tensors: A mapping from feature column to tensors. 'string'
        key means a base feature (not-transformed). It can have FeatureColumn
        as a key too. That means that FeatureColumn is already transformed by
        input pipeline. For example, `inflow` may have handled
        transformations. A copy of the mapping is used internally.
      feature_columns: A set containing all the feature columns. All items in
        the set should be instances of classes derived from FeatureColumn.
        Duplicates are dropped and columns are processed in order of `key`.
      num_outputs: An integer specifying number of outputs.
      weight_collections: List of graph collections to which weights are
        added.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      A tuple containing:

        * A Tensor which represents predictions of a linear model.
        * A dictionary which maps feature_column to corresponding Variable.
        * A Variable which is used for bias.

    Raises:
      ValueError: if FeatureColumn cannot be used for linear predictions.
    """
    columns_to_tensors = columns_to_tensors.copy()
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        per_column_outputs = []
        column_to_variable = {}
        transformer = _Transformer(columns_to_tensors)
        # De-duplicate and fix the processing order by column key.
        ordered_columns = sorted(set(feature_columns),
                                 key=lambda col: col.key)
        # pylint: disable=protected-access
        for column in ordered_columns:
            transformed = transformer.transform(column)
            try:
                # Preferred path: the column supports a wide embedding lookup.
                lookup_args = column._wide_embedding_lookup_arguments(
                    transformed)
                variable, column_output = _create_embedding_lookup(
                    column, columns_to_tensors, lookup_args, num_outputs,
                    trainable, weight_collections)
            except NotImplementedError:
                # Fallback path: densify the column and apply a plain matmul
                # against a zero-initialised weight matrix.
                with variable_scope.variable_scope(
                        None,
                        default_name=column.name,
                        values=columns_to_tensors.values()):
                    dense = column._to_dense_tensor(transformed)
                    dense = _maybe_reshape_input_tensor(dense,
                                                        column.name,
                                                        output_rank=2)
                    weight = contrib_variables.model_variable(
                        name='weight',
                        shape=[dense.get_shape()[1], num_outputs],
                        initializer=init_ops.zeros_initializer(),
                        trainable=trainable,
                        collections=weight_collections)
                    variable = [weight]
                    column_output = math_ops.matmul(dense,
                                                    variable[0],
                                                    name='matmul')
            except ValueError as err:
                raise ValueError(
                    'Error creating weighted sum for column: {}.\n'
                    '{}'.format(column.name, err))
            reshaped = array_ops.reshape(column_output,
                                         shape=(-1, num_outputs))
            per_column_outputs.append(reshaped)
            column_to_variable[column] = variable
            _log_variable(variable)
            _maybe_restore_from_checkpoint(column._checkpoint_path(), variable)
        # pylint: enable=protected-access
        summed = math_ops.add_n(per_column_outputs)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=_add_variable_collection(weight_collections))
        _log_variable(bias)
        biased = nn_ops.bias_add(summed, bias)

        return biased, column_to_variable, bias
예제 #42
0
 def add_all(*args):
     """Forward the positional arguments unchanged to `math_ops.add_n`."""
     total = math_ops.add_n(*args)
     return total
예제 #43
0
def _dot_product(xs, ys, name=None):
    """Calculate the vector inner product between two lists of Tensors.

    Args:
      xs: Iterable of operands; paired positionally with `ys`.
      ys: Iterable of operands; paired positionally with `xs`. Pairing uses
        `zip`, so extra items in the longer iterable are ignored.
      name: Optional name for the final `add_n` op. Previously this argument
        was accepted but silently ignored.

    Returns:
      The `add_n` sum of the products `x * y` over the zipped pairs.
    """
    products = [x * y for x, y in zip(xs, ys)]
    return math_ops.add_n(products, name=name)
예제 #44
0
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays.
      targets: List of target arrays.
      output_loss_metrics: List of metrics that are used to aggregated output
        loss values. Indexed per output when the model has more than one
        output.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.

  Returns:
     A tuple `(outs, total_loss, output_losses, masks)`: the flattened model
     outputs, the total loss, the per-output aggregated loss values (only
     populated when the model has more than one output) and the mask of each
     output. The total loss includes regularization losses and applies
     masking and sample weighting to the loss value.
  """
  # Used to keep track of the total loss value (stateless).
  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
  #                   loss_weight_2 * output_2_loss_fn(...) +
  #                   layer losses.
  total_loss = 0
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  # Unwrap a single-element, non-dict input container.
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  # Allow mixed `NumPy` and `EagerTensor` input here.
  if any(
      isinstance(input_t, (np.ndarray, float, int))
      for input_t in nest.flatten(inputs)):
    inputs = nest.map_structure(ops.convert_to_tensor, inputs)

  outs = model(inputs, **kwargs)

  outs = nest.flatten(outs)
  # `None` by default for `EagerTensors`.
  masks = [t._keras_mask for t in outs]
  targets = nest.flatten(targets)

  # Used to keep track of individual output losses.
  output_losses = []

  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]
      with backend.name_scope(model.output_names[i] + '_loss'):
        if mask is not None:
          mask = math_ops.cast(mask, outs[i].dtype)
          # Update weights with mask.
          if weights is None:
            weights = mask
          else:
            # Update dimensions of weights to match with mask if possible.
            mask, _, weights = (
                losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
            weights *= mask

        # Reset reduction on the loss so that we can get the per sample loss
        # value. We use this to get both the stateless and stateful loss
        # values without having to compute the underlying loss function
        # twice.
        weighted_losses = None
        if hasattr(loss_fn, 'reduction'):
          current_loss_reduction = loss_fn.reduction
          loss_fn.reduction = losses_utils.ReductionV2.NONE
          try:
            weighted_losses = loss_fn(
                targets[i], outs[i], sample_weight=weights)
          finally:
            # Always restore the caller-visible reduction, even when the loss
            # call raises; otherwise the loss object would be left stuck in
            # `NONE` reduction for every later use.
            loss_fn.reduction = current_loss_reduction

          # Compute the stateless loss value.
          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
        else:
          # Compute the stateless loss value for a custom loss class.
          # Here we assume that the class takes care of loss reduction
          # because if this class returns a vector value we cannot
          # differentiate between use case where a custom optimizer
          # expects a vector loss value vs unreduced per-sample loss value.
          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)

      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        # Compute the stateful loss value.
        if weighted_losses is not None:
          aggregated_output_loss = output_loss_metrics[i](weighted_losses)
        else:
          # Custom loss class.
          aggregated_output_loss = training_utils.call_metric_function(
              output_loss_metrics[i], targets[i], outs[i], weights=weights)
        # Keep track of the stateful output loss result.
        output_losses.append(aggregated_output_loss)

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)
    # Add regularization losses
    custom_losses = model.losses
    if custom_losses:
      total_loss += losses_utils.scale_loss_for_distribution(
          math_ops.add_n(custom_losses))

  return outs, total_loss, output_losses, masks
예제 #45
0
 def read_var(self, tower_local_var):
   """Return the aggregate value of a tower-local (or mirrored) variable.

   A `values.TowerLocalVariable` is unwrapped and its components are summed
   with `add_n`; anything else must be a `values.Mirrored`, whose `get()`
   value is returned through `identity`.
   """
   if not isinstance(tower_local_var, values.TowerLocalVariable):
     assert isinstance(tower_local_var, values.Mirrored)
     return array_ops.identity(tower_local_var.get())
   components = self.unwrap(tower_local_var)
   return math_ops.add_n(components)
예제 #46
0
 def _calculate_t1_cond_values(self, alpha, beta, eta, lambd,
                               scaled_s_dg_db, scaled_s_dg_dl, graph, t):
     """
     Builds the updates of alpha, beta, eta, lambda and gamma that are gated
     on the condition ``t > 1`` and the assign ops that store them.

     :param alpha: Current alpha; lowered by ``2 * self._delta_t`` when
         ``t > 1`` and ``alpha > alpha0``.
     :param beta: Current beta; replaced by ``exp(alpha0)`` under the same
         condition.
     :param eta: Current eta; lowered by ``2 * self._delta_t`` when ``t > 1``
         and ``lambd > gamma``.
     :param lambd: Current lambda; replaced by ``gamma`` under the same
         condition.
     :param scaled_s_dg_db: Iterable of tensors whose summed squares form the
         denominator of the first ratio inside ``alpha0``.
     :param scaled_s_dg_dl: Iterable of tensors whose summed squares form the
         denominator of the second ratio inside ``alpha0``.
     :param graph: Passed to ``self._get_non_slot_variable`` to look up the
         non-slot variables.
     :param t: Step value compared against ``1.0``.
     :return: Tuple ``(assignments, beta, lambd, gamma)``: the list of assign
         ops for alpha, beta, eta, lambda and gamma, followed by the updated
         beta, lambda and gamma values. The updated alpha and eta are only
         stored through the assign ops, not returned.
     """
     assignments = []
     # Every update below only takes effect when t > 1.
     cond_t1 = math_ops.greater(t, 1.0)
     # Global L2 norms over all variables of the "s_dt_db" / "s_dt_dl" slots.
     s_dt_db_norm = math_ops.sqrt(
         math_ops.add_n([
             math_ops.reduce_sum(self.get_slot(v, "s_dt_db")**2.0)
             for v in self._vars
         ]))
     s_dt_dl_norm = math_ops.sqrt(
         math_ops.add_n([
             math_ops.reduce_sum(self.get_slot(v, "s_dt_dl")**2.0)
             for v in self._vars
         ]))
     # alpha0 = log(0.5 * min(|s_dt_db| / |scaled_s_dg_db|,
     #                        |s_dt_dl| / |scaled_s_dg_dl|)) when t > 1,
     # otherwise +inf so that `alpha > alpha0` below can never hold.
     alpha0 = control_flow_ops.cond(
         cond_t1, lambda: math_ops.log(0.5 * math_ops.minimum(
             s_dt_db_norm / math_ops.sqrt(
                 math_ops.add_n([
                     math_ops.reduce_sum(sgb**2.0) for sgb in scaled_s_dg_db
                 ])), s_dt_dl_norm / math_ops.sqrt(
                     math_ops.add_n([
                         math_ops.reduce_sum(sgl**2.0)
                         for sgl in scaled_s_dg_dl
                     ])))), lambda: float('Inf'))
     cond_a0 = math_ops.greater(alpha, alpha0)
     # When alpha exceeds alpha0: step alpha down and reset beta to exp(alpha0).
     # Note that cond_a0 was computed from the alpha value before this update.
     alpha = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_a0),
                                   lambda: alpha - 2.0 * self._delta_t,
                                   lambda: alpha)
     beta = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_a0),
                                  lambda: math_ops.exp(alpha0),
                                  lambda: beta)
     eg2 = self._get_non_slot_variable("e_g2", graph)
     em2 = self._get_non_slot_variable("e_m2", graph)
     # gamma = min(1, C_t * e_g2 / |s_dt_db|^2, C_t * e_m2 / |s_dt_dl|^2) when
     # t > 1; otherwise the stored "gamma" variable is kept.
     gamma = control_flow_ops.cond(
         cond_t1, lambda: math_ops.minimum(
             1.0,
             math_ops.minimum(self._C_t * eg2 / s_dt_db_norm**2.0, self._C_t
                              * em2 / s_dt_dl_norm**2.0)),
         lambda: self._get_non_slot_variable("gamma", graph))
     cond_gl = math_ops.greater(lambd, gamma)
     # When lambda exceeds gamma: step eta down and clamp lambda to gamma.
     eta = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_gl),
                                 lambda: eta - 2.0 * self._delta_t,
                                 lambda: eta)
     lambd = control_flow_ops.cond(math_ops.logical_and(cond_t1, cond_gl),
                                   lambda: gamma, lambda: lambd)
     # Store all five updated values in their non-slot variables.
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("alpha", graph),
                          alpha))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("beta", graph), beta))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("eta", graph), eta))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("lambda", graph),
                          lambd))
     assignments.append(
         state_ops.assign(self._get_non_slot_variable("gamma", graph),
                          gamma))
     return assignments, beta, lambd, gamma
예제 #47
0
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
    """Calculates the loss for a given model.

    Arguments:
        model: The model on which metrics are being calculated.
        inputs: Either a dictionary of inputs to the model or a list of input
          arrays.
        targets: List of target arrays.
        output_loss_metrics: List of metrics that are used to aggregated
          output loss values.
        sample_weights: Optional list of sample weight arrays.
        training: Whether the model should be run in inference or training
          mode.

    Returns:
        A tuple `(outs, total_loss, loss_metrics, aggregated_loss_metrics,
        masks)`: the model outputs as a list, the total loss, the mean loss
        of each output, the stateful loss-metric results and the mask of
        each output. The two metric lists are only populated when the model
        has more than one output (the aggregated one additionally requires
        `output_loss_metrics`). The total loss includes regularization losses
        and applies masking and sample weighting to the loss value.
    """
    total_loss = 0
    kwargs = {}
    if model._expects_training_arg:
        kwargs['training'] = training
    # Unwrap a single-element, non-dict input container.
    if len(inputs) == 1 and not isinstance(inputs, dict):
        inputs = inputs[0]

    if model._compute_output_and_mask_jointly:
        outs, masks = model._call_and_compute_mask(inputs, **kwargs)
        masks = generic_utils.to_list(masks)
    else:
        outs = model.call(inputs, **kwargs)
        masks = None

    outs = generic_utils.to_list(outs)
    if masks is None:
        masks = [None for _ in outs]
    targets = generic_utils.to_list(targets)

    loss_metrics = []
    aggregated_loss_metrics = []
    with backend.name_scope('loss'):
        for i, loss_fn in enumerate(model.loss_functions):
            if sample_weights:
                weights = sample_weights[i]
            else:
                weights = None
            mask = masks[i]

            weighted_masked_fn = training_utils.weighted_masked_objective(
                loss_fn)
            with backend.name_scope(model.output_names[i] + '_loss'):
                output_loss = weighted_masked_fn(targets[i],
                                                 outs[i],
                                                 weights,
                                                 mask=mask)
            # If the number of outputs is 1 then we don't append the loss metric
            # associated with each model output. When there are multiple outputs
            # associated with a model, each output's loss is calculated and returned
            # as part of the loss_metrics.
            if len(model.outputs) > 1:
                loss_metrics.append(backend.mean(output_loss))

                if output_loss_metrics is not None:
                    # Keep track of the stateful loss result.
                    aggregated_loss_metrics.append(
                        training_utils.call_metric_function(
                            output_loss_metrics[i],
                            targets[i],
                            outs[i],
                            weights=weights,
                            mask=mask))

            # `total_loss` starts at 0 and is never None, so the weighted
            # loss is always accumulated.
            total_loss += model.loss_weights_list[i] * output_loss

        total_loss = backend.mean(total_loss)
        # Add regularization losses
        custom_losses = model.losses
        if custom_losses:
            total_loss += math_ops.add_n(custom_losses)
        model._clear_losses()

    return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
예제 #48
0
    def _update_vars_and_estimators(self, scaled_g, beta, prev_lambd, lambd,
                                    gamma, mu, scaled_s_dg_db, scaled_s_dg_dl,
                                    dq_db, dq_dl, dbj_dmu, w_t, graph):
        """
        Builds the assign ops that update every variable, its slots and the
        non-slot running estimators.

        :param scaled_g: Per-variable values, zipped with ``self._vars``; used
            as ``g`` in the update equations and in the ``e_g2`` estimator.
        :param beta: Scalar multiplying ``g`` / ``sgb`` / ``sgl`` terms.
        :param prev_lambd: Lambda value used only in the ``self._use_ag``
            branch.
        :param lambd: Lambda value used in both branches.
        :param gamma: Scalar factor applied to the slot terms.
        :param mu: Mixing factor for ``phi``, ``s_dbt_db``, ``s_dbt_dl`` and
            ``dbt_dmu``.
        :param scaled_s_dg_db: Per-variable values used as ``sgb``.
        :param scaled_s_dg_dl: Per-variable values used as ``sgl``.
        :param dq_db: Value whose square feeds the ``e_dq_db2`` estimator.
        :param dq_dl: Value whose square feeds the ``e_dq_dl2`` estimator.
        :param dbj_dmu: Value whose square feeds the ``e_dbj_dmu2`` estimator.
        :param w_t: Weight of the previous estimator value in the running
            averages; the new term is weighted by ``1 - w_t``.
        :param graph: Passed to ``self._get_non_slot_variable``.
        :return: List of assign ops: ten per variable (nine slots plus the
            variable itself) followed by five for the non-slot estimators.
        """
        assignments = []
        # Collected per variable so e_m2 can be computed after the loop.
        momentum_values = []
        for v, ov, g, sgb, sgl in zip(self._vars, self._original_vars,
                                      scaled_g, scaled_s_dg_db,
                                      scaled_s_dg_dl):
            # Current slot values for this variable.
            sbtb = self.get_slot(v, "s_dbt_db")
            sbtl = self.get_slot(v, "s_dbt_dl")
            p = self.get_slot(v, "phi")
            btmu = self.get_slot(v, "dbt_dmu")
            stb = self.get_slot(v, "s_dt_db")
            stl = self.get_slot(v, "s_dt_dl")
            smb = self.get_slot(v, "s_dm_db")
            sml = self.get_slot(v, "s_dm_dl")
            m = self.get_slot(v, "momentum")
            if not self._use_ag:
                # Branch taken when self._use_ag is falsy: momentum uses
                # `lambd` and the new variable value is ov + momentum - phi.
                s_dbt_db = mu * gamma * sbtb + (1.0 - mu) * gamma * stb
                s_dbt_dl = mu * gamma * sbtl + (1.0 - mu) * gamma * stl
                dbt_dmu = -p + mu * btmu
                s_dm_db = lambd * gamma * smb - g - beta * gamma * sgb
                s_dm_dl = m + lambd * gamma * sml - beta * gamma * sgl
                momentum = lambd * m - beta * g
                phi = mu * p + momentum
                new_v = ov + momentum - phi
                s_dt_db = gamma * stb + s_dm_db
                s_dt_dl = gamma * stl + s_dm_dl
            else:
                # Branch taken when self._use_ag is truthy (presumably an
                # accelerated-gradient variant — TODO confirm): momentum uses
                # `prev_lambd` and the new variable value is
                # ov - beta * g - phi.
                s_dbt_db = mu * gamma * sbtb + \
                           (1.0 - mu) * gamma * (stb - lambd * smb)
                s_dbt_dl = mu * gamma * sbtl + \
                           (1.0 - mu) * (gamma * stl - m - lambd * gamma * sml)
                dbt_dmu = -p + mu * btmu
                s_dm_db = prev_lambd * gamma * smb - g - beta * gamma * sgb
                # NOTE(review): this term uses `beta * sgl` without the
                # `gamma` factor that the other branch and `s_dm_db` apply —
                # confirm this asymmetry is intended.
                s_dm_dl = m + prev_lambd * gamma * sml - beta * sgl
                momentum = prev_lambd * m - beta * g
                phi = mu * p + momentum
                new_v = ov - beta * g - phi
                s_dt_db = gamma * stb + lambd * s_dm_db - g - beta * gamma * sgb
                s_dt_dl = gamma * stl + momentum + lambd * gamma * s_dm_dl - beta * gamma * sgl
            momentum_values.append(momentum)
            # Write the new values back to the slots and to the variable.
            assignments.append(
                state_ops.assign(self.get_slot(v, "s_dbt_db"), s_dbt_db))
            assignments.append(
                state_ops.assign(self.get_slot(v, "s_dbt_dl"), s_dbt_dl))
            assignments.append(
                state_ops.assign(self.get_slot(v, "dbt_dmu"), dbt_dmu))
            assignments.append(
                state_ops.assign(self.get_slot(v, "s_dm_db"), s_dm_db))
            assignments.append(
                state_ops.assign(self.get_slot(v, "s_dm_dl"), s_dm_dl))
            assignments.append(
                state_ops.assign(self.get_slot(v, "momentum"), momentum))
            assignments.append(state_ops.assign(v, new_v))
            assignments.append(state_ops.assign(self.get_slot(v, "phi"), phi))
            assignments.append(
                state_ops.assign(self.get_slot(v, "s_dt_db"), s_dt_db))
            assignments.append(
                state_ops.assign(self.get_slot(v, "s_dt_dl"), s_dt_dl))
        # Running averages: new = w_t * old + (1 - w_t) * current squared value.
        e_dq_db2 = w_t * self._get_non_slot_variable("e_dq_db2", graph) + \
                   (1.0 - w_t) * (dq_db ** 2.0)
        e_dq_dl2 = w_t * self._get_non_slot_variable("e_dq_dl2", graph) + \
                   (1.0 - w_t) * (dq_dl ** 2.0)
        e_dbj_dmu2 = w_t * self._get_non_slot_variable("e_dbj_dmu2", graph) + \
                     (1.0 - w_t) * (dbj_dmu ** 2.0)
        # For e_g2 / e_m2 the current value is the sum of squares over all
        # variables of scaled_g and of the freshly computed momenta.
        e_g2 = w_t * self._get_non_slot_variable("e_g2", graph) + \
               (1.0 - w_t) * math_ops.add_n([math_ops.reduce_sum(g ** 2.0)
                                             for g in scaled_g])
        e_m2 = w_t * self._get_non_slot_variable("e_m2", graph) + \
               (1.0 - w_t) * math_ops.add_n([math_ops.reduce_sum(m ** 2.0)
                                             for m in momentum_values])
        assignments.append(
            state_ops.assign(self._get_non_slot_variable("e_dq_db2", graph),
                             e_dq_db2))
        assignments.append(
            state_ops.assign(self._get_non_slot_variable("e_dq_dl2", graph),
                             e_dq_dl2))
        assignments.append(
            state_ops.assign(self._get_non_slot_variable("e_dbj_dmu2", graph),
                             e_dbj_dmu2))
        assignments.append(
            state_ops.assign(self._get_non_slot_variable("e_g2", graph), e_g2))
        assignments.append(
            state_ops.assign(self._get_non_slot_variable("e_m2", graph), e_m2))

        return assignments
예제 #49
0
 def add(self, x):
     """Return `x` plus the `add_n` sum of `self.a` plus `self.b["a"]`."""
     summed_a = math_ops.add_n(self.a)
     partial = x + summed_a
     return partial + self.b["a"]
예제 #50
0
 def sum_reg(weights, name=None):
   """Sum the results of every regularizer in `regularizer_list` on `weights`."""
   with ops.op_scope([weights], name, 'sum_regularizer') as scope:
     penalties = []
     for regularizer in regularizer_list:
       penalties.append(regularizer(weights))
     return math_ops.add_n(penalties, name=scope)
예제 #51
0
 def loop_fn(i):
     """Gather index `i` from `x` and add the result to `y` and `z`."""
     gathered = array_ops.gather(x, i)
     terms = [gathered, y, z]
     return math_ops.add_n(terms)
예제 #52
0
 def fn():
     """Sum twenty separately built `v * 2.0` products with one `add_n`."""
     doubled = [v * constant_op.constant(2.0) for _ in range(20)]
     return math_ops.add_n(doubled)
예제 #53
0
    def __call__(self, y_true, y_pred, sample_weight=None):
        """Compute the total loss across all outputs.

        Arguments:
          y_true: An arbitrary structure of Tensors representing the ground
            truth. A single target is repeated for every prediction.
          y_pred: An arbitrary structure of Tensors representing a Model's
            outputs.
          sample_weight: An arbitrary structure of Tensors representing the
            per-sample loss weights. If one Tensor is passed, it is used for
            all losses. If multiple Tensors are passed, the structure should
            match `y_pred`.

        Returns:
          Tuple of `(total_loss, per_output_loss_list)`. The per-output list
          holds the weighted loss values before distribution scaling; the
          total is their `add_n` sum after scaling, or a zeros tensor of
          shape `(1,)` when no loss was computed.
        """
        if not self._built:
            self._build(y_pred)

        y_true = nest.flatten(y_true)
        y_pred = nest.flatten(y_pred)
        num_preds = len(y_pred)

        # TODO(omalleyt): Remove ambiguity here.
        # Broadcasting one target over several outputs is needed to support
        # passing only 1 loss and 1 target to a Functional Model with
        # multiple outputs. It is ambiguous, especially with subclassing,
        # and should be reconsidered.
        if len(y_true) == 1 and num_preds > 1:
            y_true = y_true * num_preds

        sample_weight = nest.flatten(sample_weight)
        # A single sample-weight array is shared by every output.
        if len(sample_weight) == 1 and num_preds > 1:
            sample_weight = sample_weight * num_preds

        loss_values = []
        # Values handed on to the `Mean` metrics.
        metric_loss_values = []
        per_output = zip(y_true, y_pred, sample_weight, self._losses,
                         self._loss_weights)
        for target, pred, weight, loss_obj, loss_weight in per_output:
            # Outputs without a loss are simply skipped.
            if loss_obj is None:
                continue

            target = math_ops.cast(target, pred.dtype)
            if sw_present(weight) if False else weight is not None:
                weight = math_ops.cast(weight, pred.dtype)

            # Fold a Keras mask on the output into the sample weight.
            mask = getattr(pred, '_keras_mask', None)
            if mask is not None:
                mask = math_ops.cast(mask, pred.dtype)
                if weight is None:
                    weight = mask
                else:
                    mask, _, weight = (
                        tf_losses_utils.squeeze_or_expand_dimensions(
                            mask, sample_weight=weight))
                    weight *= mask

            loss_value = loss_obj(target, pred, sample_weight=weight)
            if loss_weight is not None:
                loss_value *= loss_weight
            metric_loss_values.append(loss_value)

            # TODO(omalleyt): Should this be in the `Loss` class?
            reduction = loss_obj.reduction
            needs_scaling = (
                reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
                or reduction == losses_utils.ReductionV2.AUTO)
            if needs_scaling:
                loss_value = losses_utils.scale_loss_for_distribution(
                    loss_value)
            loss_values.append(loss_value)

        # A model is allowed to have no compiled loss.
        if loss_values:
            total_loss = math_ops.add_n(loss_values)
        else:
            total_loss = array_ops.zeros((1, ))

        # TODO(omalleyt): Don't return per-output losses once MetricsContainer
        # handles this.
        return total_loss, metric_loss_values
def surrogate_loss(sample_losses,
                   stochastic_tensors=None,
                   name="SurrogateLoss"):
    """Builds the surrogate loss for a stochastic graph.

    Each `StochasticTensor` upstream of `sample_losses` has its `loss` method
    called with the gradient-stopped combination of the losses it influences;
    every non-None result is added as an extra loss term.

    Note that currently `surrogate_loss` does not work with
    `StochasticTensor`s instantiated in `while_loop`s or other control
    structures.

    Args:
      sample_losses: a list or tuple of final losses. Each loss should be per
        example in the batch (and possibly per sample); that is, it should
        have dimensionality of 1 or greater. All losses should have the same
        shape.
      stochastic_tensors: a list of `StochasticTensor`s to add loss terms
        for. If None, defaults to all `StochasticTensor`s in the graph
        upstream of the `Tensor`s in `sample_losses`.
      name: the name with which to prepend created ops.

    Returns:
      `Tensor` loss combining `sample_losses` with the loss terms returned by
      the `StochasticTensor`s. When no stochastic dependencies are found, a
      warning is logged and `add_n(sample_losses)` is returned.

    Raises:
      TypeError: if `sample_losses` is not a list or tuple, or if its
        elements are not `Tensor`s.
      ValueError: if any loss in `sample_losses` does not have dimensionality
        1 or greater.
    """
    with ops.name_scope(name, values=sample_losses):
        # Validate the container first, then every element in it.
        if not isinstance(sample_losses, (list, tuple)):
            raise TypeError("sample_losses must be a list or tuple")
        for candidate in sample_losses:
            if not isinstance(candidate, ops.Tensor):
                raise TypeError("loss is not a Tensor: %s" % candidate)
            rank = candidate.get_shape().ndims
            if rank is None or rank < 1:
                raise ValueError(
                    "loss must have dimensionality 1 or greater: %s" %
                    candidate)

        dependencies = _stochastic_dependencies_map(
            sample_losses, stochastic_tensors=stochastic_tensors)
        if not dependencies:
            logging.warn(
                "No collection of Stochastic Tensors found for current graph.")
            return math_ops.add_n(sample_losses)

        # Start from the plain losses and append one surrogate term per
        # stochastic node that provides one.
        terms = [ops.convert_to_tensor(loss) for loss in sample_losses]
        for st_node, influenced in dependencies.items():
            influenced = list(influenced)

            logging.info("Losses influenced by StochasticTensor %s: [%s]",
                         st_node.name,
                         ", ".join(loss.name for loss in influenced))

            # Combine the downstream losses for this node; gradients must not
            # flow back through them into the surrogate term.
            downstream = _add_n_or_sum(influenced)
            surrogate = st_node.loss(array_ops.stop_gradient(downstream))
            if surrogate is not None:
                terms.append(surrogate)

        return _add_n_or_sum(terms)
예제 #55
0
    def __call__(self,
                 y_true,
                 y_pred,
                 sample_weight=None,
                 regularization_losses=None):
        """Computes the overall loss and updates the loss metrics.

        Arguments:
          y_true: An arbitrary structure of Tensors representing the ground
            truth.
          y_pred: An arbitrary structure of Tensors representing a Model's
            outputs.
          sample_weight: An arbitrary structure of Tensors representing the
            per-sample loss weights. If one Tensor is passed, it is used for
            all losses. If multiple Tensors are passed, the structure should
            match `y_pred`.
          regularization_losses: Additional losses to be added to the total
            loss.

        Returns:
          The total loss, i.e. the `add_n` sum of the per-output losses and
          (when given) the summed regularization losses. If nothing
          contributed a loss, `array_ops.zeros(shape=())` is returned.
        """
        y_true = self._conform_to_outputs(y_pred, y_true)
        sample_weight = self._conform_to_outputs(y_pred, sample_weight)

        if not self._built:
            self._build(y_pred)

        y_pred = nest.flatten(y_pred)
        y_true = nest.flatten(y_true)
        sample_weight = nest.flatten(sample_weight)

        gradient_losses = []  # Summed into the value used for gradients.
        metric_losses = []  # Summed into the value fed to the loss metric.
        batch_dim = None
        per_output = zip(y_true, y_pred, sample_weight, self._losses,
                         self._loss_weights, self._per_output_metrics)
        for target, output, weight, loss_fn, loss_scale, tracker in per_output:
            # An output without a target or without a loss contributes nothing.
            if target is None or loss_fn is None:
                continue

            target, output, weight = match_dtype_and_rank(
                target, output, weight)
            weight = apply_mask(output, weight, get_mask(output))
            grad_loss = loss_fn(target, output, sample_weight=weight)
            reduction = loss_fn.reduction

            metric_loss = grad_loss
            # Correct for the `Mean` loss metrics counting each replica as a
            # batch.
            if reduction == losses_utils.ReductionV2.SUM:
                metric_loss *= ds_context.get_strategy().num_replicas_in_sync

            # The batch size is taken from the first output that has a loss.
            if batch_dim is None:
                batch_dim = array_ops.shape(target)[0]
            if tracker is not None:
                tracker.update_state(metric_loss, sample_weight=batch_dim)

            if loss_scale is not None:
                grad_loss *= loss_scale
                metric_loss *= loss_scale

            if (reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE
                    or reduction == losses_utils.ReductionV2.AUTO):
                grad_loss = losses_utils.scale_loss_for_distribution(grad_loss)

            gradient_losses.append(grad_loss)
            metric_losses.append(metric_loss)

        if regularization_losses:
            regularization_losses = losses_utils.cast_losses_to_common_dtype(
                regularization_losses)
            reg_total = math_ops.add_n(regularization_losses)
            metric_losses.append(reg_total)
            gradient_losses.append(
                losses_utils.scale_loss_for_distribution(reg_total))

        # Ok for a model to have no compiled loss.
        if not gradient_losses:
            return array_ops.zeros(shape=())

        metric_losses = losses_utils.cast_losses_to_common_dtype(metric_losses)
        self._loss_metric.update_state(math_ops.add_n(metric_losses),
                                       sample_weight=batch_dim)

        gradient_losses = losses_utils.cast_losses_to_common_dtype(
            gradient_losses)
        return math_ops.add_n(gradient_losses)
예제 #56
0
def _AggregatedGrads(grads,
                     op,
                     gradient_uid,
                     loop_state,
                     aggregation_method=None):
    """Returns the aggregated gradients for each output of `op`.

    Args:
      grads: The map of memoized gradients.
      op: The op to get gradients for.
      gradient_uid: A unique identifier within the graph indicating
        which invocation of gradients is being executed. Used to cluster
        ops for compilation.
      loop_state: An object for maintaining the state of the while loops in
        the graph. It is of type ControlFlowState. None if the graph
        contains no while loops.
      aggregation_method: Specifies the method used to combine gradient
        terms. Accepted values are constants defined in the class
        `AggregationMethod`; None selects `AggregationMethod.DEFAULT`.

    Returns:
      A list of gradients, one per each output of `op`. An entry that was a
      list of gradient terms is replaced by its aggregate; an empty entry is
      replaced by None.

    Raises:
      TypeError: if the incoming grads are not Tensors or IndexedSlices.
      ValueError: if `aggregation_method` is not an accepted value.
    """
    if aggregation_method is None:
        aggregation_method = AggregationMethod.DEFAULT
    valid_aggregation_methods = [
        AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
        AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    ]
    if aggregation_method not in valid_aggregation_methods:
        raise ValueError(
            f"Invalid `aggregation_method` specified {aggregation_method}. "
            f"Accepted values are {valid_aggregation_methods}.")

    # Both experimental methods are served by the same pairwise summation.
    sum_pairwise = aggregation_method in (
        AggregationMethod.EXPERIMENTAL_TREE,
        AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

    out_grads = _GetGrads(grads, op)
    for i, out_grad in enumerate(out_grads):
        # With loop state present, an entry may already be a single gradient;
        # it is left untouched.
        if loop_state and isinstance(out_grad,
                                     (ops.Tensor, ops.IndexedSlices)):
            assert control_flow_util.IsLoopSwitch(op)
            continue

        # Grads have to be Tensors or IndexedSlices (None terms are allowed).
        if isinstance(out_grad, collections_abc.Sequence):
            has_invalid_term = any(
                not isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in out_grad if g is not None)
            if has_invalid_term:
                raise TypeError(
                    f"Invalid gradient {out_grad} [index = {i}]. Gradients "
                    "have to be either all Tensors or all IndexedSlices")

        # An empty entry ([]) aggregates to None.
        if not out_grad:
            out_grads[i] = None
            continue

        # A single term needs no aggregation.
        if len(out_grad) < 2:
            out_grads[i] = out_grad[0]
            continue

        only_dense = all(
            isinstance(g, ops.Tensor) for g in out_grad if g is not None)
        if not only_dense:
            out_grads[i] = backprop.aggregate_indexed_slices_gradients(
                out_grad)
            continue

        tensor_shape = _AccumulatorShape(out_grad)
        if sum_pairwise:
            # Aggregate all gradients by doing pairwise sums: this may reduce
            # performance, but it can improve memory because the gradients
            # can be released earlier.
            #
            # TODO(vrv): Consider replacing this with a version of tf.AddN()
            # that eagerly frees its inputs as soon as they are ready, so the
            # order of this tree does not become a problem.
            used = "tree"
            with ops.name_scope(op.name + "_gradient_sum"):
                running_sum = out_grad[0]
                for grad in out_grad[1:]:
                    running_sum = math_ops.add_n([running_sum, grad])
                out_grads[i] = running_sum
        else:
            used = "add_n"
            out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                     tensor_shape, used)
    return out_grads
예제 #57
0
 def _total_loss(self):
   """Returns the `add_n` sum of `evaluate()` over every loss in `self._losses`."""
   evaluated = tuple(loss.evaluate() for loss in self._losses)
   return math_ops.add_n(evaluated)
예제 #58
0
 def total_sampled_loss(self):
     """Returns the `add_n` sum of `evaluate_on_sample()` over `self.losses`."""
     sampled = tuple(loss.evaluate_on_sample() for loss in self.losses)
     return math_ops.add_n(sampled)
def _add_n():
    """Builds a Keras model whose output is `add_n` of its input three times."""
    model_input = keras.Input(shape=(10, ))
    # The same input tensor is summed with itself three times.
    summed = math_ops.add_n([model_input] * 3)
    return keras.Model(model_input, summed)
예제 #60
0
 def cond_body():
     """Sum-reduces `value`, adds the resulting pieces and divides by `len(devices)`."""
     pieces = self.as_list(
         collective.reduce(reduce_util.ReduceOp.SUM, value, value, options))
     total = math_ops.add_n(pieces)
     return total / len(devices)