예제 #1
0
 def test_maxlength(self):
   """Checks that `maxlength` bounds the number of bins `bincount` returns."""
   # Each pair is (input values, expected counts) for a fixed maxlength of 3.
   cases = (
       ([5], [0, 0, 0]),  # 5 is not counted in any of the three bins.
       ([1], [0, 1]),
       ([], []),
   )
   with self.session():
     for values, expected in cases:
       counts = self.evaluate(bincount_ops.bincount(values, maxlength=3))
       self.assertAllEqual(counts, expected)
예제 #2
0
 def test_bincount_determinism_error(self):
   """Expects `UnimplementedError` from bincount while deterministic ops are on."""
   # Each entry is (input shape, axis, expected error message).
   cases = (
       (1000, None,
        "Determinism is not yet supported in GPU implementation of Bincount."),
       ((100, 100), -1,
        "Determinism is not yet supported in GPU implementation of "
        "DenseBincount."),
   )
   for shape, axis, message in cases:
     arr = np.random.randint(0, 1000, size=shape)
     with test_util.deterministic_ops():
       with self.assertRaisesRegex(errors_impl.UnimplementedError, message):
         self.evaluate(bincount_ops.bincount(arr, None, axis=axis))
예제 #3
0
  def call(self, inputs):
    """Encodes `inputs` as tf-idf, binary, or count vectors.

    Args:
      inputs: The values to encode; may be a `SparseTensor`.

    Returns:
      In tf-idf mode, the per-row counts multiplied by `self.tf_idf_weights`.
      Otherwise the result of `sparse_bincount` (when `self._sparse` is set)
      or of `bincount`, with binary output when the mode is `BINARY`.
    """
    self._called = True
    out_depth = self._max_tokens
    if out_depth is None:
      # No fixed vocabulary size was given, so read the current element count.
      out_depth = K.get_value(self.num_elements)

    if self._output_mode == TFIDF:
      dense_inputs = inputs
      if isinstance(inputs, sparse_tensor.SparseTensor):
        # Densify sparse input with a default value of -1. Because -1 is
        # ignored by one_hot, the non-set positions drop out of the encoding.
        dense_inputs = sparse_ops.sparse_tensor_to_dense(
            inputs, default_value=-1)
      counts = math_ops.reduce_sum(
          array_ops.one_hot(dense_inputs, depth=out_depth), axis=1)
      weighted = math_ops.multiply(counts, self.tf_idf_weights)
      weighted.set_shape(tensor_shape.TensorShape((None, out_depth)))
      return weighted

    is_binary = self._output_mode == BINARY
    if not self._sparse:
      dense_counts = bincount_ops.bincount(
          inputs,
          minlength=out_depth,
          dtype=dtypes.int64,
          axis=-1,
          binary_output=is_binary)
      dense_counts.set_shape(tensor_shape.TensorShape((None, out_depth)))
      return dense_counts

    return bincount_ops.sparse_bincount(
        inputs, minlength=out_depth, axis=-1, binary_output=is_binary)
예제 #4
0
    def call(self, inputs, count_weights=None):
        """Encodes `inputs` as tf-idf, binary, or count vectors.

        Args:
            inputs: Values to encode. Lists and NumPy arrays are converted to
                tensors, and rank-1 inputs get a second axis of size 1.
            count_weights: Optional weights forwarded to the bincount ops.
                Only accepted when the output mode is `COUNT`.

        Returns:
            The tf-idf weighted counts, a `SparseTensor` of bin counts cast to
            `K.floatx()` (when `self._sparse` is set), or a dense bincount
            tensor.

        Raises:
            ValueError: If `count_weights` is given outside `COUNT` mode, or
                if sparse output is requested together with tf-idf mode.
            RuntimeError: If `max_tokens` is None and the layer reports zero
                elements.
        """
        if isinstance(inputs, (list, np.ndarray)):
            inputs = ops.convert_to_tensor_v2(inputs)
        if inputs.shape.rank == 1:
            inputs = array_ops.expand_dims(inputs, 1)

        weights_given = count_weights is not None
        if weights_given and self._output_mode != COUNT:
            raise ValueError(
                "count_weights is not used in `output_mode='tf-idf'`, "
                "or `output_mode='binary'`. Please pass a single input.")
        self._called = True

        out_depth = self._max_tokens
        if out_depth is None:
            out_depth = K.get_value(self.num_elements)
            if out_depth == 0:
                raise RuntimeError(
                    "If you construct a `CategoryEncoding` layer with "
                    "`max_tokens=None`, you need to call `adapt()` "
                    "on it before using it")

        if self._output_mode == TFIDF:
            if self._sparse:
                raise ValueError("`sparse=True` with `output_mode=tfidf` "
                                 "is not supported.")
            dense_inputs = inputs
            if isinstance(inputs, sparse_tensor.SparseTensor):
                # Densify sparse input with a default value of -1. Because -1
                # is ignored by one_hot, the non-set positions drop out of
                # the output encoding.
                dense_inputs = sparse_ops.sparse_tensor_to_dense(
                    inputs, default_value=-1)
            counts = math_ops.reduce_sum(
                array_ops.one_hot(dense_inputs, depth=out_depth), axis=1)
            weighted = math_ops.multiply(counts, self.tf_idf_weights)
            weighted.set_shape(tensor_shape.TensorShape((None, out_depth)))
            return weighted

        is_binary = self._output_mode == BINARY
        if not self._sparse:
            dense_counts = bincount_ops.bincount(
                inputs,
                weights=count_weights,
                minlength=out_depth,
                dtype=K.floatx(),
                axis=-1,
                binary_output=is_binary)
            dense_counts.set_shape(
                tensor_shape.TensorShape((None, out_depth)))
            return dense_counts

        sparse_counts = math_ops.cast(
            bincount_ops.sparse_bincount(
                inputs,
                weights=count_weights,
                minlength=out_depth,
                axis=-1,
                binary_output=is_binary),
            K.floatx())
        # Rebuild the sparse result so its dense shape is exactly
        # [batch_size, out_depth].
        batch_size = array_ops.shape(sparse_counts)[0]
        return sparse_tensor.SparseTensor(
            indices=sparse_counts.indices,
            values=sparse_counts.values,
            dense_shape=[batch_size, out_depth])
예제 #5
0
 def test_sparse_input_col_reduce_binary(self, dtype):
     """Compares binary per-row bincount of a sparse input against NumPy."""
     num_rows, num_cols, size = 128, 27, 100
     np.random.seed(42)
     inp = np.random.randint(0, size, (num_rows, num_cols), dtype=dtype)

     # Reference result: 1 where a value occurs in the row, 0 elsewhere.
     row_flags = [
         np.where(np.bincount(row, minlength=size) > 0, 1, 0) for row in inp
     ]
     np_out = np.reshape(np.concatenate(row_flags, axis=0), (num_rows, size))

     # from_dense will filter out 0s, so shift every value up by one before
     # converting and shift the stored values back down afterwards.
     shifted = inp + 1
     # from_dense will cause OOM in GPU.
     with ops.device("/CPU:0"):
         shifted_sparse = sparse_ops.from_dense(shifted)
         inp_sparse = sparse_tensor.SparseTensor(
             shifted_sparse.indices,
             shifted_sparse.values - 1,
             shifted_sparse.dense_shape)

     actual = self.evaluate(
         bincount_ops.bincount(arr=inp_sparse, axis=-1, binary_output=True))
     self.assertAllEqual(np_out, actual)
예제 #6
0
 def test_empty(self):
     """Checks bincount of an empty input for several minlengths and dtypes."""
     # (minlength, expected counts) for the default dtype.
     count_cases = ((5, [0, 0, 0, 0, 0]), (1, [0]), (0, []))
     # (minlength, requested dtype); only the result dtype is checked.
     dtype_cases = ((0, np.float32), (3, np.float64))
     with self.session():
         for minlength, expected in count_cases:
             counts = self.evaluate(
                 bincount_ops.bincount([], minlength=minlength))
             self.assertAllEqual(counts, expected)
         for minlength, np_dtype in dtype_cases:
             counts = self.evaluate(
                 bincount_ops.bincount([], minlength=minlength,
                                       dtype=np_dtype))
             self.assertEqual(counts.dtype, np_dtype)
예제 #7
0
 def test_random_without_weights(self):
     """Compares unweighted bincount with `np.bincount` using unit weights."""
     num_samples = 10000
     with self.session():
         np.random.seed(42)
         for dtype in (np.int32, np.float32):
             arr = np.random.randint(0, 1000, num_samples)
             unit_weights = np.ones(num_samples).astype(dtype)
             expected = np.bincount(arr, unit_weights)
             actual = self.evaluate(bincount_ops.bincount(arr, None))
             self.assertAllClose(actual, expected)
예제 #8
0
 def test_ragged_input_binary(self, dtype):
     """Checks binary per-row bincount on a ragged input with empty rows."""
     x = ragged_factory_ops.constant([[], [], [3, 0, 1], [], [5, 0, 4, 4]])
     # One output row per input row; a 1 marks each value present in the row.
     expected_output = [
         [0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0],
         [1, 1, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 1, 1],
     ]
     actual = self.evaluate(
         bincount_ops.bincount(arr=x, axis=-1, binary_output=True))
     self.assertAllEqual(expected_output, actual)
예제 #9
0
def dense_bincount(inputs, out_depth, multi_hot_output, count_weights=None):
    """Apply binary or count encoding to an input.

    Args:
        inputs: The values to count along the last axis.
        out_depth: Used as both `minlength` and `maxlength`, so the last
            output dimension is set to this size.
        multi_hot_output: Passed to bincount as `binary_output`.
        count_weights: Optional weights passed to bincount.

    Returns:
        The bincount result in `backend.floatx()` dtype, with its static
        shape set to `(inputs.shape[0], out_depth)`.
    """
    bincount_options = {
        "weights": count_weights,
        "minlength": out_depth,
        "maxlength": out_depth,
        "dtype": backend.floatx(),
        "axis": -1,
        "binary_output": multi_hot_output,
    }
    encoded = bincount_ops.bincount(inputs, **bincount_options)
    static_batch = inputs.shape.as_list()[0]
    encoded.set_shape(tensor_shape.TensorShape((static_batch, out_depth)))
    return encoded
예제 #10
0
def dense_bincount(inputs, out_depth, binary_output, count_weights=None):
  """Apply binary or count encoding to an input.

  Args:
    inputs: The values to count along the last axis.
    out_depth: Used as both `minlength` and `maxlength`, so the last output
      dimension is set to this size.
    binary_output: Passed through to bincount's `binary_output` argument.
    count_weights: Optional weights passed to bincount.

  Returns:
    The bincount result in `K.floatx()` dtype, with its static shape set to
    `(None, out_depth)`.
  """
  bincount_options = {
      "weights": count_weights,
      "minlength": out_depth,
      "maxlength": out_depth,
      "dtype": K.floatx(),
      "axis": -1,
      "binary_output": binary_output,
  }
  encoded = bincount_ops.bincount(inputs, **bincount_options)
  encoded.set_shape(tensor_shape.TensorShape((None, out_depth)))
  return encoded
예제 #11
0
 def test_random_with_weights(self):
   """Compares weighted bincount with `np.bincount` on random data."""
   num_samples = 10000
   integer_dtypes = (dtypes.int32, dtypes.int64)
   all_dtypes = (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64)
   with self.session():
     np.random.seed(42)
     for dtype in all_dtypes:
       arr = np.random.randint(0, 1000, num_samples)
       # Integer cases draw integer weights; float cases draw from [0, 1).
       if dtype in integer_dtypes:
         weights = np.random.randint(-100, 100, num_samples)
       else:
         weights = np.random.random(num_samples)
       expected = np.bincount(arr, weights)
       actual = self.evaluate(bincount_ops.bincount(arr, weights))
       self.assertAllClose(actual, expected)
예제 #12
0
 def test_ragged_input_count_with_weights(self, dtype):
     """Checks weighted per-row bincount on a ragged input with empty rows."""
     x = ragged_factory_ops.constant([[], [], [3, 0, 1], [], [5, 0, 4, 4]])
     weights = ragged_factory_ops.constant(
         [[], [], [.1, .2, .3], [], [.2, .5, .6, .3]])
     # Each output row sums the weights of the values falling in each bin.
     expected_output = [
         [0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0],
         [.2, .3, 0, .1, 0, 0],
         [0, 0, 0, 0, 0, 0],
         [.5, 0, 0, 0, .9, .2],
     ]
     actual = self.evaluate(
         bincount_ops.bincount(arr=x, weights=weights, axis=-1))
     self.assertAllClose(expected_output, actual)
예제 #13
0
 def test_bincount_determinism_error(self):
   """Expects bincount to raise on GPU while deterministic ops are enabled.

   The assertion only runs when a CUDA GPU is available. The deterministic-ops
   flag is always switched back off afterwards, even if the assertion fails.
   """
   num_samples = 10000
   np.random.seed(42)
   arr = np.random.randint(0, 1000, num_samples)
   try:
     config.enable_deterministic_ops(True)
     with test_util.use_gpu():
       if test_util.is_gpu_available(cuda_only=True):
         # `assertRaisesRegexp` is a deprecated unittest alias that was
         # removed in Python 3.12; `assertRaisesRegex` is the supported name.
         with self.assertRaisesRegex(
             errors_impl.UnimplementedError, "Determinism is not yet "
             "supported for Bincount."):
           self.evaluate(bincount_ops.bincount(arr, None))
   finally:
     config.enable_deterministic_ops(False)
예제 #14
0
 def test_ragged_input_count_np(self, dtype):
   """Compares per-row bincount of a ragged tensor against `np.bincount`."""
   np.random.seed(42)
   num_rows, num_cols, size = 128, 27, 1000
   inp = np.random.randint(0, size, (num_rows, num_cols), dtype=dtype)

   # Reference result: one `np.bincount` per input row, stacked row by row.
   per_row_counts = [np.bincount(row, minlength=size) for row in inp]
   np_out = np.reshape(
       np.concatenate(per_row_counts, axis=0), (num_rows, size))

   x = ragged_tensor.RaggedTensor.from_tensor(inp)
   actual = self.evaluate(
       bincount_ops.bincount(arr=x, minlength=size, axis=-1))
   self.assertAllEqual(np_out, actual)
예제 #15
0
  def test_sparse_input_all_count(self, dtype):
    """Compares `bincount(..., axis=0)` on a sparse input with `np.bincount`."""
    np.random.seed(42)
    num_rows, size, n_elems = 128, 1000, 4096

    # Random row ids paired with a constant column id of zero.
    row_ids = np.random.randint(0, num_rows, (n_elems, 1))
    col_ids = np.zeros((n_elems, 1))
    inp_indices = np.concatenate([row_ids, col_ids], axis=1)
    inp_vals = np.random.randint(0, size, (n_elems,), dtype=dtype)
    sparse_inp = sparse_tensor.SparseTensor(inp_indices, inp_vals,
                                            [num_rows, 1])

    expected = np.bincount(inp_vals, minlength=size)
    actual = self.evaluate(bincount_ops.bincount(sparse_inp, axis=0))
    self.assertAllEqual(expected, actual)
예제 #16
0
    def call(self, inputs, count_weights=None):
        """Encodes `inputs` as tf-idf, binary, or count vectors.

        Args:
            inputs: The values to encode; may be a `SparseTensor`.
            count_weights: Optional weights forwarded to the bincount ops.
                Only accepted when the output mode is `COUNT`.

        Returns:
            The tf-idf weighted counts, the `sparse_bincount` result cast to
            `K.floatx()` (when `self._sparse` is set), or a dense bincount
            tensor.

        Raises:
            ValueError: If `count_weights` is given outside `COUNT` mode.
        """
        weights_given = count_weights is not None
        if weights_given and self._output_mode != COUNT:
            raise ValueError(
                "count_weights is not used in `output_mode='tf-idf'`, "
                "or `output_mode='binary'`. Please pass a single input.")
        self._called = True

        out_depth = self._max_tokens
        if out_depth is None:
            # No fixed vocabulary size was given, so read the element count.
            out_depth = K.get_value(self.num_elements)

        if self._output_mode == TFIDF:
            dense_inputs = inputs
            if isinstance(inputs, sparse_tensor.SparseTensor):
                # Densify sparse input with a default value of -1. Because -1
                # is ignored by one_hot, the non-set positions drop out of
                # the output encoding.
                dense_inputs = sparse_ops.sparse_tensor_to_dense(
                    inputs, default_value=-1)
            counts = math_ops.reduce_sum(
                array_ops.one_hot(dense_inputs, depth=out_depth), axis=1)
            weighted = math_ops.multiply(counts, self.tf_idf_weights)
            weighted.set_shape(tensor_shape.TensorShape((None, out_depth)))
            return weighted

        is_binary = self._output_mode == BINARY
        if not self._sparse:
            dense_counts = bincount_ops.bincount(
                inputs,
                weights=count_weights,
                minlength=out_depth,
                dtype=K.floatx(),
                axis=-1,
                binary_output=is_binary)
            dense_counts.set_shape(
                tensor_shape.TensorShape((None, out_depth)))
            return dense_counts

        sparse_counts = bincount_ops.sparse_bincount(
            inputs,
            weights=count_weights,
            minlength=out_depth,
            axis=-1,
            binary_output=is_binary)
        return math_ops.cast(sparse_counts, K.floatx())
예제 #17
0
  def test_values(self):
    """Checks bincount results for a range of small and large inputs."""
    ascending = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
    # Each pair is (input values, expected counts), checked in this order.
    cases = (
        ([1, 1, 1, 2, 2, 3], [0, 3, 2, 1]),
        (ascending, [0, 5, 4, 3, 2, 1]),
        (ascending + [0] * 6, [6, 5, 4, 3, 2, 1]),
        ([], []),
        ([0, 0, 0], [3]),
        ([5], [0, 0, 0, 0, 0, 1]),
        (np.arange(10000), np.ones(10000)),
    )
    with self.session():
      for values, expected in cases:
        counts = self.evaluate(bincount_ops.bincount(values))
        self.assertAllEqual(counts, expected)
예제 #18
0
  def from_value_rowids(cls,
                        value_rowids,
                        nrows=None,
                        validate=True,
                        preferred_dtype=None):
    """Creates a `RowPartition` with rows partitioned by `value_rowids`.

    This `RowPartition` divides a sequence `values` into rows by specifying
    which row each value should be added to:

    ```python
    partitioned_rows = [[] for _ in range(nrows)]
    for (value, rowid) in zip(values, value_rowids):
      partitioned_rows[rowid].append(value)
    ```

    Args:
      value_rowids: A 1-D integer tensor with shape `[nvals]`, which corresponds
        one-to-one with `values`, and specifies each value's row index.  Must be
        nonnegative, and must be sorted in ascending order.
      nrows: An integer scalar specifying the number of rows.  This should be
        specified if the `RowPartition` may contain empty trailing rows. Must
        be greater than `value_rowids[-1]` (or greater than or equal to zero if
        `value_rowids` is empty). Defaults to `value_rowids[-1] + 1` (or zero
        if `value_rowids` is empty).
      validate: If true, then use assertions to check that the arguments form a
        valid `RowPartition`.
      preferred_dtype: The dtype to encode value_rowids if it doesn't already
        have one. The default is tf.int64.

    Returns:
      A `RowPartition`.

    Raises:
      TypeError: If `validate` is not a `bool`.
      ValueError: If `nrows` is incompatible with `value_rowids`.

    #### Example:

    >>> print(RowPartition.from_value_rowids(
    ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
    ...     nrows=4))
    tf.RowPartition(row_splits=tf.Tensor([0 4 4 7 8], shape=(5,), dtype=int64))
    """
    # Local import bincount_ops to avoid import-cycle since bincount_ops
    # imports ragged_tensor.
    from tensorflow.python.ops import bincount_ops  # pylint: disable=g-import-not-at-top
    if not isinstance(validate, bool):
      raise TypeError("validate must have type bool")
    with ops.name_scope(None, "RowPartitionFromValueRowIds",
                        [value_rowids, nrows]):
      value_rowids = cls._convert_row_partition(value_rowids, "value_rowids",
                                                preferred_dtype)
      if nrows is None:
        # Infer nrows from the last row id, statically when possible.
        const_rowids = tensor_util.constant_value(value_rowids)
        if const_rowids is None:
          # Appending -1 makes the expression yield 0 when value_rowids is
          # empty, and value_rowids[-1] + 1 otherwise.
          nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
          const_nrows = None
        else:
          const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
          nrows = ops.convert_to_tensor(
              const_nrows, value_rowids.dtype, name="nrows")
      else:
        nrows = ops.convert_to_tensor(nrows, value_rowids.dtype, "nrows")
        const_nrows = tensor_util.constant_value(nrows)
        if const_nrows is not None:
          # nrows is statically known, so reject bad values right away.
          if const_nrows < 0:
            raise ValueError("Expected nrows >= 0; got %d" % const_nrows)
          const_rowids = tensor_util.constant_value(value_rowids)
          if const_rowids is not None and const_rowids.size > 0:
            if not const_nrows >= const_rowids[-1] + 1:
              raise ValueError(
                  "Expected nrows >= value_rowids[-1] + 1; got nrows=%d, "
                  "value_rowids[-1]=%d" % (const_nrows, const_rowids[-1]))

      # Static rank checks: value_rowids must be a vector, nrows a scalar.
      value_rowids.shape.assert_has_rank(1)
      nrows.shape.assert_has_rank(0)

      if validate:
        msg = ("Arguments to from_value_rowids do not form a valid "
               "RowPartition")
        checks = [
            check_ops.assert_rank(value_rowids, 1, message=msg),
            check_ops.assert_rank(nrows, 0, message=msg),
            check_ops.assert_non_negative(value_rowids[:1], message=msg),
            _assert_monotonic_increasing(value_rowids, message=msg),
            check_ops.assert_less(value_rowids[-1:], nrows, message=msg),
        ]
        # Attach the checks as control dependencies of value_rowids.
        value_rowids = control_flow_ops.with_dependencies(checks, value_rowids)

      # Convert value_rowids & nrows to row_splits.
      # Note: we don't use segment_ids_to_row_splits() here because we want
      # to save the intermediate value `row_lengths`, so we can cache it.
      # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the
      # cast.
      value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
      nrows_int32 = math_ops.cast(nrows, dtypes.int32)
      # minlength == maxlength == nrows requests exactly nrows bins.
      row_lengths = bincount_ops.bincount(
          value_rowids_int32,
          minlength=nrows_int32,
          maxlength=nrows_int32,
          dtype=value_rowids.dtype)
      # row_splits is the running total of row_lengths with a leading zero.
      row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
      if const_nrows is not None:
        # Record the static shapes when nrows is statically known.
        row_lengths.set_shape([const_nrows])
        row_splits.set_shape([const_nrows + 1])

      return cls(
          row_splits=row_splits,
          row_lengths=row_lengths,
          value_rowids=value_rowids,
          nrows=nrows,
          internal=_row_partition_factory_key)
예제 #19
0
 def test_negative(self):
     """Expects bincount to reject an input that contains a negative value."""
     values_with_negative = [1, 2, 3, -1, 6, 8]
     # unsorted_segment_sum will only report InvalidArgumentError on CPU
     with self.cached_session():
         with ops.device("/CPU:0"):
             with self.assertRaises(errors.InvalidArgumentError):
                 self.evaluate(bincount_ops.bincount(values_with_negative))
예제 #20
0
 def test_zero_weights(self):
     """Checks that all-zero weights produce all-zero bin totals."""
     values = np.arange(1000)
     zero_weights = np.zeros(1000)
     with self.session():
         totals = self.evaluate(bincount_ops.bincount(values, zero_weights))
         self.assertAllEqual(totals, np.zeros(1000))
예제 #21
0
def segment_ids_to_row_splits(segment_ids,
                              num_segments=None,
                              out_type=None,
                              name=None):
    """Generates the RaggedTensor `row_splits` corresponding to a segmentation.

    Returns an integer vector `splits`, where `splits[0] = 0` and
    `splits[i] = splits[i-1] + count(segment_ids == i-1)`.  Example:

    >>> print(tf.ragged.segment_ids_to_row_splits([0, 0, 0, 2, 2, 3, 4, 4, 4]))
    tf.Tensor([0 3 3 5 6 9], shape=(6,), dtype=int64)

    Args:
      segment_ids: A 1-D integer Tensor.
      num_segments: A scalar integer indicating the number of segments.
        Defaults to `max(segment_ids) + 1` (or zero if `segment_ids` is
        empty).
      out_type: The dtype for the return value.  Defaults to
        `segment_ids.dtype` when `segment_ids` is a Tensor, otherwise to
        `num_segments.dtype` when `num_segments` is a Tensor, otherwise to
        `tf.int64`.
      name: A name prefix for the returned tensor (optional).

    Returns:
      A sorted 1-D integer Tensor, with `shape=[num_segments + 1]`.
    """
    # Local import bincount_ops to avoid import-cycle.
    from tensorflow.python.ops import bincount_ops  # pylint: disable=g-import-not-at-top

    # Resolve the output dtype: an explicit request wins, then the dtype of
    # whichever argument is already a Tensor, then int64.
    if out_type is not None:
        out_type = dtypes.as_dtype(out_type)
    elif isinstance(segment_ids, ops.Tensor):
        out_type = segment_ids.dtype
    elif isinstance(num_segments, ops.Tensor):
        out_type = num_segments.dtype
    else:
        out_type = dtypes.int64

    with ops.name_scope(name, "SegmentIdsToRaggedSplits", [segment_ids]):
        # Note: we cast int64 tensors to int32, since bincount currently only
        # supports int32 inputs.
        segment_ids = ragged_util.convert_to_int_tensor(
            segment_ids, "segment_ids", dtype=dtypes.int32)
        segment_ids.shape.assert_has_rank(1)

        has_num_segments = num_segments is not None
        if has_num_segments:
            num_segments = ragged_util.convert_to_int_tensor(
                num_segments, "num_segments", dtype=dtypes.int32)
            num_segments.shape.assert_has_rank(0)

        # Per-segment counts; their running total (with a leading zero) gives
        # the split offsets.
        row_lengths = bincount_ops.bincount(
            segment_ids,
            minlength=num_segments,
            maxlength=num_segments,
            dtype=out_type)
        splits = array_ops.concat(
            [[0], math_ops.cumsum(row_lengths)], axis=0)

        # Update shape information, if possible.
        if has_num_segments:
            static_count = tensor_util.constant_value(num_segments)
            if static_count is not None:
                splits.set_shape(tensor_shape.TensorShape([static_count + 1]))

        return splits