def test_dense(self):
    """Crosses two dense string matrices and checks every pairwise result."""
    first_column = constant_op.constant(
        [['batch1-FC1-F1', 'batch1-FC1-F2'],
         ['batch2-FC1-F1', 'batch2-FC1-F2']],
        dtypes.string)
    second_column = constant_op.constant(
        [['batch1-FC2-F1', 'batch1-FC2-F2'],
         ['batch2-FC2-F1', 'batch2-FC2-F2']],
        dtypes.string)
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [first_column, second_column])

    # Every row is expected to hold the 2 x 2 product of that row's
    # features, with the two sides joined by "_X_".
    expected_rows = [
        [
            'batch1-FC1-F1_X_batch1-FC2-F1',
            'batch1-FC1-F1_X_batch1-FC2-F2',
            'batch1-FC1-F2_X_batch1-FC2-F1',
            'batch1-FC1-F2_X_batch1-FC2-F2',
        ],
        [
            'batch2-FC1-F1_X_batch2-FC2-F1',
            'batch2-FC1-F1_X_batch2-FC2-F2',
            'batch2-FC1-F2_X_batch2-FC2-F1',
            'batch2-FC1-F2_X_batch2-FC2-F2',
        ],
    ]
    expected_out = self._sparse_tensor(expected_rows)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_one_column_empty(self):
    """Checks that a single empty input column yields an empty cross."""
    populated_first = self._sparse_tensor(
        [['batch1-FC1-F1', 'batch1-FC1-F2']])
    # NOTE(review): the second argument is presumably the batch size of the
    # otherwise empty tensor -- confirm against `_sparse_tensor`.
    empty_middle = self._sparse_tensor([], 1)
    populated_last = self._sparse_tensor(
        [['batch1-FC3-F1', 'batch1-FC3-F2']])

    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [populated_first, empty_middle, populated_last])

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_empty(actual)
def test_all_columns_empty(self):
    """Checks that crossing three empty columns yields an empty cross."""
    empty_columns = [self._sparse_tensor([]) for _ in range(3)]
    crossed = sparse_feature_cross_op.sparse_feature_cross(empty_columns)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_empty(actual)
def test_hashed_output_v1_has_collision(self):
    """Checks that hashing without a `hash_key` makes the two rows collide."""
    # 359 and 359 + 1024 share their lowest 10 bits, and the cross is
    # hashed into 1024 buckets; this test expects every cross of the first
    # row to land in the same bucket as its counterpart in the second row.
    colliding_ids = constant_op.constant([[359], [359 + 1024]])
    digits = constant_op.constant([list(range(10)) for _ in range(2)])

    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [digits, colliding_ids], hashed_output=True, num_buckets=1024)
    crossed_dense = sparse_ops.sparse_tensor_to_dense(crossed)

    with session.Session():
        values = crossed_dense.eval()

    rows_match = numpy.equal(values[0], values[1]).all()
    self.assertTrue(rows_match)
def test_hashed_output_zero_bucket(self):
    """Pins the hashed cross value when no `num_buckets` is passed."""
    single_feature_columns = [
        self._sparse_tensor([['batch1-FC1-F1']]),
        self._sparse_tensor([['batch1-FC2-F1']]),
        self._sparse_tensor([['batch1-FC3-F1']]),
    ]
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        single_feature_columns, hashed_output=True)

    # The literal below is a golden value: it exists so that an accidental
    # change to the hashing scheme is caught by this test.
    expected_out = self._sparse_tensor([[3735511728867393167]])

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_hashed_output_zero_bucket_v2(self):
    """Pins the hashed cross value with the default hash key, no buckets."""
    single_feature_columns = [
        self._sparse_tensor([['batch1-FC1-F1']]),
        self._sparse_tensor([['batch1-FC2-F1']]),
        self._sparse_tensor([['batch1-FC3-F1']]),
    ]
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        single_feature_columns,
        hashed_output=True,
        hash_key=layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)

    # The literal below is a golden value: it exists so that an accidental
    # change to the hashing scheme is caught by this test.
    expected_out = self._sparse_tensor([[1971693436396284976]])

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_integer_mixed_string_sparse(self):
    """Crosses a sparse integer column with a sparse string column."""
    integer_column = self._sparse_tensor([[11], [333, 55555]])
    string_column = self._sparse_tensor(
        [['batch1-FC2-F1'], ['batch2-FC2-F1', 'batch2-FC2-F2']])
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [integer_column, string_column])

    # Integers are expected to appear in their decimal form on the left of
    # the "_X_" separator.
    expected_rows = [
        ['11_X_batch1-FC2-F1'],
        [
            '333_X_batch2-FC2-F1',
            '333_X_batch2-FC2-F2',
            '55555_X_batch2-FC2-F1',
            '55555_X_batch2-FC2-F2',
        ],
    ]
    expected_out = self._sparse_tensor(expected_rows)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_hashed_output_v2_has_no_collision(self):
    """Checks that hashing with the default `hash_key` avoids collisions."""
    # 359 and 359 + 1024 share their lowest 10 bits, yet with a hash key
    # supplied this test expects every cross of the first row to differ
    # from its counterpart in the second row.
    colliding_ids = constant_op.constant([[359], [359 + 1024]])
    digits = constant_op.constant([list(range(10)) for _ in range(2)])

    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [digits, colliding_ids],
        hashed_output=True,
        num_buckets=1024,
        hash_key=layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)
    crossed_dense = sparse_ops.sparse_tensor_to_dense(crossed)

    with session.Session():
        values = crossed_dense.eval()

    rows_differ_everywhere = numpy.not_equal(values[0], values[1]).all()
    self.assertTrue(rows_differ_everywhere)
def test_integer_sparse_input(self):
    """Crosses a sparse integer column with a dense string matrix."""
    sparse_integers = self._sparse_tensor([[11], [333, 5555]])
    dense_strings = constant_op.constant(
        [['batch1-FC2-F1', 'batch1-FC2-F2'],
         ['batch2-FC2-F1', 'batch2-FC2-F2']],
        dtypes.string)
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [sparse_integers, dense_strings])

    expected_rows = [
        ['11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2'],
        [
            '333_X_batch2-FC2-F1',
            '333_X_batch2-FC2-F2',
            '5555_X_batch2-FC2-F1',
            '5555_X_batch2-FC2-F2',
        ],
    ]
    expected_out = self._sparse_tensor(expected_rows)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_hashed_3x1x2(self):
    """Checks a hashed 3x1x2 cross: six distinct ids inside [0, 1000)."""
    three_features = self._sparse_tensor(
        [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']])
    one_feature = self._sparse_tensor([['batch1-FC2-F1']])
    two_features = self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])

    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [three_features, one_feature, two_features],
        hashed_output=True,
        num_buckets=1000)

    with self.cached_session() as sess:
        out = sess.run(crossed)

        # 3 * 1 * 2 combinations, all placed in the single batch row 0.
        self.assertEqual(6, len(out.values))
        expected_indices = [[0, column] for column in range(6)]
        self.assertAllEqual(expected_indices, out.indices)

        # Every hashed id must fall inside the requested bucket range.
        within_buckets = all(0 <= value < 1000 for value in out.values)
        self.assertTrue(within_buckets)

        # No two crosses may share a bucket in this small example.
        distinct_values = set(out.values)
        all_values_are_different = len(out.values) == len(distinct_values)
        self.assertTrue(all_values_are_different)
def test_some_columns_empty(self):
    """Checks that a row is only crossed when every column has data for it.

    Two of the three columns only carry features for the first row, so the
    expected output contains crosses for that row alone.
    """
    # NOTE(review): the trailing `2` is presumably the batch size handed to
    # `_sparse_tensor` -- confirm against the helper.
    first_row_only_a = self._sparse_tensor(
        [['batch1-FC1-F1', 'batch1-FC1-F2']], 2)
    both_rows = self._sparse_tensor(
        [['batch1-FC2-F1'], ['batch2-FC2-F1']], 2)
    first_row_only_b = self._sparse_tensor(
        [['batch1-FC3-F1', 'batch1-FC3-F2']], 2)

    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [first_row_only_a, both_rows, first_row_only_b])

    expected_rows = [[
        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
    ]]
    expected_out = self._sparse_tensor(expected_rows, 2)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_simple(self):
    """Crosses two sparse string columns over a two-row batch."""
    left_column = self._sparse_tensor(
        [['batch1-FC1-F1'], ['batch2-FC1-F1', 'batch2-FC1-F2']])
    right_column = self._sparse_tensor(
        [['batch1-FC2-F1'], ['batch2-FC2-F1', 'batch2-FC2-F2']])
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [left_column, right_column])

    # Row 1 has a single pair; row 2 has the full 2 x 2 product.
    expected_rows = [
        ['batch1-FC1-F1_X_batch1-FC2-F1'],
        [
            'batch2-FC1-F1_X_batch2-FC2-F1',
            'batch2-FC1-F1_X_batch2-FC2-F2',
            'batch2-FC1-F2_X_batch2-FC2-F1',
            'batch2-FC1-F2_X_batch2-FC2-F2',
        ],
    ]
    expected_out = self._sparse_tensor(expected_rows)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_permutation_3x1x2(self):
    """Checks the six string crosses produced by a 3x1x2 column layout."""
    three_features = self._sparse_tensor(
        [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']])
    one_feature = self._sparse_tensor([['batch1-FC2-F1']])
    two_features = self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])

    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [three_features, one_feature, two_features])

    # The expected order varies the last column fastest.
    expected_rows = [[
        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2',
    ]]
    expected_out = self._sparse_tensor(expected_rows)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def test_large_batch(self):
    """Runs a 3x1x2 cross over 5000 rows and checks every row's output.

    The original docstring states the large batch is meant to force
    multithreading inside the op.
    """
    batch_size = 5000
    rows = range(batch_size)

    col1 = [
        ['batch%d-FC1-F1' % b, 'batch%d-FC1-F2' % b, 'batch%d-FC1-F3' % b]
        for b in rows
    ]
    col2 = [['batch%d-FC2-F1' % b] for b in rows]
    col3 = [['batch%d-FC3-F1' % b, 'batch%d-FC3-F2' % b] for b in rows]

    crossed = sparse_feature_cross_op.sparse_feature_cross([
        self._sparse_tensor(col1),
        self._sparse_tensor(col2),
        self._sparse_tensor(col3),
    ])

    # Per row: three FC1 features x one FC2 feature x two FC3 features.
    col_out = [
        [
            'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
            'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
            'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
            'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
            'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
            'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
        ]
        for b in rows
    ]
    expected_out = self._sparse_tensor(col_out)

    with self.cached_session() as sess:
        actual = sess.run(crossed)
        self._assert_sparse_tensor_equals(expected_out, actual)
def _sampled_scattered_embedding_lookup(params,
                                        values,
                                        dimension=None,
                                        sampled_candidates=None,
                                        hash_key=None,
                                        name=None):
    """Looks up hashed-parameter embeddings for every entry of `values`.

    When `sampled_candidates` is supplied only those embedding dimensions are
    looked up; otherwise all `dimension` components are. Component i of the
    embedding of a value v is the weight whose index is a fingerprint of the
    pair (v, i). The idea is described as "feature hashing" for model
    compression in http://arxiv.org/pdf/1504.04788.pdf

    Feature hashing needs no pre-determined vocabulary and lets a fixed
    amount of memory back embeddings for a very large number of features.
    Compared with shared out-of-vocabulary "hash buckets", each token is
    extremely likely to get its own embedding rather than sharing one with
    colliding tokens; the price is one hash per embedding scalar instead of
    one hash per token.

    If `params` is a list it is treated as a partition of the embedding
    parameters. All tensors must have the same length, except that the
    leading ones may hold one extra element: 10 parameters split four ways
    must have lengths `[3, 3, 2, 2]`.

    Args:
      params: A `Tensor`, `list` of `Tensors`, or `PartitionedVariable`.
        Each tensor must be of rank 1 with fully-defined shape.
      values: `Tensor` of values to be embedded with shape `[d0, ..., dn]`.
      dimension: Embedding dimension. The user must specify either
        `dimension` or `sampled_candidates`.
      sampled_candidates: An optional `Tensor` of slice indices to keep along
        the final dimension with shape `[d0, ..., dn, N]`. If given,
        `dimension` is ignored. If `None`, looks up all candidates.
      hash_key: Optional hash key, forwarded to `sparse_feature_cross`, where
        it is used by the `FingerprintCat64` function to combine the crosses
        fingerprints on SparseFeatureCrossOp.
      name: An optional name for this op.

    Returns:
      A `Tensor` with shape `[d0, ..., dn, dimension]`.
      If `sampled_candidates` is given, the output shape is
      `[d0, ..., dn, N]`.

    Raises:
      ValueError: if neither `dimension` nor `sampled_candidates` is given,
        if `dimension` is not positive, or if the partition size is invalid.
    """
    # Normalise `params` to a plain Python list of tensors.
    if isinstance(params, variables.PartitionedVariable):
        params = list(params)
    if not isinstance(params, list):
        params = [params]

    scope_inputs = params + [dimension, values]
    with ops.name_scope(name, "scattered_embedding_lookup", scope_inputs):
        # Remember the caller's shape, then flatten the values into a column.
        values_shape = array_ops.shape(values)
        values = array_ops.reshape(values, [-1, 1])

        if sampled_candidates is not None:
            # The embedding width is the size of the candidates' last axis.
            candidates_dims = array_ops.shape(sampled_candidates)
            last_axis = math_ops.subtract(
                array_ops.rank(sampled_candidates), 1)
            dimension = candidates_dims[last_axis]

            # `shape` is deliberately called a second time so the graph
            # holds the same ops as before.
            sampled_candidates_shape = array_ops.shape(sampled_candidates)
            dimension_tensor = array_ops.reshape(dimension, shape=[1])
            expected_shape = array_ops.concat(
                [values_shape, dimension_tensor], 0)

            shapes_agree = math_ops.reduce_all(
                math_ops.equal(sampled_candidates_shape, expected_shape))
            shape_check = control_flow_ops.Assert(shapes_agree, [
                "The shape of sampled_candidates: ",
                sampled_candidates_shape,
                " does not match the shape of values: ",
                values_shape,
            ])
            with ops.control_dependencies([shape_check]):
                # Flatten sampled_candidates the same way values were.
                sampled_candidates = array_ops.reshape(
                    sampled_candidates, [-1, dimension])
        else:
            if dimension is None:
                raise ValueError(
                    "You must specify either dimension or sampled_candidates.")
            if dimension <= 0:
                raise ValueError(
                    "Dimension must be >0. Given is %d" % dimension)
            # One row [0, ..., dimension - 1] per flattened value.
            all_components = array_ops.expand_dims(
                math_ops.range(0, dimension), 0)
            sampled_candidates = array_ops.tile(
                all_components, array_ops.shape(values))

        # Collect the statically known length of every partition.
        partition_sizes = []
        for partition in params:
            partition_shape = partition.get_shape()
            partition_shape.assert_has_rank(1)
            partition_shape.assert_is_fully_defined()
            partition_sizes.append(
                tensor_shape.dimension_value(partition_shape[0]))
        num_partitions = len(partition_sizes)
        num_params = sum(partition_sizes)  # Total number of parameters.

        # Check each partition has the size an even split would give it.
        for index, actual_size in enumerate(partition_sizes):
            expected_size = (num_params - index - 1) // num_partitions + 1
            if actual_size != expected_size:
                raise ValueError(
                    "Tensor %d in params has size %d, expected %d." %
                    (index, actual_size, expected_size))

        # With two values v1 and v2 and 3 dimensions, we will cross
        # [[0, 1, 2], [0, 1, 2]] with [[v1], [v2]].
        sparse_ids = sparse_feature_cross_op.sparse_feature_cross(
            [sampled_candidates, values],
            hashed_output=True,
            num_buckets=num_params,
            hash_key=hash_key)
        dense_ids = sparse_ops.sparse_tensor_to_dense(sparse_ids)

        # No need to validate the indices since we have checked the params
        # dimensions and we know the largest id.
        looked_up = embedding_ops.embedding_lookup(
            params, dense_ids, partition_strategy="div")

        # Restore the caller's leading dimensions, with the embedding axis
        # appended.
        output_shape = array_ops.concat([values_shape, [dimension]], 0)
        return array_ops.reshape(looked_up, output_shape)