# Imports as in TF 1.x contrib (tensorflow/contrib/layers):
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.contrib.layers.python.layers import embedding_ops as contrib_embedding_ops
from tensorflow.contrib.layers.python.ops import sparse_ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_scope


def bow_encoder(ids,
                vocab_size,
                embed_dim,
                sparse_lookup=True,
                initializer=None,
                regularizer=None,
                trainable=True,
                scope=None,
                reuse=None):
    """Maps a sequence of symbols to a vector per example by averaging embeddings.

  Args:
    ids: `[batch_size, doc_length]` `Tensor` or `SparseTensor` of type
      `int32` or `int64` with symbol ids.
    vocab_size: Integer number of symbols in vocabulary.
    embed_dim: Integer number of dimensions for embedding matrix.
    sparse_lookup: `bool`, if `True`, converts ids to a `SparseTensor`
        and performs a sparse embedding lookup. This is usually faster,
        but not desirable if padding tokens should have an embedding. Empty rows
        are assigned a special embedding.
    initializer: An initializer for the embeddings, if `None` default for
        current scope is used.
    regularizer: Optional regularizer for the embeddings.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional string specifying the variable scope for the op, required
        if `reuse=True`.
    reuse: If `True`, variables inside the op will be reused.

  Returns:
    Encoding `Tensor` `[batch_size, embed_dim]` produced by
    averaging embeddings.

  Raises:
    ValueError: If `embed_dim` or `vocab_size` are not specified.
  """
    if not vocab_size or not embed_dim:
        raise ValueError('Must specify vocab size and embedding dimension')
    with variable_scope.variable_scope(scope,
                                       'bow_encoder', [ids],
                                       reuse=reuse):
        embeddings = variables.model_variable('embeddings',
                                              shape=[vocab_size, embed_dim],
                                              initializer=initializer,
                                              regularizer=regularizer,
                                              trainable=trainable)
        if sparse_lookup:
            if isinstance(ids, sparse_tensor.SparseTensor):
                sparse_ids = ids
            else:
                sparse_ids = sparse_ops.dense_to_sparse_tensor(ids)
            return contrib_embedding_ops.safe_embedding_lookup_sparse(
                [embeddings], sparse_ids, combiner='mean', default_id=0)
        else:
            if isinstance(ids, sparse_tensor.SparseTensor):
                raise TypeError('ids are expected to be a dense Tensor, got: %s'
                                % ids)
            return math_ops.reduce_mean(
                embedding_ops.embedding_lookup(embeddings, ids), axis=1)
Example #2
    def test_safe_embedding_lookup_sparse_3d_return_zero_vector(self):
        with self.cached_session():
            embedding_weights = self._random_weights()
            sparse_ids, sparse_weights = self._ids_and_weights_3d()

            embedding_lookup_result = (
                embedding_ops.safe_embedding_lookup_sparse(
                    embedding_weights, sparse_ids, sparse_weights).eval())

            self.assertAllClose(embedding_lookup_result, [[
                (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1])
                / 3.0, [0] * 4, [0] * 4
            ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
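What this test pins down, without the harness: when `default_id` is left at `None`, rows (and, in the 3D case, inner positions) with no ids come back as all-zero vectors. A self-contained sketch, assuming `tf.contrib.layers.safe_embedding_lookup_sparse` (the public alias of the op used here) is available; the weights and ids are made up:

import numpy as np
import tensorflow as tf

weights = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))
ids = tf.SparseTensor(indices=[[0, 0], [0, 1], [2, 0]],
                      values=tf.constant([1, 3, 2], dtype=tf.int64),
                      dense_shape=[3, 2])  # row 1 has no ids
result = tf.contrib.layers.safe_embedding_lookup_sparse(
    [weights], ids, sparse_weights=None, combiner='mean')
with tf.Session() as sess:
    print(sess.run(result)[1])  # [0. 0. 0.] for the empty row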
Example #3
    def test_safe_embedding_lookup_sparse_no_weights(self):
        with self.cached_session():
            embedding_weights = self._random_weights()
            sparse_ids, _ = self._ids_and_weights_2d()

            embedding_lookup_result = (
                embedding_ops.safe_embedding_lookup_sparse(
                    embedding_weights, sparse_ids, None).eval())

            self.assertAllClose(
                embedding_lookup_result,
                [(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0,
                 [0] * 4, [0] * 4, embedding_weights[0][2],
                 (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
Example #4
    def test_safe_embedding_lookup_sparse_partitioned(self):
        with self.cached_session():
            embedding_weights = self._random_weights(num_shards=3)
            sparse_ids, _ = self._ids_and_weights_2d()

            embedding_lookup_result = (
                embedding_ops.safe_embedding_lookup_sparse(
                    embedding_weights, sparse_ids, None).eval())

            embedding_weights = list(itertools.chain(*embedding_weights))
            self.assertAllClose(
                embedding_lookup_result,
                [(embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4,
                 [0] * 4, embedding_weights[2],
                 (embedding_weights[0] + embedding_weights[1]) / 2.0])
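The `num_shards=3` variant above exercises partitioned weights. A sketch of how a sharded embedding matrix can be created and passed in, assuming TF 1.x partitioners; names and sizes are illustrative:

import tensorflow as tf

sparse_ids = tf.SparseTensor(indices=[[0, 0], [1, 0]],
                             values=tf.constant([5, 11], dtype=tf.int64),
                             dense_shape=[2, 1])
with tf.variable_scope('emb', partitioner=tf.fixed_size_partitioner(3)):
    weights = tf.get_variable('weights', shape=[12, 4], dtype=tf.float32)
# A PartitionedVariable can be passed as the list of its shards; the lookup
# routes each id to the right shard internally.
result = tf.contrib.layers.safe_embedding_lookup_sparse(
    list(weights), sparse_ids, combiner='mean')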
Example #5
    def test_safe_embedding_lookup_sparse_return_special_vector(self):
        with self.cached_session():
            embedding_weights = self._random_weights()
            sparse_ids, sparse_weights = self._ids_and_weights_2d()

            embedding_lookup_result = (
                embedding_ops.safe_embedding_lookup_sparse(
                    embedding_weights,
                    sparse_ids,
                    sparse_weights,
                    default_id=3).eval())

            self.assertAllClose(embedding_lookup_result, [
                (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1])
                / 3.0, embedding_weights[0][3], embedding_weights[0][3],
                embedding_weights[0][2], embedding_weights[0][3]
            ])
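With `default_id=3`, the fallback for empty or invalid rows is row 3 of the embedding matrix rather than zeros, which is what the assertion above checks. A sketch with made-up values (TF 1.x, contrib alias as before):

import numpy as np
import tensorflow as tf

weights = tf.constant(np.arange(16, dtype=np.float32).reshape(4, 4))
ids = tf.SparseTensor(indices=[[0, 0]],
                      values=tf.constant([2], dtype=tf.int64),
                      dense_shape=[2, 1])  # row 1 is empty
result = tf.contrib.layers.safe_embedding_lookup_sparse(
    [weights], ids, combiner='mean', default_id=3)
with tf.Session() as sess:
    print(sess.run(result)[1])  # equals row 3 of weights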
Example #6
def _create_joint_embedding_lookup(columns_to_tensors,
                                   embedding_lookup_arguments, num_outputs,
                                   trainable, weight_collections):
    """Creates an embedding lookup for all columns sharing a single weight."""
    for arg in embedding_lookup_arguments:
        assert arg.weight_tensor is None, (
            'Joint sums for weighted sparse columns are not supported. '
            'Please use weighted_sum_from_feature_columns instead.')
        assert arg.combiner == 'sum', (
            'Combiners other than sum are not supported for joint sums. '
            'Please use weighted_sum_from_feature_columns instead.')
    assert len(embedding_lookup_arguments) >= 1, (
        'At least one column must be in the model.')
    prev_size = 0
    sparse_tensors = []
    for a in embedding_lookup_arguments:
        t = a.input_tensor
        values = t.values + prev_size
        prev_size += a.vocab_size
        sparse_tensors.append(
            sparse_tensor_py.SparseTensor(t.indices, values, t.dense_shape))
    sparse_tensor = sparse_ops.sparse_concat(1, sparse_tensors)
    with variable_scope.variable_scope(None,
                                       default_name='linear_weights',
                                       values=columns_to_tensors.values()):
        variable = contrib_variables.model_variable(
            name='weights',
            shape=[prev_size, num_outputs],
            dtype=dtypes.float32,
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=weight_collections)
        if fc._is_variable(variable):  # pylint: disable=protected-access
            variable = [variable]
        else:
            variable = variable._get_variable_list()  # pylint: disable=protected-access
        predictions = embedding_ops.safe_embedding_lookup_sparse(
            variable,
            sparse_tensor,
            sparse_weights=None,
            combiner='sum',
            name='_weights')
        return variable, predictions
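The id-offsetting loop is what lets all columns share one weight matrix: each column's ids are shifted by the running total of the preceding vocab sizes before the sparse concat, so with vocab sizes 3 and 5, id 2 of the second column addresses row 3 + 2 = 5 of the joint `[8, num_outputs]` matrix. A sketch of just that offset-and-concat step, with made-up tensors (TF 1.x):

import tensorflow as tf

col_a = tf.SparseTensor([[0, 0]], tf.constant([1], dtype=tf.int64), [2, 1])
col_b = tf.SparseTensor([[0, 0]], tf.constant([2], dtype=tf.int64), [2, 1])
# Shift column b's ids past column a's vocab (size 3).
col_b = tf.SparseTensor(col_b.indices, col_b.values + 3, col_b.dense_shape)
joint_ids = tf.sparse_concat(1, [col_a, col_b])  # values are now [1, 5]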
Example #7
def _create_embedding_lookup(column, columns_to_tensors,
                             embedding_lookup_arguments, num_outputs,
                             trainable, weight_collections):
    """Creates variables and returns predictions for linear weights in a model.

  Args:
   column: the column we're working on.
   columns_to_tensors: a map from column name to tensors.
   embedding_lookup_arguments: arguments for embedding lookup.
   num_outputs: how many outputs.
   trainable: whether the variable we create is trainable.
   weight_collections: weights will be placed here.

  Returns:
  variables: the created embeddings.
  predictions: the computed predictions.
  """
    with variable_scope.variable_scope(None,
                                       default_name=column.name,
                                       values=columns_to_tensors.values()):
        variable = contrib_variables.model_variable(
            name='weights',
            shape=[embedding_lookup_arguments.vocab_size, num_outputs],
            dtype=dtypes.float32,
            initializer=embedding_lookup_arguments.initializer,
            trainable=trainable,
            collections=weight_collections)
        if fc._is_variable(variable):  # pylint: disable=protected-access
            variable = [variable]
        else:
            variable = variable._get_variable_list()  # pylint: disable=protected-access
        predictions = embedding_ops.safe_embedding_lookup_sparse(
            variable,
            embedding_lookup_arguments.input_tensor,
            sparse_weights=embedding_lookup_arguments.weight_tensor,
            combiner=embedding_lookup_arguments.combiner,
            name=column.name + '_weights')
        return variable, predictions
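For orientation, `embedding_lookup_arguments` here is an internal namedtuple of tf.contrib's feature-column machinery; the hypothetical stand-in below only mirrors the attribute accesses visible above and is not the real type definition:

import collections

# Hypothetical illustration; field names mirror the accesses in
# _create_embedding_lookup, not the actual internal definition.
LookupArguments = collections.namedtuple(
    'LookupArguments',
    ['input_tensor',   # SparseTensor of ids for the column
     'weight_tensor',  # optional SparseTensor of per-id weights
     'vocab_size',     # number of rows of the weight matrix
     'initializer',    # initializer for the weight matrix
     'combiner'])      # e.g. 'mean', 'sqrtn', or 'sum'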