def bow_encoder(ids, vocab_size, embed_dim, sparse_lookup=True, initializer=None, regularizer=None, trainable=True, scope=None, reuse=None): """Maps a sequence of symbols to a vector per example by averaging embeddings. Args: ids: `[batch_size, doc_length]` `Tensor` or `SparseTensor` of type `int32` or `int64` with symbol ids. vocab_size: Integer number of symbols in vocabulary. embed_dim: Integer number of dimensions for embedding matrix. sparse_lookup: `bool`, if `True`, converts ids to a `SparseTensor` and performs a sparse embedding lookup. This is usually faster, but not desirable if padding tokens should have an embedding. Empty rows are assigned a special embedding. initializer: An initializer for the embeddings, if `None` default for current scope is used. regularizer: Optional regularizer for the embeddings. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional string specifying the variable scope for the op, required if `reuse=True`. reuse: If `True`, variables inside the op will be reused. Returns: Encoding `Tensor` `[batch_size, embed_dim]` produced by averaging embeddings. Raises: ValueError: If `embed_dim` or `vocab_size` are not specified. """ if not vocab_size or not embed_dim: raise ValueError('Must specify vocab size and embedding dimension') with variable_scope.variable_scope(scope, 'bow_encoder', [ids], reuse=reuse): embeddings = variables.model_variable('embeddings', shape=[vocab_size, embed_dim], initializer=initializer, regularizer=regularizer, trainable=trainable) if sparse_lookup: if isinstance(ids, ops.SparseTensor): sparse_ids = ids else: sparse_ids = sparse_ops.dense_to_sparse_tensor(ids) return contrib_embedding_ops.safe_embedding_lookup_sparse( [embeddings], sparse_ids, combiner='mean', default_id=0) else: if isinstance(ids, ops.SparseTensor): raise TypeError('ids are expected to be dense Tensor, got: %s', ids) return math_ops.reduce_mean(embedding_ops.embedding_lookup( embeddings, ids), reduction_indices=1)
def bow_encoder(ids, vocab_size, embed_dim, sparse_lookup=True, initializer=None, regularizer=None, trainable=True, scope=None, reuse=None): """Maps a sequence of symbols to a vector per example by averaging embeddings. Args: ids: `[batch_size, doc_length]` `Tensor` or `SparseTensor` of type `int32` or `int64` with symbol ids. vocab_size: Integer number of symbols in vocabulary. embed_dim: Integer number of dimensions for embedding matrix. sparse_lookup: `bool`, if `True`, converts ids to a `SparseTensor` and performs a sparse embedding lookup. This is usually faster, but not desirable if padding tokens should have an embedding. Empty rows are assigned a special embedding. initializer: An initializer for the embeddings, if `None` default for current scope is used. regularizer: Optional regularizer for the embeddings. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional string specifying the variable scope for the op, required if `reuse=True`. reuse: If `True`, variables inside the op will be reused. Returns: Encoding `Tensor` `[batch_size, embed_dim]` produced by averaging embeddings. Raises: ValueError: If `embed_dim` or `vocab_size` are not specified. """ if not vocab_size or not embed_dim: raise ValueError('Must specify vocab size and embedding dimension') with variable_scope.variable_scope( scope, 'bow_encoder', [ids], reuse=reuse): embeddings = variables.model_variable( 'embeddings', shape=[vocab_size, embed_dim], initializer=initializer, regularizer=regularizer, trainable=trainable) if sparse_lookup: if isinstance(ids, sparse_tensor.SparseTensor): sparse_ids = ids else: sparse_ids = sparse_ops.dense_to_sparse_tensor(ids) return contrib_embedding_ops.safe_embedding_lookup_sparse( [embeddings], sparse_ids, combiner='mean', default_id=0) else: if isinstance(ids, sparse_tensor.SparseTensor): raise TypeError('ids are expected to be dense Tensor, got: %s', ids) return math_ops.reduce_mean( embedding_ops.embedding_lookup(embeddings, ids), reduction_indices=1)
def test_safe_embedding_lookup_sparse_3d_return_zero_vector(self): with self.cached_session(): embedding_weights = self._random_weights() sparse_ids, sparse_weights = self._ids_and_weights_3d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights).eval()) self.assertAllClose(embedding_lookup_result, [[ (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0, [0] * 4, [0] * 4 ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
def test_safe_embedding_lookup_sparse_return_zero_vector(self): with self.test_session(): embedding_weights = self._random_weights() sparse_ids, sparse_weights = self._ids_and_weights_2d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights).eval()) self.assertAllClose( embedding_lookup_result, [(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0, [0] * 4, [0] * 4, embedding_weights[0][2], [0] * 4])
def test_safe_embedding_lookup_sparse_partitioned(self): with self.cached_session(): embedding_weights = self._random_weights(num_shards=3) sparse_ids, _ = self._ids_and_weights_2d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, None).eval()) embedding_weights = list(itertools.chain(*embedding_weights)) self.assertAllClose(embedding_lookup_result, [(embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4, [0] * 4, embedding_weights[2], (embedding_weights[0] + embedding_weights[1]) / 2.0])
def test_safe_embedding_lookup_sparse_no_weights(self): with self.cached_session(): embedding_weights = self._random_weights() sparse_ids, _ = self._ids_and_weights_2d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, None).eval()) self.assertAllClose( embedding_lookup_result, [(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [0] * 4, embedding_weights[0][2], ( embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
def test_safe_embedding_lookup_sparse_return_special_vector(self): with self.cached_session(): embedding_weights = self._random_weights() sparse_ids, sparse_weights = self._ids_and_weights_2d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights, default_id=3).eval()) self.assertAllClose( embedding_lookup_result, [(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0, embedding_weights[0][3], embedding_weights[0][3], embedding_weights[0][2], embedding_weights[0][3]])
def test_safe_embedding_lookup_sparse_partitioned(self): with self.test_session(): embedding_weights = self._random_weights(num_shards=3) sparse_ids, _ = self._ids_and_weights_2d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, None).eval()) embedding_weights = list(itertools.chain(*embedding_weights)) self.assertAllClose(embedding_lookup_result, [(embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4, [0] * 4, embedding_weights[2], (embedding_weights[0] + embedding_weights[1]) / 2.0])
def test_safe_embedding_lookup_sparse_no_weights(self): with self.test_session(): embedding_weights = self._random_weights() sparse_ids, _ = self._ids_and_weights_2d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, None).eval()) self.assertAllClose( embedding_lookup_result, [(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [0] * 4, embedding_weights[0][2], (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
def test_safe_embedding_lookup_sparse_3d_return_special_vector(self): with self.test_session(): embedding_weights = self._random_weights() sparse_ids, sparse_weights = self._ids_and_weights_3d() embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights, default_id=3).eval()) self.assertAllClose( embedding_lookup_result, [[(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0, embedding_weights[0][3], embedding_weights[0][3]], [ embedding_weights[0][2], embedding_weights[0][3], embedding_weights[0][3] ]])
def _create_joint_embedding_lookup(columns_to_tensors, embedding_lookup_arguments, num_outputs, trainable, weight_collections): """Creates an embedding lookup for all columns sharing a single weight.""" for arg in embedding_lookup_arguments: assert arg.weight_tensor is None, ( 'Joint sums for weighted sparse columns are not supported. ' 'Please use weighted_sum_from_feature_columns instead.') assert arg.combiner == 'sum', ( 'Combiners other than sum are not supported for joint sums. ' 'Please use weighted_sum_from_feature_columns instead.') assert len(embedding_lookup_arguments) >= 1, ( 'At least one column must be in the model.') prev_size = 0 sparse_tensors = [] for a in embedding_lookup_arguments: t = a.input_tensor values = t.values + prev_size prev_size += a.vocab_size sparse_tensors.append( ops.SparseTensor(t.indices, values, t.shape)) sparse_tensor = sparse_ops.sparse_concat(1, sparse_tensors) with variable_scope.variable_scope( None, default_name='linear_weights', values=columns_to_tensors.values()): variable = contrib_variables.model_variable( name='weights', shape=[prev_size, num_outputs], dtype=dtypes.float32, initializer=init_ops.zeros_initializer, trainable=trainable, collections=weight_collections) if isinstance(variable, variables.Variable): variable = [variable] else: variable = variable._get_variable_list() # pylint: disable=protected-access predictions = embedding_ops.safe_embedding_lookup_sparse( variable, sparse_tensor, sparse_weights=None, default_id=0, combiner='sum', name='_weights') return variable, predictions
def _create_embedding_lookup(column, columns_to_tensors, embedding_lookup_arguments, num_outputs, trainable, weight_collections): """Creates variables and returns predictions for linear weights in a model. Args: column: the column we're working on. columns_to_tensors: a map from column name to tensors. embedding_lookup_arguments: arguments for embedding lookup. num_outputs: how many outputs. trainable: whether the variable we create is trainable. weight_collections: weights will be placed here. Returns: variables: the created embeddings. predictions: the computed predictions. """ with variable_scope.variable_scope( None, default_name=column.name, values=columns_to_tensors.values()): variable = contrib_variables.model_variable( name='weights', shape=[embedding_lookup_arguments.vocab_size, num_outputs], dtype=dtypes.float32, initializer=embedding_lookup_arguments.initializer, trainable=trainable, collections=weight_collections) if isinstance(variable, variables.Variable): variable = [variable] else: variable = variable._get_variable_list() # pylint: disable=protected-access predictions = embedding_ops.safe_embedding_lookup_sparse( variable, embedding_lookup_arguments.input_tensor, sparse_weights=embedding_lookup_arguments.weight_tensor, default_id=0, combiner=embedding_lookup_arguments.combiner, name=column.name + '_weights') return variable, predictions
def _embeddings_from_arguments(column, args, weight_collections, trainable, output_rank=2): """Returns embeddings for a column based on the computed arguments. Args: column: the column name. args: the _DeepEmbeddingLookupArguments for this column. weight_collections: collections to store weights in. trainable: whether these embeddings should be trainable. output_rank: the desired rank of the returned `Tensor`. Inner dimensions will be combined to produce the desired rank. Returns: the embeddings. Raises: ValueError: if not possible to create. """ # pylint: disable=protected-access input_tensor = layers._inner_flatten(args.input_tensor, output_rank) weight_tensor = None if args.weight_tensor is not None: weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank) # pylint: enable=protected-access # This option is only enabled for scattered_embedding_column. if args.hash_key: embeddings = contrib_variables.model_variable( name='weights', shape=[args.vocab_size], dtype=dtypes.float32, initializer=args.initializer, trainable=trainable, collections=weight_collections) return embedding_ops.scattered_embedding_lookup_sparse( embeddings, input_tensor, args.dimension, hash_key=args.hash_key, combiner=args.combiner, name='lookup') if args.shared_embedding_name is not None: shared_embedding_collection_name = ( 'SHARED_EMBEDDING_COLLECTION_' + args.shared_embedding_name.upper()) graph = ops.get_default_graph() shared_embedding_collection = ( graph.get_collection_ref(shared_embedding_collection_name)) shape = [args.vocab_size, args.dimension] if shared_embedding_collection: if len(shared_embedding_collection) > 1: raise ValueError('Collection %s can only contain one ' '(partitioned) variable.' % shared_embedding_collection_name) else: embeddings = shared_embedding_collection[0] if embeddings.get_shape() != shape: raise ValueError('The embedding variable with name {} already ' 'exists, but its shape does not match required ' 'embedding shape here. Please make sure to use ' 'different shared_embedding_name for different ' 'shared embeddings.'.format( args.shared_embedding_name)) else: embeddings = contrib_variables.model_variable( name=args.shared_embedding_name, shape=shape, dtype=dtypes.float32, initializer=args.initializer, trainable=trainable, collections=weight_collections) graph.add_to_collection(shared_embedding_collection_name, embeddings) else: embeddings = contrib_variables.model_variable( name='weights', shape=[args.vocab_size, args.dimension], dtype=dtypes.float32, initializer=args.initializer, trainable=trainable, collections=weight_collections) if isinstance(embeddings, variables.Variable): embeddings = [embeddings] else: embeddings = embeddings._get_variable_list() # pylint: disable=protected-access # pylint: disable=protected-access _maybe_restore_from_checkpoint( column._checkpoint_path(), embeddings) return embedding_ops.safe_embedding_lookup_sparse( embeddings, input_tensor, sparse_weights=weight_tensor, combiner=args.combiner, name=column.name + 'weights', max_norm=args.max_norm)
def _embeddings_from_arguments(column, args, weight_collections, trainable, output_rank=2): """Returns embeddings for a column based on the computed arguments. Args: column: the column name. args: the _DeepEmbeddingLookupArguments for this column. weight_collections: collections to store weights in. trainable: whether these embeddings should be trainable. output_rank: the desired rank of the returned `Tensor`. Inner dimensions will be combined to produce the desired rank. Returns: the embeddings. Raises: ValueError: if not possible to create. """ # pylint: disable=protected-access input_tensor = layers._inner_flatten(args.input_tensor, output_rank) weight_tensor = None if args.weight_tensor is not None: weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank) # pylint: enable=protected-access if args.hashed: embeddings = contrib_variables.model_variable( name='weights', shape=[args.vocab_size], dtype=dtypes.float32, initializer=args.initializer, trainable=trainable, collections=weight_collections) return embedding_ops.hashed_embedding_lookup_sparse( embeddings, input_tensor, args.dimension, combiner=args.combiner, name='lookup') if args.shared_embedding_name is not None: shared_embedding_collection_name = ( 'SHARED_EMBEDDING_COLLECTION_' + args.shared_embedding_name.upper()) graph = ops.get_default_graph() shared_embedding_collection = ( graph.get_collection_ref(shared_embedding_collection_name)) shape = [args.vocab_size, args.dimension] if shared_embedding_collection: if len(shared_embedding_collection) > 1: raise ValueError('Collection %s can only contain one ' '(partitioned) variable.' % shared_embedding_collection_name) else: embeddings = shared_embedding_collection[0] if embeddings.get_shape() != shape: raise ValueError('The embedding variable with name {} already ' 'exists, but its shape does not match required ' 'embedding shape here. Please make sure to use ' 'different shared_embedding_name for different ' 'shared embeddings.'.format( args.shared_embedding_name)) else: embeddings = contrib_variables.model_variable( name=args.shared_embedding_name, shape=shape, dtype=dtypes.float32, initializer=args.initializer, trainable=trainable, collections=weight_collections) graph.add_to_collection(shared_embedding_collection_name, embeddings) else: embeddings = contrib_variables.model_variable( name='weights', shape=[args.vocab_size, args.dimension], dtype=dtypes.float32, initializer=args.initializer, trainable=trainable, collections=weight_collections) if isinstance(embeddings, variables.Variable): embeddings = [embeddings] else: embeddings = embeddings._get_variable_list() # pylint: disable=protected-access # pylint: disable=protected-access _maybe_restore_from_checkpoint( column._checkpoint_path(), embeddings) return embedding_ops.safe_embedding_lookup_sparse( embeddings, input_tensor, sparse_weights=weight_tensor, combiner=args.combiner, name=column.name + 'weights', max_norm=args.max_norm)