def testSparseColumnWithVocabularyFile(self): b = fc.sparse_column_with_vocabulary_file("bbb", vocabulary_file="a_file", vocab_size=454) self.assertEqual(b.dtype, dtypes.string) self.assertEqual(b.lookup_config.vocab_size, 454) self.assertEqual(b.lookup_config.vocabulary_file, "a_file") with self.assertRaises(ValueError): # Vocabulary size should be defined if vocabulary_file is used. fc.sparse_column_with_vocabulary_file("bbb", vocabulary_file="somefile") b = fc.sparse_column_with_vocabulary_file("bbb", vocabulary_file="a_file", vocab_size=454, dtype=dtypes.int64) self.assertEqual(b.dtype, dtypes.int64) with self.assertRaisesRegexp(ValueError, "dtype must be string or integer"): b = fc.sparse_column_with_vocabulary_file("bbb", vocabulary_file="a_file", vocab_size=454, dtype=dtypes.float32)
def testSparseColumnWithVocabularyFile(self):
  b = fc.sparse_column_with_vocabulary_file(
      "bbb", vocabulary_file="a_file", vocab_size=454)
  self.assertEqual(b.dtype, dtypes.string)
  self.assertEqual(b.lookup_config.vocab_size, 454)
  self.assertEqual(b.lookup_config.vocabulary_file, "a_file")

  with self.assertRaises(ValueError):
    # Vocabulary size should be defined if vocabulary_file is used.
    fc.sparse_column_with_vocabulary_file("bbb", vocabulary_file="somefile")

  b = fc.sparse_column_with_vocabulary_file(
      "bbb", vocabulary_file="a_file", vocab_size=454, dtype=dtypes.int64)
  self.assertEqual(b.dtype, dtypes.int64)

  with self.assertRaisesRegexp(ValueError, "dtype must be string or integer"):
    b = fc.sparse_column_with_vocabulary_file(
        "bbb", vocabulary_file="a_file", vocab_size=454, dtype=dtypes.float32)
def testWeightedSparseColumnWithVocabularyFile(self):
  ids = fc.sparse_column_with_vocabulary_file(
      "ids", "a_file", num_oov_buckets=7, vocab_size=3)
  weighted_ids = fc.weighted_sparse_column(ids, "weights")
  self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
  self.assertEqual(weighted_ids.lookup_config, ids.lookup_config)
  self.assertEqual(weighted_ids.lookup_config.vocab_size, 3)
  self.assertEqual(weighted_ids.lookup_config.num_oov_buckets, 7)
  self.assertEqual(weighted_ids.lookup_config.vocabulary_file, "a_file")
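# Hypothetical usage sketch (not part of the original tests): a weighted
# vocabulary column is normally consumed through an embedding column, where the
# "weights" feature scales each id's contribution when rows are combined.
# Assumes the same TF 1.x contrib feature-column API (`fc`) used above.
ids = fc.sparse_column_with_vocabulary_file(
    "ids", "a_file", num_oov_buckets=7, vocab_size=3)
weighted_ids = fc.weighted_sparse_column(ids, "weights")
weighted_embedding = fc.embedding_column(weighted_ids, dimension=4, combiner="sum")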
def testSparseColumnVocabularyDeepCopy(self):
  """Tests deepcopy of sparse_column_with_vocabulary_file."""
  column = fc.sparse_column_with_vocabulary_file(
      "a", vocabulary_file="path_to_file", vocab_size=3)
  self.assertEqual("a", column.name)
  column_copy = copy.deepcopy(column)
  self.assertEqual("a", column_copy.name)
  self.assertEqual(
      fc._SparseIdLookupConfig(  # pylint: disable=protected-access
          vocabulary_file="path_to_file",
          num_oov_buckets=0,
          vocab_size=3,
          default_value=-1),
      column_copy.lookup_config)
  self.assertFalse(column_copy.is_integerized)
import tensorflow as tf
# The imports below assume the TF 1.x contrib feature-column API; adjust them to
# the project's actual modules if they differ.
from tensorflow.contrib import layers
from tensorflow.contrib.layers.python.layers import feature_column as fc
from tensorflow.contrib.layers.python.layers.feature_column import _EmbeddingColumn

# `feature_name_key`, `value_type_key`, `_get_combiner`, and
# `embedding_bucketized_column` are helpers defined elsewhere in this module.


def gen_feature(feature_conf):
  """Builds a feature column from a single feature config dict."""
  name = feature_conf[feature_name_key]
  value_type = feature_conf[value_type_key]

  if "vocab_size" in feature_conf and "vocabulary_file" not in feature_conf:
    # Fixed vocabulary of ids 0..vocab_size-1, wrapped in an embedding. Keys are
    # stringified so they match the declared dtype (tf.string). Configs that also
    # carry "vocabulary_file" fall through to the dedicated branch below.
    id_feature = fc.sparse_column_with_keys(
        column_name=name,
        keys=[str(i) for i in range(feature_conf['vocab_size'])],
        dtype=tf.string)
    return fc._EmbeddingColumn(  # pylint: disable=protected-access
        id_feature,
        dimension=feature_conf['embedding_dimension'],
        shared_embedding_name=feature_conf.get(feature_name_key),
    )
  elif "hash_bucket_size" in feature_conf \
      and "embedding_dimension" not in feature_conf:
    # Hashed sparse id column used directly, without an embedding.
    if value_type == "Int":
      id_feature = layers.sparse_column_with_integerized_feature(
          column_name=name,
          bucket_size=feature_conf['hash_bucket_size'],
          combiner=_get_combiner(feature_conf),
          # use_hashmap=use_hashmap
      )
    else:
      id_feature = layers.sparse_column_with_hash_bucket(
          column_name=name,
          hash_bucket_size=feature_conf['hash_bucket_size'],
          combiner=_get_combiner(feature_conf),
          # use_hashmap=use_hashmap
      )
    return id_feature
  elif "embedding_dimension" in feature_conf \
      and "hash_bucket_size" in feature_conf \
      and "boundaries" not in feature_conf \
      and "vocabulary_file" not in feature_conf:
    # Hashed sparse id column wrapped in an embedding.
    if value_type == "Int":
      return _EmbeddingColumn(
          sparse_id_column=layers.sparse_column_with_integerized_feature(
              column_name=name,
              bucket_size=feature_conf['hash_bucket_size'],
              combiner=_get_combiner(feature_conf),
              # use_hashmap=use_hashmap
          ),
          dimension=feature_conf['embedding_dimension'],
          combiner=_get_combiner(feature_conf),
          shared_embedding_name=feature_conf.get('shared_name', None))
    else:
      id_feature = layers.sparse_column_with_hash_bucket(
          column_name=name,
          hash_bucket_size=feature_conf['hash_bucket_size'],
          # use_hashmap=use_hashmap
      )
      return _EmbeddingColumn(
          id_feature,
          dimension=feature_conf['embedding_dimension'],
          combiner=_get_combiner(feature_conf),
          shared_embedding_name=feature_conf.get('shared_name', None),
          max_norm=None)
  elif "embedding_dimension" in feature_conf \
      and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
    # Vocabulary-file sparse id column wrapped in an embedding.
    use_hashmap = feature_conf.get("use_hashmap", False)
    if value_type == "Int":
      raise Exception(
          "embedding with vocabulary_file does not support Int type")
    else:
      id_feature = fc.sparse_column_with_vocabulary_file(
          column_name=name,
          vocabulary_file=feature_conf["vocabulary_file"],
          num_oov_buckets=feature_conf["num_oov_buckets"],
          vocab_size=feature_conf["vocab_size"],
      )
      return _EmbeddingColumn(
          id_feature,
          dimension=feature_conf['embedding_dimension'],
          combiner=_get_combiner(feature_conf),
          shared_embedding_name=feature_conf.get('shared_name', None),
          max_norm=None)
  elif "embedding_dimension" in feature_conf \
      and "boundaries" in feature_conf:
    # Bucketized real-valued column with an embedding over the buckets.
    return embedding_bucketized_column(
        layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ]),
        boundaries=[
            float(b) for b in feature_conf['boundaries'].split(',')
        ],
        embedding_dimension=feature_conf["embedding_dimension"],
        max_norm=None,
        shared_name=feature_conf.get('shared_name', None),
        add_random=feature_conf.get('add_random', False))
  elif "embedding_dimension" not in feature_conf \
      and "boundaries" in feature_conf:
    # Plain bucketized real-valued column.
    return layers.bucketized_column(
        layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ]),
        boundaries=[
            float(b) for b in feature_conf['boundaries'].split(',')
        ])
  else:
    # Dense real-valued column, optionally L2-normalized.
    return layers.real_valued_column(
        column_name=name,
        dimension=feature_conf.get('dimension', 1),
        default_value=[
            0.0 for _ in range(int(feature_conf.get('dimension', 1)))
        ],
        normalizer=None if 'l2_norm' not in feature_conf
        else lambda x: tf.nn.l2_normalize(x, dim=-1))
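# Hypothetical usage sketch (not part of the original module): assuming
# feature_name_key == "feature_name" and value_type_key == "value_type", a config
# carrying "hash_bucket_size" plus "embedding_dimension" takes the hashed-embedding
# branch of gen_feature and returns an _EmbeddingColumn over a hashed sparse column.
# Also assumes _get_combiner falls back to a default when no combiner is configured.
example_conf = {
    "feature_name": "user_id",
    "value_type": "String",
    "hash_bucket_size": 100000,
    "embedding_dimension": 16,
}
user_id_embedding = gen_feature(example_conf)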