def testSparseColumnWithVocabularyFile(self):
    """Checks dtype handling of sparse_column_with_vocabulary_file."""
    # Default dtype is string; the lookup config records file and size.
    column = fc.sparse_column_with_vocabulary_file(
        "bbb", vocabulary_file="a_file", vocab_size=454)
    self.assertEqual(dtypes.string, column.dtype)
    self.assertEqual(454, column.lookup_config.vocab_size)
    self.assertEqual("a_file", column.lookup_config.vocabulary_file)

    with self.assertRaises(ValueError):
        # Vocabulary size should be defined if vocabulary_file is used.
        fc.sparse_column_with_vocabulary_file(
            "bbb", vocabulary_file="somefile")

    # Integer dtypes are accepted.
    column = fc.sparse_column_with_vocabulary_file(
        "bbb", vocabulary_file="a_file", vocab_size=454,
        dtype=dtypes.int64)
    self.assertEqual(dtypes.int64, column.dtype)

    # Floating-point dtypes are rejected.
    with self.assertRaisesRegexp(ValueError,
                                 "dtype must be string or integer"):
        column = fc.sparse_column_with_vocabulary_file(
            "bbb", vocabulary_file="a_file", vocab_size=454,
            dtype=dtypes.float32)
  def testSparseColumnWithVocabularyFile(self):
    """Exercises dtype validation for vocabulary-file sparse columns."""
    col = fc.sparse_column_with_vocabulary_file("bbb",
                                                vocabulary_file="a_file",
                                                vocab_size=454)
    self.assertEqual(col.dtype, dtypes.string)
    lookup = col.lookup_config
    self.assertEqual(lookup.vocab_size, 454)
    self.assertEqual(lookup.vocabulary_file, "a_file")

    # vocab_size is mandatory when a vocabulary file is supplied.
    with self.assertRaises(ValueError):
      fc.sparse_column_with_vocabulary_file("bbb",
                                            vocabulary_file="somefile")

    # An integer dtype is allowed.
    col = fc.sparse_column_with_vocabulary_file("bbb",
                                                vocabulary_file="a_file",
                                                vocab_size=454,
                                                dtype=dtypes.int64)
    self.assertEqual(col.dtype, dtypes.int64)

    # Anything other than string/integer dtypes must be rejected.
    with self.assertRaisesRegexp(ValueError,
                                 "dtype must be string or integer"):
      col = fc.sparse_column_with_vocabulary_file("bbb",
                                                  vocabulary_file="a_file",
                                                  vocab_size=454,
                                                  dtype=dtypes.float32)
 def testWeightedSparseColumnWithVocabularyFile(self):
   """A weighted sparse column shares its id column's lookup config."""
   vocab_ids = fc.sparse_column_with_vocabulary_file(
       "ids", "a_file", num_oov_buckets=7, vocab_size=3)
   weighted = fc.weighted_sparse_column(vocab_ids, "weights")
   self.assertEqual("ids_weighted_by_weights", weighted.name)
   # The lookup config is inherited unchanged from the id column.
   lookup = weighted.lookup_config
   self.assertEqual(lookup, vocab_ids.lookup_config)
   self.assertEqual(3, lookup.vocab_size)
   self.assertEqual(7, lookup.num_oov_buckets)
   self.assertEqual("a_file", lookup.vocabulary_file)
 def testSparseColumnVocabularyDeepCopy(self):
   """Tests deepcopy of sparse_column_with_vocabulary_file."""
   original = fc.sparse_column_with_vocabulary_file(
       "a", vocabulary_file="path_to_file", vocab_size=3)
   self.assertEqual("a", original.name)
   duplicate = copy.deepcopy(original)
   self.assertEqual("a", duplicate.name)
   # The copy must preserve the full lookup configuration.
   expected = fc._SparseIdLookupConfig(  # pylint: disable=protected-access
       vocabulary_file="path_to_file",
       num_oov_buckets=0,
       vocab_size=3,
       default_value=-1)
   self.assertEqual(expected, duplicate.lookup_config)
   self.assertFalse(duplicate.is_integerized)
# --- Exemplo n.º 5 (score: 0) — separator left over from the scraped example page ---
 def testSparseColumnVocabularyDeepCopy(self):
   """Tests deepcopy of sparse_column_with_vocabulary_file."""
   source_column = fc.sparse_column_with_vocabulary_file(
       "a", vocabulary_file="path_to_file", vocab_size=3)
   self.assertEqual("a", source_column.name)
   cloned = copy.deepcopy(source_column)
   # Name, lookup config, and integerization flag all survive the copy.
   self.assertEqual("a", cloned.name)
   self.assertEqual(
       fc._SparseIdLookupConfig(  # pylint: disable=protected-access
           vocabulary_file="path_to_file",
           num_oov_buckets=0,
           vocab_size=3,
           default_value=-1),
       cloned.lookup_config)
   self.assertFalse(cloned.is_integerized)
# --- Exemplo n.º 6 (score: 0) — separator left over from the scraped example page ---
def gen_feature(feature_conf):
    """Translate one feature-config dict into a tf.contrib feature column.

    Dispatches on which keys are present in ``feature_conf``:
      * "vocab_size" (without "vocabulary_file"): keyed id column + embedding
      * "hash_bucket_size" without "embedding_dimension": hashed or
        integerized id column
      * "hash_bucket_size" + "embedding_dimension": hashed id column wrapped
        in an embedding
      * "vocabulary_file" + "embedding_dimension": vocabulary-file id column
        wrapped in an embedding (string features only)
      * "boundaries" with/without "embedding_dimension": bucketized column,
        optionally embedded
      * otherwise: plain real-valued column

    Args:
      feature_conf: dict describing one feature; must contain the name and
        value-type keys, plus the optional keys listed above.

    Returns:
      A feature column object understood by tf.contrib.layers.

    Raises:
      Exception: for an Int-typed feature configured with a vocabulary file.
    """
    name = feature_conf[feature_name_key]
    value_type = feature_conf[value_type_key]

    # Exclude vocabulary_file configs here: they also carry "vocab_size"
    # (see the vocabulary_file branch below, which reads it), so without
    # this exclusion that branch was unreachable for valid configs.
    if "vocab_size" in feature_conf and "vocabulary_file" not in feature_conf:
        # Keys must be strings when dtype is tf.string; the previous
        # bare range() of ints is rejected by sparse_column_with_keys.
        id_feature = fc.sparse_column_with_keys(
            column_name=name,
            keys=[str(key) for key in range(feature_conf['vocab_size'])],
            dtype=tf.string)

        # NOTE(review): unlike other branches this one uses the feature's own
        # name as shared_embedding_name and passes no combiner — confirm the
        # asymmetry is intentional.
        return fc._EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            shared_embedding_name=feature_conf.get(feature_name_key),
        )
    elif "hash_bucket_size" in feature_conf \
            and "embedding_dimension" not in feature_conf:
        # Raw id column, no embedding on top.
        if value_type == "Int":
            id_feature = layers.sparse_column_with_integerized_feature(
                column_name=name,
                bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        return id_feature
    elif "embedding_dimension" in feature_conf \
            and "hash_bucket_size" in feature_conf \
            and "boundaries" not in feature_conf \
            and "vocabulary_file" not in feature_conf:
        # Hashed/integerized id column wrapped in an embedding.
        if value_type == "Int":
            return _EmbeddingColumn(
                sparse_id_column=layers.sparse_column_with_integerized_feature(
                    column_name=name,
                    bucket_size=feature_conf['hash_bucket_size'],
                    combiner=_get_combiner(feature_conf),
                    # use_hashmap=use_hashmap
                ),
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None))
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                # use_hashmap=use_hashmap
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
        use_hashmap = feature_conf.get("use_hashmap", False)
        if value_type == "Int":
            raise Exception(
                "embedding with vocabulary_file does not support Int type")
        else:
            # Requires "num_oov_buckets" and "vocab_size" in the config.
            id_feature = fc.sparse_column_with_vocabulary_file(
                column_name=name,
                vocabulary_file=feature_conf["vocabulary_file"],
                num_oov_buckets=feature_conf["num_oov_buckets"],
                vocab_size=feature_conf["vocab_size"],
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" in feature_conf:
        # Continuous feature -> bucketize on boundaries -> embed the buckets.
        return embedding_bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ],
            embedding_dimension=feature_conf["embedding_dimension"],
            max_norm=None,
            shared_name=feature_conf.get('shared_name', None),
            add_random=feature_conf.get('add_random', False))
    elif "embedding_dimension" not in feature_conf \
            and "boundaries" in feature_conf:
        # Continuous feature -> bucketize on boundaries, no embedding.
        return layers.bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ])
    else:
        # Fallback: dense real-valued column, optionally L2-normalized.
        return layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ],
            normalizer=None if 'l2_norm' not in feature_conf else
            lambda x: tf.nn.l2_normalize(x, dim=-1))