def test_table(self):
  initializer = lookup_ops.TextFileInitializer(
      self._vocab_path,
      key_dtype=dtypes.string,
      key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
      value_dtype=dtypes.int64,
      value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
  root = util.Checkpoint(
      table=lookup_ops.HashTable(initializer, default_value=-1))
  root.table_user = def_function.function(
      root.table.lookup,
      input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
  self.assertEqual(
      2, self.evaluate(root.table_user(constant_op.constant("gamma"))))
  save_dir = os.path.join(self.get_temp_dir(), "saved_model")
  save.save(root, save_dir)
  file_io.delete_file(self._vocab_path)
  self.assertAllClose({"output_0": [2, 0]},
                      _import_and_infer(save_dir,
                                        {"keys": ["gamma", "alpha"]}))
  second_dir = os.path.join(self.get_temp_dir(), "second_dir")
  # Asset paths should track the location the SavedModel is loaded from.
  file_io.rename(save_dir, second_dir)
  self.assertAllClose({"output_0": [2, 1]},
                      _import_and_infer(second_dir,
                                        {"keys": ["gamma", "beta"]}))
def textFileInitializer(self, vals):
  file = os.path.join(self.get_temp_dir(), "text_file_initializer")
  with open(file, "w") as f:
    f.write("\n".join(str(v) for v in vals) + "\n")
  return lookup_ops.TextFileInitializer(
      file, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
      dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
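# A minimal standalone sketch (not part of the original snippet; the values
# and temp path are illustrative assumptions) of what the helper above
# produces: the table is keyed by line number, so looking up index i returns
# the integer parsed from line i of the file.
import os
import tempfile

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import lookup_ops

vals = [10, 20, 30]
path = os.path.join(tempfile.mkdtemp(), "text_file_initializer")
with open(path, "w") as f:
  f.write("\n".join(str(v) for v in vals) + "\n")

initializer = lookup_ops.TextFileInitializer(
    path, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
    dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
table = lookup_ops.StaticHashTable(initializer, default_value=-1)

# Key 1 is the second line of the file; key 5 is absent and falls back to -1.
# Expected result (eager mode): [10, 20, -1]
print(table.lookup(constant_op.constant([0, 1, 5], dtype=dtypes.int64)))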
def _v1_asset_saved_model(self):
  export_graph = ops.Graph()
  vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
  with open(vocab_path, "w") as f:
    f.write("alpha\nbeta\ngamma\n")
  with export_graph.as_default():
    initializer = lookup_ops.TextFileInitializer(
        vocab_path,
        key_dtype=dtypes.string,
        key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
        value_dtype=dtypes.int64,
        value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
    table = lookup_ops.HashTable(initializer, default_value=-1)
    start = array_ops.placeholder(
        shape=None, dtype=dtypes.string, name="in")
    output = table.lookup(start, name="out")
    with session_lib.Session() as session:
      session.run([table.initializer])
      path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
      simple_save.simple_save(
          session,
          path,
          inputs={"start": start},
          outputs={"output": output},
          legacy_init_op=table.initializer)
  file_io.delete_file(vocab_path)
  return path
def get_static_table(tmpdir,
                     vocab_list,
                     mask_token=None,
                     dtype=dtypes.string,
                     oov_tokens=None):
  vocabulary_file = os.path.join(tmpdir, "tmp_vocab.txt")

  if dtype == dtypes.string:
    with open(vocabulary_file, "w") as f:
      f.write("\n".join(vocab_list) + "\n")
  else:
    with open(vocabulary_file, "w") as f:
      f.write("\n".join([str(v) for v in vocab_list]) + "\n")

  offset = ((0 if mask_token is None else 1) +
            (len(oov_tokens) if oov_tokens is not None else 0))
  init = lookup_ops.TextFileInitializer(
      vocabulary_file,
      dtype,
      lookup_ops.TextFileIndex.WHOLE_LINE,
      dtypes.int64,
      lookup_ops.TextFileIndex.LINE_NUMBER,
      value_index_offset=offset)
  if context.executing_eagerly():
    table = lookup_ops.StaticHashTable(init, default_value=-7)
  else:
    table = lookup_ops.StaticHashTableV1(init, default_value=-7)

  return table_utils.TableHandler(
      table,
      oov_tokens,
      mask_token=mask_token,
      use_v1_apis=(not context.executing_eagerly()))
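# A hedged illustration of the offset arithmetic above (the vocab list, mask
# token, and OOV index are assumptions for this example, not from the
# original test): with a mask token and one OOV index, offset = 1 + 1 = 2,
# so the token on file line i resolves to id i + 2. "a" -> 2, "b" -> 3;
# an unknown token first hits default_value=-7 and is then redirected by the
# TableHandler to the OOV index. Hypothetically, from inside a test case:
handler = get_static_table(
    tmpdir=self.get_temp_dir(),
    vocab_list=["a", "b"],
    mask_token="",
    oov_tokens=[1])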
def __init__(self):
  self.asset = asset.Asset(
      test.test_src_dir_path(
          "cc/saved_model/testdata/static_hashtable_asset.txt"))
  self.table = lookup_ops.StaticHashTable(
      lookup_ops.TextFileInitializer(
          self.asset, dtypes.string,
          lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
          lookup_ops.TextFileIndex.LINE_NUMBER), -1)
def test_untracked_table_useful_message(self):
  root = module.Module()
  initializer = lookup_ops.TextFileInitializer(
      self._vocab_path,
      key_dtype=dtypes.string,
      key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
      value_dtype=dtypes.int64,
      value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
  table = lookup_ops.HashTable(initializer, default_value=-1)
  root.table_user = def_function.function(
      table.lookup,
      input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
  root.table_user(constant_op.constant("gamma"))
  save_dir = os.path.join(self.get_temp_dir(), "saved_model")
  with self.assertRaisesRegexp(AssertionError, "HashTable"):
    save.save(root, save_dir)
def make_initializer(self, init_source, vals):
  if init_source == "textfile":
    file = os.path.join(self.get_temp_dir(), "text_file_initializer")
    with open(file, "w") as f:
      f.write("\n".join(str(v) for v in vals) + "\n")
    return lookup_ops.TextFileInitializer(
        filename=file,
        key_dtype=dtypes.int64,
        key_index=lookup_ops.TextFileIndex.LINE_NUMBER,
        value_dtype=dtypes.int64,
        value_index=lookup_ops.TextFileIndex.WHOLE_LINE)
  elif init_source == "keyvaluetensor":
    keys_tensor = constant_op.constant(
        list(range(len(vals))), dtype=dtypes.int64)
    vals_tensor = constant_op.constant(vals)
    return lookup_ops.KeyValueTensorInitializer(keys_tensor, vals_tensor)
  else:
    raise ValueError("Unrecognized init_source: " + init_source)
def test_table(self, cycles):
  # TODO(b/123408779): Handle generic TrackableResources and enable this test
  self.skipTest("Need to handle generic TrackableResources")
  vocab_path = self._make_asset("alpha\nbeta\ngamma\n")
  initializer = lookup_ops.TextFileInitializer(
      vocab_path,
      key_dtype=dtypes.string,
      key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
      value_dtype=dtypes.int64,
      value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
  root = util.Checkpoint(
      table=lookup_ops.HashTable(initializer, default_value=-1))
  root.table_user = def_function.function(
      root.table.lookup,
      input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
  self.assertEqual(2, root.table_user(constant_op.constant("gamma")).numpy())
  imported = self.cycle(root, cycles)
  self.assertEqual(
      2, imported.table_user(constant_op.constant("gamma")).numpy())
def __init__(self, init_source, filepath):
  vals = [0, 1, 2]
  if init_source == "textfile":
    with open(filepath, "w") as f:
      f.write("\n".join(str(v) for v in vals) + "\n")
    self.initializer = lookup_ops.TextFileInitializer(
        filepath, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
        dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
  else:
    keys_tensor = constant_op.constant(
        list(range(len(vals))), dtype=dtypes.int64)
    vals_tensor = constant_op.constant(vals)
    self.initializer = lookup_ops.KeyValueTensorInitializer(
        keys_tensor, vals_tensor)
  self.table = lookup_ops.StaticHashTable(
      self.initializer, default_value=-2)
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  shutil.rmtree(FLAGS.saved_model_path)

  variable_scope.enable_resource_variables()

  # Create the graph
  table_initializer = lookup_ops.TextFileInitializer(
      write_vocabulary_file(['cat', 'is', 'on', 'the', 'mat']),
      dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
      dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
  table = lookup_ops.StaticVocabularyTable(
      table_initializer, num_oov_buckets=10)

  key = array_ops.placeholder(dtypes.string, shape=(), name='input')
  result = table.lookup(key)

  sess = session.Session()
  sess.run(variables.global_variables_initializer())

  sm_builder = builder.SavedModelBuilder(FLAGS.saved_model_path)
  tensor_info_x = utils.build_tensor_info(key)
  tensor_info_r = utils.build_tensor_info(result)

  toy_signature = signature_def_utils.build_signature_def(
      inputs={'x': tensor_info_x},
      outputs={'r': tensor_info_r},
      method_name=signature_constants.PREDICT_METHOD_NAME)

  sm_builder.add_meta_graph_and_variables(
      sess, [tag_constants.SERVING],
      signature_def_map={
          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: toy_signature,
      },
      main_op=lookup_ops.tables_initializer(),
      assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
      strip_default_attrs=True)
  sm_builder.save()
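# A hedged companion sketch (not part of the original tool) showing how the
# SavedModel written by main() could be read back. It assumes `loader` is
# imported from tensorflow.python.saved_model alongside the other modules
# used above.
def load_and_lookup():
  # Restore the metagraph; loader.load also runs the main_op, which
  # initializes the vocabulary table from the bundled asset file.
  with session.Session(graph=ops.Graph()) as sess:
    meta_graph = loader.load(sess, [tag_constants.SERVING],
                             FLAGS.saved_model_path)
    sig = meta_graph.signature_def[
        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    # 'cat' is line 0 of the vocabulary, so this should print 0.
    print(sess.run(sig.outputs['r'].name,
                   feed_dict={sig.inputs['x'].name: 'cat'}))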
def bm_adapt_implementation(self, num_elements, batch_size):
  """Benchmark the KPL forward-pass implementation."""
  vocab = get_vocab()
  vocab_file = self._write_to_temp_file("vocab", vocab)
  vocabulary_initializer = lookup_ops.TextFileInitializer(
      filename=vocab_file,
      key_dtype=dtypes.string,
      key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
      value_dtype=dtypes.int64,
      value_index=lookup_ops.TextFileIndex.LINE_NUMBER,
      value_index_offset=2)

  input_t = keras.Input(shape=(), dtype=dtypes.string)
  layer = index_lookup.IndexLookup(
      vocabulary=vocabulary_initializer,
      max_tokens=None,
      num_oov_indices=1,
      mask_token="",
      oov_token="OOV",
      dtype=dtypes.string)
  out_t = layer(input_t)
  model = keras.Model(input_t, out_t)

  num_repeats = 5
  starts = []
  ends = []
  data = tensor_gen(batch_size, num_elements)
  _ = model(data)
  for _ in range(num_repeats):
    starts.append(time.time())
    _ = model(data)
    ends.append(time.time())
  avg_time = np.mean(np.array(ends) - np.array(starts))

  baseline, _ = self.run_numpy_implementation(data, vocab)
  extras = {
      "numpy implementation baseline": baseline,
      "delta seconds": (baseline - avg_time),
      "delta percent": ((baseline - avg_time) / baseline) * 100
  }
  name = "index_lookup_forward|%s_elements|batch_%s" % (num_elements,
                                                        batch_size)
  self.report_benchmark(
      iters=num_repeats, wall_time=avg_time, extras=extras, name=name)
def testDistributeLookupTable(self, init_from_file):
  cluster = data_service_test_base.TestCluster(num_workers=1)
  if init_from_file:
    file = os.path.join(self.get_temp_dir(), "distribute_lookup_table")
    with open(file, "w") as f:
      f.write("10\n11\n")
    initializer = lookup_ops.TextFileInitializer(
        file, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
        dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
  else:
    keys_tensor = constant_op.constant([0, 1], dtype=dtypes.int64)
    vals_tensor = constant_op.constant([10, 11])
    initializer = lookup_ops.KeyValueTensorInitializer(
        keys_tensor, vals_tensor)

  table = lookup_ops.StaticHashTable(initializer, -1)
  ds = dataset_ops.Dataset.range(3)
  ds = ds.map(table.lookup)
  ds = self.make_distributed_dataset(ds, cluster)
  self.evaluate(lookup_ops.tables_initializer())
  self.assertDatasetProduces(ds, [10, 11, -1], requires_initialization=True)
def testLookupTableGraphSerialization(self, init_from_file):
  if init_from_file:
    file = os.path.join(self.get_temp_dir(), "lookup_table_graph_serialize")
    with open(file, "w") as f:
      f.write("10\n11\n")
    initializer = lookup_ops.TextFileInitializer(
        file, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
        dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
  else:
    keys_tensor = constant_op.constant([0, 1], dtype=dtypes.int64)
    vals_tensor = constant_op.constant([10, 11])
    initializer = lookup_ops.KeyValueTensorInitializer(
        keys_tensor, vals_tensor)

  table = lookup_ops.StaticHashTable(initializer, -1)
  dataset = dataset_ops.Dataset.range(3)
  dataset = dataset.map(table.lookup)
  self.evaluate(lookup_ops.tables_initializer())
  round_tripped = self.graphRoundTrip(dataset)
  del table
  del dataset
  self.assertDatasetProduces(
      round_tripped, [10, 11, -1], requires_initialization=True)
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
  # If max_tokens is set, the value must be greater than 1 - otherwise we
  # are creating a 0-element vocab, which doesn't make sense.
  if max_tokens is not None and max_tokens <= 1:
    raise ValueError("If set, `max_tokens` must be greater than 1. "
                     "You passed {}".format(max_tokens))

  if num_oov_indices < 0:
    raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                     "You passed {}".format(num_oov_indices))

  # Support deprecated names for output_modes.
  if output_mode == "binary":
    output_mode = MULTI_HOT
  if output_mode == "tf-idf":
    output_mode = TF_IDF
  # 'output_mode' must be one of (INT, MULTI_HOT, COUNT, TF_IDF)
  layer_utils.validate_string_arg(
      output_mode,
      allowable_strings=(INT, MULTI_HOT, COUNT, TF_IDF),
      layer_name=self.__class__.__name__,
      arg_name="output_mode")

  if invert and output_mode != INT:
    raise ValueError("`output_mode` must be {} when `invert` is true. You "
                     "passed {}".format(INT, output_mode))

  self.invert = invert
  self.max_tokens = max_tokens
  self.num_oov_indices = num_oov_indices
  self.output_mode = output_mode
  self.sparse = sparse
  self.pad_to_max_tokens = pad_to_max_tokens
  self._called = False

  # A note on vocab_size: we need to always keep a non-Tensor representation
  # of vocab_size around to use in graph building. Because we might be
  # in a tf.function, we can't rely on evaluating the actual tables to
  # find the value either.
  self._vocab_size = None
  # We need to keep track of our current vocab size outside of our layer
  # weights to support a static output shape when `output_mode != INT`. The
  # bincount ops do not set shape on their outputs, which means we have to
  # set it ourselves. We persist the current vocab size as a hidden part of
  # the config when serializing our model.
  if "vocabulary_size" in kwargs:
    self._vocab_size = kwargs["vocabulary_size"]
    del kwargs["vocabulary_size"]

  restore_from_static_table = kwargs.pop("has_static_table", False)

  # Make sure the mask token and oov token are truly of the dtype we want. We
  # can ignore strings here, because they have only one dtype.
  dtype = kwargs["dtype"]
  if dtype == dtypes.int32:
    mask_token = None if mask_token is None else np.int32(mask_token)
    oov_token = None if oov_token is None else np.int32(oov_token)
  elif dtype == dtypes.int64:
    mask_token = None if mask_token is None else np.int64(mask_token)
    oov_token = None if oov_token is None else np.int64(oov_token)
  self.mask_token = mask_token
  self.oov_token = oov_token

  if max_tokens is not None:
    available_vocab_size = max_tokens - self._token_start_index()
  else:
    available_vocab_size = None

  super(IndexLookup, self).__init__(
      combiner=_IndexLookupCombiner(
          vocab_size=available_vocab_size,
          mask_value=mask_token,
          oov_value=oov_token,
          compute_idf=(output_mode == TF_IDF)),
      **kwargs)

  # We need to save the key dtype so that we know if we're expecting int64
  # keys. If we are, we will cast int32 inputs to int64 as well.
  if invert:
    self._key_dtype = dtypes.int64
    self._value_dtype = self.dtype
    self._mask_key = 0
    self._mask_value = mask_token
    key_index = lookup_ops.TextFileIndex.LINE_NUMBER
    value_index = lookup_ops.TextFileIndex.WHOLE_LINE
    default_value = self.oov_token
    oov_indices = None
  else:
    self._key_dtype = self.dtype
    self._value_dtype = dtypes.int64
    self._mask_key = mask_token
    key_index = lookup_ops.TextFileIndex.WHOLE_LINE
    value_index = lookup_ops.TextFileIndex.LINE_NUMBER
    # Masks should map to 0 for int output and be dropped otherwise. Max ints
    # will be dropped from the bincount op.
    self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
    oov_start = self._oov_start_index()
    token_start = self._token_start_index()
    if self.num_oov_indices == 0:
      # If there are no OOV indices, we map OOV tokens to -1 and error out
      # during call if we find a negative index.
      default_value = -1
      oov_indices = None
    elif self.num_oov_indices == 1:
      # If there is only one OOV index, we can set that index as the default
      # value of the index_lookup table.
      default_value = oov_start
      oov_indices = None
    else:
      # If we have multiple OOV values, we need to do a further hashing step;
      # to make this easier, we set the OOV value to -1. (This lets us do a
      # vectorized add and cast to boolean to determine locations where we
      # need to do extra hashing.)
      default_value = -1
      oov_indices = list(range(oov_start, token_start))

  self._static_vocabulary_path = None
  has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
  if has_vocab_path or restore_from_static_table:
    self._has_static_table = True
    if vocabulary is None:
      # If we're restoring a layer that was saved with a static table
      # initializer, we create a fake initializer object to let the code
      # progress. The savedmodel restoration code will handle restoring
      # the actual data.
      initializer = _NullInitializer(self._key_dtype, self._value_dtype)
    else:
      if not gfile.Exists(vocabulary):
        raise ValueError("Vocabulary file %s does not exist." % (vocabulary,))
      self._static_vocabulary_path = vocabulary
      num_tokens = table_utils.num_tokens_in_file(vocabulary)
      self._vocab_size = self._token_start_index() + num_tokens

      initializer = lookup_ops.TextFileInitializer(
          filename=vocabulary,
          key_dtype=self._key_dtype,
          key_index=key_index,
          value_dtype=self._value_dtype,
          value_index=value_index,
          value_index_offset=self._token_start_index())

    self._table = lookup_ops.StaticHashTable(
        initializer, default_value=default_value)
    self._table_handler = table_utils.TableHandler(
        table=self._table,
        mask_token=self._mask_key if self.mask_token is not None else None,
        mask_value=self._mask_value,
        oov_tokens=oov_indices)

    tracked_table = self._add_trackable(self._table, trainable=False)

  else:
    self._has_static_table = False
    self._table = lookup_ops.MutableHashTable(
        key_dtype=self._key_dtype,
        value_dtype=self._value_dtype,
        default_value=default_value,
        name=(self._name + "_index_table"))
    self._table_handler = table_utils.TableHandler(
        table=self._table, oov_tokens=oov_indices)
    if vocabulary is not None:
      self.set_vocabulary(vocabulary)
    tracked_table = self._add_trackable(self._table, trainable=False)

  if self.output_mode == TF_IDF:
    # The TF-IDF weight may have a (None,) tensorshape. This creates
    # a 1D variable with arbitrary shape, which we can assign any weight to
    # so long as it has 1 dimension. In order to properly initialize this
    # weight in Keras, we need to provide a custom callable initializer which
    # does not depend on the shape of the weight (as all other initializers
    # do) since the weight is not known. Hence the lambda shape, dtype: [0].
    if not self.pad_to_max_tokens or max_tokens is None:
      initializer = lambda shape, dtype: [0]
    else:
      initializer = init_ops.zeros_initializer

    # We are adding these here instead of in build() since they do not depend
    # on the input shape at all.
    idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
    self.tf_idf_weights = self._add_state_variable(
        name="idf",
        shape=tensor_shape.TensorShape(idf_shape),
        dtype=backend.floatx(),
        initializer=initializer)

  # This is a workaround for summary() on this layer. Because the table is
  # not mutable during training, the effective number of parameters (and so
  # the weight shape) is 0; we add this as an attr so that the parameter
  # counting code in the Model object doesn't throw an attribute error.
  tracked_table.shape = tensor_shape.TensorShape((0,))
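# The shape-agnostic initializer trick described in the TF-IDF comments above
# can be shown in isolation. This is a standalone sketch (not from the
# original source) using a plain tf.Variable as a stand-in for
# _add_state_variable: because the callable ignores the requested shape,
# a 1D variable with an unknown length can still be created and later
# assigned values of any length.
import tensorflow as tf

init = lambda shape, dtype: [0]  # ignores the requested shape

v = tf.Variable(
    initial_value=init(None, tf.float32),
    shape=tf.TensorShape((None,)),  # 1D, arbitrary length
    dtype=tf.float32)
v.assign([0.5, 1.2, 3.0])  # any 1D value can be assigned later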
def skip_gram_sample_with_text_vocab(input_tensor,
                                     vocab_freq_file,
                                     vocab_token_index=0,
                                     vocab_token_dtype=tf.dtypes.string,
                                     vocab_freq_index=1,
                                     vocab_freq_dtype=tf.dtypes.float64,
                                     vocab_delimiter=",",
                                     vocab_min_count=0,
                                     vocab_subsampling=None,
                                     corpus_size=None,
                                     min_skips=1,
                                     max_skips=5,
                                     start=0,
                                     limit=-1,
                                     emit_self_as_target=False,
                                     batch_size=None,
                                     batch_capacity=None,
                                     seed=None,
                                     name=None):
  """Skip-gram sampling with a text vocabulary file.

  Wrapper around `skip_gram_sample()` for use with a text vocabulary file.
  The vocabulary file is expected to be a plain-text file, with lines of
  `vocab_delimiter`-separated columns. The `vocab_token_index` column should
  contain the vocabulary term, while the `vocab_freq_index` column should
  contain the number of times that term occurs in the corpus. For example,
  with a text vocabulary file of:

  ```
  bonjour,fr,42
  hello,en,777
  hola,es,99
  ```

  You should set `vocab_delimiter=","`, `vocab_token_index=0`, and
  `vocab_freq_index=2`.

  See `skip_gram_sample()` documentation for more details about the
  skip-gram sampling process.

  Args:
    input_tensor: A rank-1 `Tensor` from which to generate skip-gram
      candidates.
    vocab_freq_file: `string` specifying full file path to the text vocab
      file.
    vocab_token_index: `int` specifying which column in the text vocab file
      contains the tokens.
    vocab_token_dtype: `DType` specifying the format of the tokens in the
      text vocab file.
    vocab_freq_index: `int` specifying which column in the text vocab file
      contains the frequency counts of the tokens.
    vocab_freq_dtype: `DType` specifying the format of the frequency counts
      in the text vocab file.
    vocab_delimiter: `string` specifying the delimiter used in the text
      vocab file.
    vocab_min_count: `int`, `float`, or scalar `Tensor` specifying minimum
      frequency threshold (from `vocab_freq_file`) for a token to be kept in
      `input_tensor`. This should correspond with `vocab_freq_dtype`.
    vocab_subsampling: (Optional) `float` specifying frequency proportion
      threshold for tokens from `input_tensor`. Tokens that occur more
      frequently will be randomly down-sampled. Reasonable starting values
      may be around 1e-3 or 1e-5. See Eq. 5 in
      http://arxiv.org/abs/1310.4546 for more details.
    corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying
      the total number of tokens in the corpus (e.g., sum of all the
      frequency counts of `vocab_freq_file`). Used with `vocab_subsampling`
      for down-sampling frequently occurring tokens. If this is specified,
      `vocab_freq_file` and `vocab_subsampling` must also be specified.
      If `corpus_size` is needed but not supplied, then it will be
      calculated from `vocab_freq_file`. You might want to supply your own
      value if you have already eliminated infrequent tokens from your
      vocabulary files (where frequency < vocab_min_count) to save memory in
      the internal token lookup table. Otherwise, the unused tokens'
      variables will waste memory. The user-supplied `corpus_size` value
      must be greater than or equal to the sum of all the frequency counts
      of `vocab_freq_file`.
    min_skips: `int` or scalar `Tensor` specifying the minimum window size
      to randomly use for each token. Must be >= 0 and <= `max_skips`. If
      `min_skips` and `max_skips` are both 0, the only label outputted will
      be the token itself.
    max_skips: `int` or scalar `Tensor` specifying the maximum window size
      to randomly use for each token. Must be >= 0.
    start: `int` or scalar `Tensor` specifying the position in
      `input_tensor` from which to start generating skip-gram candidates.
    limit: `int` or scalar `Tensor` specifying the maximum number of
      elements in `input_tensor` to use in generating skip-gram candidates.
      -1 means to use the rest of the `Tensor` after `start`.
    emit_self_as_target: `bool` or scalar `Tensor` specifying whether to
      emit each token as a label for itself.
    batch_size: (Optional) `int` specifying batch size of returned
      `Tensors`.
    batch_capacity: (Optional) `int` specifying batch capacity for the queue
      used for batching returned `Tensors`. Only has an effect if
      `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
    seed: (Optional) `int` used to create a random seed for window size and
      subsampling. See
      [`set_random_seed`](../../g3doc/python/constant_op.md#set_random_seed)
      for behavior.
    name: (Optional) A `string` name or a name scope for the operations.

  Returns:
    A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is
    of rank-1 and has the same type as `input_tensor`. The `Tensors` will be
    of length `batch_size`; if `batch_size` is not specified, they will be
    of random length, though they will be in sync with each other as long as
    they are evaluated together.

  Raises:
    ValueError: If `vocab_token_index` or `vocab_freq_index` is less than 0
      or exceeds the number of columns in `vocab_freq_file`. If
      `vocab_token_index` and `vocab_freq_index` are both set to the same
      column. If any token in `vocab_freq_file` has a negative frequency.
  """
  if vocab_token_index < 0 or vocab_freq_index < 0:
    raise ValueError(
        "vocab_token_index={} and vocab_freq_index={} must both be >= 0.".
        format(vocab_token_index, vocab_freq_index))
  if vocab_token_index == vocab_freq_index:
    raise ValueError(
        "vocab_token_index and vocab_freq_index should be different, but are "
        "both {}.".format(vocab_token_index))

  # Iterates through the vocab file and calculates the number of vocab terms
  # as well as the total corpus size (by summing the frequency counts of all
  # the vocab terms).
  calculated_corpus_size = 0.0
  vocab_size = 0
  with tf.io.gfile.GFile(vocab_freq_file, mode="r") as f:
    reader = csv.reader(f, delimiter=vocab_delimiter)
    for row in reader:
      if vocab_token_index >= len(row) or vocab_freq_index >= len(row):
        raise ValueError(
            "Row in vocab file only has {} columns, so vocab_token_index={} "
            "or vocab_freq_index={} is out of bounds. Row content: {}".format(
                len(row), vocab_token_index, vocab_freq_index, row))
      vocab_size += 1
      freq = vocab_freq_dtype.as_numpy_dtype(row[vocab_freq_index])
      if freq < 0:
        raise ValueError(
            "Row in vocab file has negative frequency of {}. Row content: {}"
            .format(freq, row))
      # Note: tokens whose frequencies are below vocab_min_count will still
      # contribute to the total corpus size used for vocab subsampling.
      calculated_corpus_size += freq

  if not corpus_size:
    corpus_size = calculated_corpus_size
  elif calculated_corpus_size - corpus_size > 1e-6:
    raise ValueError(
        "`corpus_size`={} must be greater than or equal to the sum of all the "
        "frequency counts ({}) of `vocab_freq_file` ({}).".format(
            corpus_size, calculated_corpus_size, vocab_freq_file))

  vocab_freq_table = lookup_ops.HashTable(
      lookup_ops.TextFileInitializer(
          filename=vocab_freq_file,
          key_dtype=vocab_token_dtype,
          key_index=vocab_token_index,
          value_dtype=vocab_freq_dtype,
          value_index=vocab_freq_index,
          vocab_size=vocab_size,
          delimiter=vocab_delimiter),
      # For vocab terms not in vocab file, use a default value of -1.
      default_value=-1)

  return skip_gram_sample(
      input_tensor,
      min_skips=min_skips,
      max_skips=max_skips,
      start=start,
      limit=limit,
      emit_self_as_target=emit_self_as_target,
      vocab_freq_table=vocab_freq_table,
      vocab_min_count=vocab_min_count,
      vocab_subsampling=vocab_subsampling,
      # corpus_size is not used unless vocab_subsampling is specified.
      corpus_size=None if vocab_subsampling is None else corpus_size,
      batch_size=batch_size,
      batch_capacity=batch_capacity,
      seed=seed,
      name=name)
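# A hedged usage sketch mirroring the docstring's vocabulary example (the
# file path, input tokens, and thresholds are illustrative assumptions, and
# like the rest of this API the call is meant for a TF1 graph/session
# context). Assuming vocab.csv contains the docstring example:
#   bonjour,fr,42
#   hello,en,777
#   hola,es,99
tokens, labels = skip_gram_sample_with_text_vocab(
    input_tensor=tf.constant(["hello", "hola", "bonjour"]),
    vocab_freq_file="vocab.csv",
    vocab_token_index=0,
    vocab_freq_index=2,
    vocab_delimiter=",",
    vocab_min_count=50,  # drops "bonjour" (freq 42) from candidate tokens
    min_skips=1,
    max_skips=2)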
def __init__(self,
             max_tokens,
             num_oov_indices,
             mask_token,
             oov_token,
             vocabulary=None,
             invert=False,
             output_mode=INT,
             sparse=False,
             pad_to_max_tokens=False,
             **kwargs):
  # If max_tokens is set, the value must be greater than 1 - otherwise we
  # are creating a 0-element vocab, which doesn't make sense.
  if max_tokens is not None and max_tokens <= 1:
    raise ValueError("If set, `max_tokens` must be greater than 1. "
                     "You passed {}".format(max_tokens))

  if num_oov_indices < 0:
    raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                     "You passed {}".format(num_oov_indices))

  # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
  layer_utils.validate_string_arg(
      output_mode,
      allowable_strings=(INT, BINARY, COUNT, TFIDF),
      layer_name=self.__class__.__name__,
      arg_name="output_mode")

  if invert and output_mode != INT:
    raise ValueError("`output_mode` must be {} when `invert` is true. You "
                     "passed {}".format(INT, output_mode))

  self.invert = invert
  self.max_tokens = max_tokens
  self.num_oov_indices = num_oov_indices
  self.oov_token = oov_token
  self.mask_token = mask_token
  self.output_mode = output_mode
  self.sparse = sparse
  self.pad_to_max_tokens = pad_to_max_tokens
  self._called = False
  self._vocab_size = 0
  # We need to keep track of our current vocab size outside of our layer
  # weights to support a static output shape when `output_mode != INT`. The
  # bincount ops do not set shape on their outputs, which means we have to
  # set it ourselves. We persist the current vocab size as a hidden part of
  # the config when serializing our model.
  if "vocabulary_size" in kwargs:
    self._vocab_size = kwargs["vocabulary_size"]
    del kwargs["vocabulary_size"]

  if max_tokens is not None:
    available_vocab_size = max_tokens - self._token_start_index()
  else:
    available_vocab_size = None

  super(IndexLookup, self).__init__(
      combiner=_IndexLookupCombiner(
          vocab_size=available_vocab_size,
          mask_value=mask_token,
          oov_value=oov_token,
          compute_idf=(output_mode == TFIDF)),
      **kwargs)

  # We need to save the key dtype so that we know if we're expecting int64
  # keys. If we are, we will cast int32 inputs to int64 as well.
  if invert:
    self._key_dtype = dtypes.int64
    self._value_dtype = self.dtype
    self._mask_key = 0
    self._mask_value = mask_token
    key_index = lookup_ops.TextFileIndex.LINE_NUMBER
    value_index = lookup_ops.TextFileIndex.WHOLE_LINE
    default_value = self.oov_token
    oov_indices = None
  else:
    self._key_dtype = self.dtype
    self._value_dtype = dtypes.int64
    self._mask_key = mask_token
    key_index = lookup_ops.TextFileIndex.WHOLE_LINE
    value_index = lookup_ops.TextFileIndex.LINE_NUMBER
    # Masks should map to 0 for int output and be dropped otherwise. Max ints
    # will be dropped from the bincount op.
    self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
    oov_start = self._oov_start_index()
    token_start = self._token_start_index()
    if self.num_oov_indices == 0:
      # If there are no OOV indices, we map OOV tokens to -1 for int output
      # and drop them from bagged output. Max ints will be dropped from the
      # bincount op.
      default_value = -1 if self.output_mode == INT else dtypes.int64.max
      oov_indices = None
    elif self.num_oov_indices == 1:
      # If there is only one OOV index, we can set that index as the default
      # value of the index_lookup table.
      default_value = oov_start
      oov_indices = None
    else:
      # If we have multiple OOV values, we need to do a further hashing step;
      # to make this easier, we set the OOV value to -1. (This lets us do a
      # vectorized add and cast to boolean to determine locations where we
      # need to do extra hashing.)
      default_value = -1
      oov_indices = list(range(oov_start, token_start))

  if vocabulary is not None and isinstance(vocabulary, str):
    if not os.path.exists(vocabulary):
      raise ValueError("Vocabulary file %s does not exist." % vocabulary)
    total_offset = 0 if mask_token is None else 1
    total_offset += num_oov_indices
    initializer = lookup_ops.TextFileInitializer(
        filename=vocabulary,
        key_dtype=self._key_dtype,
        key_index=key_index,
        value_dtype=self._value_dtype,
        value_index=value_index,
        value_index_offset=total_offset)

    self._table = self._static_table_class()(
        initializer, default_value=default_value)
    self._table_handler = table_utils.TableHandler(
        table=self._table,
        mask_token=self._mask_key,
        mask_value=self._mask_value,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())
    self.max_tokens = (
        self._table_handler.table_size() + self.num_oov_indices +
        (0 if mask_token is None else 1))
  else:
    self._table = lookup_ops.MutableHashTable(
        key_dtype=self._key_dtype,
        value_dtype=self._value_dtype,
        default_value=default_value,
        name=(self._name + "_index_table"))
    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())
    if vocabulary is not None:
      self.set_vocabulary(vocabulary)

  if self.output_mode == TFIDF:
    # The TF-IDF weight may have a (None,) tensorshape. This creates
    # a 1D variable with arbitrary shape, which we can assign any weight to
    # so long as it has 1 dimension. In order to properly initialize this
    # weight in Keras, we need to provide a custom callable initializer which
    # does not depend on the shape of the weight (as all other initializers
    # do) since the weight is not known. Hence the lambda shape, dtype: [0].
    if not self.pad_to_max_tokens or max_tokens is None:
      initializer = lambda shape, dtype: [0]
    else:
      initializer = init_ops.zeros_initializer

    # We are adding these here instead of in build() since they do not depend
    # on the input shape at all.
    idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
    self.tf_idf_weights = self._add_state_variable(
        name="idf",
        shape=tensor_shape.TensorShape(idf_shape),
        dtype=K.floatx(),
        initializer=initializer)

  tracked_table = self._add_trackable(self._table, trainable=False)
  # This is a workaround for summary() on this layer. Because the table is
  # not mutable during training, the effective number of parameters (and so
  # the weight shape) is 0; we add this as an attr so that the parameter
  # counting code in the Model object doesn't throw an attribute error.
  tracked_table.shape = tensor_shape.TensorShape((0,))