Example #1
 def test_table(self):
     initializer = lookup_ops.TextFileInitializer(
         self._vocab_path,
         key_dtype=dtypes.string,
         key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
         value_dtype=dtypes.int64,
         value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
     root = util.Checkpoint(
         table=lookup_ops.HashTable(initializer, default_value=-1))
     root.table_user = def_function.function(
         root.table.lookup,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
     self.assertEqual(
         2, self.evaluate(root.table_user(constant_op.constant("gamma"))))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(root, save_dir)
     file_io.delete_file(self._vocab_path)
     self.assertAllClose({"output_0": [2, 0]},
                         _import_and_infer(save_dir,
                                           {"keys": ["gamma", "alpha"]}))
     second_dir = os.path.join(self.get_temp_dir(), "second_dir")
     # Asset paths should track the location the SavedModel is loaded from.
     file_io.rename(save_dir, second_dir)
     self.assertAllClose({"output_0": [2, 1]},
                         _import_and_infer(second_dir,
                                           {"keys": ["gamma", "beta"]}))
Example #2
 def textFileInitializer(self, vals):
     file = os.path.join(self.get_temp_dir(), "text_file_initializer")
     with open(file, "w") as f:
         f.write("\n".join(str(v) for v in vals) + "\n")
     return lookup_ops.TextFileInitializer(
         file, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
         dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
Example #3
 def _v1_asset_saved_model(self):
     export_graph = ops.Graph()
     vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
     with open(vocab_path, "w") as f:
         f.write("alpha\nbeta\ngamma\n")
     with export_graph.as_default():
         initializer = lookup_ops.TextFileInitializer(
             vocab_path,
             key_dtype=dtypes.string,
             key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
             value_dtype=dtypes.int64,
             value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
         table = lookup_ops.HashTable(initializer, default_value=-1)
         start = array_ops.placeholder(shape=None,
                                       dtype=dtypes.string,
                                       name="in")
         output = table.lookup(start, name="out")
         with session_lib.Session() as session:
             session.run([table.initializer])
             path = os.path.join(self.get_temp_dir(), "saved_model",
                                 str(ops.uid()))
             simple_save.simple_save(session,
                                     path,
                                     inputs={"start": start},
                                     outputs={"output": output},
                                     legacy_init_op=table.initializer)
     file_io.delete_file(vocab_path)
     return path
Example #4
def get_static_table(tmpdir,
                     vocab_list,
                     mask_token=None,
                     dtype=dtypes.string,
                     oov_tokens=None):
    vocabulary_file = os.path.join(tmpdir, "tmp_vocab.txt")

    if dtype == dtypes.string:
        with open(vocabulary_file, "w") as f:
            f.write("\n".join(vocab_list) + "\n")
    else:
        with open(vocabulary_file, "w") as f:
            f.write("\n".join([str(v) for v in vocab_list]) + "\n")

    offset = ((0 if mask_token is None else 1) +
              (len(oov_tokens) if oov_tokens is not None else 0))
    init = lookup_ops.TextFileInitializer(vocabulary_file,
                                          dtype,
                                          lookup_ops.TextFileIndex.WHOLE_LINE,
                                          dtypes.int64,
                                          lookup_ops.TextFileIndex.LINE_NUMBER,
                                          value_index_offset=offset)
    if context.executing_eagerly():
        table = lookup_ops.StaticHashTable(init, default_value=-7)
    else:
        table = lookup_ops.StaticHashTableV1(init, default_value=-7)

    return table_utils.TableHandler(
        table,
        oov_tokens,
        mask_token=mask_token,
        use_v1_apis=(not context.executing_eagerly()))
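Note: the `value_index_offset` argument used above shifts every assigned id by a fixed amount, which is how the mask and OOV slots are reserved. A hedged sketch with the public API (path and tokens illustrative; `value_index_offset` is only available in newer TensorFlow releases):

import tensorflow as tf

vocab_path = "/tmp/tmp_vocab.txt"  # illustrative path
with open(vocab_path, "w") as f:
    f.write("earth\nwind\nfire\n")

init = tf.lookup.TextFileInitializer(
    vocab_path, tf.string, tf.lookup.TextFileIndex.WHOLE_LINE,
    tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER,
    value_index_offset=2)  # e.g. one mask slot + one OOV slot
table = tf.lookup.StaticHashTable(init, default_value=-7)

print(table.lookup(tf.constant(["earth", "fire"])).numpy())  # [2 4]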
Example #5
 def __init__(self):
     self.asset = asset.Asset(
         test.test_src_dir_path(
             "cc/saved_model/testdata/static_hashtable_asset.txt"))
     self.table = lookup_ops.StaticHashTable(
         lookup_ops.TextFileInitializer(
             self.asset, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
             dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), -1)
Example #6
 def test_untracked_table_useful_message(self):
   root = module.Module()
   initializer = lookup_ops.TextFileInitializer(
       self._vocab_path,
       key_dtype=dtypes.string,
       key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
       value_dtype=dtypes.int64,
       value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
   table = lookup_ops.HashTable(initializer, default_value=-1)
   root.table_user = def_function.function(
       table.lookup,
       input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
   root.table_user(constant_op.constant("gamma"))
   save_dir = os.path.join(self.get_temp_dir(), "saved_model")
   with self.assertRaisesRegexp(AssertionError, "HashTable"):
     save.save(root, save_dir)
Example #7
 def make_initializer(self, init_source, vals):
   if init_source == "textfile":
     file = os.path.join(self.get_temp_dir(), "text_file_initializer")
     with open(file, "w") as f:
       f.write("\n".join(str(v) for v in vals) + "\n")
     return lookup_ops.TextFileInitializer(
         filename=file,
         key_dtype=dtypes.int64,
         key_index=lookup_ops.TextFileIndex.LINE_NUMBER,
         value_dtype=dtypes.int64,
         value_index=lookup_ops.TextFileIndex.WHOLE_LINE)
   elif init_source == "keyvaluetensor":
     keys_tensor = constant_op.constant(
         list(range(len(vals))), dtype=dtypes.int64)
     vals_tensor = constant_op.constant(vals)
     return lookup_ops.KeyValueTensorInitializer(keys_tensor, vals_tensor)
   else:
     raise ValueError("Unrecognized init_source: " + init_source)
Example #8
 def test_table(self, cycles):
   # TODO(b/123408779): Handle generic TrackableResources and enable this test
   self.skipTest("Need to handle generic TrackableResources")
   vocab_path = self._make_asset("alpha\nbeta\ngamma\n")
   initializer = lookup_ops.TextFileInitializer(
       vocab_path,
       key_dtype=dtypes.string,
       key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
       value_dtype=dtypes.int64,
       value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
   root = util.Checkpoint(table=lookup_ops.HashTable(
       initializer, default_value=-1))
   root.table_user = def_function.function(
       root.table.lookup,
       input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
   self.assertEqual(2, root.table_user(constant_op.constant("gamma")).numpy())
   imported = self.cycle(root, cycles)
   self.assertEqual(
       2, imported.table_user(constant_op.constant("gamma")).numpy())
Example #9
    def __init__(self, init_source, filepath):
      vals = [0, 1, 2]
      if init_source == "textfile":

        with open(filepath, "w") as f:
          f.write("\n".join(str(v) for v in vals) + "\n")

        self.initializer = lookup_ops.TextFileInitializer(
            filepath, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
            dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
      else:
        keys_tensor = constant_op.constant(
            list(range(len(vals))), dtype=dtypes.int64)
        vals_tensor = constant_op.constant(vals)
        self.initializer = lookup_ops.KeyValueTensorInitializer(
            keys_tensor, vals_tensor)

      self.table = lookup_ops.StaticHashTable(
          self.initializer, default_value=-2)
Example #10
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    shutil.rmtree(FLAGS.saved_model_path)

    variable_scope.enable_resource_variables()

    # Create the graph
    table_initializer = lookup_ops.TextFileInitializer(
        write_vocabulary_file(['cat', 'is', 'on', 'the', 'mat']),
        dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
        lookup_ops.TextFileIndex.LINE_NUMBER)
    table = lookup_ops.StaticVocabularyTable(table_initializer,
                                             num_oov_buckets=10)

    key = array_ops.placeholder(dtypes.string, shape=(), name='input')
    result = table.lookup(key)

    sess = session.Session()

    sess.run(variables.global_variables_initializer())

    sm_builder = builder.SavedModelBuilder(FLAGS.saved_model_path)
    tensor_info_x = utils.build_tensor_info(key)
    tensor_info_r = utils.build_tensor_info(result)

    toy_signature = (signature_def_utils.build_signature_def(
        inputs={'x': tensor_info_x},
        outputs={'r': tensor_info_r},
        method_name=signature_constants.PREDICT_METHOD_NAME))

    sm_builder.add_meta_graph_and_variables(
        sess, [tag_constants.SERVING],
        signature_def_map={
            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            toy_signature,
        },
        main_op=lookup_ops.tables_initializer(),
        assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
        strip_default_attrs=True)
    sm_builder.save()
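Note: to sanity-check an export like this, the SavedModel can be read back in a TF1-style session. A hedged sketch, assuming the same FLAGS.saved_model_path and that the model was written by main() above:

import tensorflow.compat.v1 as tf1

with tf1.Session(graph=tf1.Graph()) as sess:
    # loader.load also runs the main_op, which initializes the lookup table.
    meta_graph = tf1.saved_model.loader.load(
        sess, [tf1.saved_model.tag_constants.SERVING], FLAGS.saved_model_path)
    sig = meta_graph.signature_def[
        tf1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    result = sess.run(sig.outputs['r'].name,
                      feed_dict={sig.inputs['x'].name: 'cat'})
    print(result)  # 0: 'cat' is the first line of the vocabulary file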
Example #11
 def bm_adapt_implementation(self, num_elements, batch_size):
     """Test the KPL adapt implementation."""
     vocab = get_vocab()
     vocab_file = self._write_to_temp_file("vocab", vocab)
     vocabulary_initializer = lookup_ops.TextFileInitializer(
         filename=vocab_file,
         key_dtype=dtypes.string,
         key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
         value_dtype=dtypes.int64,
         value_index=lookup_ops.TextFileIndex.LINE_NUMBER,
         value_index_offset=2)
     input_t = keras.Input(shape=(), dtype=dtypes.string)
     layer = index_lookup.IndexLookup(vocabulary=vocabulary_initializer,
                                      max_tokens=None,
                                      num_oov_indices=1,
                                      mask_token="",
                                      oov_token="OOV",
                                      dtype=dtypes.string)
     out_t = layer(input_t)
     model = keras.Model(input_t, out_t)
     num_repeats = 5
     starts = []
     ends = []
     data = tensor_gen(batch_size, num_elements)
     _ = model(data)
     for _ in range(num_repeats):
         starts.append(time.time())
         _ = model(data)
         ends.append(time.time())
     avg_time = np.mean(np.array(ends) - np.array(starts))
     baseline, _ = self.run_numpy_implementation(data, vocab)
     extras = {
         "numpy implementation baseline": baseline,
         "delta seconds": (baseline - avg_time),
         "delta percent": ((baseline - avg_time) / baseline) * 100
     }
     name = "index_lookup_forward|%s_elements|batch_%s" % (num_elements,
                                                           batch_size)
     self.report_benchmark(iters=num_repeats,
                           wall_time=avg_time,
                           extras=extras,
                           name=name)
Example #12
 def testDistributeLookupTable(self, init_from_file):
     cluster = data_service_test_base.TestCluster(num_workers=1)
     if init_from_file:
         file = os.path.join(self.get_temp_dir(), "distribute_lookup_table")
         with open(file, "w") as f:
             f.write("10\n11\n")
         initializer = lookup_ops.TextFileInitializer(
             file, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
             dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
     else:
         keys_tensor = constant_op.constant([0, 1], dtype=dtypes.int64)
         vals_tensor = constant_op.constant([10, 11])
         initializer = lookup_ops.KeyValueTensorInitializer(
             keys_tensor, vals_tensor)
     table = lookup_ops.StaticHashTable(initializer, -1)
     ds = dataset_ops.Dataset.range(3)
     ds = ds.map(table.lookup)
     ds = self.make_distributed_dataset(ds, cluster)
     self.evaluate(lookup_ops.tables_initializer())
     self.assertDatasetProduces(ds, [10, 11, -1],
                                requires_initialization=True)
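Note: stripped of the tf.data service plumbing, the table/dataset interaction being tested reduces to the following sketch (values illustrative, public API only):

import tensorflow as tf

keys = tf.constant([0, 1], dtype=tf.int64)
vals = tf.constant([10, 11], dtype=tf.int64)
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, vals), default_value=-1)

ds = tf.data.Dataset.range(3).map(table.lookup)
print(list(ds.as_numpy_iterator()))  # [10, 11, -1]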
Example #13
    def testLookupTableGraphSerialization(self, init_from_file):
        if init_from_file:
            file = os.path.join(self.get_temp_dir(),
                                "lookup_table_graph_serialize")
            with open(file, "w") as f:
                f.write("10\n11\n")
            initializer = lookup_ops.TextFileInitializer(
                file, dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER,
                dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE)
        else:
            keys_tensor = constant_op.constant([0, 1], dtype=dtypes.int64)
            vals_tensor = constant_op.constant([10, 11])
            initializer = lookup_ops.KeyValueTensorInitializer(
                keys_tensor, vals_tensor)

        table = lookup_ops.StaticHashTable(initializer, -1)
        dataset = dataset_ops.Dataset.range(3)
        dataset = dataset.map(table.lookup)
        self.evaluate(lookup_ops.tables_initializer())
        round_tripped = self.graphRoundTrip(dataset)
        del table
        del dataset
        self.assertDatasetProduces(round_tripped, [10, 11, -1],
                                   requires_initialization=True)
Example #14
  def __init__(self,
               max_tokens,
               num_oov_indices,
               mask_token,
               oov_token,
               vocabulary=None,
               invert=False,
               output_mode=INT,
               sparse=False,
               pad_to_max_tokens=False,
               **kwargs):
    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("If set, `max_tokens` must be greater than 1. "
                       "You passed {}".format(max_tokens))

    if num_oov_indices < 0:
      raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                       "You passed {}".format(num_oov_indices))

    # Support deprecated names for output_modes.
    if output_mode == "binary":
      output_mode = MULTI_HOT
    if output_mode == "tf-idf":
      output_mode = TF_IDF
    # 'output_mode' must be one of (INT, MULTI_HOT, COUNT, TF_IDF)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, MULTI_HOT, COUNT, TF_IDF),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    if invert and output_mode != INT:
      raise ValueError("`output_mode` must be {} when `invert` is true. You "
                       "passed {}".format(INT, output_mode))

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.output_mode = output_mode
    self.sparse = sparse
    self.pad_to_max_tokens = pad_to_max_tokens
    self._called = False

    # A note on vocab_size: we need to always keep a non-Tensor representation
    # of vocab_size around to use in graph building. Because we might be
    # in a tf.function, we can't rely on evaluating the actual tables to
    # find the value either.
    self._vocab_size = None
    # We need to keep track of our current vocab size outside of our layer weights
    # to support a static output shape when `output_mode != INT`. The bincount
    # ops do not set shape on their outputs, which means we have to set it
    # ourselves. We persist the current vocab size as a hidden part of the
    # config when serializing our model.
    if "vocabulary_size" in kwargs:
      self._vocab_size = kwargs["vocabulary_size"]
      del kwargs["vocabulary_size"]

    restore_from_static_table = kwargs.pop("has_static_table", False)

    # Make sure the mask token and oov token are truly of the dtype we want. We
    # can ignore strings here, because they have only one dtype.
    dtype = kwargs["dtype"]
    if dtype == dtypes.int32:
      mask_token = None if mask_token is None else np.int32(mask_token)
      oov_token = None if oov_token is None else np.int32(oov_token)
    elif dtype == dtypes.int64:
      mask_token = None if mask_token is None else np.int64(mask_token)
      oov_token = None if oov_token is None else np.int64(oov_token)
    self.mask_token = mask_token
    self.oov_token = oov_token

    if max_tokens is not None:
      available_vocab_size = max_tokens - self._token_start_index()
    else:
      available_vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TF_IDF)),
        **kwargs)

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
      self._key_dtype = dtypes.int64
      self._value_dtype = self.dtype
      self._mask_key = 0
      self._mask_value = mask_token
      key_index = lookup_ops.TextFileIndex.LINE_NUMBER
      value_index = lookup_ops.TextFileIndex.WHOLE_LINE
      default_value = self.oov_token
      oov_indices = None
    else:
      self._key_dtype = self.dtype
      self._value_dtype = dtypes.int64
      self._mask_key = mask_token
      key_index = lookup_ops.TextFileIndex.WHOLE_LINE
      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
      # Masks should map to 0 for int output and be dropped otherwise. Max ints
      # will be dropped from the bincount op.
      self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
      oov_start = self._oov_start_index()
      token_start = self._token_start_index()
      if self.num_oov_indices == 0:
        # If there are no OOV indices, we map OOV tokens to -1 and error out
        # during call if we find a negative index.
        default_value = -1
        oov_indices = None
      elif self.num_oov_indices == 1:
        # If there is only one OOV index, we can set that index as the default
        # value of the index_lookup table.
        default_value = oov_start
        oov_indices = None
      else:
        # If we have multiple OOV values, we need to do a further hashing step;
        # to make this easier, we set the OOV value to -1. (This lets us do a
        # vectorized add and cast to boolean to determine locations where we
        # need to do extra hashing.)
        default_value = -1
        oov_indices = list(range(oov_start, token_start))

    self._static_vocabulary_path = None
    has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
    if has_vocab_path or restore_from_static_table:
      self._has_static_table = True
      if vocabulary is None:
        # If we're restoring a layer that was saved with a static table
        # initializer, we create a fake initializer object to let the code
        # progress. The savedmodel restoration code will handle restoring
        # the actual data.
        initializer = _NullInitializer(self._key_dtype, self._value_dtype)
      else:
        if not gfile.Exists(vocabulary):
          raise ValueError("Vocabulary file %s does not exist." % (vocabulary,))
        self._static_vocabulary_path = vocabulary
        num_tokens = table_utils.num_tokens_in_file(vocabulary)
        self._vocab_size = self._token_start_index() + num_tokens

        initializer = lookup_ops.TextFileInitializer(
            filename=vocabulary,
            key_dtype=self._key_dtype,
            key_index=key_index,
            value_dtype=self._value_dtype,
            value_index=value_index,
            value_index_offset=self._token_start_index())

      self._table = lookup_ops.StaticHashTable(
          initializer, default_value=default_value)
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          mask_token=self._mask_key if self.mask_token is not None else None,
          mask_value=self._mask_value,
          oov_tokens=oov_indices)

      tracked_table = self._add_trackable(self._table, trainable=False)

    else:
      self._has_static_table = False
      self._table = lookup_ops.MutableHashTable(
          key_dtype=self._key_dtype,
          value_dtype=self._value_dtype,
          default_value=default_value,
          name=(self._name + "_index_table"))
      self._table_handler = table_utils.TableHandler(
          table=self._table,
          oov_tokens=oov_indices)
      if vocabulary is not None:
        self.set_vocabulary(vocabulary)
      tracked_table = self._add_trackable(self._table, trainable=False)

    if self.output_mode == TF_IDF:
      # The TF-IDF weight may have a (None,) tensorshape. This creates
      # a 1D variable with arbitrary shape, which we can assign any weight to
      # so long as it has 1 dimension. In order to properly initialize this
      # weight in Keras, we need to provide a custom callable initializer which
      # does not depend on the shape of the weight (as all other initializers
      # do) since the weight is not known. Hence the lambda shape, dtype: [0].
      if not self.pad_to_max_tokens or max_tokens is None:
        initializer = lambda shape, dtype: [0]
      else:
        initializer = init_ops.zeros_initializer

      # We are adding these here instead of in build() since they do not depend
      # on the input shape at all.
      idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
      self.tf_idf_weights = self._add_state_variable(
          name="idf",
          shape=tensor_shape.TensorShape(idf_shape),
          dtype=backend.floatx(),
          initializer=initializer)

    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tensor_shape.TensorShape((0,))
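Note: this internal IndexLookup layer backs the public Keras lookup layers. A hedged sketch with tf.keras.layers.StringLookup and an illustrative one-token-per-line vocabulary file (older releases expose the layer under tf.keras.layers.experimental.preprocessing):

import tensorflow as tf

layer = tf.keras.layers.StringLookup(
    vocabulary="/tmp/vocab.txt",  # illustrative path, one token per line
    num_oov_indices=1,
    mask_token=None)
# In-vocabulary tokens map to ids >= 1; unknown tokens map to the OOV index 0.
print(layer(tf.constant([["alpha", "zeta"]])))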
Example #15
def skip_gram_sample_with_text_vocab(input_tensor,
                                     vocab_freq_file,
                                     vocab_token_index=0,
                                     vocab_token_dtype=tf.dtypes.string,
                                     vocab_freq_index=1,
                                     vocab_freq_dtype=tf.dtypes.float64,
                                     vocab_delimiter=",",
                                     vocab_min_count=0,
                                     vocab_subsampling=None,
                                     corpus_size=None,
                                     min_skips=1,
                                     max_skips=5,
                                     start=0,
                                     limit=-1,
                                     emit_self_as_target=False,
                                     batch_size=None,
                                     batch_capacity=None,
                                     seed=None,
                                     name=None):
    """Skip-gram sampling with a text vocabulary file.

    Wrapper around `skip_gram_sample()` for use with a text vocabulary file. The
    vocabulary file is expected to be a plain-text file, with lines of
    `vocab_delimiter`-separated columns. The `vocab_token_index` column should
    contain the vocabulary term, while the `vocab_freq_index` column should
    contain the number of times that term occurs in the corpus. For example, with
    a text vocabulary file of:

      ```
      bonjour,fr,42
      hello,en,777
      hola,es,99
      ```

    You should set `vocab_delimiter=","`, `vocab_token_index=0`, and
    `vocab_freq_index=2`.

    See `skip_gram_sample()` documentation for more details about the skip-gram
    sampling process.

    Args:
      input_tensor: A rank-1 `Tensor` from which to generate skip-gram candidates.
      vocab_freq_file: `string` specifying full file path to the text vocab file.
      vocab_token_index: `int` specifying which column in the text vocab file
        contains the tokens.
      vocab_token_dtype: `DType` specifying the format of the tokens in the text
        vocab file.
      vocab_freq_index: `int` specifying which column in the text vocab file
        contains the frequency counts of the tokens.
      vocab_freq_dtype: `DType` specifying the format of the frequency counts in
        the text vocab file.
      vocab_delimiter: `string` specifying the delimiter used in the text vocab
        file.
      vocab_min_count: `int`, `float`, or scalar `Tensor` specifying
        minimum frequency threshold (from `vocab_freq_file`) for a token to be
        kept in `input_tensor`. This should correspond with `vocab_freq_dtype`.
      vocab_subsampling: (Optional) `float` specifying frequency proportion
        threshold for tokens from `input_tensor`. Tokens that occur more
        frequently will be randomly down-sampled. Reasonable starting values may
        be around 1e-3 or 1e-5. See Eq. 5 in http://arxiv.org/abs/1310.4546 for
        more details.
      corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying the
        total number of tokens in the corpus (e.g., sum of all the frequency
        counts of `vocab_freq_file`). Used with `vocab_subsampling` for
        down-sampling frequently occurring tokens. If this is specified,
        `vocab_freq_file` and `vocab_subsampling` must also be specified.
        If `corpus_size` is needed but not supplied, then it will be calculated
        from `vocab_freq_file`. You might want to supply your own value if you
        have already eliminated infrequent tokens from your vocabulary files
        (where frequency < vocab_min_count) to save memory in the internal token
        lookup table. Otherwise, the unused tokens' variables will waste memory.
        The user-supplied `corpus_size` value must be greater than or equal to the
        sum of all the frequency counts of `vocab_freq_file`.
      min_skips: `int` or scalar `Tensor` specifying the minimum window size to
        randomly use for each token. Must be >= 0 and <= `max_skips`. If
        `min_skips` and `max_skips` are both 0, the only label outputted will be
        the token itself.
      max_skips: `int` or scalar `Tensor` specifying the maximum window size to
        randomly use for each token. Must be >= 0.
      start: `int` or scalar `Tensor` specifying the position in `input_tensor`
        from which to start generating skip-gram candidates.
      limit: `int` or scalar `Tensor` specifying the maximum number of elements in
        `input_tensor` to use in generating skip-gram candidates. -1 means to use
        the rest of the `Tensor` after `start`.
      emit_self_as_target: `bool` or scalar `Tensor` specifying whether to emit
        each token as a label for itself.
      batch_size: (Optional) `int` specifying batch size of returned `Tensors`.
      batch_capacity: (Optional) `int` specifying batch capacity for the queue
        used for batching returned `Tensors`. Only has an effect if
        `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
      seed: (Optional) `int` used to create a random seed for window size and
        subsampling. See
        [`set_random_seed`](../../g3doc/python/constant_op.md#set_random_seed)
        for behavior.
      name: (Optional) A `string` name or a name scope for the operations.

    Returns:
      A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
      rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
      length `batch_size`; if `batch_size` is not specified, they will be of
      random length, though they will be in sync with each other as long as they
      are evaluated together.

    Raises:
      ValueError: If `vocab_token_index` or `vocab_freq_index` is less than 0 or
        exceeds the number of columns in `vocab_freq_file`. If `vocab_token_index`
        and `vocab_freq_index` are both set to the same column. If any token in
        `vocab_freq_file` has a negative frequency.
    """

    if vocab_token_index < 0 or vocab_freq_index < 0:
        raise ValueError(
            "vocab_token_index={} and vocab_freq_index={} must both be >= 0.".
            format(vocab_token_index, vocab_freq_index))
    if vocab_token_index == vocab_freq_index:
        raise ValueError(
            "vocab_token_index and vocab_freq_index should be different, but are "
            "both {}.".format(vocab_token_index))

    # Iterates through the vocab file and calculates the number of vocab terms as
    # well as the total corpus size (by summing the frequency counts of all the
    # vocab terms).
    calculated_corpus_size = 0.0
    vocab_size = 0
    with tf.io.gfile.GFile(vocab_freq_file, mode="r") as f:
        reader = csv.reader(f, delimiter=vocab_delimiter)
        for row in reader:
            if vocab_token_index >= len(row) or vocab_freq_index >= len(row):
                raise ValueError(
                    "Row in vocab file only has {} columns, so vocab_token_index={} or "
                    "vocab_freq_index={} is out of bounds. Row content: {}".
                    format(len(row), vocab_token_index, vocab_freq_index, row))
            vocab_size += 1
            freq = vocab_freq_dtype.as_numpy_dtype(row[vocab_freq_index])
            if freq < 0:
                raise ValueError(
                    "Row in vocab file has negative frequency of {}. Row content: {}"
                    .format(freq, row))
            # Note: tokens whose frequencies are below vocab_min_count will still
            # contribute to the total corpus size used for vocab subsampling.
            calculated_corpus_size += freq

    if not corpus_size:
        corpus_size = calculated_corpus_size
    elif calculated_corpus_size - corpus_size > 1e-6:
        raise ValueError(
            "`corpus_size`={} must be greater than or equal to the sum of all the "
            "frequency counts ({}) of `vocab_freq_file` ({}).".format(
                corpus_size, calculated_corpus_size, vocab_freq_file))

    vocab_freq_table = lookup_ops.HashTable(
        lookup_ops.TextFileInitializer(filename=vocab_freq_file,
                                       key_dtype=vocab_token_dtype,
                                       key_index=vocab_token_index,
                                       value_dtype=vocab_freq_dtype,
                                       value_index=vocab_freq_index,
                                       vocab_size=vocab_size,
                                       delimiter=vocab_delimiter),
        # For vocab terms not in vocab file, use a default value of -1.
        default_value=-1)

    return skip_gram_sample(
        input_tensor,
        min_skips=min_skips,
        max_skips=max_skips,
        start=start,
        limit=limit,
        emit_self_as_target=emit_self_as_target,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=vocab_min_count,
        vocab_subsampling=vocab_subsampling,
        # corpus_size is not used unless vocab_subsampling is specified.
        corpus_size=None if vocab_subsampling is None else corpus_size,
        batch_size=batch_size,
        batch_capacity=batch_capacity,
        seed=seed,
        name=name)
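Note: a hypothetical call matching the three-column CSV layout described in the docstring (the input tensor, file path, and min-count threshold are illustrative; `skip_gram_sample_with_text_vocab` is the function defined above):

tokens = tf.constant(["hello", "hola", "bonjour", "hello", "hola"])
emitted_tokens, emitted_labels = skip_gram_sample_with_text_vocab(
    tokens,
    vocab_freq_file="/tmp/vocab_freq.csv",  # lines like "bonjour,fr,42"
    vocab_token_index=0,
    vocab_freq_index=2,
    vocab_delimiter=",",
    vocab_min_count=50,
    min_skips=1,
    max_skips=2)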
Example #16
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT,
                                                           TFIDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                "`output_mode` must be {} when `invert` is true. You "
                "passed {}".format(INT, output_mode))

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False
        self._vocab_size = 0
        # We need to keep track of our current vocab size outside of our layer weights
        # to support a static output shape when `output_mode != INT`. The bincount
        # ops do not set shape on their outputs, which means we have to set it
        # ourselves. We persist the current vocab size as a hidden part of the
        # config when serializing our model.
        if "vocabulary_size" in kwargs:
            self._vocab_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        if max_tokens is not None:
            available_vocab_size = max_tokens - self._token_start_index()
        else:
            available_vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
                                          **kwargs)

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = dtypes.int64
            self._value_dtype = self.dtype
            self._mask_key = 0
            self._mask_value = mask_token
            key_index = lookup_ops.TextFileIndex.LINE_NUMBER
            value_index = lookup_ops.TextFileIndex.WHOLE_LINE
            default_value = self.oov_token
            oov_indices = None
        else:
            self._key_dtype = self.dtype
            self._value_dtype = dtypes.int64
            self._mask_key = mask_token
            key_index = lookup_ops.TextFileIndex.WHOLE_LINE
            value_index = lookup_ops.TextFileIndex.LINE_NUMBER
            # Masks should map to 0 for int output and be dropped otherwise. Max ints
            # will be dropped from the bincount op.
            self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
            oov_start = self._oov_start_index()
            token_start = self._token_start_index()
            if self.num_oov_indices == 0:
                # If there are no OOV indices, we map OOV tokens to -1 for int output
                # and drop them from bagged output. Max ints will be dropped from the
                # bincount op.
                default_value = -1 if self.output_mode == INT else dtypes.int64.max
                oov_indices = None
            elif self.num_oov_indices == 1:
                # If there is only one OOV index, we can set that index as the default
                # value of the index_lookup table.
                default_value = oov_start
                oov_indices = None
            else:
                # If we have multiple OOV values, we need to do a further hashing step;
                # to make this easier, we set the OOV value to -1. (This lets us do a
                # vectorized add and cast to boolean to determine locations where we
                # need to do extra hashing.)
                default_value = -1
                oov_indices = list(range(oov_start, token_start))

        if vocabulary is not None and isinstance(vocabulary, str):
            if not os.path.exists(vocabulary):
                raise ValueError("Vocabulary file %s does not exist." %
                                 vocabulary)

            total_offset = 0 if mask_token is None else 1
            total_offset += num_oov_indices
            initializer = lookup_ops.TextFileInitializer(
                filename=vocabulary,
                key_dtype=self._key_dtype,
                key_index=key_index,
                value_dtype=self._value_dtype,
                value_index=value_index,
                value_index_offset=total_offset)

            self._table = self._static_table_class()(
                initializer, default_value=default_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=self._mask_key,
                mask_value=self._mask_value,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            self.max_tokens = (self._table_handler.table_size() +
                               self.num_oov_indices +
                               (0 if mask_token is None else 1))
        else:
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=default_value,
                name=(self._name + "_index_table"))
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) tensorshape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if not self.pad_to_max_tokens or max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = init_ops.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tensor_shape.TensorShape(idf_shape),
                dtype=K.floatx(),
                initializer=initializer)

        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))