Example #1
import os

import tensorflow as tf
# table_utils is a Keras-internal helper module; this import path is the
# standalone-Keras location and may differ across TF/Keras versions.
from keras.layers.preprocessing import table_utils


def get_static_table(tmpdir,
                     vocab_list,
                     mask_token=None,
                     dtype=tf.string,
                     oov_tokens=None):
    vocabulary_file = os.path.join(tmpdir, "tmp_vocab.txt")

    if dtype == tf.string:
        with open(vocabulary_file, "w") as f:
            f.write("\n".join(vocab_list) + "\n")
    else:
        with open(vocabulary_file, "w") as f:
            f.write("\n".join([str(v) for v in vocab_list]) + "\n")

    offset = ((0 if mask_token is None else 1) +
              (len(oov_tokens) if oov_tokens is not None else 0))
    init = tf.lookup.TextFileInitializer(vocabulary_file,
                                         dtype,
                                         tf.lookup.TextFileIndex.WHOLE_LINE,
                                         tf.int64,
                                         tf.lookup.TextFileIndex.LINE_NUMBER,
                                         value_index_offset=offset)
    if tf.executing_eagerly():
        table = tf.lookup.StaticHashTable(init, default_value=-7)
    else:
        table = tf.compat.v1.lookup.StaticHashTable(init, default_value=-7)

    return table_utils.TableHandler(table,
                                    oov_tokens,
                                    mask_token=mask_token,
                                    use_v1_apis=(not tf.executing_eagerly()))
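A hedged usage sketch for the helper above (the vocabulary, the temporary
directory, and the lookup accessor shown are illustrative; TableHandler is a
Keras-internal class whose exact surface varies across versions):

import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    handler = get_static_table(tmpdir, ["earth", "wind", "fire"],
                               mask_token="", oov_tokens=[1])
    # One mask token plus one OOV token gives an offset of 2, so the file's
    # line numbers map "earth" -> 2, "wind" -> 3, "fire" -> 4. Unseen keys
    # hit the table's default (-7) and are remapped into the OOV bucket.
    print(handler.lookup(tf.constant(["wind", "unknown"])))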
Example #2
import tensorflow as tf
# Both modules below are internal APIs; import paths may differ by version.
from keras.layers.preprocessing import table_utils
from tensorflow.python.ops import lookup_ops


def get_table(dtype=tf.string, oov_tokens=None):
    table = lookup_ops.MutableHashTable(key_dtype=dtype,
                                        value_dtype=tf.int64,
                                        default_value=-7,
                                        name="index_table")
    return table_utils.TableHandler(table,
                                    oov_tokens,
                                    use_v1_apis=(not tf.executing_eagerly()))
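A minimal, hedged sketch of exercising this mutable-table variant (data and
index assignments are illustrative; insert/lookup are TableHandler accessors
in this codebase):

handler = get_table(oov_tokens=[0])
# Reserve index 0 for OOV and start real tokens at 1.
handler.insert(["earth", "wind", "fire"], [1, 2, 3])
# Known keys return their inserted indices; unknown keys hit the -7 default
# and are remapped into the OOV bucket.
print(handler.lookup(tf.constant(["fire", "unknown"])))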
Example #3
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT,
                                                           TFIDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False
        self._num_special_tokens = self.num_oov_indices
        if self.mask_token is not None:
            self._num_special_tokens += 1
        self._vocab_size = 0
        # We need to keep track of our current vocab size outside of our layer weights
        # to support a static output shape when `output_mode != INT`. The bincount
        # ops do not set shape on their outputs, which means we have to set it
        # ourselves. We persist the current vocab size as a hidden part of the
        # config when serializing our model.
        if "vocab_size" in kwargs:
            self._vocab_size = kwargs["vocab_size"]
            del kwargs["vocab_size"]

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need to
        # do a further hashing step; to make this easier, we set the OOV value to
        # -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_indices == 1:
            self._oov_value = 0 if mask_token is None else 1
        else:
            self._oov_value = -1

        if max_tokens is not None:
            available_vocab_size = max_tokens - self._num_special_tokens
        else:
            available_vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
                                          **kwargs)

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = tf.int64
            self._value_dtype = self.dtype
            oov_value = self.oov_token
            oov_indices = None
        else:
            self._key_dtype = self.dtype
            self._value_dtype = tf.int64
            oov_value = self._oov_value
            if self.num_oov_indices <= 1:
                oov_indices = None
            else:
                oov_start = 1 if mask_token is not None else 0
                oov_end = oov_start + num_oov_indices
                oov_indices = list(range(oov_start, oov_end))

        if vocabulary is not None and isinstance(
                vocabulary, tf.lookup.TextFileInitializer):
            self._table = self._static_table_class()(vocabulary,
                                                     default_value=oov_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=mask_token,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            self.max_tokens = (self._table_handler.table_size() +
                               self.num_oov_indices +
                               (0 if mask_token is None else 1))
        else:
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=oov_value,
                name=(self._name + "_index_table"))
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) TensorShape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if not self.pad_to_max_tokens or max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = tf.compat.v1.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tf.TensorShape(idf_shape),
                dtype=K.floatx(),
                initializer=initializer)

        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tf.TensorShape((0, ))
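IndexLookup is the shared base class behind the public StringLookup and
IntegerLookup preprocessing layers, so this constructor is normally reached
through them. A hedged sketch against the TF 2.4-era experimental API path,
whose defaults reserve index 0 for the mask token "" and index 1 for the OOV
token "[UNK]":

import tensorflow as tf

layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=["earth", "wind", "fire"])
# Vocabulary terms start at index 2, after the mask and OOV slots.
print(layer(tf.constant([["earth", "wind"], ["fire", "other"]])))
# -> [[2, 3], [4, 1]] ("other" falls into the single OOV index).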
Example #4
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT,
                                                           TFIDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                "`output_mode` must be {} when `invert` is true. You "
                "passed {}".format(INT, output_mode))

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False

        # A note on vocab_size: we need to always keep a non-Tensor representation
        # of vocab_size around to use in graph building. Because we might be
        # in a tf.function, we can't rely on evaluating the actual tables to
        # find the value either.
        self._vocab_size = None
        # We need to keep track of our current vocab size outside of our layer weights
        # to support a static output shape when `output_mode != INT`. The bincount
        # ops do not set shape on their outputs, which means we have to set it
        # ourselves. We persist the current vocab size as a hidden part of the
        # config when serializing our model.
        if "vocabulary_size" in kwargs:
            self._vocab_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        restore_from_static_table = kwargs.pop("has_static_table", False)

        # Make sure the mask token is truly of the dtype we want. We can ignore
        # strings here, because they have only one dtype.
        if mask_token is not None:
            dtype = kwargs["dtype"]
            if dtype == tf.int32:
                mask_token = np.int32(mask_token)
            elif dtype == tf.int64:
                mask_token = np.int64(mask_token)
        self.mask_token = mask_token

        if max_tokens is not None:
            available_vocab_size = max_tokens - self._token_start_index()
        else:
            available_vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
                                          **kwargs)

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = tf.int64
            self._value_dtype = self.dtype
            self._mask_key = 0
            self._mask_value = mask_token
            key_index = tf.lookup.TextFileIndex.LINE_NUMBER
            value_index = tf.lookup.TextFileIndex.WHOLE_LINE
            default_value = self.oov_token
            oov_indices = None
        else:
            self._key_dtype = self.dtype
            self._value_dtype = tf.int64
            self._mask_key = mask_token
            key_index = tf.lookup.TextFileIndex.WHOLE_LINE
            value_index = tf.lookup.TextFileIndex.LINE_NUMBER
            # Masks should map to 0 for int output and be dropped otherwise. Max ints
            # will be dropped from the bincount op.
            self._mask_value = 0 if self.output_mode == INT else tf.int64.max
            oov_start = self._oov_start_index()
            token_start = self._token_start_index()
            if self.num_oov_indices == 0:
                # If there are no OOV indices, we map OOV tokens to -1 for int output
                # and drop them from bagged output. Max ints will be dropped from the
                # bincount op.
                default_value = -1 if self.output_mode == INT else tf.int64.max
                oov_indices = None
            elif self.num_oov_indices == 1:
                # If there is only one OOV index, we can set that index as the default
                # value of the index_lookup table.
                default_value = oov_start
                oov_indices = None
            else:
                # If we have multiple OOV values, we need to do a further hashing step;
                # to make this easier, we set the OOV value to -1. (This lets us do a
                # vectorized add and cast to boolean to determine locations where we
                # need to do extra hashing.)
                default_value = -1
                oov_indices = list(range(oov_start, token_start))

        self._static_vocabulary_path = None
        has_vocab_path = (vocabulary is not None
                          and isinstance(vocabulary, str))
        if has_vocab_path or restore_from_static_table:
            self._has_static_table = True
            if vocabulary is None:
                # If we're restoring a layer that was saved with a static table
                # initializer, we create a fake initializer object to let the code
                # progress. The SavedModel restoration code will handle restoring
                # the actual data.
                initializer = _NullInitializer(self._key_dtype,
                                               self._value_dtype)
            else:
                if not os.path.exists(vocabulary):
                    raise ValueError("Vocabulary file %s does not exist." %
                                     (vocabulary, ))
                self._static_vocabulary_path = vocabulary
                num_tokens = table_utils.num_tokens_in_file(vocabulary)
                self._vocab_size = self._token_start_index() + num_tokens

                initializer = tf.lookup.TextFileInitializer(
                    filename=vocabulary,
                    key_dtype=self._key_dtype,
                    key_index=key_index,
                    value_dtype=self._value_dtype,
                    value_index=value_index,
                    value_index_offset=self._token_start_index())

            self._table = self._static_table_class()(
                initializer, default_value=default_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=self._mask_key,
                mask_value=self._mask_value,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())

            tracked_table = self._add_trackable(self._table, trainable=False)

        else:
            self._has_static_table = False
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=default_value,
                name=(self._name + "_index_table"))
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)
            tracked_table = self._add_trackable(self._table, trainable=False)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) TensorShape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if not self.pad_to_max_tokens or max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = tf.compat.v1.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tf.TensorShape(idf_shape),
                dtype=backend.floatx(),
                initializer=initializer)

        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tf.TensorShape((0, ))
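This revision adds a static-table path: when `vocabulary` is a file path, the
layer is backed by an immutable table built from a TextFileInitializer rather
than a MutableHashTable. A hedged sketch through the public layer (file
contents and path are illustrative):

import tensorflow as tf

vocab_path = "/tmp/vocab.txt"  # illustrative path
with open(vocab_path, "w") as f:
    f.write("earth\nwind\nfire\n")
layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=vocab_path)
print(layer(tf.constant(["wind", "other"])))  # known token vs. OOV fallback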
Example #5
  def __init__(self,
               max_tokens,
               num_oov_indices,
               mask_token,
               oov_token,
               vocabulary=None,
               invert=False,
               output_mode=INT,
               sparse=False,
               **kwargs):

    # If max_tokens is set, the value must be greater than 1 - otherwise we
    # are creating a 0-element vocab, which doesn't make sense.
    if max_tokens is not None and max_tokens <= 1:
      raise ValueError("If set, max_tokens must be greater than 1. "
                       "You passed %s" % (max_tokens,))

    if num_oov_indices < 0:
      raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
                       "You passed %s" % (num_oov_indices,))

    if invert and num_oov_indices != 1:
      raise ValueError("`num_oov_indices` must be 1 when `invert` is True.")

    # 'output_mode' must be one of (INT, BINARY, COUNT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(INT, BINARY, COUNT),
        layer_name=self.__class__.__name__,
        arg_name="output_mode")

    self.invert = invert
    self.max_tokens = max_tokens
    self.num_oov_indices = num_oov_indices
    self.oov_token = oov_token
    self.mask_token = mask_token
    self.output_mode = output_mode
    self.sparse = sparse

    # If there is only one OOV bucket, we can determine the OOV value (either 0
    # or 1 depending on whether 0 is reserved) and set that as the default
    # value of the index_lookup table. If we have multiple OOV values, we need to
    # do a further hashing step; to make this easier, we set the OOV value to
    # -1. (This lets us do a vectorized add and cast to boolean to determine
    # locations where we need to do extra hashing.)
    if self.num_oov_indices == 1:
      self._oov_value = 0 if mask_token is None else 1
    else:
      self._oov_value = -1

    if max_tokens is not None:
      num_mask_tokens = (0 if mask_token is None else 1)
      vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
    else:
      vocab_size = None

    super(IndexLookup, self).__init__(
        combiner=_IndexLookupCombiner(vocab_size, self.mask_token), **kwargs)

    self._output_dtype = tf.int64

    # We need to save the key dtype so that we know if we're expecting int64
    # keys. If we are, we will cast int32 inputs to int64 as well.
    if invert:
      self._key_dtype = self._output_dtype
      value_dtype = self.dtype
      oov_value = self.oov_token
    else:
      self._key_dtype = self.dtype
      value_dtype = self._output_dtype
      oov_value = self._oov_value

    self._table = lookup_ops.MutableHashTable(
        key_dtype=self._key_dtype,
        value_dtype=value_dtype,
        default_value=oov_value,
        name=(self._name + "_index_table"))
    tracked_table = self._add_trackable(self._table, trainable=False)
    # This is a workaround for summary() on this layer. Because the table is
    # not mutable during training, the effective number of parameters (and so
    # the weight shape) is 0; we add this as an attr so that the parameter
    # counting code in the Model object doesn't throw an attribute error.
    tracked_table.shape = tf.TensorShape((0,))

    if self.num_oov_indices <= 1:
      oov_indices = None
    else:
      oov_start = 1 if mask_token is not None else 0
      oov_end = oov_start + num_oov_indices
      oov_indices = list(range(oov_start, oov_end))

    self._table_handler = table_utils.TableHandler(
        table=self._table,
        oov_tokens=oov_indices,
        use_v1_apis=self._use_v1_apis())

    if vocabulary is not None:
      self.set_vocabulary(vocabulary)
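In this earlier revision, `invert` requires exactly one OOV index and TF-IDF
output does not exist yet. A hedged sketch of the invert path through the
public layer (TF 2.4-era API path; data illustrative):

import tensorflow as tf

layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=["earth", "wind", "fire"], invert=True)
# The inverted table maps indices back to tokens; out-of-range indices fall
# back to the OOV token, per the default_value=oov_value branch above.
print(layer(tf.constant([[2, 4], [0, 1]])))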
Example #6
# Reuses the same internal imports as Example #2 (tensorflow, lookup_ops,
# table_utils).
def get_table(dtype=tf.string, oov_tokens=None):
    table = lookup_ops.MutableHashTable(key_dtype=dtype,
                                        value_dtype=tf.int64,
                                        default_value=-7,
                                        name="index_table")
    return table_utils.TableHandler(table, oov_tokens)
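This variant is the same helper as Example #2 except that it omits the
use_v1_apis argument, relying on TableHandler's default session handling; the
usage sketch under Example #2 applies here unchanged.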