Python MutableHashTable示例，tensorflow.python.ops.lookup_ops.MutableHashTable Python示例

示例#1

0

显示文件

文件： table_utils_test.py 项目： zzsnow/tensorflow

def get_table(dtype=dtypes.string, oov_tokens=None):
    """Build a `TableHandler` around a freshly created mutable hash table.

    Args:
      dtype: Key dtype for the underlying `MutableHashTable`.
      oov_tokens: Forwarded unchanged to `TableHandler` as its second
        positional argument.

    Returns:
      A `table_utils.TableHandler` wrapping an int64-valued table named
      "index_table" whose `default_value` is -7.
    """
    table_config = {
        "key_dtype": dtype,
        "value_dtype": dtypes.int64,
        "default_value": -7,
        "name": "index_table",
    }
    index_table = lookup_ops.MutableHashTable(**table_config)

    # The handler uses the V1 code path whenever we are not executing eagerly.
    graph_mode = not context.executing_eagerly()
    return table_utils.TableHandler(
        index_table, oov_tokens, use_v1_apis=graph_mode)

示例#2

0

显示文件

文件： base_layer_utils_test.py 项目： TheVinhLuong102/tensorflow

 def get_table_handler(self):
   """Return a `TrackableWeightHandler` wrapping a new string->int32 table.

   This is deliberately an explicitly-called helper rather than a `setUp()`
   override: per the original author's note, TensorFlow hits graph-building
   errors when the table is created in a separate `setUp()` call, so each
   test repeats this small amount of setup instead.
   """
   return base_layer_utils.TrackableWeightHandler(
       lookup_ops.MutableHashTable(
           key_dtype=dtypes.string,
           value_dtype=dtypes.int32,
           default_value=0))

示例#3

0

显示文件

文件： saved_model_estimator_test.py 项目： xwli-chelsea/estimator

 def model_fn(features, labels, mode):
   """Model function whose only state is an int32->int32 mutable hash table.

   Predictions are table lookups of `features['x']`. In TRAIN mode the train
   op inserts `features['x'] -> labels` into the table and increments the
   global step; in every other mode there is no train op.

   Args:
     features: Mapping that contains the key 'x'.
     labels: Values inserted into the table for the keys in `features['x']`.
     mode: The mode, compared against `ModeKeys.TRAIN`.

   Returns:
     A `model_fn_lib.EstimatorSpec` with a constant zero loss.
   """
   table = lookup_ops.MutableHashTable(
       key_dtype=tf.dtypes.int32, value_dtype=tf.dtypes.int32, default_value=-1)
   predictions = table.lookup(features['x'])

   if mode == ModeKeys.TRAIN:
     insert_op = table.insert(features['x'], labels)
     increment_step_op = tf.compat.v1.assign_add(
         tf.compat.v1.train.get_global_step(), 1)
     train_op = tf.group(insert_op, increment_step_op)
   else:
     train_op = None

   return model_fn_lib.EstimatorSpec(
       mode, loss=tf.constant(0), predictions=predictions, train_op=train_op)

示例#4

0

显示文件

    def testDistributeMutableHashTable(self, value_rank):
        """Check that a dataset mapped through a mutable table can be distributed.

        Args:
          value_rank: How many times each scalar is wrapped into a two-element
            list before being used as a table value / default value.
        """
        def nest(scalar):
            # Wrap the scalar `value_rank` times: x -> [x, x] -> [[x, x], [x, x]] ...
            nested = scalar
            for _ in range(value_rank):
                nested = [nested, nested]
            return nested

        v1, v2, default_value = (nest(scalar) for scalar in (10, 11, -1))

        cluster = data_service_test_base.TestCluster(num_workers=1)
        table = lookup_ops.MutableHashTable(
            dtypes.int64, dtypes.int64, default_value)
        insert_op = table.insert([0, 1], [v1, v2])
        self.evaluate(insert_op)

        # Keys 0 and 1 were inserted above; key 2 is expected to yield the
        # default value.
        lookups = dataset_ops.Dataset.range(3).map(table.lookup)
        distributed = self.make_distributed_dataset(lookups, cluster)
        expected = [v1, v2, default_value]
        self.assertDatasetProduces(
            distributed, expected, requires_initialization=True)

示例#5

0

显示文件

    def __init__(self,
                 max_tokens=None,
                 num_oov_tokens=1,
                 vocabulary=None,
                 reserve_zero=True,
                 mask_zero=False,
                 **kwargs):
        """Validate the configuration and create the backing hash table.

        Args:
          max_tokens: Optional cap on the vocabulary size. If set, it must be
            greater than 1.
          num_oov_tokens: Number of out-of-vocabulary tokens to reserve. Must
            be greater than or equal to 0.
          vocabulary: Optional vocabulary. A `str` is handed to
            `_get_vocabulary_from_file`; the resulting (or directly passed)
            collection must not contain repeated terms and is given to
            `set_vocabulary`.
          reserve_zero: Whether one additional output value is reserved for
            zero.
          mask_zero: Stored on the layer as `mask_zero`.
          **kwargs: Forwarded to the base class constructor. `dtype` defaults
            to string and, if given, must be string or int64.

        Raises:
          ValueError: If the dtype is unsupported, `max_tokens` is set but not
            greater than 1, `num_oov_tokens` is negative, or the vocabulary
            contains repeated terms.
        """
        allowed_dtypes = [dtypes.string, dtypes.int64]
        if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
            # NOTE(review): the message names TextVectorization although this
            # constructor belongs to IndexLookup; text left unchanged here.
            raise ValueError(
                "TextVectorization may only have a dtype of string or int64.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("max_tokens must be greater than 1.")

        # Zero OOV tokens is accepted (see the -1 OOV value below); only
        # negative counts are rejected, and the message now says so.
        if num_oov_tokens < 0:
            raise ValueError(
                "num_oov_tokens must be greater than or equal to 0. "
                "You passed %s" % num_oov_tokens)

        self.max_tokens = max_tokens
        self.num_oov_tokens = num_oov_tokens
        self.reserve_zero = reserve_zero
        self.mask_zero = mask_zero

        # We need to reserve at least num_oov_tokens tokens, plus one additional
        # value if we are reserving the zero value in our output.
        if reserve_zero:
            self._reserved_values = (num_oov_tokens + 1)
        else:
            self._reserved_values = num_oov_tokens

        # We need to account for the OOV buckets in our vocabulary size.
        if max_tokens is not None:
            self._max_elements = max_tokens - num_oov_tokens
        else:
            self._max_elements = None

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need
        # to do a further hashing step; to make this easier, we set the OOV value
        # to -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_tokens == 1:
            self._oov_value = 1 if reserve_zero else 0
        else:
            self._oov_value = -1

        # self.dtype and self._name used below are only read after the base
        # constructor has run.
        super(IndexLookup,
              self).__init__(combiner=_IndexLookupCombiner(self.max_tokens),
                             **kwargs)

        # This layer supports RaggedTensor inputs.
        self._supports_ragged_inputs = True

        # If the layer's input type is int32, we can only output int32 values -
        # MutableHashTable doesn't allow us to map int32->int64.
        # NOTE(review): int32 is not in allowed_dtypes above, so this branch
        # looks unreachable from this constructor - confirm.
        if self.dtype == dtypes.int32:
            self._output_dtype = dtypes.int32
        else:
            self._output_dtype = dtypes.int64
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self.dtype,
            value_dtype=self._output_dtype,
            default_value=self._oov_value,
            name=(self._name + "_index_table"))
        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        self._inverse_table = None

        if vocabulary is not None:
            if isinstance(vocabulary, str):
                vocabulary = self._get_vocabulary_from_file(vocabulary)

            # Reject duplicates up front and report exactly which terms repeat.
            vocabulary_set = set(vocabulary)
            if len(vocabulary) != len(vocabulary_set):
                repeated_items = [
                    item
                    for item, count in collections.Counter(vocabulary).items()
                    if count > 1
                ]
                raise ValueError(
                    "The passed vocabulary has at least one repeated "
                    "term. Please uniquify your dataset before passing "
                    "it to IndexLookup(). The repeated terms are %s" %
                    repeated_items)
            self.set_vocabulary(vocabulary)

示例#6

0

显示文件

    def __init__(self,
                 max_tokens=None,
                 num_oov_tokens=1,
                 vocabulary=None,
                 reserve_zero=True,
                 mask_zero=False,
                 **kwargs):
        """Validate the configuration and create the backing hash table.

        Args:
          max_tokens: Optional cap on the vocabulary size. If set, it must be
            greater than 1.
          num_oov_tokens: Number of out-of-vocabulary tokens; currently only 1
            is accepted.
          vocabulary: Optional vocabulary passed to `set_vocabulary`. Whether
            one was supplied is remembered in `_export_vocab`.
          reserve_zero: Whether one additional output value is reserved for
            zero.
          mask_zero: Stored on the layer as `mask_zero`.
          **kwargs: Forwarded to the base class constructor. `dtype` defaults
            to string and, if given, must be string or int64.

        Raises:
          ValueError: If the dtype is unsupported, `max_tokens` is set but not
            greater than 1, or `num_oov_tokens` is not 1.
        """
        supported_dtypes = (dtypes.string, dtypes.int64)
        if "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string
        elif kwargs["dtype"] not in supported_dtypes:
            raise ValueError(
                "TextVectorization may only have a dtype of string or int64.")

        # A max_tokens of 1 or less would leave no room for any vocabulary
        # entries, so it is rejected.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("max_tokens must be greater than 1.")

        # Only a single OOV token is supported for now.
        if num_oov_tokens != 1:
            raise ValueError(
                "num_oov_tokens must be 1 for the time being. Other "
                "values will be supported in the near future. "
                "You passed %s" % num_oov_tokens)

        self.max_tokens = max_tokens
        self.num_oov_tokens = num_oov_tokens
        self.reserve_zero = reserve_zero
        self.mask_zero = mask_zero

        # Reserved outputs: every OOV token, plus one more when zero is
        # reserved.
        self._reserved_values = (
            num_oov_tokens + 1 if reserve_zero else num_oov_tokens)

        # The OOV buckets count against the vocabulary budget.
        self._max_elements = (
            None if max_tokens is None else max_tokens - num_oov_tokens)

        # With exactly one OOV bucket its index is known up front (1 when zero
        # is reserved, otherwise 0) and becomes the table's default value.
        # Otherwise -1 is used as a marker for entries that need a further
        # hashing step.
        if self.num_oov_tokens != 1:
            self._oov_value = -1
        elif reserve_zero:
            self._oov_value = 1
        else:
            self._oov_value = 0

        combiner = _IndexLookupCombiner(self.max_tokens)
        super(IndexLookup, self).__init__(combiner=combiner, **kwargs)

        # This layer supports RaggedTensor inputs.
        self._supports_ragged_inputs = True

        # int32 inputs can only produce int32 outputs because
        # MutableHashTable cannot map int32->int64.
        self._output_dtype = (
            dtypes.int32 if self.dtype == dtypes.int32 else dtypes.int64)

        table_name = self._name + "_index_table"
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self.dtype,
            value_dtype=self._output_dtype,
            default_value=self._oov_value,
            name=table_name)
        tracked_table = self._add_trackable(self._table, trainable=False)
        # Workaround for summary(): the table is not mutated during training,
        # so its effective parameter count is 0. Giving it an explicit empty
        # shape keeps the Model's parameter counting from raising an
        # attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        # Saving does not work yet for MutableHashTables. Swapping the
        # serialization hook for an explicit failure gives users a clearer
        # error message.
        def _raise_saving_unsupported(_):
            raise NotImplementedError(
                "Saving is not yet supported for IndexLookup layers.")

        self._table._list_extra_dependencies_for_serialization = _raise_saving_unsupported  # pylint: disable=protected-access
        self._inverse_table = None

        has_vocabulary = vocabulary is not None
        self._export_vocab = has_vocabulary
        if has_vocabulary:
            self.set_vocabulary(vocabulary)

示例#7

0

显示文件

文件： index_lookup.py 项目： saishyammenon/tensorflow-1

    def __init__(self,
                 max_tokens=None,
                 num_oov_tokens=1,
                 vocabulary=None,
                 reserve_zero=True,
                 mask_zero=False,
                 **kwargs):
        """Validate the configuration and create the table and its handler.

        Args:
          max_tokens: Optional cap on the vocabulary size. If set, it must be
            greater than 1.
          num_oov_tokens: Number of out-of-vocabulary tokens to reserve. Must
            be greater than or equal to 0.
          vocabulary: Optional vocabulary. A `str` is resolved through
            `table_utils.get_vocabulary_from_file`; the vocabulary is then
            checked with `table_utils.validate_vocabulary_is_unique` and
            passed to `set_vocabulary`.
          reserve_zero: Whether one additional output value is reserved for
            zero.
          mask_zero: Stored on the layer as `mask_zero`.
          **kwargs: Forwarded to the base class constructor. `dtype` defaults
            to string and, if given, must be string, int32 or int64.

        Raises:
          ValueError: If the dtype is unsupported, `max_tokens` is set but not
            greater than 1, or `num_oov_tokens` is negative.
        """
        # `invert` is hard-coded to False in this constructor, so the inverted
        # branches below are currently never taken; the value is still stored
        # on the layer as `self.invert`.
        invert = False
        if invert:
            allowed_dtypes = [dtypes.int32, dtypes.int64]
        else:
            allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]

        if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
            raise ValueError("TextVectorization may only have a dtype in %s." %
                             allowed_dtypes)

        if "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.int64 if invert else dtypes.string

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, max_tokens must be greater than 1.")

        # Zero OOV tokens is accepted (see the -1 OOV value below); only
        # negative counts are rejected, and the message now says so.
        if num_oov_tokens < 0:
            raise ValueError(
                "num_oov_tokens must be greater than or equal to 0. "
                "You passed %s" % num_oov_tokens)

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_tokens = num_oov_tokens
        self.reserve_zero = reserve_zero
        self.mask_zero = mask_zero

        # We need to reserve at least num_oov_tokens tokens, plus one additional
        # value if we are reserving the zero value in our output.
        if reserve_zero:
            self._reserved_values = (num_oov_tokens + 1)
        else:
            self._reserved_values = num_oov_tokens

        # We need to account for the OOV buckets in our vocabulary size.
        if max_tokens is not None:
            self._max_elements = max_tokens - num_oov_tokens
        else:
            self._max_elements = None

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need
        # to do a further hashing step; to make this easier, we set the OOV value
        # to -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_tokens == 1:
            self._oov_value = 1 if reserve_zero else 0
        else:
            self._oov_value = -1

        # self.dtype and self._name used below are only read after the base
        # constructor has run.
        super(IndexLookup,
              self).__init__(combiner=_IndexLookupCombiner(self.max_tokens),
                             **kwargs)

        # If the layer's input type is int32, we can only output int32 values -
        # MutableHashTable doesn't allow us to map int32->int64.
        if self.dtype == dtypes.int32:
            self._output_dtype = dtypes.int32
        else:
            self._output_dtype = dtypes.int64
        self._table = lookup_ops.MutableHashTable(
            key_dtype=self.dtype,
            value_dtype=self._output_dtype,
            default_value=self._oov_value,
            name=(self._name + "_index_table"))
        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        # Explicit OOV indices are only handed to the table handler when there
        # is more than one OOV token; they occupy the range from the first
        # non-reserved-zero index up to the reserved-value count.
        if self.num_oov_tokens <= 1:
            oov_tokens = None
        else:
            oov_start = 1 if reserve_zero else 0
            oov_tokens = list(range(oov_start, self._reserved_values))

        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_tokens,
            use_v1_apis=self._use_v1_apis())

        if vocabulary is not None:
            if isinstance(vocabulary, str):
                vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
            table_utils.validate_vocabulary_is_unique(vocabulary)

            self.set_vocabulary(vocabulary)

示例#8

0

显示文件

文件： saved_model_test.py 项目： georgeslabreche/tensorflow-opssat-smartcam

 def __init__(self):
     """Create two 4-element variables and an int32->int32 mutable hash table."""
     self.v1 = tf.Variable([0] * 4)
     self.v2 = tf.Variable([1] * 4)
     table_options = dict(
         key_dtype=tf.int32, value_dtype=tf.int32, default_value=-1)
     self.table = lookup_ops.MutableHashTable(**table_options)

示例#9

0

显示文件

文件： index_lookup.py 项目： JaiminRana01/AI-Project

    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Validate arguments and build the lookup table backing this layer.

        Args:
          max_tokens: Maximum vocabulary size, or None. If set, it must be
            greater than 1.
          num_oov_indices: Number of out-of-vocabulary indices; must be
            greater than or equal to 0.
          mask_token: Mask token, or None. When not None it is cast to
            `np.int32` / `np.int64` if `kwargs["dtype"]` is the matching
            integer dtype.
          oov_token: Stored as `self.oov_token`; used as the table's default
            value when `invert` is true.
          vocabulary: Optional vocabulary. A `str` is treated as the path of
            a vocabulary file used to initialize a static table; any other
            non-None value is passed to `set_vocabulary` on a mutable table.
          invert: If true, the table maps int64 keys to values of the layer
            dtype; otherwise it maps layer-dtype keys to int64 values.
          output_mode: One of INT, BINARY, COUNT or TFIDF. Must be INT when
            `invert` is true.
          sparse: Stored as `self.sparse`.
          pad_to_max_tokens: Stored as `self.pad_to_max_tokens`; in TFIDF mode
            it selects the shape and initializer of the idf weight.
          **kwargs: Forwarded to the base class constructor after the private
            keys `vocabulary_size` and `has_static_table` have been removed.
            Must contain `dtype` whenever `mask_token` is not None.

        Raises:
          ValueError: If `max_tokens` is set but not greater than 1, if
            `num_oov_indices` is negative, if `invert` is combined with a
            non-INT `output_mode`, or if a vocabulary file path does not
            exist.
        """
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # 'output_mode' must be one of (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT,
                                                           TFIDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                "`output_mode` must be {} when `invert` is true. You "
                "passed {}".format(INT, output_mode))

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False

        # A note on vocab_size: we need to always keep a non-Tensor representation
        # of vocab_size around to use in graph building. Because we might be
        # in a tf.function, we can't rely on evaluating the actual tables to
        # find the value either.
        self._vocab_size = None
        # We need to keep track of our current vocab size outside of our layer
        # weights to support a static output shape when `output_mode != INT`. The
        # bincount ops do not set shape on their outputs, which means we have to
        # set it ourselves. We persist the current vocab size as a hidden part of
        # the config when serializing our model.
        if "vocabulary_size" in kwargs:
            self._vocab_size = kwargs["vocabulary_size"]
            del kwargs["vocabulary_size"]

        # Private config flag: true when the layer being restored was saved
        # with a static (file-initialized) table.
        restore_from_static_table = kwargs.pop("has_static_table", False)

        # Make sure the mask token is truly of the dtype we want. We can ignore
        # strings here, because they have only one dtype.
        if mask_token is not None:
            dtype = kwargs["dtype"]
            if dtype == tf.int32:
                mask_token = np.int32(mask_token)
            elif dtype == tf.int64:
                mask_token = np.int64(mask_token)
        self.mask_token = mask_token

        # Vocabulary slots left once the indices before the first real token
        # have been subtracted from max_tokens.
        if max_tokens is not None:
            available_vocab_size = max_tokens - self._token_start_index()
        else:
            available_vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF)),
                                          **kwargs)

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = tf.int64
            self._value_dtype = self.dtype
            self._mask_key = 0
            self._mask_value = mask_token
            key_index = tf.lookup.TextFileIndex.LINE_NUMBER
            value_index = tf.lookup.TextFileIndex.WHOLE_LINE
            default_value = self.oov_token
            oov_indices = None
        else:
            self._key_dtype = self.dtype
            self._value_dtype = tf.int64
            self._mask_key = mask_token
            key_index = tf.lookup.TextFileIndex.WHOLE_LINE
            value_index = tf.lookup.TextFileIndex.LINE_NUMBER
            # Masks should map to 0 for int output and be dropped otherwise. Max ints
            # will be dropped from the bincount op.
            self._mask_value = 0 if self.output_mode == INT else tf.int64.max
            oov_start = self._oov_start_index()
            token_start = self._token_start_index()
            if self.num_oov_indices == 0:
                # If there are no OOV indices, we map OOV tokens to -1 for int output
                # and drop them from bagged output. Max ints will be dropped from the
                # bincount op.
                default_value = -1 if self.output_mode == INT else tf.int64.max
                oov_indices = None
            elif self.num_oov_indices == 1:
                # If there is only one OOV index, we can set that index as the default
                # value of the index_lookup table.
                default_value = oov_start
                oov_indices = None
            else:
                # If we have multiple OOV values, we need to do a further hashing
                # step; to make this easier, we set the OOV value to -1. (This lets
                # us do a vectorized add and cast to boolean to determine locations
                # where we need to do extra hashing.)
                default_value = -1
                oov_indices = list(range(oov_start, token_start))

        self._static_vocabulary_path = None
        has_vocab_path = (vocabulary is not None
                          and isinstance(vocabulary, str))
        if has_vocab_path or restore_from_static_table:
            self._has_static_table = True
            if vocabulary is None:
                # If we're restoring a layer that was saved with a static table
                # initializer, we create a fake initializer object to let the code
                # progress. The savedmodel restoration code will handle restoring
                # the actual data.
                initializer = _NullInitializer(self._key_dtype,
                                               self._value_dtype)
            else:
                if not os.path.exists(vocabulary):
                    raise ValueError("Vocabulary file %s does not exist." %
                                     (vocabulary, ))
                self._static_vocabulary_path = vocabulary
                num_tokens = table_utils.num_tokens_in_file(vocabulary)
                self._vocab_size = self._token_start_index() + num_tokens

                # Line numbers are offset by the token start index so file
                # tokens are placed after the indices that precede it.
                initializer = tf.lookup.TextFileInitializer(
                    filename=vocabulary,
                    key_dtype=self._key_dtype,
                    key_index=key_index,
                    value_dtype=self._value_dtype,
                    value_index=value_index,
                    value_index_offset=self._token_start_index())

            self._table = self._static_table_class()(
                initializer, default_value=default_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=self._mask_key,
                mask_value=self._mask_value,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())

            tracked_table = self._add_trackable(self._table, trainable=False)

        else:
            self._has_static_table = False
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=default_value,
                name=(self._name + "_index_table"))
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)
            tracked_table = self._add_trackable(self._table, trainable=False)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) tensorshape. This creates
            # a 1D variable with arbitrary shape, which we can assign any weight to
            # so long as it has 1 dimension. In order to properly initialize this
            # weight in Keras, we need to provide a custom callable initializer which
            # does not depend on the shape of the weight (as all other initializers
            # do) since the weight is not known. Hence the lambda shape, dtype: [0].
            if not self.pad_to_max_tokens or max_tokens is None:
                initializer = lambda shape, dtype: [0]
            else:
                initializer = tf.compat.v1.zeros_initializer

            # We are adding these here instead of in build() since they do not depend
            # on the input shape at all.
            idf_shape = (max_tokens, ) if self.pad_to_max_tokens else (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tf.TensorShape(idf_shape),
                dtype=backend.floatx(),
                initializer=initializer)

        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tf.TensorShape((0, ))

示例#10

0

显示文件

文件： text_vectorization.py 项目： matfurrier/tensorflowpython

    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=True,
                 **kwargs):
        """Validate the configuration and create the backing hash table.

        Args:
          max_tokens: Optional maximum number of tokens. If set, it must be
            greater than 1, since one slot is taken by the OOV token.
          standardize: None, LOWER_AND_STRIP_PUNCTUATION, or a callable.
          split: None, SPLIT_ON_WHITESPACE, or a callable.
          ngrams: None, an int, or a tuple of ints. An int `n` is expanded to
            `(1, ..., n)`.
          output_mode: None, INT, COUNT, BINARY or TFIDF.
          output_sequence_length: None or an int; may only be set when
            `output_mode` is INT.
          pad_to_max_tokens: When true and `max_tokens` is set, the
            vectorization layer created for non-INT modes is given
            `max_tokens`; otherwise it is given None.
          **kwargs: Forwarded to the base class constructor. `dtype` defaults
            to string and, if given, must be string.

        Raises:
          ValueError: If the dtype is not string, `ngrams` or
            `output_sequence_length` has an unsupported type or is used with
            the wrong `output_mode`, or `max_tokens` is set but not greater
            than 1.
        """

        # This layer only applies to string processing, and so should only have
        # a dtype of 'string'.
        if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")
        elif "dtype" not in kwargs:
            kwargs["dtype"] = dtypes.string

        # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
        layer_utils.validate_string_arg(
            standardize,
            allowable_strings=[LOWER_AND_STRIP_PUNCTUATION],
            layer_name="TextVectorization",
            arg_name="standardize",
            allow_none=True,
            allow_callables=True)

        # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
        layer_utils.validate_string_arg(
            split,
            allowable_strings=[SPLIT_ON_WHITESPACE],
            layer_name="TextVectorization",
            arg_name="split",
            allow_none=True,
            allow_callables=True)

        # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=[INT, COUNT, BINARY, TFIDF],
            layer_name="TextVectorization",
            arg_name="output_mode",
            allow_none=True)

        # 'ngrams' must be one of (None, int, tuple(int))
        if not (ngrams is None or isinstance(ngrams, int)
                or isinstance(ngrams, tuple)
                and all(isinstance(item, int) for item in ngrams)):
            raise ValueError(
                ("`ngrams` must be None, an integer, or a tuple of "
                 "integers. Got %s") % (ngrams, ))

        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.
        if (output_mode == INT
                and not (isinstance(output_sequence_length, int) or
                         (output_sequence_length is None))):
            raise ValueError(
                "`output_sequence_length` must be either None or an "
                "integer when `output_mode` is 'int'. "
                "Got %s" % output_sequence_length)

        if output_mode != INT and output_sequence_length is not None:
            raise ValueError("`output_sequence_length` must not be set if "
                             "`output_mode` is not 'int'.")

        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense. The guard
        # uses `<= 1` so that it matches the error message: max_tokens == 1
        # would leave `_max_vocab_size` (max_tokens - 1) at zero.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("max_tokens must be > 1.")

        self._max_tokens = max_tokens

        # In INT mode, we have two reserved values (PAD and OOV). However, non-INT
        # modes don't have a PAD value, so we only need to reserve one value.
        self._reserved_values = 2 if output_mode == INT else 1

        # In INT mode, the zero value is reserved for padding (per Keras standard
        # padding approaches). In non-INT modes, there is no padding so we can set
        # the OOV value to zero instead of one.
        self._oov_value = 1 if output_mode == INT else 0

        # We always reduce the max token number by 1 to account for the OOV token
        # if it is set. Keras' use of the reserved number 0 for padding tokens,
        # if the output is in INT mode, does not really count as a 'token' for
        # vocabulary purposes, so we only reduce vocab size by 1 here.
        self._max_vocab_size = max_tokens - 1 if max_tokens is not None else None

        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        # An integer n means "all n-gram sizes from 1 through n".
        if isinstance(ngrams, int):
            self._ngrams = tuple(range(1, ngrams + 1))
        else:
            self._ngrams = ngrams

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        self._pad_to_max = pad_to_max_tokens
        self._vocab_size = 0
        self._called = False

        # self._name used below is only read after the base constructor has run.
        super(TextVectorization,
              self).__init__(combiner=_TextVectorizationCombiner(
                  self._max_vocab_size, compute_idf=output_mode == TFIDF),
                             **kwargs)

        self._table = lookup_ops.MutableHashTable(
            key_dtype=dtypes.string,
            value_dtype=dtypes.int64,
            default_value=self._oov_value,
            name=(self._name + "_index_table"))

        # Saving is not supported for this table yet; replace the serialization
        # hook with an explicit failure so users get a clear error message.
        def fail(_):
            raise NotImplementedError(
                "Saving is not yet supported for TextVectorization layers.")

        self._table._list_extra_dependencies_for_serialization = fail  # pylint: disable=protected-access

        tracked_table = self._add_trackable(self._table, trainable=False)

        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        # If this layer is configured for string or integer output, we do not
        # create a vectorization layer (as the output is not vectorized).
        if self._output_mode in [None, INT]:
            return

        if max_tokens is not None and self._pad_to_max:
            vectorize_max_tokens = max_tokens
        else:
            vectorize_max_tokens = None
        self._vectorize_layer = self._get_vectorization_class()(
            max_tokens=vectorize_max_tokens, output_mode=self._output_mode)

示例#11

0

显示文件

文件： text_vectorization.py 项目： zhongyanjiu/tensorflow

  def __init__(self,
               max_tokens=None,
               standardize=LOWER_AND_STRIP_PUNCTUATION,
               split=SPLIT_ON_WHITESPACE,
               ngrams=None,
               output_mode=INT,
               output_sequence_length=None,
               pad_to_max_tokens=True,
               **kwargs):
    """Checks the arguments, then builds the combiner and the index table.

    Args:
      max_tokens: Optional token budget. When given, one slot is subtracted
        for the OOV token before the size is passed to the combiner.
      standardize: None, LOWER_AND_STRIP_PUNCTUATION, or a callable.
      split: None, SPLIT_ON_WHITESPACE, or a callable.
      ngrams: None, an int, or a tuple of ints. An int `n` is stored as the
        tuple `(1, ..., n)`.
      output_mode: None or one of INT, COUNT, BINARY, TFIDF.
      output_sequence_length: None or an int; may only be set in INT mode.
      pad_to_max_tokens: Stored as `self._pad_to_max`.
      **kwargs: Passed on to the base class. `dtype` is filled in as string
        when absent.

    Raises:
      ValueError: If `dtype` is not string, `ngrams` has the wrong type, or
        `output_sequence_length` is inconsistent with `output_mode`.
    """
    # String processing only: default the dtype to string and reject any
    # other explicit choice.
    if kwargs.setdefault("dtype", dtypes.string) != dtypes.string:
      raise ValueError("TextVectorization may only have a dtype of string.")

    # standardize: (None, LOWER_AND_STRIP_PUNCTUATION, callable)
    _validate_string_arg(
        standardize,
        arg_name="standardize",
        allowable_strings=[LOWER_AND_STRIP_PUNCTUATION])

    # split: (None, SPLIT_ON_WHITESPACE, callable)
    _validate_string_arg(
        split, arg_name="split", allowable_strings=[SPLIT_ON_WHITESPACE])

    # output_mode: (None, INT, COUNT, BINARY, TFIDF); callables not accepted.
    _validate_string_arg(
        output_mode,
        arg_name="output_mode",
        allowable_strings=[INT, COUNT, BINARY, TFIDF],
        allow_callables=False)

    # ngrams: (None, int, tuple(int))
    if ngrams is None or isinstance(ngrams, int):
      ngrams_valid = True
    else:
      ngrams_valid = isinstance(ngrams, tuple) and all(
          isinstance(entry, int) for entry in ngrams)
    if not ngrams_valid:
      raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
                        "integers. Got %s") % (ngrams,))

    # output_sequence_length: (None, int) in INT mode, and must stay unset in
    # every other mode.
    if output_mode == INT:
      if not (output_sequence_length is None or
              isinstance(output_sequence_length, int)):
        raise ValueError("`output_sequence_length` must be either None or an "
                         "integer when `output_mode` is 'int'. "
                         "Got %s" % output_sequence_length)
    elif output_sequence_length is not None:
      raise ValueError("`output_sequence_length` must not be set if "
                       "`output_mode` is not 'int'.")

    self._max_tokens = max_tokens

    if output_mode == INT:
      # INT mode reserves two values, PAD and OOV. Zero is kept for padding
      # (the usual Keras convention), which pushes OOV to one.
      self._reserved_values = 2
      self._oov_value = 1
    else:
      # No padding outside INT mode: only OOV is reserved and it can sit at
      # zero.
      self._reserved_values = 1
      self._oov_value = 0

    # One slot of the budget always goes to the OOV token. PAD is the absence
    # of a token rather than a token, so it is not counted.
    if max_tokens is None:
      self._max_vocab_size = None
    else:
      self._max_vocab_size = max_tokens - 1

    self._standardize = standardize
    self._split = split
    self._ngrams_arg = ngrams
    # An integer n-gram order means "every order from 1 up to n".
    self._ngrams = (
        tuple(range(1, ngrams + 1)) if isinstance(ngrams, int) else ngrams)

    self._output_mode = output_mode
    self._output_sequence_length = output_sequence_length
    self._pad_to_max = pad_to_max_tokens
    self._has_vocab = False

    combiner = _TextVectorizationCombiner(
        self._max_vocab_size, compute_idf=output_mode == TFIDF)
    super(TextVectorization, self).__init__(combiner=combiner, **kwargs)

    table_name = self._name + "_index_table"
    self._table = lookup_ops.MutableHashTable(
        key_dtype=dtypes.string,
        value_dtype=dtypes.int64,
        default_value=self._oov_value,
        name=table_name)

    def _refuse_serialization(_):
      raise NotImplementedError(
          "Saving is not yet supported for TextVectorization layers.")

    # Replace the table's serialization hook so that it raises when called.
    self._table._list_extra_dependencies_for_serialization = _refuse_serialization  # pylint: disable=protected-access

    self._add_trackable(self._table, trainable=False)

    # Created here rather than in build() because it does not depend on the
    # input shape.
    if self._output_mode == TFIDF:
      # The TFIDF weight is declared with a (None,) shape: a 1D variable of
      # arbitrary length that accepts any 1D value. Its initializer therefore
      # has to ignore the (unknown) shape it is given, hence the constant
      # `[0]` lambda.
      self._tf_idf_weights = self.add_weight(
          name="tfidf_data",
          shape=tensor_shape.TensorShape((None,)),
          dtype=K.floatx(),
          trainable=False,
          initializer=lambda shape, dtype: [0])

示例#12

0

显示文件

文件： index_lookup.py 项目： wyt97/tensorflow

    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 **kwargs):
        """Validates the arguments and builds the lookup table and handler.

        Args:
          max_tokens: None, or an integer greater than 1. Passed to the
            combiner as-is.
          num_oov_indices: Non-negative number of out-of-vocabulary indices.
            With exactly one, the table default is that single OOV index;
            otherwise the default is -1.
          mask_token: None when no index is reserved for masking. When set,
            OOV indices start at 1 instead of 0.
          oov_token: Stored on `self.oov_token`.
          vocabulary: Optional vocabulary; when not None it is passed to
            `self.set_vocabulary` once the table exists.
          **kwargs: Forwarded to the base class constructor.

        Raises:
          ValueError: If `max_tokens` is set but not greater than 1, or if
            `num_oov_indices` is negative.
        """
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, max_tokens must be greater than 1.")

        # Zero OOV indices is allowed; only negative counts are rejected, and
        # the message says so.
        if num_oov_indices < 0:
            raise ValueError(
                "num_oov_indices must be greater than or equal to 0. "
                "You passed %s" % (num_oov_indices, ))

        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need
        # to do a further hashing step; to make this easier, we set the OOV value
        # to -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_indices == 1:
            self._oov_value = 0 if mask_token is None else 1
        else:
            self._oov_value = -1

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            self.max_tokens, self.mask_token),
                                          **kwargs)

        self._output_dtype = dtypes.int64

        self._table = lookup_ops.MutableHashTable(
            key_dtype=self.dtype,
            value_dtype=self._output_dtype,
            default_value=self._oov_value,
            name=(self._name + "_index_table"))
        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        # Explicit OOV indices are only handed to the handler when there is
        # more than one of them; they start right after the mask slot, if any.
        if self.num_oov_indices <= 1:
            oov_indices = None
        else:
            oov_start = 1 if mask_token is not None else 0
            oov_end = oov_start + num_oov_indices
            oov_indices = list(range(oov_start, oov_end))

        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())

        if vocabulary is not None:
            self.set_vocabulary(vocabulary)

示例#13

0

显示文件

    def __init__(self,
                 max_tokens=None,
                 standardize=LOWER_AND_STRIP_PUNCTUATION,
                 split=SPLIT_ON_WHITESPACE,
                 ngrams=None,
                 output_mode=INT,
                 output_sequence_length=None,
                 pad_to_max_tokens=True,
                 **kwargs):
        """Stores the configuration and creates the combiner and index table.

        No argument other than `dtype` is validated here (see the TODO in
        the body).

        Args:
          max_tokens: Optional token budget. When given, one slot is
            subtracted for the OOV token before it reaches the combiner.
          standardize: Stored as `self._standardize`.
          split: Stored as `self._split`.
          ngrams: Stored as `self._ngrams_arg`. An int `n` is additionally
            expanded to the tuple `(1, ..., n)` in `self._ngrams`.
          output_mode: Stored as `self._output_mode`; INT and TFIDF change
            the reserved values and whether a TF-IDF weight is created.
          output_sequence_length: Stored as `self._output_sequence_length`.
          pad_to_max_tokens: Stored as `self._pad_to_max`.
          **kwargs: Passed on to the base class. `dtype` is filled in as
            string when absent.

        Raises:
          ValueError: If a `dtype` other than string is supplied.
        """
        # String processing only: default the dtype to string and reject any
        # other explicit choice.
        if kwargs.setdefault("dtype", dtypes.string) != dtypes.string:
            raise ValueError(
                "TextVectorization may only have a dtype of string.")

        # TODO(momernick): Validate the inputs. The following must apply:
        # 'standardize' must be one of (None, LOWER_AND_STRIP, callable)
        # 'split' must be one of (None, WHITESPACE, callable)
        # 'ngrams' must be one of (None, int, tuple(int))
        # 'output_mode' must be one of (None, INT, COUNT, BINARY, TFIDF)
        # 'output_sequence_length' must be one of (None, int) and is only
        # set if output_mode is INT.

        self._max_tokens = max_tokens

        if output_mode == INT:
            # INT mode reserves two values, PAD and OOV. Zero is kept for
            # padding (the usual Keras convention), which pushes OOV to one.
            self._reserved_values = 2
            self._oov_value = 1
        else:
            # No padding outside INT mode: only OOV is reserved and it can
            # sit at zero.
            self._reserved_values = 1
            self._oov_value = 0

        # One slot of the budget always goes to the OOV token. PAD is the
        # absence of a token rather than a token, so it is not counted.
        if max_tokens is None:
            self._max_vocab_size = None
        else:
            self._max_vocab_size = max_tokens - 1

        # Explicit list of the characters removed when
        # LOWER_AND_STRIP_PUNCTUATION is selected. Applications needing a
        # different stripping rule should pass a callable as `standardize`.
        self._strip_regex = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\t\n\']'

        self._standardize = standardize
        self._split = split
        self._ngrams_arg = ngrams
        # An integer n-gram order means "every order from 1 up to n".
        self._ngrams = (tuple(builtin_range(1, ngrams + 1))
                        if isinstance(ngrams, int) else ngrams)

        self._output_mode = output_mode
        self._output_sequence_length = output_sequence_length
        self._pad_to_max = pad_to_max_tokens
        self._has_vocab = False

        combiner = _TextVectorizationCombiner(
            self._max_vocab_size, compute_idf=output_mode == TFIDF)
        super(TextVectorization, self).__init__(combiner=combiner, **kwargs)

        table_name = self._name + "_index_table"
        self._table = lookup_ops.MutableHashTable(
            key_dtype=dtypes.string,
            value_dtype=dtypes.int64,
            default_value=self._oov_value,
            name=table_name)

        self._add_trackable(self._table, trainable=False)

        # Created here rather than in build() because it does not depend on
        # the input shape.
        if self._output_mode == TFIDF:
            # The TFIDF weight is declared with a (None,) shape: a 1D variable
            # of arbitrary length that accepts any 1D value. Its initializer
            # therefore has to ignore the (unknown) shape it is given, hence
            # the constant `[0]` lambda.
            self._tf_idf_weights = self.add_weight(
                name="tfidf_data",
                shape=tensor_shape.TensorShape((None, )),
                dtype=K.floatx(),
                trainable=False,
                initializer=lambda shape, dtype: [0])

示例#14

0

显示文件

文件： dynamic_embedding_ops.py 项目： rhdong/recommenders-addons-private

    def __init__(self,
                 key_dtype=dtypes.int64,
                 value_dtype=dtypes.float32,
                 dim=1,
                 devices=None,
                 partitioner=default_partition_fn,
                 shared_name=None,
                 name="DynamicEmbedding_Variable",
                 initializer=None,
                 trainable=True,
                 checkpoint=True):
        """Creates an empty `Variable` object.

        Creates a group of tables placed on devices; the types of their keys
        and values are specified by `key_dtype` and `value_dtype`.
        The environment variable 'TF_HASHTABLE_INIT_SIZE' can be used to set
        the initial size of each table, which can help reduce rehash times.
        The default initial table size is 1,048,576 for CPU and 16,777,216
        for GPU.

        Args:
          key_dtype: the type of the key tensors.
          value_dtype: the type of the value tensors.
          dim: the length of the value array for each key.
          devices: the list of devices holding the tables. One table is
            created on each device. When empty or None, the first local GPU
            is used if there is one, otherwise "/CPU:0". A single non-list
            value is wrapped in a list.
          partitioner: partition function of keys; returns the partition
            index for each key.

            Example partition func:
            ```python
            def default_partition_fn(keys, shard_num):
              return tf.cast(keys % shard_num, dtype=tf.int32)
            ```
          shared_name: Documented upstream as not used. It is stored on the
            instance and defaults to "shared_name.<name>".
          name: A name for the operation (optional).
          initializer: The value to use if a key is missing in the hash
            table, which can be a python number, numpy array or
            `tf.initializer` instance. If `None` (the default), `0` is used.
          trainable: if True, the variable is treated as a trainable
            Variable and added to the list of variables collected in the
            graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
          checkpoint: if True, the contents of the SparseVariable are saved
            to and restored from checkpoints. If `shared_name` is empty for
            a checkpointed table, it is shared using the table node name.

        Returns:
          A `Variable` object.

        Raises:
          TypeError: If `key_dtype` or `value_dtype` is not supported on the
            first selected device.
        """
        self.key_dtype = key_dtype
        self.value_dtype = value_dtype
        self.dim = dim

        def _get_default_devices():
            # Prefer the first local GPU; fall back to the CPU.
            gpu_list = [
                x.name for x in device_lib.list_local_devices()
                if x.device_type == 'GPU'
            ]
            return gpu_list[0:1] or [
                "/CPU:0",
            ]

        devices_ = devices or _get_default_devices()
        # Wrap the resolved value (not the raw argument) when it is not a list.
        self.devices = devices_ if isinstance(devices_, list) else [
            devices_,
        ]
        self.partition_fn = partitioner
        self.name = name
        self.shared_name = shared_name or "shared_name.{}".format(name)

        # NOTE(review): the `initializer` argument is not stored here;
        # presumably `_convert_anything_to_init` below populates this
        # attribute - confirm.
        self.initializer = None

        self.trainable = trainable
        self.checkpoint = checkpoint

        self._tables = []
        self.size_ops = []
        self.shard_num = len(self.devices)

        key_dtype_list = [dtypes.int32, dtypes.int64]
        value_dtype_list = [
            dtypes.int32, dtypes.int64, dtypes.bool, dtypes.float32,
            dtypes.float64, dtypes.half, dtypes.int8
        ]
        # The supported dtypes are narrower when the first device is a GPU.
        if 'GPU' in self.devices[0].upper():
            key_dtype_list = [dtypes.int64]
            value_dtype_list = [
                dtypes.int32, dtypes.float32, dtypes.half, dtypes.int8
            ]
        # Build one formatted message; passing the list as a second exception
        # argument would make the error render as a tuple.
        if key_dtype not in key_dtype_list:
            raise TypeError("key_dtype should be one of {}, but got {}".format(
                key_dtype_list, key_dtype))
        if value_dtype not in value_dtype_list:
            raise TypeError(
                "value_dtype should be one of {}, but got {}".format(
                    value_dtype_list, value_dtype))

        _initializer = initializer
        if _initializer is None:
            _initializer = init_ops.zeros_initializer(dtype=self.value_dtype)
        static_default_value = self._convert_anything_to_init(
            _initializer, dim)
        scope_name = self.name.split("/")[-1]
        with ops.name_scope(scope_name, "DynamicEmbedding_Variable"):
            with ops.colocate_with(None, ignore_existing=True):
                # One hash table shard per device.
                for idx, device in enumerate(self.devices):
                    with ops.device(device):
                        mht = lookup_ops.MutableHashTable(
                            key_dtype=self.key_dtype,
                            value_dtype=self.value_dtype,
                            default_value=static_default_value,
                            name=self._make_name(idx),
                            checkpoint=self.checkpoint)

                        self._tables.append(mht)
        super(Variable, self).__init__()

        self.trainable_wrappers = []

示例#15

0

显示文件

文件： index_lookup.py 项目： masterzenith/tensorflow

    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 **kwargs):
        """Validates the arguments and builds the lookup table and handler.

        Args:
          max_tokens: None, or an integer greater than 1. When set, the
            vocabulary size given to the combiner is `max_tokens` minus the
            OOV indices and the mask slot (if any).
          num_oov_indices: Non-negative number of out-of-vocabulary indices;
            must be exactly 1 when `invert` is True.
          mask_token: None when no index is reserved for masking. When set,
            OOV indices start at 1 instead of 0.
          oov_token: Stored on `self.oov_token`; used as the table's default
            value when `invert` is True.
          vocabulary: Optional vocabulary; when not None it is passed to
            `self.set_vocabulary` once the table exists.
          invert: When True, the table maps int64 keys to values of the
            layer dtype instead of the other way round.
          output_mode: One of INT, BINARY, COUNT.
          sparse: Stored on `self.sparse`.
          **kwargs: Forwarded to the base class constructor.

        Raises:
          ValueError: If `max_tokens` is set but not greater than 1, if
            `num_oov_indices` is negative, or if `invert` is True while
            `num_oov_indices` is not 1.
        """
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, max_tokens must be greater than 1. "
                             "You passed %s" % (max_tokens, ))

        # Zero OOV indices is allowed; only negative counts are rejected, and
        # the message says so.
        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed %s" % (num_oov_indices, ))

        # The message names the actual parameter, `num_oov_indices`.
        if invert and num_oov_indices != 1:
            raise ValueError(
                "`num_oov_indices` must be 1 when `invert` is True.")

        # 'output_mode' must be one of (INT, BINARY, COUNT)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, BINARY, COUNT),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token
        self.output_mode = output_mode
        self.sparse = sparse

        # If there is only one OOV bucket, we can determine the OOV value (either 0
        # or 1 depending on whether 0 is reserved) and set that as the default
        # value of the index_lookup table. If we have multiple OOV values, we need
        # to do a further hashing step; to make this easier, we set the OOV value
        # to -1. (This lets us do a vectorized add and cast to boolean to determine
        # locations where we need to do extra hashing.)
        if self.num_oov_indices == 1:
            self._oov_value = 0 if mask_token is None else 1
        else:
            self._oov_value = -1

        # The combiner only gets the slots left after the OOV and mask entries.
        if max_tokens is not None:
            num_mask_tokens = (0 if mask_token is None else 1)
            vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
        else:
            vocab_size = None

        super(IndexLookup, self).__init__(combiner=_IndexLookupCombiner(
            vocab_size, self.mask_token),
                                          **kwargs)

        self._output_dtype = dtypes.int64

        # We need to save the key dtype so that we know if we're expecting int64
        # keys. If we are, we will cast int32 inputs to int64 as well.
        if invert:
            self._key_dtype = self._output_dtype
            value_dtype = self.dtype
            oov_value = self.oov_token
        else:
            self._key_dtype = self.dtype
            value_dtype = self._output_dtype
            oov_value = self._oov_value

        self._table = lookup_ops.MutableHashTable(key_dtype=self._key_dtype,
                                                  value_dtype=value_dtype,
                                                  default_value=oov_value,
                                                  name=(self._name +
                                                        "_index_table"))
        tracked_table = self._add_trackable(self._table, trainable=False)
        # This is a workaround for summary() on this layer. Because the table is
        # not mutable during training, the effective number of parameters (and so
        # the weight shape) is 0; we add this as an attr so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tensor_shape.TensorShape((0, ))

        # Explicit OOV indices are only handed to the handler when there is
        # more than one of them; they start right after the mask slot, if any.
        if self.num_oov_indices <= 1:
            oov_indices = None
        else:
            oov_start = 1 if mask_token is not None else 0
            oov_end = oov_start + num_oov_indices
            oov_indices = list(range(oov_start, oov_end))

        self._table_handler = table_utils.TableHandler(
            table=self._table,
            oov_tokens=oov_indices,
            use_v1_apis=self._use_v1_apis())

        if vocabulary is not None:
            self.set_vocabulary(vocabulary)

示例#16

0

显示文件

文件： index_lookup.py 项目： mehrdad-moradi/keras

    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary=None,
                 invert=False,
                 output_mode=INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Validates the arguments and builds the table, handler and weights.

        Args:
          max_tokens: None, or an integer greater than 1. When a
            `tf.lookup.TextFileInitializer` vocabulary is given, this is
            overwritten with the table size plus the special tokens.
          num_oov_indices: Non-negative number of out-of-vocabulary indices.
          mask_token: None when no index is reserved for masking. When set,
            OOV indices start at 1 instead of 0.
          oov_token: Stored on `self.oov_token`; used as the table's default
            value when `invert` is True.
          vocabulary: Optional vocabulary. A `tf.lookup.TextFileInitializer`
            selects the static table class; any other non-None value is
            passed to `self.set_vocabulary` on a mutable table.
          invert: When True, the table maps int64 keys to values of the
            layer dtype instead of the other way round.
          output_mode: One of INT, BINARY, COUNT, TFIDF.
          sparse: Stored on `self.sparse`.
          pad_to_max_tokens: Stored on `self.pad_to_max_tokens`; in TFIDF
            mode it selects a `(max_tokens,)` IDF weight shape instead of
            `(None,)`.
          **kwargs: Forwarded to the base class constructor, except for the
            hidden `vocab_size` entry, which is consumed here.

        Raises:
          ValueError: If `max_tokens` is set but not greater than 1, or if
            `num_oov_indices` is negative.
        """
        # A max_tokens of 1 or less would describe a 0-element vocabulary,
        # which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError("If set, `max_tokens` must be greater than 1. "
                             "You passed {}".format(max_tokens))

        if num_oov_indices < 0:
            raise ValueError(
                "`num_oov_indices` must be greater than or equal to 0. "
                "You passed {}".format(num_oov_indices))

        # output_mode: (INT, BINARY, COUNT, TFIDF)
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, BINARY, COUNT, TFIDF),
            layer_name=self.__class__.__name__,
            arg_name="output_mode")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.oov_token = oov_token
        self.mask_token = mask_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self._called = False
        # Special tokens are the OOV indices plus the mask slot, if any.
        self._num_special_tokens = self.num_oov_indices + (
            0 if self.mask_token is None else 1)
        # The current vocab size is tracked outside of the layer weights to
        # support a static output shape when `output_mode != INT`: the
        # bincount ops do not set a shape on their outputs, so it has to be
        # set here. It is persisted as a hidden part of the config when the
        # model is serialized, so take it out of kwargs if present.
        self._vocab_size = kwargs.pop("vocab_size", 0)

        # With a single OOV bucket the OOV value is known up front (0, or 1
        # when index 0 is reserved for the mask) and becomes the table
        # default. With any other count a further hashing step is needed, so
        # the default is -1. (This allows a vectorized add and cast to
        # boolean to find the positions needing the extra hashing.)
        if self.num_oov_indices != 1:
            self._oov_value = -1
        elif mask_token is None:
            self._oov_value = 0
        else:
            self._oov_value = 1

        available_vocab_size = (None if max_tokens is None else
                                max_tokens - self._num_special_tokens)

        combiner = _IndexLookupCombiner(
            vocab_size=available_vocab_size,
            mask_value=mask_token,
            oov_value=oov_token,
            compute_idf=(output_mode == TFIDF))
        super(IndexLookup, self).__init__(combiner=combiner, **kwargs)

        # The key dtype is saved so that we know whether int64 keys are
        # expected; if so, int32 inputs are cast to int64 as well.
        oov_indices = None
        if invert:
            self._key_dtype = tf.int64
            self._value_dtype = self.dtype
            oov_value = self.oov_token
        else:
            self._key_dtype = self.dtype
            self._value_dtype = tf.int64
            oov_value = self._oov_value
            if self.num_oov_indices > 1:
                # Explicit OOV indices start right after the mask slot.
                first_oov = 0 if mask_token is None else 1
                oov_indices = list(
                    range(first_oov, first_oov + num_oov_indices))

        if isinstance(vocabulary, tf.lookup.TextFileInitializer):
            # File-backed vocabulary: use the static table class and derive
            # max_tokens from the resulting table size.
            self._table = self._static_table_class()(vocabulary,
                                                     default_value=oov_value)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                mask_token=mask_token,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            self.max_tokens = (self._table_handler.table_size() +
                               self.num_oov_indices +
                               (0 if mask_token is None else 1))
        else:
            table_name = self._name + "_index_table"
            self._table = lookup_ops.MutableHashTable(
                key_dtype=self._key_dtype,
                value_dtype=self._value_dtype,
                default_value=oov_value,
                name=table_name)
            self._table_handler = table_utils.TableHandler(
                table=self._table,
                oov_tokens=oov_indices,
                use_v1_apis=self._use_v1_apis())
            if vocabulary is not None:
                self.set_vocabulary(vocabulary)

        if self.output_mode == TFIDF:
            # The TF-IDF weight may have a (None,) shape: a 1D variable of
            # arbitrary length that accepts any 1D value. In that case its
            # initializer has to ignore the (unknown) shape it is given,
            # hence the constant `[0]` lambda.
            if self.pad_to_max_tokens and max_tokens is not None:
                initializer = tf.compat.v1.zeros_initializer
            else:
                initializer = lambda shape, dtype: [0]

            # Created here rather than in build() because it does not depend
            # on the input shape.
            if self.pad_to_max_tokens:
                idf_shape = (max_tokens, )
            else:
                idf_shape = (None, )
            self.tf_idf_weights = self._add_state_variable(
                name="idf",
                shape=tf.TensorShape(idf_shape),
                dtype=K.floatx(),
                initializer=initializer)

        tracked_table = self._add_trackable(self._table, trainable=False)
        # Workaround for summary() on this layer. The table is not mutable
        # during training, so the effective number of parameters (and the
        # weight shape) is 0; the attribute is added so that the parameter
        # counting code in the Model object doesn't throw an attribute error.
        tracked_table.shape = tf.TensorShape((0, ))

示例#17

0

显示文件

def get_table(dtype=tf.string, oov_tokens=None):
    """Returns a `TableHandler` wrapping a new `MutableHashTable`.

    The table is named "index_table", maps `dtype` keys to int64 values and
    uses -7 as its default value.

    Args:
        dtype: Key dtype of the table.
        oov_tokens: Passed through to `table_utils.TableHandler`.
    """
    return table_utils.TableHandler(
        lookup_ops.MutableHashTable(
            key_dtype=dtype,
            value_dtype=tf.int64,
            default_value=-7,
            name="index_table"),
        oov_tokens)