Example #1
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets

# pylint: disable=g-bad-import-order
from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
gen_wordpiece_tokenizer = load_library.load_op_library(
    resource_loader.get_path_to_datafile('_wordpiece_tokenizer.so'))

_tf_text_wordpiece_tokenizer_op_create_counter = monitoring.Counter(
    '/nlx/api/python/wordpiece_tokenizer_create_counter',
    'Counter for number of WordpieceTokenizers created in Python.')


class WordpieceTokenizer(TokenizerWithOffsets):
    """Tokenizes a tensor of UTF-8 string tokens into subword pieces."""
    def __init__(self,
                 vocab_lookup_table,
                 suffix_indicator='##',
                 max_bytes_per_word=100,
                 max_chars_per_token=None,
                 token_out_type=dtypes.int64,
                 unknown_token='[UNK]',
                 split_unknown_characters=False):
        """Initializes the WordpieceTokenizer.
Example #2
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Python TFLite metrics helper."""
from typing import Optional, Text
import uuid

from tensorflow.lite.python import metrics_interface
from tensorflow.lite.python.metrics_wrapper import _pywrap_tensorflow_lite_metrics_wrapper as _metrics_wrapper
from tensorflow.lite.python.metrics_wrapper import converter_error_data_pb2
from tensorflow.python.eager import monitoring

_counter_debugger_creation = monitoring.Counter(
    '/tensorflow/lite/quantization_debugger/created',
    'Counter for the number of debugger created.')

_counter_interpreter_creation = monitoring.Counter(
    '/tensorflow/lite/interpreter/created',
    'Counter for number of interpreter created in Python.', 'language')

# The following are conversion metrics. Attempt and success are kept separated
# instead of using a single metric with a label because the converter may
# raise exceptions if conversion failed. That may lead to cases when we are
# unable to capture the conversion attempt. Increasing attempt count at the
# beginning of conversion process and the success count at the end is more
# suitable in these cases.
_counter_conversion_attempt = monitoring.Counter(
    '/tensorflow/lite/convert/attempt',
    'Counter for number of conversion attempts.')
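
The comment above explains why attempt and success are tracked as two separate counters rather than one labeled metric: a failed conversion can raise before any success/failure label could be recorded. Below is a minimal sketch of that pattern; the success-counter name and the convert_with_metrics wrapper are assumptions for illustration and do not appear in this excerpt.

_counter_conversion_success = monitoring.Counter(
    '/tensorflow/lite/convert/success',  # assumed name, for illustration only
    'Counter for number of successful conversions.')


def convert_with_metrics(convert_fn, *args, **kwargs):
    # Record the attempt first, so it is counted even if conversion raises.
    _counter_conversion_attempt.get_cell().increase_by(1)
    result = convert_fn(*args, **kwargs)
    # Only reached when conversion did not raise, i.e. it succeeded.
    _counter_conversion_success.get_cell().increase_by(1)
    return result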
Example #3
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Splitter that uses a Hub module."""

import tensorflow_hub as hub
from tensorflow.python.eager import monitoring
from tensorflow.python.ops import array_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow_text.python.ops.splitter import SplitterWithOffsets

_tf_text_hub_module_splitter_create_counter = monitoring.Counter(
    '/nlx/api/python/hub_module_splitter_create_counter',
    'Counter for number of HubModuleSplitters created in Python.')


class HubModuleSplitter(SplitterWithOffsets):
    """Splitter that uses a Hub module.

  The TensorFlow graph from the module performs the real work.  The Python code
  from this class handles the details of interfacing with that module, as well
  as the support for ragged tensors and high-rank tensors.

  The Hub module should be supported by `hub.load()
  <https://www.tensorflow.org/hub/api_docs/python/hub/load>`_. If a v1 module, it
  should have a graph variant with an empty set of tags; we consider that graph
  variant to be the module and ignore everything else.  The module should have a
  signature named `default` that takes a `text` input (a rank-1 tensor of
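
(The docstring excerpt is cut off above.) A hypothetical usage sketch based on that description, assuming a rank-1 string input; the Hub module handle below is illustrative, not a real model, and the constructor argument name is an assumption:

import tensorflow as tf

# Hypothetical handle: any Hub module exposing a 'default' signature that takes
# a rank-1 `text` tensor, as the docstring above requires.
splitter = HubModuleSplitter(hub_module_handle='https://tfhub.dev/some/segmenter/1')
pieces = splitter.split(tf.constant(['Hello, world.']))  # RaggedTensor of pieces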
Example #4
    b"Stack",
    b"StridedSlice",
    b"StridedSliceGrad",
    b"TensorListConcatV2",
    b"TensorListGather",
    b"TensorListGetItem",
    b"TensorListPopBack",
    b"TensorListStack",
    b"Transpose",
    b"Unpack",
)

_state = threading.local()

_check_numerics_callback_create_counter = monitoring.Counter(
    "/tensorflow/api/python/debugging/check_numerics_callback_create_counter",
    "Counter for number of times the check_numerics op callback is created.")


def limit_string_length(string, max_len=50):
    """Limit the length of input string.

  Args:
    string: Input string.
    max_len: (int or None) If int, the length limit. If None, no limit.

  Returns:
    Possibly length-limited string.
  """
    if max_len is None or len(string) <= max_len:
        return string
Example #5
import copy

from tensorflow.python.eager import monitoring
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import string_ops
from tensorflow_text.python.ops import regex_split_ops
from tensorflow_text.python.ops.normalize_ops import case_fold_utf8
from tensorflow_text.python.ops.normalize_ops import normalize_utf8
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets
from tensorflow_text.python.ops.wordpiece_tokenizer import WordpieceTokenizer

_tf_text_bert_tokenizer_op_create_counter = monitoring.Counter(
    "/nlx/api/python/bert_tokenizer_create_counter",
    "Counter for number of BertTokenizers created in Python.")

_DELIM_REGEX = [
    r"\s+",
    r"|".join([
        r"[!-/]",
        r"[:-@]",
        r"[\[-`]",
        r"[{-~]",
        r"[\p{P}]",
    ]),
    r"|".join([
        r"[\x{4E00}-\x{9FFF}]",
        r"[\x{3400}-\x{4DBF}]",
        r"[\x{20000}-\x{2A6DF}]",
Example #6
from __future__ import print_function

from tensorflow.python.eager import monitoring
from tensorflow.python.framework import ops
from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets

# pylint: disable=g-bad-import-order
from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
gen_split_merge_from_logits_tokenizer = load_library.load_op_library(
    resource_loader.get_path_to_datafile(
        '_split_merge_from_logits_tokenizer.so'))

_tf_text_split_merge_from_logits_tokenizer_op_create_counter = monitoring.Counter(
    '/nlx/api/python/split_merge_from_logits_tokenizer_create_counter',
    'Counter for number of SplitMergeFromLogitsTokenizer instances '
    'created in Python.')


class SplitMergeFromLogitsTokenizer(TokenizerWithOffsets):
    """Tokenizes a tensor of UTF-8 string into words according to logits."""
    def __init__(self, force_split_at_break_character=True):
        """Initializes a new instance.

    Args:
      force_split_at_break_character: a bool that indicates whether to force
        start a new word after an ICU-defined whitespace character.  Regardless
        of this parameter, we never include a whitespace into a token, and we
        always ignore the split/merge action for the whitespace character
        itself.  This parameter indicates what happens after a whitespace.
         -if force_split_at_break_character is true, create a new word starting
Example #7
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets

# pylint: disable=g-bad-import-order
from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
gen_split_merge_tokenizer = load_library.load_op_library(
    resource_loader.get_path_to_datafile('_split_merge_tokenizer.so'))

_tf_text_split_merge_tokenizer_op_create_counter = monitoring.Counter(
    '/nlx/api/python/split_merge_tokenizer_create_counter',
    'Counter for number of SplitMergeTokenizers created in Python.')


class SplitMergeTokenizer(TokenizerWithOffsets):
    """Tokenizes a tensor of UTF-8 string into words according to labels."""
    def __init__(self):
        """Initializes a new instance.
    """
        super(SplitMergeTokenizer, self).__init__()
        _tf_text_split_merge_tokenizer_op_create_counter.get_cell(
        ).increase_by(1)

    def tokenize(
            self,
            input,  # pylint: disable=redefined-builtin
Example #8
from tensorflow.python.eager import monitoring
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops.ragged import ragged_conversion_ops
from tensorflow.python.ops.ragged import ragged_string_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets

# pylint: disable=g-bad-import-order
from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
gen_unicode_script_tokenizer = load_library.load_op_library(
    resource_loader.get_path_to_datafile('_unicode_script_tokenizer.so'))

_tf_text_unicode_script_tokenizer_create_counter = monitoring.Counter(
    "/nlx/api/python/unicode_script_tokenizer_create_counter",
    "Counter for number of UnicodeScriptTokenizers created in Python.")


class UnicodeScriptTokenizer(TokenizerWithOffsets):
    """Tokenizes a tensor of UTF-8 strings on Unicode script boundaries."""
    def __init__(self, keep_whitespace=False):
        """Initializes a new instance.

    Args:
      keep_whitespace: A boolean that specifies whether to emit whitespace
          tokens (default `False`).
    """
        super(UnicodeScriptTokenizer, self).__init__()
        _tf_text_unicode_script_tokenizer_create_counter.get_cell(
        ).increase_by(1)
Example #9
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Global streamz counters."""

from tensorflow.python.eager import monitoring


progressive_policy_creation_counter = monitoring.Counter(
    "/tensorflow/training/fast_training/progressive_policy_creation",
    "Counter for the number of ProgressivePolicy creations.")


stack_vars_to_vars_call_counter = monitoring.Counter(
    "/tensorflow/training/fast_training/tf_vars_to_vars",
    "Counter for the number of low-level stacking API calls.")
Example #10
 def test_same_counter(self):
     counter1 = monitoring.Counter('test/same_counter', 'test counter')  # pylint: disable=unused-variable
     with self.assertRaises(errors.AlreadyExistsError):
         counter2 = monitoring.Counter('test/same_counter', 'test counter')  # pylint: disable=unused-variable
Example #11
 def test_counter(self):
     counter = monitoring.Counter('test/counter', 'test counter')
     counter.get_cell().increase_by(1)
     self.assertEqual(counter.get_cell().value(), 1)
     counter.get_cell().increase_by(5)
     self.assertEqual(counter.get_cell().value(), 6)
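
Counters can also carry label fields, as with the 'language' label on the interpreter counter in Example #2. A short sketch, using made-up metric and label names, of writing and reading per-label cells:

# Third positional argument declares a label; get_cell() then takes a label value.
requests_counter = monitoring.Counter(
    'test/requests_by_method', 'Requests served, by method.', 'method')
requests_counter.get_cell('GET').increase_by(1)
requests_counter.get_cell('POST').increase_by(2)
assert requests_counter.get_cell('GET').value() == 1
assert requests_counter.get_cell('POST').value() == 2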
Example #12
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops.ragged import ragged_conversion_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
from tensorflow.python.training.tracking import tracking
from tensorflow_text.python.ops.tokenization import Detokenizer
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets

from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
gen_sentencepiece_tokenizer = load_library.load_op_library(
    resource_loader.get_path_to_datafile('_sentencepiece_tokenizer.so'))  # pylint: disable=g-bad-import-order

_tf_text_sentencepiece_tokenizer_op_create_counter = monitoring.Counter(
    "/nlx/api/python/sentencepiece_tokenizer_create_counter",
    "Counter for number of SentencepieceTokenizers created in Python.")


class _SentencepieceModelResource(tracking.TrackableResource):
    """Utility to track the model resource tensor (for SavedModel support)."""
    def __init__(self, model, name):
        super(_SentencepieceModelResource, self).__init__()
        self._model = model
        self._name = name
        _ = self.resource_handle  # Accessing this property creates the resource.

    def _create_resource(self):
        model, name = self._model, self._name
        with ops.name_scope(name, "SentenceTokenizerInitializer", [model]):
            return gen_sentencepiece_tokenizer.sentencepiece_op(model=model)