示例#1
0
 def __init__(self, telemetry_descriptors: Optional[List[Text]],
              logical_format: Text, physical_format: Text,
              dist_update_prob: float):
     """Creates the Beam counters and distributions this object reports to.

     Args:
       telemetry_descriptors: Descriptors used to build the metric namespace.
         When None, `_UNKNOWN_TELEMETRY_DESCRIPTORS` is used instead.
       logical_format: Forwarded to `_GetMetricNamer` to build metric names.
       physical_format: Forwarded to `_GetMetricNamer` to build metric names.
       dist_update_prob: Stored unchanged on `self._dist_update_prob`.
         NOTE(review): presumably the probability with which the
         distributions get updated -- confirm against the code reading it.
     """
     descriptors = (
         _UNKNOWN_TELEMETRY_DESCRIPTORS
         if telemetry_descriptors is None else telemetry_descriptors)
     namespace = telemetry_util.MakeTfxNamespace(
         descriptors + _IO_TELEMETRY_DESCRIPTOR)
     name_of = _GetMetricNamer(logical_format, physical_format)

     def _counter(base_name):
         # Counter registered under the shared namespace.
         return beam.metrics.Metrics.counter(namespace, name_of(base_name))

     def _distribution(base_name):
         # Distribution registered under the shared namespace.
         return beam.metrics.Metrics.distribution(namespace,
                                                  name_of(base_name))

     self._num_rows = _counter("num_rows")
     self._byte_size_dist = _distribution("record_batch_byte_size")
     self._num_columns_dist = _distribution("num_columns")
     self._num_feature_values_dist = _distribution("num_feature_values")
     # One metric per member of `_ValueType`, keyed by that member.
     self._num_feature_values_dist_by_type = {
         value_type: _distribution(
             "num_feature_values[{}]".format(value_type.name))
         for value_type in _ValueType
     }
     self._num_cells_by_type = {
         value_type: _counter("num_cells[{}]".format(value_type.name))
         for value_type in _ValueType
     }
     self._dist_update_prob = dist_update_prob
示例#2
0
    def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType):
      """Creates the Beam metrics used to report inference telemetry.

      The metric namespace is assembled from the inference descriptor, the
      operation type of `inference_spec_type`, and a proximity descriptor
      chosen by `_using_in_process_inference`.

      Args:
        inference_spec_type: Spec used to pick the operation type and the
          proximity descriptor of the namespace.
      """
      operation_type = _get_operation_type(inference_spec_type)
      if _using_in_process_inference(inference_spec_type):
        proximity_descriptor = _METRICS_DESCRIPTOR_IN_PROCESS
      else:
        proximity_descriptor = _METRICS_DESCRIPTOR_CLOUD_AI_PREDICTION
      metrics_namespace = util.MakeTfxNamespace(
          [_METRICS_DESCRIPTOR_INFERENCE, operation_type, proximity_descriptor])
      metrics = beam.metrics.Metrics

      # Metrics
      self._inference_counter = metrics.counter(
          metrics_namespace, 'num_inferences')
      self._num_instances = metrics.counter(metrics_namespace, 'num_instances')
      self._inference_request_batch_size = metrics.distribution(
          metrics_namespace, 'inference_request_batch_size')
      self._inference_request_batch_byte_size = metrics.distribution(
          metrics_namespace, 'inference_request_batch_byte_size')
      # Batch inference latency in microseconds.
      self._inference_batch_latency_micro_secs = metrics.distribution(
          metrics_namespace, 'inference_batch_latency_micro_secs')
      self._model_byte_size = metrics.distribution(
          metrics_namespace, 'model_byte_size')
      # Model load latency in milliseconds.
      self._load_model_latency_milli_secs = metrics.distribution(
          metrics_namespace, 'load_model_latency_milli_secs')

      # Metrics cache: both slots start out empty (None).
      self.load_model_latency_milli_secs_cache = None
      self.model_byte_size_cache = None
示例#3
0
 def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType):
     """Computes and stores the metrics namespace for `inference_spec_type`.

     Args:
       inference_spec_type: Spec used to pick the operation type and the
         proximity descriptor that make up the namespace.
     """
     operation_type = _get_operation_type(inference_spec_type)
     if _using_in_process_inference(inference_spec_type):
         proximity_descriptor = _METRICS_DESCRIPTOR_IN_PROCESS
     else:
         proximity_descriptor = _METRICS_DESCRIPTOR_CLOUD_AI_PREDICTION
     namespace_descriptors = [
         _METRICS_DESCRIPTOR_INFERENCE, operation_type, proximity_descriptor
     ]
     self._metrics_namespace = util.MakeTfxNamespace(namespace_descriptors)
示例#4
0
 def __init__(self, telemetry_descriptors: Optional[List[Text]],
              logical_format: Text, physical_format: Text):
     """Creates the raw-record counter and byte-size distribution.

     Args:
       telemetry_descriptors: Descriptors used to build the metric namespace.
         When None, `_UNKNOWN_TELEMETRY_DESCRIPTORS` is used instead.
       logical_format: Forwarded to `_GetMetricNamer` to build metric names.
       physical_format: Forwarded to `_GetMetricNamer` to build metric names.
     """
     descriptors = (
         _UNKNOWN_TELEMETRY_DESCRIPTORS
         if telemetry_descriptors is None else telemetry_descriptors)
     namespace = telemetry_util.MakeTfxNamespace(
         descriptors + _IO_TELEMETRY_DESCRIPTOR)
     name_of = _GetMetricNamer(logical_format, physical_format)
     metrics = beam.metrics.Metrics

     # The attribute is called `_num_rows`, but the metric it reports to is
     # named "num_raw_records".
     self._num_rows = metrics.counter(namespace, name_of("num_raw_records"))
     self._byte_size_dist = metrics.distribution(
         namespace, name_of("raw_record_byte_size"))
示例#5
0
  def __init__(self, saved_decoder_path: Text,
               telemetry_descriptors: List[Text],
               shared_decode_fn_handle: Optional[shared.Shared],
               raw_record_column_name: Optional[Text],
               record_index_column_name: Optional[Text]):
    """Stores the decoder configuration and creates the load-time metric.

    Args:
      saved_decoder_path: Stored on `self._saved_decoder_path`.
      telemetry_descriptors: Descriptors used to build the namespace of the
        decoder-load-seconds distribution.
      shared_decode_fn_handle: Stored on `self._shared_decode_fn_handle`.
      raw_record_column_name: Stored on `self._raw_record_column_name`.
      record_index_column_name: Stored on `self._record_index_column_name`.
    """
    super().__init__()

    # Constructor arguments, kept verbatim.
    self._saved_decoder_path = saved_decoder_path
    self._shared_decode_fn_handle = shared_decode_fn_handle
    self._raw_record_column_name = raw_record_column_name
    self._record_index_column_name = record_index_column_name

    # Not available at construction time; initialised to None here.
    # NOTE(review): presumably filled in when the decoder is loaded -- confirm.
    self._decode_fn = None
    self._tensors_to_record_batch_converter = None
    self._decoder_load_seconds = None

    metrics_namespace = telemetry_util.MakeTfxNamespace(telemetry_descriptors)
    self._decoder_load_seconds_distribution = beam.metrics.Metrics.distribution(
        metrics_namespace, "record_to_tensor_tfxio_decoder_load_seconds")
示例#6
0
# Identifiers for the supported kinds of TF models.
TFMA_EVAL = 'tfma_eval'
TF_ESTIMATOR = 'tf_estimator'
TF_KERAS = 'tf_keras'
TF_GENERIC = 'tf_generic'
TF_LITE = 'tf_lite'
TF_JS = 'tf_js'
# All of the model type identifiers defined above.
VALID_TF_MODEL_TYPES = (
    TFMA_EVAL,
    TF_GENERIC,
    TF_ESTIMATOR,
    TF_KERAS,
    TF_LITE,
    TF_JS,
)

# This constant is only used for telemetry; it is not one of the entries of
# VALID_TF_MODEL_TYPES.
MODEL_AGNOSTIC = 'model_agnostic'

# Metrics namespace built by util.MakeTfxNamespace from the 'ModelAnalysis'
# descriptor.
# LINT.IfChange
METRICS_NAMESPACE = util.MakeTfxNamespace(['ModelAnalysis'])
# LINT.ThenChange(../../../learning/fairness/infra/plx/scripts/tfma_metrics_computed_tracker_macros.sql)

# Keys for Extracts dictionary (keys starting with _ will not be materialized).

# Input key. Could be a serialized tf.train.Example, a CSV row, JSON data, etc.,
# depending on what the EvalInputReceiver was configured to accept as input.
INPUT_KEY = 'input'

# This holds an Arrow RecordBatch representing a batch of examples.
ARROW_RECORD_BATCH_KEY = 'arrow_record_batch'

# Name of the column, within an Arrow RecordBatch, that contains the raw input
# (could be a serialized tf.train.Example, a CSV row, JSON data, etc.).
ARROW_INPUT_COLUMN = '__raw_record__'
示例#7
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Constants used in TensorFlow Data Validation."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tfx_bsl.telemetry import util

# Name of the default slice containing all examples.
DEFAULT_SLICE_KEY = 'All Examples'

# Namespace for all TFDV metrics, built by util.MakeTfxNamespace from the
# 'DataValidation' descriptor.
METRICS_NAMESPACE = util.MakeTfxNamespace(['DataValidation'])

# Default input batch size.
# This needs to be large enough to allow for efficient TF invocations during
# batch flushing, but shouldn't be too large, as it also acts as a cap on the
# maximum memory usage of the computation.
DEFAULT_DESIRED_INPUT_BATCH_SIZE = 1000

# Placeholder for non-UTF-8 sequences in top-k results.
NON_UTF8_PLACEHOLDER = '__BYTES_VALUE__'
# Placeholder for large sequences in top-k results.
LARGE_BYTES_PLACEHOLDER = '__LARGE_BYTES__'
from six import with_metaclass
import tensorflow as tf
from tfx import types
from tfx.components.example_gen import utils
from tfx.components.util import examples_utils
from tfx.dsl.components.base import base_executor
from tfx.proto import example_gen_pb2
from tfx.types import artifact_utils
from tfx.utils import proto_utils
from tfx_bsl.telemetry import util

# Default file name prefix for the TFRecord output files.
DEFAULT_FILE_NAME = 'data_tfrecord'

# Metrics namespace for ExampleGen, built by util.MakeTfxNamespace from the
# 'ExampleGen' descriptor.
_METRICS_NAMESPACE = util.MakeTfxNamespace(['ExampleGen'])


def _GeneratePartitionKey(record: Union[tf.train.Example,
                                        tf.train.SequenceExample, bytes],
                          split_config: example_gen_pb2.SplitConfig) -> bytes:
    """Generates key for partition."""

    if not split_config.HasField('partition_feature_name'):
        if isinstance(record, bytes):
            return record
        return record.SerializeToString(deterministic=True)

    if isinstance(record, tf.train.Example):
        features = record.features.feature  # pytype: disable=attribute-error
    elif isinstance(record, tf.train.SequenceExample):
示例#9
0
import apache_beam as beam
from apache_beam.typehints import Union
from six import binary_type
from six import integer_types
from six import string_types
from tensorflow_transform import nodes
from tfx_bsl.telemetry import util
# TODO(https://issues.apache.org/jira/browse/SPARK-22674): Switch to
# `collections.namedtuple` or `typing.NamedTuple` once the Spark issue is
# resolved.
from tfx_bsl.types import tfx_namedtuple

# Type-hint aliases composed with `Union` (imported from
# apache_beam.typehints) and the `six` type collections.
# NUMERIC_TYPE: a float or any of six's integer types.
NUMERIC_TYPE = Union[float, Union[integer_types]]
# PRIMITIVE_TYPE: a NUMERIC_TYPE, any of six's string types, or six's
# binary type.
PRIMITIVE_TYPE = Union[NUMERIC_TYPE, Union[string_types], binary_type]

# Metrics namespace built by util.MakeTfxNamespace from the 'Transform'
# descriptor.
METRICS_NAMESPACE = util.MakeTfxNamespace(['Transform'])


# Depending on the environment (e.g. TF 1.x vs 2.x), we may want to
# register different implementations of beam nodes for the TFT beam nodes. These
# tags are used to identify the implementation to use under the current
# environment.
class EnvironmentTags(enum.Enum):
    """Tags identifying which environment a beam node implementation is for."""
    TF_COMPAT_V1 = 'tf_compat_v1'
    TF_V2_ONLY = 'tf_v2_only'


# String values of every EnvironmentTags member, in definition order.
_ALLOWED_PTRANSFORM_TAGS = [tag.value for tag in EnvironmentTags]


def get_unique_temp_path(base_temp_dir):
示例#10
0
 def testMakeTfxNamespace(self):
     """Checks MakeTfxNamespace for an empty list and a two-item tuple."""
     # (expected namespace, descriptors), checked in this order.
     expectations = (
         ("tfx", []),
         ("tfx.some.component", ("some", "component")),
     )
     for expected_namespace, descriptors in expectations:
         self.assertEqual(expected_namespace,
                          util.MakeTfxNamespace(descriptors))