def testKeepalive(self):
  """Checks that a released shared object survives until the keep-alive moves.

  The object is acquired once, released and garbage-collected, and must still
  be counted as active. Re-acquiring must not construct a new object. Only
  after a different handle performs an acquire, and the last reference is
  dropped, is the object expected to be collected.
  """
  count = Count()
  shared_handle = shared.Shared()
  other_shared_handle = shared.Shared()

  def dummy_acquire_fn():
    return None

  def acquire_fn():
    return Marker(count)

  p1 = shared_handle.acquire(acquire_fn)
  self.assertEqual(1, count.get_total())
  self.assertEqual(1, count.get_active())
  del p1
  gc.collect()
  # Won't be garbage collected, because of the keep-alive.
  self.assertEqual(1, count.get_active())

  # Reacquire.
  p2 = shared_handle.acquire(acquire_fn)
  self.assertEqual(1, count.get_total())  # No reinitialisation.
  self.assertEqual(1, count.get_active())

  # Get rid of the keepalive by acquiring through a different handle.
  other_shared_handle.acquire(dummy_acquire_fn)
  del p2
  gc.collect()
  self.assertEqual(0, count.get_active())
def BuildDiagnosticTable(  # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    extractors=None,
    desired_batch_size=None):
  """Build diagnostics for the specified EvalSavedModel and example collection.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should
      contain the saved_model.pb file.
    extractors: Optional list of Extractors to execute prior to slicing and
      aggregating the metrics. If not provided (or empty), a default pair of
      extractors (prediction followed by feature extraction) is used.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  Returns:
    PCollection of ExampleAndExtracts
  """
  if not extractors:
    # Default pipeline: run the model first, then pull out the features.
    predict_stage = PredictExtractor(
        eval_saved_model_path, None, shared.Shared(), desired_batch_size)
    feature_stage = types.Extractor(
        stage_name='ExtractFeatures',
        ptransform=feature_extractor.ExtractFeatures())
    extractors = [predict_stage, feature_stage]

  # Wrap every raw example so that later stages can attach their extracts.
  wrapped_examples = (
      examples
      | 'ToExampleAndExtracts' >> beam.Map(
          lambda x: types.ExampleAndExtracts(example=x, extracts={})))
  return wrapped_examples | Extract(extractors=extractors)
def __init__(
    self,
    model_agnostic_config: agnostic_predict.ModelAgnosticConfig,
) -> None:
  """Stores the config and creates the per-instance helpers.

  Args:
    model_agnostic_config: Configuration object, kept on the instance as-is.
  """
  handle = shared.Shared()
  load_seconds = beam.metrics.Metrics.distribution(
      _METRICS_NAMESPACE, 'model_load_seconds')

  self._model_agnostic_config = model_agnostic_config
  self._shared_handle = handle
  # Beam distribution metric registered under 'model_load_seconds'.
  self._model_load_seconds = load_seconds
def BuildDiagnosticTable(  # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    desired_batch_size=None):
  """Build diagnostics for the specified EvalSavedModel and example collection.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should
      contain the saved_model.pb file.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  Returns:
    PCollection of ExampleAndExtracts
  """
  # Stage 1: wrap each raw example together with an empty extracts dict.
  wrapped_examples = (
      examples
      | 'ToExampleAndExtracts' >> beam.Map(
          lambda x: types.ExampleAndExtracts(example=x, extracts={})))

  # Stage 2: run the model over the wrapped examples.
  predicted = (
      wrapped_examples
      | 'Predict' >> predict_extractor.TFMAPredict(
          eval_saved_model_path,
          add_metrics_callbacks=None,
          shared_handle=shared.Shared(),
          desired_batch_size=desired_batch_size))

  # Stage 3: extract the features.
  return predicted | 'ExtractFeatures' >> feature_extractor.ExtractFeatures()
def _ExtractOutput(  # pylint: disable=invalid-name
    aggregate_result, eval_saved_model_path, add_metrics_callbacks):
  """Applies `_ExtractOutputDoFn` to the aggregated results.

  Args:
    aggregate_result: Input PCollection the DoFn is applied to.
    eval_saved_model_path: Forwarded to `_ExtractOutputDoFn`.
    add_metrics_callbacks: Forwarded to `_ExtractOutputDoFn`.

  Returns:
    The multi-output result of the ParDo: the main output is tagged
    `OUTPUT_TAG_METRICS` and the additional output `OUTPUT_TAG_PLOTS`.
  """
  extract_fn = _ExtractOutputDoFn(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks,
      shared_handle=shared.Shared())
  tagged_pardo = beam.ParDo(extract_fn).with_outputs(
      _ExtractOutputDoFn.OUTPUT_TAG_PLOTS,
      main=_ExtractOutputDoFn.OUTPUT_TAG_METRICS)
  return aggregate_result | tagged_pardo
def testConcurrentCallsDeduped(self):
  """Checks that concurrent acquires run the initialisation only once.

  One hundred threads call `acquire` on the same handle with a slow
  construction function; exactly one object must be constructed. Once every
  reference is dropped and the keep-alive has been moved to another handle,
  the object must be collected.
  """
  count = Count()
  shared_handle = shared.Shared()
  other_shared_handle = shared.Shared()

  refs = []
  ref_lock = threading.Lock()

  def dummy_acquire_fn():
    return None

  def acquire_fn():
    # Slow construction so that the other threads pile up on the acquire.
    time.sleep(1)
    return Marker(count)

  def thread_fn():
    p = shared_handle.acquire(acquire_fn)
    with ref_lock:
      refs.append(p)

  threads = []
  # `range` works on both Python 2 and 3 (the original used `xrange`).
  for _ in range(100):
    t = threading.Thread(target=thread_fn)
    threads.append(t)
    t.start()

  for t in threads:
    t.join()

  self.assertEqual(1, count.get_total())
  self.assertEqual(1, count.get_active())

  other_shared_handle.acquire(dummy_acquire_fn)  # Get rid of the keepalive

  with ref_lock:
    del refs[:]
  gc.collect()

  self.assertEqual(0, count.get_active())
def expand(self, inputs):
  """Computes the analyzer outputs for every analyzer in `_analyzer_infos`.

  Args:
    inputs: Passed to `self._maybe_deep_copy_pcollection_inputs`, which yields
      the input values PCollection and the tensor-name -> PCollection mapping.

  Returns:
    A dict mapping each analyzer output name to a PCollection wrapped with
    `_TensorValue`.

  Raises:
    ValueError: If the ptransform registered for an analyzer produces a number
      of PCollections different from the analyzer's number of outputs.
  """
  input_values, tensor_pcoll_mapping = (
      self._maybe_deep_copy_pcollection_inputs(inputs))

  saved_model_dir = (
      tensor_pcoll_mapping
      | 'CreateSavedModelForAnalyzerInputs' >> _ReplaceTensorsWithConstants(
          self._unbound_saved_model_dir, self._base_temp_dir,
          input_values.pipeline))

  # Run this saved model on the input dataset to obtain the inputs to the
  # analyzers.
  analyzer_input_values = (
      input_values
      | 'BatchAnalyzerInputs' >> _BatchElements()
      | 'ComputeAnalyzerInputs' >> beam.ParDo(
          _RunMetaGraphDoFn(
              self._input_schema,
              self._serialized_tf_config,
              shared_graph_state_handle=shared.Shared(),
              passthrough_keys=Context.get_passthrough_keys()),
          saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

  # For each analyzer output, look up its input values (by tensor name)
  # and run the analyzer on these values.
  result = {}
  for analyzer_info in self._analyzer_infos:
    analyzer_name = analyzer_info.attributes.name
    num_outputs = len(analyzer_info.output_infos)
    # Named `analyzer_inputs` so the `inputs` parameter is not rebound.
    analyzer_inputs = (
        analyzer_input_values
        | 'ExtractInputs[%s]' % analyzer_name >> beam.Map(
            lambda batch, keys: tuple(batch[key] for key in keys),
            keys=analyzer_info.input_tensor_names))
    ptransform = common.lookup_registered_ptransform(analyzer_info.attributes)
    output_pcolls = (
        (analyzer_inputs,)
        | ptransform(
            num_outputs,
            analyzer_info.attributes,
            serialized_tf_config=self._serialized_tf_config,
            base_temp_dir=self._base_temp_dir))

    if len(output_pcolls) != num_outputs:
      # Use the same `attributes.name` as the stage labels; the original
      # formatted `analyzer_info.name`, which no other line of this method
      # reads.
      raise ValueError(
          'Analyzer {} has {} outputs but its implementation produced {} '
          'pcollections'.format(
              analyzer_name, num_outputs, len(output_pcolls)))

    for index, (output_pcoll, (name, is_asset)) in enumerate(
        zip(output_pcolls, analyzer_info.output_infos)):
      result[name] = (
          output_pcoll
          | 'WrapAsTensorValue[%s][%d]' % (analyzer_name, index)
          >> beam.Map(_TensorValue, is_asset))
  return result
def testPredict(self):
  """Runs TFMAPredict over four examples and checks the 'fpl' extracts."""
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_extracts = (
        pipeline
        | beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        # Our diagnostic outputs, pass types.ExampleAndExtracts throughout,
        # however our aggregating functions do not use this interface.
        | beam.Map(
            lambda x: types.ExampleAndExtracts(example=x, extracts={}))
        | 'Predict' >> predict_extractor.TFMAPredict(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=None,
            shared_handle=shared.Shared(),
            desired_batch_size=3))

    def check_result(got):
      try:
        self.assertEqual(4, len(got), 'got: %s' % got)
        for item in got:
          extracts_dict = item.extracts
          # `dict.has_key` no longer exists on Python 3; `assertIn` performs
          # the same membership check and reports a clearer failure.
          self.assertIn('fpl', extracts_dict)
          fpl = extracts_dict['fpl']
          # Verify fpl contains features, probabilities, and correct labels.
          self.assertIn('language', fpl.features)
          self.assertIn('age', fpl.features)
          self.assertIn('label', fpl.features)
          self.assertIn('probabilities', fpl.predictions)
          self.assertAlmostEqual(fpl.features['label'],
                                 fpl.labels['__labels'])
      except AssertionError as err:
        # Re-raise as the Beam assertion type so the pipeline reports it.
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
def testMultiple(self):
  """Checks release, re-initialisation and multiple references to one object."""
  count = Count()
  shared_handle = shared.Shared()
  other_shared_handle = shared.Shared()

  def dummy_acquire_fn():
    return None

  def acquire_fn():
    return Marker(count)

  p = shared_handle.acquire(acquire_fn)
  other_shared_handle.acquire(dummy_acquire_fn)  # Get rid of the keepalive
  self.assertEqual(1, count.get_total())
  self.assertEqual(1, count.get_active())
  del p
  gc.collect()
  # Shared value should be garbage collected.
  self.assertEqual(0, count.get_active())

  # Acquiring multiple times only results in one initialisation.
  p1 = shared_handle.acquire(acquire_fn)
  # Since shared value was released, expect a reinitialisation.
  self.assertEqual(2, count.get_total())
  self.assertEqual(1, count.get_active())
  p2 = shared_handle.acquire(acquire_fn)
  self.assertEqual(2, count.get_total())
  self.assertEqual(1, count.get_active())

  other_shared_handle.acquire(dummy_acquire_fn)  # Get rid of the keepalive

  # Check that shared object isn't destroyed if there's still a reference to
  # it.
  del p2
  gc.collect()
  self.assertEqual(1, count.get_active())

  del p1
  gc.collect()
  self.assertEqual(0, count.get_active())
def expand(self, dataset_and_transform_fn):
  """Transforms the dataset using the transform_fn.

  Args:
    dataset_and_transform_fn: A tuple of dataset and preprocessing function,
      i.e. `((input_values, input_metadata), (transform_fn,
      output_metadata))`.

  Returns:
    A `(output_instances, output_metadata)` tuple: the dataset transformed
    according to the transform_fn, and its metadata with any excluded outputs
    removed.
  """
  dataset, transform_fn_and_metadata = dataset_and_transform_fn
  input_values, input_metadata = dataset
  transform_fn, output_metadata = transform_fn_and_metadata

  excluded = self._exclude_outputs
  # If exclude_outputs is set, update the output metadata.
  if excluded is not None:
    if not isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
      output_metadata = _remove_columns_from_metadata(
          output_metadata, excluded)
    else:
      # Both the immediate and the deferred metadata drop the columns.
      trimmed_metadata = _remove_columns_from_metadata(
          output_metadata.dataset_metadata, excluded)
      trimmed_deferred_metadata = (
          output_metadata.deferred_metadata
          | 'RemoveColumms' >> beam.Map(
              _remove_columns_from_metadata, excluded))
      output_metadata = beam_metadata_io.BeamDatasetMetadata(
          trimmed_metadata, trimmed_deferred_metadata)

  serialized_tf_config = (
      common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
          self.pipeline.runner))

  batched_inputs = input_values | 'Batch' >> _BatchElements()
  transformed_batches = (
      batched_inputs
      | 'Transform' >> beam.ParDo(
          _RunMetaGraphDoFn(
              input_metadata.schema,
              serialized_tf_config,
              shared_graph_state_handle=shared.Shared(),
              passthrough_keys=Context.get_passthrough_keys(),
              exclude_outputs=excluded),
          saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))
  output_instances = (
      transformed_batches
      | 'ConvertAndUnbatch' >> beam.FlatMap(
          _convert_and_unbatch_to_instance_dicts,
          schema=output_metadata.schema,
          passthrough_keys=Context.get_passthrough_keys()))

  _clear_shared_state_after_barrier(self.pipeline, output_instances)

  return (output_instances, output_metadata)
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  The clearing step takes a side input derived from `input_barrier`, so it
  will only run once `input_barrier` becomes available.

  Args:
    pipeline: Pipeline to which the clearing steps are added.
    input_barrier: PCollection that must be available before clearing.

  Returns:
    The PCollection produced by the clearing step.
  """
  # Emits nothing for any element: an empty PCollection used purely as a
  # dependency on `input_barrier`.
  empty_pcoll = (
      input_barrier
      | 'MakeCheapBarrier' >> beam.FlatMap(lambda unused_element: None))
  barrier_side_input = beam.pvalue.AsIter(empty_pcoll)

  single_element = (
      pipeline | 'PrepareToClearSharedKeepAlives' >> beam.Create([None]))
  # Acquire through a fresh handle with a constructor that returns None.
  return (
      single_element
      | 'WaitAndClearSharedKeepAlives' >> beam.Map(
          lambda unused_element, empty_side_input: shared.Shared().acquire(
              lambda: None),
          barrier_side_input))
def __new__(cls,
            model_path,
            add_metrics_callbacks=None,
            example_weight_key=None,
            shared_handle=None):
  """Creates the instance, substituting defaults for falsy arguments.

  Args:
    model_path: Forwarded unchanged to the parent `__new__`.
    add_metrics_callbacks: Callbacks; any falsy value is replaced by a new
      empty list.
    example_weight_key: Forwarded unchanged to the parent `__new__`.
    shared_handle: Shared handle; any falsy value is replaced by a new
      `shared.Shared()`.

  Returns:
    The instance returned by the parent class's `__new__`.
  """
  callbacks = add_metrics_callbacks or []
  handle = shared_handle or shared.Shared()
  parent = super(EvalSharedModel, cls)
  return parent.__new__(
      cls, model_path, callbacks, example_weight_key, handle)
def expand(self, inputs):
  """Computes the analyzer outputs for every analyzer in `_analyzer_infos`.

  Args:
    inputs: A pair `(input_values, tensor_pcoll_mapping)`.

  Returns:
    A dict mapping each analyzer output name to a PCollection wrapped with
    `_TensorValue`.

  Raises:
    ValueError: If an analyzer implementation produces a number of
      PCollections different from the analyzer's number of outputs.
  """
  input_values, tensor_pcoll_mapping = inputs

  replace_with_constants = _ReplaceTensorsWithConstants(
      self._unbound_saved_model_dir, self._base_temp_dir,
      input_values.pipeline)
  saved_model_dir = (
      tensor_pcoll_mapping
      | 'CreateSavedModelForAnalyzerInputs' >> replace_with_constants)

  # Run this saved model on the input dataset to obtain the inputs to the
  # analyzers.
  batched_inputs = input_values | 'BatchAnalyzerInputs' >> _BatchElements()
  analyzer_input_values = (
      batched_inputs
      | 'ComputeAnalyzerInputs' >> beam.ParDo(
          _RunMetaGraphDoFn(
              self._input_schema,
              self._serialized_tf_config,
              shared_graph_state_handle=shared.Shared(),
              passthrough_keys=Context.get_passthrough_keys()),
          saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

  # For each analyzer output, look up its input values (by tensor name)
  # and run the analyzer on these values.
  result = {}
  for analyzer_info in self._analyzer_infos:
    # Each analyzer gets its own freshly created assets directory.
    temp_assets_dir = _make_unique_temp_dir(self._base_temp_dir)
    tf.gfile.MkDir(temp_assets_dir)

    extracted_inputs = (
        analyzer_input_values
        | 'ExtractInputs[%s]' % analyzer_info.name >> beam.Map(
            lambda batch, keys: [batch[key] for key in keys],
            keys=analyzer_info.input_tensor_names))
    output_pcolls = (
        extracted_inputs
        | 'Analyze[%s]' % analyzer_info.name
        >> analyzer_impls._AnalyzerImpl(  # pylint: disable=protected-access
            analyzer_info.spec, temp_assets_dir))

    if len(output_pcolls) != len(analyzer_info.output_infos):
      raise ValueError(
          'Analyzer {} has {} outputs but its implementation produced {} '
          'pcollections'.format(
              analyzer_info.name,
              len(analyzer_info.output_infos),
              len(output_pcolls)))

    paired_outputs = zip(output_pcolls, analyzer_info.output_infos)
    for index, (output_pcoll, (name, is_asset)) in enumerate(paired_outputs):
      wrap_label = 'WrapAsTensorValue[%s][%d]' % (analyzer_info.name, index)
      result[name] = output_pcoll | wrap_label >> beam.Map(
          _TensorValue, is_asset)
  return result
def _Aggregate(  # pylint: disable=invalid-name
    slice_result,
    eval_saved_model_path,
    add_metrics_callbacks,
    desired_batch_size=None,
):
  """Combines the sliced results per key with `_AggregateCombineFn`.

  Args:
    slice_result: Keyed input PCollection.
    eval_saved_model_path: Forwarded to `_AggregateCombineFn`.
    add_metrics_callbacks: Forwarded to `_AggregateCombineFn`.
    desired_batch_size: Forwarded to `_AggregateCombineFn`.

  Returns:
    The PCollection produced by the `CombinePerKey` stage.
  """
  combine_fn = _AggregateCombineFn(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks,
      shared_handle=shared.Shared(),
      desired_batch_size=desired_batch_size)
  return slice_result | 'CombinePerKey' >> beam.CombinePerKey(combine_fn)
def _Predict(  # pylint: disable=invalid-name
    examples, eval_saved_model_path, desired_batch_size=None):
  """Batches the examples and runs `_PredictionDoFn` over the batches.

  Args:
    examples: Input PCollection.
    eval_saved_model_path: Forwarded to `_PredictionDoFn`.
    desired_batch_size: When truthy, used as both the minimum and the maximum
      batch size; otherwise `beam.BatchElements` runs with its defaults.

  Returns:
    The PCollection produced by the prediction ParDo.
  """
  # A fixed batch size is requested by pinning min and max to the same value.
  batch_args = (
      {'min_batch_size': desired_batch_size,
       'max_batch_size': desired_batch_size}
      if desired_batch_size else {})

  batched_examples = examples | 'Batch' >> beam.BatchElements(**batch_args)
  prediction_fn = _PredictionDoFn(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=None,
      shared_handle=shared.Shared())
  return batched_examples | beam.ParDo(prediction_fn)
def testDifferentObjects(self):
  """Checks that distinct handles manage distinct shared objects."""
  sequence = Sequence()

  def dummy_acquire_fn():
    return None

  first_handle = shared.Shared()
  second_handle = shared.Shared()
  dummy_handle = shared.Shared()

  f1 = first_handle.acquire(sequence.make_acquire_fn())
  s1 = second_handle.acquire(sequence.make_acquire_fn())
  self.assertEqual('sequence1', f1.get_name())
  self.assertEqual('sequence2', s1.get_name())

  f2 = first_handle.acquire(sequence.make_acquire_fn())
  s2 = second_handle.acquire(sequence.make_acquire_fn())
  # Check that the repeated acquisitions return the earlier objects
  self.assertEqual('sequence1', f2.get_name())
  self.assertEqual('sequence2', s2.get_name())

  # Release all references and force garbage-collection
  del f1
  del f2
  del s1
  del s2
  dummy_handle.acquire(dummy_acquire_fn)  # Get rid of the keepalive
  gc.collect()

  # Check that acquiring again after they're released gives new objects
  f3 = first_handle.acquire(sequence.make_acquire_fn())
  s3 = second_handle.acquire(sequence.make_acquire_fn())
  self.assertEqual('sequence3', f3.get_name())
  self.assertEqual('sequence4', s3.get_name())
def __new__(cls,
            model_path=None,
            add_metrics_callbacks=None,
            include_default_metrics=True,
            example_weight_key=None,
            shared_handle=None,
            construct_fn=None):
  """Creates the instance, substituting defaults for falsy arguments.

  Args:
    model_path: Forwarded unchanged to the parent `__new__`.
    add_metrics_callbacks: Callbacks; any falsy value is replaced by a new
      empty list.
    include_default_metrics: Forwarded unchanged to the parent `__new__`.
    example_weight_key: Forwarded unchanged to the parent `__new__`.
    shared_handle: Shared handle; any falsy value is replaced by a new
      `shared.Shared()`.
    construct_fn: Forwarded unchanged to the parent `__new__`.

  Returns:
    The instance returned by the parent class's `__new__`.
  """
  callbacks = add_metrics_callbacks or []
  handle = shared_handle or shared.Shared()
  parent = super(EvalSharedModel, cls)
  return parent.__new__(
      cls,
      model_path,
      callbacks,
      include_default_metrics,
      example_weight_key,
      handle,
      construct_fn)
def __new__(
    cls,
    model_path: Optional[Text] = None,
    add_metrics_callbacks: Optional[List[AddMetricsCallbackType]] = None,
    include_default_metrics: Optional[bool] = True,
    example_weight_key: Optional[Text] = None,
    additional_fetches: Optional[List[Text]] = None,
    shared_handle: Optional[shared.Shared] = None,
    construct_fn: Optional[Callable[..., Any]] = None):
  """Creates the instance, substituting defaults for falsy arguments.

  Args:
    model_path: Forwarded unchanged to the parent `__new__`.
    add_metrics_callbacks: Callbacks; any falsy value is replaced by a new
      empty list.
    include_default_metrics: Forwarded unchanged to the parent `__new__`.
    example_weight_key: Forwarded unchanged to the parent `__new__`.
    additional_fetches: Forwarded unchanged to the parent `__new__`.
    shared_handle: Shared handle; any falsy value is replaced by a new
      `shared.Shared()`.
    construct_fn: Forwarded unchanged to the parent `__new__`.

  Returns:
    The instance returned by the parent class's `__new__`.
  """
  callbacks = add_metrics_callbacks or []
  handle = shared_handle or shared.Shared()
  parent = super(EvalSharedModel, cls)
  return parent.__new__(
      cls,
      model_path,
      callbacks,
      include_default_metrics,
      example_weight_key,
      additional_fetches,
      handle,
      construct_fn)
def expand(self, dataset_and_transform_fn):
  """Transforms the dataset using the transform_fn.

  Args:
    dataset_and_transform_fn: A tuple of dataset and preprocessing function,
      i.e. `((input_values, input_metadata), (transform_fn,
      (output_metadata, <ignored>)))`.

  Returns:
    A `(output_instances, output_metadata)` tuple: the dataset transformed
    according to the transform_fn, and its metadata restricted to the columns
    that are not excluded.
  """
  dataset, transform_fn_and_metadata = dataset_and_transform_fn
  input_values, input_metadata = dataset
  transform_fn, (output_metadata, _) = transform_fn_and_metadata

  excluded = self._exclude_outputs
  # If exclude_outputs is set, update the output metadata, which will also
  # cause _RunMetaGraphDoFn not to create the excluded outputs.
  if excluded is not None:
    kept_columns = {}
    for key, column_schema in six.iteritems(
        output_metadata.schema.column_schemas):
      if key not in excluded:
        kept_columns[key] = column_schema
    output_metadata = dataset_metadata.DatasetMetadata(
        schema=dataset_schema.Schema(kept_columns))

  def convert_and_unbatch(batch_dict):
    """Turns one batched output dict into a list of instance dicts."""
    output_dict = impl_helper.make_output_dict(
        output_metadata.schema, batch_dict)
    return impl_helper.to_instance_dicts(output_dict)

  serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
      self.pipeline.runner)

  transformed_batches = (
      input_values
      | 'Transform' >> beam.ParDo(
          _RunMetaGraphDoFn(
              input_metadata.schema,
              output_metadata.schema,
              serialized_tf_config,
              shared_graph_state_handle=shared.Shared(),
              exclude_outputs=excluded),
          saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))
  output_instances = (
      transformed_batches
      | 'ConvertAndUnbatch' >> beam.FlatMap(convert_and_unbatch))

  _clear_shared_state_after_barrier(self.pipeline, output_instances)

  return (output_instances, output_metadata)
def expand(self, inputs):
  """Batches the input values and applies the saved model to them.

  Args:
    inputs: Sequence whose first element is the PCollection holding the saved
      model directory; it is consumed as a singleton side input.

  Returns:
    The PCollection produced by running `_RunMetaGraphDoFn` over the batches.
  """
  # We don't deep_copy pcollections used for the first phase, or when
  # the user defined `Context` disables it.
  use_deep_copy = (
      self._phase > 0 and Context.get_use_deep_copy_optimization())

  input_values = self._input_values_pcoll
  if use_deep_copy:
    # Obviates unnecessary data materialization when the input data source is
    # safe to read more than once.
    tf.logging.info('Deep copying inputs for phase: %d', self._phase)
    input_values = deep_copy.deep_copy(self._input_values_pcoll)

  batched_inputs = input_values | 'BatchInputs' >> _BatchElements()
  run_saved_model = beam.ParDo(
      _RunMetaGraphDoFn(
          self._input_schema,
          self._serialized_tf_config,
          shared_graph_state_handle=shared.Shared(),
          passthrough_keys=Context.get_passthrough_keys()),
      saved_model_dir=beam.pvalue.AsSingleton(inputs[0]))
  return batched_inputs | 'ApplySavedModel' >> run_saved_model
def __new__(cls,
            shared_handle: Optional[shared.Shared] = None,
            construct_fn: Optional[Callable[..., Any]] = None):
  """Creates the instance, defaulting the shared handle when it is falsy.

  Args:
    shared_handle: Shared handle; any falsy value is replaced by a new
      `shared.Shared()`.
    construct_fn: Forwarded unchanged to the parent `__new__`.

  Returns:
    The instance returned by the parent class's `__new__`.
  """
  handle = shared_handle or shared.Shared()
  return super(ModelLoader, cls).__new__(cls, handle, construct_fn)
def Evaluate(  # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    add_metrics_callbacks=None,
    slice_spec=None,
    desired_batch_size=None,
):
  """Evaluate the given EvalSavedModel on the given examples.

  This is for TFMA use only. Users should call tfma.EvaluateAndWriteResults
  instead of this function.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should
      contain the saved_model.pb file.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph. The names of the metrics added by the callbacks
      should not conflict with existing metrics, or metrics added by other
      callbacks. See below for more details about what each callback should
      do.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to
      slice the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  More details on add_metrics_callbacks:

    Each add_metrics_callback should have the following prototype:
      def add_metrics_callback(features_dict, predictions_dict, labels_dict):

    Note that features_dict, predictions_dict and labels_dict are not
    necessarily dictionaries - they might also be Tensors, depending on what
    the model's eval_input_receiver_fn returns.

    It should create and return a metric_ops dictionary, such that
    metric_ops['metric_name'] = (value_op, update_op), just as in the Trainer.

    Short example:

    def add_metrics_callback(features_dict, predictions_dict, labels):
      metric_ops = {}
      metric_ops['mean_label'] = tf.metrics.mean(labels)
      metric_ops['mean_probability'] = tf.metrics.mean(tf.slice(
        predictions_dict['probabilities'], [0, 1], [2, 1]))
      return metric_ops

  Returns:
    DoOutputsTuple. The tuple entries are
    PCollection of (slice key, metrics) and
    PCollection of (slice key, plot metrics).
  """
  if slice_spec is None:
    slice_spec = [slicer.SingleSliceSpec()]

  # One handle is passed to every model-loading stage below.
  model_kwargs = dict(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks,
      shared_handle=shared.Shared())

  # pylint: disable=no-value-for-parameter

  # Our diagnostic outputs, pass types.ExampleAndExtracts throughout,
  # however our aggregating functions do not use this interface.
  wrapped_examples = (
      examples
      | 'ToExampleAndExtracts' >> beam.Map(
          lambda x: types.ExampleAndExtracts(example=x, extracts={})))

  # Map function which loads and runs the eval_saved_model against every
  # example, yielding an types.ExampleAndExtracts containing a
  # FeaturesPredictionsLabels value (where key is 'fpl').
  predicted = (
      wrapped_examples
      | 'Predict' >> predict_extractor.TFMAPredict(
          desired_batch_size=desired_batch_size, **model_kwargs))

  # Input: one example fpl at a time
  # Output: one fpl example per slice key (notice that the example turns
  #         into n, replicated once per applicable slice key)
  sliced = predicted | 'Slice' >> slice_api.Slice(slice_spec)

  # Each slice key lands on one shard where metrics are computed for all
  # examples in that shard -- the "map" and "reduce" parts of the
  # computation happen within this shard.
  # Output: Tuple[slicer.SliceKeyType, MetricVariablesType]
  aggregated = (
      sliced
      | 'Aggregate' >> _Aggregate(
          desired_batch_size=desired_batch_size, **model_kwargs))

  # Different metrics for a given slice key are brought together.
  return aggregated | 'ExtractOutput' >> _ExtractOutput(**model_kwargs)
def expand(self, dataset_and_transform_fn):
  """Transforms the dataset using the transform_fn.

  Args:
    dataset_and_transform_fn: A tuple of dataset and preprocessing function,
      i.e. `((input_values, input_metadata), (transform_fn,
      output_metadata))`.

  Returns:
    A `(output_instances, output_metadata)` tuple: the dataset transformed
    according to the transform_fn, and its metadata with any excluded outputs
    removed.
  """
  dataset, transform_fn_and_metadata = dataset_and_transform_fn
  input_values, input_metadata = dataset
  transform_fn, output_metadata = transform_fn_and_metadata

  excluded = self._exclude_outputs

  def _without_excluded_columns(metadata):
    """Returns a DatasetMetadata holding only the non-excluded columns."""
    kept_columns = {}
    for key, column_schema in six.iteritems(metadata.schema.column_schemas):
      if key not in excluded:
        kept_columns[key] = column_schema
    return dataset_metadata.DatasetMetadata(
        schema=dataset_schema.Schema(kept_columns))

  # If exclude_outputs is set, update the output metadata.
  if excluded is not None:
    if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
      # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict.
      inner_metadata, pcollections = output_metadata
      # Update DatasetMetadata to remove excluded outputs.
      inner_metadata = _without_excluded_columns(inner_metadata)
      # Update pcollections to keep only pcollections that resolve futures in
      # the updated metadata.
      unresolved_future_names = {
          future.name for future in inner_metadata.substitute_futures({})
      }
      kept_pcollections = {}
      for name, pcollection in six.iteritems(pcollections):
        if name in unresolved_future_names:
          kept_pcollections[name] = pcollection
      # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata.
      output_metadata = beam_metadata_io.BeamDatasetMetadata(
          inner_metadata, kept_pcollections)
    else:
      output_metadata = _without_excluded_columns(output_metadata)

  serialized_tf_config = (
      common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
          self.pipeline.runner))

  batched_inputs = input_values | 'Batch' >> _BatchElements()
  transformed_batches = (
      batched_inputs
      | 'Transform' >> beam.ParDo(
          _RunMetaGraphDoFn(
              input_metadata.schema,
              serialized_tf_config,
              shared_graph_state_handle=shared.Shared(),
              passthrough_keys=Context.get_passthrough_keys(),
              exclude_outputs=excluded),
          saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))
  output_instances = (
      transformed_batches
      | 'ConvertAndUnbatch' >> beam.FlatMap(
          _convert_and_unbatch_to_instance_dicts,
          schema=output_metadata.schema,
          passthrough_keys=Context.get_passthrough_keys()))

  _clear_shared_state_after_barrier(self.pipeline, output_instances)

  return (output_instances, output_metadata)
def expand(self, dataset):
  """Analyze the dataset.

  Builds the preprocessing graph, computes the analyzers phase by phase, and
  writes a SavedModel for the final transform with the analyzer outputs
  replaced by constants.

  Args:
    dataset: A dataset, as a pair `(input_values, input_metadata)`.

  Returns:
    A pair `(transform_fn, full_metadata)`: the deferred transform function
    and a `BeamDatasetMetadata` wrapping the inferred metadata together with
    the deferred tensor values it refers to.

  Raises:
    ValueError: If preprocessing_fn has no outputs, or if the graph it built
      contains trainable variables.
  """
  input_values, input_metadata = dataset
  input_schema = input_metadata.schema

  base_temp_dir = Context.create_base_temp_dir()

  graph = tf.Graph()
  with graph.as_default():

    with tf.name_scope('inputs'):
      inputs = input_schema.as_batched_placeholders()
    # In order to avoid a bug where import_graph_def fails when the input_map
    # and return_elements of an imported graph are the same (b/34288791), we
    # avoid using the placeholder of an input column as an output of a graph.
    # We do this by applying tf.identity to all inputs of the
    # preprocessing_fn.  Note this applies at the level of raw tensors.
    outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not outputs:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

    # NOTE: it's important that create_phases is called directly after
    # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
    # collection which would break the logic in create_phases.
    phases = impl_helper.create_phases()

    # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue.  We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    # `get_collection_ref` returns the live list, so the mutations below
    # change the graph's collection itself. A copy is kept for restoring.
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    serialized_tf_config = (
        common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level.  The column names of the
      # outputs are the tensor names of the analyzer inputs in the graph.
      # This graph has the analyzer outputs computed so far replaced with
      # constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzers:
        for input_tensor in analyzer.inputs:
          analyzer_inputs[input_tensor.name] = input_tensor
      # Initializers accumulate across phases: each phase adds its own to
      # those of the earlier phases.
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, analyzer_inputs,
                             unbound_saved_model_dir)
      saved_model_dir = (
          tensor_pcoll_mapping
          | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
          _ReplaceTensorsWithConstants(unbound_saved_model_dir, base_temp_dir,
                                       input_values.pipeline))

      # Run this saved model on the input dataset to obtain the inputs to the
      # analyzers.
      analyzer_input_values = (
          input_values
          | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
          | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
              _RunMetaGraphDoFn(
                  input_schema,
                  serialized_tf_config,
                  shared_graph_state_handle=shared.Shared(),
                  passthrough_keys=Context.get_passthrough_keys()),
              saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

      # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
      # map from tensor names to singleton PCollections of `_TensorValue`s.
      analyzer_outputs_dict = (
          analyzer_input_values
          | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
              phase.analyzers, base_temp_dir))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(analyzer_outputs_dict)

    # Restore the original table initializers before writing the final
    # transform SavedModel.
    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)
    transform_fn = (
        tensor_pcoll_mapping
        | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
            saved_model_dir, base_temp_dir, input_values.pipeline))

    # Infer metadata.  The metadata may contain Futures that refer to the
    # values of tensors in the graph.  In that case, the tensors must be
    # "constant" in that they don't depend on input data.  The tensors can
    # depend on analyzer outputs though.  This allows us to set metadata that
    # depends on analyzer outputs.
    #
    # We first extract the names of the tensors that are referenced by the
    # Futures, and then compute them by calling _ComputeTensorValues with
    # the tensor-PCollection mapping representing the analyzer outputs.
    metadata = dataset_metadata.DatasetMetadata(
        schema=impl_helper.infer_feature_schema(outputs))

    deferred_metadata_tensor_names = {
        future.name
        for column_schema in metadata.schema.column_schemas.values()
        for future in column_schema.substitute_futures({})
    }
    name_pcoll_dict = (
        tensor_pcoll_mapping
        | 'ComputeTensorValues' >>
        _ComputeTensorValues(deferred_metadata_tensor_names, saved_model_dir,
                             input_values.pipeline))
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, name_pcoll_dict)

    # Shared state is cleared only once transform_fn is available.
    _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

    return transform_fn, full_metadata
def expand(self, dataset):
  """Analyze the dataset.

  Runs the preprocessing function to build the graph, computes the analyzers
  phase by phase, and writes a SavedModel for the final transform with the
  analyzer outputs replaced by constants.

  Args:
    dataset: A dataset, as a pair `(input_values, input_metadata)`.

  Returns:
    A pair `(transform_fn, full_metadata)`: the deferred transform function
    and a `BeamDatasetMetadata` wrapping the inferred metadata together with
    the deferred tensor values it refers to.
  """
  input_values, input_metadata = dataset
  input_schema = input_metadata.schema

  base_temp_dir = Context.create_base_temp_dir()

  # NOTE: it's important that create_phases is called directly after
  # run_preprocessing_fn, because we later mutate the graph's
  # TABLE_INITIALIZERS collection which would break the logic in
  # create_phases.
  graph, inputs, outputs = impl_helper.run_preprocessing_fn(
      self._preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)

  # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
  # names to singleton PCollections containing a _TensorValue.  We compute
  # tensor_pcoll_mapping in phases, where at each phase we compute the
  # analyzers that are ready to run and update tensor_pcoll_mapping.
  tensor_pcoll_mapping = {}
  # `get_collection_ref` returns the live list, so the mutations below change
  # the graph's collection itself. A copy is kept for restoring.
  table_initializers = graph.get_collection_ref(
      tf.GraphKeys.TABLE_INITIALIZERS)
  original_table_initializers = list(table_initializers)
  del table_initializers[:]

  serialized_tf_config = (
      analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
          input_values.pipeline.runner))
  for level, phase in enumerate(phases):
    # Create a SavedModel that describes the mapping from the input data
    # to the inputs of the analyzers at this level.  The column names of the
    # outputs are the tensor names of the analyzer inputs in the graph.  This
    # graph has the analyzer outputs computed so far replaced with constants.
    analyzer_inputs = {}
    for analyzer in phase.analyzers:
      for input_tensor in analyzer.inputs:
        analyzer_inputs[input_tensor.name] = input_tensor
    # Initializers accumulate across phases: each phase adds its own to those
    # of the earlier phases.
    table_initializers.extend(phase.table_initializers)
    unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(
        graph, inputs, analyzer_inputs, unbound_saved_model_dir)
    saved_model_dir = (
        tensor_pcoll_mapping
        | 'CreateSavedModelForAnaylzerInputs[%d]' % level >>
        _ReplaceTensorsWithConstants(
            unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

    # Run this saved model on the input dataset to obtain the inputs to the
    # analyzers.
    analyzer_input_values = (
        input_values
        | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared()),
            saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

    # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
    # map from tensor names to singleton PCollections of `_TensorValue`s.
    analyzer_outputs_dict = (
        analyzer_input_values
        | 'ComputeAnalyzerOutputs[%d]' % level
        >> _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

    # Update the mapping for all analyzers.
    tensor_pcoll_mapping.update(analyzer_outputs_dict)

  # Restore the original table initializers before writing the final
  # transform SavedModel.
  del table_initializers[:]
  table_initializers.extend(original_table_initializers)
  saved_model_dir = _make_unique_temp_dir(base_temp_dir)
  _write_saved_transform(graph, inputs, outputs, saved_model_dir)
  transform_fn = (
      tensor_pcoll_mapping
      | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
          saved_model_dir, base_temp_dir, input_values.pipeline))

  # Infer metadata.  The metadata may contain Futures that refer to the values
  # of tensors in the graph.  In that case, the tensors must be "constant" in
  # that they don't depend on input data.  The tensors can depend on analyzer
  # outputs though.  This allows us to set metadata that depends on analyzer
  # outputs.
  #
  # We first extract the names of the tensors that are referenced by the
  # Futures, and then compute them by calling _ComputeTensorValues with the
  # tensor-PCollection mapping representing the analyzer outputs.
  metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(graph, outputs))

  deferred_metadata_tensor_names = [
      future.name
      for column_schema in tft_api.get_column_schemas(graph).values()
      for future in column_schema.substitute_futures({})]
  name_pcoll_dict = (
      tensor_pcoll_mapping
      | 'ComputeTensorValues' >> _ComputeTensorValues(
          deferred_metadata_tensor_names, saved_model_dir,
          input_values.pipeline))
  full_metadata = beam_metadata_io.BeamDatasetMetadata(
      metadata, name_pcoll_dict)

  # Shared state is cleared only once transform_fn is available.
  _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

  return transform_fn, full_metadata