def __init__(self,
             statistics: types.Channel = None,
             schema: types.Channel = None,
             output: Optional[types.Channel] = None,
             stats: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """Construct an ExampleValidator component.

  Args:
    statistics: A Channel of 'ExampleStatisticsPath' type. This should
      contain at least the 'eval' split. Other splits are currently ignored.
    schema: A Channel of 'SchemaPath' type. _required_
    output: Output channel of 'ExampleValidationPath' type.
    stats: Backwards compatibility alias for the 'statistics' argument.
    instance_name: Optional name assigned to this specific instance of
      ExampleValidator. Required only if multiple ExampleValidator components
      are declared in the same pipeline.

  Either `stats` or `statistics` must be present in the arguments.
  """
  # 'stats' is the deprecated alias; prefer 'statistics' when both are given.
  statistics = statistics or stats
  # Default output channel carries a single fresh ExampleAnomalies artifact.
  output = output or types.Channel(
      type=standard_artifacts.ExampleAnomalies,
      artifacts=[standard_artifacts.ExampleAnomalies()])
  spec = ExampleValidatorSpec(stats=statistics, schema=schema, output=output)
  super(ExampleValidator, self).__init__(spec=spec,
                                         instance_name=instance_name)
def __init__(self,
             statistics: types.Channel = None,
             schema: types.Channel = None,
             output: Optional[types.Channel] = None,
             stats: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """Construct an ExampleValidator component.

  Args:
    statistics: A Channel of type `standard_artifacts.ExampleStatistics`.
      This should contain at least 'eval' split. Other splits are currently
      ignored.
    schema: A Channel of type `standard_artifacts.Schema`. _required_
    output: Output channel of type `standard_artifacts.ExampleAnomalies`.
    stats: Backwards compatibility alias for the 'statistics' argument.
    instance_name: Optional name assigned to this specific instance of
      ExampleValidator. Required only if multiple ExampleValidator components
      are declared in the same pipeline.

  Either `stats` or `statistics` must be present in the arguments.
  """
  if stats:
    # Fixed: the deprecation message previously named the wrong component
    # ("StatisticsGen"); this is the ExampleValidator constructor.
    absl.logging.warning(
        'The "stats" argument to the ExampleValidator component has '
        'been renamed to "statistics" and is deprecated. Please update your '
        'usage as support for this argument will be removed soon.')
    statistics = stats
  # Default output channel carries a single fresh ExampleAnomalies artifact.
  anomalies = output or types.Channel(
      type=standard_artifacts.ExampleAnomalies,
      artifacts=[standard_artifacts.ExampleAnomalies()])
  spec = ExampleValidatorSpec(
      statistics=statistics, schema=schema, anomalies=anomalies)
  super(ExampleValidator, self).__init__(
      spec=spec, instance_name=instance_name)
def testDo(self):
  """Runs the validator executor and checks per-split anomaly outputs."""
  # Test fixtures live next to this file's parent directory.
  testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  stats = standard_artifacts.ExampleStatistics()
  stats.uri = os.path.join(testdata_dir, 'statistics_gen')
  stats.split_names = artifact_utils.encode_split_names(
      ['train', 'eval', 'test'])

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(testdata_dir, 'schema_gen')

  out_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  result_artifact = standard_artifacts.ExampleAnomalies()
  result_artifact.uri = os.path.join(out_dir, 'output')

  input_dict = {
      STATISTICS_KEY: [stats],
      SCHEMA_KEY: [schema],
  }
  output_dict = {
      ANOMALIES_KEY: [result_artifact],
  }
  exec_properties = {
      # List needs to be serialized before being passed into Do function.
      EXCLUDE_SPLITS_KEY: json_utils.dumps(['test'])
  }

  executor.Executor().Do(input_dict, output_dict, exec_properties)

  # Only the non-excluded splits should be recorded on the output artifact.
  self.assertEqual(
      artifact_utils.encode_split_names(['train', 'eval']),
      result_artifact.split_names)

  # Check example_validator outputs.
  train_diff = os.path.join(result_artifact.uri, 'Split-train',
                            'SchemaDiff.pb')
  eval_diff = os.path.join(result_artifact.uri, 'Split-eval', 'SchemaDiff.pb')
  self.assertTrue(fileio.exists(train_diff))
  self.assertTrue(fileio.exists(eval_diff))

  train_anomalies = anomalies_pb2.Anomalies()
  train_anomalies.ParseFromString(io_utils.read_bytes_file(train_diff))
  eval_anomalies = anomalies_pb2.Anomalies()
  eval_anomalies.ParseFromString(io_utils.read_bytes_file(eval_diff))
  self.assertEqual(0, len(train_anomalies.anomaly_info))
  self.assertEqual(0, len(eval_anomalies.anomaly_info))

  # Assert 'test' split is excluded.
  excluded_diff = os.path.join(result_artifact.uri, 'Split-test',
                               'SchemaDiff.pb')
  self.assertFalse(fileio.exists(excluded_diff))
def __init__(self,
             statistics: types.Channel = None,
             schema: types.Channel = None,
             output: Optional[types.Channel] = None,
             stats: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """An ExampleValidator component for examples.

  TFX has its own ExampleValidator component, and this one uses the same
  executor. This component exists because the TFX component does not allow
  specifying which splits to use; it just assumes `train` and `eval`. It
  will be unnecessary once TFX allows setting the input and output splits
  the way other components do.

  Args:
    statistics: A Channel of type `standard_artifacts.ExampleStatistics`.
    schema: A Channel of type `standard_artifacts.Schema`.
    output: Output channel of type `standard_artifacts.ExampleAnomalies`.
    stats: Backwards compatibility alias for the 'statistics' argument.
    instance_name: Optional name assigned to this specific instance of
      ExampleValidator.
  """
  if stats:
    # Fixed: the deprecation message previously named the wrong component
    # ("StatisticsGen"); this is the ExampleValidator constructor.
    logging.warning(
        'The "stats" argument to the ExampleValidator component has '
        'been renamed to "statistics" and is deprecated. Please update'
        ' your usage as support for this argument will be removed'
        ' soon.')
    statistics = stats
  # Default output channel carries a single fresh ExampleAnomalies artifact.
  anomalies = output or types.Channel(
      type=standard_artifacts.ExampleAnomalies,
      artifacts=[standard_artifacts.ExampleAnomalies()])
  spec = ExampleValidatorSpec(
      statistics=statistics, schema=schema, anomalies=anomalies)
  super(ExampleValidator, self).__init__(
      spec=spec, instance_name=instance_name)
def testShow(self, *unused_mocks):
  """Showing an ExampleAnomalies channel invokes its visualization."""
  context = interactive_context.InteractiveContext()
  display_mock = mock.MagicMock()
  # Patch the visualization's display entry point so the call is observable.
  standard_visualizations.ExampleAnomaliesVisualization.display = display_mock
  display_mock.assert_not_called()
  anomalies = standard_artifacts.ExampleAnomalies()
  channel = types.Channel(
      type=standard_artifacts.ExampleAnomalies, artifacts=[anomalies])
  context.show(channel)
  display_mock.assert_called_with(anomalies)
def __init__(self,
             statistics: types.Channel = None,
             schema: types.Channel = None,
             exclude_splits: Optional[List[Text]] = None,
             output: Optional[types.Channel] = None,
             stats: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """Construct an ExampleValidator component.

  Args:
    statistics: A Channel of type `standard_artifacts.ExampleStatistics`.
      This should contain at least 'eval' split. Other splits are currently
      ignored.
    schema: A Channel of type `standard_artifacts.Schema`. _required_
    exclude_splits: Names of splits that the example validator should not
      validate. Default behavior (when exclude_splits is set to None) is
      excluding no splits.
    output: Output channel of type `standard_artifacts.ExampleAnomalies`.
    stats: Backwards compatibility alias for the 'statistics' argument.
    instance_name: Optional name assigned to this specific instance of
      ExampleValidator. Required only if multiple ExampleValidator components
      are declared in the same pipeline.

  Either `stats` or `statistics` must be present in the arguments.
  """
  if stats:
    # Fixed: the deprecation message literal was broken mid-sentence and
    # named the wrong component ("StatisticsGen"); this is ExampleValidator.
    logging.warning(
        'The "stats" argument to the ExampleValidator component has '
        'been renamed to "statistics" and is deprecated. Please update your '
        'usage as support for this argument will be removed soon.')
    statistics = stats
  if exclude_splits is None:
    exclude_splits = []
    logging.info('Excluding no splits because exclude_splits is not set.')
  anomalies = output
  if not anomalies:
    # Pre-compute the output split names: every statistics split that is
    # not explicitly excluded.
    anomalies_artifact = standard_artifacts.ExampleAnomalies()
    statistics_split_names = artifact_utils.decode_split_names(
        artifact_utils.get_single_instance(list(
            statistics.get())).split_names)
    split_names = [
        split for split in statistics_split_names
        if split not in exclude_splits
    ]
    anomalies_artifact.split_names = artifact_utils.encode_split_names(
        split_names)
    anomalies = types.Channel(
        type=standard_artifacts.ExampleAnomalies,
        artifacts=[anomalies_artifact])
  spec = ExampleValidatorSpec(
      statistics=statistics,
      schema=schema,
      exclude_splits=json_utils.dumps(exclude_splits),
      anomalies=anomalies)
  super(ExampleValidator, self).__init__(
      spec=spec, instance_name=instance_name)
def testGetStatusOutputPathsEntries(self):
  """Checks path-label entries for disabled and enabled stats outputs."""
  # disabled.
  self.assertEmpty(executor_utils.GetStatsOutputPathEntries(True, {}))

  # enabled.
  def _make(artifact_cls, uri):
    # Small helper: build an artifact with the given uri.
    instance = artifact_cls()
    instance.uri = uri
    return instance

  pre_stats = _make(standard_artifacts.ExampleStatistics,
                    '/pre_transform_stats')
  pre_schema = _make(standard_artifacts.Schema, '/pre_transform_schema')
  post_anomalies = _make(standard_artifacts.ExampleAnomalies,
                         '/post_transform_anomalies')
  post_stats = _make(standard_artifacts.ExampleStatistics,
                     '/post_transform_stats')
  post_schema = _make(standard_artifacts.Schema, '/post_transform_schema')

  result = executor_utils.GetStatsOutputPathEntries(
      False, {
          standard_component_specs.PRE_TRANSFORM_STATS_KEY: [pre_stats],
          standard_component_specs.PRE_TRANSFORM_SCHEMA_KEY: [pre_schema],
          standard_component_specs.POST_TRANSFORM_ANOMALIES_KEY:
              [post_anomalies],
          standard_component_specs.POST_TRANSFORM_STATS_KEY: [post_stats],
          standard_component_specs.POST_TRANSFORM_SCHEMA_KEY: [post_schema],
      })

  expected = {
      labels.PRE_TRANSFORM_OUTPUT_STATS_PATH_LABEL: '/pre_transform_stats',
      labels.PRE_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL: '/pre_transform_schema',
      labels.POST_TRANSFORM_OUTPUT_ANOMALIES_PATH_LABEL:
          '/post_transform_anomalies',
      labels.POST_TRANSFORM_OUTPUT_STATS_PATH_LABEL: '/post_transform_stats',
      labels.POST_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL:
          '/post_transform_schema',
  }
  self.assertEqual(expected, result)
def __init__(self,
             stats: types.Channel,
             schema: types.Channel,
             output: Optional[types.Channel] = None,
             name: Optional[Text] = None):
  """Construct an ExampleValidator component.

  Args:
    stats: A Channel of 'ExampleStatisticsPath' type. This should contain at
      least the 'eval' split. Other splits are currently ignored.
    schema: A Channel of 'SchemaPath' type.
    output: Optional output channel of 'ExampleValidationPath' type.
    name: Optional unique name. Necessary iff multiple ExampleValidator
      components are declared in the same pipeline.
  """
  # Default output channel carries a single fresh ExampleAnomalies artifact.
  output = output or types.Channel(
      type=standard_artifacts.ExampleAnomalies,
      artifacts=[standard_artifacts.ExampleAnomalies()])
  spec = ExampleValidatorSpec(stats=stats, schema=schema, output=output)
  super(ExampleValidator, self).__init__(spec=spec, name=name)
def testDo(self):
  """Runs the validator executor and checks the anomalies.pbtxt output."""
  # Test fixtures live next to this file's parent directory.
  testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  stats = standard_artifacts.ExampleStatistics()
  stats.uri = os.path.join(testdata_dir, 'statistics_gen')
  stats.split_names = artifact_utils.encode_split_names(['eval'])

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(testdata_dir, 'schema_gen')

  out_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  result_artifact = standard_artifacts.ExampleAnomalies()
  result_artifact.uri = os.path.join(out_dir, 'output')

  inputs = {
      executor.STATISTICS_KEY: [stats],
      executor.SCHEMA_KEY: [schema],
  }
  outputs = {
      executor.ANOMALIES_KEY: [result_artifact],
  }
  executor.Executor().Do(inputs, outputs, {})

  # Exactly one anomalies.pbtxt file is written, and it is non-empty.
  self.assertEqual(['anomalies.pbtxt'],
                   tf.io.gfile.listdir(result_artifact.uri))
  parsed = io_utils.parse_pbtxt_file(
      os.path.join(result_artifact.uri, 'anomalies.pbtxt'),
      anomalies_pb2.Anomalies())
  self.assertNotEqual(0, len(parsed.anomaly_info))
def testDo(self):
  """Runs the validator executor against eval stats and checks anomalies."""
  # Test fixtures live next to this file's parent directory.
  testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  stats = types.Artifact('ExampleStatsPath', split='eval')
  stats.uri = os.path.join(testdata_dir, 'statistics_gen/eval/')

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(testdata_dir, 'schema_gen/')

  out_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  result_artifact = standard_artifacts.ExampleAnomalies()
  result_artifact.uri = os.path.join(out_dir, 'output')

  inputs = {
      'stats': [stats],
      'schema': [schema],
  }
  outputs = {
      'output': [result_artifact],
  }
  executor.Executor().Do(inputs, outputs, {})

  # Exactly one anomalies.pbtxt file is written, and it is non-empty.
  self.assertEqual(['anomalies.pbtxt'],
                   tf.gfile.ListDirectory(result_artifact.uri))
  parsed = io_utils.parse_pbtxt_file(
      os.path.join(result_artifact.uri, 'anomalies.pbtxt'),
      anomalies_pb2.Anomalies())
  self.assertNotEqual(0, len(parsed.anomaly_info))
def _make_base_do_params(self, source_data_dir, output_data_dir):
  """Populates the shared input/output dicts used by the Transform tests.

  Args:
    source_data_dir: Directory containing test fixtures (schema_gen, etc.).
    output_data_dir: Directory under which output artifact URIs are rooted.
  """
  # Create input dict.
  example1 = standard_artifacts.Examples()
  example1.uri = self._ARTIFACT1_URI
  example1.split_names = artifact_utils.encode_split_names(
      ['train', 'eval'])
  example2 = copy.deepcopy(example1)
  example2.uri = self._ARTIFACT2_URI
  self._example_artifacts = [example1, example2]
  schema_artifact = standard_artifacts.Schema()
  schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen')
  # Only the first examples artifact is fed as input by default.
  self._input_dict = {
      standard_component_specs.EXAMPLES_KEY: self._example_artifacts[:1],
      standard_component_specs.SCHEMA_KEY: [schema_artifact],
  }
  # Create output dict.
  self._transformed_output = standard_artifacts.TransformGraph()
  self._transformed_output.uri = os.path.join(output_data_dir,
                                              'transformed_graph')
  transformed1 = standard_artifacts.Examples()
  transformed1.uri = os.path.join(output_data_dir, 'transformed_examples',
                                  '0')
  transformed2 = standard_artifacts.Examples()
  transformed2.uri = os.path.join(output_data_dir, 'transformed_examples',
                                  '1')
  self._transformed_example_artifacts = [transformed1, transformed2]
  # Transform needs a scratch directory; use a real temp dir.
  temp_path_output = _TempPath()
  temp_path_output.uri = tempfile.mkdtemp()
  self._updated_analyzer_cache_artifact = standard_artifacts.TransformCache()
  # NOTE(review): this uses self._output_data_dir while the rest of the
  # method uses the output_data_dir parameter — presumably set in setUp;
  # confirm the two always refer to the same directory.
  self._updated_analyzer_cache_artifact.uri = os.path.join(
      self._output_data_dir, 'CACHE')
  self._pre_transform_schema = standard_artifacts.Schema()
  self._pre_transform_schema.uri = os.path.join(output_data_dir,
                                                'pre_transform_schema', '0')
  self._pre_transform_stats = standard_artifacts.ExampleStatistics()
  self._pre_transform_stats.uri = os.path.join(output_data_dir,
                                               'pre_transform_stats', '0')
  self._post_transform_schema = standard_artifacts.Schema()
  self._post_transform_schema.uri = os.path.join(
      output_data_dir, 'post_transform_schema', '0')
  self._post_transform_stats = standard_artifacts.ExampleStatistics()
  self._post_transform_stats.uri = os.path.join(output_data_dir,
                                                'post_transform_stats', '0')
  self._post_transform_anomalies = standard_artifacts.ExampleAnomalies()
  self._post_transform_anomalies.uri = os.path.join(
      output_data_dir, 'post_transform_anomalies', '0')
  self._output_dict = {
      standard_component_specs.TRANSFORM_GRAPH_KEY: [
          self._transformed_output
      ],
      standard_component_specs.TRANSFORMED_EXAMPLES_KEY:
          self._transformed_example_artifacts[:1],
      executor.TEMP_PATH_KEY: [temp_path_output],
      standard_component_specs.UPDATED_ANALYZER_CACHE_KEY: [
          self._updated_analyzer_cache_artifact
      ],
      standard_component_specs.PRE_TRANSFORM_STATS_KEY: [
          self._pre_transform_stats
      ],
      standard_component_specs.PRE_TRANSFORM_SCHEMA_KEY: [
          self._pre_transform_schema
      ],
      standard_component_specs.POST_TRANSFORM_ANOMALIES_KEY: [
          self._post_transform_anomalies
      ],
      standard_component_specs.POST_TRANSFORM_STATS_KEY: [
          self._post_transform_stats
      ],
      standard_component_specs.POST_TRANSFORM_SCHEMA_KEY: [
          self._post_transform_schema
      ],
  }
  # Create exec properties skeleton.
  self._exec_properties = {}