def testMakeDefaultOutputConfig(self):
    """Checks the number of default output splits for two kinds of input.

    With the default input config the generated output config is expected to
    carry two splits; with an input config that already declares its own
    'train' and 'eval' splits it is expected to carry none.
    """
    default_input = utils.make_default_input_config()
    output_config = utils.make_default_output_config(default_input)
    self.assertEqual(2, len(output_config.split_config.splits))

    # Input that is already partitioned into named splits.
    train_split = example_gen_pb2.Input.Split(name='train', pattern='train/*')
    eval_split = example_gen_pb2.Input.Split(name='eval', pattern='eval/*')
    pre_split_input = example_gen_pb2.Input(splits=[train_split, eval_split])
    output_config = utils.make_default_output_config(pre_split_input)
    self.assertEqual(0, len(output_config.split_config.splits))
def __init__(self,
             executor: Any,
             input_base: Optional[channel.Channel] = None,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None,
             component_name: Optional[Text] = 'ExampleGen',
             unique_name: Optional[Text] = None,
             outputs: Optional[base_component.ComponentOutputs] = None):
    """Construct an ExampleGen component.

    Args:
      executor: Executor passed through to the base component.
      input_base: Optional channel used as the 'input-base' input. When it is
        truthy the ExampleGen-specific driver is used, otherwise the base
        driver.
      input_config: Optional example_gen_pb2.Input instance. Falls back to
        utils.make_default_input_config() when not provided.
      output_config: Optional example_gen_pb2.Output instance. Falls back to
        utils.make_default_output_config(input_config) when not provided.
      component_name: Name forwarded to the base component.
      unique_name: Unique name forwarded to the base component.
      outputs: Optional component outputs forwarded to the base component.

    Raises:
      RuntimeError: If both input_base and input_config are None.
    """
    if input_base is None and input_config is None:
        raise RuntimeError(
            'One of input_base and input_config must be set.')

    if input_base:
        input_dict = {'input-base': channel.as_channel(input_base)}
    else:
        input_dict = {}

    # Default value need to be set in component instead of executor as output
    # artifacts depend on it.
    if input_config:
        self._input_config = input_config
    else:
        self._input_config = utils.make_default_input_config()
    if output_config:
        self._output_config = output_config
    else:
        self._output_config = utils.make_default_output_config(
            self._input_config)

    # The configs are handed to the executor as JSON strings.
    exec_properties = {
        'input': json_format.MessageToJson(self._input_config),
        'output': json_format.MessageToJson(self._output_config),
    }
    driver_class = driver.Driver if input_base else base_driver.BaseDriver
    super(ExampleGen, self).__init__(
        component_name=component_name,
        unique_name=unique_name,
        driver=driver_class,
        executor=executor,
        input_dict=input_dict,
        outputs=outputs,
        exec_properties=exec_properties)
def __init__(
    self,
    input_config: Union[example_gen_pb2.Input, data_types.RuntimeParameter],
    output_config: Optional[Union[example_gen_pb2.Output,
                                  data_types.RuntimeParameter]] = None,
    custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                  data_types.RuntimeParameter]] = None,
    range_config: Optional[Union[range_config_pb2.RangeConfig,
                                 data_types.RuntimeParameter]] = None,
    output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
    output_file_format: Optional[int] = example_gen_pb2.FORMAT_TFRECORDS_GZIP,
):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of 2:1.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      output_file_format: File format of generated data in output artifact,
        one of example_gen_pb2.FileFormat enum.

    Raises:
      ValueError: If output_data_format is not a value of the
        example_gen_pb2.PayloadFormat enum, or output_file_format is not a
        value of the example_gen_pb2.FileFormat enum.
    """
    # Configure outputs.
    output_config = output_config or utils.make_default_output_config(
        input_config)
    example_artifacts = types.Channel(type=standard_artifacts.Examples)

    # NOTE: adjacent string literals are concatenated verbatim, so the first
    # fragment must end with a space to keep the message readable.
    if output_data_format not in example_gen_pb2.PayloadFormat.values():
        raise ValueError(
            'The value of output_data_format must be defined in '
            'the example_gen_pb2.PayloadFormat proto.')
    if output_file_format not in example_gen_pb2.FileFormat.values():
        raise ValueError(
            'The value of output_file_format must be defined in '
            'the example_gen_pb2.FileFormat proto.')

    spec = standard_component_specs.QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        range_config=range_config,
        output_data_format=output_data_format,
        output_file_format=output_file_format,
        custom_config=custom_config,
        examples=example_artifacts)
    super().__init__(spec=spec)
def __init__(self,
             input_config: example_gen_pb2.Input,
             output_config: Optional[example_gen_pb2.Output] = None,
             component_name: Optional[Text] = 'ExampleGen',
             example_artifacts: Optional[channel.Channel] = None,
             name: Optional[Text] = None):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An example_gen_pb2.Input instance, providing input
        configuration.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      component_name: Name of the component, should be unique per component
        class. Default to 'ExampleGen', can be overwritten by sub-classes.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      name: Unique name for every component class instance.
    """
    # Configure outputs.
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # One 'ExamplesPath' artifact per output split.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        example_artifacts = channel.as_channel([
            types.TfxArtifact('ExamplesPath', split=split_name)
            for split_name in split_names
        ])

    spec = QueryBasedExampleGenSpec(
        component_name=component_name,
        input_config=input_config,
        output_config=output_config,
        examples=example_artifacts)
    super(_QueryBasedExampleGen, self).__init__(spec=spec, name=name)
def __init__(
    self,
    input_base: Optional[Text] = None,
    input_config: Optional[Union[example_gen_pb2.Input, Dict[Text, Any]]] = None,
    output_config: Optional[Union[example_gen_pb2.Output,
                                  Dict[Text, Any]]] = None,
    custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                  Dict[Text, Any]]] = None,
    range_config: Optional[Union[range_config_pb2.RangeConfig,
                                 Dict[Text, Any]]] = None,
    output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
    example_artifacts: Optional[types.Channel] = None,
    custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
    instance_name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: an external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, input files will be
        treated as a single split.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider. If unset, driver will
        default to searching for latest span with no restrictions.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
    # Configure inputs and outputs, filling in defaults for anything omitted.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)
    example_artifacts = example_artifacts or types.Channel(
        type=standard_artifacts.Examples)

    spec = FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        range_config=range_config,
        output_data_format=output_data_format,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec,
        custom_executor_spec=custom_executor_spec,
        instance_name=instance_name)
def __init__(self,
             input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
             output_config: Optional[Union[example_gen_pb2.Output,
                                           Dict[Text, Any]]] = None,
             custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                           Dict[Text, Any]]] = None,
             output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
             example_artifacts: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as Input proto message. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of
        2:1. If any field is provided as a RuntimeParameter, output_config
        should be constructed as a dict with the same field names as Output
        proto message.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen. If any field
        is provided as a RuntimeParameter, custom_config should be constructed
        as a dict.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      example_artifacts: Channel of `standard_artifacts.Examples` for output
        train and eval examples.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.

    Raises:
      ValueError: The output_data_format value must be defined in the
        example_gen_pb2.PayloadFormat proto.
    """
    # Configure outputs.
    output_config = output_config or utils.make_default_output_config(
        input_config)
    if not example_artifacts:
        example_artifacts = types.Channel(type=standard_artifacts.Examples)

    # NOTE: adjacent string literals are concatenated verbatim, so the first
    # fragment must end with a space to keep the message readable.
    if output_data_format not in example_gen_pb2.PayloadFormat.values():
        raise ValueError(
            'The value of output_data_format must be defined in '
            'the example_gen_pb2.PayloadFormat proto.')

    spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        output_data_format=output_data_format,
        custom_config=custom_config,
        examples=example_artifacts)
    super(QueryBasedExampleGen, self).__init__(
        spec=spec, instance_name=instance_name)
def __init__(
    self,
    input: types.Channel = None,  # pylint: disable=redefined-builtin
    input_config: Optional[Union[example_gen_pb2.Input, Dict[Text, Any]]] = None,
    output_config: Optional[Union[example_gen_pb2.Output,
                                  Dict[Text, Any]]] = None,
    custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                  Dict[Text, Any]]] = None,
    example_artifacts: Optional[types.Channel] = None,
    custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
    input_base: Optional[types.Channel] = None,
    instance_name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input: A Channel of type `standard_artifacts.ExternalArtifact`, which
        includes one artifact whose uri is an external directory containing
        the data files. _required_
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, the files under
        input_base will be treated as a single dataset.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
      input_base: Backwards compatibility alias for the 'input' argument.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline. Either
        `input_base` or `input` must be present in the input arguments.
    """
    # 'input' wins when both it and its alias are supplied.
    if not input:
        input = input_base

    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # One Examples artifact per output split.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        example_artifacts = channel_utils.as_channel([
            standard_artifacts.Examples(split=str(split_name))
            for split_name in split_names
        ])

    spec = FileBasedExampleGenSpec(
        input_base=input,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec,
        custom_executor_spec=custom_executor_spec,
        instance_name=instance_name)
def __init__(
    self,
    input_base: Optional[str] = None,
    input_config: Optional[Union[example_gen_pb2.Input,
                                 data_types.RuntimeParameter]] = None,
    output_config: Optional[Union[example_gen_pb2.Output,
                                  data_types.RuntimeParameter]] = None,
    custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                  data_types.RuntimeParameter]] = None,
    range_config: Optional[Union[range_config_pb2.RangeConfig,
                                 data_types.RuntimeParameter]] = None,
    output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
    output_file_format: Optional[int] = example_gen_pb2.FORMAT_TFRECORDS_GZIP,
    custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: an external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, input files will be
        treated as a single split.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider. If unset, driver will
        default to searching for latest span with no restrictions.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      output_file_format: File format of generated data in output artifact,
        one of example_gen_pb2.FileFormat enum.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
    """
    # Configure inputs and outputs, filling in defaults for anything omitted.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    spec = standard_component_specs.FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        range_config=range_config,
        output_data_format=output_data_format,
        output_file_format=output_file_format,
        examples=types.Channel(type=standard_artifacts.Examples))
    super().__init__(spec=spec, custom_executor_spec=custom_executor_spec)
def __init__(self,
             input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
             output_config: Optional[Union[example_gen_pb2.Output,
                                           Dict[Text, Any]]] = None,
             custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                           Dict[Text, Any]]] = None,
             example_artifacts: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None,
             enable_cache: Optional[bool] = None):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as Input proto message. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of
        2:1. If any field is provided as a RuntimeParameter, output_config
        should be constructed as a dict with the same field names as Output
        proto message.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen. If any field
        is provided as a RuntimeParameter, custom_config should be constructed
        as a dict.
      example_artifacts: Channel of `standard_artifacts.Examples` for output
        train and eval examples.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
      enable_cache: Optional boolean to indicate if cache is enabled for the
        QueryBasedExampleGen component. If not specified, defaults to the
        value specified for pipeline's enable_cache parameter.
    """
    # Configure outputs.
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # A single Examples artifact records all output split names.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        artifact = standard_artifacts.Examples()
        artifact.split_names = artifact_utils.encode_split_names(split_names)
        example_artifacts = channel_utils.as_channel([artifact])

    spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(_QueryBasedExampleGen, self).__init__(
        spec=spec, instance_name=instance_name, enable_cache=enable_cache)
def __init__(
    self,
    input_base: types.Channel = None,
    input_config: Optional[example_gen_pb2.Input] = None,
    output_config: Optional[example_gen_pb2.Output] = None,
    custom_config: Optional[example_gen_pb2.CustomConfig] = None,
    component_name: Optional[Text] = 'ExampleGen',
    example_artifacts: Optional[types.Channel] = None,
    executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
    input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
    name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: A Channel of 'ExternalPath' type, which includes one
        artifact whose uri is an external directory with data files inside
        (required).
      input_config: An optional example_gen_pb2.Input instance, providing
        input configuration. If unset, the files under input_base (must set)
        will be treated as a single split.
      output_config: An optional example_gen_pb2.Output instance, providing
        output configuration. If unset, default splits will be 'train' and
        'eval' with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      component_name: Name of the component, should be unique per component
        class. Default to 'ExampleGen', can be overwritten by sub-classes.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      executor_class: Optional custom executor class overriding the default
        executor specified in the component attribute.
      input: Forwards compatibility alias for the 'input_base' argument.
      name: Unique name for every component class instance.
    """
    # 'input_base' wins when both it and its alias are supplied.
    if not input_base:
        input_base = input

    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # One Examples artifact per output split.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        example_artifacts = channel_utils.as_channel([
            standard_artifacts.Examples(split=split_name)
            for split_name in split_names
        ])

    spec = FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec, custom_executor_class=executor_class, name=name)
def testMakeDefaultOutputConfigWithParameter(self):
    """Checks default output config for a dict input with a RuntimeParameter.

    The input config is given as a dict with two splits, one of which is named
    by a RuntimeParameter; the resulting output config is expected to contain
    no splits.
    """
    split_name_param = data_types.RuntimeParameter(
        name='split-name', ptype=str, default=u'train')
    parameterized_split = {'name': split_name_param, 'pattern': 'train/*'}
    literal_split = {'name': 'eval', 'pattern': 'eval/*'}
    input_config = {'splits': [parameterized_split, literal_split]}

    output_config = utils.make_default_output_config(input_config)

    self.assertEqual(0, len(output_config.split_config.splits))
def __init__(self,
             conn_config: presto_config_pb2.PrestoConnConfig,
             query: Optional[Text] = None,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None,
             example_artifacts: Optional[channel.Channel] = None,
             name: Optional[Text] = None):
    """Constructs a PrestoExampleGen component.

    Args:
      conn_config: Parameters for Presto connection client.
      query: Presto sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        Presto sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      name: Optional unique name. Necessary if multiple PrestoExampleGen
        components are declared in the same pipeline.

    Raises:
      RuntimeError: If query and input_config are both set or both unset, or
        if the host field of conn_config is not set.
    """
    has_query = bool(query)
    has_input_config = bool(input_config)
    if has_query == has_input_config:
        raise RuntimeError(
            'Exactly one of query and input_config should be set.')
    if not conn_config.host:
        raise RuntimeError(
            'Required host field in connection config should be set.')

    if not input_config:
        input_config = utils.make_default_input_config(query)

    # The connection parameters travel to the executor inside CustomConfig.
    packed_custom_config = example_gen_pb2.CustomConfig()
    packed_custom_config.custom_config.Pack(conn_config)

    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    super(PrestoExampleGen, self).__init__(
        input_config=input_config,
        output_config=output_config,
        custom_config=packed_custom_config,
        component_name='PrestoExampleGen',
        example_artifacts=example_artifacts,
        name=name)
def __init__(
    self,
    input_example: channel.Channel,
    string_execution_parameter: Text,
    integer_execution_parameter: int,
    output_example: Optional[channel.Channel] = None,
    # don't change these three:
    input_config: Optional[example_gen_pb2.Input] = None,
    output_config: Optional[example_gen_pb2.Output] = None,
    name: Optional[Text] = None):
    """Constructs a Head Component.

    Args:
      input_example: A Channel of 'RandomTypeNameForInput' type (the type can
        be any string, as long as it is consistent in the channel, spec and
        artifacts).
      string_execution_parameter: A string execution parameter (only used in
        executor, not persistent or shared upstream).
      integer_execution_parameter: An integer execution parameter (only used
        in executor, not persistent or shared upstream).
      output_example: Optional output channel of 'RandomTypeNameForOutput'
        (the type can be any string, as long as it is consistent in the
        channel, spec and artifacts); will be created for you if not
        specified.
      input_config: An optional example_gen_pb2.Input instance, providing
        input configuration. If unset, the files under input_base (must set)
        will be treated as a single split.
      output_config: An optional example_gen_pb2.Output instance, providing
        output configuration. If unset, default splits will be 'train' and
        'eval' with size 2:1.
      name: Optional unique name. Necessary if multiple instances of this
        component are declared in the same pipeline.
    """
    # Configure inputs and outputs (don't change).
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)
    if not output_example:
        output_example = channel.Channel(
            type_name='RandomTypeNameForOutput',
            artifacts=[types.TfxArtifact('RandomTypeNameForOutput')])

    spec = CustomHeadComponentSpec(
        input_example=input_example,
        integer_execution_parameter=integer_execution_parameter,
        string_execution_parameter=string_execution_parameter,
        input_config=input_config,
        output_config=output_config,
        output_example=output_example)
    super(CustomHeadComponent, self).__init__(spec=spec, name=name)
def __init__(self,
             query: Optional[Text] = None,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None,
             name: Optional[Text] = None,
             outputs: Optional[base_component.ComponentOutputs] = None):
    """Constructs a BigQueryExampleGen component.

    Args:
      query: Optional query string; used to build the default input config
        when input_config is not given.
      input_config: Optional example_gen_pb2.Input instance. Mutually
        exclusive with query.
      output_config: Optional example_gen_pb2.Output instance. Falls back to
        utils.make_default_output_config(input_config) when not provided.
      name: Optional unique name, forwarded to the parent as unique_name.
      outputs: Optional component outputs forwarded to the parent.

    Raises:
      RuntimeError: If query and input_config are both set or both unset.
    """
    # The check fires for "neither" as well as "both", so the message must
    # say "exactly one" rather than "only one".
    if bool(query) == bool(input_config):
        raise RuntimeError(
            'Exactly one of query and input_config should be set.')

    input_config = input_config or utils.make_default_input_config(query)
    output_config = output_config or utils.make_default_output_config(
        input_config)
    super(BigQueryExampleGen, self).__init__(
        executor=executor.Executor,
        input_base=None,
        input_config=input_config,
        output_config=output_config,
        component_name='BigQueryExampleGen',
        unique_name=name,
        outputs=outputs)
def __init__(
    self,
    input_base: channel.Channel,
    input_config: Optional[example_gen_pb2.Input] = None,
    output_config: Optional[example_gen_pb2.Output] = None,
    example_artifacts: Optional[channel.Channel] = None,
    executor_class: Optional[Type[base_executor.BaseExecutor]] = None,
    name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: A Channel of 'ExternalPath' type, which includes one
        artifact whose uri is an external directory with data files inside.
      input_config: An optional example_gen_pb2.Input instance, providing
        input configuration. If unset, the files under input_base (must set)
        will be treated as a single split.
      output_config: An optional example_gen_pb2.Output instance, providing
        output configuration. If unset, default splits will be 'train' and
        'eval' with size 2:1.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      executor_class: Optional custom executor class overriding the default
        executor specified in the component attribute.
      name: Unique name for every component class instance.
    """
    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # One 'ExamplesPath' artifact per output split.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        example_artifacts = channel.as_channel([
            types.TfxArtifact('ExamplesPath', split=split_name)
            for split_name in split_names
        ])

    spec = FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec, custom_executor_class=executor_class, name=name)
def __init__(self,
             conn_config: presto_config_pb2.PrestoConnConfig,
             query: Optional[str] = None,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None):
    """Constructs a PrestoExampleGen component.

    Args:
      conn_config: Parameters for Presto connection client.
      query: Presto sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        Presto sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.

    Raises:
      RuntimeError: If query and input_config are both set or both unset, or
        if the host field of conn_config is not set.
    """
    has_query = bool(query)
    has_input_config = bool(input_config)
    if has_query == has_input_config:
        raise RuntimeError('Exactly one of query and input_config should be set.')
    if not conn_config.host:
        raise RuntimeError(
            'Required host field in connection config should be set.')

    if not input_config:
        input_config = utils.make_default_input_config(query)

    # The connection parameters travel to the executor inside CustomConfig.
    packed_custom_config = example_gen_pb2.CustomConfig()
    packed_custom_config.custom_config.Pack(conn_config)

    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    super().__init__(
        input_config=input_config,
        output_config=output_config,
        custom_config=packed_custom_config)
def __init__(self,
             input_base: channel.Channel,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None,
             component_name: Optional[Text] = 'ExampleGen',
             example_artifacts: Optional[channel.Channel] = None,
             name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: A Channel of 'ExternalPath' type, which includes one
        artifact whose uri is an external directory with data files inside.
      input_config: An optional example_gen_pb2.Input instance, providing
        input configuration. If unset, the files under input_base (must set)
        will be treated as a single split.
      output_config: An optional example_gen_pb2.Output instance, providing
        output configuration. If unset, default splits will be 'train' and
        'eval' with size 2:1.
      component_name: Name of the component, should be unique per component
        class. Default to 'ExampleGen', can be overwritten by sub-classes.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      name: Unique name for every component class instance.
    """
    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # One 'ExamplesPath' artifact per output split.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        example_artifacts = channel.as_channel([
            types.TfxArtifact('ExamplesPath', split=split_name)
            for split_name in split_names
        ])

    spec = FileBasedExampleGenSpec(
        component_name=component_name,
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        examples=example_artifacts)
    super(_FileBasedExampleGen, self).__init__(spec=spec, name=name)
def __init__(self,
             input_config: example_gen_pb2.Input,
             output_config: Optional[example_gen_pb2.Output] = None,
             custom_config: Optional[example_gen_pb2.CustomConfig] = None,
             example_artifacts: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of 2:1.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
    # Configure outputs.
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # One Examples artifact per output split.
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        example_artifacts = channel_utils.as_channel([
            standard_artifacts.Examples(split=split_name)
            for split_name in split_names
        ])

    spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(_QueryBasedExampleGen, self).__init__(
        spec=spec, instance_name=instance_name)
def __init__(self,
             query: Optional[Text] = None,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None,
             example_artifacts: Optional[channel.Channel] = None,
             name: Optional[Text] = None):
    """Constructs a BigQueryExampleGen component.

    Args:
      query: BigQuery sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        BigQuery sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      name: Optional unique name. Necessary if multiple BigQueryExampleGen
        components are declared in the same pipeline.

    Raises:
      RuntimeError: If query and input_config are both set or both unset.
    """
    has_query = bool(query)
    has_input_config = bool(input_config)
    if has_query == has_input_config:
        raise RuntimeError(
            'Exactly one of query and input_config should be set.')

    if not input_config:
        input_config = utils.make_default_input_config(query)
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    super(BigQueryExampleGen, self).__init__(
        input_config=input_config,
        output_config=output_config,
        component_name='BigQueryExampleGen',
        example_artifacts=example_artifacts,
        name=name)
def __init__(
        self,
        # TODO(b/159467778): deprecate this, use input_base instead.
        input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
        input_base: Optional[Text] = None,
        input_config: Optional[Union[example_gen_pb2.Input,
                                     Dict[Text, Any]]] = None,
        output_config: Optional[Union[example_gen_pb2.Output,
                                      Dict[Text, Any]]] = None,
        custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                      Dict[Text, Any]]] = None,
        output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
        example_artifacts: Optional[types.Channel] = None,
        custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
        instance_name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input: A Channel of type `standard_artifacts.ExternalArtifact`, which
        includes one artifact whose uri is an external directory containing
        the data files. (Deprecated by input_base.) When given, a deprecation
        warning is logged and the artifact's single uri replaces any
        `input_base` value.
      input_base: An external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, input files will
        be treated as a single split.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      output_data_format: Payload format of generated data in output
        artifact, one of example_gen_pb2.PayloadFormat enum.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples. If unset, a new `Examples` channel is created.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
    # Legacy path: translate the deprecated channel argument into the
    # directory uri that `input_base` now carries.
    if input:
        logging.warning(
            'The "input" argument to the ExampleGen component has been '
            'deprecated by "input_base". Please update your usage as support for '
            'this argument will be removed soon.')
        legacy_artifacts = list(input.get())
        input_base = artifact_utils.get_single_uri(legacy_artifacts)

    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)
    example_artifacts = example_artifacts or types.Channel(
        type=standard_artifacts.Examples)

    spec = FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        output_data_format=output_data_format,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec,
        custom_executor_spec=custom_executor_spec,
        instance_name=instance_name)
def __init__(self,
             query: Optional[Text] = None,
             beam_transform: beam.PTransform = None,
             bucket_name: Optional[Text] = None,
             output_schema: Optional[Text] = None,
             table_name: Optional[Text] = None,
             use_bigquery_source: Optional[Any] = False,
             input_config: Optional[example_gen_pb2.Input] = None,
             output_config: Optional[example_gen_pb2.Output] = None,
             example_artifacts: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
    """Constructs a TCGAPreprocessing component.

    Args:
      query: BigQuery sql string. Forwarded to the component spec as `query`.
      beam_transform: beam.PTransform pipeline. Will be used to process data
        ingested by the BigQuery query. It is uploaded with
        `upload_beam_to_gcs` and the resulting uri is passed to the spec as
        `input_base`.
      bucket_name: String containing a GCS bucket name. Will be used as a
        temporary storage space to read query and pickle file.
      output_schema: Forwarded unchanged to the component spec.
        NOTE(review): not described in the original documentation; confirm
        the expected format against the spec/executor.
      table_name: String containing the BigQuery output table name.
      use_bigquery_source: Whether to use BigQuerySource instead of
        experimental `ReadFromBigQuery` PTransform (required by the
        BigQueryExampleGen executor).
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        BigQuery sql string. If any field is provided as a RuntimeParameter,
        input_config should be constructed as a dict with the same field
        names as Input proto message. If unset,
        `utils.make_default_input_config()` is used (called without
        `query`).
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        output_config should be constructed as a dict with the same field
        names as Output proto message.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples. If unset, a channel with a single `Examples`
        artifact is created.
      instance_name: Optional unique instance name. Necessary if multiple
        TCGAPreprocessing components are declared in the same pipeline.
    """
    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)
    example_artifacts = example_artifacts or channel_utils.as_channel(
        [standard_artifacts.Examples()])

    # Upload Beam Transform to a GCS Bucket; the spec receives its uri.
    beam_transform_uri = upload_beam_to_gcs(beam_transform, bucket_name)

    spec_kwargs = dict(
        # custom parameters
        query=query,
        output_schema=output_schema,
        table_name=table_name,
        use_bigquery_source=use_bigquery_source,
        # default parameters
        input_config=input_config,
        output_config=output_config,
        input_base=beam_transform_uri,
        # outputs
        examples=example_artifacts)
    spec = TCGAPreprocessingSpec(**spec_kwargs)

    super(TCGAPreprocessing, self).__init__(
        spec=spec, instance_name=instance_name)
def __init__(
        self,
        input: types.Channel = None,  # pylint: disable=redefined-builtin
        input_config: Optional[Union[example_gen_pb2.Input,
                                     Dict[Text, Any]]] = None,
        output_config: Optional[Union[example_gen_pb2.Output,
                                      Dict[Text, Any]]] = None,
        custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                      Dict[Text, Any]]] = None,
        example_artifacts: Optional[types.Channel] = None,
        custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
        input_base: Optional[types.Channel] = None,
        instance_name: Optional[Text] = None,
        enable_cache: Optional[bool] = None):
    """Construct a FileBasedExampleGen component.

    Either `input` or `input_base` is expected to be supplied; this
    constructor does not check that itself and forwards whatever it has to
    the component spec.

    Args:
      input: A Channel of type `standard_artifacts.ExternalArtifact`, which
        includes one artifact whose uri is an external directory containing
        the data files. _required_
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, the files under
        input_base will be treated as a single dataset.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples. If unset, a channel with one `Examples` artifact is
        created, with its `split_names` set from the generated output split
        names.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
      input_base: Backwards compatibility alias for the 'input' argument.
        When given, a deprecation warning is logged and it replaces `input`.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
      enable_cache: Optional boolean to indicate if cache is enabled for the
        FileBasedExampleGen component. If not specified, defaults to the
        value specified for pipeline's enable_cache parameter.
    """
    # Deprecated alias: warn, then let it take the place of `input`.
    if input_base:
        absl.logging.warning(
            'The "input_base" argument to the ExampleGen component has '
            'been renamed to "input" and is deprecated. Please update your '
            'usage as support for this argument will be removed soon.')
        input = input_base

    # Configure inputs and outputs.
    if not input_config:
        input_config = utils.make_default_input_config()
    if not output_config:
        output_config = utils.make_default_output_config(input_config)

    if not example_artifacts:
        # Record the output split names on a single Examples artifact.
        examples_artifact = standard_artifacts.Examples()
        split_names = utils.generate_output_split_names(
            input_config, output_config)
        examples_artifact.split_names = artifact_utils.encode_split_names(
            split_names)
        example_artifacts = channel_utils.as_channel([examples_artifact])

    spec = FileBasedExampleGenSpec(
        input=input,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec,
        custom_executor_spec=custom_executor_spec,
        instance_name=instance_name,
        enable_cache=enable_cache)