def testReadTransformFn(self):
  path = self.get_temp_dir()
  # NOTE: we don't need to create or write to the transform_fn directory since
  # ReadTransformFn never inspects this directory.
  transform_fn_dir = os.path.join(path, 'transform_fn')
  transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
  metadata_io.write_metadata(_TEST_METADATA, transformed_metadata_dir)

  with beam.Pipeline() as pipeline:
    saved_model_dir_pcoll, metadata = (
        pipeline | transform_fn_io.ReadTransformFn(path))
    beam_test_util.assert_that(
        saved_model_dir_pcoll,
        beam_test_util.equal_to([transform_fn_dir]),
        label='AssertSavedModelDir')
    # NOTE: metadata is currently read in a non-deferred manner.
    self.assertEqual(metadata, _TEST_METADATA)
def testTransformFnExportAndImportRoundtrip(self):
  transform_fn_dir = os.path.join(self.get_temp_dir(), 'export_transform_fn')
  metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

  with beam.Pipeline() as p:

    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    metadata = self.toMetadata({'x': tf.FixedLenFeature((), tf.float32, 0)})
    columns = p | 'CreateTrainingData' >> beam.Create(
        [{'x': v} for v in [4, 1, 5, 2]])
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      _, transform_fn = (
          (columns, metadata)
          | 'Analyze and Transform' >>
          beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      _ = transform_fn | transform_fn_io.WriteTransformFn(transform_fn_dir)
      _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir, pipeline=p)

  with beam.Pipeline() as p:
    transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
    metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
    # Run transform_columns on some eval dataset.
    eval_data = p | 'CreateEvalData' >> beam.Create(
        [{'x': v} for v in [6, 3]])
    transformed_eval_data, _ = (
        ((eval_data, metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())
    expected_transformed_eval_data = [{'x_scaled': v} for v in [1.25, 0.5]]
    beam_test_util.assert_that(
        transformed_eval_data,
        beam_test_util.equal_to(expected_transformed_eval_data))
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with beam_impl.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            | 'CleanData' >> beam.Map(
                taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec))

      if transform_dir is None:
        transform_fn = (
            (raw_data, raw_data_metadata)
            | ('Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               transform_fn_io.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | transform_fn_io.ReadTransformFn(
            transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              file_name_suffix='.gz'))
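A minimal usage sketch of the function above, with hypothetical paths, table names, and runner flags: the training run computes and writes the transform_fn, and the eval run points transform_dir at the training output so it is loaded with ReadTransformFn instead of being recomputed.

# Hypothetical invocation; every path and table name below is a placeholder.
transform_data(
    input_handle='my_dataset.taxi_train',   # BigQuery table (placeholder)
    outfile_prefix='train_transformed',
    working_dir='/tmp/taxi/train',
    schema_file='/tmp/taxi/schema.pbtxt',
    pipeline_args=['--runner=DirectRunner'])

transform_data(
    input_handle='my_dataset.taxi_eval',    # BigQuery table (placeholder)
    outfile_prefix='eval_transformed',
    working_dir='/tmp/taxi/eval',
    schema_file='/tmp/taxi/schema.pbtxt',
    transform_dir='/tmp/taxi/train',        # reuse the transform_fn written above
    pipeline_args=['--runner=DirectRunner'])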
def _RunBeamImpl(self, analyze_data_list: List[executor._Dataset],
                 transform_data_list: List[executor._Dataset],
                 transform_graph_uri: Text,
                 input_dataset_metadata: dataset_metadata.DatasetMetadata,
                 transform_output_path: Text, raw_examples_data_format: int,
                 temp_path: Text, compute_statistics: bool,
                 per_set_stats_output_paths: Sequence[Text],
                 materialization_format: Optional[Text],
                 analyze_paths_count: int) -> executor._Status:
  """Perform data preprocessing with TFT.

  Args:
    analyze_data_list: List of datasets for analysis.
    transform_data_list: List of datasets for transform.
    transform_graph_uri: URI of a previously computed transform graph
      (TFTransformOutput) to load and apply.
    input_dataset_metadata: A DatasetMetadata object for the input data.
    transform_output_path: An absolute path to write the output to.
    raw_examples_data_format: The data format of the raw examples. One of the
      enums from example_gen_pb2.PayloadFormat.
    temp_path: A path to a temporary dir.
    compute_statistics: A bool indicating whether or not to compute statistics.
    per_set_stats_output_paths: Paths to per-set statistics output. If empty,
      per-set statistics is not produced.
    materialization_format: A string describing the format of the materialized
      data, or None if materialization is not enabled.
    analyze_paths_count: An integer, the number of paths that should be used
      for analysis.

  Returns:
    Status of the execution.
  """
  self._AssertSameTFXIOSchema(analyze_data_list)
  unprojected_typespecs = (
      analyze_data_list[0].tfxio.TensorAdapter().OriginalTypeSpecs())

  tf_transform_output = tft.TFTransformOutput(transform_graph_uri)

  analyze_input_columns = tft.get_analyze_input_columns(
      tf_transform_output.transform_raw_features, unprojected_typespecs)
  transform_input_columns = tft.get_transform_input_columns(
      tf_transform_output.transform_raw_features, unprojected_typespecs)
  # Use the same dataset (same columns) for AnalyzeDataset and computing
  # pre-transform stats so that the data will only be read once for these
  # two operations.
  if compute_statistics:
    analyze_input_columns = list(
        set(list(analyze_input_columns) + list(transform_input_columns)))

  for d in analyze_data_list:
    d.tfxio = d.tfxio.Project(analyze_input_columns)
  self._AssertSameTFXIOSchema(analyze_data_list)
  analyze_data_tensor_adapter_config = (
      analyze_data_list[0].tfxio.TensorAdapterConfig())

  for d in transform_data_list:
    d.tfxio = d.tfxio.Project(transform_input_columns)

  desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format)

  with self._CreatePipeline(transform_output_path) as pipeline:
    with tft_beam.Context(
        temp_dir=temp_path,
        desired_batch_size=desired_batch_size,
        passthrough_keys=self._GetTFXIOPassthroughKeys(),
        use_deep_copy_optimization=True,
        use_tfxio=True):
      # pylint: disable=expression-not-assigned
      # pylint: disable=no-value-for-parameter
      # _ = (
      #     pipeline
      #     | 'IncrementPipelineMetrics' >> self._IncrementPipelineMetrics(
      #         len(unprojected_typespecs), len(analyze_input_columns),
      #         len(transform_input_columns), analyze_paths_count))
      #
      # # (new_analyze_data_dict, input_cache) = (
      # #     pipeline
      # #     | 'OptimizeRun' >> self._OptimizeRun(
      # #         input_cache_dir, output_cache_dir, analyze_data_list,
      # #         unprojected_typespecs, preprocessing_fn,
      # #         self._GetCacheSource()))
      #
      # # if input_cache:
      # #   absl.logging.debug('Analyzing data with cache.')
      #
      # full_analyze_dataset_keys_list = [
      #     dataset.dataset_key for dataset in analyze_data_list
      # ]
      #
      # # Removing unneeded datasets if they won't be needed for statistics or
      # # materialization.
      # # if materialization_format is None and not compute_statistics:
      # #   if None in new_analyze_data_dict.values():
      # #     absl.logging.debug(
      # #         'Not reading the following datasets due to cache: %s', [
      # #             dataset.file_pattern
      # #             for dataset in analyze_data_list
      # #             if new_analyze_data_dict[dataset.dataset_key] is None
      # #         ])
      # #   analyze_data_list = [
      # #       d for d in new_analyze_data_dict.values() if d is not None
      # #   ]
      #
      # input_analysis_data = {}
      # for dataset in analyze_data_list:
      #   infix = 'AnalysisIndex{}'.format(dataset.index)
      #   dataset.standardized = (
      #       pipeline
      #       | 'TFXIOReadAndDecode[{}]'.format(infix) >>
      #       dataset.tfxio.BeamSource(desired_batch_size))
      #   input_analysis_data[dataset.dataset_key] = dataset.standardized
      #
      # # input_analysis_data = {}
      # # for key, dataset in new_analyze_data_dict.items():
      # #   input_analysis_data[key] = (
      # #       None if dataset is None else dataset.standardized)
      #
      # # transform_fn, cache_output = (
      # #     (input_analysis_data, input_cache,
      # #      analyze_data_tensor_adapter_config)
      # #     | 'Analyze' >> tft_beam.AnalyzeDatasetWithCache(
      # #         preprocessing_fn, pipeline=pipeline))
      # transform_fn = (
      #     (input_analysis_data, analyze_data_tensor_adapter_config)
      #     | 'Analyze' >> tft_beam.AnalyzeDataset(
      #         tf_transform_output.transform_raw_features,
      #         pipeline=pipeline))

      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # (transform_fn
      #  | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
      #      transform_output_path))

      if compute_statistics or materialization_format is not None:
        transform_fn = (
            pipeline | transform_fn_io.ReadTransformFn(transform_graph_uri))

        # Do not compute pre-transform stats if the input format is raw proto,
        # as StatsGen would treat any input as tf.Example. Note that
        # tf.SequenceExamples are wire-format compatible with tf.Examples.
        if (compute_statistics and
            not self._IsDataFormatProto(raw_examples_data_format)):
          # Aggregated feature stats before transformation.
          pre_transform_feature_stats_path = os.path.join(
              transform_output_path,
              tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)

          if self._IsDataFormatSequenceExample(raw_examples_data_format):
            schema_proto = None
          else:
            schema_proto = executor._GetSchemaProto(input_dataset_metadata)

          if self._IsDataFormatSequenceExample(raw_examples_data_format):

            def _ExtractRawExampleBatches(record_batch):
              return record_batch.column(
                  record_batch.schema.get_field_index(
                      RAW_EXAMPLE_KEY)).flatten().to_pylist()

            # Make use of the fact that tf.SequenceExample is wire-format
            # compatible with tf.Example.
            stats_input = []
            for dataset in analyze_data_list:
              infix = 'AnalysisIndex{}'.format(dataset.index)
              stats_input.append(
                  dataset.standardized
                  | 'ExtractRawExampleBatches[{}]'.format(infix) >> beam.Map(
                      _ExtractRawExampleBatches)
                  | 'DecodeSequenceExamplesAsExamplesIntoRecordBatches[{}]'
                  .format(infix) >> beam.ParDo(
                      self._ToArrowRecordBatchesFn(schema_proto)))
          else:
            stats_input = [
                dataset.standardized for dataset in analyze_data_list
            ]

          pre_transform_stats_options = (
              transform_stats_options.get_pre_transform_stats_options())
          (stats_input
           | 'FlattenAnalysisDatasets' >> beam.Flatten(pipeline=pipeline)
           | 'GenerateStats[FlattenedAnalysisDataset]' >> self._GenerateStats(
               pre_transform_feature_stats_path,
               schema_proto,
               stats_options=pre_transform_stats_options))

        # transform_data_list is a superset of analyze_data_list, we pay the
        # cost to read the same dataset (analyze_data_list) again here to
        # prevent certain beam runner from doing large temp materialization.
        for dataset in transform_data_list:
          infix = 'TransformIndex{}'.format(dataset.index)
          dataset.standardized = (
              pipeline
              | 'TFXIOReadAndDecode[{}]'.format(infix) >>
              dataset.tfxio.BeamSource(desired_batch_size))
          (dataset.transformed, metadata) = (
              ((dataset.standardized, dataset.tfxio.TensorAdapterConfig()),
               transform_fn)
              | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())

          dataset.transformed_and_serialized = (
              dataset.transformed
              | 'EncodeAndSerialize[{}]'.format(infix) >> beam.ParDo(
                  self._EncodeAsSerializedExamples(),
                  executor._GetSchemaProto(metadata)))

        if compute_statistics:
          # Aggregated feature stats after transformation.
          _, metadata = transform_fn

          # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in schema.
          # Currently input dataset schema only contains dtypes, and other
          # metadata is dropped due to roundtrip to tensors.
          transformed_schema_proto = executor._GetSchemaProto(metadata)

          for dataset in transform_data_list:
            infix = 'TransformIndex{}'.format(dataset.index)
            dataset.transformed_and_standardized = (
                dataset.transformed_and_serialized
                | 'FromTransformedToArrowRecordBatches[{}]'.format(infix) >>
                self._ToArrowRecordBatches(schema=transformed_schema_proto))

          post_transform_feature_stats_path = os.path.join(
              transform_output_path,
              tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)

          post_transform_stats_options = (
              transform_stats_options.get_post_transform_stats_options())
          ([
              dataset.transformed_and_standardized
              for dataset in transform_data_list
          ]
           | 'FlattenTransformedDatasets' >> beam.Flatten()
           | 'GenerateStats[FlattenedTransformedDatasets]' >>
           self._GenerateStats(
               post_transform_feature_stats_path,
               transformed_schema_proto,
               stats_options=post_transform_stats_options))

          if per_set_stats_output_paths:
            # TODO(b/130885503): Remove duplicate stats gen compute that is
            # done both on a flattened view of the data, and on each span
            # below.
            for dataset in transform_data_list:
              infix = 'TransformIndex{}'.format(dataset.index)
              (dataset.transformed_and_standardized
               | 'GenerateStats[{}]'.format(infix) >> self._GenerateStats(
                   dataset.stats_output_path,
                   transformed_schema_proto,
                   stats_options=post_transform_stats_options))

        if materialization_format is not None:
          for dataset in transform_data_list:
            infix = 'TransformIndex{}'.format(dataset.index)
            (dataset.transformed_and_serialized
             | 'Materialize[{}]'.format(infix) >> self._WriteExamples(
                 materialization_format, dataset.materialize_output_path))

  return executor._Status.OK()
def write_to_tfrecord(args):
  """This function is supposed to be called as a script."""
  # Decode arguments
  current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
      exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, \
      working_dir, data_formatter_module_path = args

  # num_shards = "32"
  current_index, num_shards = int(current_index), int(num_shards)

  split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
      train_split_fname_out, current_index, num_shards) + '*'
  split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
      eval_split_fname_out, current_index, num_shards)

  log.info('exp_log_data_file_train_tfrecord {}'.format(
      exp_log_data_file_train_tfrecord))
  log.info('exp_log_data_file_eval_tfrecord {}'.format(
      exp_log_data_file_eval_tfrecord))
  log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
  log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

  data_formatter = import_from_uri(data_formatter_module_path).DataFormatter()

  # Set up the preprocessing pipeline.
  pipeline = beam.Pipeline(runner=DirectRunner())
  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # Read the raw data files (CSV, ordered according to the `data_formatter`)
    # and convert them into a cleaned-up format.
    raw_train_data = (
        pipeline
        | 'ReadTrainDataFile' >> textio.ReadFromText(
            split_train_file_pattern, skip_header_lines=0)
        | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                data_formatter.get_features_and_targets(),
                data_formatter.get_features_metadata().schema).decode))

    raw_eval_data = (
        pipeline
        | 'ReadEvalDataFile' >> textio.ReadFromText(
            split_eval_file_pattern, skip_header_lines=0)
        | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                data_formatter.get_features_and_targets(),
                data_formatter.get_features_metadata().schema).decode))

    # Examples in tf-example format (for model analysis purposes).
    # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
    # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
    #
    # _ = (
    #     raw_eval_data
    #     | 'ToSerializedTFExample' >> beam.Map(coder.encode)
    #     | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
    #         '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
    #         shard_name_template='', num_shards=1)
    # )

    # Read the transform graph (SavedModel) and metadata from the two
    # subdirectories of working_dir given by `transform_fn_io.TRANSFORM_FN_DIR`
    # and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
    transform_fn = (
        pipeline
        | 'ReadTransformGraph' >> transform_fn_io.ReadTransformFn(working_dir))

    # Apply the transformation `transform_fn` to the raw train dataset.
    (transformed_train_data, transformed_metadata) = (
        ((raw_train_data, data_formatter.get_features_metadata()),
         transform_fn)
        | 'TransformTrainData' >> beam_impl.TransformDataset())

    # Apply the transformation `transform_fn` to the raw eval dataset.
    (transformed_eval_data, transformed_metadata) = (
        ((raw_eval_data, data_formatter.get_features_metadata()),
         transform_fn)
        | 'TransformEvalData' >> beam_impl.TransformDataset())

    # The data schema of the transformed data gets used to build a signature to
    # create a TFRecord (tf binary data format). This signature is a wrapper
    # function used to encode transformed data.
    transformed_data_coder = tft.coders.ExampleProtoCoder(
        transformed_metadata.schema)

    _ = (
        transformed_train_data
        | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
            transformed_data_coder.encode)
        | 'WriteTrainDataTFRecord' >> tfrecordio.WriteToTFRecord(
            '{}-{:05}-of-{:05}'.format(
                exp_log_data_file_train_tfrecord, current_index, num_shards),
            shard_name_template='',
            num_shards=1))

    _ = (
        transformed_eval_data
        | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
            transformed_data_coder.encode)
        | 'WriteEvalDataTFRecord' >> tfrecordio.WriteToTFRecord(
            '{}-{:05}-of-{:05}'.format(
                exp_log_data_file_eval_tfrecord, current_index, num_shards),
            shard_name_template='',
            num_shards=1))

  result = pipeline.run()
  result.wait_until_finish()
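A sketch of how the script-style function above might be invoked; the tuple layout mirrors the argument unpacking at the top of write_to_tfrecord, and every path and module name is a placeholder.

# Hypothetical arguments; the order matches the unpacking in write_to_tfrecord.
write_to_tfrecord((
    '0',                          # current_index (shard to process)
    '32',                         # num_shards
    '/tmp/splits/train',          # train_split_fname_out
    '/tmp/splits/eval',           # eval_split_fname_out
    '/tmp/tfrecords/train',       # exp_log_data_file_train_tfrecord
    '/tmp/tfrecords/eval',        # exp_log_data_file_eval_tfrecord
    '/tmp/working_dir',           # working_dir holding the saved transform_fn
    'my_package.data_formatter',  # data_formatter_module_path (placeholder)
))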
def tftransform(
    pipeline_args,          # type: List[str]
    temp_location,          # type: str
    schema_file,            # type: str
    output_dir,             # type: str
    preprocessing_fn,       # type: Any
    training_data=None,     # type: Union[None, str]
    evaluation_data=None,   # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None   # type: str
):
  # type: (...) -> PipelineState
  """
  Generic tf.transform pipeline that takes tf.{example, record} training and
  evaluation datasets and outputs transformed data together with the transform
  function SavedModel.

  :param pipeline_args: un-parsed Dataflow arguments
  :param temp_location: temporary location for the Dataflow job working dir
  :param schema_file: path to the raw feature schema text file
  :param output_dir: output dir for transformed data and function
  :param preprocessing_fn: tf.transform preprocessing function
  :param training_data: path to the training data
  :param evaluation_data: path to the evaluation data
  :param transform_fn_dir: dir of a previously saved transformation function
                           to apply
  :param compression_type: compression type for writing of tf.records
  :return: final state of the Beam pipeline
  """
  assert_not_empty_string(temp_location)
  assert_not_empty_string(schema_file)
  assert_not_empty_string(output_dir)
  assert_not_none(preprocessing_fn)

  if compression_type is None:
    compression_type = CompressionTypes.AUTO

  raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
  raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

  transformed_train_output_dir = os.path.join(output_dir, "training")
  transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

  if not any(i.startswith("--job_name") for i in pipeline_args):
    pipeline_args.append("--job_name=tf-transform-{}-{}".format(
        getpass.getuser(), int(time.time())))

  pipeline = beam.Pipeline(argv=pipeline_args)
  with beam_impl.Context(temp_dir=temp_location):
    if training_data is not None:
      # if training data is provided, transform_fn_dir will be ignored
      if transform_fn_dir is not None:
        warnings.warn(
            "transform_fn_dir is ignored because training_data is provided")

      transform_fn_output = os.path.join(output_dir, "transform_fn",
                                         "saved_model.pb")
      if FileSystems.exists(transform_fn_output):
        raise ValueError("Transform fn already exists at %s!" %
                         transform_fn_output)

      # compute the transform_fn and apply it to the training data
      raw_train_data = (
          pipeline
          | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
              training_data, coder=raw_data_coder))

      ((transformed_train_data, transformed_train_metadata),
       transform_fn) = (
           (raw_train_data, raw_data_metadata)
           | ("AnalyzeAndTransformTrainData" >>
              beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)))

      _ = (  # noqa: F841
          transform_fn
          | "WriteTransformFn" >>
          transform_fn_io.WriteTransformFn(output_dir))

      transformed_train_coder = ExampleProtoCoder(
          transformed_train_metadata.schema)
      _ = (  # noqa: F841
          transformed_train_data
          | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
              os.path.join(transformed_train_output_dir, "part"),
              coder=transformed_train_coder,
              compression_type=compression_type,
              file_name_suffix=".tfrecords"))
    else:
      if transform_fn_dir is None:
        raise ValueError(
            "Either training_data or transform_fn_dir needs to be provided")
      # load the transform_fn
      transform_fn = pipeline | transform_fn_io.ReadTransformFn(
          transform_fn_dir)

    if evaluation_data is not None:
      # if evaluation_data exists, apply the transform_fn to the evaluation
      # data
      raw_eval_data = (
          pipeline
          | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
              evaluation_data, coder=raw_data_coder))

      (transformed_eval_data, transformed_eval_metadata) = (
          ((raw_eval_data, raw_data_metadata), transform_fn)
          | "TransformEvalData" >> beam_impl.TransformDataset())

      transformed_eval_coder = ExampleProtoCoder(
          transformed_eval_metadata.schema)
      _ = (  # noqa: F841
          transformed_eval_data
          | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
              os.path.join(transformed_eval_output_dir, "part"),
              coder=transformed_eval_coder,
              compression_type=compression_type,
              file_name_suffix=".tfrecords"))

  result = pipeline.run().wait_until_finish()

  return result
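A minimal usage sketch, assuming placeholder paths and a trivial stand-in preprocessing_fn: when training_data is omitted, the pipeline loads a previously exported transform function from transform_fn_dir via ReadTransformFn and applies it to the evaluation data.

# Placeholder preprocessing_fn; tftransform asserts that one is provided even
# when only a previously saved transform function is applied.
def _passthrough_preprocessing_fn(inputs):
  return dict(inputs)


# Hypothetical paths; transform_fn_dir points at the output_dir of an earlier
# tftransform run that wrote the transform_fn there.
tftransform(
    pipeline_args=["--runner=DirectRunner"],
    temp_location="/tmp/tft-temp",
    schema_file="/path/to/schema.txt",
    output_dir="/path/to/output/eval",
    preprocessing_fn=_passthrough_preprocessing_fn,
    evaluation_data="/path/to/eval-examples.tfrecords",
    transform_fn_dir="/path/to/output/train")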
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   setup_file,
                   ts1,
                   ts2,
                   project=None,
                   max_rows=None,
                   mode=None,
                   stage=None,
                   preprocessing_fn=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    setup_file: Setup file passed to the Dataflow job in 'cloud' mode.
    ts1: Value forwarded to make_sql when querying BigQuery.
    ts2: Value forwarded to make_sql when querying BigQuery.
    project: GCP project used in the Beam pipeline options.
    max_rows: Number of rows to query from BigQuery
    mode: 'local' to run with DirectRunner or 'cloud' to run with
      DataflowRunner.
    stage: Stage name (defaults to 'train'); used in the job name, the BigQuery
      query, and the CSV output filename.
    preprocessing_fn: Optional preprocessing_fn; if None, the default
      def_preprocessing_fn below is used.
  """

  def def_preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  preprocessing_fn = preprocessing_fn or def_preprocessing_fn

  print('ts1 %s, ts2 %s' % (ts1, ts2))

  schema = taxi.read_schema('./schema.pbtxt')
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  transform_dir = None
  temp_dir = os.path.join(working_dir, 'tmp')

  if stage is None:
    stage = 'train'

  if mode == 'local':
    options = {'project': project}
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'tft-' + stage + '-' + str(uuid.uuid4()),
        'temp_location': temp_dir,
        'project': project,
        'save_main_session': True,
        'setup_file': setup_file
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataflowRunner'
  else:
    raise ValueError("Invalid mode %s." % mode)

  with beam.Pipeline(runner, options=pipeline_options) as pipeline:
    with beam_impl.Context(temp_dir=temp_dir):
      csv_coder = taxi.make_csv_coder(schema)
      # temp tft bug workaround
      mcsv_coder = make_mcsv_coder(schema)
      if 'csv' in input_handle.lower():
        # if input_handle.lower().endswith('csv'):
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = make_sql(
            input_handle, ts1, ts2, stage, max_rows=max_rows, for_eval=False)
        raw_data1 = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        raw_data = (
            raw_data1
            | 'CleanData' >> beam.Map(
                lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

      if transform_dir is None:
        transform_fn = (
            (raw_data, raw_data_metadata)
            | ('Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               transform_fn_io.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | transform_fn_io.ReadTransformFn(
            transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      if 'csv' not in input_handle.lower():  # if querying BQ
        _ = (
            raw_data
            | beam.Map(mcsv_coder.encode)
            | beam.io.WriteToText(
                os.path.join(working_dir, '{}.csv'.format(stage)),
                num_shards=1))

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              file_name_suffix='.gz'))