def encode():
  """Creates a Beam pipeline that generates data, transforms it and encodes it
  in ELWC."""
  output_path = "./output"
  options = PipelineOptions()
  options.view_as(StandardOptions).runner = "DirectRunner"
  with beam.Pipeline(options=options) as pipeline:
    with tft_beam.Context(temp_dir="./tmp"):
      raw_data = generate_data(100)
      input_data = (pipeline | beam.Create(raw_data))
      transformed_data, transform_fn = (
          (input_data, raw_metadata)
          | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      elwc_coder = ELWCProtoCoder(context_specs, examples_specs)
      data, metadata = transformed_data
      _ = (data
           | beam.Map(elwc_coder.encode)
           | beam.io.WriteToTFRecord(
               file_path_prefix="{}/data".format(output_path),
               file_name_suffix=".tfrecords"))
      _ = (transform_fn | tft_beam.WriteTransformFn(output_path))
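# The snippet above assumes module-level `raw_metadata`, `context_specs`, and
# `examples_specs`. A minimal sketch of what they could look like for a
# ranking-style dataset (feature names and shapes are assumptions, not part of
# the original):
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

context_specs = {
    # Hypothetical per-query context feature.
    'query_tokens': tf.io.VarLenFeature(tf.string),
}
examples_specs = {
    # Hypothetical per-document features.
    'document_tokens': tf.io.VarLenFeature(tf.string),
    'relevance': tf.io.FixedLenFeature([], tf.int64),
}
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({**context_specs, **examples_specs}))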
def testPreprocessingFn(self):
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  feature_spec = taxi_utils._get_raw_feature_spec(schema)
  working_dir = self.get_temp_dir()
  transform_graph_path = os.path.join(working_dir, 'transform_graph')
  transformed_examples_path = os.path.join(working_dir,
                                           'transformed_examples')
  # Run very simplified version of executor logic.
  # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
  # Generate legacy `DatasetMetadata` object. Future version of Transform
  # will accept the `Schema` proto directly.
  legacy_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(feature_spec))
  tfxio = tf_example_record.TFExampleRecord(
      file_pattern=os.path.join(self._testdata_path,
                                'csv_example_gen/Split-train/*'),
      telemetry_descriptors=['Tests'],
      schema=legacy_metadata.schema)
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
      examples = p | 'ReadTrainData' >> tfxio.BeamSource()
      (transformed_examples, transformed_metadata), transform_fn = (
          (examples, tfxio.TensorAdapterConfig())
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              taxi_utils.preprocessing_fn))
      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # pylint: disable=expression-not-assigned
      (transform_fn
       | 'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path))
      encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
      (transformed_examples
       | 'EncodeTrainData' >> beam.Map(encoder.encode)
       | 'WriteTrainData' >> beam.io.WriteToTFRecord(
           os.path.join(transformed_examples_path,
                        'Split-train/transformed_examples.gz'),
           coder=beam.coders.BytesCoder()))
      # pylint: enable=expression-not-assigned
  # Verify the output matches golden output.
  # NOTE: we don't verify that transformed examples match golden output.
  expected_transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(
          self._testdata_path,
          'transform/transform_graph/transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(transform_graph_path,
                   'transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  # Clear annotations so we only have to test main schema.
  transformed_schema.ClearField('annotation')
  for feature in transformed_schema.feature:
    feature.ClearField('annotation')
  self.assertEqual(transformed_schema, expected_transformed_schema)
def run():
  pipeline_options = PipelineOptions(['--runner=DirectRunner'])

  def preprocessing_fn(inputs):
    word = inputs['word']
    count = inputs['count']
    return {
        'word': word,
        'count': count,
        'count_normalized': tft.scale_to_0_1(count)
    }

  with beam.Pipeline(options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      counts_data = (
          pipeline
          | "Load" >> ReadFromText(INPUT_FILE)
          | "CountWords" >> CountWordsTransform())
      (transformed_data, transformed_metadata), _ = (
          (counts_data, COUNTS_METADATA)
          | "AnalyzeAndTransform" >>
          tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      output_column_names = ['word', 'count', 'count_normalized']
      transformed_data_coder = tft.coders.CsvCoder(
          output_column_names, transformed_metadata.schema)
      _ = (transformed_data
           | "EncodeToCsv" >> beam.Map(transformed_data_coder.encode)
           | "Save" >> WriteToText(OUTPUT_FILE))
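# `CountWordsTransform` and `COUNTS_METADATA` are assumed by run() above but
# not shown. A minimal sketch (hypothetical implementation): a composite
# PTransform that tokenizes lines, counts words, and emits instance dicts
# matching the metadata that AnalyzeAndTransformDataset expects.
import re
import apache_beam as beam
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

class CountWordsTransform(beam.PTransform):
  def expand(self, lines):
    return (lines
            | beam.FlatMap(lambda line: re.findall(r'[a-z]+', line.lower()))
            | beam.combiners.Count.PerElement()
            # Cast to float so the downstream tft.scale_to_0_1 works on a
            # floating-point feature.
            | beam.Map(lambda kv: {'word': kv[0], 'count': float(kv[1])}))

COUNTS_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'word': tf.io.FixedLenFeature([], tf.string),
        'count': tf.io.FixedLenFeature([], tf.float32),
    }))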
def _transform_and_write_tfr(
    dataset: pvalue.PCollection,
    tfr_writer: Callable[[], beam.io.tfrecordio.WriteToTFRecord],
    raw_metadata: types.BeamDatasetMetadata,
    preprocessing_fn: Optional[Callable] = None,
    transform_fn: Optional[types.TransformFn] = None,
    label: str = 'data'):
  """Applies TF Transform to dataset and outputs it as TFRecords."""
  dataset_metadata = (dataset, raw_metadata)
  if transform_fn:
    transformed_dataset, transformed_metadata = (
        (dataset_metadata, transform_fn)
        | f'Transform{label}' >> tft_beam.TransformDataset())
  else:
    if not preprocessing_fn:
      preprocessing_fn = lambda x: x
    (transformed_dataset, transformed_metadata), transform_fn = (
        dataset_metadata
        | f'AnalyzeAndTransform{label}' >>
        tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
  transformed_data_coder = tft.coders.ExampleProtoCoder(
      transformed_metadata.schema)
  _ = (transformed_dataset
       | f'Encode{label}' >> beam.Map(transformed_data_coder.encode)
       | f'Write{label}' >> tfr_writer(prefix=label.lower()))
  return transform_fn
def run_metrics():
  """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""
  metrics_pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        metrics_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))
    # Apply transform to wordpiece-tokenize input.
    (metrics_transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))
    # Initialize CSV coder. Aggregate values for each lang, calculate metrics,
    # and write the output to a CSV file.
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)
    _ = (
        metrics_transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            metrics_file, shard_name_template='', header=','.join(columns)))
  return metrics_pipeline
def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
  """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """
  lang_set = set(FLAGS.lang_set.split(','))

  # Create pipeline.
  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(converter.decode))
    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))
    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))
    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))
    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(
            utils.FilterByCount(FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'Flatten' >> beam.FlatMap(lambda x: x)
        | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
        | 'WriteSortedCount' >> beam.io.WriteToText(
            output_path, shard_name_template=''))
  return pipeline
def run(pipeline_options, known_args):
  global force_tf_compat_v1
  argv = None  # if None, uses sys.argv
  pipeline_options = PipelineOptions(argv)
  pipeline = beam.Pipeline(options=pipeline_options)
  if "universal-sentence-encoder" in MODEL_URL and int(
      MODEL_URL.split("/")[-1]) <= 2:
    # https://github.com/tensorflow/transform/issues/160
    force_tf_compat_v1 = True
  with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                        force_tf_compat_v1=force_tf_compat_v1):
    print("Context force_tf_compat_v1: {}".format(
        tft_beam.Context.get_use_tf_compat_v1()))
    articles = (
        pipeline
        | beam.Create([
            {"id": "01", "text": "To be, or not to be: that is the question: "},
            {"id": "02", "text": "Whether 'tis nobler in the mind to suffer "},
            {"id": "03", "text": "The slings and arrows of outrageous fortune, "},
            {"id": "04", "text": "Or to take arms against a sea of troubles, "},
        ]))
    articles_dataset = (articles, get_metadata())
    transformed_dataset, transform_fn = (
        articles_dataset
        | "Extract embeddings" >>
        tft_beam.AnalyzeAndTransformDataset(preprocess_fn))
    transformed_data, transformed_metadata = transformed_dataset
    _ = (
        transformed_data
        | "Print embeddings" >> beam.Map(print_pass)
        | "Write embeddings to TFRecords" >>
        beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix="{0}".format(known_args.output_dir),
            file_name_suffix=".tfrecords",
            coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema),
            num_shards=1))
  job = pipeline.run()
  if pipeline_options.get_all_options()["runner"] == "DirectRunner":
    job.wait_until_finish()
def main():
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (_RAW_DATA, _RAW_DATA_METADATA)
        | tft_beam.AnalyzeAndTransformDataset(_preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable
    pprint.pprint(transformed_data)
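# The module-level `_RAW_DATA`, `_RAW_DATA_METADATA`, and `_preprocessing_fn`
# assumed by main() above could look like this minimal sketch (feature name
# and transform are illustrative, not from the original):
import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

_RAW_DATA = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
_RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))

def _preprocessing_fn(inputs):
  # Center `x` by subtracting its full-pass mean computed by the analyzer.
  return {'x_centered': inputs['x'] - tft.mean(inputs['x'])}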
def data_transform():
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (
        (dict_features, data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset
    for i in range(len(transformed_data)):
      print("Initial: ", dict_features[i])
      print("Transformed: ", transformed_data[i])
def generate_skipgrams(data_uri,
                       feature_names,
                       vocabulary_size=10,
                       window_size=2,
                       negative_samples=0.,
                       save_path="temp"):

  def parse_tensor_f(x):
    xp = tf.io.parse_tensor(x, tf.int64)
    xp.set_shape([None])
    return {fname: xp[i] for i, fname in enumerate(feature_names)}

  raw_data = tf.data.TFRecordDataset(data_uri).map(
      parse_tensor_f).as_numpy_iterator()
  raw_data_schema = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          fname: tf.io.FixedLenFeature([], tf.int64)
          for fname in feature_names
      }))
  dataset = (raw_data, raw_data_schema)
  # Make the preprocessing_fn
  preprocessing_fn = make_preproc_func(vocabulary_size, window_size,
                                       negative_samples, feature_names)
  # Run the beam pipeline
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp(), desired_batch_size=2):
      transformed_dataset, transform_fn = (
          dataset
          | "Make Skipgrams" >>
          tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      print('Transformed dataset:\n{}'.format(
          pprint.pformat(transformed_dataset)))
      # pylint: disable=unused-variable
      transformed_data, transformed_metadata = transformed_dataset
      saved_results = (
          transformed_data
          | "Write to TFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix=save_path,
              file_name_suffix=".tfrecords",
              coder=tft.coders.example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))
      print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
      print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))
  # Return the list of paths of tfrecords
  num_rows_saved = len(transformed_data)
  return saved_results, num_rows_saved
def run_vocab():
  """Creates a pipeline to generate wordpiece vocab over a corpus."""
  vocab_pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        vocab_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))
    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))
    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))
    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))
    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(
            utils.FilterByCount(FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
        | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
        | 'WriteVocab' >> beam.io.WriteToText(
            vocab_file,
            shard_name_template='',
            append_trailing_newlines=False))
  return vocab_pipeline
def transform_tft(train_data, test_data, working_dir):
  options = PipelineOptions()
  options.view_as(StandardOptions).runner = 'DirectRunner'
  with beam.Pipeline(options=options) as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      data_shape = train_data[0][0].shape
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.Create(train_data)
          | 'CreateTrainData' >> beam.Map(lambda data: format(data)))
      raw_data_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec({
              IMAGE_KEY: tf.FixedLenFeature(list(data_shape), tf.float32),
              LABEL_KEY: tf.FixedLenFeature([], tf.int64)
          }))
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
              file_name_suffix='.tfrecords'))
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> beam.Create(test_data)
          | 'CreateTestData' >> beam.Map(lambda data: format(data)))
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft_beam.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
              file_name_suffix='.tfrecords'))
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
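# The module-level `preprocessing_fn` used by transform_tft above is not
# shown. A minimal sketch for this image/label schema (the scaling choice is
# an assumption):
import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # Scale pixel values to [0, 1] over the dataset; pass the label through.
  return {
      IMAGE_KEY: tft.scale_to_0_1(inputs[IMAGE_KEY]),
      LABEL_KEY: inputs[LABEL_KEY],
  }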
def transformed_data(working_dir):
  """Transforms the data and generates the transform_fn."""

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    xi, yi = inputs["x"], inputs["y"]
    x_integerized = tft.compute_and_apply_vocabulary(
        xi, default_value=0, name="vocab")  # , top_k=VOCAB_SIZE
    y_integerized = tft.compute_and_apply_vocabulary(
        yi, default_value=0, name="label")  # , top_k=LABEL_SIZE
    return {"x": x_integerized, "y": y_integerized}

  with tft_beam.Context(temp_dir=path_transform):
    transformed_dataset, transform_fn = (
        (xys, DATA_STRING_FEATURE_SPEC)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_train_data, transformed_metadata = transformed_dataset
    _ = (transform_fn | tft_beam.WriteTransformFn(working_dir))
  return transformed_train_data
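# `xys`, `path_transform`, and `DATA_STRING_FEATURE_SPEC` are module-level
# names assumed by the function above. A sketch of metadata matching the
# string-valued "x" and "y" inputs (shapes are an assumption):
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

DATA_STRING_FEATURE_SPEC = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        "x": tf.io.FixedLenFeature([], tf.string),
        "y": tf.io.FixedLenFeature([], tf.string),
    }))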
def pipeline(root):
  """Pipeline instantiation function.

  Args:
    root: Source pipeline from which to extend.
  """
  preprocessing_fn = (
      compute_vocab_fn if FLAGS.vocab_gen_mode else apply_vocab_fn)
  with tft_beam.Context(temp_dir=FLAGS.temp_dir):
    processed_lines = (
        root
        # Read in TSV data.
        | beam.io.ReadFromText(data_path)
        # Fill in missing elements with the defaults (zeros).
        | "FillMissing" >> beam.ParDo(FillMissing())
        # For numerical features, set negatives to zero. Then take log(x+1).
        | "NegsToZeroLog" >> beam.ParDo(NegsToZeroLog())
        # For categorical features, mod the values with vocab size.
        | "HexToIntModRange" >> beam.ParDo(HexToIntModRange()))
    # CSV reader: List the cols in order, as dataset schema is not ordered.
    ordered_columns = (
        [LABEL_KEY] + NUMERIC_FEATURE_KEYS + CATEGORICAL_FEATURE_KEYS)
    converter = tft.coders.CsvCoder(
        ordered_columns, INPUT_METADATA.schema, delimiter=FLAGS.csv_delimeter)
    converted_data = (
        processed_lines
        | "DecodeData" >> beam.Map(converter.decode))
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (converted_data, INPUT_METADATA)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset
    if not FLAGS.vocab_gen_mode:
      # Write to CSV.
      transformed_csv_coder = tft.coders.CsvCoder(
          ordered_columns,
          transformed_metadata.schema,
          delimiter=FLAGS.csv_delimeter)
      _ = (
          transformed_data
          | "EncodeDataCsv" >> beam.Map(transformed_csv_coder.encode)
          | "WriteDataCsv" >> beam.io.WriteToText(output_path))
def main():

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = x_centered * y_normalized
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable
    pprint.pprint(transformed_data)
def run_hub2emb(args):
  """Runs the embedding generation pipeline."""
  options = beam.options.pipeline_options.PipelineOptions(**args)
  args = namedtuple("options", args.keys())(*args.values())
  raw_metadata = create_metadata()
  converter = tft.coders.CsvCoder(
      column_names=['text'], schema=raw_metadata.schema)
  with beam.Pipeline(args.runner, options=options) as pipeline:
    with tft_beam.Context(args.temporary_dir):
      # Read the sentences from the input file.
      sentences = (
          pipeline
          | 'Read sentences from files' >> beam.io.ReadFromText(
              file_pattern='corpus/text.txt')
          # | 'Convert to dictionary' >> beam.Map(converter.decode)
      )
      sentences_dataset = (sentences, raw_metadata)
      preprocess_fn = make_preprocess_fn(args.module_url,
                                         args.random_projection_matrix)
      # Generate the embeddings for the sentences using the TF-Hub module.
      embeddings_dataset, _ = (
          sentences_dataset
          | 'Extract embeddings' >>
          tft_beam.AnalyzeAndTransformDataset(preprocess_fn))
      embeddings, transformed_metadata = embeddings_dataset
      # Write the embeddings to TFRecords files.
      _ = (
          embeddings
          | 'Write embeddings to TFRecords' >>
          beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix='{}/emb'.format(args.output_dir),
              file_name_suffix='.tfrecords',
              coder=tft.coders.ExampleProtoCoder(
                  transformed_metadata.schema)))
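# `create_metadata` and `make_preprocess_fn` are assumed by run_hub2emb. A
# hedged sketch, assuming a TF2-style TF-Hub module that maps a batch of
# strings to embeddings (hub.load usage and the optional random projection
# are assumptions, not the original implementation):
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

def create_metadata():
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(
          {'text': tf.io.FixedLenFeature([], tf.string)}))

def make_preprocess_fn(module_url, random_projection_matrix=None):
  def _preprocess_fn(inputs):
    module = hub.load(module_url)
    embedding = module(inputs['text'])
    if random_projection_matrix is not None:
      # Optionally reduce dimensionality with a fixed random projection.
      embedding = tf.matmul(
          embedding, tf.constant(random_projection_matrix, tf.float32))
    return {'text': inputs['text'], 'embedding': embedding}
  return _preprocess_fn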
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 values indices, by creating a vocabulary for
  each category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()
    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])
    for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
      # This is a SparseTensor because it is optional. Here we fill in a
      # default value when it is missing.
      dense = tf.compat.v1.sparse_to_dense(
          outputs[key].indices, [outputs[key].dense_shape[0], 1],
          outputs[key].values,
          default_value=0.)
      # Reshaping from a batch of vectors of size 1 to a batch of scalars.
      dense = tf.squeeze(dense, axis=1)
      outputs[key] = tft.scale_to_0_1(dense)
    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature. This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the
    # feature from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.vocabulary(inputs[key], vocab_filename=key)
    # For the label column we provide the mapping from string to index.
    table_keys = ['>50K', '<=50K']
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=table_keys,
        values=tf.cast(tf.range(len(table_keys)), tf.int64),
        key_dtype=tf.string,
        value_dtype=tf.int64)
    table = tf.lookup.StaticHashTable(initializer, default_value=-1)
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])
    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns,
                                      RAW_DATA_METADATA.schema)
      # Read in raw data and convert using CSV converter. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are
      # just to get data into a format that the CSV converter can read, in
      # particular removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors
      # in convert.decode which should only occur for the trailing blank
      # line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.io.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))
      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
      # Now apply transform function to test data. In this case we remove
      # the trailing period at the end of each line, and also ignore the
      # header line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> beam.io.ReadFromText(
              test_data_file, skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))
      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft_beam.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
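# `MapAndFilterErrors` is referenced above but not defined in this snippet.
# A minimal sketch of the usual pattern (counter namespace and naming are
# illustrative): apply a function element-wise and drop, while counting, the
# elements that raise, instead of failing the whole pipeline.
import apache_beam as beam

class MapAndFilterErrors(beam.PTransform):
  """Like beam.Map but filters out errors in the map function."""

  class _MapAndFilterErrorsDoFn(beam.DoFn):
    """Counts and filters out errors in the wrapped map function."""

    def __init__(self, fn):
      self._fn = fn
      # Create a counter to keep track of dropped elements.
      self._bad_elements_counter = beam.metrics.Metrics.counter(
          'census_example', 'bad_elements')

    def process(self, element):
      try:
        yield self._fn(element)
      except Exception:  # pylint: disable=broad-except
        # Record the error in a counter rather than crashing the pipeline.
        self._bad_elements_counter.inc(1)

  def __init__(self, fn):
    self._fn = fn

  def expand(self, pcoll):
    return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))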
    # tf.train.Example only supports tf.int64 (not tf.int32 or tf.bool) for
    # integer-like features, so both fields below are declared as tf.int64.
    'add': tf.FixedLenFeature([], tf.int64),
    'line_length': tf.FixedLenFeature([], tf.int64),
}))
input_data_metadata = dataset_metadata.DatasetMetadata(input_data_schema)

with beam.Pipeline() as pipeline:
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    columns = text_fields + ["commented", "add", "line_length"]
    converter = tft.coders.CsvCoder(columns, input_data_schema)
    input_data = (
        pipeline
        | 'ReadInputData' >> beam.io.ReadFromText(train_data_file)
        | 'CleanInputData' >> MapAndFilterErrors(converter.decode))
    input_dataset = (input_data, input_data_metadata)
    transformed_dataset, transform_fn = (
        input_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset
    transformed_data_coder = tft.coders.ExampleProtoCoder(
        transformed_metadata.schema)
    # Write the resulting data out.
    _ = (
        transformed_data
        | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
        | 'WriteTrainData' >> beam.io.WriteToTFRecord(
            os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
    # We'll use the transform function later too.
    _ = (
        transform_fn
        | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def generate_skipgram_beam(
    data_uri,
    feature_names,
    vocabulary_size=10,
    window_size=2,
    negative_samples=0.0,
    seed=None,
    temp_dir="/tmp",
    save_path="temp",
    beam_pipeline_args=None,
):
  """Generate Skipgrams with an Apache Beam pipeline.

  Parameters
  ----------
  data_uri : list(str)
      List of TFRecords that contains the tensors.
  feature_names : list(str), optional
      List of feature names, whose length must match the number of columns
      of features in the TFRecord. This helps determine the number of
      columns in the TFRecords.
  vocabulary_size : int, optional
      Size of skipgram vocabulary, by default 10
  window_size : int, optional
      Window size of skipgram, by default 2
  negative_samples : float, optional
      Fraction of negative samples of skipgram, by default 0.0
  seed : int, optional
      Random seed, by default None
  temp_dir : str, optional
      Directory to save temporary results used by the Beam pipeline, by
      default "/tmp"
  save_path : str, optional
      Output path name (without the .tfrecord extension), by default "temp"
  beam_pipeline_args : dict, optional
      Pipeline options of the Beam runner, by default None.

  Returns
  -------
  saved_results : list(str)
      List of URIs / paths to the TFRecord files.
  num_rows_saved : int
      Number of rows of the samples saved.
  """

  def parse_tensor_f(x):
    # Parse as int64 to match the feature spec declared below.
    xp = tf.io.parse_tensor(x, tf.int64)
    xp.set_shape([None])
    return {fname: xp[i] for i, fname in enumerate(feature_names)}

  raw_data = tf.data.TFRecordDataset(data_uri).map(
      parse_tensor_f).as_numpy_iterator()
  raw_data_schema = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          fname: tf.io.FixedLenFeature([], tf.int64)
          for fname in feature_names
      }))
  dataset = (raw_data, raw_data_schema)
  # Make the preprocessing_fn
  preprocessing_fn = make_preproc_func(vocabulary_size, window_size,
                                       negative_samples, feature_names, seed)
  # Run the beam pipeline
  pipeline_options = (
      beam.options.pipeline_options.PipelineOptions.from_dictionary(
          beam_pipeline_args)
      if beam_pipeline_args is not None else None
  )  # None = DirectRunner, local mode
  with beam.Pipeline(options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=temp_dir):
      transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
          dataset
          | "Make Skipgrams" >>
          tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      saved_results = (
          transformed_data
          | "Write to TFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix=save_path,
              file_name_suffix=".tfrecords",
              coder=tft.coders.example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema),
          ))
      # print('\nRaw data:\n{}\n'.format(pprint.pformat(dataset)))
      # print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))
  # Return the list of paths of tfrecords
  num_rows_saved = len(transformed_data)
  return saved_results, num_rows_saved
def calculate_metrics():
  """Returns a pipeline to compute wordpiece model stats given a vocab and
  corpus."""
  # Schema of input dataset.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))
  # Schema to format metrics as CSV.
  csv_schema = dataset_schema.from_feature_spec({
      'lang': tf.FixedLenFeature([], tf.string),
      'sample_count': tf.FixedLenFeature([], tf.int64),
      'micro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'macro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'micro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'macro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'unweighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
      'weighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
  })
  columns = [
      'lang', 'sample_count', 'micro_drop_char_percent',
      'macro_drop_char_percent', 'micro_compress_ratio',
      'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
      'weighted_en_wp_overlap_percent'
  ]
  # Create pipeline.
  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    example_converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            FLAGS.input_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))
    # Apply transform to wordpiece-tokenize input.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))
    # Aggregate values for each lang, calculate metrics, and write to output.
    _ = (
        transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            FLAGS.output_file,
            shard_name_template='',
            header=','.join(columns)))
  return pipeline
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 values indices, by creating a vocabulary for
  each category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with apache_beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns,
                                      RAW_DATA_METADATA.schema)
      # Read in raw data and convert using CSV converter. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are
      # just to get data into a format that the CSV converter can read, in
      # particular removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors
      # in convert.decode which should only occur for the trailing blank
      # line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> apache_beam.io.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> apache_beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))
      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      # A coder between TF Examples and tf.Transform datasets.
      # Used to encode a tf.transform encoded dict as tf.Example.
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'EncodeTrainData' >> apache_beam.Map(
              transformed_data_coder.encode)
          | 'WriteTrainData' >> apache_beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
      # Now apply transform function to test data. In this case we remove
      # the trailing period at the end of each line, and also ignore the
      # header line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> apache_beam.io.ReadFromText(
              test_data_file, skip_header_lines=1)
          | 'FixCommasTestData' >> apache_beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> apache_beam.Map(
              lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))
      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft_beam.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> apache_beam.Map(
              transformed_data_coder.encode)
          | 'WriteTestData' >> apache_beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def test_preprocessing_fn(self):
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  feature_spec = taxi_utils._get_raw_feature_spec(schema)
  working_dir = self.get_temp_dir()
  transform_output_path = os.path.join(working_dir, 'transform_output')
  transformed_examples_path = os.path.join(working_dir,
                                           'transformed_examples')
  # Run very simplified version of executor logic.
  # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
  # Generate legacy `DatasetMetadata` object. Future version of Transform
  # will accept the `Schema` proto directly.
  legacy_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
      examples = (
          p
          | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
              os.path.join(self._testdata_path, 'csv_example_gen/train/*'),
              coder=beam.coders.BytesCoder(),
              # TODO(b/114938612): Eventually remove this override.
              validate=False)
          | 'DecodeTrainData' >> beam.Map(decoder.decode))
      (transformed_examples, transformed_metadata), transform_fn = (
          (examples, legacy_metadata)
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              taxi_utils.preprocessing_fn))
      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # pylint: disable=expression-not-assigned
      (transform_fn
       | 'WriteTransformFn' >>
       tft_beam.WriteTransformFn(transform_output_path))
      encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
      (transformed_examples
       | 'EncodeTrainData' >> beam.Map(encoder.encode)
       | 'WriteTrainData' >> beam.io.WriteToTFRecord(
           os.path.join(transformed_examples_path,
                        'train/transformed_examples.gz'),
           coder=beam.coders.BytesCoder()))
      # pylint: enable=expression-not-assigned
  # Verify the output matches golden output.
  # NOTE: we don't verify that transformed examples match golden output.
  expected_transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(
          self._testdata_path,
          'transform/transform_output/transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(transform_output_path,
                   'transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  # Clear annotations so we only have to test main schema.
  for feature in transformed_schema.feature:
    feature.ClearField('annotation')
  self.assertEqual(transformed_schema, expected_transformed_schema)
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
  """
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(
        temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
      tfxio_train_data = tfxio.TFExampleRecord(
          file_pattern=os.path.join(working_dir,
                                    SHUFFLED_TRAIN_DATA_FILEBASE + '*'),
          schema=SCHEMA)
      train_data = (
          pipeline | 'TFXIORead[Train]' >> tfxio_train_data.BeamSource())
      tfxio_test_data = tfxio.TFExampleRecord(
          file_pattern=os.path.join(working_dir,
                                    SHUFFLED_TEST_DATA_FILEBASE + '*'),
          schema=SCHEMA)
      test_data = (
          pipeline | 'TFXIORead[Test]' >> tfxio_test_data.BeamSource())

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]
        # Here tf.compat.v1.string_split behaves differently from
        # tf.strings.split.
        review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
        review_indices = tft.compute_and_apply_vocabulary(
            review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by compute_and_apply_vocabulary.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      # Transformed metadata is not necessary for encoding.
      # The TFXIO output format is chosen for improved performance.
      (transformed_train_data, _), transform_fn = (
          (train_data, tfxio_train_data.TensorAdapterConfig())
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn, output_record_batches=True))
      transformed_test_data, _ = (
          ((test_data, tfxio_test_data.TensorAdapterConfig()), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset(
              output_record_batches=True))
      # Extract transformed RecordBatches, encode and write them to the
      # given directory.
      coder = tfxio.RecordBatchToExamplesEncoder()
      _ = (
          transformed_train_data
          | 'EncodeTrainData' >> beam.FlatMapTuple(
              lambda batch, _: coder.encode(batch))
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.FlatMapTuple(
              lambda batch, _: coder.encode(batch))
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by tft.TRANSFORM_FN_DIR and
      # tft.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(data):
  """
  :param data: A list of raw data.
  :return: A numpy array of arrays of integers.
  """
  with tft_beam.Context(temp_dir="temp/"):
    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            # early_slack_count, midday_slack_count and late_slack_count are
            # counts of slack messages sent at that time of day.
            'early_slack_count': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            'midday_slack_count': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            'late_slack_count': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            # negative_emoji, positive_emoji and neutral_emoji are the
            # sentiments of the emojis sent.
            'negative_emoji': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            'positive_emoji': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            'neutral_emoji': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            # Github count
            'github_count': tensorflow.FixedLenFeature([], tensorflow.int64),
            # weekday
            'weekday': tensorflow.FixedLenFeature([], tensorflow.int64),
            'event_rating_ratio': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            'temperature': tensorflow.FixedLenFeature([], tensorflow.int64),
            'precipitation': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
            'slack_negative_ratio': tensorflow.FixedLenFeature(
                [], tensorflow.int64),
        }))
    transformed_dataset, transform_fn = (
        (data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocess))
    transformed_data, transformed_metadata = transformed_dataset
    # TODO: There should be an easier way to do this.
    retransformed_data = []
    for trans in transformed_data:
      current = [
          trans["early_slack_count_normalized"],
          trans["midday_slack_count_normalized"],
          trans["late_slack_count_normalized"],
          trans["negative_emoji_normalized"],
          trans["neutral_emoji_normalized"],
          trans["positive_emoji_normalized"],
          trans["github_count_normalized"],
          trans["weekday"],
          trans["event_rating_normalized"],
          trans["temperature_normalized"],
          trans["precipitation_normalized"],
          trans["slack_negative_normalized"]
      ]
      retransformed_data.append(current)
    return array(retransformed_data)
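# The `preprocess` function used above is not shown. A sketch matching the
# keys consumed from `transformed_data` (the [0, 1] scaling choice is an
# assumption; the output names come from the consuming code above):
import tensorflow as tf
import tensorflow_transform as tft

def preprocess(inputs):
  # Map each raw feature to a "<name>_normalized" output scaled to [0, 1].
  renames = {
      'early_slack_count': 'early_slack_count_normalized',
      'midday_slack_count': 'midday_slack_count_normalized',
      'late_slack_count': 'late_slack_count_normalized',
      'negative_emoji': 'negative_emoji_normalized',
      'neutral_emoji': 'neutral_emoji_normalized',
      'positive_emoji': 'positive_emoji_normalized',
      'github_count': 'github_count_normalized',
      'event_rating_ratio': 'event_rating_normalized',
      'temperature': 'temperature_normalized',
      'precipitation': 'precipitation_normalized',
      'slack_negative_ratio': 'slack_negative_normalized',
  }
  outputs = {
      out: tft.scale_to_0_1(tf.cast(inputs[src], tf.float32))
      for src, out in renames.items()
  }
  # weekday is passed through unscaled.
  outputs['weekday'] = inputs['weekday']
  return outputs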
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 values indices, by creating a vocabulary for
  each category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()
    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(inputs[key])
    for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
      # This is a SparseTensor because it is optional. Here we fill in a
      # default value when it is missing.
      sparse = tf.sparse.SparseTensor(inputs[key].indices, inputs[key].values,
                                      [inputs[key].dense_shape[0], 1])
      dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
      # Reshaping from a batch of vectors of size 1 to a batch of scalars.
      dense = tf.squeeze(dense, axis=1)
      outputs[key] = tft.scale_to_0_1(dense)
    # For all categorical columns except the label column, we generate a
    # vocabulary and map the feature from a string to an integer id, with
    # one out-of-vocabulary bucket. The vocabulary file is also written out
    # under `vocab_filename`.
    for key in CATEGORICAL_FEATURE_KEYS:
      outputs[key] = tft.compute_and_apply_vocabulary(
          tf.strings.strip(inputs[key]),
          num_oov_buckets=1,
          vocab_filename=key)
    # For the label column we provide the mapping from string to index.
    table_keys = ['>50K', '<=50K']
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=table_keys,
        values=tf.cast(tf.range(len(table_keys)), tf.int64),
        key_dtype=tf.string,
        value_dtype=tf.int64)
    table = tf.lookup.StaticHashTable(initializer, default_value=-1)
    # Remove trailing periods for test data when the data is read with
    # tf.data.
    label_str = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
    label_str = tf.strings.strip(label_str)
    data_labels = table.lookup(label_str)
    transformed_label = tf.one_hot(
        indices=data_labels,
        depth=len(table_keys),
        on_value=1.0,
        off_value=0.0)
    outputs[LABEL_KEY] = tf.reshape(transformed_label, [-1, len(table_keys)])
    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Create a TFXIO to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      # We first read CSV files and use BeamRecordCsvTFXIO whose
      # .BeamSource() accepts a PCollection[bytes] because we need to patch
      # the records first (see "FixCommasTrainData" below). Otherwise,
      # tfxio.CsvTFXIO can be used to both read the CSV files and parse them
      # to TFT inputs:
      #   csv_tfxio = tfxio.CsvTFXIO(...)
      #   raw_data = (pipeline | 'ToRecordBatches' >> csv_tfxio.BeamSource())
      csv_tfxio = tfxio.BeamRecordCsvTFXIO(
          physical_format='text',
          column_names=ORDERED_CSV_COLUMNS,
          schema=SCHEMA)
      # Read in raw data and convert using the CSV TFXIO. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are
      # just to get data into a format that the CSV TFXIO can read, in
      # particular removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.io.ReadFromText(
              train_data_file, coder=beam.coders.BytesCoder())
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(b', ', b','))
          | 'DecodeTrainData' >> csv_tfxio.BeamSource())
      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, csv_tfxio.TensorAdapterConfig())
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
      # Now apply transform function to test data. In this case we remove
      # the trailing period at the end of each line, and also ignore the
      # header line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> beam.io.ReadFromText(
              test_data_file,
              skip_header_lines=1,
              coder=beam.coders.BytesCoder())
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(b', ', b','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> csv_tfxio.BeamSource())
      raw_test_dataset = (raw_test_data, csv_tfxio.TensorAdapterConfig())
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft_beam.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(working_dir):
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(
        temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
      train_coder = tft.coders.ExampleProtoCoder(
          TRAIN_RAW_DATA_METADATA.schema)
      test_coder = tft.coders.ExampleProtoCoder(TEST_RAW_DATA_METADATA.schema)
      train_data = (
          pipeline
          | 'Read Train' >> beam.io.ReadFromTFRecord(
              os.path.join(working_dir, TFRECORD_TRAIN_DATA_FILEBASE + '*'))
          | 'Decode Train' >> beam.Map(train_coder.decode))
      test_data = (
          pipeline
          | 'Read Test' >> beam.io.ReadFromTFRecord(
              os.path.join(working_dir, TFRECORD_TEST_DATA_FILEBASE + '*'))
          | 'Decode Test' >> beam.Map(test_coder.decode))

      def preprocessing_fn_train(inputs):
        """Preprocess input columns into transformed columns."""
        context = inputs['Context']
        utterance = inputs['Utterance']
        vocab = tf.concat([context, utterance], 0)
        context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
        utterance_tokens = tf.compat.v1.string_split(utterance, DELIMITERS)
        vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)
        vocab_mapping_file_path = tft.vocabulary(
            vocab_tokens, vocab_filename='anantvir_train_vocab')
        mapped_context = tft.apply_vocabulary(
            context_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        print(mapped_context)
        mapped_utterance = tft.apply_vocabulary(
            utterance_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        return {
            'Context': mapped_context,
            'Utterance': mapped_utterance,
        }

      def preprocessing_fn_test(inputs):
        """Preprocess input columns into transformed columns."""
        context = inputs['Context']
        ground_truth_utterance = inputs['Ground Truth Utterance']
        distractor_0 = inputs['Distractor_0']
        distractor_1 = inputs['Distractor_1']
        distractor_2 = inputs['Distractor_2']
        distractor_3 = inputs['Distractor_3']
        distractor_4 = inputs['Distractor_4']
        distractor_5 = inputs['Distractor_5']
        distractor_6 = inputs['Distractor_6']
        distractor_7 = inputs['Distractor_7']
        distractor_8 = inputs['Distractor_8']
        vocab = tf.concat([
            context, ground_truth_utterance, distractor_0, distractor_1,
            distractor_2, distractor_3, distractor_4, distractor_5,
            distractor_6, distractor_7, distractor_8
        ], 0)
        context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
        ground_truth_utterance_tokens = tf.compat.v1.string_split(
            ground_truth_utterance, DELIMITERS)
        distractor_0_tokens = tf.compat.v1.string_split(
            distractor_0, DELIMITERS)
        distractor_1_tokens = tf.compat.v1.string_split(
            distractor_1, DELIMITERS)
        distractor_2_tokens = tf.compat.v1.string_split(
            distractor_2, DELIMITERS)
        distractor_3_tokens = tf.compat.v1.string_split(
            distractor_3, DELIMITERS)
        distractor_4_tokens = tf.compat.v1.string_split(
            distractor_4, DELIMITERS)
        distractor_5_tokens = tf.compat.v1.string_split(
            distractor_5, DELIMITERS)
        distractor_6_tokens = tf.compat.v1.string_split(
            distractor_6, DELIMITERS)
        distractor_7_tokens = tf.compat.v1.string_split(
            distractor_7, DELIMITERS)
        distractor_8_tokens = tf.compat.v1.string_split(
            distractor_8, DELIMITERS)
        vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)
        vocab_mapping_file_path = tft.vocabulary(
            vocab_tokens, vocab_filename='anantvir_test_vocab')
        mapped_context = tft.apply_vocabulary(
            context_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_ground_truth_utterance = tft.apply_vocabulary(
            ground_truth_utterance_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        # Each distractor is mapped with its own token tensor.
        mapped_distractor_0 = tft.apply_vocabulary(
            distractor_0_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_1 = tft.apply_vocabulary(
            distractor_1_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_2 = tft.apply_vocabulary(
            distractor_2_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_3 = tft.apply_vocabulary(
            distractor_3_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_4 = tft.apply_vocabulary(
            distractor_4_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_5 = tft.apply_vocabulary(
            distractor_5_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_6 = tft.apply_vocabulary(
            distractor_6_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_7 = tft.apply_vocabulary(
            distractor_7_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        mapped_distractor_8 = tft.apply_vocabulary(
            distractor_8_tokens,
            deferred_vocab_filename_tensor=vocab_mapping_file_path)
        return {
            'Context': mapped_context,
            'Ground Truth Utterance': mapped_ground_truth_utterance,
            'Distractor_0': mapped_distractor_0,
            'Distractor_1': mapped_distractor_1,
            'Distractor_2': mapped_distractor_2,
            'Distractor_3': mapped_distractor_3,
            'Distractor_4': mapped_distractor_4,
            'Distractor_5': mapped_distractor_5,
            'Distractor_6': mapped_distractor_6,
            'Distractor_7': mapped_distractor_7,
            'Distractor_8': mapped_distractor_8,
        }

      # train_transform_fn = (
      #     # data, metadata = dataset
      #     (train_data, TRAIN_RAW_DATA_METADATA)
      #     | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn_train))
      (transformed_train_data,
       transformed_train_metadata), train_transform_fn = (
           (train_data, TRAIN_RAW_DATA_METADATA)
           | 'AnalyzeAndTransformTrain' >>
           tft_beam.AnalyzeAndTransformDataset(preprocessing_fn_train))

      # https://stackoverflow.com/questions/46406419/collecting-output-from-apache-beam-pipeline-and-displaying-it-to-console
      def print_row(row):
        # raw_inputs = row['Context']
        # padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        #     raw_inputs, padding='post')
        print(row)

      _ = (transformed_train_data | 'print' >> beam.Map(print_row))
      transformed_train_data_coder = tft.coders.ExampleProtoCoder(
          transformed_train_metadata.schema)
      (transformed_test_data,
       transformed_test_metadata), test_transform_fn = (
           (test_data, TEST_RAW_DATA_METADATA)
           | 'AnalyzeAndTransformTest' >>
           tft_beam.AnalyzeAndTransformDataset(preprocessing_fn_test))
      transformed_test_data_coder = tft.coders.ExampleProtoCoder(
          transformed_test_metadata.schema)
      _ = (
          transformed_train_data
          | 'EncodeTrainData' >> beam.Map(transformed_train_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_test_data_coder.encode)
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
def build_pipeline(df: pd.DataFrame, job_label: str, runner: str, project: str, region: str, output_dir: str, compression: str, num_shards: int, dataflow_options: dict, integer_label: bool) -> beam.Pipeline: """Runs TFRecorder Beam Pipeline. Args: df: Pandas DataFrame job_label: User description for the beam job. runner: Beam Runner: (e.g. DataflowRunner, DirectRunner). project: GCP project ID (if DataflowRunner) region: GCP compute region (if DataflowRunner) output_dir: GCS or Local Path for output. compression: gzip or None. num_shards: Number of shards. dataflow_options: Dataflow Runner Options (optional) integer_label: Flags if label is already an integer. Returns: beam.Pipeline Note: These inputs must be validated upstream (by client.create_tfrecord()) """ job_name = _get_job_name(job_label) job_dir = _get_job_dir(output_dir, job_name) options = _get_pipeline_options(runner, job_name, job_dir, project, region, dataflow_options) #with beam.Pipeline(runner, options=options) as p: p = beam.Pipeline(options=options) with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')): converter = tft.coders.CsvCoder(constants.IMAGE_CSV_COLUMNS, constants.IMAGE_CSV_METADATA.schema) extract_images_fn = beam_image.ExtractImagesDoFn( constants.IMAGE_URI_KEY) flatten_rows = ToCSVRows() # Each element in the image_csv_data PCollection will be a dict # including the image_csv_columns and the image features created from # extract_images_fn. image_csv_data = ( p | 'ReadFromDataFrame' >> beam.Create(df.values.tolist()) | 'ToCSVRows' >> beam.ParDo(flatten_rows) | 'DecodeCSV' >> beam.Map(converter.decode) | 'ReadImage' >> beam.ParDo(extract_images_fn)) # Split dataset into train and validation. train_data, val_data, test_data, discard_data = ( image_csv_data | 'SplitDataset' >> beam.Partition( _partition_fn, len(constants.SPLIT_VALUES))) train_dataset = (train_data, constants.RAW_METADATA) val_dataset = (val_data, constants.RAW_METADATA) test_dataset = (test_data, constants.RAW_METADATA) # TensorFlow Transform applied to all datasets. preprocessing_fn = functools.partial(_preprocessing_fn, integer_label=integer_label) transformed_train_dataset, transform_fn = ( train_dataset | 'AnalyzeAndTransformTrain' >> tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_train_data, transformed_metadata = transformed_train_dataset transformed_data_coder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) transformed_val_data, _ = ( (val_dataset, transform_fn) | 'TransformVal' >> tft_beam.TransformDataset()) transformed_test_data, _ = ( (test_dataset, transform_fn) | 'TransformTest' >> tft_beam.TransformDataset()) # Sinks for TFRecords and metadata. 
    tfr_writer = functools.partial(_get_write_to_tfrecord,
                                   output_dir=job_dir,
                                   compress=compression,
                                   num_shards=num_shards)

    _ = (transformed_train_data
         | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
         | 'WriteTrainData' >> tfr_writer(prefix='train'))

    _ = (transformed_val_data
         | 'EncodeValData' >> beam.Map(transformed_data_coder.encode)
         | 'WriteValData' >> tfr_writer(prefix='val'))

    _ = (transformed_test_data
         | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
         | 'WriteTestData' >> tfr_writer(prefix='test'))

    _ = (discard_data
         | 'DiscardDataWriter' >> beam.io.WriteToText(
             os.path.join(job_dir, 'discarded-data')))

    # Output transform function and metadata.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam.WriteTransformFn(job_dir))

    # Output metadata schema.
    _ = (transformed_metadata
         | 'WriteMetadata' >> tft_beam.WriteMetadata(job_dir, pipeline=p))

  return p
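# build_pipeline only constructs the Beam graph; the caller is responsible
# for running it. A minimal sketch of a driver, assuming a DataFrame prepared
# upstream; all argument values below are illustrative placeholders, not
# values from the original code.
import pandas as pd

df = pd.DataFrame()  # Placeholder; columns per constants.IMAGE_CSV_COLUMNS.

pipeline = build_pipeline(
    df=df,
    job_label='my-tfrecord-job',
    runner='DirectRunner',
    project='my-gcp-project',
    region='us-central1',
    output_dir='./output',
    compression='gzip',
    num_shards=4,
    dataflow_options={},
    integer_label=False)

result = pipeline.run()
result.wait_until_finish()  # Block until all TFRecords and metadata are written.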
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the parquet io, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  strings to int64 indices by creating a vocabulary for each category.

  Args:
    train_data_file: File containing training data.
    test_data_file: File containing test data.
    working_dir: Directory to write transformed data and metadata to.
  """
  numerical_feats = [
      "startCountTotal", "purchaseCountTotal", "globalStartCountTotal",
      "globalPurchaseCountTotal"
  ]

  categorical_feats = ["country", "sourceGameId", "platform"]

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}
    for key in numerical_feats:
      outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                             tf.float32) / 20.0 - 0.5

    outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0

    # Build the crossed features in a local dict instead of mutating the
    # `inputs` dict that TFT passes in.
    features = dict(inputs)
    features["game_zone"] = tf.strings.join(
        [inputs["sourceGameId"], inputs["zone"]], separator="_")
    features["game_campaignId"] = tf.strings.join(
        [inputs["sourceGameId"], inputs["campaignId"]], separator="_")

    for key in categorical_feats + ["game_zone", "game_campaignId"]:
      vocab = tft.vocabulary(features[key], vocab_filename=key,
                             frequency_threshold=100)
      outputs[key] = tft.apply_vocabulary(features[key], vocab,
                                          default_value=0)

    outputs["label"] = inputs["label"]
    outputs["key"] = inputs["key"]

    return outputs

  # Input schema definition.
  RAW_DATA_METADATA = gather_raw_metadata(
      numerical_feats + ["campaignCost"],
      categorical_feats + ["zone", "campaignId", "key"])

  # Pipeline args to read from GCS; currently unused because a local file is
  # read instead.
  pipeline_args = [
      '--runner=DirectRunner',
      '--project=unity-ads-ds-prd',
      # '--staging_location=gs://unity-ads-ds-prd-users/villew/promo/staging',
      # '--temp_location=gs://unity-ads-ds-prd-users/villew/promo/temp',
      '--job_name=transform-promo-data-to-tf-records'
  ]
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Create a Beam pipeline.
  with beam.Pipeline(options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

      # Combine data and schema into a dataset tuple.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      # Write to TFRecord.
      _ = (transformed_data
           | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTrainData' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, "train_tfrecord")))

      # Now apply the transform function to the test data.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> beam.io.ReadFromParquet(test_data_file))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft_beam.TransformDataset())

      # The transformed data schema is not needed; it is the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (transformed_test_data
           | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTestData' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, "test_tfrecord")))

      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
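# As the final comment notes, the WriteTransformFn output can be loaded back
# with tft.TFTransformOutput. A minimal sketch of the reading side, assuming
# the same working_dir as above; the dataset and batch handling here are
# illustrative, not part of the original code.
import os

import tensorflow as tf
import tensorflow_transform as tft

working_dir = './working_dir'  # Placeholder; the directory passed to transform_data.

# Load the transform graph and transformed schema written by WriteTransformFn.
tft_output = tft.TFTransformOutput(working_dir)
feature_spec = tft_output.transformed_feature_spec()

# Parse the transformed TFRecords using the recovered feature spec.
dataset = tf.data.TFRecordDataset(
    tf.io.gfile.glob(os.path.join(working_dir, "train_tfrecord*")))
dataset = dataset.batch(32).map(
    lambda batch: tf.io.parse_example(batch, feature_spec))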
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
  """

  with beam.Pipeline() as pipeline:
    with tft_beam.Context(
        temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
      coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
      train_data = (
          pipeline
          | 'ReadTrain' >> beam.io.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
          | 'DecodeTrain' >> beam.Map(coder.decode))

      test_data = (
          pipeline
          | 'ReadTest' >> beam.io.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'))
          | 'DecodeTest' >> beam.Map(coder.decode))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]

        # Here tf.compat.v1.string_split behaves differently from
        # tf.strings.split.
        review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
        review_indices = tft.compute_and_apply_vocabulary(
            review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by compute_and_apply_vocabulary.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, RAW_DATA_METADATA)
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn))
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      transformed_test_data, _ = (
          ((test_data, RAW_DATA_METADATA), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      _ = (transformed_train_data
           | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTrainData' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (transformed_test_data
           | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTestData' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by tft.TRANSFORM_FN_DIR and
      # tft.TRANSFORMED_METADATA_DIR respectively.
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
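# The snippet above leans on module-level constants (RAW_DATA_METADATA,
# REVIEW_KEY, DELIMITERS, etc.) defined elsewhere in its module. A minimal
# sketch of plausible definitions using schema_from_feature_spec; the exact
# values are assumptions for illustration, not the original module's.
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

REVIEW_KEY = 'review'
REVIEW_WEIGHT_KEY = 'review_weight'
LABEL_KEY = 'label'
DELIMITERS = '.,!?() '
VOCAB_SIZE = 20000

# Raw data: one free-text review string and one integer label per example.
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        REVIEW_KEY: tf.io.FixedLenFeature([], tf.string),
        LABEL_KEY: tf.io.FixedLenFeature([], tf.int64),
    }))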
def transform_data(input_features, preprocessing_fn, pipeline_args,
                   train_data_file, cv_data_file, test_data_file,
                   working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the parquet io, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  strings to int64 indices by creating a vocabulary for each category.

  Args:
    input_features: Named tuple with feature types.
    preprocessing_fn: Preprocessing function to apply to the raw data.
    pipeline_args: Arguments used to build the PipelineOptions.
    train_data_file: File containing training data.
    cv_data_file: File containing cross-validation data.
    test_data_file: File containing test data.
    working_dir: Directory to write transformed data and metadata to.
  """

  # Input schema definition.
  RAW_DATA_METADATA = _get_raw_metadata(input_features)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Create a Beam pipeline.
  with beam.Pipeline(options=pipeline_options) as pipeline:
    # Needs to be a GCS location if the process is running on Dataflow;
    # otherwise the workers can't share model files.
    temp_dir = pipeline_options.get_all_options().get(
        'temp_location') or tempfile.mkdtemp()
    with tft_beam.Context(temp_dir=temp_dir):
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

      # Combine data and schema into a dataset tuple.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      # Write to TFRecord.
      _ = (transformed_data
           | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
           | 'WriteTrainData' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, "train_tfrecord")))

      def encode_data(data_path, prefix, output_filename):
        # Apply the transform function to held-out data.
        raw_data = (
            pipeline
            | 'ReadData' + prefix >> beam.io.ReadFromParquet(data_path))

        raw_dataset = (raw_data, RAW_DATA_METADATA)
        transformed_dataset = (
            (raw_dataset, transform_fn)
            | 'Transform' + prefix >> tft_beam.TransformDataset())

        # The transformed data schema is not needed; it is the same as before.
        transformed_data, _ = transformed_dataset
        _ = (transformed_data
             | 'EncodeData' + prefix >> beam.Map(
                 transformed_data_coder.encode)
             | 'WriteData' + prefix >> beam.io.WriteToTFRecord(
                 os.path.join(working_dir, output_filename)))

      encode_data(cv_data_file, "-cv", "cv_tfrecord")
      encode_data(test_data_file, "-test", "test_tfrecord")

      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
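# Because this variant takes the schema and preprocessing_fn as parameters,
# it can be reused across datasets. A minimal sketch of a caller, assuming
# _get_raw_metadata accepts (name, type) pairs; the feature names, file paths
# and preprocessing body are illustrative assumptions, not the original code.
import tensorflow as tf
import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # Scale the numeric column and index the categorical one.
  return {
      'amount_scaled': tft.scale_to_z_score(inputs['amount']),
      'country_id': tft.compute_and_apply_vocabulary(inputs['country']),
      'label': inputs['label'],
  }

# Hypothetical feature description consumed by _get_raw_metadata.
input_features = [('amount', tf.float32), ('country', tf.string),
                  ('label', tf.int64)]

transform_data(
    input_features=input_features,
    preprocessing_fn=preprocessing_fn,
    pipeline_args=['--runner=DirectRunner'],
    train_data_file='train.parquet',
    cv_data_file='cv.parquet',
    test_data_file='test.parquet',
    working_dir='./working_dir')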