def pipeline(root):
  """Pipeline instantiation function.

  Args:
    root: Source pipeline from which to extend.
  """

  # This pipeline is concerned only with searching the sparse features.
  with tft_beam.Context(temp_dir=FLAGS.temp_dir):
    processed_lines = (
        root
        # Read in TSV data.
        | "ReadData" >> beam.io.ReadFromText(data_path)
        # For categorical features, search for the given values, as integers.
        | "HexSearchFilter" >> beam.ParDo(HexSearchFilter(), 1, [
            14198776, 26023586, 21084594
        ]).with_outputs("malformed_entries", main="filtered_outputs"))

    malformed_lines = processed_lines.malformed_entries
    processed_lines = processed_lines.filtered_outputs

    _ = (processed_lines
         | "WriteData" >> beam.io.WriteToText(output_path))
    _ = (malformed_lines
         | "WriteDataMalformed" >> beam.io.WriteToText(
             output_path + "_malformed"))
def testPreprocessingFn(self):
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  feature_spec = taxi_utils._get_raw_feature_spec(schema)
  working_dir = self.get_temp_dir()
  transform_graph_path = os.path.join(working_dir, 'transform_graph')
  transformed_examples_path = os.path.join(working_dir,
                                           'transformed_examples')

  # Run very simplified version of executor logic.
  # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
  # Generate legacy `DatasetMetadata` object. Future version of Transform
  # will accept the `Schema` proto directly.
  legacy_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(feature_spec))
  tfxio = tf_example_record.TFExampleRecord(
      file_pattern=os.path.join(self._testdata_path,
                                'csv_example_gen/Split-train/*'),
      telemetry_descriptors=['Tests'],
      schema=legacy_metadata.schema)
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
      examples = p | 'ReadTrainData' >> tfxio.BeamSource()
      (transformed_examples, transformed_metadata), transform_fn = (
          (examples, tfxio.TensorAdapterConfig())
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              taxi_utils.preprocessing_fn))

      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # pylint: disable=expression-not-assigned
      (transform_fn
       | 'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path))

      encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
      (transformed_examples
       | 'EncodeTrainData' >> beam.Map(encoder.encode)
       | 'WriteTrainData' >> beam.io.WriteToTFRecord(
           os.path.join(transformed_examples_path,
                        'Split-train/transformed_examples.gz'),
           coder=beam.coders.BytesCoder()))
      # pylint: enable=expression-not-assigned

  # Verify the output matches golden output.
  # NOTE: we don't verify that transformed examples match golden output.
  expected_transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(
          self._testdata_path,
          'transform/transform_graph/transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  # Clear annotations so we only have to test main schema.
  transformed_schema.ClearField('annotation')
  for feature in transformed_schema.feature:
    feature.ClearField('annotation')
  self.assertEqual(transformed_schema, expected_transformed_schema)
def test_train(self):
  """Tests case where training data is passed."""

  with self.pipeline as p:
    with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')):
      df = self.pre_tft_df[self.pre_tft_df.split == 'TRAIN']
      dataset = self._get_dataset(p, df)
      preprocessing_fn = functools.partial(
          beam_pipeline._preprocessing_fn,
          schema_map=self.schema.pre_tft_schema_map)
      transform_fn = (
          beam_pipeline._transform_and_write_tfr(
              dataset,
              self.tfr_writer,
              preprocessing_fn=preprocessing_fn,
              metadata=self.pre_tft_metadata,
              label='Train'))
      _ = transform_fn | tft_beam.WriteTransformFn(self.test_dir)

  self.assertTrue(
      os.path.isdir(os.path.join(self.test_dir, 'transform_fn')))
  self.assertTrue(
      os.path.isdir(os.path.join(self.test_dir, 'transformed_metadata')))
  self.assertTrue(glob.glob(os.path.join(self.test_dir, 'train*.gz')))
  self.assertFalse(
      glob.glob(os.path.join(self.test_dir, 'validation*.gz')))
  self.assertFalse(glob.glob(os.path.join(self.test_dir, 'test*.gz')))
def _main(argv=None):
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument('--raw_examples_path', required=True)
  parser.add_argument('--raw_examples_schema_path', required=True)
  parser.add_argument('--preprocessing_module_path', required=True)
  parser.add_argument('--transform_fn_dir', required=True)
  known_args, pipeline_args = parser.parse_known_args(argv)

  raw_examples_schema = load_schema(known_args.raw_examples_schema_path)
  raw_examples_coder = tft.coders.ExampleProtoCoder(raw_examples_schema)
  raw_examples_metadata = dataset_metadata.DatasetMetadata(raw_examples_schema)

  tft_preprocessing = load_module_from_file_path(
      'tft_preprocessing', known_args.preprocessing_module_path)
  preprocessing_fn = tft_preprocessing.preprocessing_fn

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
      raw_examples = pipeline | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
          known_args.raw_examples_path, coder=raw_examples_coder)
      raw_examples_dataset = (raw_examples, raw_examples_metadata)
      transform_fn = raw_examples_dataset | tft_beam.AnalyzeDataset(
          preprocessing_fn)
      transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
          known_args.transform_fn_dir)
def run_metrics():
  """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""

  metrics_pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        metrics_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply transform to wordpiece-tokenize input.
    (metrics_transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file,
                                           FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Initialize CSV coder. Aggregate values for each lang, calculate metrics,
    # and write the output to a CSV file.
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)
    _ = (
        metrics_transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            metrics_file, shard_name_template='', header=','.join(columns)))
  return metrics_pipeline
def expand(self, pipeline):
  # TODO(b/147620802): Consider making this (and other parameters)
  # configurable to test more variants (e.g. with and without deep-copy
  # optimisation, with and without cache, etc).
  with tft_beam.Context(
      temp_dir=tempfile.mkdtemp(),
      force_tf_compat_v1=self._force_tf_compat_v1):
    raw_data = (
        pipeline
        | "ReadDataset" >> beam.Create(
            self._dataset.read_raw_dataset(
                deserialize=False, limit=self._max_num_examples))
        | "Decode" >> self._tfxio.BeamSource())
    transform_fn, output_metadata = (
        (raw_data, self._tfxio.TensorAdapterConfig())
        | "AnalyzeDataset" >> tft_beam.AnalyzeDataset(self._preprocessing_fn))

    if self._generate_dataset:
      _ = transform_fn | "CopySavedModel" >> _CopySavedModel(
          dest_path=self._dataset.tft_saved_model_path(
              self._force_tf_compat_v1))

    (transformed_dataset, transformed_metadata) = (
        ((raw_data, self._tfxio.TensorAdapterConfig()),
         (transform_fn, output_metadata))
        | "TransformDataset" >> tft_beam.TransformDataset())
    return transformed_dataset, transformed_metadata
def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
  """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """

  lang_set = set(FLAGS.lang_set.split(','))

  # Create pipeline.
  pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)

    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(converter.decode))

    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(
            utils.FilterByCount(FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'Flatten' >> beam.FlatMap(lambda x: x)
        | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
        | 'WriteSortedCount' >> beam.io.WriteToText(
            output_path, shard_name_template=''))
  return pipeline
def expand(self, pipeline):
  # TODO(b/147620802): Consider making this (and other parameters)
  # configurable to test more variants (e.g. with and without deep-copy
  # optimisation, with and without cache, etc).
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    converter = tft.coders.ExampleProtoCoder(
        self._tf_metadata_schema, serialized=False)
    raw_data = (
        pipeline
        | "ReadDataset" >> beam.Create(self._dataset.read_raw_dataset())
        | "Decode" >> beam.Map(converter.decode))
    transform_fn, output_metadata = (
        (raw_data, self._transform_input_dataset_metadata)
        | "AnalyzeDataset" >> tft_beam.AnalyzeDataset(self._preprocessing_fn))

    if self._generate_dataset:
      _ = transform_fn | "CopySavedModel" >> _CopySavedModel(
          dest_path=self._dataset.tft_saved_model_path())

    (transformed_dataset, transformed_metadata) = (
        ((raw_data, self._transform_input_dataset_metadata),
         (transform_fn, output_metadata))
        | "TransformDataset" >> tft_beam.TransformDataset())
    return transformed_dataset, transformed_metadata
def run():
  pipeline_options = PipelineOptions(['--runner=DirectRunner'])

  def preprocessing_fn(inputs):
    word = inputs['word']
    count = inputs['count']
    return {
        'word': word,
        'count': count,
        'count_normalized': tft.scale_to_0_1(count)
    }

  with beam.Pipeline(options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      counts_data = (
          pipeline
          | "Load" >> ReadFromText(INPUT_FILE)
          | "CountWords" >> CountWordsTransform())

      (transformed_data, transformed_metadata), _ = (
          (counts_data, COUNTS_METADATA)
          | "AnalyzeAndTransform" >>
          tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

      output_column_names = ['word', 'count', 'count_normalized']
      transformed_data_coder = tft.coders.CsvCoder(
          output_column_names, transformed_metadata.schema)

      _ = (
          transformed_data
          | "EncodeToCsv" >> beam.Map(transformed_data_coder.encode)
          | "Save" >> WriteToText(OUTPUT_FILE))
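The snippet above relies on module-level names (INPUT_FILE, OUTPUT_FILE, COUNTS_METADATA, CountWordsTransform) defined elsewhere. A minimal sketch of what COUNTS_METADATA could look like, assuming a string `word` and numeric `count` feature (hypothetical, not from the original source):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

# Hypothetical metadata for the (word, count) records produced by
# CountWordsTransform; scale_to_0_1 above requires `count` to be numeric.
COUNTS_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'word': tf.io.FixedLenFeature([], tf.string),
        'count': tf.io.FixedLenFeature([], tf.float32),
    }))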
def encode():
  """Creates a Beam pipeline that generates data, transforms it and encodes it
  in ELWC format.
  """
  output_path = "./output"

  options = PipelineOptions()
  options.view_as(StandardOptions).runner = "DirectRunner"

  with beam.Pipeline(options=options) as pipeline:
    with tft_beam.Context(temp_dir="./tmp"):
      raw_data = generate_data(100)
      input_data = (pipeline | beam.Create(raw_data))

      transformed_data, transform_fn = (
          (input_data, raw_metadata)
          | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

      elwc_coder = ELWCProtoCoder(context_specs, examples_specs)
      data, metadata = transformed_data

      _ = (data
           | beam.Map(elwc_coder.encode)
           | beam.io.WriteToTFRecord(
               file_path_prefix="{}/data".format(output_path),
               file_name_suffix=".tfrecords"))

      _ = (transform_fn | tft_beam.WriteTransformFn(output_path))
def testNestedContextCreateBaseTempDir(self):
  level_1_dir = self.get_temp_dir()
  with tft_beam.Context(temp_dir=level_1_dir):
    self.assertEqual(
        os.path.join(level_1_dir, tft_beam.Context._TEMP_SUBDIR),
        tft_beam.Context.create_base_temp_dir())
    level_2_dir = self.get_temp_dir()
    with tft_beam.Context(temp_dir=level_2_dir):
      self.assertEqual(
          os.path.join(level_2_dir, tft_beam.Context._TEMP_SUBDIR),
          tft_beam.Context.create_base_temp_dir())
    self.assertEqual(
        os.path.join(level_1_dir, tft_beam.Context._TEMP_SUBDIR),
        tft_beam.Context.create_base_temp_dir())
  with self.assertRaises(ValueError):
    tft_beam.Context.create_base_temp_dir()
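The behaviour exercised by this test can be reproduced directly; a minimal sketch, assuming the standard imports:

import tempfile
import tensorflow_transform.beam as tft_beam

# create_base_temp_dir() resolves against the innermost active Context;
# outside any Context it raises ValueError, as asserted in the test above.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
  print(tft_beam.Context.create_base_temp_dir())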
def run(pipeline_options, known_args):
  global force_tf_compat_v1
  argv = None  # if None, uses sys.argv
  pipeline_options = PipelineOptions(argv)
  pipeline = beam.Pipeline(options=pipeline_options)

  if "universal-sentence-encoder" in MODEL_URL and int(
      MODEL_URL.split("/")[-1]) <= 2:
    # https://github.com/tensorflow/transform/issues/160
    force_tf_compat_v1 = True

  with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                        force_tf_compat_v1=force_tf_compat_v1):
    print("Context force_tf_compat_v1: {}".format(
        tft_beam.Context.get_use_tf_compat_v1()))
    articles = (
        pipeline
        | beam.Create([
            {
                "id": "01",
                "text": "To be, or not to be: that is the question: "
            },
            {
                "id": "02",
                "text": "Whether 'tis nobler in the mind to suffer "
            },
            {
                "id": "03",
                "text": "The slings and arrows of outrageous fortune, "
            },
            {
                "id": "04",
                "text": "Or to take arms against a sea of troubles, "
            },
        ]))

    articles_dataset = (articles, get_metadata())
    transformed_dataset, transform_fn = (
        articles_dataset
        | "Extract embeddings" >>
        tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

    transformed_data, transformed_metadata = transformed_dataset

    _ = (
        transformed_data
        | "Print embeddings" >> beam.Map(print_pass)
        | "Write embeddings to TFRecords" >>
        beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix="{0}".format(known_args.output_dir),
            file_name_suffix=".tfrecords",
            coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema),
            num_shards=1))

  job = pipeline.run()
  if pipeline_options.get_all_options()["runner"] == "DirectRunner":
    job.wait_until_finish()
def setUp(self):
  super(CachedImplTest, self).setUp()
  self.base_test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  self._cache_dir = os.path.join(self.base_test_dir, 'cache')

  self._context = tft_beam.Context(temp_dir=self.get_temp_dir())
  self._context.__enter__()
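Since the Context is entered manually in setUp, the matching tearDown (not shown here) presumably exits it; a minimal sketch of what that counterpart could look like:

def tearDown(self):
  # Assumed counterpart to the manual __enter__ above, so the Context does
  # not leak into other tests.
  self._context.__exit__()
  super(CachedImplTest, self).tearDown()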
def main():
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (_RAW_DATA, _RAW_DATA_METADATA)
        | tft_beam.AnalyzeAndTransformDataset(_preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
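`_RAW_DATA`, `_RAW_DATA_METADATA` and `_preprocessing_fn` are module-level definitions not shown above; a hypothetical minimal shape for them:

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

# Hypothetical stand-ins for the module-level constants used in main().
_RAW_DATA = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
_RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))


def _preprocessing_fn(inputs):
  # Example transform: rescale x to [0, 1] over the whole dataset.
  return {'x_scaled': tft.scale_to_0_1(inputs['x'])}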
def data_transform():
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (
        (dict_features, data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset

  for i in range(len(transformed_data)):
    print("Initial: ", dict_features[i])
    print("Transformed: ", transformed_data[i])
def main():
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      converter = tft.coders.CsvCoder(['f1', 'y'], raw_data_metadata.schema)
      coder = tft.coders.ExampleProtoCoder(raw_data_metadata.schema)

      raw_data = (
          p
          | beam.io.ReadFromText('./train.csv')
          | beam.Map(lambda line: line.replace(', ', ','))
          | beam.Map(converter.decode)
          | beam.io.WriteToTFRecord('./train_tx', coder))
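This pipeline only materializes the decoded CSV rows as TFRecords of Example protos; a quick way to inspect what was written (a sketch, assuming TF2 eager execution and the default shard naming):

import tensorflow as tf

# Read back a couple of the records written to ./train_tx-* for inspection.
for serialized in tf.data.TFRecordDataset(
    tf.io.gfile.glob('./train_tx*')).take(2):
  print(tf.train.Example.FromString(serialized.numpy()))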
def generate_skipgrams(data_uri, feature_names, vocabulary_size=10,
                       window_size=2, negative_samples=0., save_path="temp"):

  def parse_tensor_f(x):
    xp = tf.io.parse_tensor(x, tf.int64)
    xp.set_shape([None])
    return {fname: xp[i] for i, fname in enumerate(feature_names)}

  raw_data = tf.data.TFRecordDataset(data_uri).map(
      parse_tensor_f).as_numpy_iterator()
  raw_data_schema = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          fname: tf.io.FixedLenFeature([], tf.int64)
          for fname in feature_names
      }))
  dataset = (raw_data, raw_data_schema)

  # Make the preprocessing_fn
  preprocessing_fn = make_preproc_func(vocabulary_size, window_size,
                                       negative_samples, feature_names)

  # Run the beam pipeline
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp(), desired_batch_size=2):
      transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
          dataset
          | "Make Skipgrams" >>
          tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      print('Transformed dataset:\n{}'.format(
          pprint.pformat(transformed_dataset)))

      transformed_data, transformed_metadata = transformed_dataset
      saved_results = (
          transformed_data
          | "Write to TFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix=save_path,
              file_name_suffix=".tfrecords",
              coder=tft.coders.example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))
      print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
      print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))

  # Return the list of paths of tfrecords
  num_rows_saved = len(transformed_data)
  return saved_results, num_rows_saved
def run_vocab():
  """Creates a pipeline to generate wordpiece vocab over a corpus."""

  vocab_pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        vocab_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(
            utils.FilterByCount(FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
        | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
        | 'WriteVocab' >> beam.io.WriteToText(
            vocab_file,
            shard_name_template='',
            append_trailing_newlines=False))
  return vocab_pipeline
def transform_tft(train_data, test_data, working_dir):
  options = PipelineOptions()
  options.view_as(StandardOptions).runner = 'DirectRunner'

  with beam.Pipeline(options=options) as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      data_shape = train_data[0][0].shape
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.Create(train_data)
          | 'CreateTrainData' >> beam.Map(lambda data: format(data)))
      raw_data_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec({
              IMAGE_KEY: tf.FixedLenFeature(list(data_shape), tf.float32),
              LABEL_KEY: tf.FixedLenFeature([], tf.int64)
          }))
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
              file_name_suffix='.tfrecords'))

      raw_test_data = (
          pipeline
          | 'ReadTestData' >> beam.Create(test_data)
          | 'CreateTestData' >> beam.Map(lambda data: format(data)))
      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | tft_beam.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
              file_name_suffix='.tfrecords'))

      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transformed_data(working_dir):
  """Transforms the data and generates the transform_fn."""

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    xi, yi = inputs["x"], inputs["y"]
    x_integerized = tft.compute_and_apply_vocabulary(
        xi, default_value=0, name="vocab")  # , top_k=VOCAB_SIZE
    y_integerized = tft.compute_and_apply_vocabulary(
        yi, default_value=0, name="label")  # , top_k=LABEL_SIZE
    return {"x": x_integerized, "y": y_integerized}

  # path_transform
  with tft_beam.Context(temp_dir=path_transform):
    transformed_dataset, transform_fn = (
        (xys, DATA_STRING_FEATURE_SPEC)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
  transformed_train_data, transformed_metadata = transformed_dataset
  _ = (transform_fn | tft_beam.WriteTransformFn(working_dir))
  return transformed_train_data
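`xys`, `DATA_STRING_FEATURE_SPEC` and `path_transform` are defined elsewhere in the module; since `AnalyzeAndTransformDataset` consumes a `(data, metadata)` pair, `DATA_STRING_FEATURE_SPEC` is presumably a `DatasetMetadata` over two string features, roughly like this hypothetical sketch:

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

# Hypothetical shapes for the module-level objects referenced above.
DATA_STRING_FEATURE_SPEC = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        "x": tf.io.FixedLenFeature([], tf.string),
        "y": tf.io.FixedLenFeature([], tf.string),
    }))
xys = [{"x": "hello", "y": "label_a"}, {"x": "world", "y": "label_b"}]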
def test_preprocessing_fn(self, with_deep_copy, features_config):
  # Fake features_config global variables to test their transformed values.
  features_config.TARGET_FEATURE = 't'
  features_config.ID_FEATURE = 'i'
  features_config.NUMERIC_FEATURES = ['n1', 'n2']
  features_config.CATEGORICAL_FEATURES = ['c1', 'c2']
  features_config.OOV_SIZE = 5
  features_config.VOCAB_SIZE = 10
  input_metadata = _create_input_metadata(features_config)
  input_data = [{
      't': [0.0],
      'i': [0],
      'n1': [1.0],
      'n2': [2.0],
      'c1': ['test1'],
      'c2': ['test2']
  }, {
      't': [1.0],
      'i': [1],
      'n1': [3.0],
      'n2': [4.0],
      'c1': ['test2'],
      'c2': ['test1']
  }]
  expected_data = [{
      't': 0.0,
      'i': [0],
      'tr_n1': -1.0,
      'tr_n2': -1.0,
      'tr_c1': 1,
      'tr_c2': 0
  }, {
      't': 1.0,
      'i': [1],
      'tr_n1': 1.0,
      'tr_n2': 1.0,
      'tr_c1': 0,
      'tr_c2': 1
  }]
  expected_metadata = _create_output_metadata(features_config, 0, 6)

  # Assert that transformed result matches expected_data & expected_metadata.
  with tft_beam.Context(use_deep_copy_optimization=with_deep_copy):
    self.assertAnalyzeAndTransformResults(
        input_data=input_data,
        input_metadata=input_metadata,
        preprocessing_fn=transformer.preprocessing_fn,
        expected_data=expected_data,
        expected_metadata=expected_metadata)
def test_non_training(self):
  """Tests case where dataset contains non-training (e.g. test) data."""

  with self.pipeline as p:
    with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')):
      df = self.raw_df[self.raw_df.split == 'TEST']
      dataset = self._get_dataset(p, df)
      transform_fn = p | tft_beam.ReadTransformFn(self.transform_fn_path)
      beam_pipeline._transform_and_write_tfr(
          dataset,
          self.tfr_writer,
          transform_fn=transform_fn,
          raw_metadata=self.raw_metadata,
          label='Test')

  self.assertFalse(glob.glob(os.path.join(self.test_dir, 'train*.gz')))
  self.assertFalse(glob.glob(os.path.join(self.test_dir, 'validation*.gz')))
  self.assertTrue(glob.glob(os.path.join(self.test_dir, 'test*.gz')))
def pipeline(root):
  """Pipeline instantiation function.

  Args:
    root: Source pipeline from which to extend.
  """

  preprocessing_fn = compute_vocab_fn if FLAGS.vocab_gen_mode else apply_vocab_fn

  with tft_beam.Context(temp_dir=FLAGS.temp_dir):
    processed_lines = (
        root
        # Read in TSV data.
        | beam.io.ReadFromText(data_path)
        # Fill in missing elements with the defaults (zeros).
        | "FillMissing" >> beam.ParDo(FillMissing())
        # For numerical features, set negatives to zero. Then take log(x+1).
        | "NegsToZeroLog" >> beam.ParDo(NegsToZeroLog())
        # For categorical features, mod the values with vocab size.
        | "HexToIntModRange" >> beam.ParDo(HexToIntModRange()))

    # CSV reader: List the cols in order, as dataset schema is not ordered.
    ordered_columns = [LABEL_KEY] + NUMERIC_FEATURE_KEYS + CATEGORICAL_FEATURE_KEYS
    converter = tft.coders.CsvCoder(
        ordered_columns, INPUT_METADATA.schema, delimiter=FLAGS.csv_delimeter)

    converted_data = (processed_lines
                      | "DecodeData" >> beam.Map(converter.decode))

    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (converted_data, INPUT_METADATA)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset

    if not FLAGS.vocab_gen_mode:
      # Write to CSV.
      transformed_csv_coder = tft.coders.CsvCoder(
          ordered_columns,
          transformed_metadata.schema,
          delimiter=FLAGS.csv_delimeter)
      _ = (
          transformed_data
          | "EncodeDataCsv" >> beam.Map(transformed_csv_coder.encode)
          | "WriteDataCsv" >> beam.io.WriteToText(output_path))
def transform_data(train_data_file, working_dir):
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      ordered_columns = ['C' + str(i) for i in range(10)]
      print(ordered_columns)
      converter = tft.coders.CsvCoder(ordered_columns,
                                      RAW_DATA_METADATA.schema)

      raw_data = (
          pipeline
          | 'Read Train Data' >> beam.io.ReadFromText(
              train_data_file, skip_header_lines=1)
          | 'Fix Commas in Train Data' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'Decode Train Data' >> MapAndFilterErrors(converter.decode))

      print("\n\n\n", raw_data.__dict__)
      print("\n\n\n", raw_data.producer)
      print("\n\n\n", raw_data.producer.__dict__)

      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'Encode Train Data' >> beam.Map(transformed_data_coder.encode)
          | 'Write Train Data' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (
          transform_fn
          | 'Write TransformFn' >> tft_beam.WriteTransformFn(working_dir))

      print("YOOHOO\n")
def main():

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
def _main(argv=None):
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument('--raw_examples_path', required=True)
  parser.add_argument('--raw_examples_schema_path', required=True)
  parser.add_argument('--transform_fn_dir', required=True)
  parser.add_argument('--transformed_examples_path_prefix', required=True)
  known_args, pipeline_args = parser.parse_known_args(argv)

  raw_examples_schema = load_schema(known_args.raw_examples_schema_path)
  raw_examples_coder = tft.coders.ExampleProtoCoder(raw_examples_schema)
  raw_examples_metadata = dataset_metadata.DatasetMetadata(raw_examples_schema)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
      transform_fn = pipeline | tft_beam.ReadTransformFn(
          known_args.transform_fn_dir)
      raw_examples = (
          pipeline
          | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
              known_args.raw_examples_path, coder=raw_examples_coder))
      raw_examples_dataset = (raw_examples, raw_examples_metadata)
      transformed_examples, transform_examples_metadata = (
          (raw_examples_dataset, transform_fn)
          | tft_beam.TransformDataset())
      transformed_examples_coder = tft.coders.ExampleProtoCoder(
          transform_examples_metadata.schema)
      transformed_examples | 'WriteTransformedExamples' >> beam.io.WriteToTFRecord(
          known_args.transformed_examples_path_prefix,
          file_name_suffix='.tfrecord.gz',
          coder=transformed_examples_coder)
def run_hub2emb(args):
  '''Runs the embedding generation pipeline'''
  options = beam.options.pipeline_options.PipelineOptions(**args)
  args = namedtuple("options", args.keys())(*args.values())

  raw_metadata = create_metadata()
  converter = tft.coders.CsvCoder(
      column_names=['text'], schema=raw_metadata.schema)

  with beam.Pipeline(args.runner, options=options) as pipeline:
    with tft_beam.Context(args.temporary_dir):
      # Read the sentences from the input file
      sentences = (
          pipeline
          | 'Read sentences from files' >> beam.io.ReadFromText(
              file_pattern='corpus/text.txt')
          # | 'Convert to dictionary' >> beam.Map(converter.decode)
      )

      sentences_dataset = (sentences, raw_metadata)
      preprocess_fn = make_preprocess_fn(args.module_url,
                                         args.random_projection_matrix)
      # Generate the embeddings for the sentence using the TF-Hub module
      embeddings_dataset, _ = (
          sentences_dataset
          | 'Extract embeddings' >>
          tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

      embeddings, transformed_metadata = embeddings_dataset
      # Write the embeddings to TFRecords files
      embeddings | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
          file_path_prefix='{}/emb'.format(args.output_dir),
          file_name_suffix='.tfrecords',
          coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema))
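`create_metadata` and `make_preprocess_fn` come from elsewhere in the module; given that the CsvCoder above is built over a single `text` column, `create_metadata` plausibly looks like this (an assumption, not the original definition):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

def create_metadata():
  # Single raw string feature named 'text', matching the CsvCoder above.
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(
          {'text': tf.io.FixedLenFeature([], tf.string)}))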
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
  """

  with beam.Pipeline() as pipeline:
    with tft_beam.Context(
        temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
      coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
      train_data = (
          pipeline
          | 'ReadTrain' >> beam.io.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
          | 'DecodeTrain' >> beam.Map(coder.decode))

      test_data = (
          pipeline
          | 'ReadTest' >> beam.io.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'))
          | 'DecodeTest' >> beam.Map(coder.decode))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]

        # Here tf.compat.v1.string_split behaves differently from
        # tf.strings.split.
        review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
        review_indices = tft.compute_and_apply_vocabulary(
            review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by compute_and_apply_vocabulary.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, RAW_DATA_METADATA)
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn))
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      transformed_test_data, _ = (
          ((test_data, RAW_DATA_METADATA), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      _ = (
          transformed_train_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by tft.TRANSFORM_FN_DIR and
      # tft.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata)
            | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              file_name_suffix='.gz'))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Applies a previously computed transform graph to the input examples.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - transform_graph: transform graph produced by a Transform component.
      - examples: raw examples to transform, with 'train' and 'eval' splits.
      - schema: schema of the raw examples.
    output_dict: Output dict from key to a list of artifacts, including:
      - transformed_examples: materialized transformed examples.
    exec_properties: A dict of execution properties.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  transform_graph_uri = artifact_utils.get_single_uri(
      input_dict[TRANSFORM_GRAPH_KEY])
  temp_path = os.path.join(transform_graph_uri, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  # transformed_schema_file = os.path.join(
  #     transform_graph_uri,
  #     tft.TFTransformOutput.TRANSFORMED_METADATA_DIR,
  #     'schema.pbtxt'
  # )
  # transformed_schema_proto = io_utils.parse_pbtxt_file(
  #     transformed_schema_file,
  #     schema_pb2.Schema()
  # )
  transformed_train_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
  transformed_eval_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')

  tf_transform_output = tft.TFTransformOutput(transform_graph_uri)
  # transform_output_dataset_metadata = dataset_metadata.DatasetMetadata(
  #     schema=transformed_schema_proto
  # )

  # transform_fn = (tf_transform_output.transform_raw_features,
  #                 transform_output_dataset_metadata)
  # feature_spec = schema_utils.schema_as_feature_spec(
  #     schema_proto).feature_spec
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
  schema_proto = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_proto)

  train_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                                'train')
  eval_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                               'eval')
  analyze_data_paths = [io_utils.all_files_pattern(train_data_uri)]
  transform_data_paths = [
      io_utils.all_files_pattern(train_data_uri),
      io_utils.all_files_pattern(eval_data_uri),
  ]
  materialize_output_paths = [
      os.path.join(transformed_train_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
  ]
  transform_data_list = self._MakeDatasetList(transform_data_paths,
                                              materialize_output_paths)
  analyze_data_list = self._MakeDatasetList(analyze_data_paths)

  with self._make_beam_pipeline() as pipeline:
    with tft_beam.Context(temp_dir=temp_path):
      # NOTE: Unclear if there is a difference between input_dataset_metadata
      # and transform_input_dataset_metadata. Look at Transform executor.
      decode_fn = tft.coders.ExampleProtoCoder(
          schema_proto, serialized=True).decode

      input_analysis_data = {}
      for dataset in analyze_data_list:
        infix = 'AnalysisIndex{}'.format(dataset.index)
        dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
        dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix) >> self._DecodeInputs(decode_fn))
        input_analysis_data[dataset.dataset_key] = dataset.decoded

      if not hasattr(tft_beam.analyzer_cache, 'DatasetKey'):
        input_analysis_data = (
            [
                dataset for dataset in input_analysis_data.values()
                if dataset is not None
            ]
            | 'FlattenAnalysisDatasetsBecauseItIsRequired' >>
            beam.Flatten(pipeline=pipeline))

      transform_fn = (
          (input_analysis_data, transform_input_dataset_metadata)
          | 'Analyze' >> tft_beam.AnalyzeDataset(
              tf_transform_output.transform_raw_features, pipeline=pipeline))

      for dataset in transform_data_list:
        infix = 'TransformIndex{}'.format(dataset.index)
        dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
        dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix) >> self._DecodeInputs(decode_fn))
        dataset.transformed, metadata = (
            ((dataset.decoded, transform_input_dataset_metadata),
             transform_fn)
            | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())
        dataset.transformed_and_serialized = (
            dataset.transformed
            | 'EncodeAndSerialize[{}]'.format(infix) >> beam.ParDo(
                self._EncodeAsSerializedExamples(), _GetSchemaProto(metadata)))
        _ = (
            dataset.transformed_and_serialized
            | 'Materialize[{}]'.format(infix) >> self._WriteExamples(
                dataset.materialize_output_path))