def testNumericAnalyzersWithSparseInputs(self):
  def repeat(in_tensor, value):
    batch_size = tf.shape(in_tensor)[0]
    return tf.ones([batch_size], value.dtype) * value

  input_data = [{'a': [4, 5, 6]}, {'a': [1, 2]}]
  input_metadata = self.toMetadata({'a': tf.VarLenFeature(tf.int64)})
  input_dataset = (input_data, input_metadata)
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with self.assertRaises(TypeError):
      def min_fn(inputs):
        return {'min': tft.map(repeat, inputs['a'], tft.min(inputs['a']))}
      _ = input_dataset | beam_impl.AnalyzeDataset(min_fn)

    with self.assertRaises(TypeError):
      def max_fn(inputs):
        return {'max': tft.map(repeat, inputs['a'], tft.max(inputs['a']))}
      _ = input_dataset | beam_impl.AnalyzeDataset(max_fn)

    with self.assertRaises(TypeError):
      def sum_fn(inputs):
        return {'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a']))}
      _ = input_dataset | beam_impl.AnalyzeDataset(sum_fn)

    with self.assertRaises(TypeError):
      def size_fn(inputs):
        return {'size': tft.map(repeat, inputs['a'], tft.size(inputs['a']))}
      _ = input_dataset | beam_impl.AnalyzeDataset(size_fn)

    with self.assertRaises(TypeError):
      def mean_fn(inputs):
        return {'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))}
      _ = input_dataset | beam_impl.AnalyzeDataset(mean_fn)
def build_pipeline(p, flags):
  """Sets up the Apache Beam pipeline for execution."""
  raw_data = (
      p
      | 'QueryTable' >> beam.io.Read(
          beam.io.BigQuerySource(query=query.get_query(flags.bq_table),
                                 project=flags.project_id,
                                 use_standard_sql=True))
      # Omit the 'Generate data' step if working with real data.
      | 'Generate data' >> beam.Map(_generate_fake_data)
      | 'Extract lifetime' >> beam.Map(append_lifetime_duration)
      | 'Extract label' >> beam.Map(append_label)
      | 'Generate label array' >> beam.Map(combine_censorship_duration))

  raw_train, raw_eval, raw_test = (
      raw_data
      | 'RandomlySplitData' >> randomly_split(
          train_size=.7, validation_size=.15, test_size=.15))

  raw_metadata = features.get_raw_dataset_metadata()
  preprocess_fn = features.preprocess_fn
  transform_fn = (
      (raw_train, raw_metadata)
      | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(preprocess_fn))
  (transform_fn
   | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(flags.output_dir))

  for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                ('Test', raw_test)]:
    transform_label = 'Transform{}'.format(dataset_type)
    t, metadata = (((dataset, raw_metadata), transform_fn)
                   | transform_label >> tft_beam.TransformDataset())
    if dataset_type == 'Train':
      (metadata
       | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(flags.output_dir, 'transformed_metadata'),
           pipeline=p))
    write_label = 'Write{}TFRecord'.format(dataset_type)
    t | write_label >> write_tfrecord(dataset_type, flags.output_dir,
                                      metadata)
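# `randomly_split` is used by `build_pipeline` above (and by other pipelines
# later in this file) but is not defined in this excerpt. Below is a minimal
# sketch of such a transform built on `beam.Partition`; the signature and the
# per-element random assignment are illustrative assumptions, not the
# original implementation.
import random

import apache_beam as beam


@beam.ptransform_fn
def randomly_split(pcoll, train_size, validation_size, test_size):
  """Hypothetical sketch: randomly partitions pcoll into three splits.

  Returns a (train, validation, test) tuple of PCollections whose sizes
  are, in expectation, proportional to the given ratios.
  """
  if abs(train_size + validation_size + test_size - 1.0) > 1e-6:
    raise ValueError('train_size + validation_size + test_size must be 1.')

  def partition_fn(unused_element, unused_num_partitions):
    # Assign each element to a split independently at random.
    r = random.random()
    if r < train_size:
      return 0
    if r < train_size + validation_size:
      return 1
    return 2

  return pcoll | 'Partition' >> beam.Partition(partition_fn, 3)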
def testTransformWithExcludedOutputs(self):
  def preprocessing_fn(inputs):
    return {
        'x_scaled': tft.scale_to_0_1(inputs['x']),
        'y_scaled': tft.scale_to_0_1(inputs['y'])
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 5, 'y': 1}, {'x': 1, 'y': 2}]
  input_metadata = self.toMetadata({
      'x': tf.FixedLenFeature((), tf.float32, 0),
      'y': tf.FixedLenFeature((), tf.float32, 0)
  })
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    transform_fn = ((input_data, input_metadata)
                    | beam_impl.AnalyzeDataset(preprocessing_fn))

    # Take the transform function and use TransformDataset to apply it to
    # some eval data, with the 'y' column missing.
    eval_data = [{'x': 6}]
    eval_metadata = self.toMetadata(
        {'x': tf.FixedLenFeature((), tf.float32, 0)})
    transformed_eval_dataset = (
        ((eval_data, eval_metadata), transform_fn)
        | beam_impl.TransformDataset(exclude_outputs=['y_scaled']))

    expected_transformed_eval_data = [{'x_scaled': 1.25}]
    expected_transformed_eval_schema = self.toMetadata(
        {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
    self.assertDatasetsEqual(
        transformed_eval_dataset,
        (expected_transformed_eval_data, expected_transformed_eval_schema))
def testAnalyzeBeforeTransform(self):
  def preprocessing_fn(inputs):
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}]
  input_metadata = self.toMetadata(
      {'x': tf.FixedLenFeature((), tf.float32, 0)})
  transformed_dataset, transform_fn = (
      (input_data, input_metadata)
      | beam_impl.AnalyzeAndTransformDataset(
          preprocessing_fn,
          os.path.join(self.get_temp_dir(), 'analyze_before_transform_at')))

  expected_transformed_data = [
      {'x_scaled': 0.75},
      {'x_scaled': 0.0},
      {'x_scaled': 1.0},
      {'x_scaled': 0.25},
  ]
  expected_transformed_metadata = self.toMetadata(
      {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
  self.assertDatasetsEqual(
      transformed_dataset,
      (expected_transformed_data, expected_transformed_metadata))

  # Take the transform function and use TransformDataset to apply it to
  # some eval data, and compare with expected output.
  eval_data = [{'x': 6}, {'x': 3}]
  transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                              | beam_impl.TransformDataset())
  expected_transformed_eval_data = [{'x_scaled': 1.25}, {'x_scaled': 0.5}]
  self.assertDatasetsEqual(
      transformed_eval_dataset,
      (expected_transformed_eval_data, expected_transformed_metadata))

  # Redo the test on the eval data, using AnalyzeDataset instead of
  # AnalyzeAndTransformDataset to generate transform_fn.
  transform_fn = (
      (input_data, input_metadata)
      | beam_impl.AnalyzeDataset(
          preprocessing_fn,
          os.path.join(self.get_temp_dir(), 'analyze_before_transform_a')))
  transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                              | beam_impl.TransformDataset())
  self.assertDatasetsEqual(
      transformed_eval_dataset,
      (expected_transformed_eval_data, expected_transformed_metadata))
def run(pipeline_options, known_args):
  pipeline = beam.Pipeline(options=pipeline_options)

  with impl.Context(known_args.transform_temp_dir):
    articles = (
        pipeline
        | 'Get Paths' >> beam.Create(get_paths(known_args.file_pattern))
        | 'Get Articles' >> beam.Map(get_articles)
        | 'Get Article' >> beam.FlatMap(lambda x: x))

    dataset = (articles, get_metadata())

    transform_fn = (
        dataset
        | 'Analyse dataset' >> impl.AnalyzeDataset(preprocess_fn))

    transformed_data_with_meta = (
        (dataset, transform_fn)
        | 'Transform dataset' >> impl.TransformDataset())

    transformed_data, transformed_metadata = transformed_data_with_meta

    transform_fn | 'Export Transform Fn' >> transform_fn_io.WriteTransformFn(
        known_args.transform_export_dir)

    (transformed_data
     | 'Convert to Insertable data' >> beam.Map(to_bq_row)
     | 'Write to BigQuery table' >> beam.io.WriteToBigQuery(
         project=known_args.bq_project,
         dataset=known_args.bq_dataset,
         table=known_args.bq_table,
         schema=get_bigquery_schema(),
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    if known_args.enable_tfrecord:
      transformed_data | 'Write TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
          file_path_prefix='{0}/{1}'.format(known_args.tfrecord_export_dir,
                                            'reuter'),
          file_name_suffix='.tfrecords',
          coder=tft_coders.example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

    if known_args.enable_debug:
      transformed_data | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix,
          file_name_suffix='.txt')

  job = pipeline.run()
  if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
    job.wait_until_finish()
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.string_to_int(s)
    x_centered_times_y_normalized = x_centered * y_normalized
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'},
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema({
          's': dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          'y': dataset_schema.ColumnSchema(
              tf.float32, [], dataset_schema.FixedColumnRepresentation()),
          'x': dataset_schema.ColumnSchema(
              tf.float32, [], dataset_schema.FixedColumnRepresentation())
      }))

  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    transform_fn = ((raw_data, raw_data_metadata)
                    | beam_impl.AnalyzeDataset(preprocessing_fn))
    transformed_dataset = (((raw_data, raw_data_metadata), transform_fn)
                           | beam_impl.TransformDataset())

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
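# For reference, with the toy data above the analyzers resolve to
# mean(x) == 2, y rescaled from [1, 3] to [0, 1], and a vocabulary in which
# the more frequent string 'hello' maps to index 0. The pprint output should
# therefore be close to the following (float formatting and key order vary
# across tf.Transform versions):
#
# [{'s_integerized': 0, 'x_centered': -1.0,
#   'x_centered_times_y_normalized': -0.0, 'y_normalized': 0.0},
#  {'s_integerized': 1, 'x_centered': 0.0,
#   'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.5},
#  {'s_integerized': 0, 'x_centered': 1.0,
#   'x_centered_times_y_normalized': 1.0, 'y_normalized': 1.0}]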
def build_graph(self):
  # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for
  # graph building.
  # num_lines = 0
  # for i in range(DATASET_NUM_SHARDS):
  #   _fname = '{}-{:05}-of-{:05}'.format(
  #       self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
  #   num_lines += sum(1 for _ in open(_fname))
  #   _fname_marked = '{}-{:05}-of-{:05}.{}'.format(
  #       self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
  #       PPGRAPH_EXT)
  #   shutil.move(_fname, _fname_marked)
  #   if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
  #     break

  # Set up the preprocessing pipeline for analyzing the dataset. The analyze
  # call is not combined with the transform call because we will parallelize
  # the transform call later. We had the issue that this process runs on a
  # single core and tends to cause OOM issues.
  pipeline = beam.Pipeline(runner=DirectRunner())

  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # TODO: maybe only use the train data (or a percentage of it) to build
    # the graph.
    raw_train_data = (
        pipeline
        | 'ReadTrainDataFile' >> textio.ReadFromText(
            'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
        | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                self.data_formatter.get_ordered_columns(),
                self.data_formatter.get_raw_data_metadata().schema).decode))

    # Combine data and schema into a dataset tuple. Note that we already used
    # the schema to read the CSV data, but we also need it to interpret
    # raw_data. This is where vocabulary, scale_to_0_1, sparse_to_dense, etc.
    # are applied.
    transform_fn = (
        (raw_train_data, self.data_formatter.get_raw_data_metadata())
        | beam_impl.AnalyzeDataset(
            PreprocessingFunction().transform_to_tfrecord))

    # Write the SavedModel and metadata to two subdirectories of working_dir,
    # given by `transform_fn_io.TRANSFORM_FN_DIR` and
    # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
    _ = (transform_fn
         | 'WriteTransformGraph' >>
         transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

  # Run the Beam preprocessing pipeline.
  st = time.time()
  result = pipeline.run()
  result.wait_until_finish()
  self.logger.info(
      'Transformation graph built and written in {:.2f} sec'.format(
          time.time() - st))
def make_transform_graph(output_dir, schema, features):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """
  tft_input_schema = make_tft_input_schema(
      schema, os.path.join(output_dir, STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(
      schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # preprocessing_fn does not use any analyzer, so we can run a local beam
  # job to properly make and write the transform function.
  temp_dir = os.path.join(output_dir, 'tmp')
  with beam.Pipeline('DirectRunner', options=None) as p:
    with tft_impl.Context(temp_dir=temp_dir):
      # Not going to transform, so no data is needed.
      train_data = p | beam.Create([])

      transform_fn = (
          (train_data, tft_input_metadata)
          | 'BuildTransformFn'  # noqa
          >> tft_impl.AnalyzeDataset(preprocessing_fn))  # noqa

      # Writes the transformed_metadata and transform_fn folders.
      _ = (transform_fn
           | 'WriteTransformFn' >>
           tft_beam_io.WriteTransformFn(output_dir))  # noqa

      # Write the raw_metadata.
      metadata_io.write_metadata(
          metadata=tft_input_metadata,
          path=os.path.join(output_dir, RAW_METADATA_DIR))
def preprocess(p, output_dir, check_path, data_size, bq_table,
               split_data_path, project_id):
  """Main processing pipeline reading, processing and storing processed data.

  Performs the following operations:
    - reads data from BigQuery
    - adds a hash key value to each row
    - scales data
    - shuffles and splits data into train / validation / test sets
    - oversamples train data
    - stores data as TFRecord
    - splits and stores test data into labels and features files

  Args:
    p: PCollection, initial pipeline.
    output_dir: string, path to directory to store output.
    check_path: string, path to directory to store data checks.
    data_size: tuple of float, ratio of data going respectively to train,
      validation and test sets.
    bq_table: string, name of table to read data from.
    split_data_path: string, path to directory to store train, validation
      and test raw datasets.
    project_id: string, GCP project id.

  Raises:
    ValueError: No test dataset found in pipeline output.
  """
  train_size, validation_size, test_size = data_size

  data = (p
          | 'ReadData' >> read_data(bq_table=bq_table, project_id=project_id))

  _ = data | 'StoreData' >> beam.io.WriteToText(
      posixpath.join(output_dir, check_path, 'processed_data.txt'))

  split_data = (
      data
      | 'RandomlySplitData' >> randomly_split(
          train_size=train_size,
          validation_size=validation_size,
          test_size=test_size))

  for k in split_data:
    split_data[k] |= 'AddHash_{}'.format(k.name) >> beam.ParDo(
        AddHash(),
        label_column=constants.LABEL_COLUMN,
        key_column=constants.KEY_COLUMN,
        dtype=k)

  # Splits test data into a features pipeline and a labels pipeline.
  if DatasetType.TEST not in split_data:
    raise ValueError('No test dataset found in pipeline output.')
  test_data = (split_data.pop(DatasetType.TEST)
               | 'SplitFeaturesLabels' >> split_features_labels(
                   constants.LABEL_COLUMN, constants.KEY_COLUMN))

  # Stores the test data features and labels pipelines separately.
  for k in test_data:
    _ = (test_data[k]
         | 'ParseJsonToString_{}'.format(k) >> beam.Map(json.dumps)
         | 'StoreSplitData_{}'.format(k) >> beam.io.WriteToText(
             posixpath.join(
                 output_dir, split_data_path,
                 'split_data_{}_{}.txt'.format(DatasetType.TEST.name, k))))

  meta_data = dataset_metadata.DatasetMetadata(make_input_schema())

  transform_fn = (
      (split_data[DatasetType.TRAIN], meta_data)
      | 'AnalyzeTrainDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

  _ = (transform_fn
       | 'WriteTransformFn' >> tft.beam.tft_beam_io.WriteTransformFn(
           posixpath.join(output_dir, constants.PATH_INPUT_TRANSFORMATION)))

  _ = (meta_data
       | 'WriteInputMetadata' >> tft.beam.tft_beam_io.WriteMetadata(
           posixpath.join(output_dir, constants.PATH_INPUT_SCHEMA),
           pipeline=p))

  transformed_metadata, transformed_data = {}, {}
  for k in [DatasetType.TRAIN, DatasetType.VAL]:
    transformed_data[k], transformed_metadata[k] = (
        ((split_data[k], meta_data), transform_fn)
        | 'Transform{}'.format(k) >> beam_impl.TransformDataset())

  transformed_data[DatasetType.TRAIN] = (
      transformed_data[DatasetType.TRAIN]
      | 'OverSampleTraining' >> oversampling())

  for k in transformed_data:
    _ = (transformed_data[k]
         | 'ShuffleData{}'.format(k) >> shuffle_data()
         | 'StoreData{}'.format(k) >> store_transformed_data(
             schema=transformed_metadata[k],
             path=posixpath.join(output_dir,
                                 constants.PATH_TRANSFORMED_DATA_SPLIT[k]),
             name=DatasetType(k).name))

  for k in transformed_data:
    _ = (transformed_data[k]
         | 'CheckSize{}'.format(k.name) >> check_size(
             name=DatasetType(k).name,
             path=posixpath.join(output_dir, check_path, k.name)))
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  # 1) The schema can be either defined in-memory or read from a
  # configuration file; in this case we are creating the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  transform_fn = ((train_data, input_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    coder = coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir, path),
             file_name_suffix='.tfrecord.gz'))

  _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

  _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

  # TODO(b/35300113) Remember to eventually also save the statistics.

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
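# `_Shuffle` is referenced above but defined elsewhere in the example module.
# A sketch of the usual implementation in the tf.Transform example code
# (pair each element with a random key, group, then drop the key); treat
# this as an assumed reconstruction rather than the exact original:
import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by grouping elements on a random key."""
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))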
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     use_tfxio=False,
                                     input_data_is_tfxio_format=False):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists
      of strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines or a list
      of tuples of frequency and text. Values should be the expected result
      of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.
    use_tfxio: If True, invoke AnalyzeAndTransformDataset using the new API
      that accepts standardized inputs (Arrow `RecordBatch`es). Otherwise
      use the old API that accepts Dicts.
    input_data_is_tfxio_format: If True, `input_data` and `test_data` are
      Arrow `RecordBatch`es and the `input_metadata` is
      `tfxio.tensor_adapter.TensorAdapterConfig`. Otherwise the input data
      is a list of Dicts and input_metadata is a `DatasetMetadata`.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn('expected_asset_file_contents is deprecated, '
                              'use expected_vocab_file_contents')

  expected_vocab_file_contents = (expected_vocab_file_contents or
                                  expected_asset_file_contents or {})
  del expected_asset_file_contents

  if not use_tfxio and input_data_is_tfxio_format:
    raise ValueError('Unable to feed TFXIO input format to the old, '
                     'non-TFXIO API.')
  compatibility_tfxio_needed = use_tfxio and not input_data_is_tfxio_format

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                          dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size,
                           use_tfxio=use_tfxio):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data,
                                                           reshuffle=False)
      if compatibility_tfxio_needed:
        legacy_input_metadata = input_metadata
        input_data, input_metadata = self.convert_to_tfxio_api_inputs(
            input_data, input_metadata, label='input_data')
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        if compatibility_tfxio_needed:
          test_data, _ = self.convert_to_tfxio_api_inputs(
              test_data, legacy_input_metadata, label='test_data')
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (transformed_data
             | beam.Map(transformed_data_coder.encode)
             | beam.io.tfrecordio.WriteToTFRecord(transformed_data_path,
                                                  shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(
        tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(expected_metadata.schema, transformed_schema)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     force_tf_compat_v1=True):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: Input data formatted in one of two ways:
      * A sequence of dicts whose values are one of: strings, lists of
        strings, numeric types or a pair of those. Must have at least one
        key so that we can infer the batch size, or
      * A sequence of pa.RecordBatch.
    input_metadata: One of -
      * DatasetMetadata describing input_data if `input_data` are dicts.
      * TensorAdapterConfig otherwise.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines or a list
      of tuples of frequency and text. Values should be the expected result
      of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
      directory is created.
    force_tf_compat_v1: A `Boolean`. If `True`, TFT's public APIs use
      Tensorflow in compat.v1 mode.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn('expected_asset_file_contents is deprecated, '
                              'use expected_vocab_file_contents')

  expected_vocab_file_contents = (expected_vocab_file_contents or
                                  expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                          dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size,
                           force_tf_compat_v1=force_tf_compat_v1):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data,
                                                           reshuffle=False)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (transformed_data
             | beam.Map(transformed_data_coder.encode)
             | beam.io.tfrecordio.WriteToTFRecord(transformed_data_path,
                                                  shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    shapes = {
        f.name:
        [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
        for f in transformed_metadata.schema.feature
    }
    transformed_data = [
        _format_example_as_numpy_dict(e, shapes) for e in examples
    ]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(
        tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(expected_metadata.schema, transformed_schema)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
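# A minimal illustration of how the helper above is typically invoked from a
# test case. The feature spec, preprocessing function and expected values are
# hypothetical (borrowed from the scale_to_0_1 examples earlier in this
# file), not part of the helper itself.
def testScaleTo01(self):
  input_data = [{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(
          {'x': tf.io.FixedLenFeature([], tf.float32)}))

  def preprocessing_fn(inputs):
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  # min == 1 and max == 5, so the four values map to 0.75, 0.0, 1.0, 0.25.
  expected_data = [{'x_scaled': 0.75}, {'x_scaled': 0.0},
                   {'x_scaled': 1.0}, {'x_scaled': 0.25}]
  self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                        preprocessing_fn, expected_data)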
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE
      or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of
      computing it over the data. Hint: this is useful for transforming
      eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(key)] = (
          transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE))

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with beam_impl.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            | 'CleanData' >> beam.Map(
                taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec))

      if transform_dir is None:
        transform_fn = (
            (raw_data, raw_data_metadata)
            | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
        _ = (transform_fn
             | 'WriteTransformFn' >>
             transform_fn_io.WriteTransformFn(working_dir))
      else:
        transform_fn = pipeline | transform_fn_io.ReadTransformFn(
            transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (transformed_data
           | 'SerializeExamples' >> beam.Map(coder.encode)
           | 'WriteExamples' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, outfile_prefix),
               file_name_suffix='.gz'))
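# `_fill_in_missing` is used by `preprocessing_fn` above but not shown in
# this excerpt. A sketch of the behavior the comments imply (densify a
# rank-2 SparseTensor with at most one value per row, filling gaps with ''
# or 0); this mirrors the TFX taxi example's helper but is reproduced here
# as an assumption:
def _fill_in_missing(x):
  """Replaces missing values in a SparseTensor and converts it to dense.

  Args:
    x: A SparseTensor of rank 2 with dense shape [None, 1].

  Returns:
    A rank-1 dense tensor in which missing values were filled with
    '' (for strings) or 0 (for numeric dtypes).
  """
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)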
def test_non_frequency_vocabulary_merge(self):
  """This test compares vocabularies produced with and without cache."""
  mi_vocab_name = 'mutual_information_vocab'
  adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
  weighted_frequency_vocab_name = 'weighted_frequency_vocab'

  def preprocessing_fn(inputs):
    _ = tft.vocabulary(
        inputs['s'],
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=mi_vocab_name,
        min_diff_from_avg=0.1,
        use_adjusted_mutual_info=False)

    _ = tft.vocabulary(
        inputs['s'],
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=adjusted_mi_vocab_name,
        min_diff_from_avg=1.0,
        use_adjusted_mutual_info=True)

    _ = tft.vocabulary(
        inputs['s'],
        weights=inputs['weight'],
        store_frequency=True,
        vocab_filename=weighted_frequency_vocab_name,
        use_adjusted_mutual_info=False)
    return inputs

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  input_data = [
      dict(s='a', weight=1, label=1),
      dict(s='a', weight=0.5, label=1),
      dict(s='b', weight=0.75, label=1),
      dict(s='b', weight=1, label=0),
  ]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'label': tf.io.FixedLenFeature([], tf.int64),
          'weight': tf.io.FixedLenFeature([], tf.float32),
      }))
  input_data_dict = {
      span_0_key: input_data,
      span_1_key: input_data,
  }

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each value in input_data_dict as a PCollection.
    input_data_pcoll_dict = {}
    for a, b in six.iteritems(input_data_dict):
      input_data_pcoll_dict[a] = p | a >> beam.Create(b)

    transform_fn_with_cache, output_cache = (
        (flat_data, input_data_pcoll_dict, {}, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
    transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_with_cache')
    _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
        transform_fn_with_cache_dir)

    expected_accumulators = {
        b'__v0__VocabularyAccumulate[vocabulary]-\xd3\xe0p\x82\xb1\xa0z\xa3S\xd7N8@\x8f\xa2\xd7\xa1\x9e\xac;': [
            b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
            b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
        ],
        b'__v0__VocabularyAccumulate[vocabulary_1]-A\xc7_0\xee\xff\x88@E<\xde\xcb\x8d\xff5\xebyZZ\x8d': [
            b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
            b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
        ],
        b"__v0__VocabularyAccumulate[vocabulary_2]-\x97\x1c>\x851\x94'\xdc\xdf\xfd\xcc\x86\xb7\xb8\xe1\xe8*\x89B\t": [
            b'["a", 1.5]', b'["b", 1.75]'
        ],
    }
    spans = [span_0_key, span_1_key]
    self.assertCountEqual(output_cache.keys(), spans)
    for span in spans:
      self.assertCountEqual(output_cache[span].keys(),
                            expected_accumulators.keys())
      for idx, (key, value) in enumerate(
          six.iteritems(expected_accumulators)):
        beam_test_util.assert_that(
            output_cache[span][key],
            beam_test_util.equal_to(value),
            label='AssertCache[{}][{}]'.format(span, idx))

  # 4 from analysis on each of the input spans.
  self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 6)
  self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(input_data * 2)

    transform_fn_no_cache = ((flat_data, input_metadata)
                             | beam_impl.AnalyzeDataset(preprocessing_fn))
    transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                             'transform_fn_no_cache')
    _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
        transform_fn_no_cache_dir)

  # 4 from analysis on each of the input spans.
  self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)
  self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

  tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
  tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

  for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name,
                         weighted_frequency_vocab_name):
    cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
    no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
        vocab_filename)
    with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
        no_cache_path, 'rb') as f2:
      self.assertEqual(
          f1.readlines(), f2.readlines(),
          'vocab with cache != vocab without cache for: {}'.format(
              vocab_filename))
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   setup_file,
                   ts1,
                   ts2,
                   project=None,
                   max_rows=None,
                   mode=None,
                   stage=None,
                   preprocessing_fn=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE
      or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    setup_file: Setup file passed to the Dataflow job when mode is 'cloud'.
    ts1: Start timestamp passed to make_sql when querying BigQuery.
    ts2: End timestamp passed to make_sql when querying BigQuery.
    project: GCP project in which to run the job.
    max_rows: Number of rows to query from BigQuery
    mode: 'local' to run with DirectRunner, 'cloud' to run with
      DataflowRunner.
    stage: Pipeline stage; defaults to 'train'.
    preprocessing_fn: Optional preprocessing function to use instead of the
      default def_preprocessing_fn.
  """

  def def_preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in ts.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[key] = transform.scale_to_z_score(inputs[key])

    for key in ts.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[key] = transform.string_to_int(
          inputs[key], top_k=ts.VOCAB_SIZE, num_oov_buckets=ts.OOV_SIZE)

    for key in ts.BUCKET_FEATURE_KEYS:
      outputs[key] = transform.bucketize(inputs[key],
                                         ts.FEATURE_BUCKET_COUNT)

    for key in ts.CATEGORICAL_FEATURE_KEYS:
      outputs[key] = inputs[key]

    # Was this passenger a big tipper?
    def convert_label(label):
      taxi_fare = inputs[ts.FARE_KEY]
      return tf.where(
          tf.is_nan(taxi_fare),
          tf.cast(tf.zeros_like(taxi_fare), tf.int64),
          # Test if the tip was > 20% of the fare.
          tf.cast(
              tf.greater(label, tf.multiply(taxi_fare, tf.constant(0.2))),
              tf.int64))

    outputs[ts.LABEL_KEY] = transform.apply_function(convert_label,
                                                     inputs[ts.LABEL_KEY])
    return outputs

  preprocessing_fn = preprocessing_fn or def_preprocessing_fn

  print('ts1 %s, ts2 %s' % (ts1, ts2))
  raw_feature_spec = ts.get_raw_feature_spec()
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  temp_dir = os.path.join(working_dir, 'tmp')
  if stage is None:
    stage = 'train'

  if mode == 'local':
    options = {'project': project}
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DirectRunner'
  elif mode == 'cloud':
    options = {
        'job_name': 'tft-' + stage + '-' + str(uuid.uuid4()),
        'temp_location': temp_dir,
        'project': project,
        'save_main_session': True,
        'setup_file': setup_file
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = 'DataflowRunner'
  else:
    raise ValueError('Invalid mode %s.' % mode)

  with beam.Pipeline(runner, options=pipeline_options) as pipeline:
    with beam_impl.Context(temp_dir=temp_dir):
      csv_coder = ts.make_csv_coder()
      if 'csv' in input_handle.lower():
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = make_sql(
            input_handle, ts1, ts2, stage, max_rows=max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

      raw_data |= 'CleanData' >> beam.Map(ts.clean_raw_data_dict)

      transform_fn = (
          (raw_data, raw_data_metadata)
          | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(working_dir))

      # Shuffling the data before materialization will improve training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      if 'csv' not in input_handle.lower():  # If querying BigQuery.
        _ = (raw_data
             | beam.Map(csv_coder.encode)
             | beam.io.WriteToText(
                 os.path.join(working_dir, '{}.csv'.format(stage)),
                 num_shards=1))

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (transformed_data
           | 'SerializeExamples' >> beam.Map(coder.encode)
           | 'WriteExamples' >> beam.io.WriteToTFRecord(
               os.path.join(working_dir, outfile_prefix),
               compression_type=beam.io.filesystem.CompressionTypes.GZIP))
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
    delimiter: the column delimiter for the CSV format.
  """
  # 1) The schema can be either defined in-memory or read from a
  # configuration file; in this case we are creating the schema in-memory.
  input_schema = criteo.make_input_schema()

  # 2) Configure the coder to map the source file column names to a
  # dictionary of key -> tensor_proto with the appropriate type derived
  # from the input_schema.
  coder = criteo.make_csv_coder(input_schema, delimiter)

  # 3) Read from text using the coder.
  train_data = (
      pipeline
      | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
      | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (
      pipeline
      | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
      | 'ParseEvalCsv' >> beam.Map(coder.decode))

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
  transform_fn = ((train_data, input_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    # Alternative coders, kept for reference:
    # coder = criteo.make_csv_coder(input_schema, delimiter)
    # column_names = ['clicked']
    # for name in INTEGER_COLUMN_NAMES:
    #   column_names.append(name)
    # for name in CATEGORICAL_COLUMN_NAMES:
    #   column_names.append(name)
    # coder = coders.CsvCoder(column_names, metadata.schema, delimiter=',')
    coder = coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir, path),
             file_name_suffix='.tfrecord.gz'))

  _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

  _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

  # TODO(b/35300113) Remember to eventually also save the statistics.

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = criteo.make_input_schema(mode=predict_mode)
    csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
        | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
      movies_data=movies_sideinput,
      rating_threshold=args.eval_score_threshold,
      is_ranking_problem=(args.eval_type == RANKING),
      is_train=False,
      num_ranking_candidate_movie_ids=args.num_ranking_candidate_movie_ids)

  # TFTransform based preprocessing.
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema=movielens.make_examples_schema())
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  transform_fn = ((train_data, raw_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, raw_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    coder = tft_coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, path),
             file_name_suffix='.tfrecord.gz'))
def run(flags, pipeline_args):
  """Run Apache Beam pipeline to generate TFRecords for Survival Analysis."""
  options = PipelineOptions(flags=[], **pipeline_args)
  options.view_as(WorkerOptions).machine_type = flags.machine_type
  temp_dir = os.path.join(flags.output_dir, 'tmp')
  runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

  files = tf.gfile.Glob(flags.input_dir + '*')
  if not flags.cloud:
    files = files[0:20]  # If running locally for testing, process fewer files.
  logging.warning('Number of files: ' + str(len(files)))
  labels = get_labels_array(
      'gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv')

  with beam.Pipeline(runner, options=options) as p:
    with tft_beam.Context(temp_dir=temp_dir):
      input_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

      filenames = (p | 'Create filenames' >> beam.Create(files))
      nii = (filenames | 'Read NII' >> beam.Map(read_nii))
      nii_with_labels = (
          nii | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

      raw_train, raw_eval, raw_test = (
          nii_with_labels
          | 'RandomlySplitData' >> randomly_split(
              train_size=.7, validation_size=.15, test_size=.15))

      raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(lambda x: x[1])
      raw_eval = raw_eval | 'FlattenEval' >> beam.FlatMap(lambda x: x[1])
      raw_test = raw_test | 'FlattenTest' >> beam.FlatMap(lambda x: x[1])

      raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

      # Note: the result of AnalyzeAndTransformDataset below is not used
      # further; transform_fn is recomputed by AnalyzeDataset afterwards.
      dataset_and_metadata, transform_fn = (
          (raw_train, input_metadata)
          | 'TransformData' >> tft_beam.AnalyzeAndTransformDataset(
              features.preprocess))
      transform_fn = (
          (raw_train, input_metadata)
          | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               flags.output_dir))

      for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                    ('Predict', raw_test)]:
        transform_label = 'Transform{}'.format(dataset_type)
        t, metadata = (((dataset, input_metadata), transform_fn)
                       | transform_label >> tft_beam.TransformDataset())
        if dataset_type == 'Train':
          _ = (metadata
               | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                   os.path.join(flags.output_dir, 'transformed_metadata'),
                   pipeline=p))
        write_label = 'Write{}TFRecord'.format(dataset_type)
        _ = t | write_label >> WriteTFRecord(dataset_type, flags.output_dir,
                                             metadata)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists
      of strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean indicating whether all elements in
      the transformed metadata are asserted to be equal to the expected
      metadata. If True, only transformed feature names, dtypes and
      representations are asserted.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines. Values
      should be the expected result of calling f.readlines() on the given
      asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.logging.warn('expected_asset_file_contents is deprecated, use '
                    'expected_vocab_file_contents')
  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = self.get_temp_dir()
  with beam_impl.Context(temp_dir=temp_dir,
                         desired_batch_size=desired_batch_size):
    if test_data is None:
      (transformed_data, transformed_metadata), transform_fn = (
          (input_data, input_metadata)
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
    else:
      transform_fn = ((input_data, input_metadata)
                      | beam_impl.AnalyzeDataset(preprocessing_fn))
      transformed_data, transformed_metadata = (
          ((test_data, input_metadata), transform_fn)
          | beam_impl.TransformDataset())

    # Write transform_fn so we can test its assets.
    if expected_vocab_file_contents:
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

  if expected_data is not None:
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  if expected_metadata:
    # Now that the pipeline has run, transformed_metadata.deferred_metadata
    # should be a list containing a single DatasetMetadata with the full
    # metadata.
    assert len(transformed_metadata.deferred_metadata) == 1
    transformed_metadata = transformed_metadata.deferred_metadata[0]

    if only_check_core_metadata:
      # preprocessing_fn may add metadata to the column schema that is only
      # relevant to the internal implementation, such as vocabulary_file. As
      # such, only check that feature names, dtypes and representations are
      # as expected.
      self.assertSameElements(
          transformed_metadata.schema.column_schemas.keys(),
          expected_metadata.schema.column_schemas.keys())
      for k, v in transformed_metadata.schema.column_schemas.iteritems():
        expected_schema = expected_metadata.schema.column_schemas[k]
        self.assertEqual(expected_schema.representation, v.representation,
                         "representation doesn't match for feature '%s'" % k)
        self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                         "dtype doesn't match for feature '%s'" % k)
    else:
      # Check the entire DatasetMetadata is as expected.
      # Use an extra assertEqual for the schemas, since the full-metadata
      # assertEqual error message is not conducive to debugging.
      self.assertEqual(expected_metadata.schema.column_schemas,
                       transformed_metadata.schema.column_schemas)
      self.assertEqual(expected_metadata, transformed_metadata)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.gfile.Open(full_filename) as f:
      self.assertEqual(f.readlines(), file_contents)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists
      of strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean indicating whether all elements in
      the transformed metadata are asserted to be equal to the expected
      metadata. If True, only transformed feature names, dtypes and
      representations are asserted.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
  """
  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset
  # composed.
  #
  # Also, the dataset_metadata that is returned along with
  # `transformed_data` is incomplete as it does not contain the deferred
  # components, so we instead inspect the metadata returned along with the
  # transform function.
  temp_dir = self.get_temp_dir()
  with beam_impl.Context(temp_dir=temp_dir):
    transform_fn, transformed_metadata = (
        (input_data, input_metadata)
        | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
    transformed_data, _ = (
        ((input_data, input_metadata), (transform_fn, transformed_metadata))
        | 'TransformDataset' >> beam_impl.TransformDataset())

  if expected_data:
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  if not expected_metadata:
    return

  transformed_metadata = self._resolveDeferredMetadata(transformed_metadata)

  if only_check_core_metadata:
    # preprocessing_fn may add metadata to the column schema that is only
    # relevant to the internal implementation, such as vocabulary_file. As
    # such, only check that feature names, dtypes and representations are
    # as expected.
    self.assertSameElements(
        transformed_metadata.schema.column_schemas.keys(),
        expected_metadata.schema.column_schemas.keys())
    for k, v in transformed_metadata.schema.column_schemas.iteritems():
      expected_schema = expected_metadata.schema.column_schemas[k]
      self.assertEqual(expected_schema.representation, v.representation,
                       "representation doesn't match for feature '%s'" % k)
      self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                       "dtype doesn't match for feature '%s'" % k)
  else:
    # Check the entire DatasetMetadata is as expected.
    # Use an extra assertEqual for the schemas, since the full-metadata
    # assertEqual error message is not conducive to debugging.
    self.assertEqual(expected_metadata.schema.column_schemas,
                     transformed_metadata.schema.column_schemas)
    self.assertEqual(expected_metadata, transformed_metadata)
def test_non_frequency_vocabulary_merge(self):
  """This test compares vocabularies produced with and without cache."""
  mi_vocab_name = 'mutual_information_vocab'
  adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
  weighted_frequency_vocab_name = 'weighted_frequency_vocab'

  def preprocessing_fn(inputs):
    _ = tft.vocabulary(
        inputs['s'],
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=mi_vocab_name,
        min_diff_from_avg=0.1,
        use_adjusted_mutual_info=False)
    _ = tft.vocabulary(
        inputs['s'],
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=adjusted_mi_vocab_name,
        min_diff_from_avg=1.0,
        use_adjusted_mutual_info=True)
    _ = tft.vocabulary(
        inputs['s'],
        weights=inputs['weight'],
        store_frequency=True,
        vocab_filename=weighted_frequency_vocab_name,
        use_adjusted_mutual_info=False)
    return inputs

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  input_data = [
      dict(s='a', weight=1, label=1),
      dict(s='a', weight=0.5, label=1),
      dict(s='b', weight=0.75, label=1),
      dict(s='b', weight=1, label=0),
  ]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'label': tf.io.FixedLenFeature([], tf.int64),
          'weight': tf.io.FixedLenFeature([], tf.float32),
      }))
  input_data_dict = {
      span_0_key: input_data,
      span_1_key: input_data,
  }
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    # Wrap dict values in list() so Flatten receives a concrete sequence of
    # PCollections under Python 3.
    flat_data = list(input_data_dict.values()) | 'Flatten' >> beam.Flatten()
    transform_fn_with_cache, output_cache = (
        (flat_data, input_data_dict, {}, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))

    expected_accumulators = {
        '__v0__VocabularyAccumulate--vocabulary--':
            [b'["a", [2, 1.0, 0.0, 1.0]]', b'["b", [2, 0.5, 0.0, 1.0]]'],
        '__v0__VocabularyAccumulate--vocabulary_1--':
            [b'["a", [2, 1.0, 0.0, 1.0]]', b'["b", [2, 0.5, 0.0, 1.0]]'],
        '__v0__VocabularyAccumulate--vocabulary_2--':
            [b'["a", 1.5]', b'["b", 1.75]'],
    }
    spans = [span_0_key, span_1_key]
    self.assertCountEqual(output_cache.keys(), spans)
    for span in spans:
      self.assertCountEqual(output_cache[span].keys(),
                            expected_accumulators.keys())
      for key, value in six.iteritems(expected_accumulators):
        self.assertCountEqual(output_cache[span][key], value)

    transform_fn_no_cache = ((input_data * 2, input_metadata)
                             | beam_impl.AnalyzeDataset(preprocessing_fn))

    transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_with_cache')
    _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
        transform_fn_with_cache_dir)

    transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                             'transform_fn_no_cache')
    _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
        transform_fn_no_cache_dir)

  tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
  tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

  for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name,
                         weighted_frequency_vocab_name):
    cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
    no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
        vocab_filename)
    with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
        no_cache_path, 'rb') as f2:
      self.assertEqual(
          f1.readlines(), f2.readlines(),
          'vocab with cache != vocab without cache for: {}'.format(
              vocab_filename))
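# For reference, the vocabulary files compared line-by-line above are plain
# text; with store_frequency=True each line has the form
# b'<frequency> <token>\n'. A minimal decoding sketch (the function name is an
# assumption, but the parsing mirrors the frequency handling later in this
# file):
import tensorflow as tf


def read_frequency_vocab(path):
  """Parses a vocabulary file written with store_frequency=True."""
  entries = []
  with tf.io.gfile.GFile(path, 'rb') as f:
    for line in f:
      # Split off the leading frequency; the remainder of the line is the
      # token, which may itself contain spaces.
      frequency, token = line.split(b' ', 1)
      entries.append((token.rstrip(b'\n'), float(frequency)))
  return entries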
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     only_check_core_metadata=False,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists
      of strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    only_check_core_metadata: A boolean indicating whether all elements of
      the transformed metadata are asserted to be equal to the expected
      metadata. If True, only transformed feature names, dtypes and
      representations are asserted.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines, or a
      list of (text, frequency) tuples. Values should be the expected
      result of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.logging.warn('expected_asset_file_contents is deprecated, use '
                    'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = tempfile.mkdtemp(prefix=self._testMethodName,
                              dir=self.get_temp_dir())
  with beam_pipeline or beam.Pipeline(runner=self._makeRunner()) as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  if expected_data is not None:
    examples = tf.python_io.tf_record_iterator(path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    transformed_metadata = tf_transform_output.transformed_metadata

    if only_check_core_metadata:
      # preprocessing_fn may add metadata to the column schema that is only
      # relevant to the internal implementation (e.g. vocabulary_file). As
      # such, only check that feature names, dtypes and representations are
      # as expected.
      self.assertSameElements(
          transformed_metadata.schema.column_schemas.keys(),
          expected_metadata.schema.column_schemas.keys())
      for k, v in six.iteritems(transformed_metadata.schema.column_schemas):
        expected_schema = expected_metadata.schema.column_schemas[k]
        self.assertEqual(expected_schema.representation, v.representation,
                         "representation doesn't match for feature '%s'" % k)
        self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                         "dtype doesn't match for feature '%s'" % k)
    else:
      # Check that the entire DatasetMetadata is as expected. Use an extra
      # assertEqual for schemas, since the full-metadata assertEqual error
      # message is not conducive to debugging.
      self.assertEqual(expected_metadata.schema.column_schemas,
                       transformed_metadata.schema.column_schemas)
      self.assertEqual(expected_metadata, transformed_metadata)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.gfile.Open(full_filename) as f:
      file_lines = f.readlines()

      # Store-frequency case.
      if isinstance(file_contents[0], tuple):
        word_and_frequency_list = []
        for content in file_lines:
          frequency, word = content.split(' ', 1)
          word_and_frequency_list.append(
              (word.strip('\n'), float(frequency.strip('\n'))))
        # Unpack with zip(*...) rather than indexing the zip object, which
        # is not subscriptable under Python 3.
        actual_words, actual_frequencies = zip(*word_and_frequency_list)
        expected_words, expected_frequencies = zip(*file_contents)
        self.assertAllEqual(actual_words, expected_words)
        np.testing.assert_almost_equal(actual_frequencies,
                                       expected_frequencies)
      else:
        file_lines = [content.strip('\n') for content in file_lines]
        self.assertAllEqual(file_lines, file_contents)
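# Hypothetical expectations exercising both branches of the vocabulary check
# above: plain vocabularies are compared as newline-stripped lines, while
# store_frequency vocabularies are compared as (token, frequency) tuples,
# matching the (word, float(frequency)) pairs parsed from the file. The
# filenames and values here are illustrative only.
example_expected_vocab_file_contents = {
    'plain_vocab': ['hello', 'world'],
    'frequency_vocab': [('hello', 2.0), ('world', 1.0)],
}
# self.assertAnalyzeAndTransformResults(
#     input_data, input_metadata, preprocessing_fn,
#     expected_vocab_file_contents=example_expected_vocab_file_contents)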
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are either strings, lists
      of strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines, or a
      list of (text, frequency) tuples. Values should be the expected
      result of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
      expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as the output directory; otherwise a new
      unique directory is created.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
      expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  elif expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn(
        'expected_asset_file_contents is deprecated, use '
        'expected_vocab_file_contents')

  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                          dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(temp_dir=temp_dir,
                           desired_batch_size=desired_batch_size):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    self.assertEqual(expected_metadata,
                     tf_transform_output.transformed_metadata)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    with tf.io.gfile.GFile(full_filename, 'rb') as f:
      file_lines = f.readlines()

      # Store-frequency case.
      if isinstance(file_contents[0], tuple):
        word_and_frequency_list = []
        for content in file_lines:
          frequency, word = content.split(b' ', 1)
          word_and_frequency_list.append(
              (word.strip(b'\n'), float(frequency.strip(b'\n'))))
        # The parsed file is the actual output; file_contents holds the
        # expected values.
        actual_words, actual_frequencies = zip(*word_and_frequency_list)
        expected_words, expected_frequencies = zip(*file_contents)
        self.assertAllEqual(actual_words, expected_words)
        np.testing.assert_almost_equal(actual_frequencies,
                                       expected_frequencies)
      else:
        file_lines = [content.strip(b'\n') for content in file_lines]
        self.assertAllEqual(file_lines, file_contents)
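# Sketch of overriding the pipeline and output directory via the optional
# beam_pipeline and temp_dir arguments of the helper above (hedged: the
# DirectRunner choice and the directory prefix are assumptions):
import tempfile
import apache_beam as beam

custom_temp_dir = tempfile.mkdtemp(prefix='tft_assert_test_')
custom_pipeline = beam.Pipeline(runner='DirectRunner')
# self.assertAnalyzeAndTransformResults(
#     input_data, input_metadata, preprocessing_fn,
#     beam_pipeline=custom_pipeline, temp_dir=custom_temp_dir)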
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None,
                                     force_tf_compat_v1=False,
                                     output_record_batches=False):
  """Assert that input data and metadata are transformed as expected.

  This method asserts that the transformed data and transformed metadata
  match expected_data and expected_metadata.

  Args:
    input_data: Input data formatted in one of two ways:
      * A sequence of dicts whose values are one of: strings, lists of
        strings, numeric types or a pair of those. Must have at least one
        key so that we can infer the batch size, or
      * A sequence of pa.RecordBatch.
    input_metadata: One of:
      * DatasetMetadata describing input_data if `input_data` are dicts.
      * TensorAdapterConfig otherwise.
    preprocessing_fn: A function taking a dict of tensors and returning a
      dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
      input_data, but representing the output after transformation. If
      supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
      data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
      filenames to their expected content as a list of text lines, or a
      list of (text, frequency) tuples. Values should be the expected
      result of calling f.readlines() on the given asset files.
    test_data: (optional) If this is provided then instead of calling
      AnalyzeAndTransformDataset with input_data, this function will call
      AnalyzeDataset with input_data and TransformDataset with test_data.
      Note that this is the case even if input_data and test_data are equal.
      test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
      provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as the output directory; otherwise a new
      unique directory is created.
    force_tf_compat_v1: A bool. If `True`, TFT's public APIs use TensorFlow
      in compat.v1 mode.
    output_record_batches: (optional) A bool. If `True`, `TransformDataset`
      and `AnalyzeAndTransformDataset` output `pyarrow.RecordBatch`es;
      otherwise, they output instance dicts.

  Raises:
    AssertionError: if the expected data does not match the results of
      transforming input_data according to preprocessing_fn, or (if
      provided) if the expected metadata does not match.
  """
  expected_vocab_file_contents = expected_vocab_file_contents or {}

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms. If in future versions of the code the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                          dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(
        temp_dir=temp_dir,
        desired_batch_size=desired_batch_size,
        force_tf_compat_v1=force_tf_compat_v1):
      input_data = pipeline | 'CreateInput' >> beam.Create(
          input_data, reshuffle=False)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn,
                output_record_batches=output_record_batches))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset(
                output_record_batches=output_record_batches))

      # Write transform_fn so we can test its assets.
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      transformed_data_path = os.path.join(temp_dir, 'transformed_data')
      if expected_data is not None:
        if isinstance(transformed_metadata,
                      beam_metadata_io.BeamDatasetMetadata):
          deferred_schema = (
              transformed_metadata.deferred_metadata
              | 'GetDeferredSchema' >> beam.Map(lambda m: m.schema))
        else:
          deferred_schema = (
              pipeline | 'CreateDeferredSchema' >> beam.Create(
                  [transformed_metadata.schema]))

        if output_record_batches:
          # Since we are using a deferred schema, obtain a pcollection
          # containing the data coder that will be created from it.
          transformed_data_coder_pcol = (
              deferred_schema
              | 'RecordBatchToExamplesEncoder' >> beam.Map(
                  example_coder.RecordBatchToExamplesEncoder))
          encode_ptransform = 'EncodeRecordBatches' >> beam.FlatMap(
              _encode_transformed_data_batch,
              coder=beam.pvalue.AsSingleton(transformed_data_coder_pcol))
        else:
          # Since we are using a deferred schema, obtain a pcollection
          # containing the data coder that will be created from it.
          transformed_data_coder_pcol = (
              deferred_schema
              | 'ExampleProtoCoder' >> beam.Map(tft.coders.ExampleProtoCoder))
          encode_ptransform = 'EncodeExamples' >> beam.Map(
              lambda data, data_coder: data_coder.encode(data),
              data_coder=beam.pvalue.AsSingleton(transformed_data_coder_pcol))

        _ = (
            transformed_data
            | encode_ptransform
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    shapes = {
        f.name:
            [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
        for f in tf_transform_output.transformed_metadata.schema.feature
    }
    transformed_data = [
        _format_example_as_numpy_dict(e, shapes) for e in examples
    ]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(
        tf_transform_output.transformed_metadata.schema)
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')

    # assertProtoEqual has a size limit on the length of the serialized
    # text strings. Therefore we first try assertProtoEqual; if that fails
    # we try assertEqual, and if that also fails we raise the exception
    # from assertProtoEqual.
    try:
      compare.assertProtoEqual(self, expected_metadata.schema,
                               transformed_schema)
    except AssertionError as compare_exception:
      try:
        self.assertEqual(expected_metadata.schema, transformed_schema)
      except AssertionError:
        raise compare_exception

  for filename, file_contents in expected_vocab_file_contents.items():
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
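# Sketch of the pa.RecordBatch input path described in the docstring above
# (hedged: the tfx_bsl TensorAdapterConfig wiring is omitted and the column
# layout is an assumption; exact construction may differ by version):
import pyarrow as pa

example_record_batch = pa.RecordBatch.from_arrays(
    [pa.array([[1.0], [3.0]], type=pa.list_(pa.float32()))], ['x'])
# With RecordBatch inputs, input_metadata is a tfx_bsl TensorAdapterConfig
# describing how column 'x' maps to a tensor, rather than a DatasetMetadata.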