def read_and_shuffle_data(
    train_neg_filepattern, train_pos_filepattern, test_neg_filepattern,
    test_pos_filepattern, working_dir):
  """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, shuffle it
  and write it out in TFRecord format.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    working_dir: Directory to write shuffled data to
  """
  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
            (train_neg_filepattern, train_pos_filepattern))
        | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
            os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE),
            coder=example_proto_coder.ExampleProtoCoder(
                RAW_DATA_METADATA.schema)))
    _ = (
        pipeline
        | 'ReadAndShuffleTest' >> ReadAndShuffleData(
            (test_neg_filepattern, test_pos_filepattern))
        | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
            os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE),
            coder=example_proto_coder.ExampleProtoCoder(
                RAW_DATA_METADATA.schema)))
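The shuffle helper above (and the variant that follows) relies on module-level constants defined elsewhere in its example. As a point of reference only, here is a minimal sketch of what those definitions could look like; the column names, file basenames, and feature spec are assumptions for illustration, not taken from the original module.

# Hypothetical definitions assumed by read_and_shuffle_data; adjust to the
# actual raw data. The schema mirrors the review/label layout used by the
# sentiment transform example later in this section.
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

REVIEW_COLUMN = 'review'
LABEL_COLUMN = 'label'
SHUFFLED_TRAIN_DATA_FILEBASE = 'train_shuffled'
SHUFFLED_TEST_DATA_FILEBASE = 'test_shuffled'

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        REVIEW_COLUMN: tf.FixedLenFeature([], tf.string),
        LABEL_COLUMN: tf.FixedLenFeature([], tf.int64),
    }))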
def test_example_proto_coder_bad_default_value(self):
  input_schema = dataset_schema.from_feature_spec({
      'scalar_feature_2':
          tf.FixedLenFeature(
              shape=[2], dtype=tf.float32, default_value=[1.0]),
  })
  with self.assertRaisesRegexp(ValueError,
                               'got default value with incorrect shape'):
    example_proto_coder.ExampleProtoCoder(input_schema)

  input_schema = dataset_schema.from_feature_spec({
      'scalar_feature_2':
          tf.FixedLenFeature(
              shape=[], dtype=tf.float32, default_value=[0.0]),
  })
  with self.assertRaisesRegexp(ValueError,
                               'got default value with incorrect shape'):
    example_proto_coder.ExampleProtoCoder(input_schema)

  input_schema = dataset_schema.from_feature_spec({
      '2d_vector_feature':
          tf.FixedLenFeature(
              shape=[2, 3],
              dtype=tf.float32,
              default_value=[[1.0, 1.0], [1.0]]),
  })
  with self.assertRaisesRegexp(ValueError,
                               'got default value with incorrect shape'):
    example_proto_coder.ExampleProtoCoder(input_schema)
def read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern,
                          test_neg_filepattern, test_pos_filepattern,
                          shuffled_train_filebase, shuffled_test_filebase):
  """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, shuffle it
  and write it out in TFRecord format.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    shuffled_train_filebase: Base filename for shuffled training data shards
    shuffled_test_filebase: Base filename for shuffled test data shards
  """
  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (pipeline
         | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
             (train_neg_filepattern, train_pos_filepattern))
         | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
             shuffled_train_filebase,
             coder=example_proto_coder.ExampleProtoCoder(
                 RAW_DATA_METADATA.schema)))
    _ = (pipeline
         | 'ReadAndShuffleTest' >> ReadAndShuffleData(
             (test_neg_filepattern, test_pos_filepattern))
         | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
             shuffled_test_filebase,
             coder=example_proto_coder.ExampleProtoCoder(
                 RAW_DATA_METADATA.schema)))
def test_example_proto_coder_error(self):
  input_schema = dataset_schema.from_feature_spec({
      '2d_vector_feature': tf.FixedLenFeature(shape=[2, 2], dtype=tf.int64),
  })
  coder = example_proto_coder.ExampleProtoCoder(input_schema)

  example_decoded_value = {
      '2d_vector_feature': [1, 2, 3]
  }
  example_proto_text = """
  features {
    feature { key: "1d_vector_feature"
              value { int64_list { value: [ 1, 2, 3 ] } } }
  }
  """
  example = tf.train.Example()
  text_format.Merge(example_proto_text, example)

  # Ensure that we raise an exception for trying to encode invalid data.
  with self.assertRaisesRegexp(ValueError, 'got wrong number of values'):
    _ = coder.encode(example_decoded_value)

  # Ensure that we raise an exception for trying to parse invalid data.
  with self.assertRaisesRegexp(ValueError, 'got wrong number of values'):
    _ = coder.decode(example.SerializeToString())
def test_example_proto_coder_default_value(self):
  input_schema = dataset_schema.from_feature_spec({
      'scalar_feature_3':
          tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=1.0),
      'scalar_feature_4':
          tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=0.0),
      '1d_vector_feature':
          tf.FixedLenFeature(
              shape=[1], dtype=tf.float32, default_value=[2.0]),
      '2d_vector_feature':
          tf.FixedLenFeature(
              shape=[2, 2],
              dtype=tf.float32,
              default_value=[[1.0, 2.0], [3.0, 4.0]]),
  })
  coder = example_proto_coder.ExampleProtoCoder(input_schema)

  # Python types.
  example_proto_text = """
  features {
  }
  """
  example = tf.train.Example()
  text_format.Merge(example_proto_text, example)
  data = example.SerializeToString()

  # Assert the data is decoded into the expected format.
  expected_decoded = {
      'scalar_feature_3': 1.0,
      'scalar_feature_4': 0.0,
      '1d_vector_feature': [2.0],
      '2d_vector_feature': [[1.0, 2.0], [3.0, 4.0]],
  }
  decoded = coder.decode(data)
  np.testing.assert_equal(expected_decoded, decoded)
def test_encode_non_serialized(self, feature_spec, ascii_proto, instance,
                               **kwargs):
  schema = schema_utils.schema_from_feature_spec(feature_spec)
  coder = example_proto_coder.ExampleProtoCoder(
      schema, serialized=False, **kwargs)
  proto = _ascii_to_example(ascii_proto)
  np.testing.assert_equal(coder.encode(instance), proto)
def test_decode_error(self, feature_spec, ascii_proto, error_msg,
                      error_type=ValueError, **kwargs):
  schema = dataset_schema.from_feature_spec(feature_spec)
  coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
  serialized_proto = _ascii_to_binary(ascii_proto)
  with self.assertRaisesRegexp(error_type, error_msg):
    coder.decode(serialized_proto)
def test_decode_non_serialized(self, feature_spec, ascii_proto, instance,
                               **kwargs):
  schema = dataset_schema.from_feature_spec(feature_spec)
  coder = example_proto_coder.ExampleProtoCoder(
      schema, serialized=False, **kwargs)
  proto = _ascii_to_example(ascii_proto)
  np.testing.assert_equal(coder.decode(proto), instance)
def test_example_proto_coder_bad_default_value(self):
  input_schema = dataset_schema.from_feature_spec({
      'scalar_feature_2':
          tf.FixedLenFeature(
              shape=[2], dtype=tf.float32, default_value=[1.0, 2.0]),
  })
  with self.assertRaisesRegexp(ValueError,
                               'only scalar default values are supported'):
    example_proto_coder.ExampleProtoCoder(input_schema)

  input_schema = dataset_schema.from_feature_spec({
      'scalar_feature_2':
          tf.FixedLenFeature(
              shape=[], dtype=tf.float32, default_value=[1.0]),
  })
  with self.assertRaisesRegexp(ValueError,
                               'only scalar default values are supported'):
    example_proto_coder.ExampleProtoCoder(input_schema)
def test_encode_error(self, feature_spec, instance, error_msg,
                      error_type=ValueError, **kwargs):
  schema = schema_utils.schema_from_feature_spec(feature_spec)
  with self.assertRaisesRegexp(error_type, error_msg):
    coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
    coder.encode(instance)
def test_example_proto_coder_unicode(self):
  coder = example_proto_coder.ExampleProtoCoder(
      dataset_schema.from_feature_spec({
          'unicode_feature': tf.FixedLenFeature(shape=[], dtype=tf.string)
      }))
  encoded_example = coder.encode({'unicode_feature': u'Hello κόσμε'})
  example = tf.train.Example()
  example.ParseFromString(encoded_example)
  self.assertEqual(
      example.features.feature['unicode_feature'].bytes_list.value[0],
      u'Hello κόσμε'.encode('utf-8'))
def test_example_proto_coder_picklable(self):
  encode_case = _maybe_extend_encode_case_with_ragged(
      _ENCODE_CASES['multiple_columns'])
  schema = schema_utils.schema_from_feature_spec(encode_case['feature_spec'])
  coder = example_proto_coder.ExampleProtoCoder(schema)
  ascii_proto = encode_case['ascii_proto']
  instance = encode_case['instance']
  serialized_proto = _ascii_to_binary(ascii_proto)
  for _ in range(2):
    coder = pickle.loads(pickle.dumps(coder))
    self.assertSerializedProtosEqual(coder.encode(instance), serialized_proto)
def test_example_proto_coder_cache(self):
  """Test that the cache remains valid after reading/writing None."""
  schema = schema_utils.schema_from_feature_spec({
      'varlen': tf.io.VarLenFeature(tf.int64),
  })
  coder = example_proto_coder.ExampleProtoCoder(schema)
  ascii_protos = [
      'features {feature {key: "varlen" value {int64_list {value: [5] }}}}',
      'features {feature {key: "varlen" value {}}}',
      'features {feature {key: "varlen" value {int64_list {value: [6] }}}}',
  ]
  instances = [{'varlen': [5]}, {'varlen': None}, {'varlen': [6]}]
  serialized_protos = map(_ascii_to_binary, ascii_protos)
  for instance, serialized_proto in zip(instances, serialized_protos):
    self.assertSerializedProtosEqual(coder.encode(instance), serialized_proto)
def write_tfrecords(data, schema, filename, name):
  """Converts input pcollection into a file of tfrecords following schema.

  Args
  ----
  data: pcollection.
  schema: dataset_schema from tensorflow transform.
  filename: str, path prefix for the output tfrecord files.
  name: str to identify operations.
  """
  _ = (data
       | '{} tfrecords write'.format(name) >>
       beam.io.tfrecordio.WriteToTFRecord(
           filename,
           coder=example_proto_coder.ExampleProtoCoder(
               dataset_schema.Schema(schema))))
def store_transformed_data(data, schema, path, name=''):
  """Stores data from input pipeline into TFRecord in the specified path.

  Args:
    data: `PCollection`, input pipeline.
    schema: `DatasetMetadata` object, describes schema of the input pipeline.
    path: string, where to write output.
    name: string, name describing pipeline to be written.

  Returns:
    PCollection
  """
  p = (data
       | 'WriteData{}'.format(name) >> tfrecordio.WriteToTFRecord(
           path, coder=example_proto_coder.ExampleProtoCoder(schema.schema)))
  return p
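Outside a Beam pipeline, the coder can also be exercised directly. Below is a minimal round-trip sketch, assuming an older tf.Transform release in which ExampleProtoCoder still exposes decode() and dataset_schema.from_feature_spec is available (newer releases build schemas via schema_utils and drop decode); the feature name is a placeholder.

# Illustrative stand-alone round trip; not part of the original examples.
import numpy as np
import tensorflow as tf
from tensorflow_transform.coders import example_proto_coder
from tensorflow_transform.tf_metadata import dataset_schema

schema = dataset_schema.from_feature_spec({
    'label': tf.FixedLenFeature([], tf.int64),
})
coder = example_proto_coder.ExampleProtoCoder(schema)

serialized = coder.encode({'label': 1})   # instance dict -> serialized Example
decoded = coder.decode(serialized)        # serialized Example -> instance dict
np.testing.assert_equal(decoded, {'label': 1})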
def test_example_proto_coder_picklable(self):
  schema = schema_utils.schema_from_feature_spec(_FEATURE_SPEC)
  coder = example_proto_coder.ExampleProtoCoder(schema)
  ascii_proto = """
  features {
    feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
    feature { key: "varlen_feature_1" value { float_list { value: [ 89.0 ] } } }
    feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
    feature { key: "scalar_feature_3" value { float_list { value: [ 2.0 ] } } }
    feature { key: "1d_vector_feature"
              value { bytes_list { value: [ 'this is a ,text' ] } } }
    feature { key: "2d_vector_feature"
              value { float_list { value: [ 1.0, 2.0, 3.0, 4.0 ] } } }
    feature { key: "varlen_feature_2"
              value { bytes_list { value: [ 'female' ] } } }
    feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
    feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
    feature { key: "idx0" value { int64_list { value: [ 1, 1 ] } } }
    feature { key: "idx1" value { int64_list { value: [ 3, 7 ] } } }
    feature { key: "2d_val" value { float_list { value: [ 13.0, 23.0 ] } } }
  }
  """
  instance = {
      'scalar_feature_1': 12,
      'scalar_feature_2': 12,
      'scalar_feature_3': 2.0,
      'varlen_feature_1': [89.0],
      '1d_vector_feature': [b'this is a ,text'],
      '2d_vector_feature': [[1.0, 2.0], [3.0, 4.0]],
      'varlen_feature_2': [b'female'],
      'idx': [1, 4],
      'value': [12.0, 20.0],
      'idx0': [1, 1],
      'idx1': [3, 7],
      '2d_val': [13.0, 23.0],
  }
  serialized_proto = _ascii_to_binary(ascii_proto)
  for _ in range(2):
    coder = pickle.loads(pickle.dumps(coder))
    self.assertSerializedProtosEqual(coder.encode(instance), serialized_proto)
def test_example_proto_coder_picklable(self):
  coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)
  example_proto_text = """
  features {
    feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
    feature { key: "varlen_feature_1" value { float_list { value: [ 89.0 ] } } }
    feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
    feature { key: "scalar_feature_3" value { float_list { value: [ 2.0 ] } } }
    feature { key: "1d_vector_feature"
              value { bytes_list { value: [ 'this is a ,text' ] } } }
    feature { key: "2d_vector_feature"
              value { float_list { value: [ 1.0, 2.0, 3.0, 4.0 ] } } }
    feature { key: "varlen_feature_2"
              value { bytes_list { value: [ 'female' ] } } }
    feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
    feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
  }
  """
  expected_decoded = {
      'scalar_feature_1': 12,
      'scalar_feature_2': 12,
      'scalar_feature_3': 2.0,
      'varlen_feature_1': [89.0],
      '1d_vector_feature': ['this is a ,text'],
      '2d_vector_feature': [[1.0, 2.0], [3.0, 4.0]],
      'varlen_feature_2': ['female'],
      'sparse_feature': ([1, 4], [12.0, 20.0])
  }

  # Ensure we can pickle right away.
  coder = pickle.loads(pickle.dumps(coder))
  self._assert_encode_decode(coder, example_proto_text, expected_decoded)
  self._assert_decode_encode(coder, example_proto_text, expected_decoded)

  # And after use.
  coder = pickle.loads(pickle.dumps(coder))
  self._assert_encode_decode(coder, example_proto_text, expected_decoded)
  self._assert_decode_encode(coder, example_proto_text, expected_decoded)
def transform_and_write(pcollection, input_metadata, output_dir, transform_fn,
                        file_prefix):
  """Transforms data and writes results to local disk or Cloud Storage bucket.

  Args:
    pcollection: Pipeline data.
    input_metadata: DatasetMetadata object for given input data.
    output_dir: Directory to write transformed output.
    transform_fn: TensorFlow transform function.
    file_prefix: File prefix to add to output file.
  """
  shuffled_data = (
      pcollection | 'RandomizeData' >> beam.transforms.Reshuffle())
  (transformed_data,
   transformed_metadata) = (((shuffled_data, input_metadata), transform_fn)
                            | 'Transform' >> tft_beam.TransformDataset())
  coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
  (transformed_data
   | 'SerializeExamples' >> beam.Map(coder.encode)
   | 'WriteExamples' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir, file_prefix),
       file_name_suffix=_FILE_NAME_SUFFIX))
def transform_predict(pipeline, predict_data, data_source, output_dir, schema):
  """Transforms prediction input data.

  Args:
    pipeline: Beam Pipeline instance.
    predict_data: Prediction csv data.
    data_source: Input data source - path to CSV file or BigQuery table.
      Expects either `csv` or `bigquery`.
    output_dir: Directory to write transformed output.
    schema: A text-serialized TensorFlow metadata schema for the input data.
  """
  data_schema = utils.make_dataset_schema(
      schema, mode=tf.estimator.ModeKeys.PREDICT)
  coder = example_proto_coder.ExampleProtoCoder(data_schema)

  raw_data = (
      pipeline
      | 'ReadPredictData' >> ReadData(predict_data, data_source, schema,
                                      tf.estimator.ModeKeys.PREDICT))
  (raw_data
   | 'EncodePredictData' >> beam.Map(coder.encode)
   | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir, _PREDICT_PREFIX),
       file_name_suffix='.tfrecord'))
def run(p, params):
  """Defines Beam preprocessing pipeline.

  Performs the following:
    - Reads text files from pattern.
    - Splits text files into train and validation sets.

  Args:
    p: PCollection, initial pipeline.
    params: Object holding a set of parameters as name-value pairs.
  """
  path_pattern = os.path.join(params.input_dir, '*',
                              '*{}'.format(constants.FILE_EXTENSION))
  data = (p
          | 'ListFiles' >> beam.Create(gfile.Glob(path_pattern))
          | 'ReadFiles' >> beam.ParDo(ReadFile())
          | 'SplitData' >> beam.ParDo(
              _SplitData(),
              train_size=params.train_size,
              val_label=_DatasetType.VAL.name).with_outputs(
                  _DatasetType.VAL.name, main=_DatasetType.TRAIN.name))

  schema = dataset_schema.from_feature_spec(utils.get_processed_data_schema())
  for dataset in _DatasetType:
    if not dataset.value:
      continue
    _ = (
        data[dataset.name]
        | 'Shuffle{}'.format(dataset.name) >> shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteFiles{}'.format(dataset.name) >> tfrecordio.WriteToTFRecord(
            os.path.join(params.output_dir,
                         dataset.name + constants.TFRECORD),
            coder=example_proto_coder.ExampleProtoCoder(schema)))
train_data = (pipeline
              | "Create train list" >> beam.Create(list_train)
              | "Read Images - Train" >> beam.ParDo(ReadImageDoFn()))

transformed_train, transform_fn = (
    (train_data, schema)
    | "Analyze and Transform - Train" >>
    impl.AnalyzeAndTransformDataset(lambda t: _preprocess_fn(
        t, new_shape=(known_args.image_dim, known_args.image_dim))))

transformed_train_data, transformed_train_metadata = transformed_train

_ = transformed_train_data | 'Write TFrecords - train' >> beam.io.tfrecordio.WriteToTFRecord(
    file_path_prefix=train_tfrecord_path,
    file_name_suffix=".tfrecords",
    num_shards=known_args.n_shards,
    coder=example_proto_coder.ExampleProtoCoder(
        transformed_train_metadata.schema))

# Process evaluation data
eval_data = (pipeline
             | "Create eval list" >> beam.Create(list_eval)
             | "Read Images - Eval" >> beam.ParDo(ReadImageDoFn()))

transformed_eval = (((eval_data, schema), transform_fn)
                    | "Transform - Eval" >> impl.TransformDataset())

transformed_eval_data, transformed_eval_metadata = transformed_eval

_ = transformed_eval_data | 'Write TFrecords - eval' >> beam.io.tfrecordio.WriteToTFRecord(
    file_path_prefix=eval_tfrecord_path,
    file_name_suffix=".tfrecords",
    num_shards=known_args.n_shards,
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
  """
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        def remove_character(s, char):
          """Remove a character from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string
            char: A string of length 1

          Returns:
            The string `s` with the given character removed (i.e. replaced by
            '')
          """
          # Hacky implementation where we split and rejoin.
          split = tf.string_split(s, char)
          rejoined = tf.reduce_join(
              tf.sparse_to_dense(
                  split.indices, split.dense_shape, split.values, ''),
              1)
          return rejoined

        def remove_punctuation(s):
          """Remove punctuation from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string

          Returns:
            The string `s` with punctuation removed.
          """
          for char in PUNCTUATION_CHARACTERS:
            s = remove_character(s, char)
          return s

        cleaned_review = tft.map(remove_punctuation, review)
        review_tokens = tft.map(tf.string_split, cleaned_review)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        return {
            REVIEW_COLUMN: review_indices,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_metadata
          | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
              transformed_metadata_dir, pipeline=pipeline))
def test_encode(self, feature_spec, ascii_proto, instance, **kwargs):
  schema = schema_utils.schema_from_feature_spec(feature_spec)
  coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
  serialized_proto = _ascii_to_binary(ascii_proto)
  self.assertSerializedProtosEqual(coder.encode(instance), serialized_proto)
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata)
            | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              file_name_suffix='.gz'))
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
  """Runs the whole preprocessing step.

  This runs the feature extraction PTransform, validates that the data
  conforms to the schema provided, normalizes the features, and splits the
  dataset into a training and evaluation dataset.
  """

  # Populate optional arguments
  if not feature_scaling:
    feature_scaling = lambda inputs: inputs

  # Type checking
  if not isinstance(labels, list):
    raise ValueError('`labels` must be list(str). '
                     'Given: {} {}'.format(labels, type(labels)))

  if not isinstance(feature_extraction, beam.PTransform):
    raise ValueError('`feature_extraction` must be {}. '
                     'Given: {} {}'.format(beam.PTransform, feature_extraction,
                                           type(feature_extraction)))

  if not callable(feature_scaling):
    raise ValueError('`feature_scaling` must be callable. '
                     'Given: {} {}'.format(feature_scaling,
                                           type(feature_scaling)))

  if beam_options and not isinstance(beam_options, PipelineOptions):
    raise ValueError('`beam_options` must be {}. '
                     'Given: {} {}'.format(PipelineOptions, beam_options,
                                           type(beam_options)))

  if not work_dir:
    work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

  tft_temp_dir = os.path.join(work_dir, 'tft-temp')
  train_dataset_dir = os.path.join(work_dir, 'train-dataset')
  eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

  transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
  # if tf.gfile.Exists(transform_fn_dir):
  if tf.io.gfile.exists(transform_fn_dir):
    tf.gfile.DeleteRecursively(transform_fn_dir)

  # [START dataflow_molecules_create_pipeline]
  # Build and run a Beam Pipeline
  with beam.Pipeline(options=beam_options) as p, \
       beam_impl.Context(temp_dir=tft_temp_dir):
    # [END dataflow_molecules_create_pipeline]

    # [START dataflow_molecules_feature_extraction]
    # Transform and validate the input data matches the input schema
    dataset = (
        p
        | 'Feature extraction' >> feature_extraction
        # [END dataflow_molecules_feature_extraction]
        # [START dataflow_molecules_validate_inputs]
        | 'Validate inputs' >> beam.ParDo(
            ValidateInputData(input_feature_spec)))
    # [END dataflow_molecules_validate_inputs]

    # [START dataflow_molecules_analyze_and_transform_dataset]
    # Apply the tf.Transform preprocessing_fn
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(input_feature_spec))

    dataset_and_metadata, transform_fn = (
        (dataset, input_metadata)
        | 'Feature scaling' >> beam_impl.AnalyzeAndTransformDataset(
            feature_scaling))
    dataset, metadata = dataset_and_metadata
    # [END dataflow_molecules_analyze_and_transform_dataset]

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
    train_dataset, eval_dataset = (
        dataset
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]

    # [START dataflow_molecules_write_tfrecords]
    # Write the datasets as TFRecords
    coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

    train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
    _ = (
        train_dataset
        | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
            train_dataset_prefix, coder))

    eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
    _ = (
        eval_dataset
        | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
            eval_dataset_prefix, coder))

    # Write the transform_fn
    _ = (
        transform_fn
        | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
    # [END dataflow_molecules_write_tfrecords]

  return PreprocessData(input_feature_spec, labels,
                        train_dataset_prefix + '*',
                        eval_dataset_prefix + '*')
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[key] = transform.scale_to_z_score(inputs[key])

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[key] = transform.string_to_int(
          inputs[key], top_k=taxi.VOCAB_SIZE, num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[key] = transform.bucketize(inputs[key],
                                         taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[key] = inputs[key]

    # Was this passenger a big tipper?
    def convert_label(label):
      taxi_fare = inputs[taxi.FARE_KEY]
      return tf.where(
          tf.is_nan(taxi_fare),
          tf.cast(tf.zeros_like(taxi_fare), tf.int64),
          # Test if the tip was > 20% of the fare.
          tf.cast(
              tf.greater(label, tf.multiply(taxi_fare, tf.constant(0.2))),
              tf.int64))

    outputs[taxi.LABEL_KEY] = transform.apply_function(convert_label,
                                                       inputs[taxi.LABEL_KEY])
    return outputs

  raw_feature_spec = taxi.get_raw_feature_spec()
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with beam_impl.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder()
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            | 'ParseCSV' >> beam.Map(csv_coder.decode))
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

      raw_data |= 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)

      transform_fn = (
          (raw_data, raw_data_metadata)
          | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
      _ = (
          transform_fn
          | 'WriteTransformFn' >>
          transform_fn_io.WriteTransformFn(working_dir))

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      (transformed_data, transformed_metadata) = (
          ((shuffled_data, raw_data_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              compression_type=beam.io.filesystem.CompressionTypes.GZIP))
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)
    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

      # Read in raw data and convert using CSV converter. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are
      # just to get data into a format that the CSV converter can read, in
      # particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          transformed_train_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply transform function to test data. In this case we also remove
      # the header line from the CSV file and the trailing period at the end
      # of each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          transformed_test_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (
          transformed_metadata
          | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
              transformed_metadata_dir, pipeline=pipeline))
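The census example above materializes the transformed splits with ExampleProtoCoder. As a hedged sketch of how such TFRecords might be consumed at training time with TF 1.x APIs: the feature spec below is hand-written for illustration only; in practice it would be derived from the metadata written to transformed_metadata_dir.

# Illustrative input_fn for transformed TFRecords; the feature spec is an
# assumption, not the schema actually emitted by the pipeline above.
import tensorflow as tf

def make_input_fn(file_pattern, batch_size=128):
  feature_spec = {
      'age': tf.FixedLenFeature([], tf.float32),
      'label': tf.FixedLenFeature([], tf.int64),
  }

  def input_fn():
    files = tf.data.Dataset.list_files(file_pattern)
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.batch(batch_size)
    # Parse batches of serialized Examples back into dense tensors.
    return dataset.map(
        lambda records: tf.parse_example(records, feature_spec))

  return input_fn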
def test_example_proto_coder(self):
  # We use a single coder and invoke multiple encodes and decodes on it to
  # make sure that cache consistency is implemented properly.
  coder = example_proto_coder.ExampleProtoCoder(self._INPUT_SCHEMA)

  # Python types.
  example_proto_text = """
  features {
    feature { key: "scalar_feature_1" value { int64_list { value: [ 12 ] } } }
    feature { key: "varlen_feature_1" value { float_list { value: [ 89.0 ] } } }
    feature { key: "scalar_feature_2" value { int64_list { value: [ 12 ] } } }
    feature { key: "scalar_feature_3" value { float_list { value: [ 1.0 ] } } }
    feature { key: "1d_vector_feature"
              value { bytes_list { value: [ 'this is a ,text' ] } } }
    feature { key: "varlen_feature_2"
              value { bytes_list { value: [ 'female' ] } } }
    feature { key: "value" value { float_list { value: [ 12.0, 20.0 ] } } }
    feature { key: "idx" value { int64_list { value: [ 1, 4 ] } } }
  }
  """
  expected_decoded = {
      'scalar_feature_1': 12,
      'scalar_feature_2': 12,
      'scalar_feature_3': 1.0,
      'varlen_feature_1': [89.0],
      '1d_vector_feature': ['this is a ,text'],
      'varlen_feature_2': ['female'],
      'sparse_feature': ([12.0, 20.0], [1, 4])
  }
  self._assert_encode_decode(coder, example_proto_text, expected_decoded)
  self._assert_decode_encode(coder, example_proto_text, expected_decoded)

  # Numpy types (with different values from above).
  example_proto_text = """
  features {
    feature { key: "scalar_feature_1" value { int64_list { value: [ 13 ] } } }
    feature { key: "varlen_feature_1" value { float_list { } } }
    feature { key: "scalar_feature_2" value { int64_list { value: [ 14 ] } } }
    feature { key: "scalar_feature_3" value { float_list { value: [ 2.0 ] } } }
    feature { key: "1d_vector_feature"
              value { bytes_list { value: [ 'this is another ,text' ] } } }
    feature { key: "varlen_feature_2"
              value { bytes_list { value: [ 'male' ] } } }
    feature { key: "value" value { float_list { value: [ 13.0, 21.0 ] } } }
    feature { key: "idx" value { int64_list { value: [ 2, 5 ] } } }
  }
  """
  expected_decoded = {
      'scalar_feature_1': np.array(13),
      'scalar_feature_2': np.int32(14),
      'scalar_feature_3': np.array(2.0),
      'varlen_feature_1': np.array([]),
      '1d_vector_feature': np.array(['this is another ,text']),
      'varlen_feature_2': np.array(['male']),
      'sparse_feature': (np.array([13.0, 21.0]), np.array([2, 5]))
  }
  self._assert_encode_decode(coder, example_proto_text, expected_decoded)
  self._assert_decode_encode(coder, example_proto_text, expected_decoded)
ts_windows_train = (
    orders_by_date_train
    | "Extract timeseries windows - train" >> beam.ParDo(
        ExtractRawTimeseriesWindow(known_args.window_size))
    | "Fusion breaker train" >> beam.Reshuffle())

ts_windows_schema = _get_feature_spec(known_args.window_size)

norm_ts_windows_train, transform_fn = (
    (ts_windows_train, ts_windows_schema)
    | "Analyze and Transform - train" >> impl.AnalyzeAndTransformDataset(
        lambda t: _preprocess_fn(t, known_args.window_size, znorm_stats)))
norm_ts_windows_train_data, norm_ts_windows_train_metadata = norm_ts_windows_train

_ = norm_ts_windows_train_data | 'Write TFrecords - train' >> beam.io.tfrecordio.WriteToTFRecord(
    file_path_prefix=train_tfrecord_path,
    file_name_suffix=".tfrecords",
    coder=example_proto_coder.ExampleProtoCoder(
        norm_ts_windows_train_metadata.schema))

# Process evaluation data
raw_data_eval = _read_data_from_bq(pipeline, known_args.split_date,
                                   known_args.end_date)

orders_by_date_eval = (
    raw_data_eval
    | "Merge SKUs - eval" >> beam.CombineGlobally(
        GroupItemsByDate(community_area_list,
                         (split_datetime, end_datetime))))

ts_windows_eval = (
    orders_by_date_eval
    | "Extract timeseries windows - eval" >> beam.ParDo(
        ExtractRawTimeseriesWindow(known_args.window_size))
    | "Fusion breaker eval" >> beam.Reshuffle())
def test_decode(self, feature_spec, ascii_proto, instance, **kwargs):
  schema = schema_utils.schema_from_feature_spec(feature_spec)
  coder = example_proto_coder.ExampleProtoCoder(schema, **kwargs)
  serialized_proto = _ascii_to_binary(ascii_proto)
  np.testing.assert_equal(coder.decode(serialized_proto), instance)