def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Attach the pre-processing steps to a Beam pipeline.

    The training data is analyzed and transformed in one step; the resulting
    transform function is written out and re-applied to the evaluation data.
    Both transformed datasets are serialized, shuffled and written as
    gzipped TFRecord files. When `predict_data` is given, the raw prediction
    examples are additionally written both as TFRecords and as text.

    This function only applies transforms to `pipeline`; it does not run it.

    Args:
      pipeline: beam pipeline
      training_data: the name of the table to train on.
      eval_data: the name of the table to evaluate on.
      predict_data: the name of the table to predict on. When falsy, no
        prediction outputs are produced.
      output_dir: file path to where to write all the output files.
      frequency_threshold: frequency threshold to use for categorical values.
    """

    def _in_output_dir(name):
        # Every artifact produced here is placed relative to output_dir.
        return os.path.join(output_dir, name)

    # 1) The schema can be either defined in-memory or read from a
    #    configuration file; here it is created in-memory.
    raw_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_rows = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    eval_rows = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    raw_metadata = dataset_metadata.DatasetMetadata(schema=raw_schema)
    metadata_sink = tft_beam_io.WriteMetadata(
        _in_output_dir(path_constants.RAW_METADATA_DIR), pipeline=pipeline)
    _ = raw_metadata | 'WriteInputMetadata' >> metadata_sink

    analyze_and_transform = tft.AnalyzeAndTransformDataset(
        reddit.make_preprocessing_fn(frequency_threshold))
    transformed_train, transform_fn = (
        (train_rows, raw_metadata)
        | 'AnalyzeAndTransform' >> analyze_and_transform)
    train_examples, train_metadata = transformed_train

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    transformed_eval = (
        ((eval_rows, raw_metadata), transform_fn)
        | 'TransformEval' >> tft.TransformDataset())
    eval_examples, eval_metadata = transformed_eval

    # TODO(b/34231369) Remember to eventually also save the statistics and
    # the metadata.

    # Training split: serialize -> shuffle -> gzipped TFRecords.
    train_encoder = coders.ExampleProtoCoder(train_metadata.schema)
    serialized_train = (
        train_examples
        | 'SerializeTrainExamples' >> beam.Map(train_encoder.encode))
    shuffled_train = (
        serialized_train
        | 'ShuffleTraining' >> _Shuffle())  # pylint: disable=no-value-for-parameter
    _ = shuffled_train | 'WriteTraining' >> beam.io.WriteToTFRecord(
        _in_output_dir(path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
        file_name_suffix='.tfrecord.gz')

    # Evaluation split: same steps as the training split.
    eval_encoder = coders.ExampleProtoCoder(eval_metadata.schema)
    serialized_eval = (
        eval_examples
        | 'SerializeEvalExamples' >> beam.Map(eval_encoder.encode))
    shuffled_eval = (
        serialized_eval
        | 'ShuffleEval' >> _Shuffle())  # pylint: disable=no-value-for-parameter
    _ = shuffled_eval | 'WriteEval' >> beam.io.WriteToTFRecord(
        _in_output_dir(path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
        file_name_suffix='.tfrecord.gz')

    if not predict_data:
        return

    # Prediction data is encoded with the INFER-mode input schema and is not
    # passed through transform_fn.
    infer_mode = tf.contrib.learn.ModeKeys.INFER
    infer_schema = reddit.make_input_schema(mode=infer_mode)
    infer_encoder = coders.ExampleProtoCoder(infer_schema)

    predict_rows = pipeline | 'ReadPredictData' >> _ReadData(
        predict_data, mode=infer_mode)
    # TODO(b/35194257) Obviate the need for this explicit serialization.
    serialized_predict = (
        predict_rows | 'EncodePredictData' >> beam.Map(infer_encoder.encode))

    _ = (serialized_predict
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             _in_output_dir(
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    b64_json_lines = (
        serialized_predict
        | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json))
    _ = b64_json_lines | 'WritePredictDataAsText' >> beam.io.WriteToText(
        _in_output_dir(path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
        file_name_suffix='.txt')
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Attach the pre-processing steps to a Beam pipeline.

    The training data is analyzed and transformed in one step (using a
    temporary working directory under `output_dir`); the resulting transform
    function is written out and re-applied to the evaluation data. Both
    transformed datasets are serialized and written as gzipped TFRecord
    files. When `predict_data` is given, the raw prediction examples are
    additionally written both as TFRecords and as text.

    This function only applies transforms to `pipeline`; it does not run it.

    Args:
      pipeline: beam pipeline
      training_data: the name of the table to train on.
      eval_data: the name of the table to evaluate on.
      predict_data: the name of the table to predict on. When falsy, no
        prediction outputs are produced.
      output_dir: file path to where to write all the output files.
      frequency_threshold: frequency threshold to use for categorical values.
    """

    def _in_output_dir(name):
        # Every artifact produced here is placed relative to output_dir.
        return os.path.join(output_dir, name)

    scratch_dir = _in_output_dir(path_constants.TEMP_DIR)

    # 1) The schema can be either defined in-memory or read from a
    #    configuration file; here it is created in-memory.
    raw_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_rows = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    eval_rows = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    # TODO(b/33688220) should the transform functions take shuffle as an
    # optional argument?
    # TODO(b/33688275) Should the transform functions have more user friendly
    # names?
    raw_metadata = dataset_metadata.DatasetMetadata(schema=raw_schema)
    metadata_sink = io.WriteMetadata(
        _in_output_dir(path_constants.RAW_METADATA_DIR), pipeline=pipeline)
    _ = raw_metadata | 'WriteInputMetadata' >> metadata_sink

    analyze_and_transform = tft.AnalyzeAndTransformDataset(
        reddit.make_preprocessing_fn(frequency_threshold), scratch_dir)
    transformed_train, transform_fn = (
        (train_rows, raw_metadata)
        | 'AnalyzeAndTransform' >> analyze_and_transform)
    train_examples, train_metadata = transformed_train

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir)

    transformed_eval = (
        ((eval_rows, raw_metadata), transform_fn)
        | 'TransformEval' >> tft.TransformDataset())
    eval_examples, eval_metadata = transformed_eval

    # TODO(b/34231369) Remember to eventually also save the statistics and
    # the metadata.

    # Training split: serialize -> gzipped TFRecords.
    train_encoder = coders.ExampleProtoCoder(train_metadata.schema)
    serialized_train = (
        train_examples
        | 'SerializeTrainExamples' >> beam.Map(train_encoder.encode))
    _ = serialized_train | 'WriteTraining' >> beam.io.WriteToTFRecord(
        _in_output_dir(path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
        file_name_suffix='.tfrecord.gz')

    # Evaluation split: same steps as the training split.
    eval_encoder = coders.ExampleProtoCoder(eval_metadata.schema)
    serialized_eval = (
        eval_examples
        | 'SerializeEvalExamples' >> beam.Map(eval_encoder.encode))
    _ = serialized_eval | 'WriteEval' >> beam.io.WriteToTFRecord(
        _in_output_dir(path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
        file_name_suffix='.tfrecord.gz')

    if not predict_data:
        return

    # Prediction data is encoded with the INFER-mode input schema and is not
    # passed through transform_fn.
    infer_mode = tf.contrib.learn.ModeKeys.INFER
    infer_schema = reddit.make_input_schema(mode=infer_mode)
    infer_encoder = coders.ExampleProtoCoder(infer_schema)

    # TODO(b/35653662): Simplify once tf.transform 0.1.5 is released.
    def encode_predict_data(row):
        """Encode one prediction row, retrying with UTF-8 encoded strings."""
        try:
            return infer_encoder.encode(row)
        except Exception:  # pylint: disable=broad-except
            # Compatibility path for tf.transform < 0.1.5: retry with every
            # unicode value converted to a UTF-8 byte string.
            utf8_row = {}
            for key, value in row.items():
                if isinstance(value, unicode):
                    value = value.encode('utf-8')
                utf8_row[key] = value
            return infer_encoder.encode(utf8_row)

    predict_rows = pipeline | 'ReadPredictData' >> _ReadData(
        predict_data, mode=infer_mode)
    # TODO(b/35194257) Obviate the need for this explicit serialization.
    serialized_predict = (
        predict_rows | 'EncodePredictData' >> beam.Map(encode_predict_data))

    _ = (serialized_predict
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             _in_output_dir(
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    b64_json_lines = (
        serialized_predict
        | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json))
    _ = b64_json_lines | 'WritePredictDataAsText' >> beam.io.WriteToText(
        _in_output_dir(path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
        file_name_suffix='.txt')
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Attach the pre-processing steps to a Beam pipeline.

    The training data is analyzed to produce a transform function, which is
    written out and then applied to shuffled training and evaluation data.
    The transformed examples are serialized and written as gzipped TFRecord
    files. When `predict_data` is given, the raw prediction examples are
    additionally written both as TFRecords and as text.

    This function only applies transforms to `pipeline`; it does not run it.

    Args:
      pipeline: beam pipeline
      training_data: the name of the table to train on.
      eval_data: the name of the table to evaluate on.
      predict_data: the name of the table to predict on. When falsy, no
        prediction outputs are produced.
      output_dir: file path to where to write all the output files.
      frequency_threshold: frequency threshold to use for categorical values.
    """

    def _in_output_dir(name):
        # Every artifact produced here is placed relative to output_dir.
        return os.path.join(output_dir, name)

    # 1) The schema can be either defined in-memory or read from a
    #    configuration file; here it is created in-memory.
    raw_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_rows = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    eval_rows = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    raw_metadata = dataset_metadata.DatasetMetadata(schema=raw_schema)
    metadata_sink = tft_beam_io.WriteMetadata(
        _in_output_dir(path_constants.RAW_METADATA_DIR), pipeline=pipeline)
    _ = raw_metadata | 'WriteInputMetadata' >> metadata_sink

    analyzer = tft.AnalyzeDataset(
        reddit.make_preprocessing_fn(frequency_threshold))
    transform_fn = (train_rows, raw_metadata) | 'Analyze' >> analyzer

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    @beam.ptransform_fn
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        """Shuffle, transform, serialize and write `pcoll` under `path`."""
        shuffled = pcoll | 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        transformed = (
            ((shuffled, raw_metadata), transform_fn)
            | 'Transform' >> tft.TransformDataset())
        examples, transformed_metadata = transformed
        encoder = coders.ExampleProtoCoder(transformed_metadata.schema)
        serialized = examples | 'SerializeExamples' >> beam.Map(
            encoder.encode)
        _ = serialized | 'WriteExamples' >> beam.io.WriteToTFRecord(
            _in_output_dir(path), file_name_suffix='.tfrecord.gz')

    # Apply the same composite transform to both splits, training first.
    splits = (
        ('TransformAndWriteTraining', train_rows,
         path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
        ('TransformAndWriteEval', eval_rows,
         path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
    )
    for label, rows, file_prefix in splits:
        _ = rows | label >> TransformAndWrite(file_prefix)  # pylint: disable=no-value-for-parameter

    # TODO(b/35300113) Remember to eventually also save the statistics.

    if not predict_data:
        return

    # Prediction data is encoded with the INFER-mode input schema and is not
    # passed through transform_fn.
    infer_mode = tf.contrib.learn.ModeKeys.INFER
    infer_schema = reddit.make_input_schema(mode=infer_mode)
    infer_encoder = coders.ExampleProtoCoder(infer_schema)

    predict_rows = pipeline | 'ReadPredictData' >> _ReadData(
        predict_data, mode=infer_mode)
    # TODO(b/35194257) Obviate the need for this explicit serialization.
    serialized_predict = (
        predict_rows | 'EncodePredictData' >> beam.Map(infer_encoder.encode))

    _ = (serialized_predict
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             _in_output_dir(
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    b64_json_lines = (
        serialized_predict
        | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json))
    _ = b64_json_lines | 'WritePredictDataAsText' >> beam.io.WriteToText(
        _in_output_dir(path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
        file_name_suffix='.txt')