def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', type=str,
                        help="Pass 'y' to run on Dataflow; any other value runs locally.")
    args = parser.parse_args(argv)  # Parse the arguments

    if args.cloud == "y":
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **{'project': "iotpubsub-1536350750202"})

    with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
        p = beam.Pipeline(options=pipeline_options)
        train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())
        (test_data | "test it" >> beam.Map(printy))

        train_data = (train_data, train_metadata)
        train_dataset, transform_fn = (
            train_data
            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn))

        test_data = (test_data, train_metadata)
        test_data, _ = (
            (test_data, transform_fn)
            | 'Transform test data' >> beam_impl.TransformDataset())

        train_data, transformed_metadata = train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (train_data
             | 'Encode train data to save it' >> beam.Map(transformed_data_coder.encode)
             | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(
                 os.path.join("gs://relation_extraction/beam/Train", "TRAIN")))
        _ = (test_data
             | 'Encode test data to save it' >> beam.Map(transformed_data_coder.encode)
             | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(
                 os.path.join("gs://relation_extraction/beam/Test", "TEST")))
        _ = (transform_fn
             | "WriteTransformFn" >> transform_fn_io.WriteTransformFn(
                 "gs://relation_extraction/beam/"))

        p.run().wait_until_finish()
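The `get_cloud_pipeline_options()` helper is defined elsewhere in that project. A minimal sketch of what it might return, reusing the project ID and bucket that appear in the example above (job name and staging/temp paths are assumptions), could look like this:

import datetime
import apache_beam as beam


def get_cloud_pipeline_options():
    # Sketch only: project and bucket are taken from the example above;
    # job name and staging/temp paths are assumed.
    options = {
        'runner': 'DataflowRunner',
        'project': 'iotpubsub-1536350750202',
        'job_name': 'relation-extraction-preprocess-{}'.format(
            datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
        'staging_location': 'gs://relation_extraction/beam/staging',
        'temp_location': 'gs://relation_extraction/beam/tmp',
        'save_main_session': True,
    }
    return beam.pipeline.PipelineOptions(flags=[], **options)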
def read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern, test_neg_filepattern, test_pos_filepattern, working_dir): """Read and shuffle the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, shuffle it and write it out in TFRecord format. transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: train_neg_filepattern: Filepattern for training data negative examples train_pos_filepattern: Filepattern for training data positive examples test_neg_filepattern: Filepattern for test data negative examples test_pos_filepattern: Filepattern for test data positive examples working_dir: Directory to write shuffled data to """ with beam.Pipeline() as pipeline: coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema) # pylint: disable=no-value-for-parameter _ = (pipeline | 'ReadAndShuffleTrain' >> ReadAndShuffleData( (train_neg_filepattern, train_pos_filepattern)) | 'EncodeTrainData' >> beam.Map(coder.encode) | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE))) _ = (pipeline | 'ReadAndShuffleTest' >> ReadAndShuffleData( (test_neg_filepattern, test_pos_filepattern)) | 'EncodeTestData' >> beam.Map(coder.encode) | 'WriteTestData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE)))
def read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern, test_neg_filepattern, test_pos_filepattern, shuffled_train_filebase, shuffled_test_filebase): """Read and shuffle the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, shuffle it and write it out in TFRecord format. transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: train_neg_filepattern: Filepattern for training data negative examples train_pos_filepattern: Filepattern for training data positive examples test_neg_filepattern: Filepattern for test data negative examples test_pos_filepattern: Filepattern for test data positive examples shuffled_train_filebase: Base filename for shuffled training data shards shuffled_test_filebase: Base filename for shuffled test data shards """ with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter _ = (pipeline | 'ReadAndShuffleTrain' >> ReadAndShuffleData( (train_neg_filepattern, train_pos_filepattern)) | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( shuffled_train_filebase, coder=example_proto_coder.ExampleProtoCoder( RAW_DATA_METADATA.schema))) _ = (pipeline | 'ReadAndShuffleTest' >> ReadAndShuffleData( (test_neg_filepattern, test_pos_filepattern)) | 'WriteTestData' >> tfrecordio.WriteToTFRecord( shuffled_test_filebase, coder=example_proto_coder.ExampleProtoCoder( RAW_DATA_METADATA.schema)))
def expand(self, features):
    return (
        features
        | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
        tfrecordio.WriteToTFRecord(
            file_path_prefix=self._file_path_prefix,
            file_name_suffix='.tfrecord.gz',
            coder=ExampleProtoCoder()))
def run_pipeline():
    '''Apache Beam pipeline.

    ARGS:
        None
    '''
    args, pipeline_args = get_args()
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    read_textline_from_csv = beam.io.ReadFromText(
        args.csv_path, skip_header_lines=1)
    load_img_from_path = LoadImageDoFn()
    augment_data = PreprocessImagesDoFn()
    img_to_tfexample = ImageToTfExampleDoFn()
    write_to_tf_record = tfrecordio.WriteToTFRecord(
        file_path_prefix='gs://bucket_name/Apache_beam_records/Test_records/',
        num_shards=20)

    with beam.Pipeline(options=pipeline_options) as pipe:
        _ = (pipe
             | 'ReadCSVFromText' >> read_textline_from_csv
             | 'LoadImageData' >> beam.ParDo(load_img_from_path)
             | 'PreprocessImages' >> beam.ParDo(augment_data)
             | 'ImageToTfExample' >> beam.ParDo(img_to_tfexample)
             | 'SerializeProto' >> beam.Map(lambda x: x.SerializeToString())
             | 'WriteTfRecord' >> write_to_tf_record)

    print('Done running')
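The `LoadImageDoFn`, `PreprocessImagesDoFn` and `ImageToTfExampleDoFn` classes are defined elsewhere in that project. As a rough, hypothetical illustration of the last stage only, a DoFn that packs image bytes and a label into a `tf.train.Example` (the element structure and feature names below are assumptions, not the project's actual definitions) might look like:

import apache_beam as beam
import tensorflow as tf


class ImageToTfExampleDoFn(beam.DoFn):
    """Hypothetical sketch: wraps (image_bytes, label) pairs into tf.train.Example protos."""

    def process(self, element):
        image_bytes, label = element  # assumed element structure
        features = {
            'image/encoded': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[image_bytes])),
            'image/label': tf.train.Feature(
                int64_list=tf.train.Int64List(value=[int(label)])),
        }
        yield tf.train.Example(features=tf.train.Features(feature=features))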
def expand(self, features):
    return (features
            | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
            tfrecordio.WriteToTFRecord(
                file_path_prefix=self._file_path_prefix,
                file_name_suffix='.tfrecord.gz',
                shard_name_template=fileio.DEFAULT_SHARD_NAME_TEMPLATE,
                coder=ExampleProtoCoder(),
                compression_type=fileio.CompressionTypes.AUTO))
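Both of these `expand` methods belong to custom `PTransform` classes whose surrounding definitions are not shown. A minimal self-contained version of the same idea (class name and constructor are hypothetical, and a pre-built coder is passed in rather than the snippets' no-argument `ExampleProtoCoder()`) is:

import apache_beam as beam
from apache_beam.io import tfrecordio


class WriteTFExamples(beam.PTransform):
    """Hypothetical wrapper: writes encoded tf.Example records to compressed TFRecord files."""

    def __init__(self, file_path_prefix, coder):
        super(WriteTFExamples, self).__init__()
        self._file_path_prefix = file_path_prefix
        self._coder = coder

    def expand(self, features):
        return (features
                | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
                tfrecordio.WriteToTFRecord(
                    file_path_prefix=self._file_path_prefix,
                    file_name_suffix='.tfrecord.gz',
                    coder=self._coder))


# Usage inside a pipeline (path and coder are placeholders):
# _ = examples | 'WriteTrain' >> WriteTFExamples('gs://my-bucket/train/part', my_coder)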
def main(argv=None): '''Run Preprocessing as a Dataflow pipeline.''' args = parse_arguments(sys.argv if argv is None else argv) if args.cloud: logging.info('Start running in the cloud') options = { 'runner': 'DataflowRunner', 'job_name': ('mlengine-boilerplate-{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S'))), 'staging_location': os.path.join(args.output_dir, 'staging'), 'temp_location': os.path.join(args.output_dir, 'tmp'), 'project': args.project_id, 'zone': 'europe-west1-d', 'autoscaling_algorithm': 'THROUGHPUT_BASED', 'save_main_session': True, 'setup_file': './setup.py', } pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options) print(pipeline_options) else: pipeline_options = None train_coder = coders.ExampleProtoCoder(schema) p = beam.Pipeline(options=pipeline_options) examples = (p | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '/*', skip_header_lines=1) | 'buildExamples' >> beam.FlatMap(lambda raw_input: buildExample(raw_input))) examples_split = examples | beam.Partition(partition_fn, 3) example_dict = { 'train': examples_split[0], 'validation': examples_split[1], 'test': examples_split[2] } for part, examples in example_dict.items(): _ = examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(args.output_dir, part + '_examples'), compression_type=filesystem.CompressionTypes.GZIP, coder=train_coder, file_name_suffix='.gz') p.run()
def run(argv=None): """Runs the revise preprocessed data pipeline. Args: argv: Pipeline options as a list of arguments. """ pipeline_options = PipelineOptions(flags=argv) revise_options = pipeline_options.view_as(ReviseOptions) cloud_options = pipeline_options.view_as(GoogleCloudOptions) output_dir = os.path.join( revise_options.output, datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging') cloud_options.temp_location = os.path.join(output_dir, 'tmp') cloud_options.job_name = 'relabel-examples-%s' % ( datetime.datetime.now().strftime('%y%m%d-%H%M%S')) metadata_query = str( Template(open(revise_options.metadata, 'r').read()).render(METADATA_QUERY_REPLACEMENTS)) logging.info('metadata query : %s', metadata_query) with beam.Pipeline(options=pipeline_options) as p: # Gather our sample metadata into a python dictionary. samples_metadata = ( p | 'ReadSampleMetadata' >> beam.io.Read( beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True)) | 'TableToDictionary' >> beam.CombineGlobally( util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN))) # Read the tf.Example protos into a PCollection. examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord( file_pattern=revise_options.input, compression_type=CompressionTypes.GZIP) # Filter the TensorFlow Example Protocol Buffers. filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap( lambda example, samples_metadata: filter_and_revise_example( example, samples_metadata), beam.pvalue.AsSingleton(samples_metadata))) # Write the subset of tf.Example protos to Cloud Storage. _ = (filtered_examples | 'SerializeExamples' >> beam.Map(lambda example: example.SerializeToString()) | 'WriteExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(output_dir, 'examples'), compression_type=CompressionTypes.GZIP, file_name_suffix='.tfrecord.gz'))
def main(argv=None): """Run preprocessing as a Dataflow pipeline. Args: argv (list): list of arguments """ args = parse_arguments(sys.argv if argv is None else argv) if args.cloud: pipeline_options = get_cloud_pipeline_options() else: pipeline_options = None p = beam.Pipeline(options=pipeline_options) with beam_impl.Context(temp_dir=tempfile.mkdtemp()): # read data and join by key raw_data_input = (p | 'ReadInputData' >> beam.io.ReadFromText( TRAIN_INPUT_DATA, skip_header_lines=1) | 'ParseInputCSV' >> beam.Map(converter_input.decode) | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey)) raw_data_output = ( p | 'ReadOutputData' >> beam.io.ReadFromText(TRAIN_OUTPUT_DATA, skip_header_lines=1) | 'ParseOutputCSV' >> beam.Map(converter_output.decode) | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey)) raw_data = ((raw_data_input, raw_data_output) | 'JoinData' >> beam.CoGroupByKey() | 'RemoveKeys' >> beam.FlatMap(remove_keys)) # analyse and transform dataset raw_dataset = (raw_data, input_metadata) transformed_dataset, transform_fn = ( raw_dataset | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_data, transformed_metadata = transformed_dataset # save data and serialize TransformFn transformed_data_coder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) _ = (transformed_data | 'EncodeData' >> beam.Map(transformed_data_coder.encode) | 'WriteData' >> tfrecordio.WriteToTFRecord( os.path.join(TFRECORD_DIR, 'records'))) _ = (transform_fn | "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR)) p.run().wait_until_finish()
def store_transformed_data(data, schema, path, name=''):
    """Stores data from input pipeline into TFRecord in the specified path.

    Args:
      data: `PCollection`, input pipeline.
      schema: `DatasetMetadata` object, describes schema of the input pipeline.
      path: string, where to write output.
      name: string, name describing pipeline to be written.

    Returns:
      PCollection
    """
    p = (data
         | 'WriteData{}'.format(name) >> tfrecordio.WriteToTFRecord(
             path, coder=example_proto_coder.ExampleProtoCoder(schema.schema)))
    return p
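A hypothetical call site, assuming the `PCollection` and `DatasetMetadata` come out of an `AnalyzeAndTransformDataset` step as in the surrounding examples (the variable names and output directory below are placeholders):

# transformed_data, transformed_metadata are the outputs of AnalyzeAndTransformDataset.
_ = store_transformed_data(
    data=transformed_data,
    schema=transformed_metadata,  # DatasetMetadata; its .schema feeds the ExampleProtoCoder
    path=os.path.join(output_dir, 'train'),
    name='Train')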
def pipeline(root):
    """Method to pass into flume runner."""
    _ = (
        root
        | 'Read RecordIO TSV' >> beam.io.ReadFromText(FLAGS.input_path)
        | 'Validate sentence pair' >> beam.ParDo(
            ValidateSentencePair(FLAGS.min_edit_distance))
        | 'Select TSV columns' >> beam.ParDo(
            SelectTSVColumns(
                source_column=FLAGS.tsv_source_column,
                target_column=FLAGS.tsv_target_column))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Batch elements' >> beam.BatchElements(
            min_batch_size=1024, max_batch_size=1024)
        | 'Make tf.Examples' >> beam.ParDo(
            PrepareTfExamples(
                spm_path=FLAGS.spm_path,
                packed_length=FLAGS.packed_length,
                pad_length=FLAGS.pad_length))
        | 'Write to tf.Record' >> tfrecordio.WriteToTFRecord(FLAGS.output_path))
def main(argv=None): """Run preprocessing as a Dataflow pipeline. Args: argv (list): list of arguments """ args = parse_arguments(sys.argv if argv is None else argv) if args.cloud: pipeline_options = get_cloud_pipeline_options(args.project_id, args.output_dir) else: pipeline_options = None pipeline = beam.Pipeline(options=pipeline_options) examples = ( pipeline # | 'ReadData' >> beam.Create(open('data/test.csv') # .readlines()[1:]) | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '*', skip_header_lines=1) | 'BuildExamples' >> beam.FlatMap(build_example)) examples_split = examples | beam.Partition(partition_fn, 3) example_dict = { 'train': examples_split[0], 'validation': examples_split[1], 'test': examples_split[2] } for part, examples in example_dict.items(): examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(TFRECORD_DIR, part + '_examples'), compression_type=filesystem.CompressionTypes.GZIP, coder=coders.ExampleProtoCoder(schema), file_name_suffix='.tfrecord.gz') pipeline.run().wait_until_finish()
def run(argv=None): """Runs the sparse measurements preprocess pipeline. Args: argv: Pipeline options as a list of arguments. """ pipeline_options = PipelineOptions(flags=argv) preprocess_options = pipeline_options.view_as(PreprocessOptions) cloud_options = pipeline_options.view_as(GoogleCloudOptions) output_dir = os.path.join( preprocess_options.output, datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging') cloud_options.temp_location = os.path.join(output_dir, 'tmp') cloud_options.job_name = 'preprocess-measurements-%s' % ( datetime.datetime.now().strftime('%y%m%d-%H%M%S')) data_query = str( Template(open(preprocess_options.input, 'r').read()).render(DATA_QUERY_REPLACEMENTS)) logging.info('data query : %s', data_query) with beam.Pipeline(options=pipeline_options) as p: # Read the table rows into a PCollection. rows = p | 'ReadMeasurements' >> beam.io.Read( beam.io.BigQuerySource(query=data_query, use_standard_sql=True)) # Convert the data into TensorFlow Example Protocol Buffers. examples = measurements_to_examples(rows) # Write the serialized compressed protocol buffers to Cloud Storage. _ = (examples | 'EncodeExamples' >> beam.Map(lambda example: example.SerializeToString()) | 'WriteExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(output_dir, 'examples'), compression_type=CompressionTypes.GZIP, file_name_suffix='.tfrecord.gz'))
def pipeline(root): """Method to pass into flume runner.""" for i, tsv_in in enumerate( tf.io.gfile.glob(os.path.join(FLAGS.input_path, '*.tsv'))): print('Processing tsv input: %s' % tsv_in) tfr_out = tsv_in.replace('.tsv', '.tfr') num_output_shards = FLAGS.num_train_shards if 'train' in tsv_in else FLAGS.num_guide_shards _ = (root | 'Read RecordIO TSV__%s' % i >> beam.io.ReadFromText(tsv_in) | 'Validate sentence pair__%s' % i >> beam.ParDo( ValidateSentencePair(FLAGS.min_edit_distance)) | 'Select TSV columns__%s' % i >> beam.ParDo( SelectTSVColumns(source_column=FLAGS.tsv_source_column, target_column=FLAGS.tsv_target_column)) | 'Reshuffle__%s' % i >> beam.Reshuffle() | 'Batch elements__%s' % i >> beam.BatchElements( min_batch_size=1024, max_batch_size=1024) | 'Make tf.Examples__%s' % i >> beam.ParDo( PrepareTfExamples(spm_path=FLAGS.spm_path, packed_length=FLAGS.packed_length, pad_length=FLAGS.pad_length)) | 'Write to tf.Record__%s' % i >> tfrecordio.WriteToTFRecord( tfr_out, num_shards=num_output_shards))
def run(p, params): """Defines Beam preprocessing pipeline. Performs the following: - Reads text files from pattern. - Split text files in train and validation sets. Args: p: PCollection, initial pipeline. params: Object holding a set of parameters as name-value pairs. """ path_pattern = os.path.join(params.input_dir, '*', '*{}'.format(constants.FILE_EXTENSION)) data = (p | 'ListFiles' >> beam.Create(gfile.Glob(path_pattern)) | 'ReadFiles' >> beam.ParDo(ReadFile()) | 'SplitData' >> beam.ParDo( _SplitData(), train_size=params.train_size, val_label=_DatasetType.VAL.name).with_outputs( _DatasetType.VAL.name, main=_DatasetType.TRAIN.name)) schema = dataset_schema.from_feature_spec( utils.get_processed_data_schema()) for dataset in _DatasetType: if not dataset.value: continue _ = ( data[dataset.name] | 'Shuffle{}'.format(dataset.name) >> shuffle() # pylint: disable=no-value-for-parameter | 'WriteFiles{}'.format(dataset.name) >> tfrecordio.WriteToTFRecord( os.path.join(params.output_dir, dataset.name + constants.TFRECORD), coder=example_proto_coder.ExampleProtoCoder(schema)))
def preprocess(query, in_test_mode): import os import os.path import tempfile from apache_beam.io import tfrecordio from tensorflow_transform.coders import example_proto_coder from tensorflow_transform.tf_metadata import dataset_metadata from tensorflow_transform.tf_metadata import dataset_schema from tensorflow_transform.beam.tft_beam_io import transform_fn_io job_name = 'preprocess-babyweight-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S') if in_test_mode: import shutil print 'Launching local job ... hang on' OUTPUT_DIR = './preproc_tft' shutil.rmtree(OUTPUT_DIR, ignore_errors=True) else: print 'Launching Dataflow job {} ... hang on'.format(job_name) OUTPUT_DIR = 'gs://{0}/babyweight/preproc_tft/'.format(BUCKET) import subprocess subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split()) options = { 'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'), 'temp_location': os.path.join(OUTPUT_DIR, 'tmp'), 'job_name': job_name, 'project': PROJECT, 'max_num_workers': 24, 'teardown_policy': 'TEARDOWN_ALWAYS', 'no_save_main_session': True, 'requirements_file': 'requirements.txt' } opts = beam.pipeline.PipelineOptions(flags=[], **options) if in_test_mode: RUNNER = 'DirectRunner' else: RUNNER = 'DataflowRunner' # set up metadata raw_data_schema = { colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation()) for colname in 'key,is_male,mother_race,mother_married,cigarette_use,alcohol_use'.split(',') } raw_data_schema.update({ colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation()) for colname in 'weight_pounds,mother_age,plurality,gestation_weeks'.split(',') }) raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema)) def read_rawdata(p, step, test_mode): if step == 'train': selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) < 3'.format(query) else: selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) = 3'.format(query) if in_test_mode: selquery = selquery + ' LIMIT 100' #print 'Processing {} data from {}'.format(step, selquery) return (p | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query=selquery, use_standard_sql=True)) | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup) ) # run Beam with beam.Pipeline(RUNNER, options=opts) as p: with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')): # analyze and transform training raw_data = read_rawdata(p, 'train', in_test_mode) raw_dataset = (raw_data, raw_data_metadata) transformed_dataset, transform_fn = ( raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft)) transformed_data, transformed_metadata = transformed_dataset _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(OUTPUT_DIR, 'train'), coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema)) # transform eval data raw_test_data = read_rawdata(p, 'eval', in_test_mode) raw_test_dataset = (raw_test_data, raw_data_metadata) transformed_test_dataset = ( (raw_test_dataset, transform_fn) | beam_impl.TransformDataset()) transformed_test_data, _ = transformed_test_dataset _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord( os.path.join(OUTPUT_DIR, 'eval'), coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema)) _ = (transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata'))) job = p.run()
def transform_data(working_dir): """Transform the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, and transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: working_dir: Directory to read shuffled data from and write transformed data and metadata to. """ with beam.Pipeline() as pipeline: with beam_impl.Context(temp_dir=tempfile.mkdtemp()): coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema) train_data = (pipeline | 'ReadTrain' >> tfrecordio.ReadFromTFRecord( os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*')) | 'DecodeTrain' >> beam.Map(coder.decode)) test_data = (pipeline | 'ReadTest' >> tfrecordio.ReadFromTFRecord( os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*')) | 'DecodeTest' >> beam.Map(coder.decode)) def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" review = inputs[REVIEW_KEY] review_tokens = tf.string_split(review, DELIMITERS) review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE) # Add one for the oov bucket created by string_to_int. review_bow_indices, review_weight = tft.tfidf( review_indices, VOCAB_SIZE + 1) return { REVIEW_KEY: review_bow_indices, REVIEW_WEIGHT_KEY: review_weight, LABEL_KEY: inputs[LABEL_KEY] } (transformed_train_data, transformed_metadata), transform_fn = ( (train_data, RAW_DATA_METADATA) | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_data_coder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) transformed_test_data, _ = ( ((test_data, RAW_DATA_METADATA), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) _ = (transformed_train_data | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode) | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE))) _ = ( transformed_test_data | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode) | 'WriteTestData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE))) # Will write a SavedModel and metadata to two subdirectories of # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. _ = (transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.

    This runs the feature extraction PTransform, validates that the data
    conforms to the schema provided, normalizes the features, and splits the
    dataset into a training and evaluation dataset.
    """

    # Populate optional arguments
    if not feature_scaling:
        feature_scaling = lambda inputs: inputs

    # Type checking
    if not isinstance(labels, list):
        raise ValueError('`labels` must be list(str). '
                         'Given: {} {}'.format(labels, type(labels)))

    if not isinstance(feature_extraction, beam.PTransform):
        raise ValueError('`feature_extraction` must be {}. '
                         'Given: {} {}'.format(beam.PTransform, feature_extraction,
                                               type(feature_extraction)))

    if not callable(feature_scaling):
        raise ValueError('`feature_scaling` must be callable. '
                         'Given: {} {}'.format(feature_scaling,
                                               type(feature_scaling)))

    if beam_options and not isinstance(beam_options, PipelineOptions):
        raise ValueError('`beam_options` must be {}. '
                         'Given: {} {}'.format(PipelineOptions, beam_options,
                                               type(beam_options)))

    if not work_dir:
        work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

    tft_temp_dir = os.path.join(work_dir, 'tft-temp')
    train_dataset_dir = os.path.join(work_dir, 'train-dataset')
    eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    # if tf.gfile.Exists(transform_fn_dir):
    if tf.io.gfile.exists(transform_fn_dir):
        tf.gfile.DeleteRecursively(transform_fn_dir)

    # [START dataflow_molecules_create_pipeline]
    # Build and run a Beam Pipeline
    with beam.Pipeline(options=beam_options) as p, \
            beam_impl.Context(temp_dir=tft_temp_dir):
        # [END dataflow_molecules_create_pipeline]

        # [START dataflow_molecules_feature_extraction]
        # Transform and validate the input data matches the input schema
        dataset = (
            p
            | 'Feature extraction' >> feature_extraction
            # [END dataflow_molecules_feature_extraction]
            # [START dataflow_molecules_validate_inputs]
            | 'Validate inputs' >> beam.ParDo(
                ValidateInputData(input_feature_spec)))
        # [END dataflow_molecules_validate_inputs]

        # [START dataflow_molecules_analyze_and_transform_dataset]
        # Apply the tf.Transform preprocessing_fn
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(input_feature_spec))

        dataset_and_metadata, transform_fn = (
            (dataset, input_metadata)
            | 'Feature scaling' >> beam_impl.AnalyzeAndTransformDataset(
                feature_scaling))
        dataset, metadata = dataset_and_metadata
        # [END dataflow_molecules_analyze_and_transform_dataset]

        # [START dataflow_molecules_split_to_train_and_eval_datasets]
        # Split the dataset into a training set and an evaluation set
        assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
        train_dataset, eval_dataset = (
            dataset
            | 'Split dataset' >> beam.Partition(
                lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
        # [END dataflow_molecules_split_to_train_and_eval_datasets]

        # [START dataflow_molecules_write_tfrecords]
        # Write the datasets as TFRecords
        coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

        train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
        _ = (
            train_dataset
            | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
                train_dataset_prefix, coder))

        eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
        _ = (
            eval_dataset
            | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
                eval_dataset_prefix, coder))

        # Write the transform_fn
        _ = (
            transform_fn
            | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
        # [END dataflow_molecules_write_tfrecords]

    return PreprocessData(
        input_feature_spec,
        labels,
        train_dataset_prefix + '*',
        eval_dataset_prefix + '*')
def transform_data(train_data_file, test_data_file, working_dir): """Transform the data and write out as a TFRecord of Example protos. Read in the data using the CSV reader, and transform it using a preprocessing pipeline that scales numeric data and converts categorical data from strings to int64 values indices, by creating a vocabulary for each category. Args: train_data_file: File containing training data test_data_file: File containing test data working_dir: Directory to write transformed data and metadata to """ def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" outputs = {} # Scale numeric columns to have range [0, 1]. for key in NUMERIC_FEATURE_KEYS: outputs[key] = tft.scale_to_0_1(inputs[key]) # For all categorical columns except the label column, we use # tft.string_to_int which computes the set of unique values and uses this # to convert the strings to indices. for key in CATEGORICAL_FEATURE_KEYS: outputs[key] = tft.string_to_int(inputs[key]) # For the label column we provide the mapping from string to index. def convert_label(label): table = lookup.index_table_from_tensor(['>50K', '<=50K']) return table.lookup(label) outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY]) return outputs # The "with" block will create a pipeline, and run that pipeline at the exit # of the block. with beam.Pipeline() as pipeline: with tft.Context(temp_dir=tempfile.mkdtemp()): # Create a coder to read the census data with the schema. To do this we # need to list all columns in order since the schema doesn't specify the # order of columns in the csv. ordered_columns = [ 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label' ] converter = tft.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema) # Read in raw data and convert using CSV converter. Note that we apply # some Beam transformations here, which will not be encoded in the TF # graph since we don't do the from within tf.Transform's methods # (AnalyzeDataset, TransformDataset etc.). These transformations are just # to get data into a format that the CSV converter can read, in particular # removing empty lines and removing spaces after commas. raw_data = (pipeline | 'ReadTrainData' >> textio.ReadFromText(train_data_file) | 'FilterTrainData' >> beam.Filter(lambda line: line) | 'FixCommasTrainData' >> beam.Map(lambda line: line.replace(', ', ',')) | 'DecodeTrainData' >> beam.Map(converter.decode)) # Combine data and schema into a dataset tuple. Note that we already used # the schema to read the CSV data, but we also need it to interpret # raw_data. raw_dataset = (raw_data, RAW_DATA_METADATA) transformed_dataset, transform_fn = ( raw_dataset | tft.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_data, transformed_metadata = transformed_dataset _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE), coder=tft.ExampleProtoCoder(transformed_metadata.schema)) # Now apply transform function to test data. In this case we also remove # the header line from the CSV file and the trailing period at the end of # each line. 
raw_test_data = ( pipeline | 'ReadTestData' >> textio.ReadFromText(test_data_file) | 'FilterTestData' >> beam.Filter( lambda line: line and line != '|1x3 Cross validator') | 'FixCommasTestData' >> beam.Map(lambda line: line.replace(', ', ',')) | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1]) | 'DecodeTestData' >> beam.Map(converter.decode)) raw_test_dataset = (raw_test_data, RAW_DATA_METADATA) transformed_test_dataset = ((raw_test_dataset, transform_fn) | tft.TransformDataset()) # Don't need transformed data schema, it's the same as before. transformed_test_data, _ = transformed_test_dataset _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE), coder=tft.ExampleProtoCoder(transformed_metadata.schema)) # Will write a SavedModel and metadata to two subdirectories of # working_dir, given by tft.TRANSFORM_FN_DIR and # tft.TRANSFORMED_METADATA_DIR respectively. _ = (transform_fn | 'WriteTransformFn' >> tft.WriteTransformFn(working_dir))
def write_to_tfrecord(args): """ This function is supposed to be called as a script. """ # Decode arguments current_index, num_shards, train_split_fname_out, eval_split_fname_out, \ exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, working_dir, data_formatter_module_path = args # num_shards = "32" current_index, num_shards = int(current_index), int(num_shards) split_train_file_pattern = '{}-{:05}-of-{:05}'.format( train_split_fname_out, current_index, num_shards) + '*' split_eval_file_pattern = '{}-{:05}-of-{:05}'.format( eval_split_fname_out, current_index, num_shards) log.info('exp_log_data_file_train_tfrecord {}'.format( exp_log_data_file_train_tfrecord)) log.info('exp_log_data_file_eval_tfrecord {}'.format( exp_log_data_file_eval_tfrecord)) log.info('split_train_file_pattern {}'.format(split_train_file_pattern)) log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern)) data_formatter = import_from_uri( data_formatter_module_path).DataFormatter() # Set up the preprocessing pipeline. pipeline = beam.Pipeline(runner=DirectRunner()) with beam_impl.Context(temp_dir=tempfile.mkdtemp()): # Read raw data files: CSV format ordered according to the `data_formatter`, that are then converted # into a cleaned up format. raw_train_data = ( pipeline | 'ReadTrainDataFile' >> textio.ReadFromText( split_train_file_pattern, skip_header_lines=0) | 'DecodeTrainDataCSV' >> MapAndFilterErrors( tft_coders.CsvCoder( data_formatter.get_features_and_targets(), data_formatter.get_features_metadata().schema).decode)) raw_eval_data = ( pipeline | 'ReadEvalDataFile' >> textio.ReadFromText( split_eval_file_pattern, skip_header_lines=0) | 'DecodeEvalDataCSV' >> MapAndFilterErrors( tft_coders.CsvCoder( data_formatter.get_features_and_targets(), data_formatter.get_features_metadata().schema).decode)) # Examples in tf-example format (for model analysis purposes). # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec() # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec) # coder = example_proto_coder.ExampleProtoCoder(raw_schema) # # _ = ( # raw_eval_data # | 'ToSerializedTFExample' >> beam.Map(coder.encode) # | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord( # '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards), # shard_name_template='', num_shards=1) # ) # Write SavedModel and metadata to two subdirectories of working_dir, given by # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively. transform_fn = (pipeline | 'ReadTransformGraph' >> transform_fn_io.ReadTransformFn(working_dir)) # Applies the transformation `transform_fn` to the raw eval dataset (transformed_train_data, transformed_metadata) = ( ((raw_train_data, data_formatter.get_features_metadata()), transform_fn) | 'TransformTrainData' >> beam_impl.TransformDataset()) # Applies the transformation `transform_fn` to the raw eval dataset (transformed_eval_data, transformed_metadata) = ( ((raw_eval_data, data_formatter.get_features_metadata()), transform_fn) | 'TransformEvalData' >> beam_impl.TransformDataset()) # The data schema of the transformed data gets used to build a signature to create # a TFRecord (tf binary data format). This signature is a wrapper function used to # encode transformed data. 
transformed_data_coder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) _ = (transformed_train_data | 'EncodeTrainDataTransform' >> MapAndFilterErrors( transformed_data_coder.encode) | 'WriteTrainDataTFRecord' >> tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format( exp_log_data_file_train_tfrecord, current_index, num_shards), shard_name_template='', num_shards=1)) _ = (transformed_eval_data | 'EncodeEvalDataTransform' >> MapAndFilterErrors( transformed_data_coder.encode) | 'WriteEvalDataTFRecord' >> tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format( exp_log_data_file_eval_tfrecord, current_index, num_shards), shard_name_template='', num_shards=1)) result = pipeline.run() result.wait_until_finish()
def transform_data(train_data_file, test_data_file, working_dir): """Transform the data and write out as a TFRecord of Example protos. Read in the data using the CSV reader, and transform it using a preprocessing pipeline that scales numeric data and converts categorical data from strings to int64 values indices, by creating a vocabulary for each category. Args: train_data_file: File containing training data test_data_file: File containing test data working_dir: Directory to write transformed data and metadata to """ def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" # Since we are modifying some features and leaving others unchanged, we # start by setting `outputs` to a copy of `inputs. outputs = inputs.copy() # Scale numeric columns to have range [0, 1]. for key in NUMERIC_FEATURE_KEYS: outputs[key] = tft.scale_to_0_1(outputs[key]) # For all categorical columns except the label column, we generate a # vocabulary but do not modify the feature. This vocabulary is instead # used in the trainer, by means of a feature column, to convert the feature # from a string to an integer id. for key in CATEGORICAL_FEATURE_KEYS: tft.vocabulary(inputs[key], vocab_filename=key) # For the label column we provide the mapping from string to index. table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K']) outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY]) return outputs # The "with" block will create a pipeline, and run that pipeline at the exit # of the block. with beam.Pipeline() as pipeline: with beam_impl.Context(temp_dir=tempfile.mkdtemp()): # Create a coder to read the census data with the schema. To do this we # need to list all columns in order since the schema doesn't specify the # order of columns in the csv. ordered_columns = [ 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label' ] converter = tft.coders.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema) # Read in raw data and convert using CSV converter. Note that we apply # some Beam transformations here, which will not be encoded in the TF # graph since we don't do the from within tf.Transform's methods # (AnalyzeDataset, TransformDataset etc.). These transformations are just # to get data into a format that the CSV converter can read, in particular # removing spaces after commas. # # We use MapAndFilterErrors instead of Map to filter out decode errors in # convert.decode which should only occur for the trailing blank line. raw_data = ( pipeline | 'ReadTrainData' >> textio.ReadFromText(train_data_file) | 'FixCommasTrainData' >> beam.Map( lambda line: line.replace(', ', ',')) | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode)) # Combine data and schema into a dataset tuple. Note that we already used # the schema to read the CSV data, but we also need it to interpret # raw_data. raw_dataset = (raw_data, RAW_DATA_METADATA) transformed_dataset, transform_fn = ( raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_data, transformed_metadata = transformed_dataset transformed_data_coder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) _ = ( transformed_data | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode) | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE))) # Now apply transform function to test data. 
In this case we remove the # trailing period at the end of each line, and also ignore the header line # that is present in the test data file. raw_test_data = ( pipeline | 'ReadTestData' >> textio.ReadFromText(test_data_file, skip_header_lines=1) | 'FixCommasTestData' >> beam.Map( lambda line: line.replace(', ', ',')) | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1]) | 'DecodeTestData' >> MapAndFilterErrors(converter.decode)) raw_test_dataset = (raw_test_data, RAW_DATA_METADATA) transformed_test_dataset = ( (raw_test_dataset, transform_fn) | beam_impl.TransformDataset()) # Don't need transformed data schema, it's the same as before. transformed_test_data, _ = transformed_test_dataset _ = ( transformed_test_data | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode) | 'WriteTestData' >> tfrecordio.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE))) # Will write a SavedModel and metadata to two subdirectories of # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and # transform_fn_io.TRANSFORMED_METADATA_DIR respectively. _ = ( transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(working_dir))
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float): import os import shutil from kale.utils import pod_utils from kale.marshal import resource_save as _kale_resource_save from kale.marshal import resource_load as _kale_resource_load _kale_data_directory = "/marshal" if not os.path.isdir(_kale_data_directory): os.makedirs(_kale_data_directory, exist_ok=True) # -----------------------DATA LOADING START-------------------------------- _kale_directory_file_names = [ os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory) if os.path.isfile(os.path.join(_kale_data_directory, f)) ] if "column_names" not in _kale_directory_file_names: raise ValueError("column_names" + " does not exists in directory") _kale_load_file_name = [ f for f in os.listdir(_kale_data_directory) if os.path.isfile(os.path.join(_kale_data_directory, f)) and os.path.splitext(f)[0] == "column_names" ] if len(_kale_load_file_name) > 1: raise ValueError("Found multiple files with name " + "column_names" + ": " + str(_kale_load_file_name)) _kale_load_file_name = _kale_load_file_name[0] column_names = _kale_resource_load(os.path.join( _kale_data_directory, _kale_load_file_name)) if "schema" not in _kale_directory_file_names: raise ValueError("schema" + " does not exists in directory") _kale_load_file_name = [ f for f in os.listdir(_kale_data_directory) if os.path.isfile(os.path.join(_kale_data_directory, f)) and os.path.splitext(f)[0] == "schema" ] if len(_kale_load_file_name) > 1: raise ValueError("Found multiple files with name " + "schema" + ": " + str(_kale_load_file_name)) _kale_load_file_name = _kale_load_file_name[0] schema = _kale_resource_load(os.path.join( _kale_data_directory, _kale_load_file_name)) # -----------------------DATA LOADING END---------------------------------- import os import shutil import logging import apache_beam as beam import tensorflow as tf import tensorflow_transform as tft import tensorflow_model_analysis as tfma import tensorflow_data_validation as tfdv from apache_beam.io import textio from apache_beam.io import tfrecordio from tensorflow_transform.beam import impl as beam_impl from tensorflow_transform.beam.tft_beam_io import transform_fn_io from tensorflow_transform.coders.csv_coder import CsvCoder from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder from tensorflow_transform.tf_metadata import dataset_metadata from tensorflow_transform.tf_metadata import metadata_io DATA_DIR = 'data/' TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv') EVALUATION_DATA = os.path.join( DATA_DIR, 'taxi-cab-classification/eval.csv') # Categorical features are assumed to each have a maximum value in the dataset. MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12] CATEGORICAL_FEATURE_KEYS = ['trip_start_hour', 'trip_start_day', 'trip_start_month'] DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds'] # Number of buckets used by tf.transform for encoding each feature. FEATURE_BUCKET_COUNT = 10 BUCKET_FEATURE_KEYS = [ 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'] # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform VOCAB_SIZE = 1000 # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed. OOV_SIZE = 10 VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company', 'pickup_community_area', 'dropoff_community_area'] # allow nan values in these features. 
OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract', 'company', 'trip_seconds', 'dropoff_community_area'] LABEL_KEY = 'tips' FARE_KEY = 'fare' tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) # tf.get_logger().setLevel(logging.ERROR) def to_dense(tensor): """Takes as input a SparseTensor and return a Tensor with correct default value Args: tensor: tf.SparseTensor Returns: tf.Tensor with default value """ if not isinstance(tensor, tf.sparse.SparseTensor): return tensor if tensor.dtype == tf.string: default_value = '' elif tensor.dtype == tf.float32: default_value = 0.0 elif tensor.dtype == tf.int32: default_value = 0 else: raise ValueError(f"Tensor type not recognized: {tensor.dtype}") return tf.squeeze(tf.sparse_to_dense(tensor.indices, [tensor.dense_shape[0], 1], tensor.values, default_value=default_value), axis=1) # TODO: Update to below version # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1) def preprocess_fn(inputs): """tf.transform's callback function for preprocessing inputs. Args: inputs: map from feature keys to raw not-yet-transformed features. Returns: Map from string feature key to transformed feature operations. """ outputs = {} for key in DENSE_FLOAT_FEATURE_KEYS: # Preserve this feature as a dense float, setting nan's to the mean. outputs[key] = tft.scale_to_z_score(to_dense(inputs[key])) for key in VOCAB_FEATURE_KEYS: # Build a vocabulary for this feature. if inputs[key].dtype == tf.string: vocab_tensor = to_dense(inputs[key]) else: vocab_tensor = tf.as_string(to_dense(inputs[key])) outputs[key] = tft.compute_and_apply_vocabulary( vocab_tensor, vocab_filename='vocab_' + key, top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE) for key in BUCKET_FEATURE_KEYS: outputs[key] = tft.bucketize( to_dense(inputs[key]), FEATURE_BUCKET_COUNT) for key in CATEGORICAL_FEATURE_KEYS: outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64) taxi_fare = to_dense(inputs[FARE_KEY]) taxi_tip = to_dense(inputs[LABEL_KEY]) # Test if the tip was > 20% of the fare. 
tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2)) outputs[LABEL_KEY] = tf.logical_and( tf.logical_not(tf.math.is_nan(taxi_fare)), tf.greater(taxi_tip, tip_threshold)) for key in outputs: if outputs[key].dtype == tf.bool: outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]), vocab_filename='vocab_' + key) return outputs trns_output = os.path.join(DATA_DIR, "transformed") if os.path.exists(trns_output): shutil.rmtree(trns_output) tft_input_metadata = dataset_metadata.DatasetMetadata(schema) runner = 'DirectRunner' with beam.Pipeline(runner, options=None) as p: with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')): converter = CsvCoder(column_names, tft_input_metadata.schema) # READ TRAIN DATA train_data = ( p | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1) | 'DecodeTrainData' >> beam.Map(converter.decode)) # TRANSFORM TRAIN DATA (and get transform_fn function) transformed_dataset, transform_fn = ( (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn)) transformed_data, transformed_metadata = transformed_dataset # SAVE TRANSFORMED TRAIN DATA _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(trns_output, 'train'), coder=ExampleProtoCoder(transformed_metadata.schema)) # READ EVAL DATA eval_data = ( p | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1) | 'DecodeEvalData' >> beam.Map(converter.decode)) # TRANSFORM EVAL DATA (using previously created transform_fn function) eval_dataset = (eval_data, tft_input_metadata) transformed_eval_data, transformed_metadata = ( (eval_dataset, transform_fn) | beam_impl.TransformDataset()) # SAVE EVAL DATA _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord( os.path.join(trns_output, 'eval'), coder=ExampleProtoCoder(transformed_metadata.schema)) # SAVE transform_fn FUNCTION FOR LATER USE # TODO: check out what is the transform function (transform_fn) that came from previous step _ = (transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(trns_output)) # SAVE TRANSFORMED METADATA metadata_io.write_metadata( metadata=tft_input_metadata, path=os.path.join(trns_output, 'metadata')) # -----------------------DATA SAVING START--------------------------------- if "trns_output" in locals(): _kale_resource_save(trns_output, os.path.join( _kale_data_directory, "trns_output")) else: print("_kale_resource_save: `trns_output` not found.")
              beam.FlatMap(lambda stream: [stream.detrend('demean')])
          | 'Remove trend' >> beam.FlatMap(lambda stream: [stream.detrend('linear')])
          | 'Resample to 100 Hz' >> beam.FlatMap(lambda stream: [stream.resample(100)])
          | 'Trim traces' >> beam.ParDo(TrimTrace(points=3001)))

    station_location = location | '(sta, loc)' >> beam.FlatMap(
        lambda loc: [(loc['station'], loc)])
    station_pick = picks | '(sta, pick)' >> beam.FlatMap(
        lambda pick: [(pick.waveform_id.station_code, pick)])
    station_stream = streams | '(sta, stream)' >> beam.FlatMap(
        lambda stream: [(stream[0].stats.station, stream)])

    dataset = (
        {
            'pick': station_pick,
            'stream': station_stream,
            'location': station_location
        }
        | 'Join by station' >> beam.CoGroupByKey()
        | 'Drop empty station' >> beam.ParDo(DropEmptyStation())
        | 'Group stream pick by time' >> GroupStreamPick()
        | 'Generate stream PDFs' >> beam.ParDo(GeneratePDF(sigma=0.1))
        | 'Extract stream features' >> beam.ParDo(StreamFeatureExtraction()))

    transform = (
        dataset
        | 'Feature to Example' >> beam.ParDo(FeatureToExample())
        | 'Write dataset' >> tfrecordio.WriteToTFRecord(
            tfrecord_dir,
            # ProtoCoder must be instantiated with the proto message type;
            # tf.train.Example is assumed here since FeatureToExample emits Example protos.
            coder=beam.coders.ProtoCoder(tf.train.Example)))
def transform_data(train_neg_filepattern, train_pos_filepattern, test_neg_filepattern, test_pos_filepattern, transformed_train_filebase, transformed_test_filebase, transformed_metadata_dir): """Transform the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, and transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: train_neg_filepattern: Filepattern for training data negative examples train_pos_filepattern: Filepattern for training data positive examples test_neg_filepattern: Filepattern for test data negative examples test_pos_filepattern: Filepattern for test data positive examples transformed_train_filebase: Base filename for transformed training data shards transformed_test_filebase: Base filename for transformed test data shards transformed_metadata_dir: Directory where metadata for transformed data should be written """ with beam.Pipeline() as pipeline: with beam_impl.Context(temp_dir=tempfile.mkdtemp()): # pylint: disable=no-value-for-parameter train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData( (train_neg_filepattern, train_pos_filepattern)) # pylint: disable=no-value-for-parameter test_data = pipeline | 'ReadTest' >> ReadAndShuffleData( (test_neg_filepattern, test_pos_filepattern)) metadata = dataset_metadata.DatasetMetadata( dataset_schema.Schema({ REVIEW_COLUMN: dataset_schema.ColumnSchema( tf.string, [], dataset_schema.FixedColumnRepresentation()), LABEL_COLUMN: dataset_schema.ColumnSchema( tf.int64, [], dataset_schema.FixedColumnRepresentation()), })) def preprocessing_fn(inputs): """Preprocess input columns into transformed columns.""" review = inputs[REVIEW_COLUMN] review_tokens = tf.string_split(review, DELIMITERS) review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE) # Add one for the oov bucket created by string_to_int. review_bow_indices, review_weight = tft.tfidf( review_indices, VOCAB_SIZE + 1) return { REVIEW_COLUMN: review_bow_indices, REVIEW_WEIGHT: review_weight, LABEL_COLUMN: inputs[LABEL_COLUMN] } (transformed_train_data, transformed_metadata), transform_fn = ( (train_data, metadata) | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_test_data, _ = ( ((test_data, metadata), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) _ = (transformed_train_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( transformed_train_filebase, coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema))) _ = (transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord( transformed_test_filebase, coder=example_proto_coder.ExampleProtoCoder( transformed_metadata.schema))) _ = (transformed_metadata | 'WriteMetadata' >> beam_metadata_io.WriteMetadata( transformed_metadata_dir, pipeline=pipeline))
def preprocess_data(train_neg_file_pattern, train_pos_file_pattern, test_neg_file_pattern, test_pos_file_pattern, transformed_train_file_pattern, transformed_test_file_pattern, transformed_metadata_dir, raw_metadata_dir, transform_func_dir, temp_dir, vocab_size, delimiters): """Transform the data and write out as a TFRecord of Example protos. Read in the data from the positive and negative examples on disk, and transform it using a preprocessing pipeline that removes punctuation, tokenizes and maps tokens to int64 values indices. Args: train_neg_filepattern: Filepattern for training data negative examples train_pos_filepattern: Filepattern for training data positive examples test_neg_filepattern: Filepattern for test data negative examples test_pos_filepattern: Filepattern for test data positive examples transformed_train_filebase: Base filename for transformed training data shards transformed_test_filebase: Base filename for transformed test data shards transformed_metadata_dir: Directory where metadata for transformed data should be written raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({ REVIEW_COLUMN: dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation()), LABEL_COLUMN: dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation()), })) """ pipeline_name = 'DataflowRunner' options = { 'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))), 'temp_location': temp_dir, 'project': "stone-outpost-636", 'max_num_workers': 8 } pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options) #with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline: # with beam_impl.Context(temp_dir=temp_dir): with beam.Pipeline() as pipeline: with beam_impl.Context(temp_dir=tempfile.mkdtemp()): train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData((train_neg_file_pattern, train_pos_file_pattern)) test_data = pipeline | 'ReadTest' >> ReadAndShuffleData((test_neg_file_pattern, test_pos_file_pattern)) preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters) (transformed_train_data, transformed_metadata), transform_fn = ((train_data, const.RAW_METADATA) | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(transform_func_dir)) transformed_test_data, _ = (((test_data, const.RAW_METADATA), transform_fn) | 'Transform' >> beam_impl.TransformDataset()) _ = (transformed_train_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(transformed_train_file_pattern, coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema))) _ = (transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(transformed_test_file_pattern, coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema))) _ = (transformed_metadata | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(transformed_metadata_dir, pipeline=pipeline)) _ = (const.RAW_METADATA | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(raw_metadata_dir, pipeline=pipeline))
def run_transform(output_dir, schema, train_data_file, eval_data_file, project, mode, preprocessing_fn=None): """Writes a tft transform fn, and metadata files. Args: output_dir: output folder schema: schema list. train_data_file: training data file pattern. eval_data_file: eval data file pattern. project: the project to run dataflow in. local: whether the job should be local or cloud. preprocessing_fn: a function used to preprocess the raw data. If not specified, a function will be automatically inferred from the schema. """ tft_input_metadata = make_tft_input_metadata(schema) temp_dir = os.path.join(output_dir, 'tmp') preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema) if mode == 'local': pipeline_options = None runner = 'DirectRunner' elif mode == 'cloud': options = { 'job_name': 'pipeline-tft-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S'), 'temp_location': temp_dir, 'project': project, 'extra_packages': [ 'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz' ] } pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options) runner = 'DataFlowRunner' else: raise ValueError("Invalid mode %s." % mode) with beam.Pipeline(runner, options=pipeline_options) as p: with beam_impl.Context(temp_dir=temp_dir): names = [x['name'] for x in schema] converter = CsvCoder(names, tft_input_metadata.schema) train_data = ( p | 'ReadTrainData' >> textio.ReadFromText(train_data_file) | 'DecodeTrainData' >> beam.Map(converter.decode)) train_dataset = (train_data, tft_input_metadata) transformed_dataset, transform_fn = ( train_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_data, transformed_metadata = transformed_dataset # Writes transformed_metadata and transfrom_fn folders _ = (transform_fn | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(output_dir)) # Write the raw_metadata metadata_io.write_metadata(metadata=tft_input_metadata, path=os.path.join( output_dir, 'metadata')) _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord( os.path.join(output_dir, 'train'), coder=ExampleProtoCoder(transformed_metadata.schema)) eval_data = (p | 'ReadEvalData' >> textio.ReadFromText(eval_data_file) | 'DecodeEvalData' >> beam.Map(converter.decode)) eval_dataset = (eval_data, tft_input_metadata) transformed_eval_dataset = ((eval_dataset, transform_fn) | beam_impl.TransformDataset()) transformed_eval_data, transformed_metadata = transformed_eval_dataset _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord( os.path.join(output_dir, 'eval'), coder=ExampleProtoCoder(transformed_metadata.schema))
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data should be written
  """
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        def remove_character(s, char):
          """Remove a character from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string
            char: A string of length 1

          Returns:
            The string `s` with the given character removed (i.e. replaced by '')
          """
          # Hacky implementation where we split and rejoin.
          split = tf.string_split(s, char)
          rejoined = tf.reduce_join(
              tf.sparse_to_dense(
                  split.indices, split.dense_shape, split.values, ''),
              1)
          return rejoined

        def remove_punctuation(s):
          """Remove punctuation from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string

          Returns:
            The string `s` with punctuation removed.
          """
          for char in PUNCTUATION_CHARACTERS:
            s = remove_character(s, char)
          return s

        cleaned_review = tft.map(remove_punctuation, review)
        review_tokens = tft.map(tf.string_split, cleaned_review)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        return {
            REVIEW_COLUMN: review_indices,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_metadata
          | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
              transformed_metadata_dir, pipeline=pipeline))
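# --- Hedged sketch (not part of the original source) ---
# One way to consume the TFRecords written by transform_data above is to read
# the transformed metadata back and derive a feature spec from it. This sketch
# assumes TF 1.x, the metadata_io module from tensorflow_transform, and the
# LABEL_COLUMN constant used above; make_training_input_fn and its arguments
# are hypothetical names, not part of the original pipeline.
from tensorflow_transform.tf_metadata import metadata_io

def make_training_input_fn(transformed_metadata_dir, file_pattern, batch_size=64):
  """Returns an input_fn that parses the transformed Example protos."""
  transformed_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  feature_spec = transformed_metadata.schema.as_feature_spec()

  def input_fn():
    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    dataset = dataset.batch(batch_size)
    # Parse each batch of serialized Examples with the transformed feature spec.
    dataset = dataset.map(
        lambda records: tf.parse_example(records, feature_spec))
    features = dataset.make_one_shot_iterator().get_next()
    labels = features.pop(LABEL_COLUMN)
    return features, labels
  return input_fn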
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a preprocessing
  pipeline that scales numeric data and converts categorical data from strings
  to int64 value indices, by creating a vocabulary for each category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data should be written
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)
    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

      # Read in raw data and convert using CSV converter. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are just
      # to get data into a format that the CSV converter can read, in
      # particular removing empty lines and removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already used
      # the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          transformed_train_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply transform function to test data. In this case we also remove
      # the header line from the CSV file and the trailing period at the end of
      # each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          transformed_test_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
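# --- Hedged sketch (not part of the original source) ---
# An example invocation of the census transform_data above. The file names and
# the working directory are placeholders for illustration, not paths from the
# original code.
if __name__ == '__main__':
  working_dir = tempfile.mkdtemp()
  transform_data(
      train_data_file='adult.data',
      test_data_file='adult.test',
      transformed_train_filebase=os.path.join(working_dir, 'train_transformed'),
      transformed_test_filebase=os.path.join(working_dir, 'test_transformed'),
      transformed_metadata_dir=os.path.join(working_dir, 'transformed_metadata'))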
# Split into train and test.
train, test = (
    transformed_data
    | 'Partition train/test' >> beam.Partition(
        split_train_test, 2, split_dt=datetime(2020, 1, 1)))

# Encoder for TFRecords.
transformed_data_coder = tft.coders.ExampleProtoCoder(
    transformed_metadata.schema)

# Write the train dataset.
_ = (
    train
    | 'Encode & write train -> TFRecords' >> tfrecordio.WriteToTFRecord(
        file_path_prefix=os.path.join(args.data_dir, 'tfrecords',
                                      args.output_dir, TRAIN_FILES_PATTERN),
        coder=transformed_data_coder,
        file_name_suffix='.gz',
        num_shards=4,
        compression_type=beam.io.filesystem.CompressionTypes.GZIP))

# Write the validation dataset.
_ = (
    test
    | 'Encode & write test -> TFRecords' >> tfrecordio.WriteToTFRecord(
        file_path_prefix=os.path.join(args.data_dir, 'tfrecords',
                                      args.output_dir, EVAL_FILES_PATTERN),
        coder=transformed_data_coder,
        file_name_suffix='.gz',
        num_shards=1,
        compression_type=beam.io.filesystem.CompressionTypes.GZIP))
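# --- Hedged sketch (not part of the original source) ---
# beam.Partition above expects split_train_test(element, num_partitions, split_dt)
# to return 0 for training rows and 1 for test rows. A minimal version is shown
# below; the 'timestamp' field name is an assumption about the element layout,
# since the original helper is not included in this snippet.
def split_train_test(element, num_partitions, split_dt):
  """Routes rows before split_dt to partition 0 (train), the rest to 1 (test)."""
  return 0 if element['timestamp'] < split_dt else 1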
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # Set up the raw data metadata.
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
  })
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.int64, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'hourofday,passengers'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  # Run the Beam pipeline.
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # Save the raw data metadata.
      _ = (raw_data_metadata
           | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
               os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
               pipeline=p))

      # Analyze and transform the training data.
      raw_data = (p
                  | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(
                      query=create_query(1, EVERY_N), use_standard_sql=True))
                  | 'train_filter' >> beam.Filter(is_valid))
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Transform the eval data with the same transform_fn.
      raw_test_data = (p
                       | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(
                           query=create_query(2, EVERY_N), use_standard_sql=True))
                       | 'eval_filter' >> beam.Filter(is_valid))
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
               os.path.join(OUTPUT_DIR, 'metadata')))
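# --- Hedged sketch (not part of the original source) ---
# The taxi pipeline above filters BigQuery rows with is_valid and transforms
# them with preprocess_tft, neither of which is shown here. A plausible
# is_valid, assuming the column names from raw_data_schema, might look like
# the following; the exact bounds are illustrative.
def is_valid(inputs):
  """Keeps only rows with sane fares, passenger counts and coordinates."""
  try:
    return (inputs['fare_amount'] > 0 and
            inputs['passengers'] > 0 and
            -90 < inputs['pickuplat'] < 90 and
            -90 < inputs['dropofflat'] < 90 and
            -180 < inputs['pickuplon'] < 180 and
            -180 < inputs['dropofflon'] < 180)
  except (KeyError, TypeError):
    # Drop rows with missing or malformed fields.
    return False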