def build_pipeline(p, flags):
  """Sets up the Apache Beam pipeline for execution."""
  raw_data = (
      p
      | 'QueryTable' >> beam.io.Read(beam.io.BigQuerySource(
          query=query.get_query(flags.bq_table),
          project=flags.project_id,
          use_standard_sql=True))
      # Omit the 'Generate data' step if working with real data.
      | 'Generate data' >> beam.Map(_generate_fake_data)
      | 'Extract lifetime' >> beam.Map(append_lifetime_duration)
      | 'Extract label' >> beam.Map(append_label)
      | 'Generate label array' >> beam.Map(combine_censorship_duration))

  raw_train, raw_eval, raw_test = (
      raw_data
      | 'RandomlySplitData' >> randomly_split(
          train_size=.7, validation_size=.15, test_size=.15))

  raw_metadata = features.get_raw_dataset_metadata()
  preprocess_fn = features.preprocess_fn
  transform_fn = (
      (raw_train, raw_metadata)
      | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(preprocess_fn))
  (transform_fn
   | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(flags.output_dir))

  for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                ('Test', raw_test)]:
    transform_label = 'Transform{}'.format(dataset_type)
    t, metadata = (((dataset, raw_metadata), transform_fn)
                   | transform_label >> tft_beam.TransformDataset())
    if dataset_type == 'Train':
      (metadata
       | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(flags.output_dir, 'transformed_metadata'),
           pipeline=p))
    write_label = 'Write{}TFRecord'.format(dataset_type)
    t | write_label >> write_tfrecord(dataset_type, flags.output_dir, metadata)
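
# A minimal usage sketch for build_pipeline, assuming a flags object with
# project_id, bq_table, and output_dir attributes (attribute names inferred
# from the function body above; the real flag parser may differ). All values
# below are placeholders.
import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def _build_pipeline_example():
  flags = argparse.Namespace(
      project_id='my-gcp-project',        # hypothetical project id
      bq_table='my_dataset.my_table',     # hypothetical source table
      output_dir='/tmp/survival_output')  # hypothetical output directory
  with beam.Pipeline(options=PipelineOptions()) as p:
    build_pipeline(p, flags)  # the pipeline runs when the with-block exits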
def preprocess(p, args):
  """Run preprocessing as a pipeline."""
  train_eval_schema = _make_input_schema()
  train_eval_metadata = dataset_metadata.DatasetMetadata(
      schema=train_eval_schema)

  _ = (train_eval_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, constants.RAW_METADATA_DIR),
           pipeline=p))

  train_eval_data = (p
                     | 'ReadDataFromBQ' >> beam.io.Read(beam.io.BigQuerySource(
                         query=_get_query('bigquery-public-data', 'samples',
                                          'gsod'),
                         use_standard_sql=True)))

  train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
      DataValidator())

  (transformed_train_eval_data,
   transformed_train_eval_metadata), transform_fn = (
       (train_eval_data, train_eval_metadata)
       | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
           get_preprocessing_fn()))

  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

  transformed_train_eval_coder = coders.ExampleProtoCoder(
      transformed_train_eval_metadata.schema)

  transformed_train_data, transformed_eval_data = (
      transformed_train_eval_data
      | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

  (transformed_train_data
   | 'SerializeTrainExamples' >> beam.Map(transformed_train_eval_coder.encode)
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(args.output_dir,
                    constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix=constants.DATA_FILE_SUFFIX))

  (transformed_eval_data
   | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode)
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(args.output_dir,
                    constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix=constants.DATA_FILE_SUFFIX))
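
# `get_partition_fn` is defined elsewhere in this package. A minimal sketch,
# assuming it returns a beam.Partition-compatible callable that routes
# roughly `train_fraction` of elements to partition 0 (train) and the rest
# to partition 1 (eval), matching the two-way Partition above.
import random


def get_partition_fn(train_fraction):
  def partition_fn(element, num_partitions):
    # beam.Partition calls this as fn(element, num_partitions) and expects
    # an int in [0, num_partitions). The split here is random, not
    # content-based.
    del element, num_partitions
    return 0 if random.random() < train_fraction else 1
  return partition_fn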
def make_transform_graph(output_dir, schema, features):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """
  tft_input_schema = make_tft_input_schema(
      schema, os.path.join(output_dir, STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(
      schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # preprocessing_fn does not use any analyzer, so we can run a local beam job
  # to properly make and write the transform function.
  temp_dir = os.path.join(output_dir, 'tmp')
  with beam.Pipeline('DirectRunner', options=None) as p:
    with tft_impl.Context(temp_dir=temp_dir):

      # Not going to transform, so no data is needed.
      train_data = p | beam.Create([])

      transform_fn = (
          (train_data, tft_input_metadata)
          | 'BuildTransformFn'  # noqa
          >> tft_impl.AnalyzeDataset(preprocessing_fn))  # noqa

      # Writes the transformed_metadata and transform_fn folders.
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))  # noqa

  # Write the raw_metadata.
  metadata_io.write_metadata(
      metadata=tft_input_metadata,
      path=os.path.join(output_dir, RAW_METADATA_DIR))
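
# A hypothetical invocation sketch of make_transform_graph; the schema-list
# and features-dict shapes mirror the schema.json and features.json files
# written by the test fixture below, and the output path is a placeholder.
def _make_transform_graph_example():
  schema = [{'name': 'num_col', 'type': 'FLOAT'},
            {'name': 'img_col', 'type': 'STRING'}]
  features = {'num_col': {'transform': 'target'},
              'img_col': {'transform': 'img_url_to_vec'}}
  make_transform_graph('/tmp/transform_output', schema, features)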
def _create_test_data(self):
  """Makes local test data.

  The following files and folders will be created in self.output_folder:

  self.output_folder/
      features.json
      img.png
      input.csv
      schema.json
      raw_metadata/ (tft metadata files)
      transformed_metadata/ (tft metadata files)
      transform_fn/ (tft saved model file)
  """
  self.output_folder = tempfile.mkdtemp()

  # Make image file.
  self.img_filepath = os.path.join(self.output_folder, 'img.png')
  image = Image.new('RGBA', size=(50, 50), color=(155, 0, 0))
  image.save(self.img_filepath, 'png')

  # Make csv input file.
  self.csv_input_filepath = os.path.join(self.output_folder, 'input.csv')
  file_io.write_string_to_file(
      self.csv_input_filepath,
      '23.0,%s' % self.img_filepath)

  # Make schema file.
  self.schema_filepath = os.path.join(self.output_folder, 'schema.json')
  file_io.write_string_to_file(
      self.schema_filepath,
      json.dumps([{'name': 'num_col', 'type': 'FLOAT'},
                  {'name': 'img_col', 'type': 'STRING'}]))

  # Make features file.
  self.features_filepath = os.path.join(self.output_folder, 'features.json')
  file_io.write_string_to_file(
      self.features_filepath,
      json.dumps({'num_col': {'transform': 'target'},
                  'img_col': {'transform': 'img_url_to_vec'}}))

  # Run a local beam job to make the transform_fn.
  with beam.Pipeline('DirectRunner'):
    with tft_impl.Context(temp_dir=os.path.join(self.output_folder, 'tmp')):

      def preprocessing_fn(inputs):
        return {'img_col': tft.map(tf.decode_base64, inputs['img_col']),
                'num_col': tft.map(lambda x: tf.add(x, 1),
                                   inputs['num_col'])}

      input_data = [{'img_col': base64.urlsafe_b64encode('abcd'),
                     'num_col': 3}]
      input_metadata = dataset_metadata.DatasetMetadata(
          schema=dataset_schema.from_feature_spec(
              {'img_col': tf.FixedLenFeature(shape=[], dtype=tf.string),
               'num_col': tf.FixedLenFeature(shape=[], dtype=tf.float32)}))

      (dataset, train_metadata), transform_fn = (
          (input_data, input_metadata)
          | 'AnalyzeAndTransform'  # noqa: W503
          >> tft_impl.AnalyzeAndTransformDataset(preprocessing_fn))  # noqa: W503

      # WriteTransformFn writes transform_fn and metadata.
      _ = (transform_fn  # noqa: F841
           | 'WriteTransformFn'  # noqa: W503
           >> tft_beam_io.WriteTransformFn(self.output_folder))  # noqa: W503

  metadata_io.write_metadata(
      metadata=input_metadata,
      path=os.path.join(self.output_folder, 'raw_metadata'))
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
    delimiter: the column delimiter for the CSV format.
  """
  # 1) The schema can be either defined in-memory or read from a configuration
  # file; in this case we are creating the schema in-memory.
  input_schema = criteo.make_input_schema()

  # 2) Configure the coder to map the source file column names to a dictionary
  # of key -> tensor_proto with the appropriate type derived from the
  # input_schema.
  coder = criteo.make_csv_coder(input_schema, delimiter)

  # 3) Read from text using the coder.
  train_data = (pipeline
                | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (pipeline
                   | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                   | 'ParseEvalCsv' >> beam.Map(coder.decode))

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  # TODO(b/34231369) Remember to eventually also save the statistics.

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset
       | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
       | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
       | 'WriteTraining' >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  _ = (evaluate_dataset
       | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
       | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
       | 'WriteEval' >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = criteo.make_input_schema(mode=predict_mode)
    csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
        | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))

    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
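
# `_Shuffle` is defined elsewhere in this package; the
# no-value-for-parameter pylint suppressions above hint that it is a
# @beam.ptransform_fn. A minimal sketch under that assumption: shuffle by
# pairing each element with a random key, grouping, and flattening.
import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by grouping on random keys."""
  return (pcoll
          | 'PairWithRandomKey' >> beam.Map(lambda v: (random.random(), v))
          | 'GroupByRandomKey' >> beam.GroupByKey()
          | 'DropRandomKey' >> beam.FlatMap(lambda kv: kv[1]))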
_ = (eval_dataset_transformed
     | 'EncodeEval' >> beam.Map(eval_coder.encode)
     | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
     | 'WriteEval' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir, 'features_eval'),
         file_name_suffix='.tfrecord.gz'))

_ = (eval_data
     | 'EncodePrediction' >> beam.Map(prediction_coder.encode)
     | 'EncodeEvalAsB64Json' >> beam.Map(_encode_as_b64_json)
     | 'WritePredictDataAsText' >> beam.io.WriteToText(
         os.path.join(args.output_dir, 'features_eval'),
         file_name_suffix='.txt'))

_ = (transform_fn
     | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))


def _encode_as_b64_json(serialized_example):
  import base64  # pylint: disable=g-import-not-at-top
  import json  # pylint: disable=g-import-not-at-top
  return json.dumps({'b64': base64.b64encode(serialized_example)})


def get_pipeline_name(runner, cloud):
  # Allow users to use a custom runner.
  if runner:
    return runner
  if cloud:
    return 'DataflowRunner'
  else:
    # Default to local execution (the original was truncated here; this
    # mirrors the DirectRunner fallback used in run() below).
    return 'DirectRunner'
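
# A small usage sketch for _encode_as_b64_json: each serialized tf.Example
# becomes one JSON line in the {"b64": ...} shape used for base64 JSON
# prediction inputs. The byte string below is a hypothetical example.
def _encode_as_b64_json_example():
  serialized_example = b'\x08\x96\x01'  # hypothetical serialized proto bytes
  line = _encode_as_b64_json(serialized_example)
  # Under Python 2, line == '{"b64": "CJYB"}'
  return line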
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  # 1) The schema can be either defined in-memory or read from a configuration
  # file; in this case we are creating the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.
  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))

    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
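
# `_ReadData` is defined elsewhere in this package. A minimal sketch of the
# assumed behavior, dispatching on the source string: file patterns are read
# as text, anything else is treated as a BigQuery table name. The `mode`
# parameter is accepted only for parity with the predict-data call site.
import apache_beam as beam


@beam.ptransform_fn
def _ReadData(pcoll, source, mode=None):  # pylint: disable=invalid-name
  """Reads rows from a CSV file pattern or a BigQuery table."""
  del mode  # unused in this sketch
  if source.startswith('gs://') or source.endswith('.csv'):
    return pcoll | 'ReadText' >> beam.io.ReadFromText(source)
  return pcoll | 'ReadBQ' >> beam.io.Read(beam.io.BigQuerySource(table=source))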
def preprocess_data(train_neg_file_pattern, train_pos_file_pattern,
                    test_neg_file_pattern, test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern, transformed_metadata_dir,
                    raw_metadata_dir, transform_func_dir, temp_dir,
                    vocab_size, delimiters):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes, and maps tokens to int64 value indices.

  Args:
    train_neg_file_pattern: Filepattern for training data negative examples
    train_pos_file_pattern: Filepattern for training data positive examples
    test_neg_file_pattern: Filepattern for test data negative examples
    test_pos_file_pattern: Filepattern for test data positive examples
    transformed_train_file_pattern: Base filename for transformed training
      data shards
    transformed_test_file_pattern: Base filename for transformed test data
      shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
    raw_metadata_dir: Directory where metadata for raw data should be written
    transform_func_dir: Directory where the transform function should be
      written
    temp_dir: Directory for temporary Dataflow files
    vocab_size: Size of the token vocabulary
    delimiters: Delimiters to use for tokenization

  const.RAW_METADATA is expected to be equivalent to:

    raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
        REVIEW_COLUMN: dataset_schema.ColumnSchema(
            tf.string, [], dataset_schema.FixedColumnRepresentation()),
        LABEL_COLUMN: dataset_schema.ColumnSchema(
            tf.int64, [], dataset_schema.FixedColumnRepresentation()),
    }))
  """
  pipeline_name = 'DataflowRunner'
  options = {
      'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(
          datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
      'temp_location': temp_dir,
      'project': 'stone-outpost-636',
      'max_num_workers': 8
  }
  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  # with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
  #   with beam_impl.Context(temp_dir=temp_dir):
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_file_pattern, train_pos_file_pattern))
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_file_pattern, test_pos_file_pattern))

      preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, const.RAW_METADATA)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               transform_func_dir))

      transformed_test_data, _ = (
          ((test_data, const.RAW_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (transformed_train_data
           | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
               transformed_train_file_pattern,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_test_data
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               transformed_test_file_pattern,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_metadata
           | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))

      _ = (const.RAW_METADATA
           | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(
               raw_metadata_dir, pipeline=pipeline))
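
# const.RAW_METADATA lives in a separate module; per the docstring above it
# is expected to be equivalent to the following. The column-name values are
# hypothetical; only the schema structure comes from the docstring.
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

REVIEW_COLUMN = 'review'  # hypothetical column name
LABEL_COLUMN = 'label'    # hypothetical column name

RAW_METADATA = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
    REVIEW_COLUMN: dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation()),
    LABEL_COLUMN: dataset_schema.ColumnSchema(
        tf.int64, [], dataset_schema.FixedColumnRepresentation()),
}))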
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir,
                   transform_graph_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 value indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
      shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
      should be written
    transform_graph_dir: Directory where the beam tf graph should be written
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int, which computes the set of unique values and uses
    # this to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)

    outputs[LABEL_COLUMN] = tft.apply_function(convert_label,
                                               inputs[LABEL_COLUMN])
    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify
      # the order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

      # Read in raw data and convert using the CSV converter. Note that we
      # apply some Beam transformations here, which will not be encoded in
      # the TF graph since we don't do them from within tf.Transform's
      # methods (AnalyzeDataset, TransformDataset etc.). These
      # transformations are just to get data into a format that the CSV
      # converter can read, in particular removing empty lines and removing
      # spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FilterTrainData' >> beam.Filter(lambda line: line)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> beam.Map(converter.decode))

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to
      # interpret raw_data.
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      # Write the beam transform to disk, if asked for.
      if transform_graph_dir is not None:
        _ = (transform_fn
             | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                 transform_graph_dir))

      transformed_data, transformed_metadata = transformed_dataset

      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          transformed_train_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Now apply the transform function to the test data. In this case we
      # also remove the header line from the CSV file and the trailing period
      # at the end of each line.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file)
          | 'FilterTestData' >> beam.Filter(
              lambda line: line and line != '|1x3 Cross validator')
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> beam.Map(converter.decode))

      raw_test_dataset = (raw_test_data, raw_data_metadata)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())

      # Don't need the transformed data schema; it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          transformed_test_filebase,
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
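
# A hypothetical invocation sketch of transform_data, using the UCI census
# ("adult") CSV files that the column list and the '|1x3 Cross validator'
# test-file header above correspond to; all paths are placeholders.
def _transform_data_example():
  transform_data(
      train_data_file='/tmp/census/adult.data',
      test_data_file='/tmp/census/adult.test',
      transformed_train_filebase='/tmp/census/train_transformed',
      transformed_test_filebase='/tmp/census/test_transformed',
      transformed_metadata_dir='/tmp/census/transformed_metadata',
      transform_graph_dir='/tmp/census/transform_graph')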
def run(flags, pipeline_args):
  """Run Apache Beam pipeline to generate TFRecords for Survival Analysis."""
  options = PipelineOptions(flags=[], **pipeline_args)
  options.view_as(WorkerOptions).machine_type = flags.machine_type
  temp_dir = os.path.join(flags.output_dir, 'tmp')
  runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

  files = tf.gfile.Glob(flags.input_dir + "*")
  if not flags.cloud:
    files = files[0:20]  # if running locally for testing, process fewer files
  logging.warning("Number of files: " + str(len(files)))
  labels = get_labels_array(
      "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv")

  with beam.Pipeline(runner, options=options) as p:
    with tft_beam.Context(temp_dir=temp_dir):
      input_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

      filenames = (p | 'Create filenames' >> beam.Create(files))
      nii = (filenames | 'Read NII' >> beam.Map(read_nii))
      nii_with_labels = (
          nii | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

      raw_train, raw_eval, raw_test = (
          nii_with_labels
          | 'RandomlySplitData' >> randomly_split(
              train_size=.7, validation_size=.15, test_size=.15))

      raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(lambda x: x[1])
      raw_eval = raw_eval | 'FlattenEval' >> beam.FlatMap(lambda x: x[1])
      raw_test = raw_test | 'FlattenTest' >> beam.FlatMap(lambda x: x[1])

      raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

      # Only the transform_fn is needed here; each split (including train) is
      # transformed separately in the loop below, so a combined
      # AnalyzeAndTransformDataset pass would do redundant work.
      transform_fn = (
          (raw_train, input_metadata)
          | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))

      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               flags.output_dir))

      for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                    ('Predict', raw_test)]:
        transform_label = 'Transform{}'.format(dataset_type)
        t, metadata = (((dataset, input_metadata), transform_fn)
                       | transform_label >> tft_beam.TransformDataset())

        if dataset_type == 'Train':
          _ = (metadata
               | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                   os.path.join(flags.output_dir, 'transformed_metadata'),
                   pipeline=p))

        write_label = 'Write{}TFRecord'.format(dataset_type)
        _ = t | write_label >> WriteTFRecord(dataset_type, flags.output_dir,
                                             metadata)
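
# `randomly_split` is imported from elsewhere in this package (it is also
# used by build_pipeline above). A minimal sketch of the assumed behavior:
# partition a PCollection into train/validation/test with one uniform random
# draw per element. Note that the FlatMap(lambda x: x[1]) steps above suggest
# the real implementation emits (key, examples) groups; this sketch splits
# element-wise and is only illustrative.
import random

import apache_beam as beam


@beam.ptransform_fn
def randomly_split(pcoll, train_size, validation_size, test_size):
  """Splits a PCollection into three disjoint PCollections."""
  assert abs(train_size + validation_size + test_size - 1.0) < 1e-6

  def assign_partition(example, num_partitions):
    del example, num_partitions  # split is random; always 3 partitions here
    r = random.random()
    if r < train_size:
      return 0
    elif r < train_size + validation_size:
      return 1
    return 2

  return pcoll | 'PartitionData' >> beam.Partition(assign_partition, 3)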