def preprocess(pipeline, args):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline.
    args: parsed command line arguments.
  """
  from preproc import movielens  # pylint: disable=g-import-not-at-top

  # 1) Read the data into pcollections.
  movies_coder = tft_coders.CsvCoder(movielens.MOVIE_COLUMNS,
                                     movielens.make_movies_schema(),
                                     secondary_delimiter='|',
                                     multivalent_columns=['genres'])

  movies_data = (pipeline
                 | 'ReadMoviesData' >> beam.io.ReadFromText(
                     os.path.join(args.input_dir, 'movies.csv'),
                     # TODO(b/35653662): Obviate the need for setting this.
                     coder=beam.coders.BytesCoder(),
                     skip_header_lines=args.skip_header_lines)
                 | 'DecodeMovies' >> beam.Map(movies_coder.decode)
                 | 'KeyByMovie' >> beam.Map(lambda x: (x['movie_id'], x)))

  ratings_coder = tft_coders.CsvCoder(movielens.RATING_COLUMNS,
                                      movielens.make_ratings_schema())

  ratings_data = (pipeline
                  | 'ReadRatingsData' >> beam.io.ReadFromText(
                      os.path.join(args.input_dir, 'ratings*'),
                      skip_header_lines=args.skip_header_lines)
                  | 'DecodeRatings' >> beam.Map(ratings_coder.decode)
                  | 'KeyByUser' >> beam.Map(lambda x: (x['user_id'], x))
                  | 'GroupByUser' >> beam.GroupByKey())

  def train_eval_partition_fn(keyed_record, unused_num_partitions):
    # The original used Python 2 tuple unpacking in the signature,
    # `def train_eval_partition_fn((user_id, _), ...)`, which is a syntax
    # error in Python 3; unpack the (user_id, records) pair explicitly.
    user_id, _ = keyed_record
    return movielens.partition_fn(
        user_id, args.partition_random_seed, args.percent_eval)
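# A minimal sketch of what the multivalent movies coder above does with one
# CSV line: the 'genres' column is split on the secondary delimiter '|' into
# a list, while scalar columns decode normally. The three-column layout and
# string-only feature spec here are illustrative stand-ins, not the actual
# movielens.MOVIE_COLUMNS / make_movies_schema() definitions.
import tensorflow as tf
from tensorflow_transform import coders as tft_coders
from tensorflow_transform.tf_metadata import dataset_schema

_demo_movies_schema = dataset_schema.from_feature_spec({
    'movie_id': tf.FixedLenFeature([], tf.string),
    'title': tf.FixedLenFeature([], tf.string),
    'genres': tf.VarLenFeature(tf.string),
})
_demo_movies_coder = tft_coders.CsvCoder(['movie_id', 'title', 'genres'],
                                         _demo_movies_schema,
                                         secondary_delimiter='|',
                                         multivalent_columns=['genres'])
# decode() yields a dict keyed by column name; 'genres' comes back as a list:
# {'movie_id': '1', 'title': 'Toy Story (1995)',
#  'genres': ['Adventure', 'Animation']}
_demo_movies_coder.decode('1,Toy Story (1995),Adventure|Animation')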
def make_csv_coder(schema_file, mode):
  """Creates an instance of CsvCoder.

  Args:
    schema_file: Serialized Schema proto file.
    mode: One of tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}.

  Returns:
    Instance of CsvCoder.
  """
  schema = make_dataset_schema(schema_file, mode)
  if mode == tf.estimator.ModeKeys.PREDICT:
    features = list(features_config.ALL_FEATURES)
    features.remove(features_config.TARGET_FEATURE)
    return tft_coders.CsvCoder(features, schema)
  return tft_coders.CsvCoder(features_config.ALL_FEATURES, schema)
def make_csv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(_CSV_COLUMNS_NAMES, parsing_schema, delimiter='|')
def make_csv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Produces a CsvCoder from a data schema.

  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.

  Returns:
    A tf.Transform CsvCoder.
  """
  # `column_names` was referenced before assignment in the original snippet;
  # start from an empty list so the append below is well defined. (A
  # mode-dependent label column was presumably dropped during extraction.)
  column_names = []
  column_names += ['score', 'subreddit', 'example_id']
  return coders.CsvCoder(column_names, schema)
def make_csv_coder():
  """Return a coder for tf.transform to read csv files."""
  column_names = [
      'pickup_community_area', 'fare', 'trip_start_month', 'trip_start_hour',
      'trip_start_day', 'trip_start_timestamp', 'pickup_latitude',
      'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
      'trip_miles', 'pickup_census_tract', 'dropoff_census_tract',
      'payment_type', 'company', 'dropoff_community_area', 'tips',
      'trip_seconds'
  ]
  parsing_feature_spec = get_raw_feature_spec()
  parsing_schema = dataset_schema.from_feature_spec(parsing_feature_spec)
  return tft_coders.CsvCoder(column_names, parsing_schema)
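# A minimal sketch of how a coder like the one above is typically wired into
# a Beam read stage, mirroring the ParseCSV step in run_analysis below.
# Assumptions: a Beam `pipeline` handle exists and 'taxi_data.csv' is a
# placeholder path.
taxi_csv_coder = make_csv_coder()
raw_taxi_data = (
    pipeline
    | 'ReadTaxiCSV' >> beam.io.ReadFromText('taxi_data.csv',
                                            skip_header_lines=1)
    | 'DecodeTaxiCSV' >> beam.Map(taxi_csv_coder.decode))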
def build_graph(self):
  # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for
  # graph building.
  # num_lines = 0
  # for i in range(DATASET_NUM_SHARDS):
  #   _fname = '{}-{:05}-of-{:05}'.format(
  #       self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
  #   num_lines += sum(1 for _ in open(_fname))
  #   _fname_marked = '{}-{:05}-of-{:05}.{}'.format(
  #       self.train_fname_out, i, self.config.DATASET_NUM_SHARDS, PPGRAPH_EXT)
  #   shutil.move(_fname, _fname_marked)
  #   if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
  #     break

  # Set up the preprocessing pipeline for analyzing the dataset. The analyze
  # call is not combined with the transform call because we will parallelize
  # the transform call later. We had the issue that this process runs on a
  # single core and tends to cause OOM issues.
  pipeline = beam.Pipeline(runner=DirectRunner())
  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # TODO: maybe only use the train data (or a percentage of it) to build
    # the graph.
    raw_train_data = (
        pipeline
        | 'ReadTrainDataFile' >> textio.ReadFromText(
            'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
        | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                self.data_formatter.get_ordered_columns(),
                self.data_formatter.get_raw_data_metadata().schema).decode))

    # Combine data and schema into a dataset tuple. Note that we already used
    # the schema to read the CSV data, but we also need it to interpret
    # raw_data. This is where analyzers such as vocabulary, scale_to_0_1, or
    # sparse_to_dense are applied.
    transform_fn = (
        (raw_train_data, self.data_formatter.get_raw_data_metadata())
        | beam_impl.AnalyzeDataset(
            PreprocessingFunction().transform_to_tfrecord))

    # Write the SavedModel and metadata to two subdirectories of working_dir,
    # given by `transform_fn_io.TRANSFORM_FN_DIR` and
    # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
    _ = (transform_fn
         | 'WriteTransformGraph' >>
         transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

  # Run the Beam preprocessing pipeline.
  st = time.time()
  result = pipeline.run()
  result.wait_until_finish()
  self.logger.info(
      'Transformation graph built and written in {:.2f} sec'.format(
          time.time() - st))
def run_analysis(output_dir, model_dir, eval_path, schema, project, mode,
                 slice_columns):
  if mode == 'local':
    pipeline_options = None
    runner = 'DirectRunner'
  elif mode == 'cloud':
    tmp_location = os.path.join(output_dir, 'tmp')
    options = {
        'job_name': ('pipeline-tfma-' +
                     datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
        'setup_file': './analysis/setup.py',
        'project': project,
        'temp_location': tmp_location,
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    # Note: Beam expects the exact runner name 'DataflowRunner'; the original
    # 'DataFlowRunner' spelling would not resolve.
    runner = 'DataflowRunner'
  else:
    raise ValueError('Invalid mode %s.' % mode)

  column_names = [x['name'] for x in schema]
  for slice_column in slice_columns:
    if slice_column not in column_names:
      raise ValueError('Unknown slice column: %s' % slice_column)

  slice_spec = [
      # An empty spec is required for the 'Overall' slice.
      slicer.SingleSliceSpec(),
      slicer.SingleSliceSpec(columns=slice_columns),
  ]

  with beam.Pipeline(runner=runner, options=pipeline_options) as pipeline:
    raw_feature_spec = get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    example_coder = tft_coders.example_proto_coder.ExampleProtoCoder(
        raw_schema)
    csv_coder = tft_coders.CsvCoder(column_names, raw_schema)

    raw_data = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(eval_path)
        | 'ParseCSV' >> beam.Map(csv_coder.decode)
        | 'CleanData' >> beam.Map(clean_raw_data_dict(raw_feature_spec))
        | 'ToSerializedTFExample' >> beam.Map(example_coder.encode)
        | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
            eval_saved_model_path=model_dir,
            slice_spec=slice_spec,
            output_path=output_dir))
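# Note on the slice_spec above: the empty SingleSliceSpec() aggregates metrics
# over all examples (the 'Overall' slice), while SingleSliceSpec(columns=[...])
# reports metrics separately for each distinct value of the listed columns.
# For example (assuming a 'payment_type' column exists in the schema):
#   slicer.SingleSliceSpec(columns=['payment_type'])
# would yield one metrics row per payment type in the eval data.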
def _make_transform_fn(self, p, output_path):

  def preprocessing_fn(inputs):
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  schema = dataset_schema.from_feature_spec(
      {'x': tf.FixedLenFeature((), tf.float32, 0)})
  metadata = dataset_metadata.DatasetMetadata(schema=schema)
  columns = p | 'CreateTrainingData' >> beam.Create(
      [{'x': v} for v in [4, 1, 5, 2]])
  _, result = (
      (columns, metadata)
      | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
          preprocessing_fn, output_path))
  coder = coders.CsvCoder(['x'], schema, delimiter='\t')
  return result, coder
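# Worked example for preprocessing_fn above: scale_to_0_1 maps
# x -> (x - min) / (max - min). For the created values [4, 1, 5, 2], the
# analyze phase finds min = 1 and max = 5, so the transformed 'x_scaled'
# values are [0.75, 0.0, 1.0, 0.25].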
def make_csv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Produces a CsvCoder from a data schema.

  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.

  Returns:
    A tf.Transform CsvCoder.
  """
  column_names = [] if mode == tf.contrib.learn.ModeKeys.INFER else ['score']
  column_names += [
      'created_utc', 'subreddit', 'author', 'comment_body',
      'comment_parent_body', 'toplevel', 'example_id'
  ]
  return coders.CsvCoder(column_names, schema)
def make_csv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Produces a CsvCoder (with comma as the delimiter) from a data schema.

  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.

  Returns:
    A tf.Transform CsvCoder.
  """
  # Copy the module-level list so that remove() below does not mutate
  # CSV_ORDERED_COLUMNS for subsequent callers.
  column_names = list(CSV_ORDERED_COLUMNS)
  if mode == tf.contrib.learn.ModeKeys.INFER:
    column_names.remove(LABEL_COLUMN)
  return coders.CsvCoder(column_names, schema, delimiter=',')
def make_tsv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Produces a CsvCoder (with tab as the delimiter) from a data schema.

  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.

  Returns:
    A tf.Transform CsvCoder.
  """
  column_names = [] if mode == tf.contrib.learn.ModeKeys.INFER else ['clicked']
  for name in INTEGER_COLUMN_NAMES:
    column_names.append(name)
  for name in CATEGORICAL_COLUMN_NAMES:
    column_names.append(name)
  return coders.CsvCoder(column_names, schema, delimiter='\t')
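# Minimal decode sketch for a tab-delimited coder like make_tsv_coder above,
# using a stand-in schema of one integer and one categorical column (the real
# INTEGER_COLUMN_NAMES / CATEGORICAL_COLUMN_NAMES lists are defined elsewhere
# and are much longer).
import tensorflow as tf
from tensorflow_transform import coders
from tensorflow_transform.tf_metadata import dataset_schema

_tsv_schema = dataset_schema.from_feature_spec({
    'clicked': tf.FixedLenFeature([], tf.int64),
    'int_feature_1': tf.FixedLenFeature([], tf.int64),
    'cat_feature_1': tf.FixedLenFeature([], tf.string),
})
_tsv_coder = coders.CsvCoder(['clicked', 'int_feature_1', 'cat_feature_1'],
                             _tsv_schema, delimiter='\t')
# -> {'clicked': 1, 'int_feature_1': 42, 'cat_feature_1': 'sports'}
_tsv_coder.decode('1\t42\tsports')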
def preprocess(pipeline, args):
  input_metadata = metadata_io.read_metadata(
      os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

  schema = json.loads(file_io.read_file_to_string(
      os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
  features = json.loads(file_io.read_file_to_string(
      os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

  column_names = [col['name'] for col in schema]

  exclude_outputs = None
  if not args.target:
    for name, transform in six.iteritems(features):
      if transform['transform'] == TARGET_TRANSFORM:
        target_name = name
        column_names.remove(target_name)
        exclude_outputs = [target_name]
        del input_metadata.schema.column_schemas[target_name]
        break

  if args.csv_file_pattern:
    coder = coders.CsvCoder(column_names, input_metadata.schema,
                            delimiter=',')
    raw_data = (
        pipeline
        | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
        | 'ParseCsvData' >> beam.Map(coder.decode))
  else:
    columns = ', '.join(column_names)
    query = 'SELECT {columns} FROM `{table}`'.format(
        columns=columns, table=args.bigquery_table)
    raw_data = (
        pipeline
        | 'ReadBigQueryData' >> beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True)))

  # Note that prepare_image_transforms does not make embeddings; it just
  # reads the image files and converts them to base64 strings.
  # tft.TransformDataset() will apply the saved model that makes the image
  # embeddings.
  image_columns = image_transform_columns(features)
  raw_data = (
      raw_data
      | 'PreprocessTransferredLearningTransformations' >> beam.Map(
          prepare_image_transforms, image_columns))

  if args.shuffle:
    raw_data = raw_data | 'ShuffleData' >> shuffle()

  transform_fn = (
      pipeline
      | 'ReadTransformFn' >> tft_beam_io.ReadTransformFn(
          args.analyze_output_dir))

  (transformed_data, transform_metadata) = (
      ((raw_data, input_metadata), transform_fn)
      | 'ApplyTensorflowPreprocessingGraph' >> tft.TransformDataset(
          exclude_outputs))

  tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
  _ = (transformed_data
       | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
       | 'WriteExamples' >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, args.output_filename_prefix),
           file_name_suffix='.tfrecord.gz'))
import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
from apache_beam.io import tfrecordio
from tensorflow_transform import coders
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from trainer.config import PROJECT_ID, BUCKET, TRAIN_INPUT_DATA, \
    TRAIN_OUTPUT_DATA, TFRECORD_DIR, MODEL_DIR, INPUT_SCHEMA, \
    OUTPUT_SCHEMA, EXAMPLE_SCHEMA

delimiter = ';'

converter_input = coders.CsvCoder([
    'BatchId', 'ButterMass', 'ButterTemperature', 'SugarMass',
    'SugarHumidity', 'FlourMass', 'FlourHumidity', 'HeatingTime',
    'MixingSpeed', 'MixingTime'
], INPUT_SCHEMA, delimiter=delimiter)

converter_output = coders.CsvCoder([
    'BatchId', 'TotalVolume', 'Density', 'Temperature', 'Humidity', 'Energy',
    'Problems'
], OUTPUT_SCHEMA, delimiter=delimiter)

input_metadata = dataset_metadata.DatasetMetadata(schema=EXAMPLE_SCHEMA)


def extract_batchkey(record):
  """Extracts the BatchId out of the record.

  Args:
    record (dict): record of decoded CSV line
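# Hedged usage note for converter_input above: decode() turns one
# ';'-delimited line into a dict keyed by the ten input column names, typed
# according to INPUT_SCHEMA (the sample values below are invented for
# illustration, and the exact Python types depend on the schema's feature
# spec):
#   converter_input.decode('42;120.5;21.0;80.0;0.1;200.0;0.05;30;450;12')
#   # -> {'BatchId': 42, 'ButterMass': 120.5, ..., 'MixingTime': 12}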
def _make_csv_coder(schema, column_names):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = _get_raw_feature_spec(schema)
  parsing_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(column_names, parsing_schema)
def make_csv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = get_raw_feature_spec(schema)
  parsing_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(CSV_COLUMN_NAMES, parsing_schema)
def write_to_tfrecord(args):
  """This function is supposed to be called as a script."""
  # Decode arguments.
  current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
      exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, \
      working_dir, data_formatter_module_path = args

  # num_shards = "32"
  current_index, num_shards = int(current_index), int(num_shards)

  split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
      train_split_fname_out, current_index, num_shards) + '*'
  split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
      eval_split_fname_out, current_index, num_shards)

  log.info('exp_log_data_file_train_tfrecord {}'.format(
      exp_log_data_file_train_tfrecord))
  log.info('exp_log_data_file_eval_tfrecord {}'.format(
      exp_log_data_file_eval_tfrecord))
  log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
  log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

  data_formatter = import_from_uri(data_formatter_module_path).DataFormatter()

  # Set up the preprocessing pipeline.
  pipeline = beam.Pipeline(runner=DirectRunner())
  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data files: CSV format ordered according to the
    # `data_formatter`, which is then converted into a cleaned-up format.
    raw_train_data = (
        pipeline
        | 'ReadTrainDataFile' >> textio.ReadFromText(
            split_train_file_pattern, skip_header_lines=0)
        | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                data_formatter.get_features_and_targets(),
                data_formatter.get_features_metadata().schema).decode))

    raw_eval_data = (
        pipeline
        | 'ReadEvalDataFile' >> textio.ReadFromText(
            split_eval_file_pattern, skip_header_lines=0)
        | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
            tft_coders.CsvCoder(
                data_formatter.get_features_and_targets(),
                data_formatter.get_features_metadata().schema).decode))

    # Examples in tf-example format (for model analysis purposes).
    # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
    # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
    #
    # _ = (
    #     raw_eval_data
    #     | 'ToSerializedTFExample' >> beam.Map(coder.encode)
    #     | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
    #         '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
    #         shard_name_template='', num_shards=1)
    # )

    # Read the SavedModel and metadata from the two subdirectories of
    # working_dir given by `transform_fn_io.TRANSFORM_FN_DIR` and
    # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
    transform_fn = (
        pipeline
        | 'ReadTransformGraph' >> transform_fn_io.ReadTransformFn(
            working_dir))

    # Applies the transformation `transform_fn` to the raw train dataset.
    (transformed_train_data, transformed_metadata) = (
        ((raw_train_data, data_formatter.get_features_metadata()),
         transform_fn)
        | 'TransformTrainData' >> beam_impl.TransformDataset())

    # Applies the transformation `transform_fn` to the raw eval dataset.
    (transformed_eval_data, transformed_metadata) = (
        ((raw_eval_data, data_formatter.get_features_metadata()),
         transform_fn)
        | 'TransformEvalData' >> beam_impl.TransformDataset())

    # The data schema of the transformed data gets used to build a signature
    # to create a TFRecord (tf binary data format). This signature is a
    # wrapper function used to encode transformed data.
    transformed_data_coder = tft.coders.ExampleProtoCoder(
        transformed_metadata.schema)

    _ = (transformed_train_data
         | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
             transformed_data_coder.encode)
         | 'WriteTrainDataTFRecord' >> tfrecordio.WriteToTFRecord(
             '{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
             shard_name_template='', num_shards=1))

    _ = (transformed_eval_data
         | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
             transformed_data_coder.encode)
         | 'WriteEvalDataTFRecord' >> tfrecordio.WriteToTFRecord(
             '{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
             shard_name_template='', num_shards=1))

  result = pipeline.run()
  result.wait_until_finish()
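# Hedged sketch of what transformed_data_coder.encode produces above: one
# serialized tf.train.Example proto (bytes) per record, which is what
# WriteToTFRecord expects. The schema and record below are stand-ins, not the
# pipeline's real transformed_metadata.schema.
import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_schema

_demo_schema = dataset_schema.from_feature_spec(
    {'x_scaled': tf.FixedLenFeature([], tf.float32)})
_demo_coder = tft.coders.ExampleProtoCoder(_demo_schema)
serialized = _demo_coder.encode({'x_scaled': 0.75})  # bytes of a tf.Example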