def main(_):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--num_examples',
      help=('Number of examples to send to the server.'),
      default=1,
      type=int)
  parser.add_argument(
      '--server',
      help=('Prediction service host:port or mlengine:model'),
      required=True)
  parser.add_argument(
      '--examples_file',
      help=('Path to csv file containing examples.'),
      required=True)
  parser.add_argument(
      '--model_name', help=('Model name.'), required=True)
  parser.add_argument(
      '--schema_file',
      help='File holding the schema for the input data',
      default='schema.pbtxt')
  known_args, _ = parser.parse_known_args()

  _do_inference(known_args.server, known_args.examples_file,
                known_args.num_examples,
                taxi.read_schema(known_args.schema_file),
                known_args.model_name)
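# Hedged usage sketch (not part of the original client): the flags parsed in
# main() map directly onto the _do_inference call above, so a local run against
# a serving endpoint looks roughly like the command below. The script name,
# host:port and file paths are placeholders.
#
#   python chicago_taxi_client.py \
#       --num_examples=3 \
#       --server=localhost:9000 \
#       --examples_file=data/eval/data.csv \
#       --model_name=chicago_taxi \
#       --schema_file=schema.pbtxt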
def train_and_maybe_evaluate(hparams):
  """Run the training and evaluate using the high level API.

  Args:
    hparams: Holds hyperparameters used to train the model as name/value
      pairs.

  Returns:
    The estimator that was used for training (and maybe eval)
  """
  schema = taxi.read_schema(hparams.schema_file)

  train_input = lambda: model.input_fn(
      hparams.train_files,
      hparams.tf_transform_dir,
      batch_size=TRAIN_BATCH_SIZE)

  eval_input = lambda: model.input_fn(
      hparams.eval_files,
      hparams.tf_transform_dir,
      batch_size=EVAL_BATCH_SIZE)

  train_spec = tf.estimator.TrainSpec(
      train_input, max_steps=hparams.train_steps)

  serving_receiver_fn = lambda: model.example_serving_receiver_fn(
      hparams.tf_transform_dir, schema)

  exporter = tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input,
      steps=hparams.eval_steps,
      exporters=[exporter],
      name='chicago-taxi-eval')

  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999, keep_checkpoint_max=1)

  serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
  run_config = run_config.replace(model_dir=serving_model_dir)

  estimator = model.build_estimator(
      hparams.tf_transform_dir,
      # Construct layer sizes with exponential decay.
      hidden_units=[
          max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**i))
          for i in range(NUM_DNN_LAYERS)
      ],
      config=run_config)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  return estimator
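def _example_train_call():
  """Hedged usage sketch; not part of the original trainer task.

  train_and_maybe_evaluate only reads attributes off `hparams`, so a
  SimpleNamespace stands in here for whatever hyperparameter container the
  real task module builds (e.g. from parsed flags). All paths and step counts
  below are placeholders.
  """
  from types import SimpleNamespace
  hparams = SimpleNamespace(
      train_files='output/train_transformed-*',
      eval_files='output/eval_transformed-*',
      tf_transform_dir='output/tft_working_dir',
      output_dir='output',
      schema_file='schema.pbtxt',
      train_steps=10000,
      eval_steps=5000)
  return train_and_maybe_evaluate(hparams)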
def run_experiment(hparams):
  """Train the model then export it for tf.model_analysis evaluation.

  Args:
    hparams: Holds hyperparameters used to train the model as name/value
      pairs.
  """
  estimator = train_and_maybe_evaluate(hparams)

  schema = taxi.read_schema(hparams.schema_file)

  # Save a model for tfma eval.
  eval_model_dir = os.path.join(hparams.output_dir, EVAL_MODEL_DIR)
  receiver_fn = lambda: model.eval_input_receiver_fn(  # pylint: disable=g-long-lambda
      hparams.tf_transform_dir, schema)

  tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=receiver_fn)
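def _latest_eval_model_dir(output_dir):
  """Hedged helper sketch; not part of the original module.

  tfma.export.export_eval_savedmodel writes a timestamped subdirectory under
  export_dir_base, so downstream evaluation (see process_tfma) typically wants
  the newest one. Assumes the layout produced by run_experiment above with the
  same EVAL_MODEL_DIR constant.
  """
  eval_model_base = os.path.join(output_dir, EVAL_MODEL_DIR)
  # Subdirectory names are export timestamps; pick the most recent one.
  return os.path.join(eval_model_base, max(os.listdir(eval_model_base)))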
def validate_stats(stats_path, schema_path, anomalies_path):
  """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
  print('Validating schema against the computed statistics.')
  schema = taxi.read_schema(schema_path)
  stats = tfdv.load_statistics(stats_path)
  anomalies = tfdv.validate_statistics(stats, schema)
  print('Detected following anomalies:')
  print(text_format.MessageToString(anomalies))

  print('Writing anomalies to anomalies path.')
  file_io.write_string_to_file(anomalies_path,
                               text_format.MessageToString(anomalies))
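def _example_read_anomalies(anomalies_path):
  """Hedged usage sketch; not part of the original module.

  validate_stats writes the Anomalies proto in text format, so it can be
  parsed back for programmatic checks. Assumes tensorflow_metadata is
  installed alongside TFDV; the path argument is whatever anomalies_path was
  passed to validate_stats.
  """
  from tensorflow_metadata.proto.v0 import anomalies_pb2
  anomalies = anomalies_pb2.Anomalies()
  text_format.Parse(file_io.read_file_to_string(anomalies_path), anomalies)
  # anomaly_info is keyed by feature name; empty means the stats match the schema.
  return anomalies.anomaly_info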
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples.
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
  """

  def transform_ngrams(input, ngram_range):
    """Helper function to transform ngrams and print output."""
    # This print statement causes the output to concat itself!
    # input = tf.Print(input, [input], "raw input:", first_n=-1, summarize=100)
    transformed = transform.ngrams(
        tf.string_split(input, delimiter=" "),
        ngram_range=ngram_range,
        separator=' ')
    # A SparseTensor cannot be printed directly because it is made up of three
    # tensors. We can use this trick to print the values column, but without
    # the indices it is not too meaningful.
    #
    # values = tf.Print(transformed.values, [transformed.values],
    #                   "ngram output:")
    # transformed = tf.SparseTensor(
    #     indices=transformed.indices,
    #     values=values,
    #     dense_shape=transformed.dense_shape)
    return transformed

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      print('processing key', key)
      print('input:', inputs[key])
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    # for key in taxi.FEATURE_NGRAM:
    #   # Extract ngrams and build a vocab.
    #   outputs[
    #       taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
    #           transform.ngrams(
    #               tf.string_split(_fill_in_missing(inputs[key])),
    #               ngram_range=taxi.NGRAM_RANGE,
    #               separator=' '),
    #           top_k=512,
    #           num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.FEATURE_NGRAM:
      # Extract ngrams and build a vocab.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              transform_ngrams(_fill_in_missing(inputs[key]),
                               taxi.NGRAM_RANGE),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema, input_handle.lower())
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix),
              file_name_suffix='.gz'))
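def _example_transform_calls():
  """Hedged usage sketch; not part of the original preprocessing module.

  Paths, prefixes and runner args are placeholders. The first call analyzes
  the training data and writes the transform_fn under working_dir; the second
  passes transform_dir so the eval split is transformed with the same
  transform_fn instead of recomputing it, as the docstring hint suggests.
  """
  transform_data(
      input_handle='data/train/data.csv',
      outfile_prefix='train_transformed',
      working_dir='output/tft_working_dir',
      schema_file='schema.pbtxt',
      pipeline_args=['--runner=DirectRunner'])

  transform_data(
      input_handle='data/eval/data.csv',
      outfile_prefix='eval_transformed',
      working_dir='output/tft_working_dir',
      schema_file='schema.pbtxt',
      transform_dir='output/tft_working_dir',
      pipeline_args=['--runner=DirectRunner'])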
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """

  if input_csv == big_query_table and input_csv is None:
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      assert big_query_table
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >> beam.Map(
              lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'ExtractEvaluateAndWriteResults' >>
        tfma.ExtractEvaluateAndWriteResults(
            eval_shared_model=eval_shared_model,
            slice_spec=slice_spec,
            output_path=eval_result_dir))
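def _example_tfma_run(eval_result_dir, eval_model_dir):
  """Hedged usage sketch; not part of the original module.

  Evaluates a local CSV with the DirectRunner and then reads the sliced
  metrics back. File paths are placeholders; tfma.load_eval_result reads the
  output that ExtractEvaluateAndWriteResults wrote under eval_result_dir.
  """
  process_tfma(
      eval_result_dir=eval_result_dir,
      schema_file='schema.pbtxt',
      input_csv='data/eval/data.csv',
      eval_model_dir=eval_model_dir,
      pipeline_args=['--runner=DirectRunner'])

  eval_result = tfma.load_eval_result(output_path=eval_result_dir)
  # slicing_metrics pairs each slice key (e.g. trip_start_hour) with its metrics.
  return eval_result.slicing_metrics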