def _do_inference(hostport, examples_file, num_examples):
  """Sends a request to the model and returns the result.

  Args:
    hostport: path to prediction service like host:port
    examples_file: path to csv file containing examples, with the first line
      assumed to have the column headers
    num_examples: number of requests to send to the server

  Returns:
    Response from model server, or None if the example file is exhausted
    before a request could be built.
  """
  host, port = hostport.split(':')
  channel = implementations.insecure_channel(host, int(port))
  stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

  csv_coder = taxi.make_csv_coder()
  # The raw feature spec does not depend on the example being processed, so
  # fetch it once instead of once per loop iteration.
  raw_feature_spec = taxi.get_raw_feature_spec()

  # Context manager guarantees the examples file is closed even if decoding
  # or the RPC below raises (the original leaked the handle).
  with open(examples_file, 'r') as f:
    f.readline()  # skip header line

    for _ in range(num_examples):
      request = predict_pb2.PredictRequest()
      request.model_spec.name = 'chicago_taxi'
      request.model_spec.signature_name = 'predict'

      one_line = f.readline()
      if not one_line:
        print('End of example file reached')
        return

      one_example = taxi.clean_raw_data_dict(csv_coder.decode(one_line))
      print(one_example)

      for key, val in six.iteritems(one_example):
        # 'tips' is the label column and is not fed to the serving signature.
        if key != 'tips':
          tfproto = tf.contrib.util.make_tensor_proto(
              val, shape=[1], dtype=raw_feature_spec[key].dtype)
          request.inputs[key].CopyFrom(tfproto)

      # NOTE(review): returning here means at most ONE request is ever sent,
      # regardless of num_examples. Preserved as-is to keep the return value
      # unchanged — confirm whether the loop was meant to send num_examples
      # requests.
      return stub.Predict(request, _TIMEOUT_SECONDS)
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.
    publish_to_bq: If True, pipeline metrics are collected via a MetricsReader
      and published after the run completes.
    project: GCP project used for the BigQuery read (and passed to the
      MetricsReader when publish_to_bq is set).
    metrics_dataset: BigQuery dataset the published metrics table belongs to.
    metrics_table: BigQuery table to publish metrics to; also reused as the
      Beam metrics namespace for the MeasureTime steps.

  Raises:
    ValueError: if big_query_table is not specified.
  """
  if big_query_table is None:
    raise ValueError(
        '--big_query_table should be provided.')

  # Compute metrics over the whole dataset and sliced by trip_start_hour.
  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  # Only instantiate the monitor when publishing; `publish_metrics` below is
  # guarded on this being non-None.
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace)
    )

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(query=query, project=project,
                                           use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >> beam.Map(lambda x: (
          taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)
  # Prepare arguments for Extract, Evaluate and Write steps
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)

  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)

  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors,
          evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(metrics_namespace))
  )
  # Run synchronously: the monitor needs the finished PipelineResult to read
  # the collected metrics.
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """
  # Exactly one input source must be given. The original check only rejected
  # the neither-given case; the documented contract also forbids setting both,
  # so that case now raises as well.
  if (input_csv is None) == (big_query_table is None):
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  # Compute metrics over the whole dataset and sliced by trip_start_hour.
  slice_spec = [
      slicer.SingleSliceSpec(),
      slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >> beam.Map(
              lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
            eval_saved_model_path=eval_model_dir,
            slice_spec=slice_spec,
            add_metrics_callbacks=[
                post_export_metrics.calibration_plot_and_prediction_histogram(),
                post_export_metrics.auc_plots()
            ],
            output_path=eval_result_dir))
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """
  # Exactly one input source must be given. The original check only rejected
  # the neither-given case; the documented contract also forbids setting both,
  # so that case now raises as well.
  if (input_csv is None) == (big_query_table is None):
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  # Compute metrics over the whole dataset and sliced by trip_start_hour.
  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >> beam.Map(
              lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'ExtractEvaluateAndWriteResults' >>
        tfma.ExtractEvaluateAndWriteResults(
            eval_shared_model=eval_shared_model,
            slice_spec=slice_spec,
            output_path=eval_result_dir))