Example #1
def example_serving_receiver_fn(tf_transform_dir, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_feature_spec.pop(taxi.LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform(
          os.path.join(tf_transform_dir, transform_fn_io.TRANSFORM_FN_DIR),
          serving_input_receiver.features))

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors)
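
A minimal usage sketch of how a receiver like this is typically handed to a TF 1.x Estimator export; the estimator, tf_transform_dir and schema objects are assumed to exist already.

import functools

# Bind the receiver arguments so the Estimator can call it with no arguments.
serving_receiver_fn = functools.partial(
    example_serving_receiver_fn, tf_transform_dir, schema)

# Writes a SavedModel usable by TensorFlow Serving (TF 1.x Estimator API).
estimator.export_savedmodel('/tmp/serving_model_dir', serving_receiver_fn)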
Example #2
def eval_input_receiver_fn(tf_transform_dir, schema):
    """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
    # Notice that the inputs are raw features, not transformed features here.
    raw_feature_spec = taxi.get_raw_feature_spec(schema)

    serialized_tf_example = tf.placeholder(dtype=tf.string,
                                           shape=[None],
                                           name='input_example_tensor')

    # Add a parse_example operator to the tensorflow graph, which will parse
    # raw, untransformed, tf examples.
    features = tf.parse_example(serialized_tf_example, raw_feature_spec)

    # Now that we have our raw examples, process them through the tf-transform
    # function computed during the preprocessing step.
    _, transformed_features = (
        saved_transform_io.partially_apply_saved_transform(
            os.path.join(tf_transform_dir, transform_fn_io.TRANSFORM_FN_DIR),
            features))

    # The key name MUST be 'examples'.
    receiver_tensors = {'examples': serialized_tf_example}

    # NOTE: Model is driven by transformed features (since training works on the
    # materialized output of TFT), but slicing will happen on raw features.
    features.update(transformed_features)

    return tfma.export.EvalInputReceiver(
        features=features,
        receiver_tensors=receiver_tensors,
        labels=transformed_features[taxi.transformed_name(taxi.LABEL_KEY)])
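
A receiver like this is normally passed to TFMA's EvalSavedModel exporter. A hedged sketch, assuming estimator, tf_transform_dir, schema and eval_model_dir are already defined:

import functools

import tensorflow_model_analysis as tfma

tfma.export.export_eval_savedmodel(
    estimator=estimator,
    export_dir_base=eval_model_dir,
    eval_input_receiver_fn=functools.partial(
        eval_input_receiver_fn, tf_transform_dir, schema))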
Example #3
def eval_input_receiver_fn(tf_transform_dir, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_dir: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = taxi.get_raw_feature_spec(schema)

  serialized_tf_example = tf.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.parse_example(serialized_tf_example, raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform(
          os.path.join(tf_transform_dir, transform_fn_io.TRANSFORM_FN_DIR),
          features))

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[taxi.transformed_name(taxi.LABEL_KEY)])
Example #4
def _do_inference(hostport, examples_file, num_examples):
    """Sends a request to the model and returns the result.

  Args:
    hostport: path to prediction service like host:port
    examples_file: path to csv file containing examples, with the first line
      assumed to have the column headers
    num_examples: number of requests to send to the server

  Returns:
    Response from model server
  """
    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    csv_coder = taxi.make_csv_coder()
    f = open(examples_file, 'r')
    f.readline()  # skip header line

    response = None
    for _ in range(num_examples):
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'chicago_taxi'
        request.model_spec.signature_name = 'predict'
        one_line = f.readline()
        if not one_line:
            print('End of example file reached')
            break

        one_example = taxi.clean_raw_data_dict(csv_coder.decode(one_line))
        print(one_example)

        raw_feature_spec = taxi.get_raw_feature_spec()
        for key, val in six.iteritems(one_example):
            if key != 'tips':
                tfproto = tf.contrib.util.make_tensor_proto(
                    val, shape=[1], dtype=raw_feature_spec[key].dtype)
                request.inputs[key].CopyFrom(tfproto)

        response = stub.Predict(request, _TIMEOUT_SECONDS)
        print(response)

    f.close()
    return response
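
A hypothetical driver for the client above. The server address, CSV path and request count are placeholders, and _TIMEOUT_SECONDS is assumed to be defined at module level next to the function.

_TIMEOUT_SECONDS = 5.0  # assumed module-level constant used by _do_inference

if __name__ == '__main__':
    # Assumes a TensorFlow Serving instance exposing the 'chicago_taxi' model.
    print(_do_inference('localhost:9000', 'data/eval/data.csv', 3))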
Example #5
def example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_feature_spec.pop(taxi.LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors)
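
The tf_transform_output argument above is normally a tft.TFTransformOutput built from the directory that tf.Transform wrote during preprocessing. A minimal sketch, with working_dir assumed:

import tensorflow_transform as tft

tf_transform_output = tft.TFTransformOutput(working_dir)
serving_receiver_fn = lambda: example_serving_receiver_fn(
    tf_transform_output, schema)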
Example #6
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (
                    pipeline
                    | 'ReadBigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(query=query,
                                               use_standard_sql=True))
                    |
                    'CleanData' >> beam.Map(taxi.clean_raw_data_dict,
                                            raw_feature_spec=raw_feature_spec))

            if transform_dir is None:
                transform_fn = (
                    (raw_data, raw_data_metadata)
                    | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        tft_beam.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | tft_beam.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
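
preprocessing_fn above relies on a _fill_in_missing helper that is not part of the snippet. A typical definition, as used in the TFX Chicago Taxi example, densifies a SparseTensor that has at most one value per row, filling in '' or 0 for missing entries:

import tensorflow as tf

def _fill_in_missing(x):
  """Replaces missing values in a SparseTensor with '' or 0 and densifies it."""
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)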
Example #7
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: whether to publish the job's metrics to BigQuery.
    project: GCP project used for reading from BigQuery and for publishing
      metrics.
    metrics_dataset: BigQuery dataset to which the metrics are published.
    metrics_table: BigQuery table to which the metrics are published; also used
      as the metrics namespace.

  Raises:
    ValueError: if big_query_table is not specified.
  """

  if big_query_table is None:
    raise ValueError(
        '--big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace)
    )

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(query=query, project=project,
                                           use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >> beam.Map(lambda x: (
          taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)
  # Prepare arguments for Extract, Evaluate and Write steps
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)

  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)
  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors,
          evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(metrics_namespace))
  )
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
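
MeasureTime, MetricsReader, MetricsFilter and ReadFromBigQuery used above are not shown in the snippet; in the benchmarking variants of this example they are expected to come from Apache Beam. The import paths below are an assumption and should be verified against the Beam version in use:

from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.load_tests.load_test_metrics_utils import MetricsReader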
Example #8
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """

  if input_csv == big_query_table and input_csv is None:
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  slice_spec = [
      slicer.SingleSliceSpec(),
      slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      assert big_query_table
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >>
          beam.Map(lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
            eval_saved_model_path=eval_model_dir,
            slice_spec=slice_spec,
            add_metrics_callbacks=[
                post_export_metrics.calibration_plot_and_prediction_histogram(),
                post_export_metrics.auc_plots()
            ],
            output_path=eval_result_dir))
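
Once the pipeline has run, the analysis written to eval_result_dir can be loaded back for inspection, for example in a notebook. A small sketch using the TFMA result-loading API:

import tensorflow_model_analysis as tfma

eval_result = tfma.load_eval_result(output_path=eval_result_dir)
# Per-slice metrics, including the overall slice and the trip_start_hour slices.
print(eval_result.slicing_metrics)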
Example #9
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as
      DATASET.TABLE or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = transform.scale_to_z_score(inputs[key])

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[key] = transform.string_to_int(
                inputs[key],
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[key] = transform.bucketize(inputs[key],
                                               taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[key] = inputs[key]

        # Was this passenger a big tipper?
        def convert_label(label):
            taxi_fare = inputs[taxi.FARE_KEY]
            return tf.where(
                tf.is_nan(taxi_fare),
                tf.cast(tf.zeros_like(taxi_fare), tf.int64),
                # Test if the tip was > 20% of the fare.
                tf.cast(
                    tf.greater(label, tf.multiply(taxi_fare,
                                                  tf.constant(0.2))),
                    tf.int64))

        outputs[taxi.LABEL_KEY] = transform.apply_function(
            convert_label, inputs[taxi.LABEL_KEY])

        return outputs

    raw_feature_spec = taxi.get_raw_feature_spec()
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with beam_impl.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder()
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))

            raw_data |= 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)

            transform_fn = (
                (raw_data, raw_data_metadata)
                | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (
                transformed_data
                | 'SerializeExamples' >> beam.Map(coder.encode)
                | 'WriteExamples' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, outfile_prefix),
                    compression_type=beam.io.filesystem.CompressionTypes.GZIP))
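
A hypothetical driver for this variant of transform_data; the paths and the runner flag are placeholders only:

if __name__ == '__main__':
    transform_data(
        input_handle='data/train/data.csv',
        outfile_prefix='train_transformed',
        working_dir='/tmp/chicago_taxi_output',
        max_rows=None,
        pipeline_args=['--runner=DirectRunner'])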
Example #10
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: whether to publish the job's metrics to BigQuery.
    project: GCP project used for reading from BigQuery and for publishing
      metrics.
    metrics_table: BigQuery table to which the metrics are published; also used
      as the metrics namespace.
    metrics_dataset: BigQuery dataset to which the metrics are published.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    namespace = metrics_table
    metrics_monitor = None
    if publish_to_bq:
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            namespace=namespace,
            filters=MetricsFilter().with_namespace(namespace))
    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    pipeline = beam.Pipeline(argv=pipeline_args)
    with tft_beam.Context(temp_dir=working_dir):
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> ReadFromBigQuery(
                query=query, project=project, use_standard_sql=True)
            | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
        decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                    raw_feature_spec=raw_feature_spec)

        if transform_dir is None:
            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata) |
                ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

            _ = (
                transform_fn |
                ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
        else:
            transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

        # Shuffling the data before materialization will improve training
        # effectiveness downstream. Here we shuffle the raw_data (as opposed to
        # decoded data) since it has a compact representation.
        shuffled_data = (
            raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

        decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
        (transformed_data, transformed_metadata) = (
            ((decoded_data, raw_data_metadata), transform_fn)
            | 'Transform' >> tft_beam.TransformDataset())

        coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(working_dir, outfile_prefix),
                 file_name_suffix='.gz'))
    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
Example #11
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def transform_ngrams(input, ngram_range):
    """ helper function to transform ngrams and print output. """
    # this print statement causes output to concat itself!
    # input = tf.Print(input, [input], "raw input:", first_n=-1, summarize=100)

    transformed = transform.ngrams(
      tf.string_split(input, delimiter=" "),
      ngram_range=ngram_range,
      separator=' ')

    # SparseTensor basically cannot be printed because it's made up of 3
    # tensors. We can use this trick to print the values column, but without the index
    # it's not too meaningful.
    #
    # values = tf.Print(transformed.values, [transformed.values], "ngram output:")
    # transformed = tf.SparseTensor(
    #       indices=transformed.indices,
    #       values=values,
    #       dense_shape=transformed.dense_shape)
    return transformed

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      print('processing key', key)
      print('input:', inputs[key])
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    # for key in taxi.FEATURE_NGRAM:
    #   # Extract ngrams and build a vocab.
    #   outputs[
    #       taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
    #           transform.ngrams(
    #             tf.string_split(_fill_in_missing(inputs[key])),
    #             ngram_range=taxi.NGRAM_RANGE,
    #             separator=' '),
    #           top_k=512,
    #           num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.FEATURE_NGRAM:
      # Extract ngrams and build a vocab.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            transform_ngrams(_fill_in_missing(inputs[key]), taxi.NGRAM_RANGE),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema, input_handle.lower())
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz')
      )
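
A minimal, self-contained sketch of the ngram transform used by transform_ngrams above, outside of a preprocessing_fn. The sentence and the (1, 2) range (standing in for taxi.NGRAM_RANGE) are illustrative only:

import tensorflow as tf
import tensorflow_transform as transform

sentences = tf.constant(['please hold my taxi fare'])
tokens = tf.string_split(sentences, delimiter=' ')  # SparseTensor of tokens
ngrams = transform.ngrams(tokens, ngram_range=(1, 2), separator=' ')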
Example #12
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """

  if input_csv == big_query_table and input_csv is None:
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      assert big_query_table
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >>
          beam.Map(lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        |
        'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
            eval_shared_model=eval_shared_model,
            slice_spec=slice_spec,
            output_path=eval_result_dir))
Example #13
def main():
    tf.logging.set_verbosity(tf.logging.INFO)

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--eval_model_dir',
        help='Input path to the model which will be evaluated.')
    parser.add_argument(
        '--eval_result_dir',
        help='Output directory in which the model analysis result is written.')
    parser.add_argument(
        '--big_query_table',
        help='BigQuery path to input examples which will be evaluated.')
    parser.add_argument(
        '--input_csv',
        help='CSV file containing raw data which will be evaluated.')
    parser.add_argument('--max_eval_rows',
                        help='Maximum number of rows to evaluate on.',
                        default=None,
                        type=int)

    known_args, pipeline_args = parser.parse_known_args()

    if known_args.eval_result_dir:
        eval_result_dir = known_args.eval_result_dir
    else:
        eval_result_dir = tempfile.mkdtemp()

    slice_spec = [
        slicer.SingleSliceSpec(),
        slicer.SingleSliceSpec(columns=['trip_start_hour'])
    ]

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        if known_args.input_csv:
            csv_coder = taxi.make_csv_coder()
            raw_data = (pipeline
                        | 'ReadFromText' >> beam.io.ReadFromText(
                            known_args.input_csv, skip_header_lines=1)
                        | 'ParseCSV' >> beam.Map(csv_coder.decode))
        elif known_args.big_query_table:
            query = taxi.make_sql(known_args.big_query_table,
                                  known_args.max_eval_rows,
                                  for_eval=True)
            raw_data = (pipeline
                        | 'ReadBigQuery' >> beam.io.Read(
                            beam.io.BigQuerySource(query=query,
                                                   use_standard_sql=True)))
        else:
            raise ValueError(
                'one of --input_csv or --big_query_table should be '
                'provided.')

        # Examples must be in clean tf-example format.
        raw_feature_spec = taxi.get_raw_feature_spec()
        raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        coder = example_proto_coder.ExampleProtoCoder(raw_schema)

        _ = (raw_data
             | 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)
             | 'ToSerializedTFExample' >> beam.Map(coder.encode)
             | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
                 eval_saved_model_path=known_args.eval_model_dir,
                 slice_spec=slice_spec,
                 add_metrics_callbacks=[
                     post_export_metrics.
                     calibration_plot_and_prediction_histogram(),
                     post_export_metrics.auc_plots()
                 ],
                 output_path=eval_result_dir,
                 desired_batch_size=100))
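
A hypothetical invocation of the script above; the script filename, paths and table are placeholders. Flags not declared by argparse (for example --runner) fall through to pipeline_args via parse_known_args:

# python tfma_analyze.py --eval_model_dir=/tmp/eval_model \
#     --eval_result_dir=/tmp/eval_result --input_csv=data/eval/data.csv \
#     --max_eval_rows=1000 --runner=DirectRunner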