Example #1
  def test_preprocessing_fn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_output_path = os.path.join(working_dir, 'transform_output')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run very simplified version of executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate legacy `DatasetMetadata` object.  Future version of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = (
            p
            | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                os.path.join(self._testdata_path, 'csv_example_gen/train/*'),
                coder=beam.coders.BytesCoder(),
                # TODO(b/114938612): Eventually remove this override.
                validate=False)
            | 'DecodeTrainData' >> beam.Map(decoder.decode))
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, legacy_metadata)
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
             transform_output_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_output/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_output_path,
                     'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    self.assertEqual(transformed_schema, expected_transformed_schema)
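
Every example on this page leans on the same io_utils.parse_pbtxt_file contract: parse a text-format protobuf file into the message instance you pass in, and get that message back. A minimal, self-contained round-trip sketch (the temp directory and feature name are illustrative, not taken from the test above):

# Sketch only: round-trip a Schema proto through write_pbtxt_file /
# parse_pbtxt_file. The directory and feature name are hypothetical.
import os
import tempfile

from tensorflow_metadata.proto.v0 import schema_pb2
from tfx.utils import io_utils

schema = schema_pb2.Schema()
schema.feature.add(name='trip_miles', type=schema_pb2.FLOAT)

schema_path = os.path.join(tempfile.mkdtemp(), 'schema.pbtxt')
io_utils.write_pbtxt_file(schema_path, schema)   # writes text-format proto
parsed = io_utils.parse_pbtxt_file(schema_path, schema_pb2.Schema())
assert parsed == schema
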
Example #2
def compare_anomalies(output_uri: Text, expected_uri: Text) -> bool:
    """Compares anomalies files in output uri and recorded uri.

  Args:
    output_uri: pipeline output artifact uri.
    expected_uri: recorded pipeline output artifact uri.

  Returns:
    True if the anomalies are the same, False otherwise.
  """
    for dir_name, _, leaf_files in tf.io.gfile.walk(expected_uri):
        for leaf_file in leaf_files:
            expected_file_name = os.path.join(dir_name, leaf_file)
            file_name = os.path.join(
                dir_name.replace(expected_uri, output_uri, 1), leaf_file)
            anomalies = anomalies_pb2.Anomalies()
            io_utils.parse_pbtxt_file(os.path.join(output_uri, file_name),
                                      anomalies)
            expected_anomalies = anomalies_pb2.Anomalies()
            io_utils.parse_pbtxt_file(
                os.path.join(expected_uri, expected_file_name),
                expected_anomalies)
            if expected_anomalies.anomaly_info != anomalies.anomaly_info:
                return False
    return True
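
A short usage sketch for compare_anomalies; both directories are hypothetical placeholders for a fresh pipeline output and a recorded golden copy:

# Sketch only: the URIs are hypothetical placeholders.
output_uri = '/tmp/pipeline_output/ExampleValidator/anomalies/1'
expected_uri = '/testdata/recorded/ExampleValidator/anomalies/1'

if not compare_anomalies(output_uri, expected_uri):
  raise AssertionError('Anomalies differ from the recorded golden output.')
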
Example #3
def annotate_schema(
    ignore_features: Parameter[str],
    original_schema: InputArtifact[standard_artifacts.Schema],
    schema: OutputArtifact[standard_artifacts.Schema],
) -> None:  # pytype: disable=invalid-annotation,wrong-arg-types
    r"""Updates a schema with additional metadata.

  Args:
    ignore_features: Newline ('\n') separated list of features to mark as
      disabled in the output schema.
    original_schema: The Schema artifact to modify.
    schema: The output Schema with updates.
  """

    schema_file = io_utils.get_only_uri_in_dir(original_schema.uri)
    dataset_schema = schema_pb2.Schema()
    io_utils.parse_pbtxt_file(schema_file, dataset_schema)

    ignore_features = ignore_features.split("\n")
    for feature in dataset_schema.feature:
        if feature.name in ignore_features:
            logging.info("Marking '%s' as DISABLED.", feature.name)
            feature.lifecycle_stage = schema_pb2.LifecycleStage.DISABLED

    io_utils.write_pbtxt_file(os.path.join(schema.uri, "schema.txt"),
                              dataset_schema)
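
annotate_schema above is written in the TFX Python-function component style (Parameter / InputArtifact / OutputArtifact annotations). A hedged sketch of wiring it into a pipeline, assuming the function is wrapped with the experimental @component decorator (not shown in the snippet) and that a SchemaGen node named schema_gen exists:

# Hypothetical wiring; assumes annotate_schema is decorated with @component
# (tfx.dsl.component.experimental.decorators) and schema_gen is a SchemaGen node.
schema_annotator = annotate_schema(
    ignore_features='company\ndropoff_census_tract',  # newline-separated names
    original_schema=schema_gen.outputs['schema'])

# Downstream components would then consume schema_annotator.outputs['schema'].
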
Example #4
  def testPreprocessingFn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_graph_path = os.path.join(working_dir, 'transform_graph')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run very simplified version of executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate legacy `DatasetMetadata` object.  Future version of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    tfxio = tf_example_record.TFExampleRecord(
        file_pattern=os.path.join(self._testdata_path,
                                  'csv_example_gen/Split-train/*'),
        telemetry_descriptors=['Tests'],
        schema=legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = p | 'ReadTrainData' >> tfxio.BeamSource()
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, tfxio.TensorAdapterConfig())
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'Split-train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_graph/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    # Clear annotations so we only have to test main schema.
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(transformed_schema, expected_transformed_schema)
Example #5
  def testDo(self):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    eval_stats_artifact = standard_artifacts.ExampleStatistics()
    eval_stats_artifact.uri = os.path.join(source_data_dir, 'statistics_gen')
    eval_stats_artifact.split_names = artifact_utils.encode_split_names(
        ['train', 'eval', 'test'])

    schema_artifact = standard_artifacts.Schema()
    schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen')

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    validation_output = standard_artifacts.ExampleAnomalies()
    validation_output.uri = os.path.join(output_data_dir, 'output')

    input_dict = {
        executor.STATISTICS_KEY: [eval_stats_artifact],
        executor.SCHEMA_KEY: [schema_artifact],
    }

    exec_properties = {
        # List needs to be serialized before being passed into Do function.
        executor.EXCLUDE_SPLITS_KEY:
            json_utils.dumps(['test'])
    }

    output_dict = {
        executor.ANOMALIES_KEY: [validation_output],
    }

    example_validator_executor = executor.Executor()
    example_validator_executor.Do(input_dict, output_dict, exec_properties)

    self.assertEqual(
        artifact_utils.encode_split_names(['train', 'eval']),
        validation_output.split_names)

    # Check example_validator outputs.
    train_anomalies_path = os.path.join(validation_output.uri, 'train',
                                        'anomalies.pbtxt')
    eval_anomalies_path = os.path.join(validation_output.uri, 'eval',
                                       'anomalies.pbtxt')
    self.assertTrue(tf.io.gfile.exists(train_anomalies_path))
    self.assertTrue(tf.io.gfile.exists(eval_anomalies_path))
    train_anomalies = io_utils.parse_pbtxt_file(train_anomalies_path,
                                                anomalies_pb2.Anomalies())
    eval_anomalies = io_utils.parse_pbtxt_file(eval_anomalies_path,
                                               anomalies_pb2.Anomalies())
    self.assertEqual(0, len(train_anomalies.anomaly_info))
    self.assertEqual(0, len(eval_anomalies.anomaly_info))

    # Assert 'test' split is excluded.
    train_file_path = os.path.join(validation_output.uri, 'test',
                                   'anomalies.pbtxt')
    self.assertFalse(tf.io.gfile.exists(train_file_path))
Example #6
def _test_pipeline(pipeline_id: str, run_id: str):
  """Creates test pipeline with pipeline_id and run_id."""
  pipeline = pipeline_pb2.Pipeline()
  path = os.path.join(
      os.path.dirname(__file__), 'testdata', 'sync_pipeline.pbtxt')
  io_utils.parse_pbtxt_file(path, pipeline)
  pipeline.pipeline_info.id = pipeline_id
  runtime_parameter_utils.substitute_runtime_parameter(pipeline, {
      'pipeline_run_id': run_id,
  })
  return pipeline
Example #7
def _test_pipeline(ir_path: str, pipeline_id: str, run_id: str,
                   deployment_config: Optional[message.Message]):
  """Creates test pipeline with pipeline_id and run_id."""
  pipeline = pipeline_pb2.Pipeline()
  io_utils.parse_pbtxt_file(ir_path, pipeline)
  pipeline.pipeline_info.id = pipeline_id
  runtime_parameter_utils.substitute_runtime_parameter(pipeline, {
      constants.PIPELINE_RUN_ID_PARAMETER_NAME: run_id,
  })
  if deployment_config:
    pipeline.deployment_config.Pack(deployment_config)
  return pipeline
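
Because deployment_config is packed into a protobuf Any field, reading it back requires an Unpack. A sketch under the assumption that the IR path exists and the config is a LocalDeploymentConfig:

# Sketch only: the IR path is a placeholder.
from tfx.proto.orchestration import local_deployment_config_pb2

deployment_config = local_deployment_config_pb2.LocalDeploymentConfig()
pipeline = _test_pipeline('/testdata/sync_pipeline_ir.pbtxt', 'my_pipeline',
                          'run_0', deployment_config)

# deployment_config was packed into an Any; unpack to inspect it again.
restored = local_deployment_config_pb2.LocalDeploymentConfig()
assert pipeline.deployment_config.Unpack(restored)
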
Example #8
File: tuner_module.py  Project: zvrr/tfx
def tuner_fn(fn_args: FnArgs) -> TunerFnResult:
  """Build the tuner using the KerasTuner API.

  Args:
    fn_args: Holds args as name/value pairs.
      - working_dir: working dir for tuning.
      - train_files: List of file paths containing training tf.Example data.
      - eval_files: List of file paths containing eval tf.Example data.
      - train_steps: number of train steps.
      - eval_steps: number of eval steps.
      - schema_path: optional schema of the input data.
      - transform_graph_path: optional transform graph produced by TFT.

  Returns:
    A namedtuple containing the following:
      - tuner: A BaseTuner that will be used for tuning.
      - fit_kwargs: Args to pass to tuner's run_trial function for fitting the
                    model, e.g., the training and validation dataset. Required
                    args depend on the above tuner's implementation.
  """
  hp = kerastuner.HyperParameters()
  # Defines search space.
  hp.Choice('learning_rate', [1e-1, 1e-3])
  hp.Int('num_layers', 1, 5)

  # RandomSearch is a subclass of kerastuner.Tuner, which inherits from BaseTuner.
  tuner = kerastuner.RandomSearch(
      _build_keras_model,
      max_trials=5,
      hyperparameters=hp,
      allow_new_entries=False,
      objective='val_sparse_categorical_accuracy',
      directory=fn_args.working_dir,
      project_name='test')

  schema = schema_pb2.Schema()
  io_utils.parse_pbtxt_file(fn_args.schema_path, schema)
  train_dataset = _input_fn(fn_args.train_files, schema)
  eval_dataset = _input_fn(fn_args.eval_files, schema)

  return TunerFnResult(
      tuner=tuner,
      fit_kwargs={
          'x': train_dataset,
          'validation_data': eval_dataset,
          'steps_per_epoch': fn_args.train_steps,
          'validation_steps': fn_args.eval_steps
      })
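
tuner_fn above references a _build_keras_model helper that is not shown. A hypothetical minimal builder consuming the two hyperparameters declared in the search space ('learning_rate' and 'num_layers'); the layer width, input shape, and loss are assumptions:

# Hypothetical _build_keras_model; widths, input shape and loss are assumptions.
def _build_keras_model(hp: kerastuner.HyperParameters) -> tf.keras.Model:
  inputs = tf.keras.Input(shape=(4,), name='features')
  x = inputs
  for _ in range(hp.get('num_layers')):
    x = tf.keras.layers.Dense(8, activation='relu')(x)
  outputs = tf.keras.layers.Dense(3, activation='softmax')(x)
  model = tf.keras.Model(inputs=inputs, outputs=outputs)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(hp.get('learning_rate')),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])
  return model
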
Example #9
    def testTrainerFn(self):
        temp_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        trainer_fn_args = trainer_executor.TrainerFnArgs(
            train_files=os.path.join(
                self._testdata_path,
                'transform/transformed_examples/train/*.gz'),
            transform_output=os.path.join(self._testdata_path,
                                          'transform/transform_output/'),
            serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
            eval_files=os.path.join(
                self._testdata_path,
                'transform/transformed_examples/eval/*.gz'),
            schema_file=schema_file,
            train_steps=1,
            eval_steps=1,
            base_model=os.path.join(self._testdata_path,
                                    'trainer/current/serving_model_dir'),
            data_accessor=DataAccessor(tf_dataset_factory=tfxio_utils.
                                       get_tf_dataset_factory_from_artifact(
                                           [standard_artifacts.Examples()],
                                           []),
                                       record_batch_factory=None,
                                       data_view_decode_fn=None))
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        training_spec = taxi_utils_bqml.trainer_fn(trainer_fn_args, schema)

        estimator = training_spec['estimator']
        train_spec = training_spec['train_spec']
        eval_spec = training_spec['eval_spec']
        eval_input_receiver_fn = training_spec['eval_input_receiver_fn']

        self.assertIsInstance(estimator, tf.estimator.Estimator)
        self.assertIsInstance(train_spec, tf.estimator.TrainSpec)
        self.assertIsInstance(eval_spec, tf.estimator.EvalSpec)
        self.assertIsInstance(eval_input_receiver_fn, types.FunctionType)

        # Train for one step, then eval for one step.
        eval_result, exports = tf.estimator.train_and_evaluate(
            estimator, train_spec, eval_spec)
        self.assertGreater(eval_result['loss'], 0.0)
        self.assertEqual(len(exports), 1)
        self.assertGreaterEqual(len(fileio.listdir(exports[0])), 1)

        # Export the eval saved model.
        eval_savedmodel_path = tfma.export.export_eval_savedmodel(
            estimator=estimator,
            export_dir_base=path_utils.eval_model_dir(temp_dir),
            eval_input_receiver_fn=eval_input_receiver_fn)
        self.assertGreaterEqual(len(fileio.listdir(eval_savedmodel_path)), 1)

        # Test exported serving graph.
        with tf.compat.v1.Session() as sess:
            metagraph_def = tf.compat.v1.saved_model.loader.load(
                sess, [tf.saved_model.SERVING], exports[0])
            self.assertIsInstance(metagraph_def, tf.compat.v1.MetaGraphDef)
Example #10
def run_fn(fn_args: TrainerFnArgs):
    """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    x_train, y_train = _input_fn(fn_args.train_files, fn_args.data_accessor,
                                 schema)
    x_eval, y_eval = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                               schema)

    steps_per_epoch = _TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE

    model = MLPClassifier(hidden_layer_sizes=[8, 8, 8],
                          activation='relu',
                          solver='adam',
                          batch_size=_TRAIN_BATCH_SIZE,
                          learning_rate_init=0.0005,
                          max_iter=int(fn_args.train_steps / steps_per_epoch),
                          verbose=True)
    model.fit(x_train, y_train)
    absl.logging.info(model)

    score = model.score(x_eval, y_eval)
    absl.logging.info('Accuracy: %f', score)

    os.makedirs(fn_args.serving_model_dir)

    model_path = os.path.join(fn_args.serving_model_dir, 'model.pkl')
    with tf.io.gfile.GFile(model_path, 'wb+') as f:
        pickle.dump(model, f)
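
Since run_fn pickles the fitted scikit-learn model to model.pkl, a quick sanity check might reload it as in this sketch (the directory and held-out data are whatever run_fn was given):

# Sketch only: serving_model_dir, x_eval and y_eval are assumed to be available.
model_path = os.path.join(serving_model_dir, 'model.pkl')
with tf.io.gfile.GFile(model_path, 'rb') as f:
  restored_model = pickle.load(f)
absl.logging.info('Restored model accuracy: %f',
                  restored_model.score(x_eval, y_eval))
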
Example #11
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
    working_dir = self._get_tmp_dir()

    train_path = artifact_utils.get_split_uri(input_dict['examples'], 'train')
    eval_path = artifact_utils.get_split_uri(input_dict['examples'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    tuner_fn = self._GetTunerFn(exec_properties)
    tuner_spec = tuner_fn(working_dir, io_utils.all_files_pattern(train_path),
                          io_utils.all_files_pattern(eval_path), schema)
    tuner = tuner_spec.tuner

    tuner.search_space_summary()
    # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
    # TODO(jyzhao): make epochs configurable.
    tuner.search(
        tuner_spec.train_dataset,
        epochs=5,
        validation_data=tuner_spec.eval_dataset)
    tuner.results_summary()

    best_hparams = tuner.oracle.get_best_trials(
        1)[0].hyperparameters.get_config()
    best_hparams_path = os.path.join(
        artifact_utils.get_single_uri(output_dict['study_best_hparams_path']),
        _DEFAULT_FILE_NAME)
    io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
    absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
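
The executor above serializes the best HyperParameters with get_config() into a JSON file; a hedged sketch of restoring them in a downstream trainer (the file path is hypothetical, and kerastuner/json/tf are assumed to be imported):

# Sketch only: best_hparams_uri is a hypothetical path to the file written above.
best_hparams_uri = '/path/to/study_best_hparams_path/best_hparams.json'
with tf.io.gfile.GFile(best_hparams_uri) as f:
  best_hparams = kerastuner.HyperParameters.from_config(json.loads(f.read()))
learning_rate = best_hparams.get('learning_rate')
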
Example #12
  def test_trainer_fn(self):
    temp_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    hparams = tf.contrib.training.HParams(
        train_files=os.path.join(temp_dir, 'train_files'),
        transform_output=os.path.join(self._testdata_path,
                                      'transform/transform_output/'),
        output_dir=os.path.join(temp_dir, 'output_dir'),
        serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
        eval_files=os.path.join(temp_dir, 'eval_files'),
        schema_file=schema_file,
        train_steps=10001,
        eval_steps=5000,
        verbosity='INFO',
        warm_start_from=os.path.join(temp_dir, 'serving_model_dir'))
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    training_spec = taxi_utils.trainer_fn(hparams, schema)

    self.assertIsInstance(training_spec['estimator'],
                          tf.estimator.DNNLinearCombinedClassifier)
    self.assertIsInstance(training_spec['train_spec'], tf.estimator.TrainSpec)
    self.assertIsInstance(training_spec['eval_spec'], tf.estimator.EvalSpec)
    self.assertIsInstance(training_spec['eval_input_receiver_fn'],
                          types.FunctionType)
Example #13
def run_fn(fn_args: executor.TrainerFnArgs):
    """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    training_spec = trainer_fn(fn_args, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      fn_args.serving_model_dir)

    # Export an eval savedmodel for TFMA
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    absl.logging.info('Exported eval_savedmodel to %s.',
                      fn_args.eval_model_dir)
Example #14
def run_fn(fn_args: executor.TrainerFnArgs):
    """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    training_spec = trainer_fn(fn_args, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      fn_args.serving_model_dir)

    # Export an eval savedmodel for TFMA
    # NOTE: When training in a distributed cluster, the eval_savedmodel must be
    # exported only by the chief worker.
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    # Simulate writing a log to the path given by fn_args
    io_utils.write_string_file(
        os.path.join(fn_args.model_run_dir, 'fake_log.txt'), '')

    absl.logging.info('Exported eval_savedmodel to %s.',
                      fn_args.eval_model_dir)
Example #15
    def test_trainer_fn(self):
        temp_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        hparams = tf.contrib.training.HParams(
            train_files=os.path.join(temp_dir, 'train_files'),
            transform_output=os.path.join(self._testdata_path,
                                          'transform/transform_output/'),
            output_dir=os.path.join(temp_dir, 'output_dir'),
            serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
            eval_files=os.path.join(temp_dir, 'eval_files'),
            schema_file=schema_file,
            train_steps=10001,
            eval_steps=5000,
            verbosity='INFO',
            warm_start_from=os.path.join(temp_dir, 'serving_model_dir'))
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        training_spec = taxi_utils.trainer_fn(hparams, schema)

        self.assertIsInstance(training_spec['estimator'],
                              tf.estimator.DNNLinearCombinedClassifier)
        self.assertIsInstance(training_spec['train_spec'],
                              tf.estimator.TrainSpec)
        self.assertIsInstance(training_spec['eval_spec'],
                              tf.estimator.EvalSpec)
        self.assertIsInstance(training_spec['eval_input_receiver_fn'],
                              types.FunctionType)
Example #16
File: model.py  Project: zvrr/tfx
def run_fn(fn_args):
    """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    train_and_eval_spec = _create_train_and_eval_spec(fn_args, schema)

    # Train the model
    logging.info('Training model.')
    tf.estimator.train_and_evaluate(train_and_eval_spec['estimator'],
                                    train_and_eval_spec['train_spec'],
                                    train_and_eval_spec['eval_spec'])
    logging.info('Training complete.  Model written to %s',
                 fn_args.serving_model_dir)

    # Export an eval savedmodel for TFMA
    # NOTE: When training in a distributed cluster, the eval_savedmodel must be
    # exported only by the chief worker.
    logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=train_and_eval_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=train_and_eval_spec['eval_input_receiver_fn'])

    logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)
Example #17
def train_custom_model(
    fn_args: TrainerFnArgs,
    inputs: Dict[str, Feature],
    outputs: Dict[str, Feature],
    custom_model: Callable[
        [
            TrainerFnArgs,
            tf.data.Dataset,
            tf.data.Dataset,
        ],
        Any,
    ],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> Callable[[TrainerFnArgs], None]:
    if fn_args.transform_output is None:
        tf_transform_output = None
        schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())
        feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
        for output_name in Features(outputs).names():
            feature_spec.pop(output_name)

        def transform_features(serialized_tf_examples):
            return tf.io.parse_example(serialized_tf_examples, feature_spec)

    else:
        tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
        tft_layer = tf_transform_output.transform_features_layer()
        schema = tf_transform_output.transformed_metadata.schema
        feature_spec = tf_transform_output.raw_feature_spec()
        for output_name in Features(outputs).names():
            feature_spec.pop(output_name)

        def transform_features(serialized_tf_examples):
            parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
            return tft_layer(parsed_features)

    def build_dataset(files):
        return (
            fn_args.data_accessor.tf_dataset_factory(
                files,
                dataset_options.TensorFlowDatasetOptions(batch_size),
                schema,
            )
            .map(
                lambda batch: (
                    Features(inputs).map(lambda name, _: batch[name]),
                    Features(outputs).map(lambda name, _: batch[name]),
                )
            )
            .repeat()
        )

    distributed_strategy = tf.distribute.MirroredStrategy()
    with distributed_strategy.scope():
        custom_model(
            fn_args=fn_args,
            train_dataset=build_dataset(fn_args.train_files),
            eval_dataset=build_dataset(fn_args.eval_files),
            transform_features=transform_features,
        )
Example #18
    def test_trainer_fn(self):
        temp_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        output_dir = os.path.join(temp_dir, 'output_dir')
        hparams = tf.contrib.training.HParams(
            train_files=os.path.join(
                self._testdata_path,
                'transform/transformed_examples/train/*.gz'),
            transform_output=os.path.join(self._testdata_path,
                                          'transform/transform_output/'),
            output_dir=output_dir,
            serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
            eval_files=os.path.join(
                self._testdata_path,
                'transform/transformed_examples/eval/*.gz'),
            schema_file=schema_file,
            train_steps=1,
            eval_steps=1,
            verbosity='INFO',
            warm_start_from=os.path.join(self._testdata_path,
                                         'trainer/current/serving_model_dir'))
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        training_spec = taxi_utils.trainer_fn(hparams, schema)

        estimator = training_spec['estimator']
        train_spec = training_spec['train_spec']
        eval_spec = training_spec['eval_spec']
        eval_input_receiver_fn = training_spec['eval_input_receiver_fn']

        self.assertIsInstance(estimator,
                              tf.estimator.DNNLinearCombinedClassifier)
        self.assertIsInstance(train_spec, tf.estimator.TrainSpec)
        self.assertIsInstance(eval_spec, tf.estimator.EvalSpec)
        self.assertIsInstance(eval_input_receiver_fn, types.FunctionType)

        # Train for one step, then eval for one step.
        eval_result, exports = tf.estimator.train_and_evaluate(
            estimator, train_spec, eval_spec)
        self.assertGreater(eval_result['loss'], 0.0)
        self.assertEqual(len(exports), 1)
        self.assertGreaterEqual(len(tf.gfile.ListDirectory(exports[0])), 1)

        # Export the eval saved model.
        eval_savedmodel_path = tfma.export.export_eval_savedmodel(
            estimator=estimator,
            export_dir_base=path_utils.eval_model_dir(output_dir),
            eval_input_receiver_fn=eval_input_receiver_fn)
        self.assertGreaterEqual(
            len(tf.gfile.ListDirectory(eval_savedmodel_path)), 1)

        # Test exported serving graph.
        with tf.Session() as sess:
            metagraph_def = tf.compat.v1.saved_model.loader.load(
                sess, [tf.saved_model.tag_constants.SERVING], exports[0])
            self.assertIsInstance(metagraph_def, tf.MetaGraphDef)
Example #19
def run_fn(fn_args: FnArgs):
    """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    x_train, y_train = _input_fn(fn_args.train_files, fn_args.data_accessor,
                                 schema)
    x_eval, y_eval = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                               schema)

    steps_per_epoch = _TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE

    estimator = MLPClassifier(hidden_layer_sizes=[8, 8, 8],
                              activation='relu',
                              solver='adam',
                              batch_size=_TRAIN_BATCH_SIZE,
                              learning_rate_init=0.0005,
                              max_iter=int(fn_args.train_steps /
                                           steps_per_epoch),
                              verbose=True)

    # Create a pipeline that standardizes the input data before passing it to an
    # estimator. Once the scaler is fit, it will use the same mean and stdev to
    # transform inputs at both training and serving time.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('estimator', estimator),
    ])
    model.feature_keys = _FEATURE_KEYS
    model.label_key = _LABEL_KEY
    model.fit(x_train, y_train)
    absl.logging.info(model)

    score = model.score(x_eval, y_eval)
    absl.logging.info('Accuracy: %f', score)

    # Export the model as a pickle named model.pkl. AI Platform Prediction expects
    # sklearn model artifacts to follow this naming convention.
    os.makedirs(fn_args.serving_model_dir)

    model_path = os.path.join(fn_args.serving_model_dir, 'model.pkl')
    with fileio.open(model_path, 'wb+') as f:
        pickle.dump(model, f)
Example #20
def run_fn(fn_args: TrainerFnArgs):

    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    x_train, y_train = _input_fn(fn_args.train_files, fn_args.data_accessor,
                                 schema)

    cls = SGDClassifier(loss='log',
                        penalty='elasticnet',
                        learning_rate='adaptive',
                        eta0=2,
                        verbose=1,
                        tol=1e-2)

    count_vectorizer = CountVectorizer()

    pipeline = Pipeline([('vect', count_vectorizer), ('cls', cls)])

    grid = {
        'cls__alpha': [0.01, 0.5, 0.99, 2, 5],
        'cls__l1_ratio': [0.01, 0.5, 0.99]
    }

    grid_search_cv = GridSearchCV(pipeline, param_grid=grid, scoring='roc_auc')

    print(x_train.shape)
    print(x_train[0:2, :])
    # ravel here
    # model = grid_search_cv.fit(X=x_train.ravel(), y=y_train)
    model = pipeline.fit(X=x_train.ravel(), y=y_train)

    x_eval, y_eval = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                               schema)

    score = model.score(x_eval.ravel(), y_eval)
    absl.logging.info('Accuracy: %f', score)

    os.makedirs(fn_args.serving_model_dir)

    model_path = os.path.join(fn_args.serving_model_dir, 'model.pkl')
    with fileio.open(model_path, 'wb+') as f:
        pickle.dump(model, f)
Example #21
def run_fn(fn_args: executor.TrainerFnArgs):
    """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    training_spec = _trainer_fn(fn_args, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      fn_args.serving_model_dir)

    # Export an eval savedmodel for TFMA
    # NOTE: When training in a distributed cluster, the eval_savedmodel must be
    # exported only by the chief worker (check TF_CONFIG).
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    eval_export_dir = path_utils.eval_model_dir(fn_args.model_run_dir)
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=eval_export_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    absl.logging.info('Exported eval_savedmodel to %s.', eval_export_dir)

    # TODO(b/160795287): Deprecate estimator based executor.
    # Copy serving and eval model from model_run to model artifact directory.
    serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
    io_utils.copy_dir(serving_source, fn_args.serving_model_dir)
    absl.logging.info('Serving model copied to: %s.',
                      fn_args.serving_model_dir)

    eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
    io_utils.copy_dir(eval_source, fn_args.eval_model_dir)
    absl.logging.info('Eval model copied to: %s.', fn_args.eval_model_dir)
Example #22
def run_fn(fn_args):
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())

    train_and_eval_spec = _create_train_and_eval_spec(fn_args, schema)

    logging.info('Training model.')
    tf.estimator.train_and_evaluate(train_and_eval_spec['estimator'],
                                    train_and_eval_spec['train_spec'],
                                    train_and_eval_spec['eval_spec'])
    logging.info('Training complete.  Model written to %s',
                 fn_args.serving_model_dir)

    logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=train_and_eval_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=train_and_eval_spec['eval_input_receiver_fn'])

    logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)
Example #23
    def testDo(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        eval_stats_artifact = standard_artifacts.ExampleStatistics()
        eval_stats_artifact.uri = os.path.join(source_data_dir,
                                               'statistics_gen')
        eval_stats_artifact.split_names = artifact_utils.encode_split_names(
            ['eval'])

        schema_artifact = standard_artifacts.Schema()
        schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen')

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        validation_output = standard_artifacts.ExampleAnomalies()
        validation_output.uri = os.path.join(output_data_dir, 'output')

        input_dict = {
            executor.STATISTICS_KEY: [eval_stats_artifact],
            executor.SCHEMA_KEY: [schema_artifact],
        }
        output_dict = {
            executor.ANOMALIES_KEY: [validation_output],
        }

        exec_properties = {}

        example_validator_executor = executor.Executor()
        example_validator_executor.Do(input_dict, output_dict, exec_properties)
        self.assertEqual(['anomalies.pbtxt'],
                         tf.io.gfile.listdir(validation_output.uri))
        anomalies = io_utils.parse_pbtxt_file(
            os.path.join(validation_output.uri, 'anomalies.pbtxt'),
            anomalies_pb2.Anomalies())
        self.assertNotEqual(0, len(anomalies.anomaly_info))
Example #24
    def test_do(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        eval_stats_artifact = types.Artifact('ExampleStatsPath', split='eval')
        eval_stats_artifact.uri = os.path.join(source_data_dir,
                                               'statistics_gen/eval/')

        schema_artifact = standard_artifacts.Schema()
        schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen/')

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        validation_output = standard_artifacts.ExampleValidationResult()
        validation_output.uri = os.path.join(output_data_dir, 'output')

        input_dict = {
            'stats': [eval_stats_artifact],
            'schema': [schema_artifact],
        }
        output_dict = {
            'output': [validation_output],
        }

        exec_properties = {}

        example_validator_executor = executor.Executor()
        example_validator_executor.Do(input_dict, output_dict, exec_properties)
        self.assertEqual(['anomalies.pbtxt'],
                         tf.gfile.ListDirectory(validation_output.uri))
        anomalies = io_utils.parse_pbtxt_file(
            os.path.join(validation_output.uri, 'anomalies.pbtxt'),
            anomalies_pb2.Anomalies())
        self.assertNotEqual(0, len(anomalies.anomaly_info))
Example #25
  def test_do(self):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    eval_stats_artifact = types.TfxType('ExampleStatsPath', split='eval')
    eval_stats_artifact.uri = os.path.join(source_data_dir,
                                           'statistics_gen/eval/')

    schema_artifact = types.TfxType('SchemaPath')
    schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen/')

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    validation_output = types.TfxType('ExampleValidationPath')
    validation_output.uri = os.path.join(output_data_dir, 'output')

    input_dict = {
        'stats': [eval_stats_artifact],
        'schema': [schema_artifact],
    }
    output_dict = {
        'output': [validation_output],
    }

    exec_properties = {}

    example_validator_executor = executor.Executor()
    example_validator_executor.Do(input_dict, output_dict, exec_properties)
    self.assertEqual(['anomalies.pbtxt'],
                     tf.gfile.ListDirectory(validation_output.uri))
    anomalies = io_utils.parse_pbtxt_file(
        os.path.join(validation_output.uri, 'anomalies.pbtxt'),
        anomalies_pb2.Anomalies())
    self.assertNotEqual(0, len(anomalies.anomaly_info))
Example #26
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a trainer_fn callback function provided by
    the user via the module_file parameter, then trains, evaluates and exports
    the model using the tf.estimator returned by that function.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - examples: Examples used for training, must include 'train' and 'eval'
          splits.
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None

    Raises:
      ValueError: When neither or both of 'module_file' and 'trainer_fn'
        are present in 'exec_properties'.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # TODO(zhitaoli): Deprecate this in a future version.
    if exec_properties.get('custom_config', None):
      cmle_args = exec_properties.get('custom_config',
                                      {}).get('cmle_training_args')
      if cmle_args:
        executor_class_path = '.'.join([Executor.__module__, Executor.__name__])
        absl.logging.warn(
            'Passing \'cmle_training_args\' to trainer directly is deprecated, '
            'please use extension executor at '
            'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')

        return runner.start_cmle_training(input_dict, output_dict,
                                          exec_properties, executor_class_path,
                                          cmle_args)

    trainer_fn = self._GetTrainerFn(exec_properties)

    # Set up training parameters
    train_files = [
        _all_files_pattern(
            artifact_utils.get_split_uri(input_dict['examples'], 'train'))
    ]
    transform_output = artifact_utils.get_single_uri(
        input_dict['transform_output']) if input_dict.get(
            'transform_output', None) else None
    eval_files = [
        _all_files_pattern(
            artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
    ]
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))

    train_args = trainer_pb2.TrainArgs()
    eval_args = trainer_pb2.EvalArgs()
    json_format.Parse(exec_properties['train_args'], train_args)
    json_format.Parse(exec_properties['eval_args'], eval_args)

    # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
    # num_steps=None.  Conversion of the proto to python will set the default
    # value of an int as 0 so modify the value here.  Tensorflow will raise an
    # error if num_steps <= 0.
    train_steps = train_args.num_steps or None
    eval_steps = eval_args.num_steps or None

    output_path = artifact_utils.get_single_uri(output_dict['output'])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    # Assemble warm start path if needed.
    warm_start_from = None
    if exec_properties.get('warm_starting') and exec_properties.get(
        'warm_start_from'):
      previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                        path_utils.SERVING_MODEL_DIR)
      if previous_model_dir and tf.io.gfile.exists(
          os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
        warm_start_from = previous_model_dir

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    hparams = _HParamWrapper(
        # A list of uris for train files.
        train_files=train_files,
        # An optional single uri for transform graph produced by TFT. Will be
        # None if not specified.
        transform_output=transform_output,
        # A single uri for the output directory of the serving model.
        serving_model_dir=serving_model_dir,
        # A list of uris for eval files.
        eval_files=eval_files,
        # A single uri for schema file.
        schema_file=schema_file,
        # Number of train steps.
        train_steps=train_steps,
        # Number of eval steps.
        eval_steps=eval_steps,
        # A single uri for the model directory to warm start from.
        warm_start_from=warm_start_from)

    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    training_spec = trainer_fn(hparams, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      serving_model_dir)

    # Export an eval savedmodel for TFMA
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
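
For a direct Executor.Do call, the exec_properties described in the docstring carry train_args and eval_args as JSON-serialized trainer_pb2 messages. A sketch of assembling them; the step counts and module file path are hypothetical:

# Sketch only: step counts and the module_file path are hypothetical.
from google.protobuf import json_format
from tfx.proto import trainer_pb2

exec_properties = {
    'train_args': json_format.MessageToJson(trainer_pb2.TrainArgs(num_steps=1000)),
    'eval_args': json_format.MessageToJson(trainer_pb2.EvalArgs(num_steps=500)),
    'module_file': '/path/to/taxi_utils.py',
}
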
Example #27
  def load_proto_from_text(
      self, file_name: Text,
      proto_message: message.Message) -> message.Message:
    """Loads proto message from serialized text."""
    path = os.path.join(os.path.dirname(__file__), 'testdata', file_name)
    return io_utils.parse_pbtxt_file(path, proto_message)
Example #28
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Get human review result on a model through Slack channel.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - slack_blessing: model blessing result.
      exec_properties: A dict of execution properties, including:
        - slack_token: Token used to setup connection with slack server.
        - slack_channel_id: The id of the Slack channel to send and receive
          messages.
        - timeout_sec: How long do we wait for response, in seconds.

    Returns:
      None

    Raises:
      TimeoutError:
        When there is no decision made within timeout_sec.
      ConnectionError:
        When connection to slack server cannot be established.

    """
    self._log_startup(input_dict, output_dict, exec_properties)
    transform_graph_uri = artifact_utils.get_single_uri(
        input_dict[TRANSFORM_GRAPH_KEY])
    temp_path = os.path.join(transform_graph_uri, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    # transformed_schema_file = os.path.join(
    #   transform_graph_uri,
    #   tft.TFTransformOutput.TRANSFORMED_METADATA_DIR,
    #   'schema.pbtxt'
    # )
    # transformed_schema_proto = io_utils.parse_pbtxt_file(
    #   transformed_schema_file,
    #   schema_pb2.Schema()
    # )
    transformed_train_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
    transformed_eval_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')

    tf_transform_output = tft.TFTransformOutput(transform_graph_uri)
    # transform_output_dataset_metadata = dataset_metadata.DatasetMetadata(
    #   schema=transformed_schema_proto
    # )

    # transform_fn = (tf_transform_output.transform_raw_features, transform_output_dataset_metadata)
    # feature_spec = schema_utils.schema_as_feature_spec(schema_proto).feature_spec
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
    schema_proto = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_proto
    )

    train_data_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY],
      'train'
    )
    eval_data_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY],
      'eval'
    )
    analyze_data_paths = [io_utils.all_files_pattern(train_data_uri)]
    transform_data_paths = [
      io_utils.all_files_pattern(train_data_uri),
      io_utils.all_files_pattern(eval_data_uri),
    ]
    materialize_output_paths = [
      os.path.join(transformed_train_output, _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output, _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
    ]
    transform_data_list = self._MakeDatasetList(
      transform_data_paths,
      materialize_output_paths
    )
    analyze_data_list = self._MakeDatasetList(
      analyze_data_paths,
    )

    with self._make_beam_pipeline() as pipeline:
      with tft_beam.Context(temp_dir=temp_path):
        # NOTE: Unclear if there is a difference between input_dataset_metadata
        # and transform_input_dataset_metadata. Look at Transform executor.
        decode_fn = tft.coders.ExampleProtoCoder(schema_proto, serialized=True).decode

        input_analysis_data = {}
        for dataset in analyze_data_list:
          infix = 'AnalysisIndex{}'.format(dataset.index)
          dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
          dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix)
            >> self._DecodeInputs(decode_fn))
          input_analysis_data[dataset.dataset_key] = dataset.decoded

        if not hasattr(tft_beam.analyzer_cache, 'DatasetKey'):
          input_analysis_data = (
              [
                  dataset for dataset in input_analysis_data.values()
                  if dataset is not None
              ]
              | 'FlattenAnalysisDatasetsBecauseItIsRequired' >>
              beam.Flatten(pipeline=pipeline))

        transform_fn = (
            (input_analysis_data, transform_input_dataset_metadata)
            | 'Analyze' >> tft_beam.AnalyzeDataset(
                tf_transform_output.transform_raw_features, pipeline=pipeline))

        for dataset in transform_data_list:
          infix = 'TransformIndex{}'.format(dataset.index)
          dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))

          dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix)
            >> self._DecodeInputs(decode_fn))

          dataset.transformed, metadata = (
              ((dataset.decoded, transform_input_dataset_metadata), transform_fn)
              | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())

          dataset.transformed_and_serialized = (
              dataset.transformed
              | 'EncodeAndSerialize[{}]'.format(infix)
              >> beam.ParDo(self._EncodeAsSerializedExamples(), _GetSchemaProto(metadata)))

          _ = (
            dataset.transformed_and_serialized
            | 'Materialize[{}]'.format(infix) >> self._WriteExamples(dataset.materialize_output_path))
Example #29
    def Do(self, input_dict, output_dict, exec_properties):
        """Runs trainer job the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - transformed_examples: Transformed example.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(khaas): Move this to tfx/extensions.
        if exec_properties.get('custom_config', None):
            cmle_args = exec_properties.get('custom_config',
                                            {}).get('cmle_training_args')
            if cmle_args:
                return cmle_runner.start_cmle_training(input_dict, output_dict,
                                                       exec_properties,
                                                       cmle_args)

        trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                          'trainer_fn')

        # Set up training parameters
        train_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'train'))
        ]
        transform_output = types.get_single_uri(input_dict['transform_output'])
        eval_files = _all_files_pattern(
            types.get_split_uri(input_dict['transformed_examples'], 'eval'))
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        train_args = trainer_pb2.TrainArgs()
        eval_args = trainer_pb2.EvalArgs()
        json_format.Parse(exec_properties['train_args'], train_args)
        json_format.Parse(exec_properties['eval_args'], eval_args)

        # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
        # num_steps=None.  Conversion of the proto to python will set the default
        # value of an int as 0 so modify the value here.  Tensorflow will raise an
        # error if num_steps <= 0.
        train_steps = train_args.num_steps or None
        eval_steps = eval_args.num_steps or None

        output_path = types.get_single_uri(output_dict['output'])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        # Assemble warm start path if needed.
        warm_start_from = None
        if exec_properties.get('warm_starting') and exec_properties.get(
                'warm_start_from'):
            previous_model_dir = os.path.join(
                exec_properties['warm_start_from'],
                path_utils.SERVING_MODEL_DIR)
            if previous_model_dir and tf.gfile.Exists(
                    os.path.join(previous_model_dir,
                                 self._CHECKPOINT_FILE_NAME)):
                warm_start_from = previous_model_dir

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        hparams = tf.contrib.training.HParams(
            train_files=train_files,
            transform_output=transform_output,
            output_dir=output_path,
            serving_model_dir=serving_model_dir,
            eval_files=eval_files,
            schema_file=schema_file,
            train_steps=train_steps,
            eval_steps=eval_steps,
            warm_start_from=warm_start_from)

        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

        training_spec = trainer_fn(hparams, schema)

        # Train the model
        tf.logging.info('Training model.')
        tf.estimator.train_and_evaluate(training_spec['estimator'],
                                        training_spec['train_spec'],
                                        training_spec['eval_spec'])
        tf.logging.info('Training complete.  Model written to %s',
                        serving_model_dir)

        # Export an eval savedmodel for TFMA
        tf.logging.info('Exporting eval_savedmodel for TFMA.')
        tfma.export.export_eval_savedmodel(
            estimator=training_spec['estimator'],
            export_dir_base=eval_model_dir,
            eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

        tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
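The num_steps handling above compensates for proto3's default of 0 on unset integer fields. A standalone sketch of that round trip, assuming the usual tfx.proto.trainer_pb2 import (the values are illustrative):

from google.protobuf import json_format
from tfx.proto import trainer_pb2

# What a pipeline author would typically put into exec_properties; the second
# message leaves num_steps unset on purpose.
train_args_json = json_format.MessageToJson(
    trainer_pb2.TrainArgs(num_steps=10000))
eval_args_json = json_format.MessageToJson(trainer_pb2.EvalArgs())

# Parsing back mirrors the executor: unset ints come out as 0, and `or None`
# turns that into "run until the input is exhausted" for tf.estimator.
train_args = trainer_pb2.TrainArgs()
eval_args = trainer_pb2.EvalArgs()
json_format.Parse(train_args_json, train_args)
json_format.Parse(eval_args_json, eval_args)
train_steps = train_args.num_steps or None  # 10000
eval_steps = eval_args.num_steps or None    # None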
Example #30
0
def get_proto_from_test_data(filename: Text,
                             pb_message: message.Message) -> message.Message:
    """Helper function that gets proto from testdata."""
    filepath = os.path.join(os.path.dirname(__file__), 'testdata', filename)
    return io_utils.parse_pbtxt_file(filepath, pb_message)
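A typical call site might look like the following; the file name is hypothetical and only illustrates that parse_pbtxt_file fills in and returns the supplied message:

from tensorflow_metadata.proto.v0 import schema_pb2

# Reads <this test's dir>/testdata/schema.pbtxt into a fresh Schema proto.
schema = get_proto_from_test_data('schema.pbtxt', schema_pb2.Schema())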
Example #31
0
    def test_preprocessing_fn(self):
        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        feature_spec = taxi_utils._get_raw_feature_spec(schema)
        working_dir = self.get_temp_dir()
        transform_output_path = os.path.join(working_dir, 'transform_output')
        transformed_examples_path = os.path.join(working_dir,
                                                 'transformed_examples')

        # Run very simplified version of executor logic.
        # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
        # Generate legacy `DatasetMetadata` object.  Future version of Transform
        # will accept the `Schema` proto directly.
        legacy_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))
        decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
        with beam.Pipeline() as p:
            with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
                examples = (
                    p
                    | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                        os.path.join(self._testdata_path,
                                     'csv_example_gen/train/*'),
                        coder=beam.coders.BytesCoder(),
                        # TODO(b/114938612): Eventually remove this override.
                        validate=False)
                    | 'DecodeTrainData' >> beam.Map(decoder.decode))
                (transformed_examples, transformed_metadata), transform_fn = (
                    (examples, legacy_metadata)
                    | 'AnalyzeAndTransform' >>
                    tft_beam.AnalyzeAndTransformDataset(
                        taxi_utils.preprocessing_fn))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                # pylint: disable=expression-not-assigned
                (transform_fn
                 | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_output_path))

                encoder = tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema)
                (transformed_examples
                 | 'EncodeTrainData' >> beam.Map(encoder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(transformed_examples_path,
                                  'train/transformed_examples.gz'),
                     coder=beam.coders.BytesCoder()))
                # pylint: enable=expression-not-assigned

        # Verify the output matches golden output.
        # NOTE: we don't verify that transformed examples match golden output.
        expected_transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(
                self._testdata_path,
                'transform/transform_output/transformed_metadata/schema.pbtxt'
            ), schema_pb2.Schema())
        transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(transform_output_path,
                         'transformed_metadata/schema.pbtxt'),
            schema_pb2.Schema())
        # Clear annotations so we only have to test main schema.
        for feature in transformed_schema.feature:
            feature.ClearField('annotation')
        self.assertEqual(transformed_schema, expected_transformed_schema)
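The annotation-clearing step exists because newer Transform releases attach annotations that the recorded golden schema predates. A small sketch of the same idea as a non-mutating helper (the assumption here, as in the test above, is that only feature-level annotations need clearing):

from tensorflow_metadata.proto.v0 import schema_pb2


def _strip_feature_annotations(schema: schema_pb2.Schema) -> schema_pb2.Schema:
  """Returns a copy of `schema` with per-feature annotations removed."""
  stripped = schema_pb2.Schema()
  stripped.CopyFrom(schema)
  for feature in stripped.feature:
    feature.ClearField('annotation')
  return stripped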
Example #32
0
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a trainer_fn callback function provided by
    the user via the module_file parameter.  With the tf.estimator returned by
    this function, the Trainer Executor then builds and trains a TensorFlow
    model.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - examples: Examples used for training, must include 'train' and 'eval'
          splits.
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - model: Exported model.
        - model_run: Model training related outputs (e.g., TensorBoard logs).
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.
        - custom_config: Optional. JSON-serialized dict of additional parameters
          to pass to trainer function.

    Returns:
      None

    Raises:
      ValueError: When neither or both of 'module_file' and 'trainer_fn'
        are present in 'exec_properties'.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
    trainer_fn = udf_utils.get_fn(exec_properties, 'trainer_fn')

    schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

    # TODO(b/160795287): Deprecate estimator based executor.
    # Provide user with a modified fn_args, with model_run given as
    # the working directory. Executor will then copy user models to
    # model artifact directory.
    serving_dest = fn_args.serving_model_dir
    eval_dest = fn_args.eval_model_dir

    working_dir = fn_args.model_run_dir
    fn_args.serving_model_dir = path_utils.serving_model_dir(working_dir)
    fn_args.eval_model_dir = path_utils.eval_model_dir(working_dir)

    training_spec = trainer_fn(fn_args, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])

    absl.logging.info(
        'Training complete. Model written to %s. ModelRun written to %s',
        fn_args.serving_model_dir, fn_args.model_run_dir)

    # Export an eval savedmodel for TFMA. If distributed training, it must only
    # be written by the chief worker, as would be done for serving savedmodel.
    if _is_chief():
      absl.logging.info('Exporting eval_savedmodel for TFMA.')
      tfma.export.export_eval_savedmodel(
          estimator=training_spec['estimator'],
          export_dir_base=fn_args.eval_model_dir,
          eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

      absl.logging.info('Exported eval_savedmodel to %s.',
                        fn_args.eval_model_dir)

      # TODO(b/160795287): Deprecate estimator based executor.
      # Copy serving and eval model from model_run to model artifact directory.
      serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
      io_utils.copy_dir(serving_source, serving_dest)
      absl.logging.info('Serving model copied to: %s.', serving_dest)

      eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
      io_utils.copy_dir(eval_source, eval_dest)
      absl.logging.info('Eval model copied to: %s.', eval_dest)

    else:
      absl.logging.info(
          'Model export is skipped because this is not the chief worker.')
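_is_chief() is not shown in this excerpt. A hedged sketch of how such a check is commonly implemented for tf.estimator distributed training, based on the TF_CONFIG environment variable (an assumption, not the executor's actual helper):

import json
import os


def _is_chief() -> bool:
  """Returns True when this process should write model exports.

  With no TF_CONFIG set the job is single-process and therefore chief;
  otherwise only the 'chief' (or legacy 'master') task is.
  """
  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
  task_type = tf_config.get('task', {}).get('type', '')
  return not tf_config or task_type in ('chief', 'master')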
Example #33
0
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a trainer_fn callback function provided by
    the user via the module_file parameter.  With the tf.estimator returned by
    this function, the Trainer Executor then builds and trains a TensorFlow
    model.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - examples: Examples used for training, must include 'train' and 'eval'
          splits.
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.
        - custom_config: Optional. Additional parameters to pass to trainer
          function.

    Returns:
      None

    Raises:
      ValueError: When neither or both of 'module_file' and 'trainer_fn'
        are present in 'exec_properties'.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
    trainer_fn = self._GetFn(exec_properties, 'trainer_fn')

    schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

    training_spec = trainer_fn(fn_args, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      fn_args.serving_model_dir)

    # Export an eval savedmodel for TFMA
    # For distributed training, master and worker(s) try to export multiple
    # eval_savedmodels (b/147378113). To avoid that, only export
    # eval_savedmodel if eval_model_dir does not exist as an intermediate
    # solution until b/147378113 is resolved.
    if not tf.io.gfile.exists(fn_args.eval_model_dir):
      absl.logging.info('Exporting eval_savedmodel for TFMA.')
      tfma.export.export_eval_savedmodel(
          estimator=training_spec['estimator'],
          export_dir_base=fn_args.eval_model_dir,
          eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

      absl.logging.info('Exported eval_savedmodel to %s.',
                        fn_args.eval_model_dir)
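For both estimator-based executors above, the exec_properties entries described in the docstrings could be assembled roughly as follows; the paths are hypothetical, and only the JSON encoding of train_args/eval_args is dictated by the code:

from google.protobuf import json_format
from tfx.proto import trainer_pb2

exec_properties = {
    'train_args': json_format.MessageToJson(
        trainer_pb2.TrainArgs(num_steps=10000)),
    'eval_args': json_format.MessageToJson(
        trainer_pb2.EvalArgs(num_steps=5000)),
    'module_file': '/path/to/taxi_utils.py',       # hypothetical UDF module
    'warm_starting': True,
    'warm_start_from': '/path/to/previous/model',  # hypothetical previous dir
}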