Example #1
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        encoding = exec_properties["encoding"]
        merged_text_dir = get_single_uri(input_dict["merged_text_dir"])
        encoding_dir = get_single_uri(input_dict["encoding_dir"])
        end_token = exec_properties["end_token"]
        logging.info("encoding as: {}".format(encoding))
        logging.info("merged text dir: {}".format(merged_text_dir))
        logging.info("encoding dir: {}".format(encoding_dir))
        logging.info("ending tokens: {}".format(end_token))

        logging.info('Reading files')
        enc = encoder.get_encoder(encoding_dir)

        chunks = load_dataset(enc,
                              merged_text_dir,
                              encoding=encoding,
                              end_token=end_token)
        logging.info("chunk size: {}".format(len(chunks)))
        logging.info("top 10 chunkds {}".format(chunks[:10]))

        dataset_path = os.path.join(get_single_uri(output_dict["dataset_dir"]),
                                    "dataset.npz")
        logging.info('Writing %s', dataset_path)
        np.savez_compressed(dataset_path, *chunks)
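
Since np.savez_compressed(dataset_path, *chunks) stores the chunks positionally as arr_0, arr_1, ..., a downstream reader can recover them in order. A minimal read-back sketch (the helper name is illustrative):

    import numpy as np

    def load_chunks(dataset_path):
        # savez_compressed(path, *chunks) names the arrays arr_0, arr_1, ...
        with np.load(dataset_path) as npz:
            return [npz["arr_%d" % i] for i in range(len(npz.files))]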
Example #2
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        model_name = exec_properties["model_name"]
        mlflow_tracking_url = exec_properties["mlflow_tracking_url"]

        model_dir = get_single_uri(input_dict["model_dir"])
        artifact_dir = get_single_uri(input_dict["artifact_dir"])
        hyperparameter_dir = get_single_uri(input_dict["hyperparameter_dir"])
        metric_dir = get_single_uri(input_dict["metric_dir"])

        mlflow.set_tracking_uri(mlflow_tracking_url)
        mlflow.set_experiment(model_name)
        with mlflow.start_run():
            with open(glob.glob(os.path.join(hyperparameter_dir, "*.pickle"))[0], 'rb') as fp:
                hyperparameter = pickle.load(fp)
                for k, v in hyperparameter.items():
                    mlflow.log_param(k, v)
            with open(glob.glob(os.path.join(metric_dir, "*.pickle"))[0], 'rb') as fp:
                metric = pickle.load(fp)
                for k, v in metric.items():
                    mlflow.log_metric(k, v)
            for artifact in glob.glob(os.path.join(artifact_dir, "*")):
                mlflow.log_artifact(artifact)
            # The SavedModel is logged directly from model_dir; no pickle handle is needed here.
            mlflow.tensorflow.log_model(tf_saved_model_dir=model_dir,
                                        tf_meta_graph_tags=["serve"],
                                        tf_signature_def_key="predict",
                                        artifact_path="GPT2")
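
MLflow also accepts whole dictionaries, so the per-key loops above could be collapsed; a sketch assuming the same pickled dict layouts, placed inside the same mlflow.start_run() block:

            mlflow.log_params(hyperparameter)    # dict of hyperparameter name -> value
            mlflow.log_metrics(metric)           # dict of metric name -> float
            mlflow.log_artifacts(artifact_dir)   # logs every file under the directory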
Example #3
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        self._log_startup(input_dict, output_dict, exec_properties)
        logging.info('Validating schema against the computed statistics.')

        split_uris: List[Text] = []
        for artifact in input_dict[executor.STATISTICS_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_uris.append(split)

        label_inputs = {
            labels.STATS:
            tfdv.load_statistics(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_split_uri(
                        input_dict[executor.STATISTICS_KEY], split_uris[0]))),
            labels.SCHEMA:
            io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(
                        input_dict[executor.SCHEMA_KEY])))
        }
        output_uri = artifact_utils.get_single_uri(
            output_dict[executor.ANOMALIES_KEY])
        label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
        self._Validate(label_inputs, label_outputs)
        logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
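
artifact.split_names is a JSON-encoded list, which is why decode_split_names is needed before iterating; it behaves roughly like this sketch:

    import json

    def decode_split_names_sketch(split_names):
        # e.g. '["train", "eval"]' -> ['train', 'eval']; an empty value -> []
        return json.loads(split_names) if split_names else []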
Example #4
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
    working_dir = self._get_tmp_dir()

    train_path = artifact_utils.get_split_uri(input_dict['examples'], 'train')
    eval_path = artifact_utils.get_split_uri(input_dict['examples'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    tuner_fn = self._GetTunerFn(exec_properties)
    tuner_spec = tuner_fn(working_dir, io_utils.all_files_pattern(train_path),
                          io_utils.all_files_pattern(eval_path), schema)
    tuner = tuner_spec.tuner

    tuner.search_space_summary()
    # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
    # TODO(jyzhao): make epochs configurable.
    tuner.search(
        tuner_spec.train_dataset,
        epochs=5,
        validation_data=tuner_spec.eval_dataset)
    tuner.results_summary()

    best_hparams = tuner.oracle.get_best_trials(
        1)[0].hyperparameters.get_config()
    best_hparams_path = os.path.join(
        artifact_utils.get_single_uri(output_dict['study_best_hparams_path']),
        _DEFAULT_FILE_NAME)
    io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
    absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
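
The JSON written above mirrors hyperparameters.get_config(), so a consumer can rebuild a KerasTuner HyperParameters object from it; a sketch assuming the kerastuner package:

  import json

  import kerastuner

  def load_best_hparams(best_hparams_path):
      with open(best_hparams_path) as f:
          config = json.load(f)
      return kerastuner.HyperParameters.from_config(config)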
Example #5
    def _GetFnArgs(self, input_dict: Dict[str, List[types.Artifact]],
                   output_dict: Dict[str, List[types.Artifact]],
                   exec_properties: Dict[str, Any]) -> fn_args_utils.FnArgs:
        if input_dict.get(standard_component_specs.HYPERPARAMETERS_KEY):
            hyperparameters_file = io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[standard_component_specs.HYPERPARAMETERS_KEY]))
            hyperparameters_config = json.loads(
                file_io.read_file_to_string(hyperparameters_file))
        else:
            hyperparameters_config = None

        output_path = artifact_utils.get_single_uri(
            output_dict[standard_component_specs.MODEL_KEY])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        model_run_dir = artifact_utils.get_single_uri(
            output_dict[standard_component_specs.MODEL_RUN_KEY])

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
        if result.custom_config and not isinstance(result.custom_config, dict):
            raise ValueError(
                'custom_config in execution properties needs to be a '
                'dict. Got %s instead.' % type(result.custom_config))
        result.transform_output = result.transform_graph_path
        result.serving_model_dir = serving_model_dir
        result.eval_model_dir = eval_model_dir
        result.model_run_dir = model_run_dir
        result.schema_file = result.schema_path
        result.hyperparameters = hyperparameters_config
        return result
Example #6
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        self._log_startup(input_dict, output_dict, exec_properties)

        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))

        groups = group_stats_and_examples(input_dict)
        for examples, datasets in groups:
            datasets = DatasetFeatureStatisticsList(
                datasets=list(datasets.values()))
            partitions = lists_to_partitions(
                datasets, schema, examples,
                partition_fn(datasets, schema, examples))

            for partition in partitions:
                output_uri = os.path.join(
                    artifact_utils.get_single_uri(output_dict[PARTITIONS_KEY]),
                    partition.name)
                io_utils.write_pbtxt_file(
                    os.path.join(output_uri, 'schema.pbtxt'), partition.schema)

                for i, dataset in enumerate(partition.statistics.datasets):
                    example_splits = partition.example_splits[i]

                    io_utils.write_tfrecord_file(
                        os.path.join(output_uri, example_splits.split,
                                     'stats_tfrecord'), dataset)
Example #7
File: executor.py  Project: reddqian/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples for eval the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
        if 'model_exports' not in input_dict:
            raise ValueError('\'model_exports\' is missing in input dict.')
        if 'examples' not in input_dict:
            raise ValueError('\'examples\' is missing in input dict.')
        if 'output' not in output_dict:
            raise ValueError('\'output\' is missing in output dict.')

        self._log_startup(input_dict, output_dict, exec_properties)

        # Extract input artifacts
        model_exports_uri = artifact_utils.get_single_uri(
            input_dict['model_exports'])

        feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
        json_format.Parse(exec_properties['feature_slicing_spec'],
                          feature_slicing_spec)
        slice_spec = self._get_slice_spec_from_feature_slicing_spec(
            feature_slicing_spec)

        output_uri = artifact_utils.get_single_uri(output_dict['output'])

        eval_model_path = path_utils.eval_model_path(model_exports_uri)

        tf.logging.info('Using {} for model eval.'.format(eval_model_path))
        eval_shared_model = tfma.default_eval_shared_model(
            eval_saved_model_path=eval_model_path)

        tf.logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            (pipeline
             | 'ReadData' >>
             beam.io.ReadFromTFRecord(file_pattern=io_utils.all_files_pattern(
                 artifact_utils.get_split_uri(input_dict['examples'], 'eval')))
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_shared_model=eval_shared_model,
                 slice_spec=slice_spec,
                 output_path=output_uri))
        tf.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))
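
The feature_slicing_spec execution property is a JSON-serialized evaluator_pb2.FeatureSlicingSpec, so a caller might build it like this (the column name is illustrative):

    from google.protobuf import json_format
    from tfx.proto import evaluator_pb2

    spec = evaluator_pb2.FeatureSlicingSpec()
    spec.specs.add()  # an empty SingleSlicingSpec means the overall slice
    spec.specs.add().column_for_slicing.append('trip_start_hour')  # illustrative column
    exec_properties = {'feature_slicing_spec': json_format.MessageToJson(spec)}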
Example #8
File: fn_args_utils.py  Project: zvrr/tfx
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
  """Get common args of training and tuning."""
  train_files = [
      io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                       'train'))
  ]
  eval_files = [
      io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                       'eval'))
  ]

  if input_dict.get(constants.TRANSFORM_GRAPH_KEY):
    transform_graph_path = artifact_utils.get_single_uri(
        input_dict[constants.TRANSFORM_GRAPH_KEY])
  else:
    transform_graph_path = None

  if input_dict.get(constants.SCHEMA_KEY):
    schema_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY]))
  else:
    schema_path = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties[constants.TRAIN_ARGS_KEY], train_args)
  json_format.Parse(exec_properties[constants.EVAL_ARGS_KEY], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None.  Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here.  Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  # TODO(b/156929910): Refactor Trainer to be consistent with empty or None
  #                    custom_config handling.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null'))

  return FnArgs(
      working_dir=working_dir,
      train_files=train_files,
      eval_files=eval_files,
      train_steps=train_steps,
      eval_steps=eval_steps,
      schema_path=schema_path,
      transform_graph_path=transform_graph_path,
      custom_config=custom_config,
  )
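
The num_steps workaround above can be seen in isolation: parsing TrainArgs from JSON that omits num_steps leaves the proto int default of 0, and the trailing 'or None' maps it to None so TensorFlow never sees a non-positive step count:

  from google.protobuf import json_format
  from tfx.proto import trainer_pb2

  train_args = trainer_pb2.TrainArgs()
  json_format.Parse('{}', train_args)            # num_steps absent from the JSON
  assert train_args.num_steps == 0               # proto3 default for int fields
  assert (train_args.num_steps or None) is None  # what get_common_fn_args returns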
Example #9
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        model_name = exec_properties["model_name"]
        encoding = exec_properties["encoding"]
        train_config = exec_properties["train_config"]
        end_token = exec_properties["end_token"]
        dataset_dir = get_single_uri(input_dict["dataset_dir"])
        checkpoint_dir = get_single_uri(input_dict["checkpoint_dir"])
        encoding_dir = get_single_uri(input_dict["encoding_dir"])

        trained_checkpoint_dir = get_single_uri(
            output_dict["trained_checkpoint_dir"])
        sample_dir = get_single_uri(output_dict["sample_dir"])
        tensorboard_dir = get_single_uri(output_dict["tensorboard_dir"])
        hyperparameter_dir = get_single_uri(output_dict["hyperparameter_dir"])
        metric_dir = get_single_uri(output_dict["metric_dir"])
        train_config, metrics = train_gpt2(
            dataset_dir=dataset_dir,
            checkpoint_dir=checkpoint_dir,
            encoding_dir=encoding_dir,
            model_name=model_name,
            train_config=train_config,
            encoding=encoding,
            trained_checkpoint_dir=trained_checkpoint_dir,
            sample_dir=sample_dir,
            tensorboard_dir=tensorboard_dir,
            end_token=end_token)

        with open(os.path.join(hyperparameter_dir, 'hyperparameter.pickle'),
                  'wb') as handle:
            pickle.dump(train_config, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(metric_dir, 'metric.pickle'), 'wb') as handle:
            pickle.dump(metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
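
These pickles are what Example #2 reads back before logging to MLflow; the consuming side is symmetric (a sketch, with os and pickle imported as above):

        with open(os.path.join(hyperparameter_dir, 'hyperparameter.pickle'), 'rb') as handle:
            train_config = pickle.load(handle)
        with open(os.path.join(metric_dir, 'metric.pickle'), 'rb') as handle:
            metrics = pickle.load(handle)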
Example #10
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        train_config = exec_properties["train_config"]

        checkpoint_dir = get_single_uri(input_dict["checkpoint_dir"])
        model_path = get_single_uri(input_dict["model_path"])

        export_dir = get_single_uri(output_dict["export_dir"])

        export_for_serving(model_path=model_path,
                           checkpoint_dir=checkpoint_dir,
                           export_dir=export_dir,
                           train_config=train_config)
Example #11
File: executor.py  Project: RominYue/tfx
  def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
                 output_dict: Dict[Text, List[types.Artifact]],
                 exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
    # Load and deserialize custom config from execution properties.
    # Note that in the component interface the default serialization of custom
    # config is 'null' instead of '{}'. Therefore we need to default the
    # json_utils.loads to 'null' then populate it with an empty dict when
    # needed.
    custom_config = json_utils.loads(
        exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
    if not isinstance(custom_config, dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict. Got %s instead.' % type(custom_config))

    # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
    if input_dict.get(constants.BASE_MODEL_KEY):
      base_model = path_utils.serving_model_path(
          artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
    else:
      base_model = None

    if input_dict.get(constants.HYPERPARAMETERS_KEY):
      hyperparameters_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(
              input_dict[constants.HYPERPARAMETERS_KEY]))
      hyperparameters_config = json.loads(
          file_io.read_file_to_string(hyperparameters_file))
    else:
      hyperparameters_config = None

    output_path = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_KEY])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    model_run_dir = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_RUN_KEY])

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
    result.transform_output = result.transform_graph_path
    result.serving_model_dir = serving_model_dir
    result.eval_model_dir = eval_model_dir
    result.model_run_dir = model_run_dir
    result.schema_file = result.schema_path
    result.base_model = base_model
    result.hyperparameters = hyperparameters_config
    result.custom_config = custom_config
    return result
Example #12
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        encoding = exec_properties["encoding"]
        combine = exec_properties["combine"]
        text_path = exec_properties["text_path"]
        model_path = get_single_uri(input_dict["model_path"])
        dataset_path = os.path.join(
            get_single_uri(output_dict["dataset_path"]), "dataset.npz")

        enc = encoder.get_encoder(model_path)
        logging.info('Reading files')
        chunks = load_dataset(enc, text_path, combine, encoding=encoding)
        logging.info('Writing %s', dataset_path)
        np.savez_compressed(dataset_path, *chunks)
Example #13
def _JsonToExample(
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text,
) -> beam.pvalue.PCollection:
    input_base_uri = artifact_utils.get_single_uri(input_dict[INPUT_KEY])
    json_pattern = os.path.join(input_base_uri, split_pattern)

    logging.info(
        'Processing input json data {} to TFExample.'.format(json_pattern))
    json_files = tf.io.gfile.glob(json_pattern)
    if not json_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(json_pattern))

    parsed_json_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=json_pattern)
        | 'ParseJSONLine' >> beam.ParDo(ParseJsonLine()))

    value_infos = beam.pvalue.AsSingleton(
        parsed_json_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(ValueTypeInferrer()))

    return (parsed_json_lines
            |
            'ToTFExample' >> beam.ParDo(_ParsedJsonToTfExample(), value_infos))
Example #14
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.ai_platform_serving_args
        is consumed by this class.  For the full set of parameters supported by
        Google Cloud AI Platform, refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None
    Raises:
      ValueError: if ai_platform_serving_args is not in
      exec_properties.custom_config.
      RuntimeError: if the Google Cloud AI Platform training job failed.
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        if not self.CheckBlessing(input_dict, output_dict):
            return

        model_export = artifact_utils.get_single_instance(
            input_dict['model_export'])
        model_export_uri = model_export.uri
        model_blessing_uri = artifact_utils.get_single_uri(
            input_dict['model_blessing'])
        model_push = artifact_utils.get_single_instance(
            output_dict['model_push'])
        # TODO(jyzhao): should this be in driver or executor.
        if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
            model_push.set_int_custom_property('pushed', 0)
            tf.logging.info('Model on %s was not blessed', model_blessing_uri)
            return

        exec_properties_copy = exec_properties.copy()
        custom_config = exec_properties_copy.pop('custom_config', {})
        ai_platform_serving_args = custom_config['ai_platform_serving_args']

        # Deploy the model.
        model_path = path_utils.serving_model_path(model_export_uri)
        # Note: we do not have a logical model version right now. This
        # model_version is a timestamp mapped to trainer's exporter.
        model_version = os.path.basename(model_path)
        if ai_platform_serving_args is not None:
            runner.deploy_model_for_cmle_serving(model_path, model_version,
                                                 ai_platform_serving_args)

        # Make sure artifacts are populated in a standard way by calling
        # tfx.pusher.executor.Executor.Do().
        exec_properties_copy['push_destination'] = exec_properties.get(
            'push_destination') or self._make_local_temp_destination()
        super(Executor, self).Do(input_dict, output_dict, exec_properties_copy)
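
For reference, a plausible shape for the custom_config consumed above; 'ai_platform_serving_args' is the key this executor reads, and the inner values are assumptions drawn from the Cloud AI Platform docs linked in the docstring:

        exec_properties = {
            'custom_config': {
                'ai_platform_serving_args': {
                    'model_name': 'my_model',        # assumed example value
                    'project_id': 'my-gcp-project',  # assumed example value
                    'regions': ['us-central1'],      # assumed example value
                },
            },
        }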
Example #15
def GetStatsOutputPathEntries(
        disable_statistics: bool,
        output_dict: Dict[str, List[types.Artifact]]) -> Dict[str, str]:
    """Returns output entries for stats output path."""
    label_component_key_list = [
        (labels.PRE_TRANSFORM_OUTPUT_STATS_PATH_LABEL,
         standard_component_specs.PRE_TRANSFORM_STATS_KEY),
        (labels.PRE_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL,
         standard_component_specs.PRE_TRANSFORM_SCHEMA_KEY),
        (labels.POST_TRANSFORM_OUTPUT_ANOMALIES_PATH_LABEL,
         standard_component_specs.POST_TRANSFORM_ANOMALIES_KEY),
        (labels.POST_TRANSFORM_OUTPUT_STATS_PATH_LABEL,
         standard_component_specs.POST_TRANSFORM_STATS_KEY),
        (labels.POST_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL,
         standard_component_specs.POST_TRANSFORM_SCHEMA_KEY)
    ]
    result = {}
    if not disable_statistics:
        for label, component_key in label_component_key_list:
            if component_key in output_dict:
                result[label] = artifact_utils.get_single_uri(
                    output_dict[component_key])
    if result and len(result) != len(label_component_key_list):
        raise ValueError(
            'Either all stats_output_paths should be specified or none.')
    return result
Example #16
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """ImportSchemaGen executor entrypoint.

    This generates a Schema artifact from the given schema_file.

    Args:
      input_dict: Should be empty.
      output_dict: Output dict from key to a list of artifacts, including:
        - schema: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - schema_file: Source schema file path.

    Returns:
      None
    """
    source_file_path = exec_properties.get(
        standard_component_specs.SCHEMA_FILE_KEY)
    if not source_file_path:
      raise ValueError('Schema file path is missing in exec_properties.')
    output_uri = os.path.join(
        artifact_utils.get_single_uri(
            output_dict[standard_component_specs.SCHEMA_KEY]),
        schema_gen_executor.DEFAULT_FILE_NAME)

    # Check whether the input file has a proper schema proto.
    _ = io_utils.SchemaReader().read(source_file_path)

    io_utils.copy_file(source_file_path, output_uri)
    logging.info('Copied a schema file from %s to %s.', source_file_path,
                 output_uri)
Example #17
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        client = MongoClient(host=exec_properties["ip"],
                             port=int(exec_properties["port"]),
                             username=exec_properties["username"],
                             password=exec_properties["password"])
        dbname = exec_properties["dbname"]
        db = client[dbname]
        colnames = exec_properties["colnames"]
        end_token = exec_properties["end_token"]
        merged_text_dir = get_single_uri(output_dict["merged_text_dir"])

        raw_text = ""
        for colname in colnames:
            logging.info("Get data from {}/{}".format(dbname, colname))
            documents = db[colname].find({}, {"text": 1, "_id": 0})
            for document in documents:
                raw_text += document["text"] + end_token

        # store raw text for encoding
        merged_text_path = os.path.join(merged_text_dir, "merged_text")
        with open(merged_text_path, "w") as text_file:
            text_file.write(raw_text)
        logging.info("Saving merged text to {}".format(merged_text_dir))
Example #18
File: executor.py  Project: zzhmtxxhh/tfx
def _ImportExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read TFRecord files to PCollection of TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains tf example data.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.
  """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    input_split_pattern = os.path.join(input_base_uri, split_pattern)
    absl.logging.info(
        'Reading input TFExample data {}.'.format(input_split_pattern))

    # TODO(jyzhao): profile input examples.
    return (
        pipeline
        # TODO(jyzhao): support multiple input format.
        | 'ReadFromTFRecord' >>
        beam.io.ReadFromTFRecord(file_pattern=input_split_pattern)
        # TODO(jyzhao): consider move serialization out of base example gen.
        | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))
Example #19
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        if exec_properties.get(_TUNE_ARGS_KEY):
            raise ValueError(
                "TuneArgs is not supported for default Tuner's Executor.")

        tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
        fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                                   self._get_tmp_dir())

        tuner_fn_result = tuner_fn(fn_args)
        tuner = tuner_fn_result.tuner
        fit_kwargs = tuner_fn_result.fit_kwargs

        # TODO(b/156966497): set logger for printing.
        tuner.search_space_summary()
        absl.logging.info('Start tuning...')
        tuner.search(**fit_kwargs)
        tuner.results_summary()
        best_hparams_config = tuner.get_best_hyperparameters()[0].get_config()
        absl.logging.info('Best hyperParameters: %s' % best_hparams_config)
        best_hparams_path = os.path.join(
            artifact_utils.get_single_uri(
                output_dict[_BEST_HYPERPARAMETERS_KEY]), _DEFAULT_FILE_NAME)
        io_utils.write_string_file(best_hparams_path,
                                   json.dumps(best_hparams_config))
        absl.logging.info('Best Hyperparameters are written to %s.' %
                          best_hparams_path)
Example #20
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:

        model_dir = get_single_uri(output_dict["model_dir"])
        model_name = exec_properties["model_name"]
        logging.info("Downloading pretrained model of {}".format(model_name))
        logging.info("Storing pretrained mdoel to {}".format(model_dir))

        subdir = os.path.join('models', model_name)
        subdir = subdir.replace('\\', '/')  # needed for Windows
        for filename in [
                'checkpoint', 'encoder.json', 'hparams.json',
                'model.ckpt.data-00000-of-00001', 'model.ckpt.index',
                'model.ckpt.meta', 'vocab.bpe'
        ]:
            logging.info("Getting {}".format(filename))
            # get file from storage server
            r = requests.get("https://storage.googleapis.com/gpt-2/" + subdir +
                             "/" + filename,
                             stream=True)
            # save to output path
            with open(os.path.join(model_dir, filename), 'wb') as f:
                file_size = int(r.headers["content-length"])
                chunk_size = 1000
                with tqdm(ncols=100,
                          desc="Fetching " + filename,
                          total=file_size,
                          unit_scale=True) as pbar:
                    # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        # The last chunk may be smaller than chunk_size.
                        pbar.update(len(chunk))
Example #21
def _AvroToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read Avro files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains Avro data.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.
  """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    avro_pattern = os.path.join(input_base_uri, split_pattern)
    tf.logging.info(
        'Processing input avro data {} to TFExample.'.format(avro_pattern))

    return (pipeline
            | 'ReadFromAvro' >> beam.io.ReadFromAvro(avro_pattern)
            | 'ToTFExample' >> beam.Map(dict_to_example))
Example #22
    def CheckBlessing(self, input_dict: Dict[Text, List[types.Artifact]],
                      output_dict: Dict[Text, List[types.Artifact]]) -> bool:
        """Check that model is blessed by upstream ModelValidator, or update output.

    Args:
      input_dict: Input dict from input key to a list of artifacts:
        - model_blessing: model blessing path from model_validator. Pusher looks
          for a file named 'BLESSED' to consider the model blessed and safe to
          push.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one.

    Returns:
      True if the model is blessed by validator.
    """
        model_blessing_uri = artifact_utils.get_single_uri(
            input_dict['model_blessing'])
        model_push = artifact_utils.get_single_instance(
            output_dict['model_push'])
        # TODO(jyzhao): should this be in driver or executor.
        if not tf.io.gfile.exists(os.path.join(model_blessing_uri, 'BLESSED')):
            model_push.set_int_custom_property('pushed', 0)
            absl.logging.info('Model on %s was not blessed',
                              model_blessing_uri)
            return False
        return True
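
The 'BLESSED' marker checked above is just an empty file in the model_blessing directory; a test double for the upstream validator could create it like this (a sketch, not the validator's real code path):

        import os

        import tensorflow as tf

        def bless_for_test(model_blessing_uri):
            # CheckBlessing only looks for the presence of this empty file.
            with tf.io.gfile.GFile(os.path.join(model_blessing_uri, 'BLESSED'), 'w') as f:
                f.write('')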
Example #23
File: executor.py  Project: ssoudan/tfx_x
  def Do(self,
         input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Stores `custom_config` as an artifact of type `artifacts.PipelineConfiguration`.

    Args:
      input_dict: Empty
      output_dict: Output dict from key to a list of artifacts, including:
        - pipeline_configuration: A list of type `artifacts.PipelineConfiguration`
      exec_properties: A dict of execution properties, including:
        - custom_config: the configuration to save.
    Returns:
      None

    Raises:
      OSError and its subclasses
      ValueError
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    pipeline_configuration = artifact_utils.get_single_instance(output_dict[PIPELINE_CONFIGURATION_KEY])
    custom_config = exec_properties.get(CUSTOM_CONFIG_KEY, "{}")

    output_dir = artifact_utils.get_single_uri([pipeline_configuration])
    output_file = os.path.join(output_dir, 'custom_config.json')

    io_utils.write_string_file(output_file, custom_config)
Example #24
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
        - 'schema': A singleton list of 'Schema' type. If provided, pass
          it through as the output as fixed schema. If not provided, infer
          schema from stats.
        If both or neither 'stats/statistics' nor 'schema' is provided,
        an error is raised.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.

    Returns:
      None
    """
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        # Materializing schema as an output artifact from SchemaGen, in order to log
        # metadata of it in the same way regardless of inferred or fixed.
        io_utils.write_pbtxt_file(
            output_uri, self._provide_schema(input_dict, exec_properties))
        absl.logging.info('Schema written to {}.'.format(output_uri))
Example #25
    def _provide_schema(self, input_dict,
                        exec_properties) -> schema_pb2.Schema:
        """Generates schema from either schema or statistics."""
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        stats = input_dict.get('stats') or input_dict.get('statistics')
        schema = input_dict.get('schema')

        if bool(stats) == bool(schema):
            raise ValueError(
                'Exactly one of schema or stats must be provided.')

        if schema:
            schema_uri = artifact_utils.get_single_uri(schema)
            absl.logging.info('Schema is provided. Reading from %s.' %
                              schema_uri)
            schema_reader = io_utils.SchemaReader()
            try:
                return schema_reader.read(
                    os.path.join(schema_uri, _DEFAULT_FILE_NAME))

            except tf.errors.NotFoundError:
                raise ValueError(
                    'Schema is provided, but failed to read from %s.' %
                    schema_uri)

        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(stats, 'train'))
        infer_feature_shape = exec_properties['infer_feature_shape']
        return tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                 infer_feature_shape)
Example #26
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.

    Returns:
      None
    """
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict['stats'], 'train'))
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        infer_feature_shape = exec_properties['infer_feature_shape']
        absl.logging.info('Inferring schema from statistics.')
        schema = tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                   infer_feature_shape)
        io_utils.write_pbtxt_file(output_uri, schema)
        absl.logging.info('Schema written to %s.' % output_uri)
Example #27
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:

        if tfx_tuner.get_tune_args(exec_properties):
            raise ValueError(
                "TuneArgs is not supported by this Tuner's Executor.")

        metalearning_algorithm = None
        if 'metalearning_algorithm' in exec_properties:
            metalearning_algorithm = exec_properties.get(
                'metalearning_algorithm')

        warmup_trials = 0
        warmup_trial_data = None
        if metalearning_algorithm:
            warmup_tuner, warmup_trials = self.warmup(input_dict,
                                                      exec_properties,
                                                      metalearning_algorithm)
            warmup_trial_data = extract_tuner_trial_progress(warmup_tuner)
        else:
            logging.info('MetaLearning Algorithm not provided.')

        # Create new fn_args for final tuning stage.
        fn_args = fn_args_utils.get_common_fn_args(
            input_dict, exec_properties, working_dir=self._get_tmp_dir())
        tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
        tuner_fn_result = tuner_fn(fn_args)
        tuner_fn_result.tuner.oracle.max_trials = max(
            (tuner_fn_result.tuner.oracle.max_trials - warmup_trials), 1)
        tuner = self.search(tuner_fn_result)
        tuner_trial_data = extract_tuner_trial_progress(tuner)

        if warmup_trial_data:
            cumulative_tuner_trial_data, best_tuner_ix = merge_trial_data(
                warmup_trial_data, tuner_trial_data)
            cumulative_tuner_trial_data[
                'warmup_trial_data'] = warmup_trial_data[BEST_CUMULATIVE_SCORE]
            cumulative_tuner_trial_data['tuner_trial_data'] = tuner_trial_data[
                BEST_CUMULATIVE_SCORE]

            if isinstance(tuner.oracle.objective, kerastuner.Objective):
                cumulative_tuner_trial_data[
                    'objective'] = tuner.oracle.objective.name
            else:
                cumulative_tuner_trial_data[
                    'objective'] = 'objective not understood'

            tuner_trial_data = cumulative_tuner_trial_data
            best_tuner = warmup_tuner if best_tuner_ix == 0 else tuner
        else:
            best_tuner = tuner
        tfx_tuner.write_best_hyperparameters(best_tuner, output_dict)
        tuner_plot_path = os.path.join(
            artifact_utils.get_single_uri(output_dict['trial_summary_plot']),
            'tuner_plot_data.txt')
        io_utils.write_string_file(tuner_plot_path,
                                   json.dumps(tuner_trial_data))
        logging.info('Tuner plot data written at: %s', tuner_plot_path)
Example #28
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        crawler = Crawler(exec_properties["url"])
        rss_feed = crawler.get_article_information_as_dataframe()

        output_path = os.path.join(get_single_uri(output_dict["rss_feed"]), "feed.csv")
        rss_feed.to_csv(output_path, index=False)
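
The feed.csv written above keeps the DataFrame's columns, so a downstream component can reload it directly; a sketch (rss_feed_dir is a hypothetical stand-in for the artifact URI):

        import pandas as pd

        rss_feed = pd.read_csv(os.path.join(rss_feed_dir, "feed.csv"))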
Example #29
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # Load and deserialize exclude splits from execution properties.
    exclude_splits = json_utils.loads(
        exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))

    # Only one schema is generated for all splits.
    schema = None
    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    for split in artifact_utils.decode_split_names(stats_artifact.split_names):
      if split in exclude_splits:
        continue

      logging.info('Processing schema from statistics for split %s.', split)
      stats_uri = io_utils.get_only_uri_in_dir(
          os.path.join(stats_artifact.uri, split))
      if not schema:
        schema = tfdv.infer_schema(
            tfdv.load_statistics(stats_uri), infer_feature_shape)
      else:
        schema = tfdv.update_schema(schema, tfdv.load_statistics(stats_uri),
                                    infer_feature_shape)

    output_uri = os.path.join(
        artifact_utils.get_single_uri(output_dict[SCHEMA_KEY]),
        _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
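
The exclude_splits property arrives JSON-serialized, matching the json_utils.loads call above; a caller-side sketch using tfx.utils.json_utils:

    from tfx.utils import json_utils

    exec_properties = {
        EXCLUDE_SPLITS_KEY: json_utils.dumps(['eval']),  # skip 'eval' when inferring
        INFER_FEATURE_SHAPE_KEY: True,
    }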
Example #30
File: executor.py  Project: zzhmtxxhh/tfx
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have header
        line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    absl.logging.info(
        'Processing input csv data {} to TFExample.'.format(csv_pattern))

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                 skip_header_lines=1)
        |
        'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            |
            'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))