Example #1
    def __init__(self,
                 components_to_always_add: List[BaseComponent],
                 benchmark_subpipelines: List[BenchmarkSubpipeline],
                 pipeline_name: Optional[str],
                 pipeline_root: Optional[str],
                 metadata_connection_config: Optional[
                     metadata_store_pb2.ConnectionConfig] = None,
                 beam_pipeline_args: Optional[List[str]] = None,
                 **kwargs):

        if not benchmark_subpipelines and not components_to_always_add:
            raise ValueError(
                "Requires at least one benchmark subpipeline or component to run. "
                "You may want to call `self.add(..., always=True) in order "
                "to run Components, Subpipelines, or Pipeline even without requiring "
                "a call to `self.evaluate(...)`.")

        # Set defaults.
        if not pipeline_name:
            pipeline_name = "nitroml"
        if not pipeline_root:
            tmp_root_dir = os.path.join("/tmp", pipeline_name)
            tf.io.gfile.makedirs(tmp_root_dir)
            pipeline_root = tempfile.mkdtemp(dir=tmp_root_dir)
            logging.info("Creating tmp pipeline_root at %s", pipeline_root)
        if not metadata_connection_config:
            metadata_connection_config = metadata_store_pb2.ConnectionConfig(
                sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
                    filename_uri=os.path.join(pipeline_root, "mlmd.sqlite")))

        # Ensure that pipeline dirs are created.
        _make_pipeline_dirs(pipeline_root, metadata_connection_config)

        components = set(components_to_always_add)
        for benchmark_subpipeline in benchmark_subpipelines:
            for component in benchmark_subpipeline.components:
                components.add(component)
        super().__init__(pipeline_name=pipeline_name,
                         pipeline_root=pipeline_root,
                         metadata_connection_config=metadata_connection_config,
                         components=list(components),
                         beam_pipeline_args=beam_pipeline_args,
                         **kwargs)

        self._subpipelines = benchmark_subpipelines
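
The constructor above falls back to a temporary pipeline_root and an SQLite-backed MLMD store when the caller passes none. Below is a minimal, self-contained sketch of those same defaults, assuming only the ml-metadata package is installed; os.makedirs stands in for tf.io.gfile.makedirs purely to avoid the TensorFlow dependency.

import os
import tempfile

from ml_metadata.proto import metadata_store_pb2

# Illustrative defaults mirroring the constructor above.
pipeline_name = "nitroml"
tmp_root_dir = os.path.join("/tmp", pipeline_name)
os.makedirs(tmp_root_dir, exist_ok=True)
pipeline_root = tempfile.mkdtemp(dir=tmp_root_dir)

# An SQLite-backed MLMD store living inside the pipeline root.
metadata_connection_config = metadata_store_pb2.ConnectionConfig(
    sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
        filename_uri=os.path.join(pipeline_root, "mlmd.sqlite")))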
Example #2
def get_tfx_pipeline_metadata_store(tmp_db_path: str) -> mlmd.MetadataStore:
    """Copies and opens a metadata_store from the testdata tfx pipeline db.

  It migrates the db to the compatible schema at the head. In addition, it
  updates the stored artifacts' uri to the test data db path, so that the test
  code can open the testdata files mentioned in the database.

  Args:
    tmp_db_path: a temp path for copying the pipeline database.

  Returns:
    A ml-metadata store for the copied pipeline db.
  """
    testdata_db_path = os.path.join(_TEST_DATA_DIR, _TFX_0_21_DB_FILE)
    shutil.copyfile(testdata_db_path, tmp_db_path)

    connection_config = metadata_store_pb2.ConnectionConfig(
        sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
            filename_uri=tmp_db_path,
            connection_mode=metadata_store_pb2.SqliteMetadataSourceConfig.
            READWRITE,
        ))
    # The pipeline db is created with mlmd 0.21, the test run from the head
    # may include newer mlmd schema versions. We migrate the db to newer
    # mlmd schema if needed.
    store = mlmd.MetadataStore(connection_config,
                               enable_upgrade_migration=True)
    # The pipeline db is generated with real pipelines in which the payloads of
    # the artifacts are stored in the file system when the pipeline ran. We fix
    # the uri to point to the testdata payloads generated by the pipeline.
    fixed_artifacts = []
    for artifact in store.get_artifacts():
        artifact.uri = artifact.uri.replace(_TFX_0_21_PAYLOAD_DIR,
                                            _TEST_DATA_DIR)
        fixed_artifacts.append(artifact)
    store.put_artifacts(fixed_artifacts)
    return store
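
For reference, a hypothetical caller of the helper above; it assumes the same test module, so that get_tfx_pipeline_metadata_store and _TEST_DATA_DIR are in scope, and the temp path and assertion are illustrative only.

import os
import tempfile


def _example_usage():
    # Copy the checked-in 0.21 pipeline db to a scratch location so the
    # upgrade migration never touches the original testdata file.
    tmp_db_path = os.path.join(tempfile.mkdtemp(), "tfx_pipeline.sqlite")
    store = get_tfx_pipeline_metadata_store(tmp_db_path)

    # After the uri fix-up, every stored artifact should resolve under the
    # checked-in testdata directory.
    for artifact in store.get_artifacts():
        assert artifact.uri.startswith(_TEST_DATA_DIR)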
Example #3
    def build(self,
              pipeline_name: Optional[Text],
              pipeline_root: Optional[Text],
              metadata_connection_config: Optional[
                  metadata_store_pb2.ConnectionConfig] = None,
              enable_cache: Optional[bool] = False,
              beam_pipeline_args: Optional[List[Text]] = None,
              **kwargs) -> pipeline_lib.Pipeline:
        """Contatenates multiple benchmarks into a single pipeline DAG.

    Args:
      pipeline_name: name of the pipeline;
      pipeline_root: path to root directory of the pipeline;
      metadata_connection_config: the config to connect to ML metadata.
      enable_cache: whether or not cache is enabled for this run.
      beam_pipeline_args: Beam pipeline args for beam jobs within executor.
        Executor will use beam DirectRunner as Default.
      **kwargs: additional kwargs forwarded as pipeline args.

    Returns:
      A TFX Pipeline.
    """

        # Set defaults.
        if not pipeline_name:
            pipeline_name = "nitroml"
        if not pipeline_root:
            tmp_root_dir = os.path.join("/tmp", pipeline_name)
            tf.io.gfile.makedirs(tmp_root_dir)
            pipeline_root = tempfile.mkdtemp(dir=tmp_root_dir)
            logging.info("Creating tmp pipeline_root at %s", pipeline_root)
        if not metadata_connection_config:
            metadata_connection_config = metadata_store_pb2.ConnectionConfig(
                sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
                    filename_uri=os.path.join(pipeline_root, "mlmd.sqlite")))

        # Ensure that pipeline dirs are created.
        _make_pipeline_dirs(pipeline_root, metadata_connection_config)

        dag = []
        logging.info("NitroML benchmarks:")
        seen = set()
        for repeatable_pipeline in self._pipelines:
            logging.info("\t%s", repeatable_pipeline.benchmark_name)
            logging.info("\t\tRUNNING")
            components = repeatable_pipeline.components
            for component in components:
                if component in seen:
                    continue
                # pylint: disable=protected-access
                component._instance_name = _qualified_name(
                    component._instance_name,
                    repeatable_pipeline.benchmark_name)
                # pylint: enable=protected-access
                seen.add(component)
            dag += components
        return pipeline_lib.Pipeline(
            pipeline_name=pipeline_name,
            pipeline_root=pipeline_root,
            metadata_connection_config=metadata_connection_config,
            components=dag,
            enable_cache=enable_cache,
            beam_pipeline_args=beam_pipeline_args,
            **kwargs)
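
build() ultimately hands the merged component DAG to pipeline_lib.Pipeline. A minimal sketch of that final call, assuming pipeline_lib is tfx.orchestration.pipeline; the empty component list and the /tmp path are placeholders for illustration only.

import os

from ml_metadata.proto import metadata_store_pb2
from tfx.orchestration import pipeline as pipeline_lib

pipeline_root = "/tmp/nitroml"  # illustrative path only
os.makedirs(pipeline_root, exist_ok=True)

tfx_pipeline = pipeline_lib.Pipeline(
    pipeline_name="nitroml",
    pipeline_root=pipeline_root,
    metadata_connection_config=metadata_store_pb2.ConnectionConfig(
        sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
            filename_uri=os.path.join(pipeline_root, "mlmd.sqlite"))),
    components=[],  # build() passes the de-duplicated benchmark components here
    enable_cache=False)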
Example #4
File: e2etest.py Project: NeoTim/nitroml
    def metadata_config(self) -> metadata_store_pb2.ConnectionConfig:
        return metadata_store_pb2.ConnectionConfig(
            sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
                filename_uri=self.metadata_path))
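
A sketch of how a test might open an MLMD store straight from such a config; metadata_path stands in for self.metadata_path, and connection_mode is set here only so that the first connection creates the database file.

import os
import tempfile

import ml_metadata as mlmd
from ml_metadata.proto import metadata_store_pb2

# Stand-in for self.metadata_path from the test case above.
metadata_path = os.path.join(tempfile.mkdtemp(), "metadata.db")

config = metadata_store_pb2.ConnectionConfig(
    sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
        filename_uri=metadata_path,
        connection_mode=metadata_store_pb2.SqliteMetadataSourceConfig.
        READWRITE_OPENCREATE))

# Opening the store creates the SQLite file and the MLMD schema on first use.
store = mlmd.MetadataStore(config)
print(store.get_artifacts())  # an empty list for a fresh database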