Example #1
    def testMakeOutputDirsAndRemoveOutputDirs(self):
        output_artifacts = self._output_resolver.generate_output_artifacts(1)
        outputs_utils.make_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                if isinstance(artifact, ValueArtifact):
                    self.assertFalse(fileio.isdir(artifact.uri))
                else:
                    self.assertTrue(fileio.isdir(artifact.uri))
                self.assertTrue(fileio.exists(artifact.uri))

        outputs_utils.remove_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                self.assertFalse(fileio.exists(artifact.uri))
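Note: ValueArtifact URIs refer to files rather than directories, which is why the test expects isdir to be false for them while exists is still true after make_output_dirs.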
Example #2
def clear_output_dirs(output_dict: Dict[str, List[types.Artifact]]) -> None:
    """Clear dirs of output artifacts' URI."""
    for _, artifact_list in output_dict.items():
        for artifact in artifact_list:
            if fileio.isdir(artifact.uri) and fileio.listdir(artifact.uri):
                fileio.rmtree(artifact.uri)
                fileio.mkdir(artifact.uri)
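A minimal usage sketch (the examples_artifact variable below is an assumption; any pre-resolved output dict works the same way):

    # Hypothetical: `examples_artifact` is an already-resolved output artifact
    # whose URI points at a non-empty directory.
    output_dict = {'examples': [examples_artifact]}
    clear_output_dirs(output_dict)
    # The directory still exists but is now empty.
    assert fileio.listdir(examples_artifact.uri) == []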
Example #3
def copy_dir(
    src: str,
    dst: str,
    allow_regex_patterns: Iterable[str] = (),
    deny_regex_patterns: Iterable[str] = (),
) -> None:
  """Copies the whole directory recursively from source to destination.

  Args:
    src: Source directory to copy from. <src>/a/b.txt will be copied to
        <dst>/a/b.txt.
    dst: Destination directory to copy to. <src>/a/b.txt will be copied to
        <dst>/a/b.txt.
    allow_regex_patterns: Optional list of allowlist regular expressions. Each
        pattern is matched against the full path of a file. Files and
        subdirectories that do not match any of the patterns will not be
        copied.
    deny_regex_patterns: Optional list of denylist regular expressions. Each
        pattern is matched against the full path of a file. Files and
        subdirectories that match any of the patterns will not be copied.
  """
  src = src.rstrip('/')
  dst = dst.rstrip('/')

  allow_regex_patterns = [re.compile(p) for p in allow_regex_patterns]
  deny_regex_patterns = [re.compile(p) for p in deny_regex_patterns]

  def should_copy(path):
    if allow_regex_patterns:
      if not any(p.search(path) for p in allow_regex_patterns):
        return False
    if deny_regex_patterns:
      if any(p.search(path) for p in deny_regex_patterns):
        return False
    return True

  if fileio.exists(dst):
    fileio.rmtree(dst)
  fileio.makedirs(dst)

  for dir_name, sub_dirs, leaf_files in fileio.walk(src):
    new_dir_name = dir_name.replace(src, dst, 1)
    new_dir_exists = fileio.isdir(new_dir_name)

    for leaf_file in leaf_files:
      leaf_file_path = os.path.join(dir_name, leaf_file)
      if should_copy(leaf_file_path):
        if not new_dir_exists:
          # Parent directory may not have been created yet if its name is not
          # in the allowlist, but its containing file is.
          fileio.makedirs(new_dir_name)
          new_dir_exists = True
        new_file_path = os.path.join(new_dir_name, leaf_file)
        fileio.copy(leaf_file_path, new_file_path)

    for sub_dir in sub_dirs:
      if should_copy(os.path.join(dir_name, sub_dir)):
        fileio.makedirs(os.path.join(new_dir_name, sub_dir))
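A hedged usage sketch for the filters (the paths and patterns are illustrative assumptions, not from the source):

    # Copy only the SavedModel protobuf and the variables/ subtree, and skip
    # anything under assets.extra/. Patterns match against full file paths.
    copy_dir(
        '/tmp/src_model',
        '/tmp/dst_model',
        allow_regex_patterns=[r'\.pb$', r'/variables'],
        deny_regex_patterns=[r'/assets\.extra'],
    )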
Example #4
def remove_output_dirs(output_dict: Dict[Text, List[types.Artifact]]) -> None:
    """Remove dirs of output artifacts' URI."""
    for _, artifact_list in output_dict.items():
        for artifact in artifact_list:
            if fileio.isdir(artifact.uri):
                fileio.rmtree(artifact.uri)
            else:
                fileio.remove(artifact.uri)
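Unlike clear_output_dirs in Example #2, which empties each directory and recreates it, remove_output_dirs deletes the URI outright, covering both directory-backed artifacts and file-backed value artifacts.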
Example #5
 def _cleanup_kfp_server(self):
   pipelines = fileio.listdir(self._kubeflow_home)
   for pipeline_name in pipelines:
     # listdir returns bare entry names; join with the home dir to test them.
     if fileio.isdir(os.path.join(self._kubeflow_home, pipeline_name)):
       self._delete_experiment(pipeline_name)
       self._delete_pipeline(pipeline_name)
       self._delete_pipeline_output(pipeline_name)
       self._delete_pipeline_metadata(pipeline_name)
Example #6
    def run(self,
            pipeline: tfx_pipeline.Pipeline,
            parameter_values: Optional[Dict[Text, Any]] = None,
            write_out: Optional[bool] = True) -> Dict[Text, Any]:
        """Compiles a pipeline DSL object into pipeline file.

    Args:
      pipeline: TFX pipeline object.
      parameter_values: mapping from runtime parameter names to its values.
      write_out: set to True to actually write out the file to the place
        designated by output_dir and output_filename. Otherwise return the
        JSON-serialized pipeline job spec.

    Returns:
      Returns the JSON pipeline job spec.

    Raises:
      RuntimeError: if trying to write out to a place occupied by an existing
      file.
    """
        # TODO(b/166343606): Support user-provided labels.
        # TODO(b/169095387): Deprecate .run() method in favor of the unified API
        # client.
        display_name = (self._config.display_name
                        or pipeline.pipeline_info.pipeline_name)
        pipeline_spec = pipeline_builder.PipelineBuilder(
            tfx_pipeline=pipeline,
            default_image=self._config.default_image,
            default_commands=self._config.default_commands).build()
        pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__)
        pipeline_spec.schema_version = _SCHEMA_VERSION
        runtime_config = pipeline_builder.RuntimeConfigBuilder(
            pipeline_info=pipeline.pipeline_info,
            parameter_values=parameter_values).build()
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}):
            result = pipeline_spec_pb2.PipelineJob(
                display_name=display_name
                or pipeline.pipeline_info.pipeline_name,
                labels=telemetry_utils.get_labels_dict(),
                runtime_config=runtime_config)
        result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
        pipeline_json_dict = json_format.MessageToDict(result)
        if write_out:
            if fileio.exists(
                    self._output_dir) and not fileio.isdir(self._output_dir):
                raise RuntimeError('Output path %s points to a file.' %
                                   self._output_dir)
            if not fileio.exists(self._output_dir):
                fileio.makedirs(self._output_dir)

            with fileio.open(
                    os.path.join(self._output_dir, self._output_filename),
                    'wb') as f:
                f.write(json.dumps(pipeline_json_dict, sort_keys=True))

        return pipeline_json_dict
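A hedged call sketch (the runner and pipeline objects are assumptions; only the signature comes from the snippet above):

    # Assumed: `runner` is a configured instance of this class and
    # `my_pipeline` is a tfx_pipeline.Pipeline. This writes the job spec under
    # output_dir/output_filename and also returns it as a dict.
    job_spec = runner.run(
        my_pipeline,
        parameter_values={'train_steps': 1000},  # illustrative parameter
        write_out=True)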
Example #7
 def extractDirectorySpec(self, path):
   if fileio.isdir(path):
     result = {}
     for name in fileio.listdir(path):
       result[name] = self.extractDirectorySpec(os.path.join(path, name))
     return result
   elif fileio.exists(path):
     return file_io.FileIO(path, mode='r').read()
   else:
     raise ValueError(f'{path} does not exist.')
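The method mirrors a directory tree as nested dicts keyed by entry name. A sketch of the result shape (the tree and contents are illustrative assumptions):

    # For a tree like:
    #   base/schema.pbtxt
    #   base/eval/metrics.json
    # the call returns:
    spec = self.extractDirectorySpec('base')
    # spec == {'schema.pbtxt': '<file contents>',
    #          'eval': {'metrics.json': '<file contents>'}}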
Example #8
  def setUp(self):
    super().setUp()

    # List of packages installed.
    self._pip_list = pip_utils.get_package_names()

    # Check if Apache Airflow is installed before running E2E tests.
    if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
      sys.exit('Apache Airflow not installed.')

    # Change the encoding for Click since Python 3 is configured to use ASCII
    # as the encoding for the environment.
    if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
      os.environ['LANG'] = 'en_US.utf-8'

    # Set up airflow_home in a temp directory.
    self._airflow_home = os.path.join(self.tmp_dir, 'airflow')
    self.enter_context(
        test_case_utils.override_env_var('AIRFLOW_HOME', self._airflow_home))
    self.enter_context(
        test_case_utils.override_env_var('HOME', self._airflow_home))

    absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                      self._airflow_home)

    # Testdata path.
    self._testdata_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    self._pipeline_name = 'chicago_taxi_simple'
    self._pipeline_path = os.path.join(self._testdata_dir,
                                       'test_pipeline_airflow_1.py')

    # Copy data.
    chicago_taxi_pipeline_dir = os.path.join(
        os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
        'examples', 'chicago_taxi_pipeline')
    data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
    content = fileio.listdir(data_dir)
    assert content, 'content in {} is empty'.format(data_dir)
    target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
    io_utils.copy_dir(data_dir, target_data_dir)
    assert fileio.isdir(target_data_dir)
    content = fileio.listdir(target_data_dir)
    assert content, 'content in {} is {}'.format(target_data_dir, content)
    io_utils.copy_file(
        os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
        os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

    # Initialize CLI runner.
    self.runner = click_testing.CliRunner()
Example #9
    def testMakeClearAndRemoveOutputDirs(self):
        output_artifacts = self._output_resolver().generate_output_artifacts(1)
        outputs_utils.make_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                if isinstance(artifact, ValueArtifact):
                    self.assertFalse(fileio.isdir(artifact.uri))
                else:
                    self.assertTrue(fileio.isdir(artifact.uri))
                    with fileio.open(os.path.join(artifact.uri, 'output'),
                                     'w') as f:
                        f.write('')
                self.assertTrue(fileio.exists(artifact.uri))

        outputs_utils.clear_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                if not isinstance(artifact, ValueArtifact):
                    self.assertEqual(fileio.listdir(artifact.uri), [])

        outputs_utils.remove_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                self.assertFalse(fileio.exists(artifact.uri))
Example #10
    def setUp(self):
        super(CliLocalEndToEndTest, self).setUp()

        # Change the encoding for Click since Python 3 is configured to use
        # ASCII as the encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Set up local_home in a temp directory.
        self._home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._home
        self._old_local_home = os.environ.get('LOCAL_HOME')
        os.environ['LOCAL_HOME'] = os.path.join(self._home, 'local', '')
        self._local_home = os.environ['LOCAL_HOME']

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._home, 'taxi', 'taxi_utils.py'))

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #11
    def setUp(self):
        super().setUp()

        # Change the encoding for Click since Python 3 is configured to use
        # ASCII as the encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Set up beam_home in a temp directory.
        self._home = self.tmp_dir
        self._beam_home = os.path.join(self._home, 'beam')
        self.enter_context(
            test_case_utils.override_env_var('BEAM_HOME', self._beam_home))
        self.enter_context(test_case_utils.override_env_var(
            'HOME', self._home))

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._home, 'taxi', 'taxi_utils.py'))

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #12
    def setUp(self):
        super(AirflowEndToEndTest, self).setUp()
        # Set up airflow_home in a temp directory, write config, and init the DB.
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        self._mysql_container_name = (
            'airflow_' + test_utils.generate_random_id())
        db_port = airflow_test_utils.create_mysql_container(
            self._mysql_container_name)
        self.addCleanup(airflow_test_utils.delete_mysql_container,
                        self._mysql_container_name)
        # Assumed user/host for the local MySQL test container.
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'mysql://root@127.0.0.1:%d/airflow' % db_port)

        # Set a couple of important environment variables. See
        # https://airflow.apache.org/howto/set-config.html for details.
        os.environ['AIRFLOW__CORE__DAGS_FOLDER'] = os.path.join(
            self._airflow_home, 'dags')
        os.environ['AIRFLOW__CORE__BASE_LOG_FOLDER'] = os.path.join(
            self._airflow_home, 'logs')
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
        # Following environment variables make scheduler process dags faster.
        os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
        os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
        os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'

        # Following fields are specific to the chicago_taxi_simple example.
        self._dag_id = 'chicago_taxi_simple'
        self._run_id = 'manual_run_id_1'
        # This execution date must be after the start_date in chicago_taxi_simple
        # but before current execution date.
        self._execution_date = '2019-02-01T01:01:01'
        self._all_tasks = [
            'CsvExampleGen',
            'Evaluator',
            'ExampleValidator',
            'Pusher',
            'SchemaGen',
            'StatisticsGen',
            'Trainer',
            'Transform',
        ]
        # Copy dag file and data.
        chicago_taxi_pipeline_dir = os.path.dirname(__file__)
        simple_pipeline_file = os.path.join(chicago_taxi_pipeline_dir,
                                            'taxi_pipeline_simple.py')

        io_utils.copy_file(
            simple_pipeline_file,
            os.path.join(self._airflow_home, 'dags',
                         'taxi_pipeline_simple.py'))

        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        subprocess.run(['airflow', 'initdb'], check=True)
        subprocess.run(['airflow', 'unpause', self._dag_id], check=True)
Example #13
def delete_dir(path: Text) -> None:
    """Deletes a directory if exists."""

    if fileio.isdir(path):
        fileio.rmtree(path)
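Because of the isdir guard, delete_dir is a no-op when the path does not exist or points at a plain file, so callers need not check first.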
Example #14
  def _rewrite(self, original_model: rewriter.ModelDescription,
               rewritten_model: rewriter.ModelDescription):
    """Rewrites the provided model.

    Args:
      original_model: A `ModelDescription` specifying the original model to be
        rewritten.
      rewritten_model: A `ModelDescription` specifying the format and location
        of the rewritten model.

    Raises:
      ValueError: If the model could not be successfully rewritten.
    """
    if rewritten_model.model_type not in [
        rewriter.ModelType.TFLITE_MODEL, rewriter.ModelType.ANY_MODEL
    ]:
      raise ValueError('TFLiteConverter can only convert to the TFLite format.')

    # TODO(dzats): We create a temporary directory with a SavedModel that does
    # not contain an assets or assets.extra directory. Remove this when the
    # TFLite converter can convert models having these directories.
    tmp_model_dir = os.path.join(
        _ensure_str(rewritten_model.path),
        'tmp-rewrite-' + str(int(time.time())))
    if fileio.exists(tmp_model_dir):
      raise ValueError('TFLiteConverter is unable to create a unique path '
                       'for the temp rewriting directory.')

    fileio.makedirs(tmp_model_dir)
    _create_tflite_compatible_saved_model(
        _ensure_str(original_model.path), tmp_model_dir)

    converter = self._create_tflite_converter(
        saved_model_path=tmp_model_dir,
        quantization_optimizations=self._quantization_optimizations,
        quantization_supported_types=self._quantization_supported_types,
        representative_dataset=self._representative_dataset,
        signature_key=self._signature_key,
        **self._kwargs)
    tflite_model = converter.convert()

    output_path = os.path.join(
        _ensure_str(rewritten_model.path), self._filename)
    with fileio.open(_ensure_str(output_path), 'wb') as f:
      f.write(_ensure_bytes(tflite_model))
    fileio.rmtree(tmp_model_dir)

    copy_pairs = []
    if self._copy_assets:
      src = os.path.join(
          _ensure_str(original_model.path), tf.saved_model.ASSETS_DIRECTORY)
      dst = os.path.join(
          _ensure_str(rewritten_model.path), tf.saved_model.ASSETS_DIRECTORY)
      if fileio.isdir(src):
        fileio.mkdir(dst)
        copy_pairs.append((src, dst))
    if self._copy_assets_extra:
      src = os.path.join(
          _ensure_str(original_model.path), EXTRA_ASSETS_DIRECTORY)
      dst = os.path.join(
          _ensure_str(rewritten_model.path), EXTRA_ASSETS_DIRECTORY)
      if fileio.isdir(src):
        fileio.mkdir(dst)
        copy_pairs.append((src, dst))
    for src, dst in copy_pairs:
      io_utils.copy_dir(src, dst)
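In short, the rewriter stages a TFLite-compatible copy of the SavedModel in a timestamped temporary directory, runs the converter on it, writes the resulting .tflite file under the rewritten model path, removes the temporary directory, and finally copies the assets and assets.extra directories when requested.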
Example #15
    def setUp(self):
        super(CliAirflowEndToEndTest, self).setUp()

        # List of packages installed.
        self._pip_list = subprocess.check_output(
            ['pip', 'freeze', '--local']).decode('utf-8')

        # Check if Apache Airflow is installed before running E2E tests.
        if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
            sys.exit('Apache Airflow not installed.')

        # Change the encoding for Click since Python 3 is configured to use
        # ASCII as the encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Set up airflow_home in a temp directory.
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName, 'airflow')
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        self._pipeline_name = 'chicago_taxi_simple'
        self._pipeline_path = os.path.join(self._testdata_dir,
                                           'test_pipeline_airflow_1.py')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        self._mysql_container_name = (
            'airflow_' + test_utils.generate_random_id())
        db_port = airflow_test_utils.create_mysql_container(
            self._mysql_container_name)
        self.addCleanup(self._cleanup_mysql_container)
        # Assumed user/host for the local MySQL test container.
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'mysql://root@127.0.0.1:%d/airflow' % db_port)
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

        self._airflow_initdb()

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()