Пример #1
0
 def test_cleanup_invoked_when_new_env_replace_not_none_env(
     self, mocked_cleanup):
   """Creating an env over an already existing one invokes cleanup once.

   mocked_cleanup is presumably injected by a patch decorator that is not
   part of this snippet.
   """
   # Start without any environment so the first new_env() replaces nothing.
   ie._interactive_beam_env = None
   initial_manager = cache.FileBasedCacheManager()
   ie.new_env(initial_manager)
   mocked_cleanup.assert_not_called()

   # The second new_env() replaces the environment created above.
   replacement_manager = cache.FileBasedCacheManager()
   ie.new_env(replacement_manager)
   mocked_cleanup.assert_called_once()
Пример #2
0
 def test_cleanup_reregistered_when_cm_changed(self, mocked_unreg, mocked_reg):
   """Changing the cache manager unregisters once and re-registers cleanup."""
   ie.new_env(cache.FileBasedCacheManager())
   mocked_unreg.assert_not_called()

   replacement_manager = cache.FileBasedCacheManager()
   ie.current_env().set_cache_manager(replacement_manager)
   mocked_unreg.assert_called_once()

   # One registration for the initial manager, one for the replacement.
   expected_registrations = [
       call(ie.current_env().cleanup),
       call(ie.current_env().cleanup),
   ]
   mocked_reg.assert_has_calls(expected_registrations)
Пример #3
0
 def test_cleanup_invoked_when_not_none_cm_changed(self):
   """Replacing an existing cache manager of a pipeline invokes cleanup once."""
   cleanup_target = ('apache_beam.runners.interactive.interactive_environment'
                     '.InteractiveEnvironment.cleanup')
   env = ie.InteractiveEnvironment()
   with patch(cleanup_target) as mocked_cleanup:
     pipeline = 'dummy'

     # First assignment: nothing to replace yet, so no cleanup expected.
     first_manager = cache.FileBasedCacheManager()
     env.set_cache_manager(first_manager, pipeline)
     mocked_cleanup.assert_not_called()

     # Second assignment replaces the manager set above.
     second_manager = cache.FileBasedCacheManager()
     env.set_cache_manager(second_manager, pipeline)
     mocked_cleanup.assert_called_once()
 def test_cleanup_invoked_when_not_none_cm_changed(self, mocked_cleanup):
      """Replacing an existing cache manager of a pipeline invokes cleanup once.

      mocked_cleanup is presumably injected by a patch decorator that is not
      part of this snippet.
      """
      # Recreate the global environment from scratch.
      ie._interactive_beam_env = None
      ie.new_env()
      pipeline = 'dummy'

      # First assignment: nothing to replace yet, so no cleanup expected.
      first_manager = cache.FileBasedCacheManager()
      ie.current_env().set_cache_manager(first_manager, pipeline)
      mocked_cleanup.assert_not_called()

      # Second assignment replaces the manager set above.
      second_manager = cache.FileBasedCacheManager()
      ie.current_env().set_cache_manager(second_manager, pipeline)
      mocked_cleanup.assert_called_once()
Пример #5
0
    def test_cache_manager_uses_local_ib_cache_root(self):
        """Checks FileBasedCacheManager with a local Interactive Beam cache_root.

        Verifies that FileBasedCacheManager._cache_dir is set to the
        cache_root set under Interactive Beam for a local directory, and that
        the values read through the default cache manager are the same
        before and after a write made through the manager that uses that
        cache_root.
        """
        prefix = 'full'
        cache_label = 'some-cache-label'
        cached_values = [1, 2, 3]

        self.mock_write_cache(cached_values, prefix, cache_label)
        reader_one, _ = self.cache_manager.read(prefix, cache_label)
        pcoll_list_one = list(reader_one)

        try:
            # Set Interactive Beam specified cache dir to local directory
            ib.options.cache_root = '/tmp/it-test/'
            cache_manager_with_ib_option = cache.FileBasedCacheManager(
                cache_dir=ib.options.cache_root)
            self.assertEqual(ib.options.cache_root,
                             cache_manager_with_ib_option._cache_dir)

            cache_manager_with_ib_option.write(cached_values, prefix,
                                               cache_label)
            reader_two, _ = self.cache_manager.read(prefix, cache_label)
            pcoll_list_two = list(reader_two)

            # Writing to a different directory should not impact the cached
            # values
            self.assertEqual(pcoll_list_one, pcoll_list_two)
        finally:
            # Reset the Interactive Beam setting even when an assertion above
            # fails, so the global option does not leak into other tests.
            ib.options.cache_root = None
Пример #6
0
    def __init__(self,
                 underlying_runner=None,
                 cache_dir=None,
                 cache_format='text',
                 render_option=None,
                 skip_display=False):
        """Builds an InteractiveRunner.

        Args:
          underlying_runner: (runner.PipelineRunner) runner to delegate to; a
              direct_runner.DirectRunner is created when none is given.
          cache_dir: (str) the directory where PCollection caches are kept.
          cache_format: (str) the file format used for saving PCollection
              caches. Available options are 'text' and 'tfrecord'.
          render_option: (str) decides how the pipeline graph is rendered. See
              display.pipeline_graph_renderer for available options.
          skip_display: (bool) whether to skip display operations when running
              the pipeline. Useful if running large pipelines when display is
              not needed.
        """
        if not underlying_runner:
            underlying_runner = direct_runner.DirectRunner()
        self._underlying_runner = underlying_runner

        file_cache = cache.FileBasedCacheManager(cache_dir, cache_format)
        self._cache_manager = file_cache

        renderer = pipeline_graph_renderer.get_renderer(render_option)
        self._renderer = renderer

        self._in_session = False
        self._skip_display = skip_display
Пример #7
0
 def test_track_user_pipeline_cleanup_non_inspectable_pipeline(self):
     """Checks that track_user_pipelines() invokes cleanup exactly once.

     State is registered in the current environment against several pipeline
     objects and against a plain string standing in for a pipeline; then
     track_user_pipelines() is called with InteractiveEnvironment.cleanup
     patched.
     """
     dummy_pipeline_1 = beam.Pipeline()
     dummy_pipeline_2 = beam.Pipeline()
     dummy_pipeline_3 = beam.Pipeline()
     dummy_pipeline_4 = beam.Pipeline()
     dummy_pcoll = dummy_pipeline_4 | beam.Create([1])
     dummy_pipeline_5 = beam.Pipeline()
     dummy_non_inspectable_pipeline = 'dummy'
     # The locals defined so far are handed to the environment, so the names
     # and the order of the statements above matter.
     ie.current_env().watch(locals())
     from apache_beam.runners.interactive.background_caching_job import BackgroundCachingJob
     # Attach a different kind of state to each of the dummy pipelines.
     ie.current_env().set_background_caching_job(
         dummy_pipeline_1,
         BackgroundCachingJob(runner.PipelineResult(
             runner.PipelineState.DONE),
                              limiters=[]))
     ie.current_env().set_test_stream_service_controller(
         dummy_pipeline_2, None)
     ie.current_env().set_cache_manager(cache.FileBasedCacheManager(),
                                        dummy_pipeline_3)
     ie.current_env().mark_pcollection_computed([dummy_pcoll])
     # The string is used in place of a pipeline object here.
     ie.current_env().set_cached_source_signature(
         dummy_non_inspectable_pipeline, None)
     ie.current_env().set_pipeline_result(
         dummy_pipeline_5,
         runner.PipelineResult(runner.PipelineState.RUNNING))
     # Patch cleanup only around the call under test so the setup above runs
     # against the real method.
     with patch('apache_beam.runners.interactive.interactive_environment'
                '.InteractiveEnvironment.cleanup') as mocked_cleanup:
         ie.current_env().track_user_pipelines()
         mocked_cleanup.assert_called_once()
Пример #8
0
 def __init__(self, underlying_runner=None, cache_dir=None):
     """Builds the runner with a delegate runner and a file based cache.

     Args:
       underlying_runner: runner to delegate to; a
           direct_runner.BundleBasedDirectRunner is created when none is given.
       cache_dir: directory passed to cache.FileBasedCacheManager.
     """
     # TODO(qinyeli, BEAM-4755) remove explicitly overriding underlying runner
     # once interactive_runner works with FnAPI mode
     if not underlying_runner:
         underlying_runner = direct_runner.BundleBasedDirectRunner()
     self._underlying_runner = underlying_runner

     file_cache = cache.FileBasedCacheManager(cache_dir)
     self._cache_manager = file_cache
     self._in_session = False
Пример #9
0
 def test_cleanup_unregistered_when_not_none_cm_cleared(
     self, mocked_unreg, mocked_reg):
   """Clearing an existing cache manager unregisters cleanup exactly once."""
   initial_manager = cache.FileBasedCacheManager()
   ie.new_env(initial_manager)
   # Creating the environment registers once and unregisters nothing.
   mocked_reg.assert_called_once()
   mocked_unreg.assert_not_called()

   ie.current_env().set_cache_manager(None)
   # Clearing adds no registration but exactly one unregistration.
   mocked_reg.assert_called_once()
   mocked_unreg.assert_called_once()
Пример #10
0
 def test_noop_when_cm_is_not_changed(self, mocked_unreg, mocked_reg):
   """Setting the very same cache manager again changes no registration."""
   shared_manager = cache.FileBasedCacheManager()

   ie.new_env(shared_manager)
   mocked_unreg.assert_not_called()
   mocked_reg.assert_called_once()

   # Same instance again: the call counts must stay as they were.
   ie.current_env().set_cache_manager(shared_manager)
   mocked_unreg.assert_not_called()
   mocked_reg.assert_called_once()
 def test_noop_when_cm_is_not_changed(self, mocked_cleanup):
   """Setting the cache manager a pipeline already has invokes no cleanup."""
   ie._interactive_beam_env = None
   shared_manager = cache.FileBasedCacheManager()
   pipeline = 'dummy'
   ie.new_env()

   # Seed the environment's mapping directly under the pipeline's id key.
   pipeline_key = str(id(pipeline))
   ie.current_env()._cache_managers[pipeline_key] = shared_manager
   mocked_cleanup.assert_not_called()

   ie.current_env().set_cache_manager(shared_manager, pipeline)
   mocked_cleanup.assert_not_called()
 def test_cleanup_not_invoked_when_cm_changed_from_none(self, mocked_cleanup):
   """Setting the first cache manager of a pipeline invokes no cleanup."""
   ie._interactive_beam_env = None
   ie.new_env()
   pipeline = 'dummy'

   # Nothing is registered for the pipeline in a fresh environment.
   manager_before = ie.current_env().get_cache_manager(pipeline)
   self.assertIsNone(manager_before)

   new_manager = cache.FileBasedCacheManager()
   ie.current_env().set_cache_manager(new_manager, pipeline)
   mocked_cleanup.assert_not_called()

   manager_after = ie.current_env().get_cache_manager(pipeline)
   self.assertIs(manager_after, new_manager)
Пример #13
0
 def test_noop_when_cm_is_not_changed(self):
     """Setting the cache manager a pipeline already has invokes no cleanup."""
     cleanup_target = (
         'apache_beam.runners.interactive.interactive_environment'
         '.InteractiveEnvironment.cleanup')
     shared_manager = cache.FileBasedCacheManager()
     pipeline = 'dummy'
     env = ie.InteractiveEnvironment()
     with patch(cleanup_target) as mocked_cleanup:
         # Seed the environment's mapping directly under the pipeline's id key.
         pipeline_key = str(id(pipeline))
         env._cache_managers[pipeline_key] = shared_manager
         mocked_cleanup.assert_not_called()

         env.set_cache_manager(shared_manager, pipeline)
         mocked_cleanup.assert_not_called()
Пример #14
0
 def test_cleanup_not_invoked_when_cm_changed_from_none(self):
     """Setting the first cache manager of a pipeline invokes no cleanup."""
     cleanup_target = (
         'apache_beam.runners.interactive.interactive_environment'
         '.InteractiveEnvironment.cleanup')
     env = ie.InteractiveEnvironment()
     with patch(cleanup_target) as mocked_cleanup:
         pipeline = 'dummy'

         # Nothing is registered for the pipeline in a fresh environment.
         manager_before = env.get_cache_manager(pipeline)
         self.assertIsNone(manager_before)

         new_manager = cache.FileBasedCacheManager()
         env.set_cache_manager(new_manager, pipeline)
         mocked_cleanup.assert_not_called()

         manager_after = env.get_cache_manager(pipeline)
         self.assertIs(manager_after, new_manager)
Пример #15
0
    def get_cache_manager(self, pipeline, create_if_absent=False):
        """Returns the cache manager registered for the given pipeline.

        Managers are looked up by str(id(pipeline)). When none is registered
        and create_if_absent is True, a new file based cache manager is
        created, registered under that key and returned; otherwise the lookup
        result is returned unchanged.

        The cache directory of a new manager is chosen as follows:
          * options.cache_root starting with 'gs://': the result of
            _get_gcs_cache_dir(pipeline, cache_root).
          * any other non-empty options.cache_root: a new temp directory
            under it; a warning is logged unless the runner is a DirectRunner.
          * no cache_root, a DataflowRunner and a staging_location option:
            the result of _get_gcs_cache_dir(pipeline, staging_location).
          * otherwise: a new temp directory prefixed with 'it-', created under
            the TEST_TMPDIR environment variable when it is set.
        """
        if self._is_in_ipython:
            warnings.filterwarnings(
                'ignore',
                'options is deprecated since First stable release. References to '
                '<pipeline>.options will not be supported',
                category=DeprecationWarning)

        pipeline_key = str(id(pipeline))
        cache_manager = self._cache_managers.get(pipeline_key, None)

        # Resolve the runner that really executes the pipeline, looking
        # through an InteractiveRunner wrapper when there is one.
        pipeline_runner = None
        if isinstance(pipeline, Pipeline):
            from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
            pipeline_runner = pipeline.runner
            if isinstance(pipeline_runner, InteractiveRunner):
                pipeline_runner = pipeline_runner._underlying_runner

        if cache_manager or not create_if_absent:
            return cache_manager

        cache_root = self.options.cache_root
        if cache_root and cache_root.startswith('gs://'):
            cache_dir = self._get_gcs_cache_dir(pipeline, cache_root)
        elif cache_root:
            cache_dir = tempfile.mkdtemp(dir=cache_root)
            runs_directly = isinstance(pipeline_runner,
                                       direct_runner.DirectRunner)
            if not runs_directly:
                _LOGGER.warning(
                    'A local cache directory has been specified while '
                    'not using DirectRunner. It is recommended to cache into a '
                    'GCS bucket instead.')
        else:
            all_options = pipeline.options.get_all_options()
            staging_location = all_options['staging_location']
            if isinstance(pipeline_runner,
                          DataflowRunner) and staging_location:
                cache_dir = self._get_gcs_cache_dir(pipeline,
                                                    staging_location)
                _LOGGER.info(
                    'No cache_root detected. '
                    'Defaulting to staging_location %s for cache location.',
                    staging_location)
            else:
                cache_dir = tempfile.mkdtemp(
                    suffix=pipeline_key,
                    prefix='it-',
                    dir=os.environ.get('TEST_TMPDIR', None))

        cache_manager = cache.FileBasedCacheManager(cache_dir)
        self._cache_managers[pipeline_key] = cache_manager
        return cache_manager
Пример #16
0
 def get_cache_manager(self, pipeline, create_if_absent=False):
   """Returns the cache manager registered for the given pipeline.

   Managers are looked up by str(id(pipeline)). When none is registered and
   create_if_absent is True, a new file based cache manager backed by a new
   temp directory is created, registered and returned; otherwise the lookup
   result is returned unchanged.
   """
   pipeline_key = str(id(pipeline))
   existing = self._cache_managers.get(pipeline_key)
   if existing or not create_if_absent:
     return existing

   # The temp directory is created under TEST_TMPDIR when that is set.
   tmp_parent = os.environ.get('TEST_TMPDIR', None)
   cache_dir = tempfile.mkdtemp(
       suffix=pipeline_key, prefix='it-', dir=tmp_parent)
   created = cache.FileBasedCacheManager(cache_dir)
   self._cache_managers[pipeline_key] = created
   return created
Пример #17
0
  def __init__(self, underlying_runner=None, cache_dir=None,
               render_option=None):
    """Builds an InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner) runner to delegate to; a
          direct_runner.DirectRunner is created when none is given.
      cache_dir: (str) the directory where PCollection caches are kept.
      render_option: (str) decides how the pipeline graph is rendered. See
          display.pipeline_graph_renderer for available options.
    """
    if not underlying_runner:
      underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    file_cache = cache.FileBasedCacheManager(cache_dir)
    self._cache_manager = file_cache

    renderer = pipeline_graph_renderer.get_renderer(render_option)
    self._renderer = renderer
    self._in_session = False
Пример #18
0
    def test_cache_manager_uses_gcs_ib_cache_root(self):
        """Checks FileBasedCacheManager with a GCS Interactive Beam cache_root.

        Verifies that FileBasedCacheManager._cache_dir is set to the
        cache_root set under Interactive Beam for a GCS directory.
        """
        try:
            # Set Interactive Beam specified cache dir to cloud storage
            ib.options.cache_root = 'gs://'

            cache_manager_with_ib_option = cache.FileBasedCacheManager(
                cache_dir=ib.options.cache_root)

            self.assertEqual(ib.options.cache_root,
                             cache_manager_with_ib_option._cache_dir)
        finally:
            # Reset the Interactive Beam setting even when the assertion above
            # fails, so the global option does not leak into other tests.
            ib.options.cache_root = None
Пример #19
0
 def get_cache_manager(self, pipeline, create_if_absent=False):
     """Returns the cache manager registered for the given pipeline.

     Managers are looked up by str(id(pipeline)). When none is registered and
     create_if_absent is True, a new file based cache manager is created,
     registered and returned; otherwise the lookup result is returned
     unchanged.

     A new manager is backed by a new temp directory, created under
     ib.options.cache_root when that option is set.

     Raises:
       ValueError: if a manager has to be created and ib.options.cache_root
           starts with "gs://".
     """
     pipeline_key = str(id(pipeline))
     existing = self._cache_managers.get(pipeline_key)
     if existing or not create_if_absent:
         return existing

     from apache_beam.runners.interactive import interactive_beam as ib
     cache_root = ib.options.cache_root
     if not cache_root:
         # Without a cache_root the directory goes under TEST_TMPDIR when set.
         cache_dir = tempfile.mkdtemp(
             suffix=pipeline_key,
             prefix='it-',
             dir=os.environ.get('TEST_TMPDIR', None))
     elif cache_root.startswith("gs://"):
         #TODO(victorhc): Handle the case when the path starts with "gs://"
         raise ValueError("GCS paths are not currently supported.")
     else:
         cache_dir = tempfile.mkdtemp(dir=cache_root)

     created = cache.FileBasedCacheManager(cache_dir)
     self._cache_managers[pipeline_key] = created
     return created
Пример #20
0
  def __init__(
      self,
      underlying_runner=None,
      cache_dir=None,
      cache_format='text',
      render_option=None,
      skip_display=True,
      force_compute=True,
      blocking=True):
    """Builds an InteractiveRunner.

    The cache manager is taken from the current interactive environment; a
    file based one is created and stored there first when the environment has
    none.

    Args:
      underlying_runner: (runner.PipelineRunner) runner to delegate to; a
          direct_runner.DirectRunner is created when none is given.
      cache_dir: (str) the directory where PCollection caches are kept
      cache_format: (str) the file format that should be used for saving
          PCollection caches. Available options are 'text' and 'tfrecord'.
      render_option: (str) this parameter decides how the pipeline graph is
          rendered. See display.pipeline_graph_renderer for available options.
      skip_display: (bool) whether to skip display operations when running the
          pipeline. Useful if running large pipelines when display is not
          needed.
      force_compute: (bool) whether sequential pipeline runs can use cached data
          of PCollections computed from the previous runs including show API
          invocation from interactive_beam module. If True, always run the whole
          pipeline and compute data for PCollections forcefully. If False, use
          available data and run minimum pipeline fragment to only compute data
          not available.
      blocking: (bool) whether the pipeline run should be blocking or not.
    """
    if not underlying_runner:
      underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    # Only install a new cache manager when the environment lacks one.
    env_has_manager = bool(ie.current_env().cache_manager())
    if not env_has_manager:
      new_manager = cache.FileBasedCacheManager(cache_dir, cache_format)
      ie.current_env().set_cache_manager(new_manager)
    self._cache_manager = ie.current_env().cache_manager()

    self._render_option = render_option
    self._in_session = False
    self._skip_display = skip_display
    self._force_compute = force_compute
    self._blocking = blocking
Пример #21
0
 def setUp(self):
     """Creates a file based cache manager using this test's cache_format."""
     chosen_format = self.cache_format
     manager = cache.FileBasedCacheManager(cache_format=chosen_format)
     self.cache_manager = manager
Пример #22
0
 def setUp(self):
     """Creates a temp directory and a file based cache manager on top of it."""
     scratch_dir = tempfile.mkdtemp()
     self.test_dir = scratch_dir
     self.cache_manager = cache.FileBasedCacheManager(scratch_dir)
Пример #23
0
 def __init__(self, underlying_runner=None, cache_dir=None):
   """Builds the runner with a delegate runner and a file based cache.

   Args:
     underlying_runner: runner to delegate to; a direct_runner.DirectRunner
         is created when none is given.
     cache_dir: directory passed to cache.FileBasedCacheManager.
   """
   if not underlying_runner:
     underlying_runner = direct_runner.DirectRunner()
   self._underlying_runner = underlying_runner

   file_cache = cache.FileBasedCacheManager(cache_dir)
   self._cache_manager = file_cache
   self._in_session = False
Пример #24
0
 def test_cleanup_when_cm_not_none(self, mocked_atexit):
   """Creating an env with a cache manager calls the mocked atexit hook once.

   mocked_atexit is presumably injected by a patch decorator that is not part
   of this snippet.
   """
   initial_manager = cache.FileBasedCacheManager()
   ie.new_env(initial_manager)
   mocked_atexit.assert_called_once()
Пример #25
0
 def setUp(self):
     """Creates the direct runner and the default file based cache manager."""
     direct = direct_runner.DirectRunner()
     self.runner = direct
     file_cache = cache.FileBasedCacheManager()
     self.cache_manager = file_cache
Пример #26
0
 def test_cleanup_invoked_when_cm_changed(self, mocked_cleanup):
   """Replacing the env's cache manager invokes cleanup exactly once.

   mocked_cleanup is presumably injected by a patch decorator that is not
   part of this snippet.
   """
   # Start without any environment so new_env() replaces nothing.
   ie._interactive_beam_env = None
   initial_manager = cache.FileBasedCacheManager()
   ie.new_env(initial_manager)

   replacement_manager = cache.FileBasedCacheManager()
   ie.current_env().set_cache_manager(replacement_manager)
   mocked_cleanup.assert_called_once()
Пример #27
0
 def setUp(self):
     """Creates a new interactive environment with a file based cache manager."""
     file_cache = cache.FileBasedCacheManager()
     ie.new_env(cache_manager=file_cache)