def __init__(self, pipeline, options=None):
  self._pipeline = pipeline

  self._user_pipeline = ie.current_env().user_pipeline(pipeline)
  if not self._user_pipeline:
    self._user_pipeline = pipeline
  self._cache_manager = ie.current_env().get_cache_manager(
      self._user_pipeline, create_if_absent=True)
  # Check if the user-defined pipeline contains any source to cache.
  # If so, during the check, the cache manager is converted into a
  # streaming cache manager, so re-assign it.
  if background_caching_job.has_source_to_cache(self._user_pipeline):
    self._cache_manager = ie.current_env().get_cache_manager(
        self._user_pipeline)

  self._background_caching_pipeline = beam.pipeline.Pipeline.from_runner_api(
      pipeline.to_runner_api(), pipeline.runner, options)
  ie.current_env().add_derived_pipeline(
      self._pipeline, self._background_caching_pipeline)

  # Snapshot of original pipeline information.
  (self._original_pipeline_proto,
   context) = self._pipeline.to_runner_api(return_context=True)

  # All compute-once-against-original-pipeline fields.
  self._unbounded_sources = utils.unbounded_sources(
      self._background_caching_pipeline)
  self._pcoll_to_pcoll_id = pcoll_to_pcoll_id(self._pipeline, context)

  # A Dict[str, Cacheable] from a PCollection id to a Cacheable that belongs
  # to the analyzed pipeline.
  self._cacheables = self.find_cacheables()

  # A dict from cache key to PCollection that is read from cache.
  # If one exists, the caller should reuse the PCollection read; if not, the
  # caller should create a new transform and track the PCollection read from
  # cache. (Dict[str, AppliedPTransform]).
  self._cached_pcoll_read = {}

  # A dict from PCollections in the runner pipeline instance to their
  # corresponding PCollections in the user pipeline instance. Populated
  # after preprocess().
  self._runner_pcoll_to_user_pcoll = {}
  self._pruned_pipeline_proto = None

  # PCollections output by instrumented write-cache transforms, used by the
  # pruning logic as supplemental targets to build the pipeline fragment
  # from.
  self._extended_targets = set()

  # PCollections used as inputs but replaced by outputs of instrumented
  # read-cache transforms, used by the pruning logic as targets that no
  # longer need to be produced during pipeline runs.
  self._ignored_targets = set()

  # Set of PCollections that are written to cache.
  self.cached_pcolls = set()
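# Illustrative sketch only, not part of this module: one way the constructor
# above might be driven. The class name `PipelineInstrument` is assumed from
# context (the enclosing class is not shown in this excerpt); preprocess() and
# instrument() are the methods referenced by the comments and docstrings here.
def _build_instrument_example(user_pipeline, options=None):
  instrument = PipelineInstrument(user_pipeline, options)
  # preprocess() populates the runner-to-user PCollection mapping;
  # instrument() adds the cache read/write transforms exactly once.
  instrument.preprocess()
  instrument.instrument()
  return instrument.background_caching_pipeline_proto()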
def extract_source_to_cache_signature(user_pipeline):
  """Extracts a set of signatures for sources that need to be cached in the
  user-defined pipeline.

  A signature is a str representation of the urn and payload of a source.
  """
  # TODO(BEAM-8335): we temporarily only cache replaceable unbounded sources.
  # Add logic for other cacheable sources here when they are available.
  unbounded_sources_as_applied_transforms = utils.unbounded_sources(
      user_pipeline)
  unbounded_sources_as_ptransforms = set(
      map(lambda x: x.transform, unbounded_sources_as_applied_transforms))
  _, context = user_pipeline.to_runner_api(return_context=True)
  signature = set(
      map(
          lambda transform: str(transform.to_runner_api(context)),
          unbounded_sources_as_ptransforms))
  return signature
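# Illustrative sketch only, not part of this module: signature sets produced
# by extract_source_to_cache_signature can be compared to decide whether new
# cacheable sources appeared since a signature was last recorded. The helper
# name and the `recorded_signature` argument are hypothetical.
def _has_new_sources_example(user_pipeline, recorded_signature):
  current_signature = extract_source_to_cache_signature(user_pipeline)
  # Any signature not seen before means a new unbounded source needs caching.
  return not current_signature.issubset(recorded_signature)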
def background_caching_pipeline_proto(self):
  """Returns the background caching pipeline.

  This method creates a background caching pipeline by: adding writes to cache
  from each unbounded source (done in the instrument method), and cutting out
  all components (transforms, PCollections, coders, windowing strategies) that
  are not the unbounded sources or writes to cache (or subtransforms thereof).
  """
  # Create the pipeline_proto to read all the components from. It will later
  # create a new pipeline proto from the cut out components.
  pipeline_proto, context = self._background_caching_pipeline.to_runner_api(
      return_context=True)

  # Get all the sources we want to cache.
  sources = utils.unbounded_sources(self._background_caching_pipeline)

  # Get all the root transforms. The caching transforms will be subtransforms
  # of one of these roots.
  roots = [root for root in pipeline_proto.root_transform_ids]

  # Get the transform IDs of the caching transforms. These caching operations
  # are added to the _background_caching_pipeline in the instrument() method.
  # It's added there so that multiple calls to this method won't add multiple
  # caching operations (idempotent).
  transforms = pipeline_proto.components.transforms
  caching_transform_ids = [
      t_id for root in roots for t_id in transforms[root].subtransforms
      if WRITE_CACHE in t_id
  ]

  # Get the IDs of the unbounded sources.
  required_transform_labels = [src.full_label for src in sources]
  unbounded_source_ids = [
      k for k, v in transforms.items()
      if v.unique_name in required_transform_labels
  ]

  # The required transforms are the transforms that we want to cut out of
  # the pipeline_proto and insert into a new pipeline to return.
  required_transform_ids = (
      roots + caching_transform_ids + unbounded_source_ids)
  (t, p) = self._required_components(
      pipeline_proto, required_transform_ids, set())

  def set_proto_map(proto_map, new_value):
    proto_map.clear()
    for key, value in new_value.items():
      proto_map[key].CopyFrom(value)

  # Copy the transforms into the new pipeline.
  pipeline_to_execute = beam_runner_api_pb2.Pipeline()
  pipeline_to_execute.root_transform_ids[:] = roots
  set_proto_map(pipeline_to_execute.components.transforms, t)
  set_proto_map(pipeline_to_execute.components.pcollections, p)
  set_proto_map(
      pipeline_to_execute.components.coders, context.to_runner_api().coders)
  set_proto_map(
      pipeline_to_execute.components.windowing_strategies,
      context.to_runner_api().windowing_strategies)

  # Cut out all subtransforms in the root that aren't the required transforms.
  for root_id in roots:
    root = pipeline_to_execute.components.transforms[root_id]
    root.subtransforms[:] = [
        transform_id for transform_id in root.subtransforms
        if transform_id in pipeline_to_execute.components.transforms
    ]

  return pipeline_to_execute
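# Illustrative sketch only, not part of this module: the proto returned by
# background_caching_pipeline_proto can be rehydrated into a runnable pipeline
# with Pipeline.from_runner_api (the same API used in __init__ above). The
# `runner` and `options` arguments are assumed to match the user pipeline's.
def _run_background_caching_example(instrument, runner, options):
  proto = instrument.background_caching_pipeline_proto()
  caching_pipeline = beam.pipeline.Pipeline.from_runner_api(
      proto, runner, options)
  # Runs only the unbounded sources plus their cache-write transforms.
  return caching_pipeline.run()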