def cache_output(output_name: str, output: PValue) -> None:
  """Caches and computes the output of a beam_sql magic, then visualizes it."""
  user_pipeline = ie.current_env().user_pipeline(output.pipeline)
  if user_pipeline:
    cache_manager = ie.current_env().get_cache_manager(
        user_pipeline, create_if_absent=True)
  else:
    _LOGGER.warning(
        'Something is wrong with %s. Cannot introspect its data.', output)
    return
  key = CacheKey.from_pcoll(output_name, output).to_str()
  _ = reify_to_cache(pcoll=output, cache_key=key, cache_manager=cache_manager)
  try:
    output.pipeline.run().wait_until_finish()
  except (KeyboardInterrupt, SystemExit):
    raise
  except:  # pylint: disable=bare-except
    _LOGGER.warning(
        _NOT_SUPPORTED_MSG, traceback.format_exc(), output.pipeline.runner)
    return
  ie.current_env().mark_pcollection_computed([output])
  visualize_computed_pcoll(
      output_name, output, max_n=float('inf'), max_duration_secs=float('inf'))
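# A minimal usage sketch for cache_output (illustrative only: `p` and `words`
# are hypothetical names, and this assumes an interactive notebook session
# whose pipeline runs on the InteractiveRunner):
#
#   import apache_beam as beam
#   from apache_beam.runners.interactive.interactive_runner import (
#       InteractiveRunner)
#
#   p = beam.Pipeline(InteractiveRunner())
#   words = p | 'Create' >> beam.Create(['a', 'b', 'c'])
#   cache_output('words', words)
#
# After the call, `words` has been reified to cache, marked computed in the
# interactive environment, and visualized via visualize_computed_pcoll.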
def cache_key_of(self, name, pcoll):
  """Returns the cache key of the given PCollection under the given name."""
  return CacheKey.from_pcoll(name, pcoll).to_str()
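# For example, given a hypothetical PCollection `words` assigned to the
# variable name 'words' in a notebook, cache_key_of('words', words) returns
# the same string key that cache_output and _build_query_components derive
# through CacheKey.from_pcoll(...).to_str(), so reads and writes of the
# cached data resolve to the same cache entry.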
def _build_query_components(
    query: str,
    found: Dict[str, beam.PCollection],
    output_name: str,
    run: bool = True
) -> Tuple[str,
           Union[Dict[str, beam.PCollection], beam.PCollection, beam.Pipeline],
           SqlChain]:
  """Builds the components needed to apply the SqlTransform.

  Args:
    query: The SQL query to be executed by the magic.
    found: The PCollections with variable names found to be used by the query.
    output_name: The output variable name in the __main__ module.
    run: Whether to prepare the components for a local run or not.

  Returns:
    The processed query to be executed by the magic; a source to apply the
    SqlTransform to: a dictionary of tagged PCollections, or a single
    PCollection, or the pipeline to execute the query; the chain of applied
    beam_sql magics this one belongs to.
  """
  if found:
    user_pipeline = ie.current_env().user_pipeline(
        next(iter(found.values())).pipeline)
    sql_pipeline = beam.Pipeline(options=user_pipeline._options)
    ie.current_env().add_derived_pipeline(user_pipeline, sql_pipeline)
    sql_source = {}
    if run:
      if has_source_to_cache(user_pipeline):
        sql_source = pcolls_from_streaming_cache(
            user_pipeline, sql_pipeline, found)
      else:
        cache_manager = ie.current_env().get_cache_manager(
            user_pipeline, create_if_absent=True)
        for pcoll_name, pcoll in found.items():
          cache_key = CacheKey.from_pcoll(pcoll_name, pcoll).to_str()
          sql_source[pcoll_name] = unreify_from_cache(
              pipeline=sql_pipeline,
              cache_key=cache_key,
              cache_manager=cache_manager,
              element_type=pcoll.element_type)
    else:
      sql_source = found
    if len(sql_source) == 1:
      query = replace_single_pcoll_token(query, next(iter(sql_source.keys())))
      sql_source = next(iter(sql_source.values()))
    node = SqlNode(
        output_name=output_name, source=set(found.keys()), query=query)
    chain = ie.current_env().get_sql_chain(
        user_pipeline, set_user_pipeline=True).append(node)
  else:  # The query does not use any existing PCollection.
    sql_source = beam.Pipeline()
    ie.current_env().add_user_pipeline(sql_source)
    # The node should be the root node of the chain created below.
    node = SqlNode(output_name=output_name, source=sql_source, query=query)
    chain = ie.current_env().get_sql_chain(sql_source).append(node)
  return query, sql_source, chain
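# A minimal usage sketch for _build_query_components (illustrative names;
# assumes `words` is a PCollection already defined on a user pipeline that
# the interactive environment is watching):
#
#   query = 'SELECT word, COUNT(*) AS cnt FROM words GROUP BY word'
#   processed_query, sql_source, chain = _build_query_components(
#       query, {'words': words}, 'result')
#
# With a single found PCollection, `sql_source` is that PCollection and its
# table token in `processed_query` is rewritten by replace_single_pcoll_token;
# with no found PCollections, `sql_source` is a fresh beam.Pipeline instead.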