def _show_options(btn):
  """Click handler: renders the current pipeline options into the output area.

  Also pins the options object to a variable in __main__ so the user can
  tweak it from other notebook cells.
  """
  with options_output_area:
    options_output_area.clear_output()
    current_options = self.to_options()
    create_var_in_main(
        'options_{}'.format(self.output_name), current_options)
    _LOGGER.info(
        'The pipeline options configured is: %s.',
        pformat_dict(current_options.display_data()))
def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
  """Converts the chain into an executable pipeline."""
  if pipeline not in self.evaluated:
    # The whole chain should form a single pipeline.
    src = self.source
    if isinstance(src, beam.Pipeline):
      if pipeline:
        # A pipeline for the chain is already known; reuse it as the source.
        src = pipeline
      else:
        # Adopt the source pipeline as the chain's pipeline.
        pipeline = src
    else:
      # The source is a collection of PCollection names; resolve them.
      known_pcolls = pcoll_by_name()
      names = list(self.source)
      if len(names) == 1:
        src = known_pcolls.get(names[0])
      else:
        src = {name: known_pcolls.get(name) for name in names}
    # A pipeline source means no schema-carrying inputs: apply a plain
    # SqlTransform; otherwise apply the schema-aware variant.
    if isinstance(src, beam.Pipeline):
      label = 'beam_sql_{}_{}'.format(self.output_name, self.execution_count)
      output = src | label >> SqlTransform(self.query)
    else:
      label = 'schema_loaded_beam_sql_{}_{}'.format(
          self.output_name, self.execution_count)
      output = src | label >> SchemaLoadedSqlTransform(
          self.output_name, self.query, self.schemas, self.execution_count)
    # Expose the output in __main__ and remember this node was evaluated.
    create_var_in_main(self.output_name, output)
    self.evaluated.add(pipeline)
  if self.next:
    return self.next.to_pipeline(pipeline)
  return pipeline
def _inner():
  """Runs the chain's pipeline on Dataflow and surfaces the result.

  Best-effort attaches a WriteToText sink that caches the output
  PCollection under the configured GCS staging location, submits the
  pipeline with the DataflowRunner, displays a link to the job console,
  and pins the PipelineResult to a variable in __main__.
  """
  options = self.to_options()
  # Caches the output_pcoll to a GCS bucket.
  try:
    execution_count = 0
    if is_in_ipython():
      from IPython import get_ipython
      execution_count = get_ipython().execution_count
    output_location = '{}/{}'.format(
        options.view_as(GoogleCloudOptions).staging_location,
        self.output_name)
    # Fixed label typo: was 'WriteOuput{}_{}ToGCS'.
    _ = self.output_pcoll | 'WriteOutput{}_{}ToGCS'.format(
        self.output_name, execution_count) >> WriteToText(output_location)
    _LOGGER.info(
        'Data of output PCollection %s will be written to %s',
        self.output_name,
        output_location)
  except (KeyboardInterrupt, SystemExit):
    raise
  except:  # pylint: disable=bare-except
    # The transform has been added before, noop.
    pass
  if self.verbose:
    _LOGGER.info(
        'Running the pipeline on Dataflow with pipeline options %s.',
        pformat_dict(options.display_data()))
  result = create_runner('DataflowRunner').run_pipeline(self.p, options)
  cloud_options = options.view_as(GoogleCloudOptions)
  url = (
      'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
      (cloud_options.region, result.job_id(), cloud_options.project))
  display(
      HTML(
          'Click <a href="%s" target="_new">here</a> for the details '
          'of your Dataflow job.' % url))
  result_name = 'result_{}'.format(self.output_name)
  create_var_in_main(result_name, result)
  if self.verbose:
    _LOGGER.info(
        'The pipeline result of the run can be accessed from variable '
        '%s. The current status is %s.',
        result_name,
        result)
def apply_sql(
    query: str,
    output_name: Optional[str],
    found: Dict[str, beam.PCollection],
    run: bool = True) -> Tuple[str, Union[PValue, SqlNode], SqlChain]:
  """Applies a SqlTransform with the given sql and queried PCollections.

  Args:
    query: The SQL query executed in the magic.
    output_name: (optional) The output variable name in __main__ module.
    found: The PCollections with variable names found to be used in the query.
    run: Whether to prepare the SQL pipeline for a local run or not.

  Returns:
    A tuple of values. First str value is the output variable name in
    __main__ module, auto-generated if not provided. Second value: if run,
    it's a PValue; otherwise, a SqlNode tracks the SQL without applying it or
    executing it. Third value: SqlChain is a chain of SqlNodes that have been
    applied.
  """
  output_name = _generate_output_name(output_name, query, found)
  query, sql_source, chain = _build_query_components(
      query, found, output_name, run)
  # Dry mode: just hand back the tracked node without executing anything.
  if not run:
    return output_name, chain.current, chain
  try:
    output = sql_source | SqlTransform(query)
    # Declare a variable with the output_name and output value in the
    # __main__ module so that the user can use the output smoothly.
    output_name, output = create_var_in_main(output_name, output)
    _LOGGER.info(
        "The output PCollection variable is %s with element_type %s",
        output_name,
        pformat_namedtuple(output.element_type))
    return output_name, output, chain
  except (KeyboardInterrupt, SystemExit):
    raise
  except:  # pylint: disable=bare-except
    on_error('Error when applying the Beam SQL: %s', traceback.format_exc())
    raise
def test_create_var_in_main(self):
  """Verifies the exact value object is bound in the __main__ module."""
  var_name = 'test_create_var_in_main'
  var_value = Record(0, 0, 0)
  utils.create_var_in_main(var_name, var_value)
  main_module = importlib.import_module('__main__')
  # Identity (not just equality): the same object must be installed.
  self.assertIs(getattr(main_module, var_name, None), var_value)
def display_actions(self):
  """Renders interactive widgets for running the chain on Dataflow.

  Shows two buttons: one that submits the pipeline to Dataflow (caching the
  output PCollection to the configured GCS staging location and pinning the
  PipelineResult in __main__), and one that displays the currently
  configured pipeline options. Also eagerly initializes an options variable
  in __main__ so the user can customize it before running.
  """
  from IPython.display import HTML
  from IPython.display import display
  from ipywidgets import Button
  from ipywidgets import GridBox
  from ipywidgets import Layout
  from ipywidgets import Output
  options_output_area = Output()
  run_output_area = Output()
  run_btn = Button(
      description='Run on Dataflow',
      button_style='success',
      tooltip=(
          'Submit to Dataflow for execution with the configured options. The '
          'output PCollection\'s data will be written to the GCS bucket you '
          'configure.'))
  show_options_btn = Button(
      description='Show Options',
      button_style='info',
      tooltip='Show current pipeline options configured.')

  def _run_on_dataflow(btn):
    """Click handler: submits the pipeline; disables the button while running."""
    with run_output_area:
      run_output_area.clear_output()

      @progress_indicated
      def _inner():
        options = self.to_options()
        # Caches the output_pcoll to a GCS bucket.
        try:
          execution_count = 0
          if is_in_ipython():
            from IPython import get_ipython
            execution_count = get_ipython().execution_count
          output_location = '{}/{}'.format(
              options.view_as(GoogleCloudOptions).staging_location,
              self.output_name)
          # Fixed label typo: was 'WriteOuput{}_{}ToGCS'.
          _ = self.output_pcoll | 'WriteOutput{}_{}ToGCS'.format(
              self.output_name,
              execution_count) >> WriteToText(output_location)
          _LOGGER.info(
              'Data of output PCollection %s will be written to %s',
              self.output_name,
              output_location)
        except (KeyboardInterrupt, SystemExit):
          raise
        except:  # pylint: disable=bare-except
          # The transform has been added before, noop.
          pass
        if self.verbose:
          _LOGGER.info(
              'Running the pipeline on Dataflow with pipeline options %s.',
              pformat_dict(options.display_data()))
        result = create_runner('DataflowRunner').run_pipeline(self.p, options)
        cloud_options = options.view_as(GoogleCloudOptions)
        url = (
            'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
            (cloud_options.region, result.job_id(), cloud_options.project))
        display(
            HTML(
                'Click <a href="%s" target="_new">here</a> for the details '
                'of your Dataflow job.' % url))
        result_name = 'result_{}'.format(self.output_name)
        create_var_in_main(result_name, result)
        if self.verbose:
          _LOGGER.info(
              'The pipeline result of the run can be accessed from variable '
              '%s. The current status is %s.',
              result_name,
              result)

      # Prevent double-submission while a run is in flight.
      try:
        btn.disabled = True
        _inner()
      finally:
        btn.disabled = False

  run_btn.on_click(_run_on_dataflow)

  def _show_options(btn):
    """Click handler: displays current options and pins them in __main__."""
    with options_output_area:
      options_output_area.clear_output()
      options = self.to_options()
      options_name = 'options_{}'.format(self.output_name)
      create_var_in_main(options_name, options)
      _LOGGER.info(
          'The pipeline options configured is: %s.',
          pformat_dict(options.display_data()))

  show_options_btn.on_click(_show_options)
  grid = GridBox([run_btn, show_options_btn],
                 layout=Layout(grid_template_columns='repeat(2, 200px)'))
  display(grid)
  # Implicitly initializes the options variable before 1st time showing
  # options.
  options_name_inited, _ = create_var_in_main(
      'options_{}'.format(self.output_name), self.to_options())
  if not self.notice_shown:
    _LOGGER.info(
        'The pipeline options can be configured through variable %s. You '
        'may also add additional options or sink transforms such as write '
        'to BigQuery in other notebook cells. Come back to click "Run on '
        'Dataflow" button once you complete additional configurations. '
        'Optionally, you can chain more beam_sql magics with DataflowRunner '
        'and click "Run on Dataflow" in their outputs.',
        options_name_inited)
    self.notice_shown = True
  display(options_output_area)
  display(run_output_area)