Example #1
 def _show_options(btn):
     with options_output_area:
         options_output_area.clear_output()
         options = self.to_options()
         options_name = 'options_{}'.format(self.output_name)
         create_var_in_main(options_name, options)
         _LOGGER.info('The pipeline options configured are: %s.',
                      pformat_dict(options.display_data()))
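
For context, a minimal standalone sketch of inspecting configured pipeline options the way this snippet does. The option values below are placeholders, not taken from the example; display_data() returns a dict of the explicitly configured options, which is what the snippet logs through pformat_dict.

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

# Placeholder option values for illustration only.
options = PipelineOptions(
    project='my-project',
    region='us-central1',
    staging_location='gs://my-bucket/staging')

# view_as exposes the Google Cloud specific options backed by the same flags.
cloud_options = options.view_as(GoogleCloudOptions)
print(cloud_options.staging_location)

# display_data() reports the options that were explicitly set.
print(options.display_data())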
Example #2
 def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
     """Converts the chain into an executable pipeline."""
     if pipeline not in self.evaluated:
         # The whole chain should form a single pipeline.
         source = self.source
         if isinstance(self.source, beam.Pipeline):
             if pipeline:  # use the known pipeline
                 source = pipeline
             else:  # use the source pipeline
                 pipeline = self.source
         else:
             name_to_pcoll = pcoll_by_name()
             if len(self.source) == 1:
                 source = name_to_pcoll.get(next(iter(self.source)))
             else:
                 source = {s: name_to_pcoll.get(s) for s in self.source}
         if isinstance(source, beam.Pipeline):
             output = source | 'beam_sql_{}_{}'.format(
                 self.output_name, self.execution_count) >> SqlTransform(
                     self.query)
         else:
             output = source | 'schema_loaded_beam_sql_{}_{}'.format(
                 self.output_name,
                 self.execution_count) >> SchemaLoadedSqlTransform(
                     self.output_name, self.query, self.schemas,
                     self.execution_count)
         _ = create_var_in_main(self.output_name, output)
         self.evaluated.add(pipeline)
     if self.next:
         return self.next.to_pipeline(pipeline)
     else:
         return pipeline
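
For reference, SqlTransform (from apache_beam.transforms.sql) accepts the source shapes the method resolves above: a pipeline root, a single PCollection (referenced as PCOLLECTION in the query), or a dict of PCollections (referenced by their keys). A minimal sketch of the latter two shapes; the schema and queries are illustrative, and applying SqlTransform requires Beam's Java SQL expansion service to be available.

import typing

import apache_beam as beam
from apache_beam.transforms.sql import SqlTransform

SimpleRow = typing.NamedTuple('SimpleRow', [('id', int), ('name', str)])

p = beam.Pipeline()
rows = (
    p
    | beam.Create([SimpleRow(1, 'a'), SimpleRow(2, 'b')]).with_output_types(
        SimpleRow))

# Single PCollection source: the query refers to it as PCOLLECTION.
ids = rows | 'single_source_sql' >> SqlTransform('SELECT id FROM PCOLLECTION')

# Dict source: each key becomes a table name in the query.
names = {'people': rows} | 'dict_source_sql' >> SqlTransform(
    'SELECT name FROM people')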
Example #3
 def _inner():
     options = self.to_options()
     # Caches the output_pcoll to a GCS bucket.
     try:
         execution_count = 0
         if is_in_ipython():
             from IPython import get_ipython
             execution_count = get_ipython().execution_count
         output_location = '{}/{}'.format(
             options.view_as(
                 GoogleCloudOptions).staging_location,
             self.output_name)
         _ = self.output_pcoll | 'WriteOutput{}_{}ToGCS'.format(
             self.output_name,
             execution_count) >> WriteToText(output_location)
         _LOGGER.info(
             'Data of output PCollection %s will be written to %s',
             self.output_name, output_location)
     except (KeyboardInterrupt, SystemExit):
         raise
     except:  # pylint: disable=bare-except
         # The transform has been added before, noop.
         pass
     if self.verbose:
         _LOGGER.info(
             'Running the pipeline on Dataflow with pipeline options %s.',
             pformat_dict(options.display_data()))
     result = create_runner('DataflowRunner').run_pipeline(
         self.p, options)
     cloud_options = options.view_as(GoogleCloudOptions)
     url = (
         'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s'
         % (cloud_options.region, result.job_id(),
            cloud_options.project))
     display(
         HTML(
             'Click <a href="%s" target="_new">here</a> for the details '
             'of your Dataflow job.' % url))
     result_name = 'result_{}'.format(self.output_name)
     create_var_in_main(result_name, result)
     if self.verbose:
         _LOGGER.info(
             'The pipeline result of the run can be accessed from variable '
             '%s. The current status is %s.', result_name,
             result)
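
The Dataflow console URL in this snippet is assembled from just the job's region, id and project. A standalone sketch of that composition with placeholder values (in the snippet, the job id comes from result.job_id() on the Dataflow pipeline result):

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

# Placeholder values; in the snippet they come from self.to_options().
options = PipelineOptions(project='my-project', region='us-central1')
cloud_options = options.view_as(GoogleCloudOptions)

job_id = '2024-01-01_00_00_00-1234567890'  # placeholder for result.job_id()
url = (
    'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
    (cloud_options.region, job_id, cloud_options.project))
print(url)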
Example #4
def apply_sql(
    query: str,
    output_name: Optional[str],
    found: Dict[str, beam.PCollection],
    run: bool = True) -> Tuple[str, Union[PValue, SqlNode], SqlChain]:
  """Applies a SqlTransform with the given sql and queried PCollections.

  Args:
    query: The SQL query executed in the magic.
    output_name: (optional) The output variable name in the __main__ module.
    found: The PCollections with variable names found to be used in the query.
    run: Whether to prepare the SQL pipeline for a local run or not.

  Returns:
    A tuple of values. The first value is the output variable name in the
    __main__ module, auto-generated if not provided. The second value is a
    PValue if run is True; otherwise it is a SqlNode that tracks the SQL
    without applying or executing it. The third value is the SqlChain of
    SqlNodes that have been applied so far.
  """
  output_name = _generate_output_name(output_name, query, found)
  query, sql_source, chain = _build_query_components(
      query, found, output_name, run)
  if run:
    try:
      output = sql_source | SqlTransform(query)
      # Declare a variable with the output_name and output value in the
      # __main__ module so that the user can use the output smoothly.
      output_name, output = create_var_in_main(output_name, output)
      _LOGGER.info(
          "The output PCollection variable is %s with element_type %s",
          output_name,
          pformat_namedtuple(output.element_type))
      return output_name, output, chain
    except (KeyboardInterrupt, SystemExit):
      raise
    except:  # pylint: disable=bare-except
      on_error('Error when applying the Beam SQL: %s', traceback.format_exc())
      raise
  else:
    return output_name, chain.current, chain
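
A hedged usage sketch of apply_sql roughly as the beam_sql magic would call it. The import path and the interactive setup are assumptions; with run=False the SQL is only recorded on the chain as a SqlNode rather than applied, which avoids needing the Java SQL expansion service at this point.

import typing

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
# Assumed import path; apply_sql lives in Beam's interactive SQL internals.
from apache_beam.runners.interactive.sql.beam_sql_magics import apply_sql

Person = typing.NamedTuple('Person', [('id', int), ('name', str)])

p = beam.Pipeline()
persons = (
    p | beam.Create([Person(1, 'a'), Person(2, 'b')]).with_output_types(Person))
ib.watch(locals())  # lets interactive Beam resolve the `persons` variable

# `found` maps the variable names used in the query to their PCollections.
output_name, node, chain = apply_sql(
    query='SELECT id FROM persons',
    output_name=None,  # auto-generated when not provided
    found={'persons': persons},
    run=False)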
Example #5
 def test_create_var_in_main(self):
     name = 'test_create_var_in_main'
     value = Record(0, 0, 0)
     _ = utils.create_var_in_main(name, value)
     main_session = importlib.import_module('__main__')
     self.assertIs(getattr(main_session, name, None), value)
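
All of the examples on this page rely on create_var_in_main to publish a value into the notebook's __main__ module. Based on this test and on the call sites above, which unpack a (name, value) tuple, a minimal sketch of the behavior might look like the following; the real utility in apache_beam.runners.interactive.utils may do more.

import importlib
from typing import Any, Tuple


def create_var_in_main(name: str, value: Any) -> Tuple[str, Any]:
    """Sketch: binds value to name in the __main__ module and returns both."""
    main_module = importlib.import_module('__main__')
    setattr(main_module, name, value)
    return name, value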
Example #6
    def display_actions(self):
        from IPython.display import HTML
        from IPython.display import display
        from ipywidgets import Button
        from ipywidgets import GridBox
        from ipywidgets import Layout
        from ipywidgets import Output
        options_output_area = Output()
        run_output_area = Output()
        run_btn = Button(
            description='Run on Dataflow',
            button_style='success',
            tooltip=
            ('Submit to Dataflow for execution with the configured options. The '
             'output PCollection\'s data will be written to the GCS bucket you '
             'configure.'))
        show_options_btn = Button(
            description='Show Options',
            button_style='info',
            tooltip='Show current pipeline options configured.')

        def _run_on_dataflow(btn):
            with run_output_area:
                run_output_area.clear_output()

                @progress_indicated
                def _inner():
                    options = self.to_options()
                    # Caches the output_pcoll to a GCS bucket.
                    try:
                        execution_count = 0
                        if is_in_ipython():
                            from IPython import get_ipython
                            execution_count = get_ipython().execution_count
                        output_location = '{}/{}'.format(
                            options.view_as(
                                GoogleCloudOptions).staging_location,
                            self.output_name)
                        _ = self.output_pcoll | 'WriteOutput{}_{}ToGCS'.format(
                            self.output_name,
                            execution_count) >> WriteToText(output_location)
                        _LOGGER.info(
                            'Data of output PCollection %s will be written to %s',
                            self.output_name, output_location)
                    except (KeyboardInterrupt, SystemExit):
                        raise
                    except:  # pylint: disable=bare-except
                        # The transform has been added before, noop.
                        pass
                    if self.verbose:
                        _LOGGER.info(
                            'Running the pipeline on Dataflow with pipeline options %s.',
                            pformat_dict(options.display_data()))
                    result = create_runner('DataflowRunner').run_pipeline(
                        self.p, options)
                    cloud_options = options.view_as(GoogleCloudOptions)
                    url = (
                        'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s'
                        % (cloud_options.region, result.job_id(),
                           cloud_options.project))
                    display(
                        HTML(
                            'Click <a href="%s" target="_new">here</a> for the details '
                            'of your Dataflow job.' % url))
                    result_name = 'result_{}'.format(self.output_name)
                    create_var_in_main(result_name, result)
                    if self.verbose:
                        _LOGGER.info(
                            'The pipeline result of the run can be accessed from variable '
                            '%s. The current status is %s.', result_name,
                            result)

                try:
                    btn.disabled = True
                    _inner()
                finally:
                    btn.disabled = False

        run_btn.on_click(_run_on_dataflow)

        def _show_options(btn):
            with options_output_area:
                options_output_area.clear_output()
                options = self.to_options()
                options_name = 'options_{}'.format(self.output_name)
                create_var_in_main(options_name, options)
                _LOGGER.info('The pipeline options configured are: %s.',
                             pformat_dict(options.display_data()))

        show_options_btn.on_click(_show_options)
        grid = GridBox([run_btn, show_options_btn],
                       layout=Layout(grid_template_columns='repeat(2, 200px)'))
        display(grid)

        # Implicitly initializes the options variable before 1st time showing
        # options.
        options_name_inited, _ = create_var_in_main(
            'options_{}'.format(self.output_name), self.to_options())
        if not self.notice_shown:
            _LOGGER.info(
                'The pipeline options can be configured through variable %s. You '
                'may also add additional options or sink transforms such as write '
                'to BigQuery in other notebook cells. Come back to click "Run on '
                'Dataflow" button once you complete additional configurations. '
                'Optionally, you can chain more beam_sql magics with DataflowRunner '
                'and click "Run on Dataflow" in their outputs.',
                options_name_inited)
            self.notice_shown = True

        display(options_output_area)
        display(run_output_area)
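
The widget wiring in display_actions follows a standard ipywidgets pattern: each button gets its own Output area, the click handler clears and writes into that area, and the buttons are laid out with a GridBox. A stripped-down, Beam-independent sketch of the same pattern for a Jupyter environment:

from IPython.display import display
from ipywidgets import Button, GridBox, Layout, Output

output_area = Output()
action_btn = Button(description='Do something', button_style='info')


def _on_click(btn):
    # Anything rendered inside this context manager goes to output_area.
    with output_area:
        output_area.clear_output()
        print('Button clicked.')


action_btn.on_click(_on_click)
grid = GridBox([action_btn],
               layout=Layout(grid_template_columns='repeat(1, 200px)'))
display(grid)
display(output_area)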