Exemplo n.º 1
0
    def test_pcoll_by_name(self):
        # A PCollection that has been watched under a variable name must show
        # up under that name in the mapping returned by utils.pcoll_by_name().
        pipeline = beam.Pipeline()
        created = pipeline | beam.Create([1])
        watched_vars = {'p': pipeline, 'pcoll': created}
        ib.watch(watched_vars)

        self.assertIn('pcoll', utils.pcoll_by_name())
Exemplo n.º 2
0
    def __init__(
            self,
            user_pipeline,  # type: beam.Pipeline
            pcolls,  # type: List[beam.pvalue.PCollection]
            result,  # type: beam.runner.PipelineResult
            max_n,  # type: int
            max_duration_secs,  # type: float
    ):
        """Creates one ElementStream per PCollection and starts a marker thread.

        Args:
          user_pipeline: the pipeline whose id is used to query cache
            completeness.
          pcolls: the PCollections to build element streams for.
          result: the pipeline result, stored together with a lock guarding it.
          max_n: forwarded to every ElementStream.
          max_duration_secs: forwarded to every ElementStream and also kept as
            the duration of this object.
        """
        self._user_pipeline = user_pipeline
        self._result = result
        self._result_lock = threading.Lock()
        self._pcolls = pcolls

        # Invert the name -> PCollection mapping a single time instead of
        # rebuilding it for every lookup. PCollections that are not bound to
        # a watched variable simply resolve to None below.
        pcoll_to_var = {
            pcoll: name
            for name, pcoll in utils.pcoll_by_name().items()
        }

        self._streams = {}
        for pcoll in pcolls:
            var_name = pcoll_to_var.get(pcoll)
            self._streams[pcoll] = ElementStream(
                pcoll, var_name,
                CacheKey.from_pcoll(var_name, pcoll).to_str(), max_n,
                max_duration_secs)

        self._start = time.time()
        self._duration_secs = max_duration_secs
        # Holds whatever bcj.is_cache_complete returns for this pipeline's id.
        self._set_computed = bcj.is_cache_complete(str(id(user_pipeline)))

        # Run a separate thread for marking the PCollections done. This is because
        # the pipeline run may be asynchronous.
        self._mark_computed = threading.Thread(target=self._mark_all_computed)
        # Daemon thread: it must not keep the interpreter alive on exit.
        self._mark_computed.daemon = True
        self._mark_computed.start()
Exemplo n.º 3
0
    def beam_sql(self, line: str, cell: str) -> Union[None, PValue]:
        """The beam_sql cell magic that executes a Beam SQL.

        Args:
          line: (optional) the string on the same line after the beam_sql
            magic. Used as the output variable name in the __main__ module.
            Surrounding whitespace is ignored.
          cell: everything else in the same notebook cell as a string. Used as
            a Beam SQL query.

        Returns:
          None if running into an error, otherwise a PValue as if a
          SqlTransform is applied.
        """
        # Normalize the name once so that the value being validated is exactly
        # the value that is later handed to apply_sql and shown in errors.
        output_name = (line or '').strip()
        if output_name and (not output_name.isidentifier() or
                            keyword.iskeyword(output_name)):
            on_error(
                'The output_name "%s" is not a valid identifier. Please supply a '
                'valid identifier that is not a Python keyword.', output_name)
            return None
        if not cell or cell.isspace():
            on_error('Please supply the sql to be executed.')
            return None

        found = find_pcolls(cell, pcoll_by_name())
        # Every PCollection referenced by the query must carry a NamedTuple
        # element type; bail out on the first one that does not.
        for pcoll in found.values():
            if not is_namedtuple(pcoll.element_type):
                on_error(
                    'PCollection %s of type %s is not a NamedTuple. See '
                    'https://beam.apache.org/documentation/programming-guide/#schemas '
                    'for more details.', pcoll, pcoll.element_type)
                return None
            register_coder_for_schema(pcoll.element_type)

        # apply_sql returns the name actually used for the output along with
        # the output itself.
        output_name, output = apply_sql(cell, output_name, found)
        cache_output(output_name, output)
        return output
Exemplo n.º 4
0
 def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
     """Converts the chain into an executable pipeline.

     Args:
       pipeline: the pipeline to build on. When it is falsy and this node's
         source is itself a pipeline, that source pipeline is used instead.

     Returns:
       The pipeline after this node and every following node (self.next) have
       been applied to it.
     """
     if pipeline not in self.evaluated:
         # The whole chain should form a single pipeline.
         upstream = self.source
         if not isinstance(self.source, beam.Pipeline):
             # The source is a collection of variable names; resolve them to
             # the PCollections currently known by name.
             known = pcoll_by_name()
             if len(self.source) == 1:
                 upstream = known.get(next(iter(self.source)))
             else:
                 upstream = {name: known.get(name) for name in self.source}
         elif pipeline:
             # Use the known pipeline.
             upstream = pipeline
         else:
             # Use the source pipeline.
             pipeline = self.source

         if isinstance(upstream, beam.Pipeline):
             label = 'beam_sql_{}_{}'.format(
                 self.output_name, self.execution_count)
             transform = label >> SqlTransform(self.query)
         else:
             label = 'schema_loaded_beam_sql_{}_{}'.format(
                 self.output_name, self.execution_count)
             transform = label >> SchemaLoadedSqlTransform(
                 self.output_name, self.query, self.schemas,
                 self.execution_count)
         output = upstream | transform

         create_var_in_main(self.output_name, output)
         self.evaluated.add(pipeline)

     if not self.next:
         return pipeline
     return self.next.to_pipeline(pipeline)
Exemplo n.º 5
0
    def beam_sql(self,
                 line: str,
                 cell: Optional[str] = None) -> Optional[PValue]:
        """The beam_sql line/cell magic that executes a Beam SQL.

        Args:
          line: the string on the same line after the beam_sql magic.
          cell: everything else in the same notebook cell as a string. If
            None, beam_sql is used as line magic. Otherwise, cell magic.

        Returns:
          None if running into an error, otherwise a PValue as if a
          SqlTransform is applied.
        """
        input_str = line
        if cell:
            input_str += ' ' + cell
        # The input is tokenized on whitespace before being handed to the
        # parser.
        parsed = self._parser.parse(input_str.strip().split())
        if not parsed:
            # Failed to parse inputs, let the parser handle the exit.
            return None
        output_name = parsed.output_name
        verbose = parsed.verbose
        query = parsed.query

        if output_name and (not output_name.isidentifier() or
                            keyword.iskeyword(output_name)):
            # Report the offending name itself, not the whole raw input line.
            on_error(
                'The output_name "%s" is not a valid identifier. Please supply a '
                'valid identifier that is not a Python keyword.', output_name)
            return None
        if not query:
            on_error('Please supply the SQL query to be executed.')
            return None
        # NOTE(review): parsed.query looks like the list of whitespace-split
        # tokens, so joining collapses original whitespace runs - confirm.
        query = ' '.join(query)

        found = find_pcolls(query, pcoll_by_name(), verbose=verbose)
        # Every PCollection referenced by the query must carry a NamedTuple
        # element type; bail out on the first one that does not.
        for pcoll in found.values():
            if not is_namedtuple(pcoll.element_type):
                on_error(
                    'PCollection %s of type %s is not a NamedTuple. See '
                    'https://beam.apache.org/documentation/programming-guide/#schemas '
                    'for more details.', pcoll, pcoll.element_type)
                return None
            register_coder_for_schema(pcoll.element_type, verbose=verbose)

        output_name, output = apply_sql(query, output_name, found)
        cache_output(output_name, output)
        return output
Exemplo n.º 6
0
  def beam_sql(self, line: str, cell: Optional[str] = None) -> Optional[PValue]:
    """The beam_sql line/cell magic that executes a Beam SQL.

    Args:
      line: the string on the same line after the beam_sql magic.
      cell: everything else in the same notebook cell as a string. If None,
        beam_sql is used as line magic. Otherwise, cell magic.

    Returns:
      None if running into an error or waiting for user input (running on
      a selected runner remotely), otherwise a PValue as if a SqlTransform is
      applied.

    Raises:
      ValueError: if the runner passed the supported-runner check but has no
        handling implemented here.
    """
    input_str = line
    if cell:
      input_str += ' ' + cell
    # The input is tokenized on whitespace before being handed to the parser.
    parsed = self._parser.parse(input_str.strip().split())
    if not parsed:
      # Failed to parse inputs, let the parser handle the exit.
      return None
    output_name = parsed.output_name
    verbose = parsed.verbose
    query = parsed.query
    runner = parsed.runner

    if output_name and (not output_name.isidentifier() or
                        keyword.iskeyword(output_name)):
      # Report the offending name itself, not the whole raw input line.
      on_error(
          'The output_name "%s" is not a valid identifier. Please supply a '
          'valid identifier that is not a Python keyword.',
          output_name)
      return None
    if not query:
      on_error('Please supply the SQL query to be executed.')
      return None
    if runner and runner not in _SUPPORTED_RUNNERS:
      on_error(
          'Runner "%s" is not supported. Supported runners are %s.',
          runner,
          _SUPPORTED_RUNNERS)
      return None
    query = ' '.join(query)

    found = find_pcolls(query, pcoll_by_name(), verbose=verbose)
    schemas = set()
    main_session = importlib.import_module('__main__')
    for pcoll in found.values():
      if not match_is_named_tuple(pcoll.element_type):
        on_error(
            'PCollection %s of type %s is not a NamedTuple. See '
            'https://beam.apache.org/documentation/programming-guide/#schemas '
            'for more details.',
            pcoll,
            pcoll.element_type)
        return None
      register_coder_for_schema(pcoll.element_type, verbose=verbose)
      # Only care about schemas defined by the user in the main module.
      if hasattr(main_session, pcoll.element_type.__name__):
        schemas.add(pcoll.element_type)

    # No runner specified is treated the same as DirectRunner: apply the query
    # locally and hand the output back to the caller.
    if runner in ('DirectRunner', None):
      collect_data_for_local_run(query, found)
      output_name, output, chain = apply_sql(query, output_name, found)
      chain.current.schemas = schemas
      cache_output(output_name, output)
      return output

    output_name, current_node, chain = apply_sql(
        query, output_name, found, False)
    current_node.schemas = schemas
    # TODO(BEAM-10708): Move the options setup and result handling to a
    # separate module when more runners are supported.
    if runner == 'DataflowRunner':
      _ = chain.to_pipeline()
      _ = DataflowOptionsForm(
          output_name, pcoll_by_name()[output_name],
          verbose).display_for_input()
      return None
    # Exceptions do not interpolate %-style arguments the way loggers do, so
    # the message has to be formatted before it is raised.
    raise ValueError('Unsupported runner %s.' % runner)