def Prepare(self, request, context=None):
  """Handles a Prepare RPC: registers a new BeamJob and returns staging info.

  Args:
    request: a PrepareJobRequest carrying the job name, pipeline proto and
      pipeline options.
    context: unused gRPC context.

  Returns:
    A PrepareJobResponse with the new preparation id and the artifact
    staging endpoint (which may be unset).
  """
  logging.debug('Got Prepare request.')
  # For now, just use the job name as the job id.
  prep_id = '%s-%s' % (request.job_name, uuid.uuid4())
  info_proto = beam_provision_api_pb2.ProvisionInfo(
      job_id=prep_id,
      job_name=request.job_name,
      pipeline_options=request.pipeline_options,
      retrieval_token=self._artifact_service.retrieval_token(prep_id))
  provision_info = fn_api_runner.ExtendedProvisionInfo(
      info_proto, self._staging_dir)
  self._jobs[prep_id] = BeamJob(
      prep_id, request.pipeline_options, request.pipeline, provision_info)
  logging.debug("Prepared job '%s' as '%s'", request.job_name, prep_id)
  # TODO(angoenka): Pass an appropriate staging_session_token. The token can
  # be obtained in PutArtifactResponse from JobService
  if not self._artifact_staging_endpoint:
    # Nothing will be staged by the front-end, but a worker may still ask
    # for the manifest, so record an empty one up front.
    self._artifact_service.CommitManifest(
        beam_artifact_api_pb2.CommitManifestRequest(
            staging_session_token=prep_id,
            manifest=beam_artifact_api_pb2.Manifest()))
  return beam_job_api_pb2.PrepareJobResponse(
      preparation_id=prep_id,
      artifact_staging_endpoint=self._artifact_staging_endpoint,
      staging_session_token=prep_id)
def create_beam_job(
    self,
    preparation_id,  # type: str
    job_name,  # type: str
    pipeline,  # type: beam_runner_api_pb2.Pipeline
    options  # type: struct_pb2.Struct
):
  # type: (...) -> BeamJob
  """Creates a BeamJob for a prepared pipeline, staging its artifacts.

  The preparation id doubles as the artifact staging session token for
  both the legacy and the new artifact services.

  Args:
    preparation_id: unique id assigned to this preparation.
    job_name: user-supplied name of the job.
    pipeline: the pipeline proto to be run.
    options: pipeline options encoded as a protobuf Struct.

  Returns:
    A BeamJob wrapping the pipeline, its options and provisioning info.
  """
  # TODO(angoenka): Pass an appropriate staging_session_token. The token can
  # be obtained in PutArtifactResponse from JobService
  if not self._artifact_staging_endpoint:
    # The front-end didn't try to stage anything, but the worker may
    # request what's here so we should at least store an empty manifest.
    self._legacy_artifact_service.CommitManifest(
        beam_artifact_api_pb2.CommitManifestRequest(
            staging_session_token=preparation_id,
            manifest=beam_artifact_api_pb2.Manifest()))
  # Record each environment's dependency list under this job's staging
  # token so workers can retrieve them.
  self._artifact_service.register_job(
      staging_token=preparation_id,
      dependency_sets={
          id: env.dependencies  # NOTE: `id` shadows the builtin here.
          for (id, env) in pipeline.components.environments.items()
      })
  provision_info = fn_runner.ExtendedProvisionInfo(
      beam_provision_api_pb2.ProvisionInfo(
          pipeline_options=options,
          retrieval_token=self._legacy_artifact_service.retrieval_token(
              preparation_id)),
      self._staging_dir,
      job_name=job_name)
  return BeamJob(
      preparation_id,
      pipeline,
      options,
      provision_info,
      self._artifact_staging_endpoint,
      self._artifact_service)
def __init__(
    self,
    default_environment=None,  # type: Optional[environments.Environment]
    bundle_repeat=0,
    use_state_iterables=False,
    provision_info=None,  # type: Optional[ExtendedProvisionInfo]
    progress_request_frequency=None):
  # type: (...) -> None
  """Initializes a new Fn API Runner.

  Args:
    default_environment: environment to run UserFns in when none is
      specified; defaults to the embedded Python environment.
    bundle_repeat: how many extra times to replay each bundle, for
      profiling and debugging.
    use_state_iterables: intentionally split gbk iterables over the state
      API (testing only).
    provision_info: provisioning info made available to workers; a
      placeholder with an unused retrieval token is created when None.
    progress_request_frequency: seconds the runner waits between progress
      requests to the SDK.
  """
  super(FnApiRunner, self).__init__()
  # `or` (not an is-None test) matches the original truthiness semantics.
  env = default_environment or environments.EmbeddedPythonEnvironment()
  info = provision_info or ExtendedProvisionInfo(
      beam_provision_api_pb2.ProvisionInfo(
          retrieval_token='unused-retrieval-token'))
  self._default_environment = env
  self._provision_info = info
  self._bundle_repeat = bundle_repeat
  self._use_state_iterables = use_state_iterables
  self._num_workers = 1
  self._progress_frequency = progress_request_frequency
  self._profiler_factory = None  # type: Optional[Callable[..., profiler.Profile]]
def run_pipeline(self, pipeline, options):
  """Runs the pipeline on the FnApiRunner when every transform supports it,
  otherwise falls back to the bundle-based direct runner."""
  from apache_beam.pipeline import PipelineVisitor
  from apache_beam.runners.dataflow.native_io.iobase import NativeSource
  from apache_beam.runners.dataflow.native_io.iobase import _NativeWrite
  from apache_beam.testing.test_stream import TestStream

  class _CompatibilityVisitor(PipelineVisitor):
    """Walks the pipeline and records whether the FnApiRunner can run it."""
    def accept(self, pipeline):
      self.compatible = True
      pipeline.visit(self)
      return self.compatible

    def visit_transform(self, applied_ptransform):
      transform = applied_ptransform.transform
      # The FnApiRunner does not support streaming execution.
      if isinstance(transform, TestStream):
        self.compatible = False
      # Nor reads from NativeSources.
      if isinstance(transform, beam.io.Read):
        if isinstance(transform.source, NativeSource):
          self.compatible = False
      # Nor the use of _NativeWrites.
      if isinstance(transform, _NativeWrite):
        self.compatible = False
      if isinstance(transform, beam.ParDo):
        dofn = transform.dofn
        # Nor execution of CombineFns with deferred side inputs.
        if isinstance(dofn, CombineValuesDoFn):
          positional, keyed = transform.raw_side_inputs
          has_deferred = any(
              isinstance(side, ArgumentPlaceholder)
              for side in itertools.chain(positional, keyed.values()))
          if has_deferred:
            self.compatible = False
        # Nor real-time timers in stateful DoFns.
        if userstate.is_stateful_dofn(dofn):
          _, timer_specs = userstate.get_dofn_specs(dofn)
          if any(spec.time_domain == TimeDomain.REAL_TIME
                 for spec in timer_specs):
            self.compatible = False

  # Check whether all transforms used in the pipeline are supported by the
  # FnApiRunner, and the pipeline was not meant to be run as streaming.
  if _CompatibilityVisitor().accept(pipeline):
    from apache_beam.portability.api import beam_provision_api_pb2
    from apache_beam.runners.portability.fn_api_runner import fn_runner
    from apache_beam.runners.portability.portable_runner import JobServiceHandle
    encoded_options = JobServiceHandle.encode_pipeline_options(
        options.get_all_options())
    provision_info = fn_runner.ExtendedProvisionInfo(
        beam_provision_api_pb2.ProvisionInfo(
            pipeline_options=encoded_options))
    runner = fn_runner.FnApiRunner(provision_info=provision_info)
  else:
    runner = BundleBasedDirectRunner()
  return runner.run_pipeline(pipeline, options)
def __init__(
    self,
    provision_info=None,  # type: Optional[beam_provision_api_pb2.ProvisionInfo]
    artifact_staging_dir=None,
    job_name=None,  # type: Optional[str]
):
  """Bundles a ProvisionInfo proto with runner-local staging details.

  Args:
    provision_info: the ProvisionInfo proto to expose to workers; an empty
      proto is substituted when falsy.
    artifact_staging_dir: local directory used for artifact staging.
    job_name: name of the job this info belongs to.
  """
  if provision_info:
    self.provision_info = provision_info
  else:
    self.provision_info = beam_provision_api_pb2.ProvisionInfo()
  self.artifact_staging_dir = artifact_staging_dir
  self.job_name = job_name
def create_beam_job(self, preparation_id, job_name, pipeline, options):
  # type: (str, str, beam_runner_api_pb2.Pipeline, struct_pb2.Struct) -> BeamJob
  """Creates a BeamJob for a prepared pipeline.

  The preparation id doubles as both the artifact staging session token
  and the provisioned job id.

  Args:
    preparation_id: unique id assigned to this preparation.
    job_name: user-supplied name of the job.
    pipeline: the pipeline proto to be run.
    options: pipeline options encoded as a protobuf Struct.

  Returns:
    A BeamJob wrapping the pipeline, its options and provisioning info.
  """
  # TODO(angoenka): Pass an appropriate staging_session_token. The token can
  # be obtained in PutArtifactResponse from JobService
  if not self._artifact_staging_endpoint:
    # The front-end didn't try to stage anything, but the worker may
    # request what's here so we should at least store an empty manifest.
    self._artifact_service.CommitManifest(
        beam_artifact_api_pb2.CommitManifestRequest(
            staging_session_token=preparation_id,
            manifest=beam_artifact_api_pb2.Manifest()))
  provision_info = fn_api_runner.ExtendedProvisionInfo(
      beam_provision_api_pb2.ProvisionInfo(
          job_id=preparation_id,
          job_name=job_name,
          pipeline_options=options,
          retrieval_token=self._artifact_service.retrieval_token(
              preparation_id)),
      self._staging_dir)
  return BeamJob(
      preparation_id,
      pipeline,
      options,
      provision_info,
      self._artifact_staging_endpoint)
def create_beam_job(
    self,
    preparation_id,  # type: str
    job_name,  # type: str
    pipeline,  # type: beam_runner_api_pb2.Pipeline
    options  # type: struct_pb2.Struct
):
  # type: (...) -> BeamJob
  """Registers the job's artifact dependencies and creates a BeamJob.

  The preparation id doubles as the artifact staging token.

  Args:
    preparation_id: unique id assigned to this preparation.
    job_name: user-supplied name of the job.
    pipeline: the pipeline proto to be run.
    options: pipeline options encoded as a protobuf Struct.

  Returns:
    A BeamJob wrapping the pipeline, its options and provisioning info.
  """
  # Record each environment's dependency list under this job's staging
  # token so workers can retrieve them.
  self._artifact_service.register_job(
      staging_token=preparation_id,
      dependency_sets={
          id: env.dependencies  # NOTE: `id` shadows the builtin here.
          for (id, env) in pipeline.components.environments.items()
      })
  provision_info = fn_runner.ExtendedProvisionInfo(
      beam_provision_api_pb2.ProvisionInfo(pipeline_options=options),
      self._staging_dir,
      job_name=job_name)
  return BeamJob(
      preparation_id,
      pipeline,
      options,
      provision_info,
      self._artifact_staging_endpoint,
      self._artifact_service)