def create_stages(self, pipeline_proto): pipeline_context = fn_api_runner_transforms.TransformContext( copy.deepcopy(pipeline_proto.components), use_state_iterables=self._use_state_iterables) # Initial set of stages are singleton leaf transforms. stages = list(fn_api_runner_transforms.leaf_transform_stages( pipeline_proto.root_transform_ids, pipeline_proto.components)) # Apply each phase in order. for phase in [ fn_api_runner_transforms.annotate_downstream_side_inputs, fn_api_runner_transforms.fix_side_input_pcoll_coders, fn_api_runner_transforms.lift_combiners, fn_api_runner_transforms.expand_gbk, fn_api_runner_transforms.sink_flattens, fn_api_runner_transforms.greedily_fuse, fn_api_runner_transforms.read_to_impulse, fn_api_runner_transforms.impulse_to_input, fn_api_runner_transforms.inject_timer_pcollections, fn_api_runner_transforms.sort_stages, fn_api_runner_transforms.window_pcollection_coders]: logging.info('%s %s %s', '=' * 20, phase, '=' * 20) stages = list(phase(stages, pipeline_context)) logging.debug('Stages: %s', [str(s) for s in stages]) # Return the (possibly mutated) context and ordered set of stages. return pipeline_context.components, stages, pipeline_context.safe_coders
def run_pipeline(self, pipeline, options): portable_options = options.view_as(PortableOptions) job_endpoint = portable_options.job_endpoint # TODO: https://issues.apache.org/jira/browse/BEAM-5525 # portable runner specific default if options.view_as(SetupOptions).sdk_location == 'default': options.view_as(SetupOptions).sdk_location = 'container' if not job_endpoint: # TODO Provide a way to specify a container Docker URL # https://issues.apache.org/jira/browse/BEAM-6328 docker = DockerizedJobServer() job_endpoint = docker.start() # This is needed as we start a worker server if one is requested # but none is provided. if portable_options.environment_type == 'LOOPBACK': portable_options.environment_config, server = ( BeamFnExternalWorkerPoolServicer.start( sdk_worker_main._get_worker_count(options))) cleanup_callbacks = [functools.partial(server.stop, 1)] else: cleanup_callbacks = [] proto_pipeline = pipeline.to_runner_api( default_environment=PortableRunner._create_environment( portable_options)) # Some runners won't detect the GroupByKey transform unless it has no # subtransforms. Remove all sub-transforms until BEAM-4605 is resolved. for _, transform_proto in list( proto_pipeline.components.transforms.items()): if transform_proto.spec.urn == common_urns.primitives.GROUP_BY_KEY.urn: for sub_transform in transform_proto.subtransforms: del proto_pipeline.components.transforms[sub_transform] del transform_proto.subtransforms[:] # Preemptively apply combiner lifting, until all runners support it. # This optimization is idempotent. if not options.view_as(StandardOptions).streaming: stages = list( fn_api_runner_transforms.leaf_transform_stages( proto_pipeline.root_transform_ids, proto_pipeline.components)) stages = fn_api_runner_transforms.lift_combiners( stages, fn_api_runner_transforms.TransformContext( proto_pipeline.components)) proto_pipeline = fn_api_runner_transforms.with_stages( proto_pipeline, stages) # TODO: Define URNs for options. # convert int values: https://issues.apache.org/jira/browse/BEAM-5509 p_options = { 'beam:option:' + k + ':v1': (str(v) if type(v) == int else v) for k, v in options.get_all_options().items() if v is not None } channel = grpc.insecure_channel(job_endpoint) grpc.channel_ready_future(channel).result() job_service = beam_job_api_pb2_grpc.JobServiceStub(channel) # Sends the PrepareRequest but retries in case the channel is not ready def send_prepare_request(max_retries=5): num_retries = 0 while True: try: # This reports channel is READY but connections may fail # Seems to be only an issue on Mac with port forwardings grpc.channel_ready_future(channel).result() return job_service.Prepare( beam_job_api_pb2.PrepareJobRequest( job_name='job', pipeline=proto_pipeline, pipeline_options=job_utils.dict_to_struct( p_options))) except grpc._channel._Rendezvous as e: num_retries += 1 if num_retries > max_retries: raise e prepare_response = send_prepare_request() if prepare_response.artifact_staging_endpoint.url: stager = portable_stager.PortableStager( grpc.insecure_channel( prepare_response.artifact_staging_endpoint.url), prepare_response.staging_session_token) retrieval_token, _ = stager.stage_job_resources( options, staging_location='') else: retrieval_token = None try: state_stream = job_service.GetStateStream( beam_job_api_pb2.GetJobStateRequest( job_id=prepare_response.preparation_id)) # If there's an error, we don't always get it until we try to read. # Fortunately, there's always an immediate current state published. state_stream = itertools.chain([next(state_stream)], state_stream) message_stream = job_service.GetMessageStream( beam_job_api_pb2.JobMessagesRequest( job_id=prepare_response.preparation_id)) except Exception: # TODO(BEAM-6442): Unify preparation_id and job_id for all runners. state_stream = message_stream = None # Run the job and wait for a result. run_response = job_service.Run( beam_job_api_pb2.RunJobRequest( preparation_id=prepare_response.preparation_id, retrieval_token=retrieval_token)) if state_stream is None: state_stream = job_service.GetStateStream( beam_job_api_pb2.GetJobStateRequest( job_id=run_response.job_id)) message_stream = job_service.GetMessageStream( beam_job_api_pb2.JobMessagesRequest( job_id=run_response.job_id)) return PipelineResult(job_service, run_response.job_id, message_stream, state_stream, cleanup_callbacks)