def test_full_completion(self):
  # Create dummy file and close it. Note that we need to do this because
  # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
  # before the temporary file is closed.
  dummy_file = tempfile.NamedTemporaryFile(delete=False)
  dummy_file_name = dummy_file.name
  dummy_file.close()
  dummy_dir = tempfile.mkdtemp()

  remote_runner = DataflowRunner()
  pipeline = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--sdk_location=' + dummy_file_name,
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=' + dummy_dir,
          '--temp_location=/dev/null',
          '--template_location=' + dummy_file_name,
          '--no_auth=True'
      ]))

  pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  pipeline.run().wait_until_finish()

  with open(dummy_file_name) as template_file:
    saved_job_dict = json.load(template_file)
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']['options']
        ['project'],
        'test-project')
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']['options']
        ['job_name'],
        'test-job')

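# The Windows caveat in the comment above is general tempfile behaviour, not
# Beam-specific: on Windows, a NamedTemporaryFile cannot be reopened by name
# while the original handle is still open. A minimal sketch of the
# create-close-reopen pattern the test relies on (illustrative, not from the
# source):
def _tempfile_reopen_sketch():
  import os
  import tempfile

  # Create with delete=False, close, then reopen by name. Reopening while the
  # first handle is still open fails on Windows.
  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp.write(b'payload')
  tmp.close()

  with open(tmp.name, 'rb') as reopened:
    assert reopened.read() == b'payload'

  os.remove(tmp.name)  # delete=False leaves the file behind, so clean up.
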
def test_bad_path(self):
  dummy_sdk_file = tempfile.NamedTemporaryFile()
  remote_runner = DataflowRunner()
  pipeline = Pipeline(
      remote_runner,
      options=PipelineOptions([
          '--dataflow_endpoint=ignored',
          '--sdk_location=' + dummy_sdk_file.name,
          '--job_name=test-job',
          '--project=test-project',
          '--staging_location=ignored',
          '--temp_location=/dev/null',
          '--template_location=/bad/path',
          '--no_auth=True'
      ]))
  remote_runner.job = apiclient.Job(pipeline._options)
  with self.assertRaises(IOError):
    pipeline.run().wait_until_finish()

def run_pipeline(
    self,
    pipeline,  # type: Pipeline
    options  # type: pipeline_options.PipelineOptions
):
  # type: (...) -> RunnerResult
  RuntimeValueProvider.set_runtime_options({})

  # Enable the "beam_fn_api" experiment if it is not already set.
  experiments = (
      options.view_as(pipeline_options.DebugOptions).experiments or [])
  if 'beam_fn_api' not in experiments:
    experiments.append('beam_fn_api')
  options.view_as(pipeline_options.DebugOptions).experiments = experiments

  # This is sometimes needed if type checking is disabled
  # to enforce that the inputs (and outputs) of GroupByKey operations
  # are known to be KVs.
  from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
  # TODO: Move group_by_key_input_visitor() to a non-dataflow-specific file.
  pipeline.visit(
      DataflowRunner.group_by_key_input_visitor(
          not options.view_as(
              pipeline_options.TypeOptions).allow_non_deterministic_key_coders))

  self._bundle_repeat = self._bundle_repeat or options.view_as(
      pipeline_options.DirectOptions).direct_runner_bundle_repeat

  pipeline_direct_num_workers = options.view_as(
      pipeline_options.DirectOptions).direct_num_workers
  if pipeline_direct_num_workers == 0:
    self._num_workers = multiprocessing.cpu_count()
  else:
    self._num_workers = pipeline_direct_num_workers or self._num_workers

  # Set the direct workers' running mode if it is defined in the pipeline
  # options.
  running_mode = options.view_as(
      pipeline_options.DirectOptions).direct_running_mode
  if running_mode == 'multi_threading':
    self._default_environment = environments.EmbeddedPythonGrpcEnvironment()
  elif running_mode == 'multi_processing':
    command_string = (
        '%s -m apache_beam.runners.worker.sdk_worker_main' % sys.executable)
    self._default_environment = environments.SubprocessSDKEnvironment(
        command_string=command_string)

  self._profiler_factory = Profile.factory_from_options(
      options.view_as(pipeline_options.ProfilingOptions))

  self._latest_run_result = self.run_via_runner_api(
      pipeline.to_runner_api(default_environment=self._default_environment))
  return self._latest_run_result

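# For context, a hedged sketch of how a user pipeline might drive the
# DirectOptions read by run_pipeline() above. The flags are the standard Beam
# DirectOptions flags; the pipeline itself is illustrative only.
def _direct_options_sketch():
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  # direct_num_workers=0 makes run_pipeline() size the worker pool from
  # multiprocessing.cpu_count(); direct_running_mode selects between the
  # threaded and subprocess-based worker environments handled above.
  options = PipelineOptions([
      '--direct_num_workers=0',
      '--direct_running_mode=multi_processing',
  ])

  with beam.Pipeline(options=options) as pipeline:
    _ = pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2)
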
def run_pipeline(self, pipeline, options):
  MetricsEnvironment.set_metrics_supported(False)
  RuntimeValueProvider.set_runtime_options({})

  # This is sometimes needed if type checking is disabled
  # to enforce that the inputs (and outputs) of GroupByKey operations
  # are known to be KVs.
  from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
  pipeline.visit(DataflowRunner.group_by_key_input_visitor())

  self._bundle_repeat = self._bundle_repeat or options.view_as(
      pipeline_options.DirectOptions).direct_runner_bundle_repeat
  self._profiler_factory = profiler.Profile.factory_from_options(
      options.view_as(pipeline_options.ProfilingOptions))

  return self.run_via_runner_api(
      pipeline.to_runner_api(default_environment=self._default_environment))

def create_streaming_job(
    service_name,
    service_id,
    project_name,
    region,
    image_uri,
    setup_file_path=DEFAULT_SETUP_FILE_PATH,
    temporary_files_location=DEFAULT_DATAFLOW_TEMPORARY_FILES_LOCATION,
    service_account_email=None,
    worker_machine_type=None,
    maximum_instances=None,
    update=False,
    extra_options=None,
):
    """Deploy an `octue` service as a streaming Google Dataflow Prime job.

    :param str service_name: the name to give the Dataflow job
    :param str service_id: the Pub/Sub topic name for the Dataflow job to subscribe to
    :param str project_name: the name of the project to deploy the job to
    :param str region: the region to deploy the job to
    :param str image_uri: the URI of the `apache-beam`-based Docker image to use for the job
    :param str setup_file_path: path to the python `setup.py` file to use for the job
    :param str temporary_files_location: a Google Cloud Storage path to save temporary files from the job at
    :param str|None service_account_email: the email of the service account to run the Dataflow VMs as
    :param str|None worker_machine_type: the machine type to create Dataflow worker VMs as. See https://cloud.google.com/compute/docs/machine-types for a list of valid options. If not set, the Dataflow service will choose a reasonable default.
    :param int|None maximum_instances: the maximum number of workers to use when executing the Dataflow job
    :param bool update: if `True`, update the existing job with the same name
    :param dict|None extra_options: any further arguments to be passed to Apache Beam as pipeline options
    :raise DeploymentError: if a Dataflow job with the service name already exists
    :return None:
    """
    pipeline_options = {
        "project": project_name,
        "region": region,
        "temp_location": temporary_files_location,
        "job_name": service_name,
        "sdk_container_image": image_uri,
        "setup_file": os.path.abspath(setup_file_path),
        "update": update,
        "streaming": True,
        **(extra_options or {}),
    }

    if service_account_email:
        pipeline_options["service_account_email"] = service_account_email

    if worker_machine_type:
        pipeline_options["worker_machine_type"] = worker_machine_type
    else:
        # Dataflow Prime can only be used if a worker machine type is not specified.
        pipeline_options["dataflow_service_options"] = ["enable_prime"]

    if maximum_instances:
        pipeline_options["max_num_workers"] = maximum_instances

    pipeline_options = PipelineOptions.from_dictionary(pipeline_options)
    pipeline = apache_beam.Pipeline(options=pipeline_options)

    service_topic = Topic(
        name=service_id,
        namespace=OCTUE_NAMESPACE,
        service=Service(backend=GCPPubSubBackend(project_name=project_name)),
    )
    service_topic.create(allow_existing=True)

    (
        pipeline
        | "Read from Pub/Sub" >> apache_beam.io.ReadFromPubSub(topic=service_topic.path, with_attributes=True)
        | "Answer question" >> apache_beam.Map(lambda question: answer_question(question, project_name=project_name))
    )

    try:
        DataflowRunner().run_pipeline(pipeline, options=pipeline_options)
    except DataflowJobAlreadyExistsError:
        raise DeploymentError(f"A Dataflow job with name {service_name!r} already exists.") from None

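# A hypothetical invocation of create_streaming_job(); every value below is a
# placeholder for illustration, not taken from the source. Omitting
# worker_machine_type exercises the branch above that enables Dataflow Prime
# via the "enable_prime" service option.
create_streaming_job(
    service_name="example-service",
    service_id="example-org.example-service",
    project_name="example-project",
    region="europe-west2",
    image_uri="eu.gcr.io/example-project/example-service:latest",
    maximum_instances=5,
)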