def run(self):
    """Execute this pipeline on its runner and return the runner's result.

    When the ``save_main_session`` setup option is enabled, the main session
    is first pickled into a throwaway directory so that pickling problems
    surface before the runner is invoked.
    """
    setup_options = self.options.view_as(SetupOptions)
    if setup_options.save_main_session:
        # Dry-run the pickling; the scratch directory is always removed.
        scratch_dir = tempfile.mkdtemp()
        try:
            session_path = os.path.join(scratch_dir, 'main_session.pickle')
            pickler.dump_session(session_path)
        finally:
            shutil.rmtree(scratch_dir)
    return self.runner.run(self)
def run(self, test_runner_api=True, runner=None, options=None,
        interactive=None):
    """Run the pipeline and return whatever the runner in use returns.

    By default the pipeline's own runner and options are used. A different
    ``runner`` and ``options`` may be supplied together, in which case that
    runner executes the pipeline with those options, similar to calling
    ``runner.run_pipeline(pipeline, options)`` directly.

    Args:
      test_runner_api: When true and ``_verify_runner_api_compatible()``
        passes, the pipeline is round tripped through the runner API and the
        rebuilt pipeline is run instead.
      runner: Optional runner to use in place of ``self.runner``.
      options: Optional options to use in place of ``self._options``.
      interactive: When truthy, stored as ``self.interactive``. Otherwise
        ``self.interactive`` records whether ``self.runner`` is an
        ``InteractiveRunner``.

    Returns:
      The result of ``run_pipeline`` on the runner in use (or of the round
      tripped pipeline's ``run``).

    Raises:
      ValueError: If exactly one of ``runner`` and ``options`` is supplied.
    """
    from apache_beam.runners.interactive import interactive_runner

    # A truthy explicit flag wins; otherwise derive it from the runner type.
    self.interactive = interactive or isinstance(
        self.runner, interactive_runner.InteractiveRunner)

    runner_in_use, options_in_use = self.runner, self._options
    if runner and options:
        runner_in_use, options_in_use = runner, options
    else:
        if not runner and options:
            raise ValueError(
                'Parameter runner is not given when parameter options '
                'is given.')
        if not options and runner:
            raise ValueError(
                'Parameter options is not given when parameter runner '
                'is given.')

    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
        rebuilt = Pipeline.from_runner_api(
            self.to_runner_api(use_fake_coders=True),
            runner_in_use,
            options_in_use)
        return rebuilt.run(test_runner_api=False, interactive=self.interactive)

    type_options = options_in_use.view_as(TypeOptions)
    if type_options.runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.TypeCheckVisitor())

    setup_options = options_in_use.view_as(SetupOptions)
    if setup_options.save_main_session:
        # Verify early that the main session can be pickled.
        scratch_dir = tempfile.mkdtemp()
        try:
            session_path = os.path.join(scratch_dir, 'main_session.pickle')
            pickler.dump_session(session_path)
        finally:
            shutil.rmtree(scratch_dir)

    return runner_in_use.run_pipeline(self, options_in_use)
def run(self, test_runner_api=True):
    """Run the pipeline and return the result produced by its runner.

    Args:
      test_runner_api: When true and ``_verify_runner_api_compatible()``
        passes, the pipeline is rebuilt from its runner API representation
        and that rebuilt pipeline is run (with the round trip disabled).
    """
    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
        rebuilt = Pipeline.from_runner_api(
            self.to_runner_api(), self.runner, self._options)
        return rebuilt.run(False)

    setup_options = self._options.view_as(SetupOptions)
    if setup_options.save_main_session:
        # Verify early that the main session can be pickled.
        scratch_dir = tempfile.mkdtemp()
        try:
            session_path = os.path.join(scratch_dir, 'main_session.pickle')
            pickler.dump_session(session_path)
        finally:
            shutil.rmtree(scratch_dir)

    return self.runner.run_pipeline(self)
def run(self, test_runner_api=True):
    """Run the pipeline and return the result produced by its runner.

    Args:
      test_runner_api: When true and ``_verify_runner_api_compatible()``
        passes, the pipeline is rebuilt from its runner API representation
        and that rebuilt pipeline is run (with the round trip disabled).
    """
    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
        rebuilt = Pipeline.from_runner_api(
            self.to_runner_api(), self.runner, self._options)
        return rebuilt.run(False)

    setup_options = self._options.view_as(SetupOptions)
    if setup_options.save_main_session:
        # Verify early that the main session can be pickled.
        scratch_dir = tempfile.mkdtemp()
        try:
            session_path = os.path.join(scratch_dir, 'main_session.pickle')
            pickler.dump_session(session_path)
        finally:
            shutil.rmtree(scratch_dir)

    return self.runner.run(self)
def run(self, test_runner_api=True):
    """Run the pipeline and return the result produced by its runner.

    Args:
      test_runner_api: When true and ``_verify_runner_api_compatible()``
        passes, the pipeline is rebuilt from its runner API representation
        (using fake coders) and that rebuilt pipeline is run with the round
        trip disabled.
    """
    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
        rebuilt = Pipeline.from_runner_api(
            self.to_runner_api(use_fake_coders=True),
            self.runner,
            self._options)
        return rebuilt.run(False)

    type_options = self._options.view_as(TypeOptions)
    if type_options.runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.TypeCheckVisitor())

    setup_options = self._options.view_as(SetupOptions)
    if setup_options.save_main_session:
        # Verify early that the main session can be pickled.
        scratch_dir = tempfile.mkdtemp()
        try:
            session_path = os.path.join(scratch_dir, 'main_session.pickle')
            pickler.dump_session(session_path)
        finally:
            shutil.rmtree(scratch_dir)

    return self.runner.run_pipeline(self, self._options)
def run(self, test_runner_api=True):
    """Run the pipeline and return the result produced by its runner.

    Args:
      test_runner_api: When true and ``_verify_runner_api_compatible()``
        passes, the pipeline is rebuilt from its runner API representation
        (using fake coders) and that rebuilt pipeline is run with the round
        trip disabled.
    """
    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
        rebuilt = Pipeline.from_runner_api(
            self.to_runner_api(use_fake_coders=True),
            self.runner,
            self._options)
        return rebuilt.run(False)

    type_options = self._options.view_as(TypeOptions)
    if type_options.runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.TypeCheckVisitor())

    setup_options = self._options.view_as(SetupOptions)
    if setup_options.save_main_session:
        # Verify early that the main session can be pickled.
        scratch_dir = tempfile.mkdtemp()
        try:
            session_path = os.path.join(scratch_dir, 'main_session.pickle')
            pickler.dump_session(session_path)
        finally:
            shutil.rmtree(scratch_dir)

    return self.runner.run_pipeline(self)
def stage_job_resources(
        options, file_copy=_dependency_file_copy, build_setup_args=None,
        temp_dir=None,
        populate_requirements_cache=_populate_requirements_cache):
    """Creates (if needed) and stages job resources to options.staging_location.

    Args:
      options: Command line options. More specifically the function will
        expect staging_location, requirements_file, setup_file, and
        save_main_session options to be present.
      file_copy: Callable for copying files. The default version will copy
        from a local file to a GCS location using the gsutil tool available
        in the Google Cloud SDK package.
      build_setup_args: A list of command line arguments used to build a
        setup package. Used only if options.setup_file is not None. Used only
        for testing.
      temp_dir: Temporary folder where the resource building can happen. If
        None then a unique temp directory will be created. Used only for
        testing.
      populate_requirements_cache: Callable for populating the requirements
        cache. Used only for testing.

    Returns:
      A list of file names (no paths) for the resources staged. All the files
      are assumed to be staged in options.staging_location.

    Raises:
      RuntimeError: If files specified are not found or error encountered
        while trying to create the resources (e.g., build a setup package).
    """
    temp_dir = temp_dir or tempfile.mkdtemp()
    resources = []

    google_cloud_options = options.view_as(GoogleCloudOptions)
    setup_options = options.view_as(SetupOptions)
    # Make sure that all required options are specified. There are a few that
    # have defaults to support local running scenarios.
    if google_cloud_options.staging_location is None:
        raise RuntimeError('The --staging_location option must be specified.')
    if google_cloud_options.temp_location is None:
        raise RuntimeError('The --temp_location option must be specified.')

    # Stage a requirements file if present.
    if setup_options.requirements_file is not None:
        if not os.path.isfile(setup_options.requirements_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--requirements_file command line option.'
                % setup_options.requirements_file)
        staged_path = utils.path.join(google_cloud_options.staging_location,
                                      REQUIREMENTS_FILE)
        file_copy(setup_options.requirements_file, staged_path)
        resources.append(REQUIREMENTS_FILE)
        requirements_cache_path = (
            os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
            if setup_options.requirements_cache is None
            else setup_options.requirements_cache)
        # Populate cache with packages from requirements and stage the files
        # in the cache.
        if not os.path.exists(requirements_cache_path):
            os.makedirs(requirements_cache_path)
        populate_requirements_cache(
            setup_options.requirements_file, requirements_cache_path)
        for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
            file_copy(
                pkg,
                utils.path.join(google_cloud_options.staging_location,
                                os.path.basename(pkg)))
            resources.append(os.path.basename(pkg))

    # Handle a setup file if present.
    # We will build the setup package locally and then copy it to the staging
    # location because the staging location is a GCS path and the file cannot
    # be created directly there.
    if setup_options.setup_file is not None:
        if not os.path.isfile(setup_options.setup_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--setup_file command line option.' % setup_options.setup_file)
        if os.path.basename(setup_options.setup_file) != 'setup.py':
            raise RuntimeError(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of %s' % setup_options.setup_file)
        tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                            build_setup_args)
        staged_path = utils.path.join(google_cloud_options.staging_location,
                                      WORKFLOW_TARBALL_FILE)
        file_copy(tarball_file, staged_path)
        resources.append(WORKFLOW_TARBALL_FILE)

    # Handle extra local packages that should be staged.
    if setup_options.extra_packages is not None:
        resources.extend(
            _stage_extra_packages(setup_options.extra_packages,
                                  google_cloud_options.staging_location,
                                  temp_dir=temp_dir, file_copy=file_copy))

    # Pickle the main session if requested.
    # We will create the pickled main session locally and then copy it to the
    # staging location because the staging location is a GCS path and the file
    # cannot be created directly there.
    if setup_options.save_main_session:
        pickled_session_file = os.path.join(temp_dir,
                                            names.PICKLED_MAIN_SESSION_FILE)
        pickler.dump_session(pickled_session_file)
        staged_path = utils.path.join(google_cloud_options.staging_location,
                                      names.PICKLED_MAIN_SESSION_FILE)
        file_copy(pickled_session_file, staged_path)
        resources.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location') and setup_options.sdk_location:
        # 'default' and gs:// / http(s):// locations are fetched remotely;
        # anything else is treated as a local path.
        stage_tarball_from_remote_location = (
            setup_options.sdk_location == 'default'
            or setup_options.sdk_location.startswith(
                ('gs://', 'http://', 'https://')))

        staged_path = utils.path.join(google_cloud_options.staging_location,
                                      names.DATAFLOW_SDK_TARBALL_FILE)
        if stage_tarball_from_remote_location:
            # If --sdk_location is not specified then the appropriate package
            # will be obtained from PyPI (https://pypi.python.org) based on the
            # version of the currently running SDK. If the option is
            # present then no version matching is made and the exact URL or
            # path is expected.
            #
            # Unit tests running in the 'python setup.py test' context will
            # not have the sdk_location attribute present and therefore we
            # will not stage a tarball.
            if setup_options.sdk_location == 'default':
                sdk_remote_location = 'pypi'
            else:
                sdk_remote_location = setup_options.sdk_location
            _stage_dataflow_sdk_tarball(
                sdk_remote_location, staged_path, temp_dir)
            resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
        else:
            # Check if we have a local Dataflow SDK tarball present. This
            # branch is used by tests running with the SDK built at head.
            # NOTE: 'default' is routed to the remote branch above, so the
            # 'default' arms below are not reached with the current routing.
            if setup_options.sdk_location == 'default':
                module_path = os.path.abspath(__file__)
                sdk_path = os.path.join(
                    os.path.dirname(module_path), '..',
                    names.DATAFLOW_SDK_TARBALL_FILE)
            elif os.path.isdir(setup_options.sdk_location):
                sdk_path = os.path.join(
                    setup_options.sdk_location,
                    names.DATAFLOW_SDK_TARBALL_FILE)
            else:
                sdk_path = setup_options.sdk_location
            if os.path.isfile(sdk_path):
                logging.info(
                    'Copying dataflow SDK "%s" to staging location.', sdk_path)
                file_copy(sdk_path, staged_path)
                resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
            else:
                if setup_options.sdk_location == 'default':
                    # Fixed: the path is now interpolated into the message
                    # instead of being passed as a second exception argument.
                    raise RuntimeError(
                        'Cannot find default Dataflow SDK tar file "%s"'
                        % sdk_path)
                else:
                    raise RuntimeError(
                        'The file "%s" cannot be found. Its location was '
                        'specified by the --sdk_location command-line option.'
                        % sdk_path)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    return resources
def stage_job_resources(
        options, file_copy=_dependency_file_copy, build_setup_args=None,
        temp_dir=None,
        populate_requirements_cache=_populate_requirements_cache):
    """For internal use only; no backwards-compatibility guarantees.

    Creates (if needed) and stages job resources to options.staging_location.

    Args:
      options: Command line options. More specifically the function will
        expect staging_location, requirements_file, setup_file, and
        save_main_session options to be present.
      file_copy: Callable for copying files. The default version will copy
        from a local file to a GCS location using the gsutil tool available
        in the Google Cloud SDK package.
      build_setup_args: A list of command line arguments used to build a
        setup package. Used only if options.setup_file is not None. Used only
        for testing.
      temp_dir: Temporary folder where the resource building can happen. If
        None then a unique temp directory will be created. Used only for
        testing.
      populate_requirements_cache: Callable for populating the requirements
        cache. Used only for testing.

    Returns:
      A list of file names (no paths) for the resources staged. All the files
      are assumed to be staged in options.staging_location.

    Raises:
      RuntimeError: If files specified are not found or error encountered
        while trying to create the resources (e.g., build a setup package).
    """
    temp_dir = temp_dir or tempfile.mkdtemp()
    resources = []

    google_cloud_options = options.view_as(GoogleCloudOptions)
    setup_options = options.view_as(SetupOptions)
    # Make sure that all required options are specified. There are a few that
    # have defaults to support local running scenarios.
    if google_cloud_options.staging_location is None:
        raise RuntimeError('The --staging_location option must be specified.')
    if google_cloud_options.temp_location is None:
        raise RuntimeError('The --temp_location option must be specified.')

    # Stage a requirements file if present.
    if setup_options.requirements_file is not None:
        if not os.path.isfile(setup_options.requirements_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--requirements_file command line option.'
                % setup_options.requirements_file)
        staged_path = FileSystems.join(google_cloud_options.staging_location,
                                       REQUIREMENTS_FILE)
        file_copy(setup_options.requirements_file, staged_path)
        resources.append(REQUIREMENTS_FILE)
        requirements_cache_path = (
            os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
            if setup_options.requirements_cache is None
            else setup_options.requirements_cache)
        # Populate cache with packages from requirements and stage the files
        # in the cache.
        if not os.path.exists(requirements_cache_path):
            os.makedirs(requirements_cache_path)
        populate_requirements_cache(setup_options.requirements_file,
                                    requirements_cache_path)
        for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
            file_copy(
                pkg,
                FileSystems.join(google_cloud_options.staging_location,
                                 os.path.basename(pkg)))
            resources.append(os.path.basename(pkg))

    # Handle a setup file if present.
    # We will build the setup package locally and then copy it to the staging
    # location because the staging location is a GCS path and the file cannot
    # be created directly there.
    if setup_options.setup_file is not None:
        if not os.path.isfile(setup_options.setup_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--setup_file command line option.' % setup_options.setup_file)
        if os.path.basename(setup_options.setup_file) != 'setup.py':
            raise RuntimeError(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of %s' % setup_options.setup_file)
        tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                            build_setup_args)
        staged_path = FileSystems.join(google_cloud_options.staging_location,
                                       WORKFLOW_TARBALL_FILE)
        file_copy(tarball_file, staged_path)
        resources.append(WORKFLOW_TARBALL_FILE)

    # Handle extra local packages that should be staged.
    if setup_options.extra_packages is not None:
        resources.extend(
            _stage_extra_packages(setup_options.extra_packages,
                                  google_cloud_options.staging_location,
                                  temp_dir=temp_dir, file_copy=file_copy))

    # Pickle the main session if requested.
    # We will create the pickled main session locally and then copy it to the
    # staging location because the staging location is a GCS path and the file
    # cannot be created directly there.
    if setup_options.save_main_session:
        pickled_session_file = os.path.join(temp_dir,
                                            names.PICKLED_MAIN_SESSION_FILE)
        pickler.dump_session(pickled_session_file)
        staged_path = FileSystems.join(google_cloud_options.staging_location,
                                       names.PICKLED_MAIN_SESSION_FILE)
        file_copy(pickled_session_file, staged_path)
        resources.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location'):
        # 'default' and gs:// / http(s):// locations are fetched remotely;
        # anything else (including an empty value) takes the local branch.
        stage_sdk_from_remote_location = (
            setup_options.sdk_location == 'default'
            or setup_options.sdk_location.startswith(
                ('gs://', 'http://', 'https://')))

        if stage_sdk_from_remote_location:
            # If --sdk_location is not specified then the appropriate package
            # will be obtained from PyPI (https://pypi.python.org) based on the
            # version of the currently running SDK. If the option is
            # present then no version matching is made and the exact URL or
            # path is expected.
            #
            # Unit tests running in the 'python setup.py test' context will
            # not have the sdk_location attribute present and therefore we
            # will not stage SDK.
            if setup_options.sdk_location == 'default':
                sdk_remote_location = 'pypi'
            else:
                sdk_remote_location = setup_options.sdk_location
            resources.extend(
                _stage_beam_sdk(sdk_remote_location,
                                google_cloud_options.staging_location,
                                temp_dir))
        else:
            # This branch is also used by internal tests running with the SDK
            # built at head.
            # NOTE: 'default' is routed to the remote branch above, so the
            # 'default' arms below are not reached with the current routing.
            if setup_options.sdk_location == 'default':
                module_path = os.path.abspath(__file__)
                sdk_path = os.path.join(
                    os.path.dirname(module_path), '..', '..', '..',
                    names.DATAFLOW_SDK_TARBALL_FILE)
            elif os.path.isdir(setup_options.sdk_location):
                sdk_path = os.path.join(setup_options.sdk_location,
                                        names.DATAFLOW_SDK_TARBALL_FILE)
            else:
                sdk_path = setup_options.sdk_location
            if os.path.isfile(sdk_path):
                logging.info(
                    'Copying Beam SDK "%s" to staging location.', sdk_path)
                staged_path = FileSystems.join(
                    google_cloud_options.staging_location,
                    _desired_sdk_filename_in_staging_location(
                        setup_options.sdk_location))
                file_copy(sdk_path, staged_path)
                _, sdk_staged_filename = FileSystems.split(staged_path)
                resources.append(sdk_staged_filename)
            else:
                if setup_options.sdk_location == 'default':
                    # Fixed: the path is now interpolated into the message
                    # instead of being passed as a second exception argument.
                    raise RuntimeError(
                        'Cannot find default Beam SDK tar file "%s"'
                        % sdk_path)
                elif not setup_options.sdk_location:
                    logging.info(
                        'Beam SDK will not be staged since --sdk_location '
                        'is empty.')
                else:
                    raise RuntimeError(
                        'The file "%s" cannot be found. Its location was '
                        'specified by the --sdk_location command-line option.'
                        % sdk_path)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    return resources
def stage_job_resources(self,
                        options,
                        build_setup_args=None,
                        temp_dir=None,
                        populate_requirements_cache=None,
                        staging_location=None):
    """For internal use only; no backwards-compatibility guarantees.

    Builds (where needed) the job's resources and stages each one under
    ``staging_location`` through ``self.stage_artifact``.

    Args:
      options: Command line options. The requirements_file, setup_file,
        extra_packages and save_main_session setup options are read, along
        with the 'jar_packages' experiment and the worker options.
      build_setup_args: A list of command line arguments used to build a
        setup package. Used only if options.setup_file is not None. Used only
        for testing.
      temp_dir: Temporary folder where the resource building can happen. If
        None then a unique temp directory will be created. Used only for
        testing.
      populate_requirements_cache: Callable for populating the requirements
        cache; ``Stager._populate_requirements_cache`` is used when falsy.
        Used only for testing.
      staging_location: Location to stage the files.

    Returns:
      A ``(retrieval_token, resources)`` tuple: the value returned by
      ``self.commit_manifest()`` and the list of file names (no paths) of the
      resources staged at staging_location.

    Raises:
      RuntimeError: If staging_location is None, if files specified are not
        found, or if an error is encountered while trying to create the
        resources (e.g., build a setup package).
    """
    temp_dir = temp_dir or tempfile.mkdtemp()
    staged_names = []

    setup_options = options.view_as(SetupOptions)

    # The staging location is the one mandatory input.
    if staging_location is None:
        raise RuntimeError('The staging_location must be specified.')

    # Requirements file plus every package found in the requirements cache.
    requirements_file = setup_options.requirements_file
    if requirements_file is not None:
        if not os.path.isfile(requirements_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--requirements_file command line option.' % requirements_file)
        self.stage_artifact(
            requirements_file,
            FileSystems.join(staging_location, REQUIREMENTS_FILE))
        staged_names.append(REQUIREMENTS_FILE)

        cache_dir = setup_options.requirements_cache
        if cache_dir is None:
            cache_dir = os.path.join(tempfile.gettempdir(),
                                     'dataflow-requirements-cache')
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        populate = (populate_requirements_cache
                    or Stager._populate_requirements_cache)
        populate(requirements_file, cache_dir)
        for cached_pkg in glob.glob(os.path.join(cache_dir, '*')):
            pkg_name = os.path.basename(cached_pkg)
            self.stage_artifact(
                cached_pkg, FileSystems.join(staging_location, pkg_name))
            staged_names.append(pkg_name)

    # Setup file: the package is built locally and then copied, because the
    # staging location is a remote path and the file cannot be created
    # directly there.
    setup_file = setup_options.setup_file
    if setup_file is not None:
        if not os.path.isfile(setup_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--setup_file command line option.' % setup_file)
        if os.path.basename(setup_file) != 'setup.py':
            raise RuntimeError(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of %s' % setup_file)
        tarball_file = Stager._build_setup_package(
            setup_file, temp_dir, build_setup_args)
        self.stage_artifact(
            tarball_file,
            FileSystems.join(staging_location, WORKFLOW_TARBALL_FILE))
        staged_names.append(WORKFLOW_TARBALL_FILE)

    # Extra local packages.
    if setup_options.extra_packages is not None:
        extra_staged = self._stage_extra_packages(
            setup_options.extra_packages, staging_location, temp_dir=temp_dir)
        staged_names.extend(extra_staged)

    # Jar packages that should be staged for Java SDK Harness.
    jar_packages = options.view_as(DebugOptions).lookup_experiment(
        'jar_packages')
    if jar_packages is not None:
        jars_staged = self._stage_jar_packages(
            jar_packages.split(':'), staging_location, temp_dir=temp_dir)
        staged_names.extend(jars_staged)

    # Pickled main session, created locally first for the same reason as the
    # setup package.
    if setup_options.save_main_session:
        session_file = os.path.join(temp_dir, names.PICKLED_MAIN_SESSION_FILE)
        pickler.dump_session(session_file)
        self.stage_artifact(
            session_file,
            FileSystems.join(staging_location,
                             names.PICKLED_MAIN_SESSION_FILE))
        staged_names.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location'):
        sdk_location = setup_options.sdk_location
        is_default = sdk_location == 'default'
        if is_default or Stager._is_remote_path(sdk_location):
            # If --sdk_location is not specified then the appropriate package
            # will be obtained from PyPI (https://pypi.python.org) based on the
            # version of the currently running SDK. If the option is
            # present then no version matching is made and the exact URL or
            # path is expected.
            #
            # Unit tests running in the 'python setup.py test' context will
            # not have the sdk_location attribute present and therefore we
            # will not stage SDK.
            sdk_remote_location = 'pypi' if is_default else sdk_location
            staged_names.extend(
                self._stage_beam_sdk(
                    sdk_remote_location, staging_location, temp_dir))
        elif sdk_location != 'container':
            # 'container' means the SDK built into the container is used, so
            # nothing is re-staged. Any other value is a local path; this
            # branch is also used by internal tests running with the SDK
            # built at head.
            if os.path.isdir(sdk_location):
                # TODO(angoenka): remove reference to Dataflow
                sdk_path = os.path.join(sdk_location,
                                        DATAFLOW_SDK_TARBALL_FILE)
            else:
                sdk_path = sdk_location

            if os.path.isfile(sdk_path):
                logging.info(
                    'Copying Beam SDK "%s" to staging location.', sdk_path)
                staged_path = FileSystems.join(
                    staging_location,
                    Stager._desired_sdk_filename_in_staging_location(
                        sdk_location))
                self.stage_artifact(sdk_path, staged_path)
                _, sdk_staged_filename = FileSystems.split(staged_path)
                staged_names.append(sdk_staged_filename)
            elif is_default:
                raise RuntimeError(
                    'Cannot find default Beam SDK tar file "%s"' % sdk_path)
            elif not sdk_location:
                logging.info(
                    'Beam SDK will not be staged since --sdk_location '
                    'is empty.')
            else:
                raise RuntimeError(
                    'The file "%s" cannot be found. Its location was specified '
                    'by the --sdk_location command-line option.' % sdk_path)

    # Optional Dataflow worker jar.
    worker_options = options.view_as(WorkerOptions)
    dataflow_worker_jar = getattr(worker_options, 'dataflow_worker_jar', None)
    if dataflow_worker_jar is not None:
        jar_staged_filename = 'dataflow-worker.jar'
        self.stage_artifact(
            dataflow_worker_jar,
            FileSystems.join(staging_location, jar_staged_filename))
        staged_names.append(jar_staged_filename)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    retrieval_token = self.commit_manifest()
    return retrieval_token, staged_names
def stage_job_resources(self,
                        options,
                        build_setup_args=None,
                        temp_dir=None,
                        populate_requirements_cache=None,
                        staging_location=None):
    """For internal use only; no backwards-compatibility guarantees.

    Builds (where needed) the job's resources and stages each one under
    ``staging_location`` through ``self.stage_artifact``, then calls
    ``self.commit_manifest()``.

    Args:
      options: Command line options. The requirements_file, setup_file,
        extra_packages and save_main_session setup options are read.
      build_setup_args: A list of command line arguments used to build a
        setup package. Used only if options.setup_file is not None. Used only
        for testing.
      temp_dir: Temporary folder where the resource building can happen. If
        None then a unique temp directory will be created. Used only for
        testing.
      populate_requirements_cache: Callable for populating the requirements
        cache; ``Stager._populate_requirements_cache`` is used when falsy.
        Used only for testing.
      staging_location: Location to stage the files.

    Returns:
      A list of file names (no paths) for the resources staged. All the files
      are assumed to be staged at staging_location.

    Raises:
      RuntimeError: If staging_location is None, if files specified are not
        found, or if an error is encountered while trying to create the
        resources (e.g., build a setup package).
    """
    temp_dir = temp_dir or tempfile.mkdtemp()
    staged = []

    setup_options = options.view_as(SetupOptions)

    # The staging location is the one mandatory input.
    if staging_location is None:
        raise RuntimeError('The staging_location must be specified.')

    # Requirements file plus every package found in the requirements cache.
    req_file = setup_options.requirements_file
    if req_file is not None:
        if not os.path.isfile(req_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--requirements_file command line option.' % req_file)
        req_target = FileSystems.join(staging_location, REQUIREMENTS_FILE)
        self.stage_artifact(req_file, req_target)
        staged.append(REQUIREMENTS_FILE)

        if setup_options.requirements_cache is None:
            cache_path = os.path.join(tempfile.gettempdir(),
                                      'dataflow-requirements-cache')
        else:
            cache_path = setup_options.requirements_cache
        if not os.path.exists(cache_path):
            os.makedirs(cache_path)
        fill_cache = (populate_requirements_cache
                      or Stager._populate_requirements_cache)
        fill_cache(req_file, cache_path)
        for pkg_path in glob.glob(os.path.join(cache_path, '*')):
            base_name = os.path.basename(pkg_path)
            self.stage_artifact(
                pkg_path, FileSystems.join(staging_location, base_name))
            staged.append(base_name)

    # Setup file: the package is built locally and then copied, because the
    # staging location is a remote path and the file cannot be created
    # directly there.
    setup_py = setup_options.setup_file
    if setup_py is not None:
        if not os.path.isfile(setup_py):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--setup_file command line option.' % setup_py)
        if os.path.basename(setup_py) != 'setup.py':
            raise RuntimeError(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of %s' % setup_py)
        built_tarball = Stager._build_setup_package(
            setup_py, temp_dir, build_setup_args)
        tarball_target = FileSystems.join(staging_location,
                                          WORKFLOW_TARBALL_FILE)
        self.stage_artifact(built_tarball, tarball_target)
        staged.append(WORKFLOW_TARBALL_FILE)

    # Extra local packages.
    if setup_options.extra_packages is not None:
        extras = self._stage_extra_packages(
            setup_options.extra_packages, staging_location, temp_dir=temp_dir)
        staged.extend(extras)

    # Pickled main session, created locally first for the same reason as the
    # setup package.
    if setup_options.save_main_session:
        local_session = os.path.join(temp_dir,
                                     names.PICKLED_MAIN_SESSION_FILE)
        pickler.dump_session(local_session)
        session_target = FileSystems.join(staging_location,
                                          names.PICKLED_MAIN_SESSION_FILE)
        self.stage_artifact(local_session, session_target)
        staged.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location'):
        sdk_location = setup_options.sdk_location
        wants_default = sdk_location == 'default'
        if wants_default or Stager._is_remote_path(sdk_location):
            # If --sdk_location is not specified then the appropriate package
            # will be obtained from PyPI (https://pypi.python.org) based on the
            # version of the currently running SDK. If the option is
            # present then no version matching is made and the exact URL or
            # path is expected.
            #
            # Unit tests running in the 'python setup.py test' context will
            # not have the sdk_location attribute present and therefore we
            # will not stage SDK.
            if wants_default:
                sdk_remote_location = 'pypi'
            else:
                sdk_remote_location = sdk_location
            staged.extend(
                self._stage_beam_sdk(
                    sdk_remote_location, staging_location, temp_dir))
        else:
            # This branch is also used by internal tests running with the SDK
            # built at head.
            sdk_path = sdk_location
            if os.path.isdir(sdk_location):
                # TODO(angoenka): remove reference to Dataflow
                sdk_path = os.path.join(sdk_location,
                                        names.DATAFLOW_SDK_TARBALL_FILE)

            if os.path.isfile(sdk_path):
                logging.info(
                    'Copying Beam SDK "%s" to staging location.', sdk_path)
                sdk_target = FileSystems.join(
                    staging_location,
                    Stager._desired_sdk_filename_in_staging_location(
                        sdk_location))
                self.stage_artifact(sdk_path, sdk_target)
                _, sdk_staged_filename = FileSystems.split(sdk_target)
                staged.append(sdk_staged_filename)
            elif wants_default:
                raise RuntimeError(
                    'Cannot find default Beam SDK tar file "%s"' % sdk_path)
            elif not sdk_location:
                logging.info('Beam SDK will not be staged since --sdk_location '
                             'is empty.')
            else:
                raise RuntimeError(
                    'The file "%s" cannot be found. Its location was specified '
                    'by the --sdk_location command-line option.' % sdk_path)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    self.commit_manifest()
    return staged
def create_job_resources(options,  # type: PipelineOptions
                         temp_dir,  # type: str
                         build_setup_args=None,  # type: Optional[List[str]]
                         populate_requirements_cache=None,  # type: Optional[str]
                         skip_prestaged_dependencies=False,  # type: Optional[bool]
                         ):
  """For internal use only; no backwards-compatibility guarantees.

  Creates (if needed) a list of job resources.

  Nothing is copied to a staging location here; the function only builds
  local artifacts where needed and returns the (local path, staged name)
  pairs describing them.

  Args:
    options: Command line options. More specifically the function will
      expect requirements_file, setup_file, and save_main_session options to
      be present.
    temp_dir: Folder where the resource building can happen (setup package,
      extra packages, SDK package, jar packages, pickled main session). It
      is used as given: this function does not create a directory when None
      is passed and does not delete the folder afterwards.
    build_setup_args: A list of command line arguments used to build a
      setup package. Used only if options.setup_file is not None. Used only
      for testing.
    populate_requirements_cache: Callable for populating the requirements
      cache; it is called with the requirements file path and the cache
      directory. Defaults to Stager._populate_requirements_cache. Used only
      for testing. NOTE(review): the type comment on the signature says
      Optional[str], which looks stale since the value is called.
    skip_prestaged_dependencies: Skip staging dependencies that can be
      added into SDK containers during prebuilding.

  Returns:
    A list of tuples of local file paths and file names (no paths) to be
    used for staging resources.

  Raises:
    RuntimeError: If files specified are not found or error encountered
      while trying to create the resources (e.g., build a setup package).
  """
  resources = []  # type: List[Tuple[str, str]]

  setup_options = options.view_as(SetupOptions)

  # We can skip boot dependencies: apache beam sdk, python packages from
  # requirements.txt, python packages from extra_packages and workflow tarball
  # if we know we are using a dependency pre-installed sdk container image.
  if not skip_prestaged_dependencies:
    # Stage a requirements file if present.
    if setup_options.requirements_file is not None:
      if not os.path.isfile(setup_options.requirements_file):
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--requirements_file command line option.' %
            setup_options.requirements_file)
      resources.append((setup_options.requirements_file, REQUIREMENTS_FILE))
      requirements_cache_path = (
          os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
          if setup_options.requirements_cache is None else
          setup_options.requirements_cache)
      # Populate cache with packages from requirements and stage the files
      # in the cache.
      if not os.path.exists(requirements_cache_path):
        os.makedirs(requirements_cache_path)
      populate_cache = (
          populate_requirements_cache or Stager._populate_requirements_cache)
      populate_cache(setup_options.requirements_file, requirements_cache_path)
      # Every entry found in the cache directory is staged, not only the
      # ones produced by this invocation.
      for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
        resources.append((pkg, os.path.basename(pkg)))

    # Handle a setup file if present.
    # We will build the setup package locally and then copy it to the staging
    # location because the staging location is a remote path and the file
    # cannot be created directly there.
    if setup_options.setup_file is not None:
      if not os.path.isfile(setup_options.setup_file):
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--setup_file command line option.' % setup_options.setup_file)
      if os.path.basename(setup_options.setup_file) != 'setup.py':
        raise RuntimeError(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of %s' % setup_options.setup_file)
      tarball_file = Stager._build_setup_package(
          setup_options.setup_file, temp_dir, build_setup_args)
      resources.append((tarball_file, WORKFLOW_TARBALL_FILE))

    # Handle extra local packages that should be staged.
    if setup_options.extra_packages is not None:
      resources.extend(
          Stager._create_extra_packages(
              setup_options.extra_packages, temp_dir=temp_dir))

    if hasattr(setup_options, 'sdk_location'):
      if (setup_options.sdk_location == 'default') or Stager._is_remote_path(
          setup_options.sdk_location):
        # If --sdk_location is not specified then the appropriate package
        # will be obtained from PyPI (https://pypi.python.org) based on the
        # version of the currently running SDK. If the option is
        # present then no version matching is made and the exact URL or path
        # is expected.
        #
        # Unit tests running in the 'python setup.py test' context will
        # not have the sdk_location attribute present and therefore we
        # will not stage SDK.
        sdk_remote_location = 'pypi' if (
            setup_options.sdk_location == 'default'
        ) else setup_options.sdk_location
        resources.extend(
            Stager._create_beam_sdk(sdk_remote_location, temp_dir))
      elif setup_options.sdk_location == 'container':
        # Use the SDK that's built into the container, rather than re-staging
        # it.
        pass
      else:
        # This branch is also used by internal tests running with the SDK
        # built at head.
        if os.path.isdir(setup_options.sdk_location):
          # NOTE(review): for a directory this looks for
          # WORKFLOW_TARBALL_FILE, the same file name the setup package is
          # registered under above. Confirm that this is the intended SDK
          # tarball name.
          sdk_path = os.path.join(
              setup_options.sdk_location, WORKFLOW_TARBALL_FILE)
        else:
          sdk_path = setup_options.sdk_location

        # 'default' never reaches this point (it is handled by the first
        # branch above), so only an empty location or a missing file remain
        # as fallbacks.
        if os.path.isfile(sdk_path):
          _LOGGER.info('Copying Beam SDK "%s" to staging location.', sdk_path)
          resources.append((
              sdk_path,
              Stager._desired_sdk_filename_in_staging_location(
                  setup_options.sdk_location)))
        elif not setup_options.sdk_location:
          _LOGGER.info(
              'Beam SDK will not be staged since --sdk_location '
              'is empty.')
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified '
              'by the --sdk_location command-line option.' % sdk_path)

  # The following artifacts are not processed by python sdk container boot
  # sequence in a setup mode and hence should not be skipped even if a
  # prebuilt sdk container image is used.

  # TODO(heejong): remove jar_packages experimental flag when cross-language
  #   dependency management is implemented for all runners.
  # Handle jar packages that should be staged for Java SDK Harness.
  jar_packages = options.view_as(DebugOptions).lookup_experiment(
      'jar_packages')
  if jar_packages is not None:
    resources.extend(
        Stager._create_jar_packages(
            jar_packages.split(','), temp_dir=temp_dir))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a remote path and the
  # file cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(
        temp_dir, names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    resources.append((pickled_session_file, names.PICKLED_MAIN_SESSION_FILE))

  # The worker jar path is taken from the options as-is; its existence is
  # not checked here.
  worker_options = options.view_as(WorkerOptions)
  dataflow_worker_jar = getattr(worker_options, 'dataflow_worker_jar', None)
  if dataflow_worker_jar is not None:
    jar_staged_filename = 'dataflow-worker.jar'
    resources.append((dataflow_worker_jar, jar_staged_filename))

  return resources