Example #1
 def run(self):
   """Runs the pipeline. Returns whatever our runner returns after running."""
   if self.options.view_as(SetupOptions).save_main_session:
     # If this option is chosen, verify we can pickle the main session early.
     tmpdir = tempfile.mkdtemp()
     try:
       pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
     finally:
       shutil.rmtree(tmpdir)
   return self.runner.run(self)
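A minimal usage sketch (not part of the snippet above) showing the save_main_session flag that this run() checks; it assumes a standard apache_beam install, and the pipeline contents are placeholders.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

options = PipelineOptions()
# Ask the SDK to pickle the __main__ namespace for the workers; this is the
# flag the save_main_session check above verifies early.
options.view_as(SetupOptions).save_main_session = True

with beam.Pipeline(options=options) as p:
  (p
   | beam.Create(['a', 'b', 'c'])
   | beam.Map(str.upper))
# Leaving the `with` block calls p.run() and waits for the pipeline to finish.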
Example #2
    def run(self,
            test_runner_api=True,
            runner=None,
            options=None,
            interactive=None):
        """Runs the pipeline. Returns whatever our runner returns after running.

        If another runner instance and options are provided, that runner will
        execute the pipeline with the given options. If either of them is not set,
        a ValueError is raised. The usage is similar to directly invoking
        `runner.run_pipeline(pipeline, options)`.
        Additionally, an interactive field can be set to override the pipeline's
        self.interactive field to mark current pipeline as being initiated from an
        interactive environment.
        """
        from apache_beam.runners.interactive import interactive_runner
        if interactive:
            self.interactive = interactive
        elif isinstance(self.runner, interactive_runner.InteractiveRunner):
            self.interactive = True
        else:
            self.interactive = False
        runner_in_use = self.runner
        options_in_use = self._options
        if runner and options:
            runner_in_use = runner
            options_in_use = options
        elif not runner and options:
            raise ValueError(
                'Parameter runner is not given when parameter options '
                'is given.')
        elif not options and runner:
            raise ValueError(
                'Parameter options is not given when parameter runner '
                'is given.')
        # When possible, invoke a round trip through the runner API.
        if test_runner_api and self._verify_runner_api_compatible():
            return Pipeline.from_runner_api(
                self.to_runner_api(use_fake_coders=True), runner_in_use,
                options_in_use).run(test_runner_api=False,
                                    interactive=self.interactive)

        if options_in_use.view_as(TypeOptions).runtime_type_check:
            from apache_beam.typehints import typecheck
            self.visit(typecheck.TypeCheckVisitor())

        if options_in_use.view_as(SetupOptions).save_main_session:
            # If this option is chosen, verify we can pickle the main session early.
            tmpdir = tempfile.mkdtemp()
            try:
                pickler.dump_session(
                    os.path.join(tmpdir, 'main_session.pickle'))
            finally:
                shutil.rmtree(tmpdir)
        return runner_in_use.run_pipeline(self, options_in_use)
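A hedged sketch of calling this forked signature; upstream Pipeline.run() does not accept runner, options, or interactive arguments, so the call below only applies to the variant shown here. DirectRunner and PipelineOptions are standard Beam classes; the pipeline itself is a placeholder.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.direct.direct_runner import DirectRunner

pipeline = beam.Pipeline()
_ = pipeline | beam.Create([1, 2, 3]) | beam.Map(print)

# runner and options must be passed together, otherwise run() raises
# ValueError as implemented above.
result = pipeline.run(runner=DirectRunner(),
                      options=PipelineOptions(),
                      interactive=False)
result.wait_until_finish()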
Example #3
  def run(self, test_runner_api=True):
    """Runs the pipeline. Returns whatever our runner returns after running."""

    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
      return Pipeline.from_runner_api(
          self.to_runner_api(), self.runner, self._options).run(False)

    if self._options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        shutil.rmtree(tmpdir)
    return self.runner.run_pipeline(self)
Example #4
  def run(self, test_runner_api=True):
    """Runs the pipeline. Returns whatever our runner returns after running."""

    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
      return Pipeline.from_runner_api(
          self.to_runner_api(), self.runner, self._options).run(False)

    if self._options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        shutil.rmtree(tmpdir)
    return self.runner.run(self)
Example #5
    def run(self, test_runner_api=True):
        """Runs the pipeline. Returns whatever our runner returns after running."""

        # When possible, invoke a round trip through the runner API.
        if test_runner_api and self._verify_runner_api_compatible():
            return Pipeline.from_runner_api(
                self.to_runner_api(use_fake_coders=True), self.runner,
                self._options).run(False)

        if self._options.view_as(TypeOptions).runtime_type_check:
            from apache_beam.typehints import typecheck
            self.visit(typecheck.TypeCheckVisitor())

        if self._options.view_as(SetupOptions).save_main_session:
            # If this option is chosen, verify we can pickle the main session early.
            tmpdir = tempfile.mkdtemp()
            try:
                pickler.dump_session(
                    os.path.join(tmpdir, 'main_session.pickle'))
            finally:
                shutil.rmtree(tmpdir)
        return self.runner.run_pipeline(self, self._options)
Example #6
  def run(self, test_runner_api=True):
    """Runs the pipeline. Returns whatever our runner returns after running."""

    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
      return Pipeline.from_runner_api(
          self.to_runner_api(use_fake_coders=True),
          self.runner,
          self._options).run(False)

    if self._options.view_as(TypeOptions).runtime_type_check:
      from apache_beam.typehints import typecheck
      self.visit(typecheck.TypeCheckVisitor())

    if self._options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        shutil.rmtree(tmpdir)
    return self.runner.run_pipeline(self)
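The variants above also consult TypeOptions.runtime_type_check before running. A short sketch of turning that flag on, either programmatically or via an argv-style flag (both forms use standard Beam options handling):

from apache_beam.options.pipeline_options import PipelineOptions, TypeOptions

# Programmatic form: flip the flag on an options view.
options = PipelineOptions()
options.view_as(TypeOptions).runtime_type_check = True

# Flag form: equivalent to passing --runtime_type_check on the command line.
options = PipelineOptions(['--runtime_type_check'])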
Example #7
def stage_job_resources(
    options, file_copy=_dependency_file_copy, build_setup_args=None,
    temp_dir=None, populate_requirements_cache=_populate_requirements_cache):
  """Creates (if needed) and stages job resources to options.staging_location.

  Args:
    options: Command line options. More specifically the function will expect
      staging_location, requirements_file, setup_file, and save_main_session
      options to be present.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.
    build_setup_args: A list of command line arguments used to build a setup
      package. Used only if options.setup_file is not None. Used only for
      testing.
    temp_dir: Temporary folder where the resource building can happen. If None
      then a unique temp directory will be created. Used only for testing.
    populate_requirements_cache: Callable for populating the requirements cache.
      Used only for testing.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in options.staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered while
      trying to create the resources (e.g., build a setup package).
  """
  temp_dir = temp_dir or tempfile.mkdtemp()
  resources = []

  google_cloud_options = options.view_as(GoogleCloudOptions)
  setup_options = options.view_as(SetupOptions)
  # Make sure that all required options are specified. There are a few that have
  # defaults to support local running scenarios.
  if google_cloud_options.staging_location is None:
    raise RuntimeError(
        'The --staging_location option must be specified.')
  if google_cloud_options.temp_location is None:
    raise RuntimeError(
        'The --temp_location option must be specified.')

  # Stage a requirements file if present.
  if setup_options.requirements_file is not None:
    if not os.path.isfile(setup_options.requirements_file):
      raise RuntimeError('The file %s cannot be found. It was specified in the '
                         '--requirements_file command line option.' %
                         setup_options.requirements_file)
    staged_path = utils.path.join(google_cloud_options.staging_location,
                                  REQUIREMENTS_FILE)
    file_copy(setup_options.requirements_file, staged_path)
    resources.append(REQUIREMENTS_FILE)
    requirements_cache_path = (
        os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
        if setup_options.requirements_cache is None
        else setup_options.requirements_cache)
    # Populate cache with packages from requirements and stage the files
    # in the cache.
    if not os.path.exists(requirements_cache_path):
      os.makedirs(requirements_cache_path)
    populate_requirements_cache(
        setup_options.requirements_file, requirements_cache_path)
    for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
      file_copy(pkg, utils.path.join(google_cloud_options.staging_location,
                                     os.path.basename(pkg)))
      resources.append(os.path.basename(pkg))

  # Handle a setup file if present.
  # We will build the setup package locally and then copy it to the staging
  # location because the staging location is a GCS path and the file cannot be
  # created directly there.
  if setup_options.setup_file is not None:
    if not os.path.isfile(setup_options.setup_file):
      raise RuntimeError('The file %s cannot be found. It was specified in the '
                         '--setup_file command line option.' %
                         setup_options.setup_file)
    if os.path.basename(setup_options.setup_file) != 'setup.py':
      raise RuntimeError(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of %s' % setup_options.setup_file)
    tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                        build_setup_args)
    staged_path = utils.path.join(google_cloud_options.staging_location,
                                  WORKFLOW_TARBALL_FILE)
    file_copy(tarball_file, staged_path)
    resources.append(WORKFLOW_TARBALL_FILE)

  # Handle extra local packages that should be staged.
  if setup_options.extra_packages is not None:
    resources.extend(
        _stage_extra_packages(setup_options.extra_packages,
                              google_cloud_options.staging_location,
                              temp_dir=temp_dir, file_copy=file_copy))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a GCS path and the file
  # cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(temp_dir,
                                        names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    staged_path = utils.path.join(google_cloud_options.staging_location,
                                  names.PICKLED_MAIN_SESSION_FILE)
    file_copy(pickled_session_file, staged_path)
    resources.append(names.PICKLED_MAIN_SESSION_FILE)

  if hasattr(setup_options, 'sdk_location') and setup_options.sdk_location:
    if setup_options.sdk_location == 'default':
      stage_tarball_from_remote_location = True
    elif (setup_options.sdk_location.startswith('gs://') or
          setup_options.sdk_location.startswith('http://') or
          setup_options.sdk_location.startswith('https://')):
      stage_tarball_from_remote_location = True
    else:
      stage_tarball_from_remote_location = False

    staged_path = utils.path.join(google_cloud_options.staging_location,
                                  names.DATAFLOW_SDK_TARBALL_FILE)
    if stage_tarball_from_remote_location:
      # If --sdk_location is not specified then the appropriate package
      # will be obtained from PyPI (https://pypi.python.org) based on the
      # version of the currently running SDK. If the option is
      # present then no version matching is made and the exact URL or path
      # is expected.
      #
      # Unit tests running in the 'python setup.py test' context will
      # not have the sdk_location attribute present and therefore we
      # will not stage a tarball.
      if setup_options.sdk_location == 'default':
        sdk_remote_location = 'pypi'
      else:
        sdk_remote_location = setup_options.sdk_location
      _stage_dataflow_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
      resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
    else:
      # Check if we have a local Dataflow SDK tarball present. This branch is
      # used by tests running with the SDK built at head.
      if setup_options.sdk_location == 'default':
        module_path = os.path.abspath(__file__)
        sdk_path = os.path.join(
            os.path.dirname(module_path), '..', names.DATAFLOW_SDK_TARBALL_FILE)
      elif os.path.isdir(setup_options.sdk_location):
        sdk_path = os.path.join(
            setup_options.sdk_location, names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        sdk_path = setup_options.sdk_location
      if os.path.isfile(sdk_path):
        logging.info('Copying dataflow SDK "%s" to staging location.', sdk_path)
        file_copy(sdk_path, staged_path)
        resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        if setup_options.sdk_location == 'default':
          raise RuntimeError('Cannot find default Dataflow SDK tar file "%s"' %
                             sdk_path)
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified by '
              'the --sdk_location command-line option.' %
              sdk_path)

  # Delete all temp files created while staging job resources.
  shutil.rmtree(temp_dir)
  return resources
Example #8
def stage_job_resources(
        options,
        file_copy=_dependency_file_copy,
        build_setup_args=None,
        temp_dir=None,
        populate_requirements_cache=_populate_requirements_cache):
    """For internal use only; no backwards-compatibility guarantees.

  Creates (if needed) and stages job resources to options.staging_location.

  Args:
    options: Command line options. More specifically the function will expect
      staging_location, requirements_file, setup_file, and save_main_session
      options to be present.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.
    build_setup_args: A list of command line arguments used to build a setup
      package. Used only if options.setup_file is not None. Used only for
      testing.
    temp_dir: Temporary folder where the resource building can happen. If None
      then a unique temp directory will be created. Used only for testing.
    populate_requirements_cache: Callable for populating the requirements cache.
      Used only for testing.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in options.staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered while
      trying to create the resources (e.g., build a setup package).
  """
    temp_dir = temp_dir or tempfile.mkdtemp()
    resources = []

    google_cloud_options = options.view_as(GoogleCloudOptions)
    setup_options = options.view_as(SetupOptions)
    # Make sure that all required options are specified. There are a few that have
    # defaults to support local running scenarios.
    if google_cloud_options.staging_location is None:
        raise RuntimeError('The --staging_location option must be specified.')
    if google_cloud_options.temp_location is None:
        raise RuntimeError('The --temp_location option must be specified.')

    # Stage a requirements file if present.
    if setup_options.requirements_file is not None:
        if not os.path.isfile(setup_options.requirements_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--requirements_file command line option.' %
                setup_options.requirements_file)
        staged_path = FileSystems.join(google_cloud_options.staging_location,
                                       REQUIREMENTS_FILE)
        file_copy(setup_options.requirements_file, staged_path)
        resources.append(REQUIREMENTS_FILE)
        requirements_cache_path = (os.path.join(tempfile.gettempdir(),
                                                'dataflow-requirements-cache')
                                   if setup_options.requirements_cache is None
                                   else setup_options.requirements_cache)
        # Populate cache with packages from requirements and stage the files
        # in the cache.
        if not os.path.exists(requirements_cache_path):
            os.makedirs(requirements_cache_path)
        populate_requirements_cache(setup_options.requirements_file,
                                    requirements_cache_path)
        for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
            file_copy(
                pkg,
                FileSystems.join(google_cloud_options.staging_location,
                                 os.path.basename(pkg)))
            resources.append(os.path.basename(pkg))

    # Handle a setup file if present.
    # We will build the setup package locally and then copy it to the staging
    # location because the staging location is a GCS path and the file cannot be
    # created directly there.
    if setup_options.setup_file is not None:
        if not os.path.isfile(setup_options.setup_file):
            raise RuntimeError(
                'The file %s cannot be found. It was specified in the '
                '--setup_file command line option.' % setup_options.setup_file)
        if os.path.basename(setup_options.setup_file) != 'setup.py':
            raise RuntimeError(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of %s' % setup_options.setup_file)
        tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                            build_setup_args)
        staged_path = FileSystems.join(google_cloud_options.staging_location,
                                       WORKFLOW_TARBALL_FILE)
        file_copy(tarball_file, staged_path)
        resources.append(WORKFLOW_TARBALL_FILE)

    # Handle extra local packages that should be staged.
    if setup_options.extra_packages is not None:
        resources.extend(
            _stage_extra_packages(setup_options.extra_packages,
                                  google_cloud_options.staging_location,
                                  temp_dir=temp_dir,
                                  file_copy=file_copy))

    # Pickle the main session if requested.
    # We will create the pickled main session locally and then copy it to the
    # staging location because the staging location is a GCS path and the file
    # cannot be created directly there.
    if setup_options.save_main_session:
        pickled_session_file = os.path.join(temp_dir,
                                            names.PICKLED_MAIN_SESSION_FILE)
        pickler.dump_session(pickled_session_file)
        staged_path = FileSystems.join(google_cloud_options.staging_location,
                                       names.PICKLED_MAIN_SESSION_FILE)
        file_copy(pickled_session_file, staged_path)
        resources.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location'):
        if setup_options.sdk_location == 'default':
            stage_sdk_from_remote_location = True
        elif (setup_options.sdk_location.startswith('gs://')
              or setup_options.sdk_location.startswith('http://')
              or setup_options.sdk_location.startswith('https://')):
            stage_sdk_from_remote_location = True
        else:
            stage_sdk_from_remote_location = False

        if stage_sdk_from_remote_location:
            # If --sdk_location is not specified then the appropriate package
            # will be obtained from PyPI (https://pypi.python.org) based on the
            # version of the currently running SDK. If the option is
            # present then no version matching is made and the exact URL or path
            # is expected.
            #
            # Unit tests running in the 'python setup.py test' context will
            # not have the sdk_location attribute present and therefore we
            # will not stage SDK.
            if setup_options.sdk_location == 'default':
                sdk_remote_location = 'pypi'
            else:
                sdk_remote_location = setup_options.sdk_location
            resources.extend(
                _stage_beam_sdk(sdk_remote_location,
                                google_cloud_options.staging_location,
                                temp_dir))
        else:
            # This branch is also used by internal tests running with the SDK built
            # at head.
            if setup_options.sdk_location == 'default':
                module_path = os.path.abspath(__file__)
                sdk_path = os.path.join(os.path.dirname(module_path), '..',
                                        '..', '..',
                                        names.DATAFLOW_SDK_TARBALL_FILE)
            elif os.path.isdir(setup_options.sdk_location):
                sdk_path = os.path.join(setup_options.sdk_location,
                                        names.DATAFLOW_SDK_TARBALL_FILE)
            else:
                sdk_path = setup_options.sdk_location
            if os.path.isfile(sdk_path):
                logging.info('Copying Beam SDK "%s" to staging location.',
                             sdk_path)
                staged_path = FileSystems.join(
                    google_cloud_options.staging_location,
                    _desired_sdk_filename_in_staging_location(
                        setup_options.sdk_location))
                file_copy(sdk_path, staged_path)
                _, sdk_staged_filename = FileSystems.split(staged_path)
                resources.append(sdk_staged_filename)
            else:
                if setup_options.sdk_location == 'default':
                    raise RuntimeError(
                        'Cannot find default Beam SDK tar file "%s"' % sdk_path)
                elif not setup_options.sdk_location:
                    logging.info(
                        'Beam SDK will not be staged since --sdk_location '
                        'is empty.')
                else:
                    raise RuntimeError(
                        'The file "%s" cannot be found. Its location was specified by '
                        'the --sdk_location command-line option.' % sdk_path)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    return resources
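A hedged usage sketch for the module-level stage_job_resources in Examples #7 and #8. The import path below matches the SDK era of Example #8, where this function lived in the Dataflow runner's internal dependency module (it was later replaced by the Stager class shown in the following examples); the GCS paths are placeholders.

from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.dataflow.internal import dependency  # older SDKs only

options = PipelineOptions([
    '--staging_location=gs://my-bucket/staging',  # placeholder bucket
    '--temp_location=gs://my-bucket/temp',
    '--save_main_session',
])
# Returns the staged file names, e.g. ['pickled_main_session'].
staged = dependency.stage_job_resources(options)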
Example #9
    def stage_job_resources(self,
                            options,
                            build_setup_args=None,
                            temp_dir=None,
                            populate_requirements_cache=None,
                            staging_location=None):
        """For internal use only; no backwards-compatibility guarantees.

        Creates (if needed) and stages job resources to staging_location.

        Args:
          options: Command line options. More specifically the function will
            expect requirements_file, setup_file, and save_main_session options
            to be present.
          build_setup_args: A list of command line arguments used to build a
            setup package. Used only if options.setup_file is not None. Used
            only for testing.
          temp_dir: Temporary folder where the resource building can happen. If
            None then a unique temp directory will be created. Used only for
            testing.
          populate_requirements_cache: Callable for populating the requirements
            cache. Used only for testing.
          staging_location: Location to stage the file.

        Returns:
          A list of file names (no paths) for the resources staged. All the
          files are assumed to be staged at staging_location.

        Raises:
          RuntimeError: If files specified are not found or error encountered
          while trying to create the resources (e.g., build a setup package).
        """
        temp_dir = temp_dir or tempfile.mkdtemp()
        resources = []

        setup_options = options.view_as(SetupOptions)
        # Make sure that all required options are specified.
        if staging_location is None:
            raise RuntimeError('The staging_location must be specified.')

        # Stage a requirements file if present.
        if setup_options.requirements_file is not None:
            if not os.path.isfile(setup_options.requirements_file):
                raise RuntimeError(
                    'The file %s cannot be found. It was specified in the '
                    '--requirements_file command line option.' %
                    setup_options.requirements_file)
            staged_path = FileSystems.join(staging_location, REQUIREMENTS_FILE)
            self.stage_artifact(setup_options.requirements_file, staged_path)
            resources.append(REQUIREMENTS_FILE)
            requirements_cache_path = (os.path.join(
                tempfile.gettempdir(), 'dataflow-requirements-cache') if
                                       setup_options.requirements_cache is None
                                       else setup_options.requirements_cache)
            # Populate cache with packages from requirements and stage the files
            # in the cache.
            if not os.path.exists(requirements_cache_path):
                os.makedirs(requirements_cache_path)
            (populate_requirements_cache if populate_requirements_cache else
             Stager._populate_requirements_cache)(
                 setup_options.requirements_file, requirements_cache_path)
            for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
                self.stage_artifact(
                    pkg,
                    FileSystems.join(staging_location, os.path.basename(pkg)))
                resources.append(os.path.basename(pkg))

        # Handle a setup file if present.
        # We will build the setup package locally and then copy it to the staging
        # location because the staging location is a remote path and the file cannot
        # be created directly there.
        if setup_options.setup_file is not None:
            if not os.path.isfile(setup_options.setup_file):
                raise RuntimeError(
                    'The file %s cannot be found. It was specified in the '
                    '--setup_file command line option.' %
                    setup_options.setup_file)
            if os.path.basename(setup_options.setup_file) != 'setup.py':
                raise RuntimeError(
                    'The --setup_file option expects the full path to a file named '
                    'setup.py instead of %s' % setup_options.setup_file)
            tarball_file = Stager._build_setup_package(
                setup_options.setup_file, temp_dir, build_setup_args)
            staged_path = FileSystems.join(staging_location,
                                           WORKFLOW_TARBALL_FILE)
            self.stage_artifact(tarball_file, staged_path)
            resources.append(WORKFLOW_TARBALL_FILE)

        # Handle extra local packages that should be staged.
        if setup_options.extra_packages is not None:
            resources.extend(
                self._stage_extra_packages(setup_options.extra_packages,
                                           staging_location,
                                           temp_dir=temp_dir))

        # Handle jar packages that should be staged for Java SDK Harness.
        jar_packages = options.view_as(DebugOptions).lookup_experiment(
            'jar_packages')
        if jar_packages is not None:
            resources.extend(
                self._stage_jar_packages(jar_packages.split(':'),
                                         staging_location,
                                         temp_dir=temp_dir))

        # Pickle the main session if requested.
        # We will create the pickled main session locally and then copy it to the
        # staging location because the staging location is a remote path and the
        # file cannot be created directly there.
        if setup_options.save_main_session:
            pickled_session_file = os.path.join(
                temp_dir, names.PICKLED_MAIN_SESSION_FILE)
            pickler.dump_session(pickled_session_file)
            staged_path = FileSystems.join(staging_location,
                                           names.PICKLED_MAIN_SESSION_FILE)
            self.stage_artifact(pickled_session_file, staged_path)
            resources.append(names.PICKLED_MAIN_SESSION_FILE)

        if hasattr(setup_options, 'sdk_location'):

            if (setup_options.sdk_location
                    == 'default') or Stager._is_remote_path(
                        setup_options.sdk_location):
                # If --sdk_location is not specified then the appropriate package
                # will be obtained from PyPI (https://pypi.python.org) based on the
                # version of the currently running SDK. If the option is
                # present then no version matching is made and the exact URL or path
                # is expected.
                #
                # Unit tests running in the 'python setup.py test' context will
                # not have the sdk_location attribute present and therefore we
                # will not stage SDK.
                sdk_remote_location = 'pypi' if (
                    setup_options.sdk_location
                    == 'default') else setup_options.sdk_location
                resources.extend(
                    self._stage_beam_sdk(sdk_remote_location, staging_location,
                                         temp_dir))
            elif setup_options.sdk_location == 'container':
                # Use the SDK that's built into the container, rather than re-staging
                # it.
                pass
            else:
                # This branch is also used by internal tests running with the SDK built
                # at head.
                if os.path.isdir(setup_options.sdk_location):
                    # TODO(angoenka): remove reference to Dataflow
                    sdk_path = os.path.join(setup_options.sdk_location,
                                            DATAFLOW_SDK_TARBALL_FILE)
                else:
                    sdk_path = setup_options.sdk_location

                if os.path.isfile(sdk_path):
                    logging.info('Copying Beam SDK "%s" to staging location.',
                                 sdk_path)
                    staged_path = FileSystems.join(
                        staging_location,
                        Stager._desired_sdk_filename_in_staging_location(
                            setup_options.sdk_location))
                    self.stage_artifact(sdk_path, staged_path)
                    _, sdk_staged_filename = FileSystems.split(staged_path)
                    resources.append(sdk_staged_filename)
                else:
                    if setup_options.sdk_location == 'default':
                        raise RuntimeError(
                            'Cannot find default Beam SDK tar file "%s"' %
                            sdk_path)
                    elif not setup_options.sdk_location:
                        logging.info(
                            'Beam SDK will not be staged since --sdk_location '
                            'is empty.')
                    else:
                        raise RuntimeError(
                            'The file "%s" cannot be found. Its location was specified by '
                            'the --sdk_location command-line option.' %
                            sdk_path)

        worker_options = options.view_as(WorkerOptions)
        dataflow_worker_jar = getattr(worker_options, 'dataflow_worker_jar',
                                      None)
        if dataflow_worker_jar is not None:
            jar_staged_filename = 'dataflow-worker.jar'
            staged_path = FileSystems.join(staging_location,
                                           jar_staged_filename)
            self.stage_artifact(dataflow_worker_jar, staged_path)
            resources.append(jar_staged_filename)

        # Delete all temp files created while staging job resources.
        shutil.rmtree(temp_dir)
        retrieval_token = self.commit_manifest()
        return retrieval_token, resources
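A hedged sketch of driving the Stager method above. stage_artifact and commit_manifest are hooks that concrete subclasses provide; the LocalCopyStager below is a hypothetical minimal subclass for illustration, and the import path assumes the class lives in apache_beam.runners.portability.stager as in recent SDKs.

import os
import shutil
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.portability.stager import Stager  # assumed path

class LocalCopyStager(Stager):
    """Hypothetical stager that copies artifacts into a local directory."""

    def stage_artifact(self, local_path_to_artifact, artifact_name):
        os.makedirs(os.path.dirname(artifact_name), exist_ok=True)
        shutil.copyfile(local_path_to_artifact, artifact_name)

    def commit_manifest(self):
        return 'no-retrieval-token'

options = PipelineOptions(['--save_main_session'])
retrieval_token, resources = LocalCopyStager().stage_job_resources(
    options, staging_location='/tmp/staging')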
Example #10
File: stager.py  Project: onderson/beam
  def stage_job_resources(self,
                          options,
                          build_setup_args=None,
                          temp_dir=None,
                          populate_requirements_cache=None,
                          staging_location=None):
    """For internal use only; no backwards-compatibility guarantees.

        Creates (if needed) and stages job resources to staging_location.

        Args:
          options: Command line options. More specifically the function will
            expect requirements_file, setup_file, and save_main_session options
            to be present.
          build_setup_args: A list of command line arguments used to build a
            setup package. Used only if options.setup_file is not None. Used
            only for testing.
          temp_dir: Temporary folder where the resource building can happen. If
            None then a unique temp directory will be created. Used only for
            testing.
          populate_requirements_cache: Callable for populating the requirements
            cache. Used only for testing.
          staging_location: Location to stage the file.

        Returns:
          A list of file names (no paths) for the resources staged. All the
          files are assumed to be staged at staging_location.

        Raises:
          RuntimeError: If files specified are not found or error encountered
          while trying to create the resources (e.g., build a setup package).
        """
    temp_dir = temp_dir or tempfile.mkdtemp()
    resources = []

    setup_options = options.view_as(SetupOptions)
    # Make sure that all required options are specified.
    if staging_location is None:
      raise RuntimeError('The staging_location must be specified.')

    # Stage a requirements file if present.
    if setup_options.requirements_file is not None:
      if not os.path.isfile(setup_options.requirements_file):
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--requirements_file command line option.' %
            setup_options.requirements_file)
      staged_path = FileSystems.join(staging_location, REQUIREMENTS_FILE)
      self.stage_artifact(setup_options.requirements_file, staged_path)
      resources.append(REQUIREMENTS_FILE)
      requirements_cache_path = (
          os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
          if setup_options.requirements_cache is None else
          setup_options.requirements_cache)
      # Populate cache with packages from requirements and stage the files
      # in the cache.
      if not os.path.exists(requirements_cache_path):
        os.makedirs(requirements_cache_path)
      (populate_requirements_cache if populate_requirements_cache else
       Stager._populate_requirements_cache)(setup_options.requirements_file,
                                            requirements_cache_path)
      for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
        self.stage_artifact(
            pkg, FileSystems.join(staging_location, os.path.basename(pkg)))
        resources.append(os.path.basename(pkg))

    # Handle a setup file if present.
    # We will build the setup package locally and then copy it to the staging
    # location because the staging location is a remote path and the file cannot
    # be created directly there.
    if setup_options.setup_file is not None:
      if not os.path.isfile(setup_options.setup_file):
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--setup_file command line option.' % setup_options.setup_file)
      if os.path.basename(setup_options.setup_file) != 'setup.py':
        raise RuntimeError(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of %s' % setup_options.setup_file)
      tarball_file = Stager._build_setup_package(setup_options.setup_file,
                                                 temp_dir, build_setup_args)
      staged_path = FileSystems.join(staging_location, WORKFLOW_TARBALL_FILE)
      self.stage_artifact(tarball_file, staged_path)
      resources.append(WORKFLOW_TARBALL_FILE)

    # Handle extra local packages that should be staged.
    if setup_options.extra_packages is not None:
      resources.extend(
          self._stage_extra_packages(
              setup_options.extra_packages, staging_location,
              temp_dir=temp_dir))

    # Pickle the main session if requested.
    # We will create the pickled main session locally and then copy it to the
    # staging location because the staging location is a remote path and the
    # file cannot be created directly there.
    if setup_options.save_main_session:
      pickled_session_file = os.path.join(temp_dir,
                                          names.PICKLED_MAIN_SESSION_FILE)
      pickler.dump_session(pickled_session_file)
      staged_path = FileSystems.join(staging_location,
                                     names.PICKLED_MAIN_SESSION_FILE)
      self.stage_artifact(pickled_session_file, staged_path)
      resources.append(names.PICKLED_MAIN_SESSION_FILE)

    if hasattr(setup_options, 'sdk_location'):

      if (setup_options.sdk_location == 'default') or Stager._is_remote_path(
          setup_options.sdk_location):
        # If --sdk_location is not specified then the appropriate package
        # will be obtained from PyPI (https://pypi.python.org) based on the
        # version of the currently running SDK. If the option is
        # present then no version matching is made and the exact URL or path
        # is expected.
        #
        # Unit tests running in the 'python setup.py test' context will
        # not have the sdk_location attribute present and therefore we
        # will not stage SDK.
        sdk_remote_location = 'pypi' if (setup_options.sdk_location == 'default'
                                        ) else setup_options.sdk_location
        resources.extend(
            self._stage_beam_sdk(sdk_remote_location, staging_location,
                                 temp_dir))
      else:
        # This branch is also used by internal tests running with the SDK built
        # at head.
        if os.path.isdir(setup_options.sdk_location):
          # TODO(angoenka): remove reference to Dataflow
          sdk_path = os.path.join(setup_options.sdk_location,
                                  names.DATAFLOW_SDK_TARBALL_FILE)
        else:
          sdk_path = setup_options.sdk_location

        if os.path.isfile(sdk_path):
          logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
          staged_path = FileSystems.join(
              staging_location,
              Stager._desired_sdk_filename_in_staging_location(
                  setup_options.sdk_location))
          self.stage_artifact(sdk_path, staged_path)
          _, sdk_staged_filename = FileSystems.split(staged_path)
          resources.append(sdk_staged_filename)
        else:
          if setup_options.sdk_location == 'default':
            raise RuntimeError('Cannot find default Beam SDK tar file "%s"'
                               % sdk_path)
          elif not setup_options.sdk_location:
            logging.info('Beam SDK will not be staged since --sdk_location '
                         'is empty.')
          else:
            raise RuntimeError(
                'The file "%s" cannot be found. Its location was specified by '
                'the --sdk_location command-line option.' % sdk_path)

    # Delete all temp files created while staging job resources.
    shutil.rmtree(temp_dir)
    self.commit_manifest()
    return resources
Example #11
  def create_job_resources(options,  # type: PipelineOptions
                           temp_dir,  # type: str
                           build_setup_args=None,  # type: Optional[List[str]]
                           populate_requirements_cache=None,  # type: Optional[Callable[[str, str], None]]
                           skip_prestaged_dependencies=False,  # type: Optional[bool]
                           ):
    """For internal use only; no backwards-compatibility guarantees.

        Creates (if needed) a list of job resources.

        Args:
          options: Command line options. More specifically the function will
            expect requirements_file, setup_file, and save_main_session options
            to be present.
          temp_dir: Temporary folder where the resource building can happen. If
            None then a unique temp directory will be created. Used only for
            testing.
          build_setup_args: A list of command line arguments used to build a
            setup package. Used only if options.setup_file is not None. Used
            only for testing.
          populate_requirements_cache: Callable for populating the requirements
            cache. Used only for testing.
          skip_prestaged_dependencies: Skip staging dependencies that can be
            added into SDK containers during prebuilding.

        Returns:
          A list of tuples of local file paths and file names (no paths) to be
          used for staging resources.

        Raises:
          RuntimeError: If files specified are not found or error encountered
          while trying to create the resources (e.g., build a setup package).
        """

    resources = []  # type: List[Tuple[str, str]]

    setup_options = options.view_as(SetupOptions)

    # We can skip boot dependencies: apache beam sdk, python packages from
    # requirements.txt, python packages from extra_packages and workflow tarball
    # if we know we are using a dependency pre-installed sdk container image.
    if not skip_prestaged_dependencies:
      # Stage a requirements file if present.
      if setup_options.requirements_file is not None:
        if not os.path.isfile(setup_options.requirements_file):
          raise RuntimeError(
              'The file %s cannot be found. It was specified in the '
              '--requirements_file command line option.' %
              setup_options.requirements_file)
        resources.append((setup_options.requirements_file, REQUIREMENTS_FILE))
        requirements_cache_path = (
            os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
            if setup_options.requirements_cache is None else
            setup_options.requirements_cache)
        # Populate cache with packages from requirements and stage the files
        # in the cache.
        if not os.path.exists(requirements_cache_path):
          os.makedirs(requirements_cache_path)
        (
            populate_requirements_cache if populate_requirements_cache else
            Stager._populate_requirements_cache)(
                setup_options.requirements_file, requirements_cache_path)
        for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
          resources.append((pkg, os.path.basename(pkg)))

      # Handle a setup file if present.
      # We will build the setup package locally and then copy it to the staging
      # location because the staging location is a remote path and the file
      # cannot be created directly there.
      if setup_options.setup_file is not None:
        if not os.path.isfile(setup_options.setup_file):
          raise RuntimeError(
              'The file %s cannot be found. It was specified in the '
              '--setup_file command line option.' % setup_options.setup_file)
        if os.path.basename(setup_options.setup_file) != 'setup.py':
          raise RuntimeError(
              'The --setup_file option expects the full path to a file named '
              'setup.py instead of %s' % setup_options.setup_file)
        tarball_file = Stager._build_setup_package(
            setup_options.setup_file, temp_dir, build_setup_args)
        resources.append((tarball_file, WORKFLOW_TARBALL_FILE))

      # Handle extra local packages that should be staged.
      if setup_options.extra_packages is not None:
        resources.extend(
            Stager._create_extra_packages(
                setup_options.extra_packages, temp_dir=temp_dir))

      if hasattr(setup_options, 'sdk_location'):

        if (setup_options.sdk_location == 'default') or Stager._is_remote_path(
            setup_options.sdk_location):
          # If --sdk_location is not specified then the appropriate package
          # will be obtained from PyPI (https://pypi.python.org) based on the
          # version of the currently running SDK. If the option is
          # present then no version matching is made and the exact URL or path
          # is expected.
          #
          # Unit tests running in the 'python setup.py test' context will
          # not have the sdk_location attribute present and therefore we
          # will not stage SDK.
          sdk_remote_location = 'pypi' if (
              setup_options.sdk_location == 'default'
          ) else setup_options.sdk_location
          resources.extend(
              Stager._create_beam_sdk(sdk_remote_location, temp_dir))
        elif setup_options.sdk_location == 'container':
          # Use the SDK that's built into the container, rather than re-staging
          # it.
          pass
        else:
          # This branch is also used by internal tests running with the SDK
          # built at head.
          if os.path.isdir(setup_options.sdk_location):
            sdk_path = os.path.join(
                setup_options.sdk_location, WORKFLOW_TARBALL_FILE)
          else:
            sdk_path = setup_options.sdk_location

          if os.path.isfile(sdk_path):
            _LOGGER.info('Copying Beam SDK "%s" to staging location.', sdk_path)
            resources.append((
                sdk_path,
                Stager._desired_sdk_filename_in_staging_location(
                    setup_options.sdk_location)))
          else:
            if setup_options.sdk_location == 'default':
              raise RuntimeError(
                  'Cannot find default Beam SDK tar file "%s"' % sdk_path)
            elif not setup_options.sdk_location:
              _LOGGER.info(
                  'Beam SDK will not be staged since --sdk_location '
                  'is empty.')
            else:
              raise RuntimeError(
                  'The file "%s" cannot be found. Its location was specified '
                  'by the --sdk_location command-line option.' % sdk_path)

    # The following artifacts are not processed by python sdk container boot
    # sequence in a setup mode and hence should not be skipped even if a
    # prebuilt sdk container image is used.

    # TODO(heejong): remove jar_packages experimental flag when cross-language
    #   dependency management is implemented for all runners.
    # Handle jar packages that should be staged for Java SDK Harness.
    jar_packages = options.view_as(DebugOptions).lookup_experiment(
        'jar_packages')
    if jar_packages is not None:
      resources.extend(
          Stager._create_jar_packages(
              jar_packages.split(','), temp_dir=temp_dir))

    # Pickle the main session if requested.
    # We will create the pickled main session locally and then copy it to the
    # staging location because the staging location is a remote path and the
    # file cannot be created directly there.
    if setup_options.save_main_session:
      pickled_session_file = os.path.join(
          temp_dir, names.PICKLED_MAIN_SESSION_FILE)
      pickler.dump_session(pickled_session_file)
      resources.append((pickled_session_file, names.PICKLED_MAIN_SESSION_FILE))

    worker_options = options.view_as(WorkerOptions)
    dataflow_worker_jar = getattr(worker_options, 'dataflow_worker_jar', None)
    if dataflow_worker_jar is not None:
      jar_staged_filename = 'dataflow-worker.jar'
      resources.append((dataflow_worker_jar, jar_staged_filename))

    return resources
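A hedged sketch for the create_job_resources variant above, which only computes (local path, staged name) pairs and leaves the actual upload to a separate staging step; it assumes the same Stager module path as the previous sketch.

import tempfile
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.runners.portability.stager import Stager  # assumed path

options = PipelineOptions(['--save_main_session'])
with tempfile.TemporaryDirectory() as temp_dir:
  # With save_main_session set, the list includes the pickled main session.
  for local_path, staged_name in Stager.create_job_resources(options, temp_dir):
    print(local_path, '->', staged_name)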