 def test_no_temp_location(self):
   staging_dir = tempfile.mkdtemp()
   options = PipelineOptions()
   google_cloud_options = options.view_as(GoogleCloudOptions)
   google_cloud_options.staging_location = staging_dir
   self.update_options(options)
   google_cloud_options.temp_location = None
   with self.assertRaises(RuntimeError) as cm:
     dependency.stage_job_resources(options)
   self.assertEqual('The --temp_location option must be specified.',
                    cm.exception.message)
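The examples in this section call helper methods on the enclosing test class (update_options, create_temp_file, and others) whose definitions are not shown. A minimal sketch of what two of them might look like, under assumed behavior rather than the actual dependency_test.py implementations:

  def update_options(self, options):
    # Hypothetical sketch: fill in the remaining options the staging code
    # needs, defaulting temp_location to the staging location.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    if google_cloud_options.temp_location is None:
      google_cloud_options.temp_location = google_cloud_options.staging_location

  def create_temp_file(self, path, contents):
    # Hypothetical sketch: write a small file so the staging code has
    # something to find and copy.
    with open(path, 'w') as f:
      f.write(contents)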
Example #3
 def test_requirements_file_not_present(self):
   staging_dir = tempfile.mkdtemp()
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).requirements_file = 'nosuchfile'
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The file %s cannot be found. It was specified in the '
       '--requirements_file command line option.' % 'nosuchfile')
Example #6
  def test_with_extra_packages_missing_files(self):
    staging_dir = tempfile.mkdtemp()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Example #7
  def test_sdk_location_local_not_present(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.message)
Example #9
  def test_with_setup_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #11
  def create_job(self, job):
    """Submits for remote execution a job described by the workflow proto."""
    # Stage job resources and add an environment proto with their paths.
    resources = dependency.stage_job_resources(
        job.options, file_copy=self._gcs_file_copy)
    job.proto.environment = Environment(
        packages=resources, options=job.options,
        environment_version=self.environment_version).proto
    # TODO(silviuc): Remove the debug logging eventually.
    logging.info('JOB: %s', job)
    request = dataflow.DataflowProjectsJobsCreateRequest()

    request.projectId = self.google_cloud_options.project
    request.job = job.proto

    try:
      response = self._client.projects_jobs.Create(request)
    except exceptions.BadStatusCodeError as e:
      logging.error('HTTP status %d trying to create job'
                    ' at dataflow service endpoint %s',
                    e.response.status,
                    self.google_cloud_options.dataflow_endpoint)
      logging.fatal('details of server error: %s', e)
      raise
    logging.info('Create job: %s', response)
    # The response is a Job proto with the id for the new job.
    logging.info('Created job with id: [%s]', response.id)
    logging.info(
        'To access the Dataflow monitoring console, please navigate to '
        'https://console.developers.google.com/project/%s/dataflow/job/%s',
        self.google_cloud_options.project, response.id)

    # Show the whitelisting warning. Projects should be whitelisted prior to
    # submitting jobs to Google Cloud Dataflow service. Please see documentation
    # for more information.
    #
    # TODO(altay): Remove once the whitelisting requirements are lifted.
    logging.warning(
        '\n\n***************************************************************\n'
        '*      WARNING: PROJECT WHITELISTING REQUIRED.                *'
        '\n***************************************************************\n'
        'Please make sure your project is whitelisted for running\n'
        'Python-based pipelines using the Google Cloud Dataflow service.\n\n'
        'You may ignore this message if you have successfully run\n'
        'Python-based pipelines with this project on Google Cloud\n'
        'Dataflow service before.\n\n'
        'If your project is not whitelisted, your job will attempt to run;\n'
        'however, it will fail to make any progress. Google Cloud Dataflow\n'
        'service will automatically cancel your non-whitelisted job\n'
        'after some time due to inactivity. You can also manually cancel\n'
        'your job using the following command:\n\n'
        'gcloud alpha dataflow jobs --project=%s cancel %s\n\n'
        'Please refer to the documentation to learn more about whitelisting\n'
        'your project at: %s'
        '\n***************************************************************\n\n',
        request.projectId, response.id,
        'http://goo.gl/forms/o4w14whz9x'
    )

    return response
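For context, a hedged sketch of how create_job might be reached from a runner; the Job and client names here are assumptions about the SDK's internal apiclient module, not something the example above shows:

  # Hypothetical usage sketch (names assumed, not shown above):
  job = apiclient.Job(pipeline_options)
  response = client.create_job(job)  # client: a Dataflow API client instance
  logging.info('Submitted job %s', response.id)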
Example #13
 def test_with_extra_packages_invalid_file_name(self):
   staging_dir = tempfile.mkdtemp()
   source_dir = tempfile.mkdtemp()
   self.create_temp_file(
       os.path.join(source_dir, 'abc.tgz'), 'nothing')
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).extra_packages = [
         os.path.join(source_dir, 'abc.tgz')]
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The --extra_packages option expects a full path ending with '
       '\'.tar.gz\' instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #15
  def create_job(self, job):
    """Submits for remote execution a job described by the workflow proto."""

    # Checks the whitelisting status of this account. This is just an early
    # courtesy check to show a warning in case of potential whitelisting errors.
    # It will not block job submission. Jobs submitted from non-whitelisted
    # projects will fail to download required files, make no progress and fail
    # eventually.
    #
    # This check will produce a false warning if the project is whitelisted
    # but the current user is not. In that case the job will still execute
    # successfully on the service.
    #
    # TODO(altay): Remove once the whitelisting requirements are lifted.
    try:
      request = storage.StorageObjectsListRequest(
          bucket='dataflow-python-docker')
      self._storage_client.objects.List(request)
    except exceptions.HttpError as e:
      if e.status_code == 403:
        logging.error(
            '\n*************************************************************\n'
            'This account is not whitelisted to run Python-based pipelines '
            'using the Google Cloud Dataflow service. '
            'Make sure that your project is whitelisted before submitting your '
            'job.\nPlease see the documentation for more information on '
            'getting your project whitelisted.'
            '\n*************************************************************\n')
      else:
        logging.warning('Could not verify whitelisting status.')

    # Stage job resources and add an environment proto with their paths.
    resources = dependency.stage_job_resources(
        job.options, file_copy=self._gcs_file_copy)
    job.proto.environment = Environment(
        packages=resources, options=job.options,
        environment_version=self.environment_version).proto
    # TODO(silviuc): Remove the debug logging eventually.
    logging.info('JOB: %s', job)
    request = dataflow.DataflowProjectsJobsCreateRequest()

    request.projectId = self.google_cloud_options.project
    request.job = job.proto

    try:
      response = self._client.projects_jobs.Create(request)
    except exceptions.BadStatusCodeError as e:
      logging.error('HTTP status %d trying to create job'
                    ' at dataflow service endpoint %s',
                    e.response.status,
                    self.google_cloud_options.dataflow_endpoint)
      logging.fatal('details of server error: %s', e)
      raise
    logging.info('Create job: %s', response)
    # The response is a Job proto with the id for the new job.
    logging.info('Created job with id: [%s]', response.id)
    return response
Example #18
  def test_setup_file_not_named_setup_dot_py(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = (
        os.path.join(source_dir, 'xyz-setup.py'))

    self.create_temp_file(
        os.path.join(source_dir, 'xyz-setup.py'), 'notused')
    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    self.assertTrue(
        cm.exception.message.startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))
Example #19
  def test_no_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Example #21
  def test_default_resources(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Example #23
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Example #27
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket'
    expected_from_path = utils.path.join(
        sdk_location,
        'google-cloud-dataflow-python-sdk-%s.tgz' % __version__)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Example #28
  def test_with_requirements_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        [dependency.REQUIREMENTS_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
Example #30
  def test_sdk_location_default(self):
    staging_dir = tempfile.mkdtemp()
    expected_from_url = '%s/v%s.tar.gz' % (
        dependency.PACKAGES_URL_PREFIX, __version__)
    expected_from_path = self.override_file_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            file_copy=dependency._dependency_file_copy))
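The sdk_location examples rely on two more unshown helpers, override_file_copy and override_file_download. A hypothetical sketch of what they might do; the _dependency_file_download attribute name is assumed by analogy with the _dependency_file_copy function patched directly in the extra-packages example below:

  def override_file_copy(self, expected_from_path, expected_to_dir):
    # Hypothetical sketch: swap in a fake copy function that verifies the
    # source path and creates a placeholder SDK tarball in the staging dir.
    def file_copy(from_path, to_path):
      if from_path == expected_from_path:
        self.create_temp_file(
            os.path.join(expected_to_dir, names.DATAFLOW_SDK_TARBALL_FILE),
            'contents')
    dependency._dependency_file_copy = file_copy

  def override_file_download(self, expected_from_url, expected_to_dir):
    # Hypothetical sketch: swap in a fake download function that creates a
    # local placeholder for the given URL and returns its path.
    local_path = os.path.join(expected_to_dir, 'sdk-tarball')
    def file_download(from_url, to_path):
      if from_url == expected_from_url:
        self.create_temp_file(local_path, 'contents')
      return local_path
    dependency._dependency_file_download = file_download
    return local_path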
Example #31
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []
    def file_copy(from_path, to_path):
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    dependency._dependency_file_copy = file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                       f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
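One caveat about the example above: it rebinds dependency._dependency_file_copy at module level and never restores it, so the fake can leak into later tests. A safer sketch, assuming the standard mock library is available:

    # Sketch: patch the copy function only for the duration of the call.
    from mock import patch

    with patch.object(dependency, '_dependency_file_copy', file_copy):
      resources = dependency.stage_job_resources(options)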
Example #32
  def test_sdk_location_local(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(
            sdk_location,
            names.DATAFLOW_SDK_TARBALL_FILE),
        'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'contents')
Example #33
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz')]

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n'], f.readlines())
Example #34
  def test_with_requirements_file_and_cache(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    options.view_as(SetupOptions).requirements_cache = os.path.join(
        tempfile.gettempdir(), 'alternative-cache-dir')
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE,
                'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
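The cache example passes a populate_requirements_cache hook whose definition is not shown; the expected abc.txt and def.txt entries suggest a stub along these lines (hypothetical sketch, not the actual helper):

  @staticmethod
  def populate_requirements_cache(unused_requirements_file, cache_dir):
    # Hypothetical sketch: instead of running pip, drop fake downloaded
    # package files into the cache directory.
    with open(os.path.join(cache_dir, 'abc.txt'), 'w') as f:
      f.write('nothing')
    with open(os.path.join(cache_dir, 'def.txt'), 'w') as f:
      f.write('nothing')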
Example #36
 def test_no_staging_location(self):
   with self.assertRaises(RuntimeError) as cm:
     dependency.stage_job_resources(PipelineOptions())
   self.assertEqual('The --staging_location option must be specified.',
                    cm.exception.message)