Example No. 1
  def test_with_setup_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example No. 2
    def test_with_setup_file(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = os.path.join(
            source_dir, 'setup.py')

        self.assertEqual(
            [
                dependency.WORKFLOW_TARBALL_FILE,
                names.PICKLED_MAIN_SESSION_FILE
            ],
            dependency.stage_job_resources(
                options,
                # We replace the build setup command because a realistic one would
                # require the setuptools package to be installed. Note that we can't
                # use "touch" here to create the expected output tarball file, since
                # touch is not available on Windows, so we invoke python to produce
                # equivalent behavior.
                build_setup_args=[
                    'python', '-c', 'open(__import__("sys").argv[1], "a")',
                    os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)
                ],
                temp_dir=source_dir))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example No. 3
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets.

  URL:
  https://cloud.google.com/dataflow/examples/wordcount-example#MinimalWordCount
  """
  import re

  import google.cloud.dataflow as df

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'BlockingDataflowPipelineRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = df.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | df.io.Read(df.io.TextFileSource(
          'gs://dataflow-samples/shakespeare/kinglear.txt'))
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | df.FlatMap('ExtractWords', lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | df.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | df.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | df.io.Write(df.io.TextFileSink('gs://my-bucket/counts.txt'))
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  p.run()
  # [END examples_wordcount_minimal_run]
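
A side note on the formatting step above: the lambda (word, count) form relies on Python 2 tuple parameter unpacking, which is not valid Python 3 syntax. A minimal, hypothetical sketch of the same formatting logic written without that feature:

# Sketch only (not part of the original snippet): format a (word, count)
# tuple without Python 2 tuple parameter unpacking.
format_result = lambda word_count: '%s: %s' % word_count
print(format_result(('king', 12)))  # -> king: 12
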
Example No. 4
    def test_no_main_session(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()

        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        options.view_as(SetupOptions).save_main_session = False
        self.update_options(options)

        self.assertEqual([], dependency.stage_job_resources(options))
Example No. 5
 def test_get_all_options(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     self.assertDictContainsSubset(case['expected'], options.get_all_options())
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
Example No. 6
def pipeline_options_remote(argv):
  """"Creating a Pipeline using a PipelineOptions object for remote execution.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
  """

  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowPipelineRunner or
  # BlockingDataflowPipelineRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # Overriding the runner for tests.
  options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
  p = Pipeline(options=options)

  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))

  p.run()
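
For orientation, a hedged usage sketch of pipeline_options_remote above; the flags list is parsed by PipelineOptions, so the custom MyOptions arguments are passed like any built-in option (the GCS paths are placeholders, not real buckets):

# Hypothetical invocation of the snippet above; paths are placeholders.
pipeline_options_remote([
    '--input', 'gs://my-bucket/kinglear.txt',
    '--output', 'gs://my-bucket/counts.txt'])
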
Example No. 7
 def test_get_all_options(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         self.assertDictContainsSubset(case['expected'],
                                       options.get_all_options())
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
Example No. 8
  def test_no_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Example No. 9
 def test_requirements_file_not_present(self):
     staging_dir = tempfile.mkdtemp()
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).requirements_file = 'nosuchfile'
         dependency.stage_job_resources(options)
     self.assertEqual(
         cm.exception.message,
         'The file %s cannot be found. It was specified in the '
         '--requirements_file command line option.' % 'nosuchfile')
Example No. 10
 def test_requirements_file_not_present(self):
   staging_dir = tempfile.mkdtemp()
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).requirements_file = 'nosuchfile'
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The file %s cannot be found. It was specified in the '
       '--requirements_file command line option.' % 'nosuchfile')
Example No. 11
    def test_sdk_location_gcs(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
        self.override_file_copy(sdk_location, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        self.assertEqual(
            [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
            dependency.stage_job_resources(options))
Example No. 12
    def test_sdk_location_local_not_present(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'nosuchdir'
        with self.assertRaises(RuntimeError) as cm:
            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).sdk_location = sdk_location

            dependency.stage_job_resources(options)
        self.assertEqual(
            'The file "%s" cannot be found. Its '
            'location was specified by the --sdk_location command-line option.'
            % sdk_location, cm.exception.message)
Example No. 13
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Example No. 14
  def test_with_extra_packages_missing_files(self):
    staging_dir = tempfile.mkdtemp()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Example No. 15
  def test_sdk_location_local_not_present(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.message)
Example No. 16
def model_pcollection(argv):
  """Creating a PCollection from data in local memory.

  URL: https://cloud.google.com/dataflow/model/pcollection
  """
  from google.cloud.dataflow.utils.options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = df.Pipeline(options=pipeline_options)

  (p
   | df.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | df.io.Write(df.io.TextFileSink(my_options.output)))

  p.run()
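
Because MyOptions declares --output as required, the snippet above exits with an argparse error if it is omitted. A hypothetical invocation (placeholder path):

# Hypothetical invocation; --output is required by MyOptions above.
model_pcollection(['--output', 'gs://my-bucket/quotes.txt'])
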
Example No. 17
 def test_with_extra_packages_invalid_file_name(self):
   staging_dir = tempfile.mkdtemp()
   source_dir = tempfile.mkdtemp()
   self.create_temp_file(
       os.path.join(source_dir, 'abc.tgz'), 'nothing')
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).extra_packages = [
         os.path.join(source_dir, 'abc.tgz')]
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The --extra_packages option expects a full path ending with '
       '\'.tar.gz\' instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example No. 18
  def test_get_unknown_args(self):

    # Used for testing newly added flags.
    class MockOptions(PipelineOptions):

      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_argument('--mock_flag',
                            action='store_true',
                            help='Enable work item profiling')

    test_cases = [
        {'flags': ['--num_workers', '5'],
         'expected': {'num_workers': 5, 'mock_flag': False}},
        {
            'flags': [
                '--profile', '--profile_location', 'gs://bucket/', 'ignored'],
            'expected': {
                'profile': True, 'profile_location': 'gs://bucket/',
                'mock_flag': False}
        },
        {'flags': ['--num_workers', '5', '--mock_flag'],
         'expected': {'num_workers': 5, 'mock_flag': True}},
    ]

    for case in test_cases:
      options = PipelineOptions(flags=case['flags'])
      self.assertDictContainsSubset(case['expected'], options.get_all_options())
      self.assertEqual(options.view_as(MockOptions).mock_flag,
                       case['expected']['mock_flag'])
Example No. 19
    def test_with_extra_packages_missing_files(self):
        staging_dir = tempfile.mkdtemp()
        with self.assertRaises(RuntimeError) as cm:

            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).extra_packages = [
                'nosuchfile.tar.gz'
            ]

            dependency.stage_job_resources(options)
        self.assertEqual(
            cm.exception.message,
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Example No. 20
 def test_with_extra_packages_invalid_file_name(self):
     staging_dir = tempfile.mkdtemp()
     source_dir = tempfile.mkdtemp()
     self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing')
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).extra_packages = [
             os.path.join(source_dir, 'abc.tgz')
         ]
         dependency.stage_job_resources(options)
     self.assertEqual(
         cm.exception.message,
         'The --extra_packages option expects a full path ending with '
         '\'.tar.gz\' instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example No. 21
    def test_sdk_location_default(self):
        staging_dir = tempfile.mkdtemp()
        expected_from_url = '%s/v%s.tar.gz' % (dependency.PACKAGES_URL_PREFIX,
                                               __version__)
        expected_from_path = self.override_file_download(
            expected_from_url, staging_dir)
        self.override_file_copy(expected_from_path, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        self.assertEqual(
            [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
            dependency.stage_job_resources(
                options, file_copy=dependency._dependency_file_copy))
Example No. 22
    def test_with_extra_packages(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'),
                              'nothing')
        self.create_temp_file(
            os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE),
            'nothing')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).extra_packages = [
            os.path.join(source_dir, 'abc.tar.gz'),
            os.path.join(source_dir, 'xyz.tar.gz'),
            'gs://my-gcs-bucket/gcs.tar.gz'
        ]

        gcs_copied_files = []

        def file_copy(from_path, to_path):
            if from_path.startswith('gs://'):
                gcs_copied_files.append(from_path)
                _, from_name = os.path.split(from_path)
                self.create_temp_file(os.path.join(to_path, from_name),
                                      'nothing')
                logging.info('Fake copied GCS file: %s to %s', from_path,
                             to_path)
            elif to_path.startswith('gs://'):
                logging.info('Faking file_copy(%s, %s)', from_path, to_path)
            else:
                shutil.copyfile(from_path, to_path)

        dependency._dependency_file_copy = file_copy

        self.assertEqual([
            'abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
            dependency.EXTRA_PACKAGES_FILE, names.PICKLED_MAIN_SESSION_FILE
        ], dependency.stage_job_resources(options))
        with open(os.path.join(staging_dir,
                               dependency.EXTRA_PACKAGES_FILE)) as f:
            self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                             f.readlines())
        self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
Example No. 23
    def test_with_requirements_file(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).requirements_file = os.path.join(
            source_dir, dependency.REQUIREMENTS_FILE)
        self.create_temp_file(
            os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
        self.assertEqual(
            [dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE],
            dependency.stage_job_resources(options))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
Example No. 24
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket'
    expected_from_path = utils.path.join(
        sdk_location,
        'google-cloud-dataflow-python-sdk-%s.tgz' % __version__)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Example No. 25
  def test_setup_file_not_named_setup_dot_py(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = (
        os.path.join(source_dir, 'xyz-setup.py'))

    self.create_temp_file(
        os.path.join(source_dir, 'xyz-setup.py'), 'notused')
    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    self.assertTrue(
        cm.exception.message.startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))
Example No. 26
  def test_override_options(self):
    base_flags = ['--num_workers', '5']
    options = PipelineOptions(base_flags)
    self.assertEqual(options.get_all_options()['num_workers'], 5)
    self.assertEqual(options.get_all_options()['mock_flag'], False)

    options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
    self.assertEqual(options.get_all_options()['num_workers'], 5)
    self.assertEqual(options.get_all_options()['mock_flag'], True)
Example No. 27
    def test_override_options(self):
        base_flags = ['--num_workers', '5']
        options = PipelineOptions(base_flags)
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], False)

        options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], True)
Example No. 28
    def test_setup_file_not_named_setup_dot_py(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = (os.path.join(
            source_dir, 'xyz-setup.py'))

        self.create_temp_file(os.path.join(source_dir, 'xyz-setup.py'),
                              'notused')
        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        self.assertTrue(
            cm.exception.message.startswith(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of '))
Example No. 29
  def test_with_requirements_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        [dependency.REQUIREMENTS_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
Example No. 30
 def test_option_with_space(self):
   options = PipelineOptions(flags=['--option with space= value with space'])
   self.assertEqual(
       getattr(options.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
   options_from_dict = PipelineOptions.from_dictionary(
       options.get_all_options())
   self.assertEqual(
       getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
               'option with space'), ' value with space')
Example No. 31
    def test_sdk_location_local(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = tempfile.mkdtemp()
        self.create_temp_file(
            os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE),
            'contents')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        self.assertEqual(
            [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
            dependency.stage_job_resources(options))
        tarball_path = os.path.join(staging_dir,
                                    names.DATAFLOW_SDK_TARBALL_FILE)
        with open(tarball_path) as f:
            self.assertEqual(f.read(), 'contents')
Example No. 32
  def test_sdk_location_default(self):
    staging_dir = tempfile.mkdtemp()
    expected_from_url = '%s/v%s.tar.gz' % (
        dependency.PACKAGES_URL_PREFIX, __version__)
    expected_from_path = self.override_file_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            file_copy=dependency._dependency_file_copy))
Example No. 33
 def test_option_with_space(self):
     options = PipelineOptions(
         flags=['--option with space= value with space'])
     self.assertEqual(
         getattr(options.view_as(PipelineOptionsTest.MockOptions),
                 'option with space'), ' value with space')
     options_from_dict = PipelineOptions.from_dictionary(
         options.get_all_options())
     self.assertEqual(
         getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
                 'option with space'), ' value with space')
Example No. 34
 def test_no_temp_location(self):
     staging_dir = tempfile.mkdtemp()
     options = PipelineOptions()
     google_cloud_options = options.view_as(GoogleCloudOptions)
     google_cloud_options.staging_location = staging_dir
     self.update_options(options)
     google_cloud_options.temp_location = None
     with self.assertRaises(RuntimeError) as cm:
         dependency.stage_job_resources(options)
     self.assertEqual('The --temp_location option must be specified.',
                      cm.exception.message)
Example No. 35
 def test_no_temp_location(self):
   staging_dir = tempfile.mkdtemp()
   options = PipelineOptions()
   google_cloud_options = options.view_as(GoogleCloudOptions)
   google_cloud_options.staging_location = staging_dir
   self.update_options(options)
   google_cloud_options.temp_location = None
   with self.assertRaises(RuntimeError) as cm:
     dependency.stage_job_resources(options)
   self.assertEqual('The --temp_location option must be specified.',
                    cm.exception.message)
Example No. 36
 def test_from_dictionary(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     all_options_dict = options.get_all_options()
     options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
     self.assertEqual(options_from_dict.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
Example No. 37
    def test_default_resources(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)

        self.assertEqual([names.PICKLED_MAIN_SESSION_FILE],
                         dependency.stage_job_resources(options))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Example No. 38
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []
    def file_copy(from_path, to_path):
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    dependency._dependency_file_copy = file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                       f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
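
One detail worth noting in the test above: it assigns the fake to dependency._dependency_file_copy and never restores the original, so the patch can leak into later tests. A minimal sketch of the same fake installed with a try/finally restore (assuming the same module attribute):

# Sketch only: install the fake copy function just for this call and restore
# the real one afterwards.
original_file_copy = dependency._dependency_file_copy
dependency._dependency_file_copy = file_copy
try:
    staged = dependency.stage_job_resources(options)
finally:
    dependency._dependency_file_copy = original_file_copy
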
Example No. 39
  def test_default_resources(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Example No. 40
 def test_from_dictionary(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         all_options_dict = options.get_all_options()
         options_from_dict = PipelineOptions.from_dictionary(
             all_options_dict)
         self.assertEqual(
             options_from_dict.view_as(
                 PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
Example No. 41
  def test_sdk_location_local(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(
            sdk_location,
            names.DATAFLOW_SDK_TARBALL_FILE),
        'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE,
         names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'contents')
Example No. 42
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz')]

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n'], f.readlines())
Example No. 43
  def test_with_requirements_file_and_cache(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    options.view_as(SetupOptions).requirements_cache = os.path.join(
        tempfile.gettempdir(), 'alternative-cache-dir')
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE,
                'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
Example No. 44
def pipeline_options_local(argv):
  """"Creating a Pipeline using a PipelineOptions object for local execution.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
  """

  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the dataflow pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)

  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))
  p.run()
Example No. 45
def model_pipelines(argv):
  """A wordcount snippet as a simple pipeline example.

  URL: https://cloud.google.com/dataflow/model/pipelines
  """
  # [START model_pipelines]
  import re

  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          dest='input',
                          default='gs://dataflow-samples/shakespeare/kinglear'
                          '.txt',
                          help='Input file to process.')
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  p = df.Pipeline(options=pipeline_options)

  (p
   | df.io.Read(df.io.TextFileSource(my_options.input))
   | df.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
   | df.Map(lambda x: (x, 1)) | df.combiners.Count.PerKey()
   | df.io.Write(df.io.TextFileSink(my_options.output)))

  p.run()
Example No. 46
class Pipeline(object):
    """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied again, a clone should be
  created with a new label (e.g., transform.clone('new label')).
  """
    def __init__(self, runner=None, options=None, argv=None):
        """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """

        if options is not None:
            if isinstance(options, PipelineOptions):
                self.options = options
            else:
                raise ValueError(
                    'Parameter options, if specified, must be of type PipelineOptions. '
                    'Received : %r' % options)
        elif argv is not None:
            if isinstance(argv, list):
                self.options = PipelineOptions(argv)
            else:
                raise ValueError(
                    'Parameter argv, if specified, must be a list. '
                    'Received : %r' % argv)
        else:
            self.options = None

        if runner is None and self.options is not None:
            runner = self.options.view_as(StandardOptions).runner
            if runner is None:
                runner = StandardOptions.DEFAULT_RUNNER
                logging.info(
                    ('Missing pipeline option (runner). Executing pipeline '
                     'using the default runner: %s.'), runner)

        if isinstance(runner, str):
            runner = create_runner(runner)
        elif not isinstance(runner, PipelineRunner):
            raise TypeError('Runner must be a PipelineRunner object or the '
                            'name of a registered runner.')

        # Validate pipeline options
        if self.options is not None:
            errors = PipelineOptionsValidator(self.options, runner).validate()
            if errors:
                raise ValueError('Pipeline has validation errors: \n' +
                                 '\n'.join(errors))

        # Default runner to be used.
        self.runner = runner
        # Stack of transforms generated by nested apply() calls. The stack will
        # contain a root node as an enclosing (parent) node for top transforms.
        self.transforms_stack = [AppliedPTransform(None, None, '', None)]
        # Set of transform labels (full labels) applied to the pipeline.
        # If a transform is applied and the full label is already in the set
        # then the transform will have to be cloned with a new label.
        self.applied_labels = set()
        # Store cache of views created from PCollections.  For reference, see
        # pvalue._cache_view().
        self._view_cache = {}

    def _current_transform(self):
        """Returns the transform currently on the top of the stack."""
        return self.transforms_stack[-1]

    def _root_transform(self):
        """Returns the root transform of the transform stack."""
        return self.transforms_stack[0]

    def run(self):
        """Runs the pipeline. Returns whatever our runner returns after running."""
        if not self.options or self.options.view_as(
                SetupOptions).save_main_session:
            # If this option is chosen, verify we can pickle the main session early.
            tmpdir = tempfile.mkdtemp()
            try:
                pickler.dump_session(
                    os.path.join(tmpdir, 'main_session.pickle'))
            finally:
                shutil.rmtree(tmpdir)
        return self.runner.run(self)

    def visit(self, visitor):
        """Visits depth-first every node of a pipeline's DAG.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.

    Raises:
      TypeError: if node is specified and is not a PValue.
      pipeline.PipelineError: if node is specified and does not belong to this
        pipeline instance.
    """

        visited = set()
        self._root_transform().visit(visitor, self, visited)

    def apply(self, transform, pvalueish=None):
        """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform (or callable) to apply.
      pvalueish: the input for the PTransform (typically a PCollection).

    Raises:
      TypeError: if the transform object extracted from the argument list is
        not a callable type or a descendant from PTransform.
      RuntimeError: if the transform object was already applied to this pipeline
        and needs to be cloned in order to apply again.
    """
        if not isinstance(transform, ptransform.PTransform):
            transform = _CallableWrapperPTransform(transform)

        full_label = format_full_label(self._current_transform(), transform)
        if full_label in self.applied_labels:
            raise RuntimeError(
                'Transform "%s" does not have a stable unique label. '
                'This will prevent updating of pipelines. '
                'To clone a transform with a new label use: '
                'transform.clone("NEW LABEL").' % full_label)
        self.applied_labels.add(full_label)

        pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
        try:
            inputs = tuple(inputs)
            for leaf_input in inputs:
                if not isinstance(leaf_input, pvalue.PValue):
                    raise TypeError
        except TypeError:
            raise NotImplementedError(
                'Unable to extract PValue inputs from %s; either %s does not accept '
                'inputs of this format, or it does not properly override '
                '_extract_input_pvalues' % (pvalueish, transform))

        current = AppliedPTransform(self._current_transform(), transform,
                                    full_label, inputs)
        self._current_transform().add_part(current)
        self.transforms_stack.append(current)

        if self.options is not None:
            type_options = self.options.view_as(TypeOptions)
        else:
            type_options = None

        if type_options is not None and type_options.pipeline_type_check:
            transform.type_check_inputs(pvalueish)

        pvalueish_result = self.runner.apply(transform, pvalueish)

        if type_options is not None and type_options.pipeline_type_check:
            transform.type_check_outputs(pvalueish_result)

        for result in ptransform.GetPValues().visit(pvalueish_result):
            assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

            # Make sure we set the producer only for a leaf node in the transform DAG.
            # This way we preserve the last transform of a composite transform as
            # being the real producer of the result.
            if result.producer is None:
                result.producer = current
            # TODO(robertwb): Multi-input, multi-output inference.
            # TODO(robertwb): Ideally we'd do intersection here.
            if (type_options is not None
                    and type_options.pipeline_type_check and isinstance(
                        result, (pvalue.PCollection, pvalue.PCollectionView))
                    and not result.element_type):
                input_element_type = (inputs[0].element_type
                                      if len(inputs) == 1 else typehints.Any)
                type_hints = transform.get_type_hints()
                declared_output_type = type_hints.simple_output_type(
                    transform.label)
                if declared_output_type:
                    input_types = type_hints.input_types
                    if input_types and input_types[0]:
                        declared_input_type = input_types[0][0]
                        result.element_type = typehints.bind_type_variables(
                            declared_output_type,
                            typehints.match_type_variables(
                                declared_input_type, input_element_type))
                    else:
                        result.element_type = declared_output_type
                else:
                    result.element_type = transform.infer_output_type(
                        input_element_type)

            assert isinstance(result.producer.inputs, tuple)
            current.add_output(result)

        if (type_options is not None
                and type_options.type_check_strictness == 'ALL_REQUIRED'
                and transform.get_type_hints().output_types is None):
            ptransform_name = '%s(%s)' % (transform.__class__.__name__,
                                          full_label)
            raise TypeCheckError(
                'Pipeline type checking is enabled, however no '
                'output type-hint was found for the '
                'PTransform %s' % ptransform_name)

        current.update_input_refcounts()
        self.transforms_stack.pop()
        return pvalueish_result
Example No. 47
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied again, a clone should be
  created with a new label (e.g., transform.clone('new label')).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """

    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r'
            % argv)
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    if self.options is not None:
      errors = PipelineOptionsValidator(self.options, runner).validate()
      if errors:
        raise ValueError(
            'Pipeline has validation errors: \n' + '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()
    # Store cache of views created from PCollections.  For reference, see
    # pvalue._cache_view().
    self._view_cache = {}

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self):
    """Runs the pipeline. Returns whatever our runner returns after running."""
    if not self.options or self.options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        shutil.rmtree(tmpdir)
    return self.runner.run(self)

  def visit(self, visitor):
    """Visits depth-first every node of a pipeline's DAG.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.

    Raises:
      TypeError: if node is specified and is not a PValue.
      pipeline.PipelineError: if node is specified and does not belong to this
        pipeline instance.
    """

    visited = set()
    self._root_transform().visit(visitor, self, visited)

  def apply(self, transform, pvalueish=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform (or callable) to apply.
      pvalueish: the input for the PTransform (typically a PCollection).

    Raises:
      TypeError: if the transform object extracted from the argument list is
        not a callable type or a descendant from PTransform.
      RuntimeError: if the transform object was already applied to this pipeline
        and needs to be cloned in order to apply again.
    """
    if not isinstance(transform, ptransform.PTransform):
      transform = _CallableWrapperPTransform(transform)

    full_label = format_full_label(self._current_transform(), transform)
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform "%s" does not have a stable unique label. '
          'This will prevent updating of pipelines. '
          'To clone a transform with a new label use: '
          'transform.clone("NEW LABEL").'
          % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    current = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(current)
    self.transforms_stack.append(current)

    if self.options is not None:
      type_options = self.options.view_as(TypeOptions)
    else:
      type_options = None

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform DAG.
      # This way we preserve the last transform of a composite transform as
      # being the real producer of the result.
      if result.producer is None:
        result.producer = current
      # TODO(robertwb): Multi-input, multi-output inference.
      # TODO(robertwb): Ideally we'd do intersection here.
      if (type_options is not None and type_options.pipeline_type_check and
          isinstance(result, (pvalue.PCollection, pvalue.PCollectionView))
          and not result.element_type):
        input_element_type = (
            inputs[0].element_type
            if len(inputs) == 1
            else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(declared_input_type,
                                               input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(input_element_type)

      assert isinstance(result.producer.inputs, tuple)
      current.add_output(result)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError('Pipeline type checking is enabled, however no '
                           'output type-hint was found for the '
                           'PTransform %s' % ptransform_name)

    current.update_input_refcounts()
    self.transforms_stack.pop()
    return pvalueish_result
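
To illustrate the unique-label rule enforced by apply() above: applying the same transform instance twice produces the same full label and raises RuntimeError, and the class docstring's suggested fix is to apply a clone with a new label. A minimal sketch, assuming a pipeline p and a PCollection lines already built with this SDK (names are hypothetical):

import google.cloud.dataflow as df

# Sketch only: a second apply() of the same instance would trigger the
# RuntimeError raised in apply() above, so a clone with a fresh label is
# applied instead, as the class docstring suggests.
counter = df.combiners.Count.PerElement()
counted_once = p.apply(counter, lines)
counted_again = p.apply(counter.clone('CountAgain'), lines)
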
Example No. 48
class ImagePreprocessor(object):
    """Runs the pre-processing pipeline.
  """
    def __init__(self, args):
        self.pipeline_options = PipelineOptions(args)

    def preprocess(self, input_path, input_dict, output_path):
        """

    Args:
      input_path: Input specified as uri to CSV file. Each line of csv file
                  contains colon-separated GCS uri to an image and labels
      input_dict: Input dictionary. Specified as text file uri.
                  Each line of the file stores one label.
    """
        opt = self.pipeline_options.view_as(PrepareImagesOptions)
        p = df.Pipeline(options=self.pipeline_options)

        # Read input data.
        csv_data = df.io.TextFileSource(input_path,
                                        strip_trailing_newlines=True)
        dict_data = df.io.TextFileSource(input_dict,
                                         strip_trailing_newlines=True)
        labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
        content = (p | df.Read(StageName.READ_CSV, csv_data)
                   | df.Map(StageName.PARSE_CSV,
                            lambda line: csv.reader([line]).next())
                   | df.ParDo(StageName.EXTRACT_LABEL_IDS,
                              ExtractLabelIdsDoFn(), df.pvalue.AsIter(labels))
                   | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

        # Process input data using common transformations.
        image_graph_uri = os.path.join(opt.input_data_location,
                                       Default.IMAGE_GRAPH_FILENAME)
        examples = (
            content
            | df.ParDo(
                StageName.CONVERT_IMAGE,
                ResizeImageDoFn(Default.IMAGE_TYPE, opt.max_image_width,
                                opt.max_image_height))
            | df.ParDo(
                StageName.ENCODE_EXAMPLE,
                EncodeExampleDoFn(image_graph_uri,
                                  opt.image_graph_jpeg_input_tensor,
                                  opt.image_graph_output_tensor,
                                  opt.training_data_percentage)))

        # Write in JSON format to Text file.
        # Remove redundant whitespace for more compact representation.
        # Images/labels are base64 encoded so will not contain spaces.
        to_json = lambda x: re.sub(
            r'\s+', ' ', json_format.MessageToJson(x[0]))

        for dataset in Dataset.ALL:
            _ = (examples
                 | df.Filter(StageName.FILTER + dataset,
                             lambda x, dataset=dataset: x[1] == dataset)
                 | df.Map(StageName.TO_JSON + dataset, to_json)
                 | df.Write(
                     StageName.SAVE + dataset,
                     df.io.TextFileSink(
                         '{}.{}.json'.format(output_path, dataset),
                         num_shards=opt.output_shard_count)))

        # Execute the pipeline.
        p.run()
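A hypothetical invocation of the class above might look like the following. The GCS paths are placeholders, and the argument list is an assumption: PrepareImagesOptions adds its own flags that are not shown here, so only generic Dataflow options appear below.

# Hypothetical usage; paths and flag values are placeholders, and any flags
# required by PrepareImagesOptions itself are omitted because they are not
# shown above.
args = [
    '--project', 'my-project-id',
    '--runner', 'BlockingDataflowPipelineRunner',
    '--staging_location', 'gs://my-bucket/staging',
    '--temp_location', 'gs://my-bucket/temp',
]
preprocessor = ImagePreprocessor(args)
preprocessor.preprocess(
    input_path='gs://my-bucket/images/train.csv',
    input_dict='gs://my-bucket/images/labels.txt',
    output_path='gs://my-bucket/preprocessed/images')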
Example No. 49
def pipeline_monitoring(renames):
  """Using monitoring interface snippets.

  URL: https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf
  """

  import re
  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the dataflow pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(df.DoFn):

    def process(self, context):
      words = re.findall(r'[A-Za-z\']+', context.element)
      for word in words:
        yield word

  class FormatCountsFn(df.DoFn):

    def process(self, context):
      word, count = context.element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(df.PTransform):

    def apply(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | df.ParDo('ExtractWords', ExtractWordsFn())
              # Count the number of times each word occurs.
              | df.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | df.ParDo('FormatCounts', FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = df.Pipeline(options=pipeline_options)

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | df.io.Read('ReadLines', df.io.TextFileSource(options.input))
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | df.io.Write('WriteCounts', df.io.TextFileSink(options.output)))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
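A short usage sketch for the custom options defined above; the argument values are hypothetical, and WordCountOptions refers to the subclass in the snippet.

# Hypothetical argument values; WordCountOptions is the subclass defined in
# the snippet above.
argv = ['--input', 'gs://my-bucket/kinglear.txt',
        '--output', 'gs://my-bucket/counts',
        '--runner', 'DirectPipelineRunner']
pipeline_options = PipelineOptions(argv)
wordcount_options = pipeline_options.view_as(WordCountOptions)
# wordcount_options.input  == 'gs://my-bucket/kinglear.txt'
# wordcount_options.output == 'gs://my-bucket/counts'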
Example No. 50
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied again, a clone should be
  created with a new label (e.g., transform.clone('new label')).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: A list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if the options or argv argument is not of the expected type.
      TypeError: if the runner is neither a PipelineRunner object nor the name
        of a registered runner.
    """

    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received: %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received: %r' % argv)
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')
    # List of PValue objects representing a DAG of transformations.
    self._nodes = []
    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()

  def _add_pvalue(self, pval):
    """Adds a PValue to the pipeline's node list."""
    if pval not in self._nodes:
      self._nodes.append(pval)

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self):
    """Runs the pipeline. Returns whatever our runner returns after running."""
    return self.runner.run(self)

  def visit(self, visitor, node=None):
    """Visits depth-first every node of a pipeline's DAG.

    If node is specified then only that node's predecessors (inputs and
    recursively their creating transforms) and outputs will be visited.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.
      node: if specified it is expected to be a PValue and only the nodes of
        the DAG reachable from this node will be visited.

    Raises:
      TypeError: if node is specified and is not a PValue.
      pipeline.PipelineError: if node is specified and does not belong to this
        pipeline instance.
    """

    # Make sure the specified node has its transform registered as an output
    # producer. We can have this situation for PCollections created as results
    # of accessing a tag of a FlatMap().with_outputs() result.
    if node is not None:
      if not isinstance(node, pvalue.PValue):
        raise TypeError(
            'Expected a PValue for the node argument instead of: %r' % node)
      if node not in self._nodes:
        raise error.PipelineError('PValue not in pipeline: %r' % node)
      assert node.producer is not None

    visited = set()
    start_transform = self._root_transform() if node is None else node.producer
    start_transform.visit(visitor, self, visited)
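
  # Usage sketch for visit(); the visitor callback name and the full_label
  # attribute are assumptions (the PipelineVisitor class is not shown in this
  # excerpt):
  #
  #   class PrintingVisitor(PipelineVisitor):
  #     def visit_transform(self, transform_node):
  #       print transform_node.full_label
  #
  #   p.visit(PrintingVisitor())  # depth-first walk of the whole DAG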

  def apply(self, transform, pvalueish=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform (or callable) to apply.
      pvalueish: the input for the PTransform (typically a PCollection).

    Raises:
      TypeError: if the transform object extracted from the argument list is
        not a callable type or a descendant from PTransform.
      RuntimeError: if the transform object was already applied to this pipeline
        and needs to be cloned in order to apply again.
    """
    if not isinstance(transform, ptransform.PTransform):

      class CallableTransform(ptransform.PTransform):

        def __init__(self, callee):
          super(CallableTransform, self).__init__(
              label=getattr(callee, '__name__', 'Callable'))
          self._callee = callee

        def apply(self, *args, **kwargs):
          return self._callee(*args, **kwargs)

      assert callable(transform)
      transform = CallableTransform(transform)

    full_label = format_full_label(self._current_transform(), transform)
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform with label %s already applied. Please clone the current '
          'instance using a new label or alternatively create a new instance. '
          'To clone a transform use: transform.clone(\'NEW LABEL\').'
          % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    child = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(child)
    self.transforms_stack.append(child)

    if self.options is not None:
      type_options = self.options.view_as(TypeOptions)
    else:
      type_options = None

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform DAG.
      # This way we preserve the last transform of a composite transform as
      # being the real producer of the result.
      if result.producer is None:
        result.producer = child
      self._current_transform().add_output(result)
      # TODO(robertwb): Multi-input, multi-output inference.
      if (type_options is not None and type_options.pipeline_type_check and
          isinstance(result, pvalue.PCollection) and not result.element_type):
        input_element_type = (
            inputs[0].element_type
            if len(inputs) == 1
            else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(declared_input_type,
                                               input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(input_element_type)

      assert isinstance(result.producer.inputs, tuple)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError('Pipeline type checking is enabled, however no '
                           'output type-hint was found for the '
                           'PTransform %s' % ptransform_name)

    self.transforms_stack.pop()
    return pvalueish_result
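A minimal construction sketch for the class above; the runner name and flag values are placeholders, and the pre-Beam google.cloud.dataflow SDK is assumed.

# Minimal sketch, assuming the same SDK as above; flag values are placeholders.
from google.cloud.dataflow.utils.options import PipelineOptions

options = PipelineOptions(['--runner', 'DirectPipelineRunner'])
p = Pipeline(options=options)  # runner is resolved from StandardOptions
# ... apply transforms to p here ...
result = p.run()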