Example #1
    def test_with_requirements_file_and_cache(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).requirements_file = os.path.join(
            source_dir, dependency.REQUIREMENTS_FILE)
        options.view_as(SetupOptions).requirements_cache = os.path.join(
            tempfile.gettempdir(), 'alternative-cache-dir')
        self.create_temp_file(
            os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
        self.assertEqual(
            sorted([dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt']),
            sorted(
                dependency.stage_job_resources(
                    options,
                    populate_requirements_cache=self.
                    populate_requirements_cache)))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
        self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
        self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
Example #2
  def test_with_requirements_file(self):
    try:
      staging_dir = tempfile.mkdtemp()
      requirements_cache_dir = tempfile.mkdtemp()
      source_dir = tempfile.mkdtemp()

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
      options.view_as(SetupOptions).requirements_file = os.path.join(
          source_dir, dependency.REQUIREMENTS_FILE)
      self.create_temp_file(
          os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
      self.assertEqual(
          sorted([dependency.REQUIREMENTS_FILE,
                  'abc.txt', 'def.txt']),
          sorted(dependency.stage_job_resources(
              options,
              populate_requirements_cache=self.populate_requirements_cache)))
      self.assertTrue(
          os.path.isfile(
              os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
    finally:
      shutil.rmtree(staging_dir)
      shutil.rmtree(requirements_cache_dir)
      shutil.rmtree(source_dir)
Example #3
    def test_with_setup_file(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = os.path.join(
            source_dir, 'setup.py')

        self.assertEqual(
            [dependency.WORKFLOW_TARBALL_FILE],
            dependency.stage_job_resources(
                options,
                # We replace the build setup command because a realistic one would
                # require the setuptools package to be installed. Note that we can't
                # use "touch" here to create the expected output tarball file, since
                # touch is not available on Windows, so we invoke python to produce
                # equivalent behavior.
                build_setup_args=[
                    'python', '-c', 'open(__import__("sys").argv[1], "a")',
                    os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)
                ],
                temp_dir=source_dir))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #4
  def test_with_setup_file(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #5
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_locally', dest='run_locally', default='', help='Run data subset and do not save.')
    known_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    delete_from_datastore('dancedeets-hrd', gcloud_options, known_args.run_locally)
Example #6
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--kind',
                        dest='kind',
                        required=True,
                        help='Datastore Kind')
    parser.add_argument('--namespace',
                        dest='namespace',
                        help='Datastore Namespace')
    parser.add_argument('--ancestor',
                        dest='ancestor',
                        default='root',
                        help='The ancestor key name for all entities.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--read_only',
                        action='store_true',
                        help='Read an existing dataset, do not write first')
    parser.add_argument(
        '--num_shards',
        dest='num_shards',
        type=int,
        # If the system should choose automatically.
        default=0,
        help='Number of output shards')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

    # Write to Datastore if the `read_only` option is not specified.
    if not known_args.read_only:
        write_to_datastore(gcloud_options.project, known_args,
                           pipeline_options)

    # Read entities from Datastore.
    result = read_from_datastore(gcloud_options.project, known_args,
                                 pipeline_options)

    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
        empty_lines_counter = query_result['counters'][0]
        logging.info('number of empty lines: %d',
                     empty_lines_counter.committed)
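A note on the save_main_session comment that recurs in these examples: a DoFn that refers to names defined at module level (such as an imported module) fails on remote workers unless the main session is pickled and staged. The sketch below is illustrative only; ExtractTokensFn and the surrounding setup are hypothetical and not part of the example above.

import re  # module-level import deliberately referenced inside the DoFn

import apache_beam as beam
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions


class ExtractTokensFn(beam.DoFn):
    def process(self, element):
        # `re` lives in the main module's globals; without save_main_session
        # a remote worker would raise NameError when this line runs.
        return re.findall(r'[A-Za-z\']+', element)


options = PipelineOptions()
options.view_as(SetupOptions).save_main_session = True
p = beam.Pipeline(options=options)
lines = p | beam.Create(['to be or not to be'])
lines | beam.ParDo(ExtractTokensFn())
p.run()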
Example #7
    def test_no_main_session(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()

        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        options.view_as(SetupOptions).save_main_session = False
        self.update_options(options)

        self.assertEqual([], dependency.stage_job_resources(options))
Example #8
def examples_wordcount_minimal(renames):
    """MinimalWordCount example snippets."""
    import re

    import apache_beam as beam

    from apache_beam.utils.pipeline_options import GoogleCloudOptions
    from apache_beam.utils.pipeline_options import StandardOptions
    from apache_beam.utils.pipeline_options import PipelineOptions

    # [START examples_wordcount_minimal_options]
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
    google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    # [END examples_wordcount_minimal_options]

    # Run it locally for testing.
    options = PipelineOptions()

    # [START examples_wordcount_minimal_create]
    p = beam.Pipeline(options=options)
    # [END examples_wordcount_minimal_create]

    (
        # [START examples_wordcount_minimal_read]
        p |
        beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
        # [END examples_wordcount_minimal_read]

        # [START examples_wordcount_minimal_pardo]
        |
        'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
        # [END examples_wordcount_minimal_pardo]

        # [START examples_wordcount_minimal_count]
        | beam.combiners.Count.PerElement()
        # [END examples_wordcount_minimal_count]

        # [START examples_wordcount_minimal_map]
        | beam.Map(lambda (word, count): '%s: %s' % (word, count))
        # [END examples_wordcount_minimal_map]

        # [START examples_wordcount_minimal_write]
        | beam.io.WriteToText('gs://my-bucket/counts.txt')
        # [END examples_wordcount_minimal_write]
    )

    p.visit(SnippetUtils.RenameFiles(renames))

    # [START examples_wordcount_minimal_run]
    result = p.run()
    # [END examples_wordcount_minimal_run]
    result.wait_until_finish()
Example #9
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
Example #10
File: wordcount.py  Project: zhpshu/beam
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    class WordcountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_value_provider_argument(
                '--input',
                dest='input',
                default='gs://dataflow-samples/shakespeare/kinglear.txt',
                help='Input file to process.')
            parser.add_value_provider_argument(
                '--output',
                dest='output',
                required=True,
                help='Output file to write results to.')

    pipeline_options = PipelineOptions(argv)
    wordcount_options = pipeline_options.view_as(WordcountOptions)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(wordcount_options.input)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' %
                                           (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(wordcount_options.output)

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.committed)
Example #11
 def test_get_all_options(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     self.assertDictContainsSubset(case['expected'], options.get_all_options())
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
Example #12
 def test_get_all_options(self):
   for case in PipelineOptionsTest.TEST_CASES:
     options = PipelineOptions(flags=case['flags'])
     self.assertDictContainsSubset(case['expected'], options.get_all_options())
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_flag,
                      case['expected']['mock_flag'])
     self.assertEqual(options.view_as(
         PipelineOptionsTest.MockOptions).mock_option,
                      case['expected']['mock_option'])
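The two identical tests above refer to a PipelineOptionsTest.MockOptions class that is not reproduced here. As a rough, hypothetical sketch (the flag names below only mirror what the assertions expect), such a custom options subclass is declared through the same _add_argparse_args hook used elsewhere in this collection:

from apache_beam.utils.pipeline_options import PipelineOptions


class MockOptions(PipelineOptions):
  """Hypothetical stand-in for PipelineOptionsTest.MockOptions."""

  @classmethod
  def _add_argparse_args(cls, parser):
    # Each argument becomes an attribute on any view of the options object.
    parser.add_argument('--mock_flag', action='store_true')
    parser.add_argument('--mock_option')


options = PipelineOptions(flags=['--mock_flag', '--mock_option', 'abc'])
print(options.view_as(MockOptions).mock_flag)    # True
print(options.view_as(MockOptions).mock_option)  # abc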
Example #13
  def test_no_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Example #14
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_locally',
                        dest='run_locally',
                        default='',
                        help='Run data subset and do not save.')
    known_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    delete_from_datastore('dancedeets-hrd', gcloud_options,
                          known_args.run_locally)
Example #15
 def __init_pipeline(self):
     pipeline_args = self.config['pipeline_args']
     options = PipelineOptions()
     google_cloud_options = options.view_as(GoogleCloudOptions)
     google_cloud_options.project = pipeline_args['project']
     google_cloud_options.job_name = pipeline_args['job_name']
     google_cloud_options.staging_location = pipeline_args['staging_location']
     google_cloud_options.temp_location = pipeline_args['temp_location']
     options.view_as(StandardOptions).runner = pipeline_args['runner']
     options.view_as(SetupOptions).setup_file = pipeline_args['setup_file']
     options.view_as(SetupOptions).save_main_session = True
     return beam.Pipeline(options=options)
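Example #15 sets each field through typed views of a PipelineOptions object. When the configuration is already a flat dictionary, PipelineOptions.from_dictionary (exercised by the option round-trip tests later in this collection) is an alternative; the keys and values below are hypothetical:

from apache_beam.utils.pipeline_options import GoogleCloudOptions
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import StandardOptions

# Hypothetical flat configuration; keys must match the option names that the
# corresponding PipelineOptions views expect.
pipeline_args = {
    'project': 'my-project-id',
    'job_name': 'myjob',
    'staging_location': 'gs://my-bucket/staging',
    'temp_location': 'gs://my-bucket/temp',
    'runner': 'DirectRunner',
}

options = PipelineOptions.from_dictionary(pipeline_args)
print(options.view_as(GoogleCloudOptions).staging_location)  # gs://my-bucket/staging
print(options.view_as(StandardOptions).runner)               # DirectRunner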
Example #16
    def test_sdk_location_gcs(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
        self.override_file_copy(sdk_location, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
                         dependency.stage_job_resources(options))
Example #17
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--kind',
                      dest='kind',
                      required=True,
                      help='Datastore Kind')
  parser.add_argument('--namespace',
                      dest='namespace',
                      help='Datastore Namespace')
  parser.add_argument('--ancestor',
                      dest='ancestor',
                      default='root',
                      help='The ancestor key name for all entities.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--read_only',
                      action='store_true',
                      help='Read an existing dataset, do not write first')
  parser.add_argument('--num_shards',
                      dest='num_shards',
                      type=int,
                      # If the system should choose automatically.
                      default=0,
                      help='Number of output shards')

  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)

  # Write to Datastore if the `read_only` option is not specified.
  if not known_args.read_only:
    write_to_datastore(gcloud_options.project, known_args, pipeline_options)

  # Read entities from Datastore.
  result = read_from_datastore(gcloud_options.project, known_args,
                               pipeline_options)

  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
Example #18
    def test_with_extra_packages(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz2.tar'), 'nothing')
        self.create_temp_file(os.path.join(source_dir, 'whl.whl'), 'nothing')
        self.create_temp_file(
            os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE),
            'nothing')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).extra_packages = [
            os.path.join(source_dir, 'abc.tar.gz'),
            os.path.join(source_dir, 'xyz.tar.gz'),
            os.path.join(source_dir, 'xyz2.tar'),
            os.path.join(source_dir, 'whl.whl'),
            'gs://my-gcs-bucket/gcs.tar.gz'
        ]

        gcs_copied_files = []

        def file_copy(from_path, to_path):
            if from_path.startswith('gs://'):
                gcs_copied_files.append(from_path)
                _, from_name = os.path.split(from_path)
                self.create_temp_file(os.path.join(to_path, from_name),
                                      'nothing')
                logging.info('Fake copied GCS file: %s to %s', from_path,
                             to_path)
            elif to_path.startswith('gs://'):
                logging.info('Faking file_copy(%s, %s)', from_path, to_path)
            else:
                shutil.copyfile(from_path, to_path)

        dependency._dependency_file_copy = file_copy

        self.assertEqual([
            'abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
            dependency.EXTRA_PACKAGES_FILE
        ], dependency.stage_job_resources(options))
        with open(os.path.join(staging_dir,
                               dependency.EXTRA_PACKAGES_FILE)) as f:
            self.assertEqual([
                'abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n',
                'gcs.tar.gz\n'
            ], f.readlines())
        self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
Example #19
    def test_with_main_session(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()

        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        options.view_as(SetupOptions).save_main_session = True
        self.update_options(options)

        self.assertEqual([names.PICKLED_MAIN_SESSION_FILE],
                         dependency.stage_job_resources(options))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Example #20
 def test_requirements_file_not_present(self):
   staging_dir = tempfile.mkdtemp()
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).requirements_file = 'nosuchfile'
     dependency.stage_job_resources(
         options, populate_requirements_cache=self.populate_requirements_cache)
   self.assertEqual(
       cm.exception.message,
       'The file %s cannot be found. It was specified in the '
       '--requirements_file command line option.' % 'nosuchfile')
Example #21
  def test_sdk_location_gcs(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
Example #22
  def test_with_main_session(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    options.view_as(SetupOptions).save_main_session = True
    self.update_options(options)

    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE],
        dependency.stage_job_resources(options))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
Example #23
  def test_with_extra_packages_missing_files(self):
    staging_dir = tempfile.mkdtemp()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    self.assertEqual(
        cm.exception.message,
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Example #24
def pipeline_options_remote(argv):
    """Creating a Pipeline using a PipelineOptions object for remote execution."""

    from apache_beam import Pipeline
    from apache_beam.utils.pipeline_options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)

    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')

    # [END pipeline_options_define_custom]

    from apache_beam.utils.pipeline_options import GoogleCloudOptions
    from apache_beam.utils.pipeline_options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    p = TestPipeline()  # Use TestPipeline for testing.

    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)

    p.run()
Example #25
    def test_setup_file_not_present(self):
        staging_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = 'nosuchfile'

        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        self.assertEqual(
            cm.exception.message,
            'The file %s cannot be found. It was specified in the '
            '--setup_file command line option.' % 'nosuchfile')
Example #26
    def test_sdk_location_local_not_present(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'nosuchdir'
        with self.assertRaises(RuntimeError) as cm:
            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).sdk_location = sdk_location

            dependency.stage_job_resources(options)
        self.assertEqual(
            'The file "%s" cannot be found. Its '
            'location was specified by the --sdk_location command-line option.'
            % sdk_location, cm.exception.message)
Example #27
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
Example #28
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  class WordcountOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_value_provider_argument(
          '--input',
          dest='input',
          default='gs://dataflow-samples/shakespeare/kinglear.txt',
          help='Input file to process.')
      parser.add_value_provider_argument(
          '--output',
          dest='output',
          required=True,
          help='Output file to write results to.')
  pipeline_options = PipelineOptions(argv)
  wordcount_options = pipeline_options.view_as(WordcountOptions)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(wordcount_options.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(wordcount_options.output)

  # Actually run the pipeline (all operations above are deferred).
  result = p.run()
  result.wait_until_finish()
  empty_lines_filter = MetricsFilter().with_name('empty_lines')
  query_result = result.metrics().query(empty_lines_filter)
  if query_result['counters']:
    empty_lines_counter = query_result['counters'][0]
    logging.info('number of empty lines: %d', empty_lines_counter.committed)
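Because --input and --output above are declared with add_value_provider_argument, wordcount_options.input and wordcount_options.output are ValueProvider objects: I/O transforms such as ReadFromText and WriteToText accept them directly, while user code normally resolves them at execution time with .get(). A minimal, hypothetical sketch (LogInputPathFn is not part of the example above):

import apache_beam as beam


class LogInputPathFn(beam.DoFn):
  """Hypothetical DoFn that resolves a templated option at run time."""

  def __init__(self, input_path):
    # input_path is a ValueProvider; for templated pipelines its value is
    # generally not available at construction time.
    self.input_path = input_path

  def process(self, element):
    # .get() should only be called while the pipeline is executing.
    yield '%s (read from %s)' % (element, self.input_path.get())


# Usage, assuming the `lines` and `wordcount_options` defined above:
#   lines | 'tag' >> beam.ParDo(LogInputPathFn(wordcount_options.input))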
Example #29
  def test_sdk_location_local_not_present(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.message)
Example #30
 def test_with_extra_packages_invalid_file_name(self):
   staging_dir = tempfile.mkdtemp()
   source_dir = tempfile.mkdtemp()
   self.create_temp_file(
       os.path.join(source_dir, 'abc.tgz'), 'nothing')
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).extra_packages = [
         os.path.join(source_dir, 'abc.tgz')]
     dependency.stage_job_resources(options)
   self.assertEqual(
       cm.exception.message,
       'The --extra_package option expects a full path ending with ".tar" or '
       '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #31
def model_pcollection(argv):
    """Creating a PCollection from data in local memory."""
    from apache_beam.utils.pipeline_options import PipelineOptions

    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--output',
                                dest='output',
                                required=True,
                                help='Output file to write results to.')

    pipeline_options = PipelineOptions(argv)
    my_options = pipeline_options.view_as(MyOptions)

    # [START model_pcollection]
    p = beam.Pipeline(options=pipeline_options)

    (p
     | beam.Create([
         'To be, or not to be: that is the question: ',
         'Whether \'tis nobler in the mind to suffer ',
         'The slings and arrows of outrageous fortune, ',
         'Or to take arms against a sea of troubles, '
     ])
     | beam.io.WriteToText(my_options.output))

    result = p.run()
    # [END model_pcollection]
    result.wait_until_finish()
Example #32
def run(argv=None):
    """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/wikipedia_edits/*.json',
        help=
        'Input specified as a GCS path containing a BigQuery table exported '
        'as json.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--sampling_threshold',
                        type=float,
                        default=0.1,
                        help='Fraction of entries used for session tracking')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p  # pylint: disable=expression-not-assigned
     | ReadFromText(known_args.input)
     | ComputeTopSessions(known_args.sampling_threshold)
     | WriteToText(known_args.output))

    p.run()
Example #33
    def test_with_extra_packages_missing_files(self):
        staging_dir = tempfile.mkdtemp()
        with self.assertRaises(RuntimeError) as cm:

            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).extra_packages = [
                'nosuchfile.tar.gz'
            ]

            dependency.stage_job_resources(options)
        self.assertEqual(
            cm.exception.message,
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % 'nosuchfile.tar.gz')
Example #34
def run(argv=None):
  """Main entry point; defines and runs the tfidf pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--uris',
                      required=True,
                      help='URIs to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read documents specified by the uris command line option.
  pcoll = read_documents(p, glob.glob(known_args.uris))
  # Compute TF-IDF information for each word.
  output = pcoll | TfIdf()
  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)
  # Execute the pipeline and wait until it is completed.
  p.run().wait_until_finish()
Example #35
    def test_default_resources(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)

        self.assertEqual([], dependency.stage_job_resources(options))
Example #36
def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.utils.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = beam.Pipeline(options=pipeline_options)

  (p
   | beam.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | beam.io.WriteToText(my_options.output))

  result = p.run()
  # [END model_pcollection]
  result.wait_until_finish()
Example #37
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  (p  # pylint: disable=expression-not-assigned
   | ReadFromText(known_args.input)
   | ComputeTopSessions(known_args.sampling_threshold)
   | WriteToText(known_args.output))

  p.run()
Example #38
    def test_sdk_location_default(self):
        staging_dir = tempfile.mkdtemp()
        expected_from_url = 'pypi'
        expected_from_path = self.override_pypi_download(
            expected_from_url, staging_dir)
        self.override_file_copy(expected_from_path, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
                         dependency.stage_job_resources(
                             options,
                             file_copy=dependency._dependency_file_copy))
Example #39
 def test_with_extra_packages_invalid_file_name(self):
     staging_dir = tempfile.mkdtemp()
     source_dir = tempfile.mkdtemp()
     self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing')
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).extra_packages = [
             os.path.join(source_dir, 'abc.tgz')
         ]
         dependency.stage_job_resources(options)
     self.assertEqual(
         cm.exception.message,
         'The --extra_package option expects a full path ending with ".tar" or '
         '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #40
  def test_with_extra_packages(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz2.tar'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'whl.whl'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        os.path.join(source_dir, 'xyz2.tar'),
        os.path.join(source_dir, 'whl.whl'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []

    def file_copy(from_path, to_path):
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    dependency._dependency_file_copy = file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE],
        dependency.stage_job_resources(options))
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
      self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n',
                        'whl.whl\n', 'gcs.tar.gz\n'], f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
Example #41
def run(argv=None):
  """Run the workflow."""

  # Parse the input options
  parser = argparse.ArgumentParser()
  parser.add_argument('--runner', dest='runner', default='DirectRunner')
  parser.add_argument('--input_file', dest='input_file', default='csv_sample.csv')
  parser.add_argument('--input_schema', dest='input_schema', default='')
  parser.add_argument('--input_delimiter', dest='input_delimiter', default=',')
  parser.add_argument('--output_project', dest='output_project', default='mam-cloud')
  parser.add_argument('--output_dataset', dest='output_dataset', default='dummy')
  parser.add_argument('--output_table', dest='output_table', default='df_from_csv')
  parser.add_argument('--cat_read', dest='cat_read', default='10000', help='Choose big enough to read at least 2 lines of the csv')
  parser.add_argument('--skip_row', dest='skip_row', default=1, help='We generally assume that the first row in the csv contains column names')

  known_args, pipeline_args = parser.parse_known_args(argv)

  # Set the options based on the runner
  if known_args.runner == 'DataflowRunner':
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--staging_location=gs://bq-connector-pii/staging',
        '--temp_location=gs://bq-connector-pii/temp',
        '--job_name=csv2bq-{}'.format(str(datetime.datetime.now()).replace("-","").replace(".","").replace(":","").replace(" ","")),
    ])

  # Create the pipeline
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Prepares the schema as a side input unless one is given. There is no `head`
  # for GCS, so we read a byte range big enough to cover at least 2 lines: 10000
  fields = known_args.input_schema
  row2 = ''  # Sample data row; only filled in when the schema is inferred below.
  if fields == '':
      cmd_head = "gsutil cat -r 0-{} ".format(known_args.cat_read) if known_args.input_file[:5] == "gs://" else "head -n 2 "
      cmd_head = cmd_head + known_args.input_file
      row1_n_row2 = subprocess.check_output(cmd_head.split(" ")).split("\n")
      fields = row1_n_row2[0]
      row2 = row1_n_row2[1]

  si_fields, table_schema = SchemaSideInput(fields, row2).parseSchema()

  # Reads the csv file, skipping the header row since the schema is provided separately
  csv_lines = p | 'Read CSV' >> ReadFromText(known_args.input_file, skip_header_lines=known_args.skip_row)

  # Converts to TableRow
  bq_rows = csv_lines | 'Convert to BQ rows' >> beam.ParDo(ConvertToTableRowFn(), si_fields).with_output_types(unicode)

  # Writes to BigQuery
  bq_rows | 'write' >> beam.io.Write(
      beam.io.BigQuerySink(
          "{}:{}.{}".format(known_args.output_project, known_args.output_dataset, known_args.output_table),
          schema=table_schema,
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

  # Run the pipeline (all operations are deferred until run() is called).
  p.run()
Example #42
  def test_sdk_location_default(self):
    staging_dir = tempfile.mkdtemp()
    expected_from_url = 'pypi'
    expected_from_path = self.override_pypi_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            file_copy=dependency._dependency_file_copy))
Example #43
def pipeline_monitoring(renames):
    """Using monitoring interface snippets."""

    import re
    import apache_beam as beam
    from apache_beam.utils.pipeline_options import PipelineOptions

    class WordCountOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='output for the pipeline',
                                default='gs://my-bucket/output')

    class ExtractWordsFn(beam.DoFn):
        def process(self, element):
            words = re.findall(r'[A-Za-z\']+', element)
            for word in words:
                yield word

    class FormatCountsFn(beam.DoFn):
        def process(self, element):
            word, count = element
            yield '%s: %s' % (word, count)

    # [START pipeline_monitoring_composite]
    # The CountWords Composite Transform inside the WordCount pipeline.
    class CountWords(beam.PTransform):
        def expand(self, pcoll):
            return (pcoll
                    # Convert lines of text into individual words.
                    | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
                    # Count the number of times each word occurs.
                    | beam.combiners.Count.PerElement()
                    # Format each word and count into a printable string.
                    | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

    # [END pipeline_monitoring_composite]

    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(WordCountOptions)
    p = TestPipeline()  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
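The monitoring snippet above wires the pipeline together but does not define any user metric of its own. As a hedged sketch (assuming the Metrics API in apache_beam.metrics.metric, which accompanies the MetricsFilter class these examples already use), a counter can be attached to the word-extraction step and queried after the run the same way the wordcount examples earlier in this collection query 'empty_lines':

import re

import apache_beam as beam
from apache_beam.metrics.metric import Metrics
from apache_beam.metrics.metric import MetricsFilter


class ExtractWordsWithCounterFn(beam.DoFn):
    """Hypothetical variant of ExtractWordsFn that records empty lines."""

    def __init__(self):
        self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')

    def process(self, element):
        if not element.strip():
            self.empty_line_counter.inc()
        for word in re.findall(r'[A-Za-z\']+', element):
            yield word


# After `result = p.run()`, the counter is read back with the familiar
# MetricsFilter pattern:
#   query = result.metrics().query(MetricsFilter().with_name('empty_lines'))
#   if query['counters']:
#       print(query['counters'][0].committed)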
Example #44
def run(argv=None):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()

    # The default maps to two large Google Cloud Storage files (each ~12GB)
    # holding two subsequent day's worth (roughly) of data.
    parser.add_argument('--input',
                        dest='input',
                        default='gs://dataflow-samples/game/gaming_data*.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--dataset',
                        dest='dataset',
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        dest='table_name',
        default='hourly_team_score',
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument(
        '--window_duration',
        type=int,
        default=60,
        help='Numeric value of fixed window duration, in minutes')
    parser.add_argument('--start_min',
                        dest='start_min',
                        default='1970-01-01-00-00',
                        help='String representation of the first minute after '
                        'which to generate results in the format: '
                        'yyyy-MM-dd-HH-mm. Any input data timestamped '
                        'prior to that minute won\'t be included in the '
                        'sums.')
    parser.add_argument('--stop_min',
                        dest='stop_min',
                        default='2100-01-01-00-00',
                        help='String representation of the first minute for '
                        'which to generate results in the format: '
                        'yyyy-MM-dd-HH-mm. Any input data timestamped '
                        'after that minute won\'t be included in the '
                        'sums.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=pipeline_options)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    (p  # pylint: disable=expression-not-assigned
     | ReadFromText(known_args.input)
     | HourlyTeamScore(known_args.start_min, known_args.stop_min,
                       known_args.window_duration)
     | WriteWindowedToBigQuery(known_args.table_name, known_args.dataset,
                               configure_bigquery_write()))

    result = p.run()
    result.wait_until_finish()
Example #45
  def test_default_resources(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    self.assertEqual(
        [],
        dependency.stage_job_resources(options))
Example #46
    def test_sdk_location_local(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = tempfile.mkdtemp()
        self.create_temp_file(
            os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE),
            'contents')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
                         dependency.stage_job_resources(options))
        tarball_path = os.path.join(staging_dir,
                                    names.DATAFLOW_SDK_TARBALL_FILE)
        with open(tarball_path) as f:
            self.assertEqual(f.read(), 'contents')
Example #47
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      # CHANGE 1/5: The Google Cloud Storage path is required
                      # for outputting the results.
                      default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
      # run your pipeline on the Google Cloud Dataflow Service.
      '--runner=DirectRunner',
      # CHANGE 3/5: Your project ID is required in order to run your pipeline on
      # the Google Cloud Dataflow Service.
      '--project=SET_YOUR_PROJECT_ID_HERE',
      # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
      # files.
      '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
      # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
      # files.
      '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
      '--job_name=your-wordcount-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                          .with_output_types(unicode))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
Example #48
    def test_setup_file_not_named_setup_dot_py(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = (os.path.join(
            source_dir, 'xyz-setup.py'))

        self.create_temp_file(os.path.join(source_dir, 'xyz-setup.py'),
                              'notused')
        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        self.assertTrue(
            cm.exception.message.startswith(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of '))
Example #49
    def test_override_options(self):
        base_flags = ['--num_workers', '5']
        options = PipelineOptions(base_flags)
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], False)

        options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertTrue(options.get_all_options()['mock_flag'])
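The MockOptions view referenced in these tests is not shown in this snippet. As a rough sketch only, a custom view can be declared by subclassing PipelineOptions and overriding _add_argparse_args; the flag names below are taken from the assertions above, and the import path assumes a recent SDK (older releases exposed PipelineOptions from apache_beam.utils.pipeline_options).

from apache_beam.options.pipeline_options import PipelineOptions


class MockOptions(PipelineOptions):
  """Hypothetical stand-in for PipelineOptionsTest.MockOptions."""

  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument('--mock_flag', action='store_true', default=False)
    parser.add_argument('--mock_option', default=None)


options = PipelineOptions(['--num_workers', '5'])
options.view_as(MockOptions).mock_flag = True
print(options.get_all_options()['num_workers'])  # 5
print(options.get_all_options()['mock_flag'])    # True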
Example #51
  def test_setup_file_not_named_setup_dot_py(self):
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = (
        os.path.join(source_dir, 'xyz-setup.py'))

    self.create_temp_file(
        os.path.join(source_dir, 'xyz-setup.py'), 'notused')
    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    self.assertTrue(
        cm.exception.message.startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))
Example #53
  def test_option_with_space(self):
    options = PipelineOptions(flags=['--option with space= value with space'])
    self.assertEqual(
        getattr(options.view_as(PipelineOptionsTest.MockOptions),
                'option with space'), ' value with space')
    options_from_dict = PipelineOptions.from_dictionary(
        options.get_all_options())
    self.assertEqual(
        getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
                'option with space'), ' value with space')
Example #54
def run(argv=None):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  # The default maps to two large Google Cloud Storage files (each ~12GB)
  # holding roughly two consecutive days' worth of data.
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/game/gaming_data*.csv',
                      help='Path to the data file(s) containing game data.')
  parser.add_argument('--dataset',
                      dest='dataset',
                      required=True,
                      help='BigQuery Dataset to write tables to. '
                           'Must already exist.')
  parser.add_argument('--table_name',
                      dest='table_name',
                      default='hourly_team_score',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--window_duration',
                      type=int,
                      default=60,
                      help='Numeric value of fixed window duration, in minutes')
  parser.add_argument('--start_min',
                      dest='start_min',
                      default='1970-01-01-00-00',
                      help='String representation of the first minute after '
                           'which to generate results in the format: '
                           'yyyy-MM-dd-HH-mm. Any input data timestamped '
                           'prior to that minute won\'t be included in the '
                           'sums.')
  parser.add_argument('--stop_min',
                      dest='stop_min',
                      default='2100-01-01-00-00',
                      help='String representation of the first minute for '
                           'which to generate results in the format: '
                           'yyyy-MM-dd-HH-mm. Any input data timestamped '
                           'after that minute won\'t be included in the '
                           'sums.')

  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=pipeline_options)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  (p  # pylint: disable=expression-not-assigned
   | ReadFromText(known_args.input)
   | HourlyTeamScore(
       known_args.start_min, known_args.stop_min, known_args.window_duration)
   | WriteWindowedToBigQuery(
       known_args.table_name, known_args.dataset, configure_bigquery_write()))

  result = p.run()
  result.wait_until_finish()
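The --start_min/--stop_min values are plain 'yyyy-MM-dd-HH-mm' strings; the transform that actually interprets them (HourlyTeamScore) is not shown in this snippet. Purely as an illustration, a helper like the hypothetical one below could turn such a string into a Unix timestamp (assuming the strings are UTC), which is the kind of value event-time filtering needs.

import calendar
from datetime import datetime


def minute_str_to_timestamp(minute_str):
  """Hypothetical helper: 'yyyy-MM-dd-HH-mm' (UTC) -> Unix timestamp."""
  dt = datetime.strptime(minute_str, '%Y-%m-%d-%H-%M')
  return float(calendar.timegm(dt.timetuple()))


print(minute_str_to_timestamp('1970-01-01-00-00'))  # 0.0
print(minute_str_to_timestamp('2100-01-01-00-00'))  # 4102444800.0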
Example #55
def run(known_args, pipeline_args):
    network = MinimalNetwork()

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(SetupOptions).extra_packages = [ml.sdk_location]
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    pipeline_options.view_as(GoogleCloudOptions).staging_location = \
        os.path.join(known_args.staging)
    pipeline_options.view_as(GoogleCloudOptions).temp_location = os.path.join(
        known_args.staging, 'tmp')
    pipeline_options.view_as(GoogleCloudOptions).job_name = str(
        network).replace('_', '').lower()

    beam.coders.registry.register_coder(tf.train.Example, ExampleProtoCoder)
    p = beam.Pipeline(options=pipeline_options)

    # Read Example data
    def parse_example(example):
        #TODO: add actual implementation
        yield example

    network_input = (p
                     | 'readExamples' >> beam.io.ReadFromText(known_args.input)
                     | 'processExamples' >>
                     beam.FlatMap(lambda example: parse_example(example)))

    examples = network_input | 'encodeExamples' >> beam.Map(
        lambda raw_input: network.preprocess(raw_input))

    # Write the serialized compressed protocol buffers to Cloud Storage.
    _ = examples | beam.io.Write(
        'writeExamples',
        tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(known_args.output, 'examples'),
            compression_type=fileio.CompressionTypes.GZIP,
            coder=ExampleProtoCoder(),
            file_name_suffix='.tfrecord.gz'))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
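A note on the write step above: it uses the older label-as-first-argument form, wrapping WriteToTFRecord inside beam.io.Write. In current Beam SDKs the label goes on the left of '>>' and WriteToTFRecord is applied directly, with compression inferred from the '.gz' suffix. A minimal standalone sketch under those assumptions (the local output path and the in-memory byte records are placeholders):

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (p
       | 'createExamples' >> beam.Create([b'serialized-proto-1',
                                          b'serialized-proto-2'])
       # With the default bytes coder the elements must already be serialized.
       | 'writeExamples' >> beam.io.WriteToTFRecord(
           file_path_prefix='/tmp/examples',
           file_name_suffix='.tfrecord.gz'))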
Example #56
  def test_from_dictionary(self):
    for case in PipelineOptionsTest.TEST_CASES:
      options = PipelineOptions(flags=case['flags'])
      all_options_dict = options.get_all_options()
      options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
      self.assertEqual(options_from_dict.view_as(
          PipelineOptionsTest.MockOptions).mock_flag,
                       case['expected']['mock_flag'])
      self.assertEqual(options.view_as(
          PipelineOptionsTest.MockOptions).mock_option,
                       case['expected']['mock_option'])
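A minimal standalone sketch of the round trip exercised above, assuming a recent SDK (older releases imported PipelineOptions from apache_beam.utils.pipeline_options): the options are dumped to a plain dict with get_all_options() and an equivalent PipelineOptions is rebuilt with from_dictionary().

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--job_name', 'demo-job'])
as_dict = options.get_all_options()            # plain dict: option name -> value
rebuilt = PipelineOptions.from_dictionary(as_dict)
print(rebuilt.get_all_options()['job_name'])   # demo-job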
Example #57
  def test_no_temp_location(self):
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.staging_location = staging_dir
    self.update_options(options)
    google_cloud_options.temp_location = None
    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    self.assertEqual('The --temp_location option must be specified.',
                     cm.exception.message)
Example #58
  def test_sdk_location_local(self):
    staging_dir = tempfile.mkdtemp()
    sdk_location = tempfile.mkdtemp()
    self.create_temp_file(
        os.path.join(
            sdk_location,
            names.DATAFLOW_SDK_TARBALL_FILE),
        'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'contents')
Example #59
def run(argv=None):
  # pylint: disable=expression-not-assigned

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file pattern to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file pattern to write results to.')
  parser.add_argument('--checksum_output',
                      help='Checksum output file pattern.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | ReadFromText(known_args.input, coder=beam.coders.BytesCoder())

  # Count the occurrences of each word.
  output = (lines
            | 'split' >> beam.Map(
                lambda x: (x[:10], x[10:99]))
            .with_output_types(beam.typehints.KV[str, str])
            | 'group' >> beam.GroupByKey()
            | 'format' >> beam.FlatMap(
                lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

  # Write the output using a "Write" transform that has side effects.
  output | WriteToText(known_args.output)

  # Optionally write the input and output checksums.
  if known_args.checksum_output:
    input_csum = (lines
                  | 'input-csum' >> beam.Map(crc32line)
                  | 'combine-input-csum' >> beam.CombineGlobally(sum)
                  | 'hex-format' >> beam.Map(lambda x: '%x' % x))
    input_csum | 'write-input-csum' >> WriteToText(
        known_args.checksum_output + '-input')

    output_csum = (output
                   | 'output-csum' >> beam.Map(crc32line)
                   | 'combine-output-csum' >> beam.CombineGlobally(sum)
                   | 'hex-format-output' >> beam.Map(lambda x: '%x' % x))
    output_csum | 'write-output-csum' >> WriteToText(
        known_args.checksum_output + '-output')

  # Actually run the pipeline (all operations above are deferred).
  return p.run()
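The crc32line helper used in the checksum branches is not defined in this snippet. One plausible definition, shown only as an assumption, uses zlib.crc32 masked to an unsigned 32-bit value so the per-line checksums can be summed and hex-formatted downstream.

import zlib


def crc32line(line):
  """Hypothetical helper: unsigned CRC32 of one line (bytes or text)."""
  if isinstance(line, str):
    line = line.encode('utf-8')
  return zlib.crc32(line) & 0xffffffff


print('%x' % crc32line(b'0123456789' + b'x' * 89))  # checksum of one record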
Example #60
def run(argv=None):
  """Runs the workflow counting the long words and short words separately."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output prefix for files to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  lines = p | ReadFromText(known_args.input)

  # with_outputs allows accessing the side outputs of a DoFn.
  split_lines_result = (lines
                        | beam.ParDo(SplitLinesToWordsFn()).with_outputs(
                            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
                            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                            main='words'))

  # split_lines_result is an object of type DoOutputsTuple. It supports
  # accessing result in alternative ways.
  words, _, _ = split_lines_result
  short_words = split_lines_result[
      SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
  character_count = split_lines_result.tag_character_count

  # pylint: disable=expression-not-assigned
  (character_count
   | 'pair_with_key' >> beam.Map(lambda x: ('chars_temp_key', x))
   | beam.GroupByKey()
   | 'count chars' >> beam.Map(lambda (_, counts): sum(counts))
   | 'write chars' >> WriteToText(known_args.output + '-chars'))

  # pylint: disable=expression-not-assigned
  (short_words
   | 'count short words' >> CountWords()
   | 'write short words' >> WriteToText(
       known_args.output + '-short-words'))

  # pylint: disable=expression-not-assigned
  (words
   | 'count words' >> CountWords()
   | 'write words' >> WriteToText(known_args.output + '-words'))

  return p.run()
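SplitLinesToWordsFn itself is not included in this snippet. As a rough sketch only, a DoFn with tagged outputs in a recent SDK emits its extra outputs via beam.pvalue.TaggedOutput; the tag strings below are inferred from the attribute access (tag_character_count) and the constants used above, and the "short word" cutoff is an assumption.

import apache_beam as beam
from apache_beam import pvalue


class SplitLinesToWordsFn(beam.DoFn):
  """Sketch of a DoFn with two tagged outputs plus a main output of words."""

  SIDE_OUTPUT_TAG_SHORT_WORDS = 'tag_short_words'
  SIDE_OUTPUT_TAG_CHARACTER_COUNT = 'tag_character_count'

  def process(self, element):
    # One per-line character count goes to the character-count output.
    yield pvalue.TaggedOutput(
        self.SIDE_OUTPUT_TAG_CHARACTER_COUNT, len(element))
    for word in element.split():
      if len(word) <= 3:
        # Short words go to their own tagged output.
        yield pvalue.TaggedOutput(self.SIDE_OUTPUT_TAG_SHORT_WORDS, word)
      else:
        # Longer words go to the main output.
        yield word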