def test_with_requirements_file_and_cache(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).requirements_file = os.path.join( source_dir, dependency.REQUIREMENTS_FILE) options.view_as(SetupOptions).requirements_cache = os.path.join( tempfile.gettempdir(), 'alternative-cache-dir') self.create_temp_file( os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing') self.assertEqual( sorted([dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt']), sorted( dependency.stage_job_resources( options, populate_requirements_cache=self. populate_requirements_cache))) self.assertTrue( os.path.isfile( os.path.join(staging_dir, dependency.REQUIREMENTS_FILE))) self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt'))) self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
def test_with_requirements_file(self): try: staging_dir = tempfile.mkdtemp() requirements_cache_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).requirements_cache = requirements_cache_dir options.view_as(SetupOptions).requirements_file = os.path.join( source_dir, dependency.REQUIREMENTS_FILE) self.create_temp_file( os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing') self.assertEqual( sorted([dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt']), sorted(dependency.stage_job_resources( options, populate_requirements_cache=self.populate_requirements_cache))) self.assertTrue( os.path.isfile( os.path.join(staging_dir, dependency.REQUIREMENTS_FILE))) self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt'))) self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt'))) finally: shutil.rmtree(staging_dir) shutil.rmtree(requirements_cache_dir) shutil.rmtree(source_dir)
def test_with_setup_file(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused') options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).setup_file = os.path.join( source_dir, 'setup.py') self.assertEqual( [dependency.WORKFLOW_TARBALL_FILE], dependency.stage_job_resources( options, # We replace the build setup command because a realistic one would # require the setuptools package to be installed. Note that we can't # use "touch" here to create the expected output tarball file, since # touch is not available on Windows, so we invoke python to produce # equivalent behavior. build_setup_args=[ 'python', '-c', 'open(__import__("sys").argv[1], "a")', os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE) ], temp_dir=source_dir)) self.assertTrue( os.path.isfile( os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
def test_with_setup_file(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() self.create_temp_file( os.path.join(source_dir, 'setup.py'), 'notused') options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).setup_file = os.path.join( source_dir, 'setup.py') self.assertEqual( [dependency.WORKFLOW_TARBALL_FILE], dependency.stage_job_resources( options, # We replace the build setup command because a realistic one would # require the setuptools package to be installed. Note that we can't # use "touch" here to create the expected output tarball file, since # touch is not available on Windows, so we invoke python to produce # equivalent behavior. build_setup_args=[ 'python', '-c', 'open(__import__("sys").argv[1], "a")', os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)], temp_dir=source_dir)) self.assertTrue( os.path.isfile( os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
def run(): parser = argparse.ArgumentParser() parser.add_argument('--run_locally', dest='run_locally', default='', help='Run data subset and do not save.') known_args, pipeline_args = parser.parse_known_args() pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) delete_from_datastore('dancedeets-hrd', gcloud_options, known_args.run_locally)
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--kind', dest='kind', required=True, help='Datastore Kind') parser.add_argument('--namespace', dest='namespace', help='Datastore Namespace') parser.add_argument('--ancestor', dest='ancestor', default='root', help='The ancestor key name for all entities.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--read_only', action='store_true', help='Read an existing dataset, do not write first') parser.add_argument( '--num_shards', dest='num_shards', type=int, # If the system should choose automatically. default=0, help='Number of output shards') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) # Write to Datastore if `read_only` options is not specified. if not known_args.read_only: write_to_datastore(gcloud_options.project, known_args, pipeline_options) # Read entities from Datastore. result = read_from_datastore(gcloud_options.project, known_args, pipeline_options) empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def test_no_main_session(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir options.view_as(SetupOptions).save_main_session = False self.update_options(options) self.assertEqual([], dependency.stage_job_resources(options))
def examples_wordcount_minimal(renames): """MinimalWordCount example snippets.""" import re import apache_beam as beam from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import StandardOptions from apache_beam.utils.pipeline_options import PipelineOptions # [START examples_wordcount_minimal_options] options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = 'my-project-id' google_cloud_options.job_name = 'myjob' google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging' google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp' options.view_as(StandardOptions).runner = 'DataflowRunner' # [END examples_wordcount_minimal_options] # Run it locally for testing. options = PipelineOptions() # [START examples_wordcount_minimal_create] p = beam.Pipeline(options=options) # [END examples_wordcount_minimal_create] ( # [START examples_wordcount_minimal_read] p | beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt') # [END examples_wordcount_minimal_read] # [START examples_wordcount_minimal_pardo] | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) # [END examples_wordcount_minimal_pardo] # [START examples_wordcount_minimal_count] | beam.combiners.Count.PerElement() # [END examples_wordcount_minimal_count] # [START examples_wordcount_minimal_map] | beam.Map(lambda (word, count): '%s: %s' % (word, count)) # [END examples_wordcount_minimal_map] # [START examples_wordcount_minimal_write] | beam.io.WriteToText('gs://my-bucket/counts.txt') # [END examples_wordcount_minimal_write] ) p.visit(SnippetUtils.RenameFiles(renames)) # [START examples_wordcount_minimal_run] result = p.run() # [END examples_wordcount_minimal_run] result.wait_until_finish()
def examples_wordcount_minimal(renames): """MinimalWordCount example snippets.""" import re import apache_beam as beam from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import StandardOptions from apache_beam.utils.pipeline_options import PipelineOptions # [START examples_wordcount_minimal_options] options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = 'my-project-id' google_cloud_options.job_name = 'myjob' google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging' google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp' options.view_as(StandardOptions).runner = 'DataflowRunner' # [END examples_wordcount_minimal_options] # Run it locally for testing. options = PipelineOptions() # [START examples_wordcount_minimal_create] p = beam.Pipeline(options=options) # [END examples_wordcount_minimal_create] ( # [START examples_wordcount_minimal_read] p | beam.io.ReadFromText( 'gs://dataflow-samples/shakespeare/kinglear.txt') # [END examples_wordcount_minimal_read] # [START examples_wordcount_minimal_pardo] | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) # [END examples_wordcount_minimal_pardo] # [START examples_wordcount_minimal_count] | beam.combiners.Count.PerElement() # [END examples_wordcount_minimal_count] # [START examples_wordcount_minimal_map] | beam.Map(lambda (word, count): '%s: %s' % (word, count)) # [END examples_wordcount_minimal_map] # [START examples_wordcount_minimal_write] | beam.io.WriteToText('gs://my-bucket/counts.txt') # [END examples_wordcount_minimal_write] ) p.visit(SnippetUtils.RenameFiles(renames)) # [START examples_wordcount_minimal_run] result = p.run() # [END examples_wordcount_minimal_run] result.wait_until_finish()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" class WordcountOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_value_provider_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_value_provider_argument( '--output', dest='output', required=True, help='Output file to write results to.') pipeline_options = PipelineOptions(argv) wordcount_options = pipeline_options.view_as(WordcountOptions) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(wordcount_options.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(wordcount_options.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def test_get_all_options(self): for case in PipelineOptionsTest.TEST_CASES: options = PipelineOptions(flags=case['flags']) self.assertDictContainsSubset(case['expected'], options.get_all_options()) self.assertEqual(options.view_as( PipelineOptionsTest.MockOptions).mock_flag, case['expected']['mock_flag']) self.assertEqual(options.view_as( PipelineOptionsTest.MockOptions).mock_option, case['expected']['mock_option'])
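The PipelineOptions tests in this listing rely on a MockOptions view (and a TEST_CASES table) that are not shown here; a minimal sketch consistent with how the tests use it might look like the following, though the exact fixture in the source may differ.

from apache_beam.utils.pipeline_options import PipelineOptions

class MockOptions(PipelineOptions):
  @classmethod
  def _add_argparse_args(cls, parser):
    # Flags exercised by test_get_all_options, test_override_options and
    # test_option_with_space; names inferred from the assertions in the tests.
    parser.add_argument('--mock_flag', action='store_true')
    parser.add_argument('--mock_option')
    parser.add_argument('--option with space')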
def test_no_main_session(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir options.view_as(SetupOptions).save_main_session = False self.update_options(options) self.assertEqual( [], dependency.stage_job_resources(options))
def __init_pipeline(self): pipeline_args = self.config['pipeline_args'] options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = pipeline_args['project'] google_cloud_options.job_name = pipeline_args['job_name'] google_cloud_options.staging_location = pipeline_args['staging_location'] google_cloud_options.temp_location = pipeline_args['temp_location'] options.view_as(StandardOptions).runner = pipeline_args['runner'] options.view_as(SetupOptions).setup_file = pipeline_args['setup_file'] options.view_as(SetupOptions).save_main_session = True return beam.Pipeline(options=options)
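A minimal sketch of the config['pipeline_args'] mapping that __init_pipeline above expects; every concrete value is a placeholder, not taken from the source.

config = {
    'pipeline_args': {
        'project': 'my-project-id',                    # GCP project to run under
        'job_name': 'my-beam-job',                     # Dataflow job name
        'staging_location': 'gs://my-bucket/staging',  # where binaries are staged
        'temp_location': 'gs://my-bucket/temp',        # scratch space for the job
        'runner': 'DataflowRunner',                    # or 'DirectRunner' for local runs
        'setup_file': './setup.py',                    # staged so workers can install deps
    }
}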
def test_sdk_location_gcs(self): staging_dir = tempfile.mkdtemp() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' self.override_file_copy(sdk_location, staging_dir) options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], dependency.stage_job_resources(options))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--kind', dest='kind', required=True, help='Datastore Kind') parser.add_argument('--namespace', dest='namespace', help='Datastore Namespace') parser.add_argument('--ancestor', dest='ancestor', default='root', help='The ancestor key name for all entities.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--read_only', action='store_true', help='Read an existing dataset, do not write first') parser.add_argument('--num_shards', dest='num_shards', type=int, # If the system should choose automatically. default=0, help='Number of output shards') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) # Write to Datastore if `read_only` options is not specified. if not known_args.read_only: write_to_datastore(gcloud_options.project, known_args, pipeline_options) # Read entities from Datastore. result = read_from_datastore(gcloud_options.project, known_args, pipeline_options) empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def test_with_extra_packages(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing') self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing') self.create_temp_file(os.path.join(source_dir, 'xyz2.tar'), 'nothing') self.create_temp_file(os.path.join(source_dir, 'whl.whl'), 'nothing') self.create_temp_file( os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing') options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).extra_packages = [ os.path.join(source_dir, 'abc.tar.gz'), os.path.join(source_dir, 'xyz.tar.gz'), os.path.join(source_dir, 'xyz2.tar'), os.path.join(source_dir, 'whl.whl'), 'gs://my-gcs-bucket/gcs.tar.gz' ] gcs_copied_files = [] def file_copy(from_path, to_path): if from_path.startswith('gs://'): gcs_copied_files.append(from_path) _, from_name = os.path.split(from_path) self.create_temp_file(os.path.join(to_path, from_name), 'nothing') logging.info('Fake copied GCS file: %s to %s', from_path, to_path) elif to_path.startswith('gs://'): logging.info('Faking file_copy(%s, %s)', from_path, to_path) else: shutil.copyfile(from_path, to_path) dependency._dependency_file_copy = file_copy self.assertEqual([ 'abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz', dependency.EXTRA_PACKAGES_FILE ], dependency.stage_job_resources(options)) with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f: self.assertEqual([ 'abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n', 'gcs.tar.gz\n' ], f.readlines()) self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
def test_with_main_session(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir options.view_as(SetupOptions).save_main_session = True self.update_options(options) self.assertEqual([names.PICKLED_MAIN_SESSION_FILE], dependency.stage_job_resources(options)) self.assertTrue( os.path.isfile( os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
def test_requirements_file_not_present(self): staging_dir = tempfile.mkdtemp() with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).requirements_file = 'nosuchfile' dependency.stage_job_resources( options, populate_requirements_cache=self.populate_requirements_cache) self.assertEqual( cm.exception.message, 'The file %s cannot be found. It was specified in the ' '--requirements_file command line option.' % 'nosuchfile')
def test_sdk_location_gcs(self): staging_dir = tempfile.mkdtemp() sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz' self.override_file_copy(sdk_location, staging_dir) options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location self.assertEqual( [names.DATAFLOW_SDK_TARBALL_FILE], dependency.stage_job_resources(options))
def test_with_main_session(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir options.view_as(SetupOptions).save_main_session = True self.update_options(options) self.assertEqual( [names.PICKLED_MAIN_SESSION_FILE], dependency.stage_job_resources(options)) self.assertTrue( os.path.isfile( os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
def test_with_extra_packages_missing_files(self): staging_dir = tempfile.mkdtemp() with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz'] dependency.stage_job_resources(options) self.assertEqual( cm.exception.message, 'The file %s cannot be found. It was specified in the ' '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def pipeline_options_remote(argv): """Creating a Pipeline using a PipelineOptions object for remote execution.""" from apache_beam import Pipeline from apache_beam.utils.pipeline_options import PipelineOptions # [START pipeline_options_create] options = PipelineOptions(flags=argv) # [END pipeline_options_create] # [START pipeline_options_define_custom] class MyOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--input') parser.add_argument('--output') # [END pipeline_options_define_custom] from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import StandardOptions # [START pipeline_options_dataflow_service] # Create and set your PipelineOptions. options = PipelineOptions(flags=argv) # For Cloud execution, set the Cloud Platform project, job_name, # staging location, temp_location and specify DataflowRunner. google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = 'my-project-id' google_cloud_options.job_name = 'myjob' google_cloud_options.staging_location = 'gs://my-bucket/binaries' google_cloud_options.temp_location = 'gs://my-bucket/temp' options.view_as(StandardOptions).runner = 'DataflowRunner' # Create the Pipeline with the specified options. p = Pipeline(options=options) # [END pipeline_options_dataflow_service] my_options = options.view_as(MyOptions) my_input = my_options.input my_output = my_options.output p = TestPipeline() # Use TestPipeline for testing. lines = p | beam.io.ReadFromText(my_input) lines | beam.io.WriteToText(my_output) p.run()
def test_setup_file_not_present(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).setup_file = 'nosuchfile' with self.assertRaises(RuntimeError) as cm: dependency.stage_job_resources(options) self.assertEqual( cm.exception.message, 'The file %s cannot be found. It was specified in the ' '--setup_file command line option.' % 'nosuchfile')
def test_sdk_location_local_not_present(self): staging_dir = tempfile.mkdtemp() sdk_location = 'nosuchdir' with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location dependency.stage_job_resources(options) self.assertEqual( 'The file "%s" cannot be found. Its ' 'location was specified by the --sdk_location command-line option.' % sdk_location, cm.exception.message)
def pipeline_options_remote(argv): """Creating a Pipeline using a PipelineOptions object for remote execution.""" from apache_beam import Pipeline from apache_beam.utils.pipeline_options import PipelineOptions # [START pipeline_options_create] options = PipelineOptions(flags=argv) # [END pipeline_options_create] # [START pipeline_options_define_custom] class MyOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--input') parser.add_argument('--output') # [END pipeline_options_define_custom] from apache_beam.utils.pipeline_options import GoogleCloudOptions from apache_beam.utils.pipeline_options import StandardOptions # [START pipeline_options_dataflow_service] # Create and set your PipelineOptions. options = PipelineOptions(flags=argv) # For Cloud execution, set the Cloud Platform project, job_name, # staging location, temp_location and specify DataflowRunner. google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = 'my-project-id' google_cloud_options.job_name = 'myjob' google_cloud_options.staging_location = 'gs://my-bucket/binaries' google_cloud_options.temp_location = 'gs://my-bucket/temp' options.view_as(StandardOptions).runner = 'DataflowRunner' # Create the Pipeline with the specified options. p = Pipeline(options=options) # [END pipeline_options_dataflow_service] my_options = options.view_as(MyOptions) my_input = my_options.input my_output = my_options.output p = TestPipeline() # Use TestPipeline for testing. lines = p | beam.io.ReadFromText(my_input) lines | beam.io.WriteToText(my_output) p.run()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" class WordcountOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_value_provider_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_value_provider_argument( '--output', dest='output', required=True, help='Output file to write results to.') pipeline_options = PipelineOptions(argv) wordcount_options = pipeline_options.view_as(WordcountOptions) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(wordcount_options.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(wordcount_options.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish() empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def test_with_extra_packages_invalid_file_name(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() self.create_temp_file( os.path.join(source_dir, 'abc.tgz'), 'nothing') with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).extra_packages = [ os.path.join(source_dir, 'abc.tgz')] dependency.stage_job_resources(options) self.assertEqual( cm.exception.message, 'The --extra_package option expects a full path ending with ".tar" or ' '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
def model_pcollection(argv): """Creating a PCollection from data in local memory.""" from apache_beam.utils.pipeline_options import PipelineOptions class MyOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') pipeline_options = PipelineOptions(argv) my_options = pipeline_options.view_as(MyOptions) # [START model_pcollection] p = beam.Pipeline(options=pipeline_options) (p | beam.Create([ 'To be, or not to be: that is the question: ', 'Whether \'tis nobler in the mind to suffer ', 'The slings and arrows of outrageous fortune, ', 'Or to take arms against a sea of troubles, ' ]) | beam.io.WriteToText(my_options.output)) result = p.run() # [END model_pcollection] result.wait_until_finish()
def run(argv=None): """Runs the Wikipedia top edits pipeline. Args: argv: Pipeline options as a list of arguments. """ parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/wikipedia_edits/*.json', help= 'Input specified as a GCS path containing a BigQuery table exported ' 'as json.') parser.add_argument('--output', required=True, help='Output file to write results to.') parser.add_argument('--sampling_threshold', type=float, default=0.1, help='Fraction of entries used for session tracking') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | ComputeTopSessions(known_args.sampling_threshold) | WriteToText(known_args.output)) p.run()
def test_with_extra_packages_missing_files(self): staging_dir = tempfile.mkdtemp() with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).extra_packages = [ 'nosuchfile.tar.gz' ] dependency.stage_job_resources(options) self.assertEqual( cm.exception.message, 'The file %s cannot be found. It was specified in the ' '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def run(argv=None): """Main entry point; defines and runs the tfidf pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--uris', required=True, help='URIs to process.') parser.add_argument('--output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read documents specified by the uris command line option. pcoll = read_documents(p, glob.glob(known_args.uris)) # Compute TF-IDF information for each word. output = pcoll | TfIdf() # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Execute the pipeline and wait until it is completed. p.run().wait_until_finish()
def test_default_resources(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) self.assertEqual([], dependency.stage_job_resources(options))
def model_pcollection(argv): """Creating a PCollection from data in local memory.""" from apache_beam.utils.pipeline_options import PipelineOptions class MyOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') pipeline_options = PipelineOptions(argv) my_options = pipeline_options.view_as(MyOptions) # [START model_pcollection] p = beam.Pipeline(options=pipeline_options) (p | beam.Create([ 'To be, or not to be: that is the question: ', 'Whether \'tis nobler in the mind to suffer ', 'The slings and arrows of outrageous fortune, ', 'Or to take arms against a sea of troubles, ']) | beam.io.WriteToText(my_options.output)) result = p.run() # [END model_pcollection] result.wait_until_finish()
def run(argv=None): """Runs the Wikipedia top edits pipeline. Args: argv: Pipeline options as a list of arguments. """ parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/wikipedia_edits/*.json', help='Input specified as a GCS path containing a BigQuery table exported ' 'as json.') parser.add_argument('--output', required=True, help='Output file to write results to.') parser.add_argument('--sampling_threshold', type=float, default=0.1, help='Fraction of entries used for session tracking') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | ComputeTopSessions(known_args.sampling_threshold) | WriteToText(known_args.output)) p.run()
def test_sdk_location_default(self): staging_dir = tempfile.mkdtemp() expected_from_url = 'pypi' expected_from_path = self.override_pypi_download( expected_from_url, staging_dir) self.override_file_copy(expected_from_path, staging_dir) options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = 'default' self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], dependency.stage_job_resources( options, file_copy=dependency._dependency_file_copy))
def test_with_extra_packages_invalid_file_name(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing') with self.assertRaises(RuntimeError) as cm: options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).extra_packages = [ os.path.join(source_dir, 'abc.tgz') ] dependency.stage_job_resources(options) self.assertEqual( cm.exception.message, 'The --extra_package option expects a full path ending with ".tar" or ' '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
def test_with_extra_packages(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() self.create_temp_file( os.path.join(source_dir, 'abc.tar.gz'), 'nothing') self.create_temp_file( os.path.join(source_dir, 'xyz.tar.gz'), 'nothing') self.create_temp_file( os.path.join(source_dir, 'xyz2.tar'), 'nothing') self.create_temp_file( os.path.join(source_dir, 'whl.whl'), 'nothing') self.create_temp_file( os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing') options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).extra_packages = [ os.path.join(source_dir, 'abc.tar.gz'), os.path.join(source_dir, 'xyz.tar.gz'), os.path.join(source_dir, 'xyz2.tar'), os.path.join(source_dir, 'whl.whl'), 'gs://my-gcs-bucket/gcs.tar.gz'] gcs_copied_files = [] def file_copy(from_path, to_path): if from_path.startswith('gs://'): gcs_copied_files.append(from_path) _, from_name = os.path.split(from_path) self.create_temp_file(os.path.join(to_path, from_name), 'nothing') logging.info('Fake copied GCS file: %s to %s', from_path, to_path) elif to_path.startswith('gs://'): logging.info('Faking file_copy(%s, %s)', from_path, to_path) else: shutil.copyfile(from_path, to_path) dependency._dependency_file_copy = file_copy self.assertEqual( ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz', dependency.EXTRA_PACKAGES_FILE], dependency.stage_job_resources(options)) with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f: self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n', 'gcs.tar.gz\n'], f.readlines()) self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
def run(argv=None): """Run the workflow.""" # Parse the input options parser = argparse.ArgumentParser() parser.add_argument('--runner', dest='runner', default='DirectRunner') parser.add_argument('--input_file', dest='input_file', default='csv_sample.csv') parser.add_argument('--input_schema', dest='input_schema', default='') parser.add_argument('--input_delimiter', dest='input_delimiter', default=',') parser.add_argument('--output_project', dest='output_project', default='mam-cloud') parser.add_argument('--output_dataset', dest='output_dataset', default='dummy') parser.add_argument('--output_table', dest='output_table', default='df_from_csv') parser.add_argument('--cat_read', dest='cat_read', default='10000', help='Choose big enough to read at least 2 lines of the csv') parser.add_argument('--skip_row', dest='skip_row', default=1, help='We generally assume that the first row in the csv contains column names') known_args, pipeline_args = parser.parse_known_args(argv) # Set the options based on the runner if known_args.runner == 'DataflowRunner': pipeline_args.extend([ '--runner=DataflowRunner', '--staging_location=gs://bq-connector-pii/staging', '--temp_location=gs://bq-connector-pii/temp', '--job_name=csv2bq-{}'.format(str(datetime.datetime.now()).replace("-","").replace(".","").replace(":","").replace(" ","")), ]) # Create the pipeline pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Prepares schema as a side input unless there is one given. There is not head # in GCS so we need to guess a number big enough to cover 2 lines at least: 10000 fields = known_args.input_schema if fields == '': cmd_head = "gsutil cat -r 0-{} ".format(known_args.cat_read) if known_args.input_file[:5] == "gs://" else "head -n 2 " cmd_head = cmd_head + known_args.input_file row1_n_row2 = subprocess.check_output(cmd_head.split(" ")).split("\n") fields = row1_n_row2[0] row2 = row1_n_row2[1] si_fields, table_schema = SchemaSideInput(fields, row2).parseSchema() # Reads csv file. We skip the first line as we give the schema in input csv_lines = p | 'Read CSV' >> ReadFromText(known_args.input_file, skip_header_lines=known_args.skip_row) # Converts to TableRow bq_rows = csv_lines | 'Convert to BQ rows' >> beam.ParDo(ConvertToTableRowFn(), si_fields).with_output_types(unicode) # Writes to BigQuery bq_rows | 'write' >> beam.io.Write( beam.io.BigQuerySink( "{}:{}.{}".format(known_args.output_project, known_args.output_dataset, known_args.output_table), schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)) # Run the pipeline (all operations are deferred until run() is called). p.run()
def test_sdk_location_default(self): staging_dir = tempfile.mkdtemp() expected_from_url = 'pypi' expected_from_path = self.override_pypi_download( expected_from_url, staging_dir) self.override_file_copy(expected_from_path, staging_dir) options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = 'default' self.assertEqual( [names.DATAFLOW_SDK_TARBALL_FILE], dependency.stage_job_resources( options, file_copy=dependency._dependency_file_copy))
def pipeline_monitoring(renames): """Using monitoring interface snippets.""" import re import apache_beam as beam from apache_beam.utils.pipeline_options import PipelineOptions class WordCountOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--input', help='Input for the pipeline', default='gs://my-bucket/input') parser.add_argument('--output', help='output for the pipeline', default='gs://my-bucket/output') class ExtractWordsFn(beam.DoFn): def process(self, element): words = re.findall(r'[A-Za-z\']+', element) for word in words: yield word class FormatCountsFn(beam.DoFn): def process(self, element): word, count = element yield '%s: %s' % (word, count) # [START pipeline_monitoring_composite] # The CountWords Composite Transform inside the WordCount pipeline. class CountWords(beam.PTransform): def expand(self, pcoll): return (pcoll # Convert lines of text into individual words. | 'ExtractWords' >> beam.ParDo(ExtractWordsFn()) # Count the number of times each word occurs. | beam.combiners.Count.PerElement() # Format each word and count into a printable string. | 'FormatCounts' >> beam.ParDo(FormatCountsFn())) # [END pipeline_monitoring_composite] pipeline_options = PipelineOptions() options = pipeline_options.view_as(WordCountOptions) p = TestPipeline() # Use TestPipeline for testing. # [START pipeline_monitoring_execution] (p # Read the lines of the input text. | 'ReadLines' >> beam.io.ReadFromText(options.input) # Count the words. | CountWords() # Write the formatted word counts to output. | 'WriteCounts' >> beam.io.WriteToText(options.output)) # [END pipeline_monitoring_execution] p.visit(SnippetUtils.RenameFiles(renames)) p.run()
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() # The default maps to two large Google Cloud Storage files (each ~12GB) # holding two subsequent day's worth (roughly) of data. parser.add_argument('--input', dest='input', default='gs://dataflow-samples/game/gaming_data*.csv', help='Path to the data file(s) containing game data.') parser.add_argument('--dataset', dest='dataset', required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument( '--table_name', dest='table_name', default='hourly_team_score', help='The BigQuery table name. Should not already exist.') parser.add_argument( '--window_duration', type=int, default=60, help='Numeric value of fixed window duration, in minutes') parser.add_argument('--start_min', dest='start_min', default='1970-01-01-00-00', help='String representation of the first minute after ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'prior to that minute won\'t be included in the ' 'sums.') parser.add_argument('--stop_min', dest='stop_min', default='2100-01-01-00-00', help='String representation of the first minute for ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'after to that minute won\'t be included in the ' 'sums.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) p = beam.Pipeline(options=pipeline_options) pipeline_options.view_as(SetupOptions).save_main_session = True (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | HourlyTeamScore(known_args.start_min, known_args.stop_min, known_args.window_duration) | WriteWindowedToBigQuery(known_args.table_name, known_args.dataset, configure_bigquery_write())) result = p.run() result.wait_until_finish()
def test_default_resources(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) self.assertEqual( [], dependency.stage_job_resources(options))
def test_sdk_location_local(self): staging_dir = tempfile.mkdtemp() sdk_location = tempfile.mkdtemp() self.create_temp_file( os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE), 'contents') options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE], dependency.stage_job_resources(options)) tarball_path = os.path.join(staging_dir, names.DATAFLOW_SDK_TARBALL_FILE) with open(tarball_path) as f: self.assertEqual(f.read(), 'contents')
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DirectRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=SET_YOUR_PROJECT_ID_HERE', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--job_name=your-wordcount-job', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Actually run the pipeline (all operations above are deferred). p.run().wait_until_finish()
def test_setup_file_not_named_setup_dot_py(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).setup_file = (os.path.join( source_dir, 'xyz-setup.py')) self.create_temp_file(os.path.join(source_dir, 'xyz-setup.py'), 'notused') with self.assertRaises(RuntimeError) as cm: dependency.stage_job_resources(options) self.assertTrue( cm.exception.message.startswith( 'The --setup_file option expects the full path to a file named ' 'setup.py instead of '))
def test_override_options(self): base_flags = ['--num_workers', '5'] options = PipelineOptions(base_flags) self.assertEqual(options.get_all_options()['num_workers'], 5) self.assertEqual(options.get_all_options()['mock_flag'], False) options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True self.assertEqual(options.get_all_options()['num_workers'], 5) self.assertTrue(options.get_all_options()['mock_flag'])
def test_setup_file_not_named_setup_dot_py(self): staging_dir = tempfile.mkdtemp() source_dir = tempfile.mkdtemp() options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).setup_file = ( os.path.join(source_dir, 'xyz-setup.py')) self.create_temp_file( os.path.join(source_dir, 'xyz-setup.py'), 'notused') with self.assertRaises(RuntimeError) as cm: dependency.stage_job_resources(options) self.assertTrue( cm.exception.message.startswith( 'The --setup_file option expects the full path to a file named ' 'setup.py instead of '))
def test_option_with_space(self): options = PipelineOptions(flags=['--option with space= value with space']) self.assertEqual( getattr(options.view_as(PipelineOptionsTest.MockOptions), 'option with space'), ' value with space') options_from_dict = PipelineOptions.from_dictionary( options.get_all_options()) self.assertEqual( getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions), 'option with space'), ' value with space')
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() # The default maps to two large Google Cloud Storage files (each ~12GB) # holding two subsequent day's worth (roughly) of data. parser.add_argument('--input', dest='input', default='gs://dataflow-samples/game/gaming_data*.csv', help='Path to the data file(s) containing game data.') parser.add_argument('--dataset', dest='dataset', required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument('--table_name', dest='table_name', default='hourly_team_score', help='The BigQuery table name. Should not already exist.') parser.add_argument('--window_duration', type=int, default=60, help='Numeric value of fixed window duration, in minutes') parser.add_argument('--start_min', dest='start_min', default='1970-01-01-00-00', help='String representation of the first minute after ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'prior to that minute won\'t be included in the ' 'sums.') parser.add_argument('--stop_min', dest='stop_min', default='2100-01-01-00-00', help='String representation of the first minute for ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'after to that minute won\'t be included in the ' 'sums.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) p = beam.Pipeline(options=pipeline_options) pipeline_options.view_as(SetupOptions).save_main_session = True (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | HourlyTeamScore( known_args.start_min, known_args.stop_min, known_args.window_duration) | WriteWindowedToBigQuery( known_args.table_name, known_args.dataset, configure_bigquery_write())) result = p.run() result.wait_until_finish()
def run(known_args, pipeline_args): network = MinimalNetwork() pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(SetupOptions).extra_packages = [ml.sdk_location] pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' pipeline_options.view_as(GoogleCloudOptions).staging_location = \ os.path.join(known_args.staging) pipeline_options.view_as(GoogleCloudOptions).temp_location = os.path.join( known_args.staging, 'tmp') pipeline_options.view_as(GoogleCloudOptions).job_name = str( network).replace('_', '').lower() beam.coders.registry.register_coder(tf.train.Example, ExampleProtoCoder) p = beam.Pipeline(options=pipeline_options) # Read Example data def parse_example(example): #TODO: add actual implementation yield example network_input = (p | 'readExamples' >> beam.io.ReadFromText(known_args.input) | 'processExamples' >> beam.FlatMap(lambda example: parse_example(example))) examples = network_input | 'encodeExamples' >> beam.Map( lambda raw_input: network.preprocess(raw_input)) # # Write the serialized compressed protocol buffers to Cloud Storage. _ = examples | beam.io.Write( 'writeExamples', tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(known_args.output, 'examples'), compression_type=fileio.CompressionTypes.GZIP, coder=ExampleProtoCoder(), file_name_suffix='.tfrecord.gz')) # # Actually run the pipeline (all operations above are deferred). p.run()
def test_from_dictionary(self): for case in PipelineOptionsTest.TEST_CASES: options = PipelineOptions(flags=case['flags']) all_options_dict = options.get_all_options() options_from_dict = PipelineOptions.from_dictionary(all_options_dict) self.assertEqual(options_from_dict.view_as( PipelineOptionsTest.MockOptions).mock_flag, case['expected']['mock_flag']) self.assertEqual(options.view_as( PipelineOptionsTest.MockOptions).mock_option, case['expected']['mock_option'])
def test_no_temp_location(self): staging_dir = tempfile.mkdtemp() options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.staging_location = staging_dir self.update_options(options) google_cloud_options.temp_location = None with self.assertRaises(RuntimeError) as cm: dependency.stage_job_resources(options) self.assertEqual('The --temp_location option must be specified.', cm.exception.message)
def test_sdk_location_local(self): staging_dir = tempfile.mkdtemp() sdk_location = tempfile.mkdtemp() self.create_temp_file( os.path.join( sdk_location, names.DATAFLOW_SDK_TARBALL_FILE), 'contents') options = PipelineOptions() options.view_as(GoogleCloudOptions).staging_location = staging_dir self.update_options(options) options.view_as(SetupOptions).sdk_location = sdk_location self.assertEqual( [names.DATAFLOW_SDK_TARBALL_FILE], dependency.stage_job_resources(options)) tarball_path = os.path.join( staging_dir, names.DATAFLOW_SDK_TARBALL_FILE) with open(tarball_path) as f: self.assertEqual(f.read(), 'contents')
def run(argv=None): # pylint: disable=expression-not-assigned parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input file pattern to process.') parser.add_argument('--output', required=True, help='Output file pattern to write results to.') parser.add_argument('--checksum_output', help='Checksum output file pattern.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | ReadFromText(known_args.input, coder=beam.coders.BytesCoder()) # Split each record into a key (first 10 bytes) and a value, then shuffle by key. output = (lines | 'split' >> beam.Map( lambda x: (x[:10], x[10:99])) .with_output_types(beam.typehints.KV[str, str]) | 'group' >> beam.GroupByKey() | 'format' >> beam.FlatMap( lambda (key, vals): ['%s%s' % (key, val) for val in vals])) # Write the output using a "Write" transform that has side effects. output | WriteToText(known_args.output) # Optionally write the input and output checksums. if known_args.checksum_output: input_csum = (lines | 'input-csum' >> beam.Map(crc32line) | 'combine-input-csum' >> beam.CombineGlobally(sum) | 'hex-format' >> beam.Map(lambda x: '%x' % x)) input_csum | 'write-input-csum' >> WriteToText( known_args.checksum_output + '-input') output_csum = (output | 'output-csum' >> beam.Map(crc32line) | 'combine-output-csum' >> beam.CombineGlobally(sum) | 'hex-format-output' >> beam.Map(lambda x: '%x' % x)) output_csum | 'write-output-csum' >> WriteToText( known_args.checksum_output + '-output') # Actually run the pipeline (all operations above are deferred). return p.run()
def run(argv=None): """Runs the workflow counting the long words and short words separately.""" parser = argparse.ArgumentParser() parser.add_argument('--input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', required=True, help='Output prefix for files to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | ReadFromText(known_args.input) # with_outputs allows accessing the side outputs of a DoFn. split_lines_result = (lines | beam.ParDo(SplitLinesToWordsFn()).with_outputs( SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS, SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT, main='words')) # split_lines_result is an object of type DoOutputsTuple. It supports # accessing result in alternative ways. words, _, _ = split_lines_result short_words = split_lines_result[ SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS] character_count = split_lines_result.tag_character_count # pylint: disable=expression-not-assigned (character_count | 'pair_with_key' >> beam.Map(lambda x: ('chars_temp_key', x)) | beam.GroupByKey() | 'count chars' >> beam.Map(lambda (_, counts): sum(counts)) | 'write chars' >> WriteToText(known_args.output + '-chars')) # pylint: disable=expression-not-assigned (short_words | 'count short words' >> CountWords() | 'write short words' >> WriteToText( known_args.output + '-short-words')) # pylint: disable=expression-not-assigned (words | 'count words' >> CountWords() | 'write words' >> WriteToText(known_args.output + '-words')) return p.run()