def test_with_setup_file(self):
    """Stage a pipeline whose --setup_file points at a file named setup.py.

    The real build command is swapped for a stand-in that merely creates the
    tarball file, so the test checks that the workflow tarball is reported and
    ends up in the staging directory.
    """
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    setup_path = os.path.join(source_dir, 'setup.py')
    self.create_temp_file(setup_path, 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = setup_path

    # We replace the build setup command because a realistic one would
    # require the setuptools package to be installed. Note that we can't
    # use "touch" here to create the expected output tarball file, since
    # touch is not available on Windows, so we invoke python to produce
    # equivalent behavior.
    fake_build_command = [
        'python', '-c', 'open(__import__("sys").argv[1], "a")',
        os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE),
    ]
    staged = dependency.stage_job_resources(
        options, build_setup_args=fake_build_command, temp_dir=source_dir)

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE, names.PICKLED_MAIN_SESSION_FILE],
        staged)
    staged_tarball = os.path.join(
        staging_dir, dependency.WORKFLOW_TARBALL_FILE)
    self.assertTrue(os.path.isfile(staged_tarball))
def test_with_setup_file(self):
    """Check that a setup.py based job stages the workflow tarball.

    A stand-in build command is used instead of a real setup.py build; it only
    creates the tarball file that staging expects to find in temp_dir.
    """
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    setup_file = os.path.join(source_dir, 'setup.py')
    self.create_temp_file(setup_file, 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = setup_file

    expected_resources = [
        dependency.WORKFLOW_TARBALL_FILE,
        names.PICKLED_MAIN_SESSION_FILE,
    ]
    # We replace the build setup command because a realistic one would
    # require the setuptools package to be installed. Note that we can't
    # use "touch" here to create the expected output tarball file, since
    # touch is not available on Windows, so we invoke python to produce
    # equivalent behavior.
    build_args = [
        'python', '-c', 'open(__import__("sys").argv[1], "a")',
        os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE),
    ]
    actual_resources = dependency.stage_job_resources(
        options, build_setup_args=build_args, temp_dir=source_dir)
    self.assertEqual(expected_resources, actual_resources)

    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
def examples_wordcount_minimal(renames):
    """MinimalWordCount example snippets.

    URL:
    https://cloud.google.com/dataflow/examples/wordcount-example#MinimalWordCount

    Args:
      renames: passed to SnippetUtils.RenameFiles, which is applied to the
        pipeline via p.visit() before the pipeline runs.
    """
    import re

    import google.cloud.dataflow as df

    from google.cloud.dataflow.utils.options import GoogleCloudOptions
    from google.cloud.dataflow.utils.options import StandardOptions
    from google.cloud.dataflow.utils.options import PipelineOptions

    # [START examples_wordcount_minimal_options]
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
    google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
    options.view_as(StandardOptions).runner = 'BlockingDataflowPipelineRunner'
    # [END examples_wordcount_minimal_options]

    # Run it locally for testing.
    options = PipelineOptions()

    # [START examples_wordcount_minimal_create]
    p = df.Pipeline(options=options)
    # [END examples_wordcount_minimal_create]

    (
        # [START examples_wordcount_minimal_read]
        p | df.io.Read(df.io.TextFileSource(
            'gs://dataflow-samples/shakespeare/kinglear.txt'))
        # [END examples_wordcount_minimal_read]

        # [START examples_wordcount_minimal_pardo]
        | df.FlatMap('ExtractWords', lambda x: re.findall(r'[A-Za-z\']+', x))
        # [END examples_wordcount_minimal_pardo]

        # [START examples_wordcount_minimal_count]
        | df.combiners.Count.PerElement()
        # [END examples_wordcount_minimal_count]

        # [START examples_wordcount_minimal_map]
        | df.Map(lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
        # [END examples_wordcount_minimal_map]

        # [START examples_wordcount_minimal_write]
        | df.io.Write(df.io.TextFileSink('gs://my-bucket/counts.txt'))
        # [END examples_wordcount_minimal_write]
    )

    p.visit(SnippetUtils.RenameFiles(renames))

    # [START examples_wordcount_minimal_run]
    p.run()
def test_no_main_session(self):
    """With save_main_session turned off, staging reports no resources."""
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = tempfile.mkdtemp()
    options.view_as(SetupOptions).save_main_session = False
    self.update_options(options)

    staged = dependency.stage_job_resources(options)
    self.assertEqual([], staged)
def test_get_all_options(self):
    """Check parsed flags against every entry of TEST_CASES.

    For each case, every expected key/value must be present in
    get_all_options(), and mock_flag / mock_option must match when read
    through the MockOptions view.
    """
    for case in PipelineOptionsTest.TEST_CASES:
        expected = case['expected']
        options = PipelineOptions(flags=case['flags'])

        # Explicit subset check: assertDictContainsSubset is deprecated since
        # Python 3.2 and removed in Python 3.12.
        all_options = options.get_all_options()
        for name, value in expected.items():
            self.assertIn(name, all_options)
            self.assertEqual(all_options[name], value)

        mock_view = options.view_as(PipelineOptionsTest.MockOptions)
        self.assertEqual(mock_view.mock_flag, expected['mock_flag'])
        self.assertEqual(mock_view.mock_option, expected['mock_option'])
def pipeline_options_remote(argv):
    """Creating a Pipeline using a PipelineOptions object for remote execution.

    URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params

    Args:
      argv: command-line flags, forwarded as PipelineOptions(flags=argv). The
        custom MyOptions view reads --input and --output from them.
    """
    from google.cloud.dataflow import Pipeline
    from google.cloud.dataflow.utils.options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)
    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):

        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')
    # [END pipeline_options_define_custom]

    from google.cloud.dataflow.utils.options import GoogleCloudOptions
    from google.cloud.dataflow.utils.options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowPipelineRunner or
    # BlockingDataflowPipelineRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    # Overriding the runner for tests.
    options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
    # The pipeline built inside the snippet region above is discarded; this
    # one, created after the runner override, is the one that actually runs.
    p = Pipeline(options=options)

    # NOTE: 'df' is not imported inside this function; it has to be provided
    # by the enclosing module scope.
    lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
    lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))

    p.run()
def test_get_all_options(self):
    """Verify get_all_options() and the MockOptions view for each test case.

    Every key/value of a case's 'expected' dict must appear in the dictionary
    returned by get_all_options(); mock_flag and mock_option are additionally
    read back through view_as(MockOptions).
    """
    for case in PipelineOptionsTest.TEST_CASES:
        options = PipelineOptions(flags=case['flags'])
        wanted = case['expected']

        # assertDictContainsSubset is deprecated since Python 3.2 and removed
        # in Python 3.12, so the subset relation is checked key by key.
        actual = options.get_all_options()
        for key, value in wanted.items():
            self.assertIn(key, actual)
            self.assertEqual(actual[key], value)

        view = options.view_as(PipelineOptionsTest.MockOptions)
        self.assertEqual(view.mock_flag, wanted['mock_flag'])
        self.assertEqual(view.mock_option, wanted['mock_option'])
def test_no_main_session(self):
    """Staging yields an empty resource list when save_main_session is False."""
    staging_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    cloud_view = options.view_as(GoogleCloudOptions)
    cloud_view.staging_location = staging_dir
    setup_view = options.view_as(SetupOptions)
    setup_view.save_main_session = False
    self.update_options(options)

    no_resources = []
    self.assertEqual(no_resources, dependency.stage_job_resources(options))
def test_requirements_file_not_present(self):
    """Staging raises RuntimeError when --requirements_file does not exist."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'

    # Only the call under test sits inside assertRaises, so an unrelated
    # RuntimeError raised during setup cannot satisfy the expectation.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # Read the text from args: BaseException.message is deprecated since
    # Python 2.6 and does not exist on Python 3.
    self.assertEqual(
        cm.exception.args[0],
        'The file %s cannot be found. It was specified in the '
        '--requirements_file command line option.' % 'nosuchfile')
def test_requirements_file_not_present(self):
    """A missing --requirements_file makes staging fail with RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'

    # Keep the guarded region to the single call expected to raise.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # args[0] carries the message; the .message attribute is deprecated since
    # Python 2.6 and is gone in Python 3.
    expected_message = (
        'The file %s cannot be found. It was specified in the '
        '--requirements_file command line option.' % 'nosuchfile')
    self.assertEqual(cm.exception.args[0], expected_message)
def test_sdk_location_gcs(self):
    """Stage an SDK tarball whose --sdk_location is a gs:// path."""
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    staging_dir = tempfile.mkdtemp()
    self.override_file_copy(sdk_location, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    expected = [
        names.PICKLED_MAIN_SESSION_FILE,
        names.DATAFLOW_SDK_TARBALL_FILE,
    ]
    self.assertEqual(expected, dependency.stage_job_resources(options))
def test_sdk_location_local_not_present(self):
    """Staging raises RuntimeError when --sdk_location names a missing path."""
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    # Only the call under test sits inside assertRaises, so an unrelated
    # RuntimeError raised during setup cannot satisfy the expectation.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # Read the text from args: BaseException.message is deprecated since
    # Python 2.6 and does not exist on Python 3.
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.args[0])
def test_sdk_location_gcs(self):
    """An SDK tarball located on GCS is listed among the staged resources."""
    staging_dir = tempfile.mkdtemp()
    gcs_tarball = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(gcs_tarball, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = gcs_tarball

    staged = dependency.stage_job_resources(options)
    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
        staged)
def test_with_extra_packages_missing_files(self):
    """Staging raises RuntimeError when an --extra_packages file is missing."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

    # Only the call under test sits inside assertRaises, so an unrelated
    # RuntimeError raised during setup cannot satisfy the expectation.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # Read the text from args: BaseException.message is deprecated since
    # Python 2.6 and does not exist on Python 3.
    self.assertEqual(
        cm.exception.args[0],
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def test_sdk_location_local_not_present(self):
    """A non-existent local --sdk_location makes staging fail."""
    staging_dir = tempfile.mkdtemp()
    sdk_location = 'nosuchdir'
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    # Keep the guarded region to the single call expected to raise.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # args[0] carries the message; the .message attribute is deprecated since
    # Python 2.6 and is gone in Python 3.
    expected_message = (
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location)
    self.assertEqual(expected_message, cm.exception.args[0])
def model_pcollection(argv):
    """Creating a PCollection from data in local memory.

    URL: https://cloud.google.com/dataflow/model/pcollection

    Args:
      argv: command-line flags passed to PipelineOptions; the custom MyOptions
        view declares --output as a required argument.
    """
    from google.cloud.dataflow.utils.options import PipelineOptions

    class MyOptions(PipelineOptions):

        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--output',
                                dest='output',
                                required=True,
                                help='Output file to write results to.')

    pipeline_options = PipelineOptions(argv)
    my_options = pipeline_options.view_as(MyOptions)

    # NOTE: 'df' is not imported inside this function; it has to be provided
    # by the enclosing module scope.
    # [START model_pcollection]
    p = df.Pipeline(options=pipeline_options)

    (p
     | df.Create([
         'To be, or not to be: that is the question: ',
         'Whether \'tis nobler in the mind to suffer ',
         'The slings and arrows of outrageous fortune, ',
         'Or to take arms against a sea of troubles, '])
     | df.io.Write(df.io.TextFileSink(my_options.output)))

    p.run()
def test_with_extra_packages_invalid_file_name(self):
    """Staging rejects an --extra_packages entry that is not a .tar.gz file."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    package_path = os.path.join(source_dir, 'abc.tgz')
    self.create_temp_file(package_path, 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [package_path]

    # Only the call under test sits inside assertRaises, so an unrelated
    # RuntimeError raised during setup cannot satisfy the expectation.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # Read the text from args: BaseException.message is deprecated since
    # Python 2.6 and does not exist on Python 3.
    self.assertEqual(
        cm.exception.args[0],
        'The --extra_packages option expects a full path ending with '
        '\'.tar.gz\' instead of %s' % package_path)
def test_get_unknown_args(self):
    """Check flag parsing when a locally defined options class adds a flag.

    Each case lists command-line flags and the option values that must show up
    in get_all_options(); mock_flag is also read through the local MockOptions
    view.
    """

    # Used for testing newly added flags.
    class MockOptions(PipelineOptions):

        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--mock_flag',
                                action='store_true',
                                help='Enable work item profiling')

    test_cases = [
        {'flags': ['--num_workers', '5'],
         'expected': {'num_workers': 5, 'mock_flag': False}},
        {
            'flags': [
                '--profile', '--profile_location', 'gs://bucket/', 'ignored'],
            'expected': {
                'profile': True, 'profile_location': 'gs://bucket/',
                'mock_flag': False}
        },
        {'flags': ['--num_workers', '5', '--mock_flag'],
         'expected': {'num_workers': 5, 'mock_flag': True}},
    ]

    for case in test_cases:
        expected = case['expected']
        options = PipelineOptions(flags=case['flags'])

        # Explicit subset check: assertDictContainsSubset is deprecated since
        # Python 3.2 and removed in Python 3.12.
        all_options = options.get_all_options()
        for name, value in expected.items():
            self.assertIn(name, all_options)
            self.assertEqual(all_options[name], value)

        self.assertEqual(options.view_as(MockOptions).mock_flag,
                         expected['mock_flag'])
def test_with_extra_packages_missing_files(self):
    """A missing --extra_packages file makes staging fail with RuntimeError."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

    # Keep the guarded region to the single call expected to raise.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # args[0] carries the message; the .message attribute is deprecated since
    # Python 2.6 and is gone in Python 3.
    expected_message = (
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
    self.assertEqual(cm.exception.args[0], expected_message)
def test_with_extra_packages_invalid_file_name(self):
    """An --extra_packages path not ending in .tar.gz is rejected."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    bad_package = os.path.join(source_dir, 'abc.tgz')
    self.create_temp_file(bad_package, 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [bad_package]

    # Keep the guarded region to the single call expected to raise.
    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # args[0] carries the message; the .message attribute is deprecated since
    # Python 2.6 and is gone in Python 3.
    expected_message = (
        'The --extra_packages option expects a full path ending with '
        '\'.tar.gz\' instead of %s' % bad_package)
    self.assertEqual(cm.exception.args[0], expected_message)
def test_sdk_location_default(self):
    """Stage the SDK tarball when --sdk_location is set to 'default'.

    The download URL is built from PACKAGES_URL_PREFIX and __version__; both
    the download and the copy step are overridden by test helpers.
    """
    staging_dir = tempfile.mkdtemp()
    download_url = '%s/v%s.tar.gz' % (
        dependency.PACKAGES_URL_PREFIX, __version__)
    downloaded_path = self.override_file_download(download_url, staging_dir)
    self.override_file_copy(downloaded_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    expected = [
        names.PICKLED_MAIN_SESSION_FILE,
        names.DATAFLOW_SDK_TARBALL_FILE,
    ]
    staged = dependency.stage_job_resources(
        options, file_copy=dependency._dependency_file_copy)
    self.assertEqual(expected, staged)
def test_with_extra_packages(self):
    """Stage local and GCS extra packages together with the manifest file.

    A fake file_copy is installed as dependency._dependency_file_copy so that
    no real GCS access happens. The fake records every gs:// source it is
    asked to copy. The original copy function is put back afterwards so the
    fake does not leak into other tests.
    """
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        'gs://my-gcs-bucket/gcs.tar.gz'
    ]

    gcs_copied_files = []

    def file_copy(from_path, to_path):
        if from_path.startswith('gs://'):
            # Pretend to download: create a placeholder file at the target.
            gcs_copied_files.append(from_path)
            _, from_name = os.path.split(from_path)
            self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
            logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
        elif to_path.startswith('gs://'):
            logging.info('Faking file_copy(%s, %s)', from_path, to_path)
        else:
            shutil.copyfile(from_path, to_path)

    # Swap in the fake only for the duration of the staging call and always
    # restore the module-level function, even if staging raises.
    original_file_copy = dependency._dependency_file_copy
    dependency._dependency_file_copy = file_copy
    try:
        staged = dependency.stage_job_resources(options)
    finally:
        dependency._dependency_file_copy = original_file_copy

    self.assertEqual([
        'abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
        dependency.EXTRA_PACKAGES_FILE, names.PICKLED_MAIN_SESSION_FILE
    ], staged)
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
        self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                         f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
def test_with_requirements_file(self):
    """Stage a job whose --requirements_file points at an existing file."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    requirements_path = os.path.join(source_dir, dependency.REQUIREMENTS_FILE)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = requirements_path
    self.create_temp_file(requirements_path, 'nothing')

    staged = dependency.stage_job_resources(options)
    self.assertEqual(
        [dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE],
        staged)

    staged_requirements = os.path.join(
        staging_dir, dependency.REQUIREMENTS_FILE)
    self.assertTrue(os.path.isfile(staged_requirements))
def test_sdk_location_gcs(self):
    """Stage the SDK when --sdk_location is a gs:// bucket.

    The tarball expected to be copied is named after __version__ and joined
    onto the bucket path with utils.path.join.
    """
    sdk_location = 'gs://my-gcs-bucket'
    staging_dir = tempfile.mkdtemp()
    tarball_name = 'google-cloud-dataflow-python-sdk-%s.tgz' % __version__
    expected_from_path = utils.path.join(sdk_location, tarball_name)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    staged = dependency.stage_job_resources(options)
    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
        staged)
def test_setup_file_not_named_setup_dot_py(self):
    """Staging rejects a --setup_file that is not literally named setup.py."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    setup_path = os.path.join(source_dir, 'xyz-setup.py')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = setup_path
    self.create_temp_file(setup_path, 'notused')

    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # Read the text from args: BaseException.message is deprecated since
    # Python 2.6 and does not exist on Python 3.
    self.assertTrue(
        cm.exception.args[0].startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))
def test_override_options(self):
    """Setting mock_flag through a view is reflected in get_all_options()."""
    options = PipelineOptions(['--num_workers', '5'])

    def current(name):
        # Re-read the full option dictionary on every lookup.
        return options.get_all_options()[name]

    self.assertEqual(current('num_workers'), 5)
    self.assertEqual(current('mock_flag'), False)

    options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True

    self.assertEqual(current('num_workers'), 5)
    self.assertEqual(current('mock_flag'), True)
def test_override_options(self):
    """Overriding mock_flag on a view changes what get_all_options() reports.

    num_workers comes from the command-line flags and must stay at 5 before
    and after the override.
    """
    flags = ['--num_workers', '5']
    options = PipelineOptions(flags)

    def lookup(option_name):
        # Each lookup asks for a fresh option dictionary.
        return options.get_all_options()[option_name]

    self.assertEqual(lookup('num_workers'), 5)
    self.assertEqual(lookup('mock_flag'), False)

    mock_view = options.view_as(PipelineOptionsTest.MockOptions)
    mock_view.mock_flag = True

    self.assertEqual(lookup('num_workers'), 5)
    self.assertEqual(lookup('mock_flag'), True)
def test_setup_file_not_named_setup_dot_py(self):
    """A --setup_file with a name other than setup.py makes staging fail."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    wrong_name = os.path.join(source_dir, 'xyz-setup.py')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = wrong_name
    self.create_temp_file(wrong_name, 'notused')

    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # args[0] carries the message; the .message attribute is deprecated since
    # Python 2.6 and is gone in Python 3.
    expected_prefix = (
        'The --setup_file option expects the full path to a file named '
        'setup.py instead of ')
    self.assertTrue(cm.exception.args[0].startswith(expected_prefix))
def test_with_requirements_file(self):
    """A requirements file is reported as staged and lands in staging_dir."""
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    source_requirements = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = source_requirements
    self.create_temp_file(source_requirements, 'nothing')

    expected = [dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE]
    self.assertEqual(expected, dependency.stage_job_resources(options))

    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
def test_option_with_spcae(self):
    """An option name containing spaces survives parsing and a dict round trip."""
    option_name = 'option with space'
    expected_value = ' value with space'

    options = PipelineOptions(flags=['--option with space= value with space'])
    parsed_view = options.view_as(PipelineOptionsTest.MockOptions)
    self.assertEqual(getattr(parsed_view, option_name), expected_value)

    options_from_dict = PipelineOptions.from_dictionary(
        options.get_all_options())
    rebuilt_view = options_from_dict.view_as(PipelineOptionsTest.MockOptions)
    self.assertEqual(getattr(rebuilt_view, option_name), expected_value)
def test_sdk_location_local(self):
    """Stage an SDK tarball taken from a local --sdk_location directory.

    The staged copy must carry the same contents as the source tarball.
    """
    staging_dir = tempfile.mkdtemp()
    sdk_location = tempfile.mkdtemp()
    source_tarball = os.path.join(
        sdk_location, names.DATAFLOW_SDK_TARBALL_FILE)
    self.create_temp_file(source_tarball, 'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    staged = dependency.stage_job_resources(options)
    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
        staged)

    staged_tarball = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(staged_tarball) as f:
        self.assertEqual(f.read(), 'contents')
def test_sdk_location_default(self):
    """With sdk_location='default' the SDK tarball is fetched and staged."""
    staging_dir = tempfile.mkdtemp()
    url_parts = (dependency.PACKAGES_URL_PREFIX, __version__)
    expected_from_url = '%s/v%s.tar.gz' % url_parts
    expected_from_path = self.override_file_download(
        expected_from_url, staging_dir)
    self.override_file_copy(expected_from_path, staging_dir)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    staged = dependency.stage_job_resources(
        options, file_copy=dependency._dependency_file_copy)
    self.assertEqual(
        [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
        staged)
def test_option_with_spcae(self):
    """Flags whose name and value contain spaces are parsed and round-tripped.

    The option is read back twice: from the parsed options, and from options
    rebuilt with PipelineOptions.from_dictionary().
    """
    spaced_name = 'option with space'
    spaced_value = ' value with space'
    mock_options_cls = PipelineOptionsTest.MockOptions

    options = PipelineOptions(flags=['--option with space= value with space'])
    self.assertEqual(
        getattr(options.view_as(mock_options_cls), spaced_name),
        spaced_value)

    all_options = options.get_all_options()
    options_from_dict = PipelineOptions.from_dictionary(all_options)
    self.assertEqual(
        getattr(options_from_dict.view_as(mock_options_cls), spaced_name),
        spaced_value)
def test_no_temp_location(self):
    """Staging raises RuntimeError when temp_location is None."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.staging_location = staging_dir
    self.update_options(options)
    # Cleared after update_options() so that the final value is None.
    google_cloud_options.temp_location = None

    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # Read the text from args: BaseException.message is deprecated since
    # Python 2.6 and does not exist on Python 3.
    self.assertEqual('The --temp_location option must be specified.',
                     cm.exception.args[0])
def test_no_temp_location(self):
    """A missing temp_location makes stage_job_resources fail."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.staging_location = staging_dir
    self.update_options(options)
    # Reset after update_options() so that temp_location ends up as None.
    google_cloud_options.temp_location = None

    with self.assertRaises(RuntimeError) as cm:
        dependency.stage_job_resources(options)

    # args[0] carries the message; the .message attribute is deprecated since
    # Python 2.6 and is gone in Python 3.
    expected_message = 'The --temp_location option must be specified.'
    self.assertEqual(expected_message, cm.exception.args[0])
def test_from_dictionary(self):
    """Options rebuilt with from_dictionary() keep mock_flag and mock_option.

    For every entry of TEST_CASES the options are parsed from flags, dumped
    with get_all_options(), rebuilt with PipelineOptions.from_dictionary(),
    and the rebuilt object is compared with the expected values.
    """
    for case in PipelineOptionsTest.TEST_CASES:
        expected = case['expected']
        options = PipelineOptions(flags=case['flags'])
        all_options_dict = options.get_all_options()
        options_from_dict = PipelineOptions.from_dictionary(all_options_dict)

        # Both assertions read from the rebuilt options. Reading mock_option
        # from the original 'options' would leave the round trip of that
        # option untested.
        rebuilt_view = options_from_dict.view_as(
            PipelineOptionsTest.MockOptions)
        self.assertEqual(rebuilt_view.mock_flag, expected['mock_flag'])
        self.assertEqual(rebuilt_view.mock_option, expected['mock_option'])
def test_default_resources(self):
    """With default options only the pickled main session file is staged."""
    staging_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)

    staged = dependency.stage_job_resources(options)
    self.assertEqual([names.PICKLED_MAIN_SESSION_FILE], staged)

    session_file = os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)
    self.assertTrue(os.path.isfile(session_file))
def test_with_extra_packages(self):
    """Extra packages from local paths and GCS are staged with a manifest.

    dependency._dependency_file_copy is temporarily replaced by a fake that
    avoids real GCS traffic and records each gs:// source. The original
    function is restored afterwards so the fake does not leak into other
    tests.
    """
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    local_files = ('abc.tar.gz', 'xyz.tar.gz', dependency.EXTRA_PACKAGES_FILE)
    for file_name in local_files:
        self.create_temp_file(os.path.join(source_dir, file_name), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []

    def file_copy(from_path, to_path):
        if from_path.startswith('gs://'):
            # Simulate the download by creating a placeholder at the target.
            gcs_copied_files.append(from_path)
            _, from_name = os.path.split(from_path)
            self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
            logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
        elif to_path.startswith('gs://'):
            logging.info('Faking file_copy(%s, %s)', from_path, to_path)
        else:
            shutil.copyfile(from_path, to_path)

    # Install the fake just for the staging call; the finally clause puts
    # the real function back even when staging raises.
    saved_file_copy = dependency._dependency_file_copy
    dependency._dependency_file_copy = file_copy
    try:
        staged = dependency.stage_job_resources(options)
    finally:
        dependency._dependency_file_copy = saved_file_copy

    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
         dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        staged)
    with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
        self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                         f.readlines())
    self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
def test_default_resources(self):
    """Default options stage exactly the pickled main session file."""
    staging_dir = tempfile.mkdtemp()
    options = PipelineOptions()
    cloud_view = options.view_as(GoogleCloudOptions)
    cloud_view.staging_location = staging_dir
    self.update_options(options)

    expected = [names.PICKLED_MAIN_SESSION_FILE]
    self.assertEqual(expected, dependency.stage_job_resources(options))

    staged_path = os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)
    self.assertTrue(os.path.isfile(staged_path))
def test_from_dictionary(self):
    """Round-trip options through get_all_options() / from_dictionary().

    Each TEST_CASES entry is parsed from flags, converted to a dictionary and
    rebuilt; mock_flag and mock_option of the rebuilt options must equal the
    expected values.
    """
    for case in PipelineOptionsTest.TEST_CASES:
        options = PipelineOptions(flags=case['flags'])
        all_options_dict = options.get_all_options()
        options_from_dict = PipelineOptions.from_dictionary(all_options_dict)

        # mock_option is read from the rebuilt options as well. Taking it
        # from the original 'options' object would mean from_dictionary() is
        # never checked for that option.
        self.assertEqual(
            options_from_dict.view_as(
                PipelineOptionsTest.MockOptions).mock_flag,
            case['expected']['mock_flag'])
        self.assertEqual(
            options_from_dict.view_as(
                PipelineOptionsTest.MockOptions).mock_option,
            case['expected']['mock_option'])
def test_sdk_location_local(self):
    """A local SDK tarball is staged and its contents are preserved."""
    staging_dir = tempfile.mkdtemp()
    sdk_location = tempfile.mkdtemp()
    tarball_name = names.DATAFLOW_SDK_TARBALL_FILE
    self.create_temp_file(os.path.join(sdk_location, tarball_name), 'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    expected = [
        names.PICKLED_MAIN_SESSION_FILE,
        names.DATAFLOW_SDK_TARBALL_FILE,
    ]
    self.assertEqual(expected, dependency.stage_job_resources(options))

    tarball_path = os.path.join(staging_dir, tarball_name)
    with open(tarball_path) as f:
        staged_contents = f.read()
        self.assertEqual(staged_contents, 'contents')
def test_with_extra_packages(self):
    """Stage two local extra packages together with the manifest file.

    The manifest written to the staging directory must list both package
    names, one per line.
    """
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    abc_path = os.path.join(source_dir, 'abc.tar.gz')
    xyz_path = os.path.join(source_dir, 'xyz.tar.gz')
    manifest_source = os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE)
    for path in (abc_path, xyz_path, manifest_source):
        self.create_temp_file(path, 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [abc_path, xyz_path]

    staged = dependency.stage_job_resources(options)
    self.assertEqual(
        ['abc.tar.gz', 'xyz.tar.gz', dependency.EXTRA_PACKAGES_FILE,
         names.PICKLED_MAIN_SESSION_FILE],
        staged)

    staged_manifest = os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)
    with open(staged_manifest) as f:
        self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n'], f.readlines())
def test_with_requirements_file_and_cache(self):
    """Stage a requirements file together with files from a requirements cache.

    The cache directory is set to a non-default location and is filled by the
    test's populate_requirements_cache helper; 'abc.txt' and 'def.txt' are
    expected to be staged in addition to the requirements file.
    """
    staging_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    requirements_path = os.path.join(source_dir, dependency.REQUIREMENTS_FILE)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = requirements_path
    options.view_as(SetupOptions).requirements_cache = os.path.join(
        tempfile.gettempdir(), 'alternative-cache-dir')
    self.create_temp_file(requirements_path, 'nothing')

    expected = sorted([
        dependency.REQUIREMENTS_FILE,
        names.PICKLED_MAIN_SESSION_FILE,
        'abc.txt',
        'def.txt',
    ])
    staged = dependency.stage_job_resources(
        options,
        populate_requirements_cache=self.populate_requirements_cache)
    # Order is not significant here, hence the comparison of sorted lists.
    self.assertEqual(expected, sorted(staged))

    for staged_name in (dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt'):
        self.assertTrue(
            os.path.isfile(os.path.join(staging_dir, staged_name)))
def pipeline_options_local(argv):
    """Creating a Pipeline using a PipelineOptions object for local execution.

    URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params

    Args:
      argv: command-line flags, forwarded as PipelineOptions(flags=argv). The
        custom MyOptions view reads --input and --output from them, each with
        a default value.
    """
    from google.cloud.dataflow import Pipeline
    from google.cloud.dataflow.utils.options import PipelineOptions

    options = PipelineOptions(flags=argv)

    # [START pipeline_options_define_custom_with_help_and_default]
    class MyOptions(PipelineOptions):

        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the dataflow pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='Output for the dataflow pipeline',
                                default='gs://my-bucket/output')
    # [END pipeline_options_define_custom_with_help_and_default]

    # Input and output are taken from the argv-based options before 'options'
    # is rebound to a fresh PipelineOptions() below.
    my_options = options.view_as(MyOptions)

    my_input = my_options.input
    my_output = my_options.output

    # [START pipeline_options_local]
    # Create and set your Pipeline Options.
    options = PipelineOptions()
    p = Pipeline(options=options)
    # [END pipeline_options_local]

    # NOTE: 'df' is not imported inside this function; it has to be provided
    # by the enclosing module scope.
    lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
    lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))
    p.run()
def model_pipelines(argv): """A wordcount snippet as a simple pipeline example. URL: https://cloud.google.com/dataflow/model/pipelines """ # [START model_pipelines] import re import google.cloud.dataflow as df from google.cloud.dataflow.utils.options import PipelineOptions class MyOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear' '.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') pipeline_options = PipelineOptions(argv) my_options = pipeline_options.view_as(MyOptions) p = df.Pipeline(options=pipeline_options) (p | df.io.Read(df.io.TextFileSource(my_options.input)) | df.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) | df.Map(lambda x: (x, 1)) | df.combiners.Count.PerKey() | df.io.Write(df.io.TextFileSink(my_options.output))) p.run()
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If same transform instance needs to be applied then a clone should be created
  with a new label (e.g., transform.clone('new label')).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if 'options' is not a PipelineOptions object, if 'argv' is
        not a list, or if validating the pipeline options reports errors.
      TypeError: if 'runner' is neither a PipelineRunner object nor a string
        naming a registered runner.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        # Format the message eagerly: passing the value as a second argument
        # to ValueError would leave the '%r' placeholder unexpanded.
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % (options,))
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r'
            % (argv,))
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(
            ('Missing pipeline option (runner). Executing pipeline '
             'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    if self.options is not None:
      errors = PipelineOptionsValidator(self.options, runner).validate()
      if errors:
        raise ValueError('Pipeline has validations errors: \n' +
                         '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()
    # Store cache of views created from PCollections. For reference, see
    # pvalue._cache_view().
    self._view_cache = {}

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self):
    """Runs the pipeline. Returns whatever our runner returns after running."""
    if not self.options or self.options.view_as(
        SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(
            os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        # The pickle is only a probe; always remove the scratch directory.
        shutil.rmtree(tmpdir)
    return self.runner.run(self)

  def visit(self, visitor):
    """Visits depth-first every node of a pipeline's DAG.

    The traversal starts at the root transform of the transform stack.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.
    """
    visited = set()
    self._root_transform().visit(visitor, self, visited)

  def apply(self, transform, pvalueish=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform (or callable) to apply.
      pvalueish: the input for the PTransform (typically a PCollection).

    Returns:
      Whatever the runner's apply() returns for the transform and pvalueish.

    Raises:
      RuntimeError: if a transform with the same full label was already
        applied to this pipeline and needs to be cloned in order to apply
        again.
      NotImplementedError: if the inputs extracted from pvalueish cannot be
        turned into a tuple of PValue objects.
      TypeCheckError: if type_check_strictness is 'ALL_REQUIRED' and the
        transform declares no output type hints.
    """
    if not isinstance(transform, ptransform.PTransform):
      transform = _CallableWrapperPTransform(transform)

    full_label = format_full_label(self._current_transform(), transform)
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform "%s" does not have a stable unique label. '
          'This will prevent updating of pipelines. '
          'To clone a transform with a new label use: '
          'transform.clone("NEW LABEL").'
          % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      # Covers both a non-iterable `inputs` and a non-PValue leaf.
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    current = AppliedPTransform(self._current_transform(), transform,
                                full_label, inputs)
    self._current_transform().add_part(current)
    self.transforms_stack.append(current)

    if self.options is not None:
      type_options = self.options.view_as(TypeOptions)
    else:
      type_options = None

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform DAG.
      # This way we preserve the last transform of a composite transform as
      # being the real producer of the result.
      if result.producer is None:
        result.producer = current
      # TODO(robertwb): Multi-input, multi-output inference.
      # TODO(robertwb): Ideally we'd do intersection here.
      if (type_options is not None and type_options.pipeline_type_check and
          isinstance(
              result, (pvalue.PCollection, pvalue.PCollectionView)) and
          not result.element_type):
        input_element_type = (inputs[0].element_type
                              if len(inputs) == 1 else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(
            transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(
                    declared_input_type, input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(
              input_element_type)

      assert isinstance(result.producer.inputs, tuple)
      current.add_output(result)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError(
          'Pipeline type checking is enabled, however no '
          'output type-hint was found for the '
          'PTransform %s' % ptransform_name)

    current.update_input_refcounts()
    self.transforms_stack.pop()
    return pvalueish_result
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If same transform instance needs to be applied then a clone should be created
  with a new label (e.g., transform.clone('new label')).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if 'options' is not a PipelineOptions object, if 'argv' is
        not a list, or if validating the pipeline options reports errors.
      TypeError: if 'runner' is neither a PipelineRunner object nor a string
        naming a registered runner.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        # Interpolate here; a second positional argument to ValueError would
        # be stored alongside the template instead of filling in '%r'.
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % (options,))
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r'
            % (argv,))
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    if self.options is not None:
      errors = PipelineOptionsValidator(self.options, runner).validate()
      if errors:
        raise ValueError(
            'Pipeline has validations errors: \n' + '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()
    # Store cache of views created from PCollections. For reference, see
    # pvalue._cache_view().
    self._view_cache = {}

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self):
    """Runs the pipeline. Returns whatever our runner returns after running."""
    if not self.options or self.options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        # The pickle is only a probe; always remove the scratch directory.
        shutil.rmtree(tmpdir)
    return self.runner.run(self)

  def visit(self, visitor):
    """Visits depth-first every node of a pipeline's DAG.

    The traversal starts at the root transform of the transform stack.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.
    """
    visited = set()
    self._root_transform().visit(visitor, self, visited)

  def apply(self, transform, pvalueish=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform (or callable) to apply.
      pvalueish: the input for the PTransform (typically a PCollection).

    Returns:
      Whatever the runner's apply() returns for the transform and pvalueish.

    Raises:
      RuntimeError: if a transform with the same full label was already
        applied to this pipeline and needs to be cloned in order to apply
        again.
      NotImplementedError: if the inputs extracted from pvalueish cannot be
        turned into a tuple of PValue objects.
      TypeCheckError: if type_check_strictness is 'ALL_REQUIRED' and the
        transform declares no output type hints.
    """
    if not isinstance(transform, ptransform.PTransform):
      transform = _CallableWrapperPTransform(transform)

    full_label = format_full_label(self._current_transform(), transform)
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform "%s" does not have a stable unique label. '
          'This will prevent updating of pipelines. '
          'To clone a transform with a new label use: '
          'transform.clone("NEW LABEL").'
          % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      # Covers both a non-iterable `inputs` and a non-PValue leaf.
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    current = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(current)
    self.transforms_stack.append(current)

    if self.options is not None:
      type_options = self.options.view_as(TypeOptions)
    else:
      type_options = None

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform DAG.
      # This way we preserve the last transform of a composite transform as
      # being the real producer of the result.
      if result.producer is None:
        result.producer = current
      # TODO(robertwb): Multi-input, multi-output inference.
      # TODO(robertwb): Ideally we'd do intersection here.
      if (type_options is not None and type_options.pipeline_type_check and
          isinstance(result, (pvalue.PCollection, pvalue.PCollectionView))
          and not result.element_type):
        input_element_type = (
            inputs[0].element_type
            if len(inputs) == 1
            else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(declared_input_type,
                                               input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(input_element_type)

      assert isinstance(result.producer.inputs, tuple)
      current.add_output(result)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError('Pipeline type checking is enabled, however no '
                           'output type-hint was found for the '
                           'PTransform %s' % ptransform_name)

    current.update_input_refcounts()
    self.transforms_stack.pop()
    return pvalueish_result
class ImagePreprocessor(object):
  """Runs the pre-processing pipeline."""

  def __init__(self, args):
    """Builds the pipeline options used by preprocess().

    Args:
      args: arguments handed to the PipelineOptions constructor.
    """
    self.pipeline_options = PipelineOptions(args)

  def preprocess(self, input_path, input_dict, output_path):
    """Builds and runs the image pre-processing pipeline.

    Args:
      input_path: Input specified as uri to CSV file. Each line is parsed with
        csv.reader's default dialect and is expected to carry the GCS uri of
        an image and its labels.
        NOTE(review): the previous doc said "colon-separated"; confirm the
        delimiter the input files actually use.
      input_dict: Input dictionary. Specified as text file uri. Each line of
        the file stores one label.
      output_path: Prefix of the output files. One text sink named
        '<output_path>.<dataset>.json' is written for each dataset in
        Dataset.ALL.
    """
    opt = self.pipeline_options.view_as(PrepareImagesOptions)
    p = df.Pipeline(options=self.pipeline_options)

    # Read input data.
    csv_data = df.io.TextFileSource(input_path, strip_trailing_newlines=True)
    dict_data = df.io.TextFileSource(input_dict, strip_trailing_newlines=True)
    labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
    content = (p | df.Read(StageName.READ_CSV, csv_data)
               # Builtin next() instead of the Python-2-only .next() method;
               # it returns the single parsed row either way.
               | df.Map(StageName.PARSE_CSV,
                        lambda line: next(csv.reader([line])))
               | df.ParDo(StageName.EXTRACT_LABEL_IDS, ExtractLabelIdsDoFn(),
                          df.pvalue.AsIter(labels))
               | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

    # Process input data using common transformations.
    image_graph_uri = os.path.join(opt.input_data_location,
                                   Default.IMAGE_GRAPH_FILENAME)
    examples = (
        content
        | df.ParDo(
            StageName.CONVERT_IMAGE,
            ResizeImageDoFn(Default.IMAGE_TYPE,
                            opt.max_image_width, opt.max_image_height))
        | df.ParDo(
            StageName.ENCODE_EXAMPLE,
            EncodeExampleDoFn(image_graph_uri,
                              opt.image_graph_jpeg_input_tensor,
                              opt.image_graph_output_tensor,
                              opt.training_data_percentage)))

    # Write in JSON format to Text file.
    # Remove redundant whitespace for more compact representation.
    # Images/labels are base64 encoded so will not contain spaces.
    to_json = lambda x: re.sub(r'\s+', ' ', json_format.MessageToJson(x[0]))

    for dataset in Dataset.ALL:
      # `dataset=dataset` binds the current loop value into the lambda so each
      # filter keeps its own dataset instead of the last one iterated.
      _ = (examples
           | df.Filter(StageName.FILTER + dataset,
                       lambda x, dataset=dataset: x[1] == dataset)
           | df.Map(StageName.TO_JSON + dataset, to_json)
           | df.Write(
               StageName.SAVE + dataset,
               df.io.TextFileSink('{}.{}.json'.format(output_path, dataset),
                                  num_shards=opt.output_shard_count)))

    # Execute the pipeline.
    p.run()
def pipeline_monitoring(renames):
  """Snippets for the monitoring-interface documentation.

  URL: https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf

  Args:
    renames: handed to SnippetUtils.RenameFiles; the resulting visitor is
      applied to the pipeline before it runs.
  """
  import re

  import google.cloud.dataflow as df
  from google.cloud.dataflow.utils.options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input',
          help='Input for the dataflow pipeline',
          default='gs://my-bucket/input')
      parser.add_argument(
          '--output',
          help='output for the dataflow pipeline',
          default='gs://my-bucket/output')

  class ExtractWordsFn(df.DoFn):

    def process(self, context):
      # Emit every run of letters/apostrophes found in the element.
      for token in re.findall(r'[A-Za-z\']+', context.element):
        yield token

  class FormatCountsFn(df.DoFn):

    def process(self, context):
      token, occurrences = context.element
      yield '%s: %s' % (token, occurrences)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(df.PTransform):

    def apply(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | df.ParDo('ExtractWords', ExtractWordsFn())
              # Count the number of times each word occurs.
              | df.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | df.ParDo('FormatCounts', FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  base_options = PipelineOptions()
  options = base_options.view_as(WordCountOptions)
  p = df.Pipeline(options=base_options)

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | df.io.Read('ReadLines', df.io.TextFileSource(options.input))
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | df.io.Write('WriteCounts', df.io.TextFileSink(options.output)))
  # [END pipeline_monitoring_execution]

  rename_visitor = SnippetUtils.RenameFiles(renames)
  p.visit(rename_visitor)
  p.run()
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If same transform instance needs to be applied then a clone should be created
  with a new label (e.g., transform.clone('new label')).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if 'options' is not a PipelineOptions object or 'argv' is
        not a list.
      TypeError: if 'runner' (after falling back to the runner named in the
        options) is neither a PipelineRunner object nor a string naming a
        registered runner.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self.options = options
      else:
        # Format the message eagerly: passing the value as a second argument
        # to ValueError would leave the '%r' placeholder unexpanded.
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % (options,))
    elif argv is not None:
      if isinstance(argv, list):
        self.options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r'
            % (argv,))
    else:
      self.options = None

    if runner is None and self.options is not None:
      runner = self.options.view_as(StandardOptions).runner

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # List of PValue objects representing a DAG of transformations.
    self._nodes = []
    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()

  def _add_pvalue(self, pval):
    """Adds a PValue to the pipeline's node list."""
    if pval not in self._nodes:
      self._nodes.append(pval)

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self):
    """Runs the pipeline. Returns whatever our runner returns after running."""
    return self.runner.run(self)

  def visit(self, visitor, node=None):
    """Visits depth-first every node of a pipeline's DAG.

    If node is specified then only that node's predecessors (inputs and
    recursively their creating transforms) and outputs will be visited.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.
      node: if specified it is expected to be a PValue and only the nodes of
        the DAG reachable from this node will be visited.

    Raises:
      TypeError: if node is specified and is not a PValue.
      pipeline.PipelineError: if node is specified and does not belong to this
        pipeline instance.
    """
    # Make sure the specified node has its transform registered as an output
    # producer. We can have this situation for PCollections created as results
    # of accessing a tag of a FlatMap().with_outputs() result.
    if node is not None:
      if not isinstance(node, pvalue.PValue):
        # Wrap in a 1-tuple so a tuple-valued `node` is reported verbatim
        # instead of being unpacked as multiple format arguments.
        raise TypeError(
            'Expected a PValue for the node argument instead of: %r' % (node,))
      if node not in self._nodes:
        raise error.PipelineError('PValue not in pipeline: %r' % node)
      assert node.producer is not None

    visited = set()
    start_transform = self._root_transform() if node is None else node.producer
    start_transform.visit(visitor, self, visited)

  def apply(self, transform, pvalueish=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTransform (or callable) to apply.
      pvalueish: the input for the PTransform (typically a PCollection).

    Returns:
      Whatever the runner's apply() returns for the transform and pvalueish.

    Raises:
      TypeError: if the transform object extracted from the argument list is
        not a callable type or a descendant from PTransform.
      RuntimeError: if the transform object was already applied to this pipeline
        and needs to be cloned in order to apply again.
      NotImplementedError: if the inputs extracted from pvalueish cannot be
        turned into a tuple of PValue objects.
      TypeCheckError: if type_check_strictness is 'ALL_REQUIRED' and the
        transform declares no output type hints.
    """
    if not isinstance(transform, ptransform.PTransform):
      # Raise the documented TypeError rather than relying on an assert,
      # which is stripped when Python runs with -O.
      if not callable(transform):
        raise TypeError(
            'Expected a PTransform or a callable instead of: %r'
            % (transform,))

      class CallableTransform(ptransform.PTransform):
        """Wraps a plain callable so it can be applied like a PTransform."""

        def __init__(self, callee):
          super(CallableTransform, self).__init__(
              label=getattr(callee, '__name__', 'Callable'))
          self._callee = callee

        def apply(self, *args, **kwargs):
          return self._callee(*args, **kwargs)

      transform = CallableTransform(transform)

    full_label = format_full_label(self._current_transform(), transform)
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform with label %s already applied. Please clone the current '
          'instance using a new label or alternatively create a new instance. '
          'To clone a transform use: transform.clone(\'NEW LABEL\').'
          % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      # Covers both a non-iterable `inputs` and a non-PValue leaf.
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    child = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(child)
    self.transforms_stack.append(child)

    if self.options is not None:
      type_options = self.options.view_as(TypeOptions)
    else:
      type_options = None

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform DAG.
      # This way we preserve the last transform of a composite transform as
      # being the real producer of the result.
      if result.producer is None:
        result.producer = child
        self._current_transform().add_output(result)
      # TODO(robertwb): Multi-input, multi-output inference.
      if (type_options is not None and type_options.pipeline_type_check and
          isinstance(result, pvalue.PCollection) and
          not result.element_type):
        input_element_type = (
            inputs[0].element_type
            if len(inputs) == 1
            else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(declared_input_type,
                                               input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(input_element_type)

      assert isinstance(result.producer.inputs, tuple)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError('Pipeline type checking is enabled, however no '
                           'output type-hint was found for the '
                           'PTransform %s' % ptransform_name)

    self.transforms_stack.pop()
    return pvalueish_result