def run_pipeline(self, pipeline):
    """Run ``pipeline`` with the direct runner and return its result handle.

    The PTransform overrides derived from ``pipeline.options`` are applied
    first. A ``TestClock`` is used when the pipeline contains a ``TestStream``
    transform, a ``RealClock`` otherwise. The executor is started without
    blocking, so the returned ``DirectPipelineResult`` is handed back while
    execution continues in background threads.
    """
    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.pipeline import PipelineVisitor
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)
    from apache_beam.testing.test_stream import TestStream

    # Apply the configured PTransform overrides before inspecting the graph.
    pipeline.replace_all(_get_transform_overrides(pipeline.options))

    class _TestStreamUsageVisitor(PipelineVisitor):
        """Records whether any visited transform is a TestStream."""

        def __init__(self):
            self.uses_test_stream = False

        def visit_transform(self, applied_ptransform):
            if isinstance(applied_ptransform.transform, TestStream):
                self.uses_test_stream = True

    # Pipelines that read from a TestStream are driven by a mock clock.
    stream_probe = _TestStreamUsageVisitor()
    pipeline.visit(stream_probe)
    if stream_probe.uses_test_stream:
        clock = TestClock()
    else:
        clock = RealClock()

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')

    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(tracker)

    pipeline_opts = pipeline._options
    bundle_factory = BundleFactory(
        stacked=pipeline_opts.view_as(
            DirectOptions).direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        pipeline_opts,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views,
        clock)

    executor = Executor(
        tracker.value_to_consumers,
        TransformEvaluatorRegistry(evaluation_context),
        evaluation_context)

    # DirectRunner does not support injecting PipelineOptions values at
    # runtime, so the runtime options are an empty mapping.
    RuntimeValueProvider.set_runtime_options({})

    # Non-blocking: execution proceeds in background threads after this call.
    executor.start(tracker.root_transforms)
    return DirectPipelineResult(executor, evaluation_context)
def _r(runner, options, seeds):
    """Reseed BigQuery, clear the runtime options and run a test pipeline.

    ``bigquery.truncate`` and ``bigquery.seed`` are both called with
    ``seeds``. The ``RuntimeValueProvider`` runtime options are then reset to
    ``None`` and ``runner._run`` is invoked with a new ``TestPipeline`` built
    from ``options``.
    """
    bigquery.truncate(seeds)
    bigquery.seed(seeds)

    # Start from uninitialized runtime options before the run.
    RuntimeValueProvider.set_runtime_options(None)

    test_pipeline = TestPipeline(options=options)
    runner._run(test_pipeline, options)
def run_pipeline(self, pipeline, options):
    """Run ``pipeline`` with the direct runner and return its result handle.

    The PTransform overrides derived from ``options`` are applied first. A
    ``TestClock`` is used when the pipeline contains a ``TestStream``
    transform, a ``RealClock`` otherwise. The executor is started without
    blocking, so the returned ``DirectPipelineResult`` is handed back while
    execution continues in background threads.
    """
    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.pipeline import PipelineVisitor
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)
    from apache_beam.testing.test_stream import TestStream

    # Apply the configured PTransform overrides before inspecting the graph.
    pipeline.replace_all(_get_transform_overrides(options))

    class _TestStreamUsageVisitor(PipelineVisitor):
        """Records whether any visited transform is a TestStream."""

        def __init__(self):
            self.uses_test_stream = False

        def visit_transform(self, applied_ptransform):
            if isinstance(applied_ptransform.transform, TestStream):
                self.uses_test_stream = True

    # Pipelines that read from a TestStream are driven by a mock clock.
    stream_probe = _TestStreamUsageVisitor()
    pipeline.visit(stream_probe)
    if stream_probe.uses_test_stream:
        clock = TestClock()
    else:
        clock = RealClock()

    # TODO(BEAM-4274): Circular import runners-metrics. Requires refactoring.
    from apache_beam.metrics.execution import MetricsEnvironment
    MetricsEnvironment.set_metrics_supported(True)

    logging.info('Running pipeline with DirectRunner.')

    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(tracker)

    bundle_factory = BundleFactory(
        stacked=options.view_as(
            DirectOptions).direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        options,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views,
        clock)

    executor = Executor(
        tracker.value_to_consumers,
        TransformEvaluatorRegistry(evaluation_context),
        evaluation_context)

    # DirectRunner does not support injecting PipelineOptions values at
    # runtime, so the runtime options are an empty mapping.
    RuntimeValueProvider.set_runtime_options({})

    # Non-blocking: execution proceeds in background threads after this call.
    executor.start(tracker.root_transforms)
    return DirectPipelineResult(executor, evaluation_context)
def test_inserting_the_dest_table_schema_into_pcollection_runtime():
    """Check that ``bq.IngectTableSchema`` attaches the destination schema.

    A single empty element is pushed through the ParDo, with the destination
    table supplied as a ``RuntimeValueProvider`` whose default points at
    ``lake.wrench_metrics``. The output is expected to be one dict holding
    the table schema and an empty payload.

    The runtime options are static state on ``RuntimeValueProvider``. They are
    reset in a ``finally`` clause so the reset happens after the pipeline has
    run, and still happens when the pipeline or its assertion fails.
    """

    def _field(name, field_type, mode):
        # Every expected field has no description and no nested fields.
        return gcp_bq.schema.SchemaField(name, field_type, mode, None, ())

    expected = [{
        'schema': [
            _field('entity_id', 'STRING', 'REQUIRED'),
            _field('tree_user_id', 'INTEGER', 'REQUIRED'),
            _field('prediction', 'STRING', 'REQUIRED'),
            _field('client_wrench_id', 'STRING', 'REQUIRED'),
            _field('expirement_name', 'STRING', 'NULLABLE'),
            _field('processing_datetime', 'DATETIME', 'NULLABLE'),
            _field('ingestion_timestamp', 'TIMESTAMP', 'REQUIRED'),
        ],
        'payload': {}
    }]

    try:
        with TestPipeline() as p:
            lake_table = RuntimeValueProvider(
                option_name='dest',
                value_type=str,
                default_value=f'{project_id}:lake.wrench_metrics')
            pcoll = p | beam.Create([{}])
            schema_pcoll = pcoll | beam.ParDo(
                bq.IngectTableSchema(table=lake_table))
            assert_that(schema_pcoll, equal_to(expected))
    finally:
        # Do not leak runtime options into other tests.
        RuntimeValueProvider.set_runtime_options(None)
def test_experiments_setup(self):
    """Check that runtime ``experiments`` are exposed as a set.

    Before any runtime options are set, ``feature_1`` must not be among
    ``RuntimeValueProvider.experiments``. After setting a list of two
    experiments, the attribute must be a ``set`` containing both.
    """
    # The runtime options are static state shared by every test, so register
    # the reset up front; it then runs even if an assertion below fails.
    self.addCleanup(RuntimeValueProvider.set_runtime_options, None)

    self.assertNotIn('feature_1', RuntimeValueProvider.experiments)

    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['feature_1', 'feature_2']})

    self.assertIsInstance(RuntimeValueProvider.experiments, set)
    self.assertIn('feature_1', RuntimeValueProvider.experiments)
    self.assertIn('feature_2', RuntimeValueProvider.experiments)
def test_experiments_setup(self):
    """Check that runtime ``experiments`` are exposed as a set.

    After setting a list of two experiments as runtime options,
    ``RuntimeValueProvider.experiments`` must be a ``set`` containing both.
    """
    # runtime_options is a static attribute, so it would otherwise affect
    # other test cases. Registering the reset as a cleanup (rather than as a
    # trailing statement) makes it run even when an assertion below fails.
    self.addCleanup(RuntimeValueProvider.set_runtime_options, None)

    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['feature_1', 'feature_2']})

    self.assertIsInstance(RuntimeValueProvider.experiments, set)
    self.assertIn('feature_1', RuntimeValueProvider.experiments)
    self.assertIn('feature_2', RuntimeValueProvider.experiments)
def test_get_destination_uri_runtime_vp(self):
    """Check the export URI when ``gcs_location`` is supplied at runtime.

    With ``gcs_location`` set to ``gs://bucket`` through the runtime options
    and no temp location passed, the URI must be the bucket, the unique id
    and the table-dump file pattern joined by slashes.
    """
    # Provide values at job-execution time.
    runtime_values = {'gcs_location': 'gs://bucket'}
    RuntimeValueProvider.set_runtime_options(runtime_values)

    user_options = self.UserDefinedOptions()
    job_id = uuid.uuid4().hex

    actual_uri = bigquery_export_destination_uri(
        user_options.gcs_location, None, job_id)

    expected_uri = 'gs://bucket/' + job_id + '/bigquery-table-dump-*.json'
    self.assertEqual(actual_uri, expected_uri)
def test_get_destination_uri_empty_runtime_vp(self):
    """Check that a missing GCS location is reported as a ``ValueError``.

    With empty runtime options and no temp location passed,
    ``bigquery_export_destination_uri`` must raise a ``ValueError`` whose
    message starts with the "requires a GCS location" text.

    Only the call under test sits inside the ``assertRaisesRegex`` context,
    so an unexpected ``ValueError`` from the setup statements cannot make
    the test pass.
    """
    # Don't provide any runtime values.
    RuntimeValueProvider.set_runtime_options({})
    options = self.UserDefinedOptions()
    unique_id = uuid.uuid4().hex

    with self.assertRaisesRegex(ValueError,
                                '^ReadFromBigQuery requires a GCS '
                                'location to be provided'):
        bigquery_export_destination_uri(
            options.gcs_location, None, unique_id)
def test_set_runtime_option(self):
    """Check value-provider options before and after runtime options are set.

    Options not supplied at construction time must be inaccessible until
    ``RuntimeValueProvider.set_runtime_options`` is called. Afterwards they
    resolve to the runtime value, their default, or ``None``. The positional
    option supplied at construction time keeps its construction-time value.
    """

    # Value-provider options, with and without default values.
    class UserDefinedOptions1(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # Set at runtime.
            parser.add_value_provider_argument(
                '--vpt_vp_arg6',
                help='This keyword argument is a value provider')
            # Not set, has an int default; declared with a short form.
            parser.add_value_provider_argument(
                '-v', '--vpt_vp_arg7', default=123, type=int)
            # Not set, has a str default; declared with a dash in the name.
            parser.add_value_provider_argument(
                '--vpt_vp-arg8', default='123', type=str)
            # Not set and no default.
            parser.add_value_provider_argument('--vpt_vp_arg9', type=float)
            # Positional argument, set at construction time; its default
            # and the runtime value are ignored.
            parser.add_value_provider_argument(
                'vpt_vp_arg10',
                help='This positional argument is a value provider',
                type=float,
                default=5.4)

    # Values provided at graph-construction time. Options not provided here
    # become of the type RuntimeValueProvider.
    options = UserDefinedOptions1(['1.2'])
    for deferred in ('vpt_vp_arg6', 'vpt_vp_arg7', 'vpt_vp_arg8',
                     'vpt_vp_arg9'):
        self.assertFalse(getattr(options, deferred).is_accessible())
    self.assertTrue(options.vpt_vp_arg10.is_accessible())

    # Values provided at job-execution time. Options not provided here use
    # their default, if they have one.
    runtime_values = {'vpt_vp_arg6': 'abc', 'vpt_vp_arg10': '3.2'}
    RuntimeValueProvider.set_runtime_options(runtime_values)

    resolved = (
        ('vpt_vp_arg6', 'abc'),
        ('vpt_vp_arg7', 123),
        ('vpt_vp_arg8', '123'),
    )
    for option_name, expected_value in resolved:
        self.assertTrue(getattr(options, option_name).is_accessible())
        self.assertEqual(getattr(options, option_name).get(), expected_value)

    self.assertTrue(options.vpt_vp_arg9.is_accessible())
    self.assertIsNone(options.vpt_vp_arg9.get())

    self.assertTrue(options.vpt_vp_arg10.is_accessible())
    self.assertEqual(options.vpt_vp_arg10.get(), 1.2)
def run_pipeline(
    self,
    pipeline,  # type: Pipeline
    options  # type: pipeline_options.PipelineOptions
):
    # type: (...) -> RunnerResult
    """Configure this runner from ``options`` and run ``pipeline``.

    The runtime options are reset to an empty mapping, the ``beam_fn_api``
    experiment is added if missing, and the bundle repeat, worker count,
    default environment and profiler factory are derived from ``options``.
    The pipeline is then converted to its runner API form and executed. The
    result is stored in ``self._latest_run_result`` and returned.
    """
    RuntimeValueProvider.set_runtime_options({})

    # Setup "beam_fn_api" experiment options if lacked.
    experiments = options.view_as(pipeline_options.DebugOptions).experiments
    if not experiments:
        experiments = []
    if 'beam_fn_api' not in experiments:
        experiments.append('beam_fn_api')
    options.view_as(pipeline_options.DebugOptions).experiments = experiments

    # This is sometimes needed if type checking is disabled
    # to enforce that the inputs (and outputs) of GroupByKey operations
    # are known to be KVs.
    from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
    # TODO: Move group_by_key_input_visitor() to a non-dataflow specific file.
    type_options = options.view_as(pipeline_options.TypeOptions)
    gbk_visitor = DataflowRunner.group_by_key_input_visitor(
        not type_options.allow_non_deterministic_key_coders)
    pipeline.visit(gbk_visitor)

    # A bundle repeat already set on the runner wins over the option.
    if not self._bundle_repeat:
        self._bundle_repeat = options.view_as(
            pipeline_options.DirectOptions).direct_runner_bundle_repeat

    # 0 workers means "one per CPU"; any other falsy value keeps the
    # runner's current worker count.
    requested_workers = options.view_as(
        pipeline_options.DirectOptions).direct_num_workers
    if requested_workers == 0:
        self._num_workers = multiprocessing.cpu_count()
    elif requested_workers:
        self._num_workers = requested_workers
    else:
        self._num_workers = self._num_workers

    # Set direct workers running mode if it is defined with pipeline options.
    running_mode = options.view_as(
        pipeline_options.DirectOptions).direct_running_mode
    if running_mode == 'multi_threading':
        self._default_environment = (
            environments.EmbeddedPythonGrpcEnvironment())
    elif running_mode == 'multi_processing':
        command_string = (
            '%s -m apache_beam.runners.worker.sdk_worker_main'
            % sys.executable)
        self._default_environment = environments.SubprocessSDKEnvironment(
            command_string=command_string)

    self._profiler_factory = Profile.factory_from_options(
        options.view_as(pipeline_options.ProfilingOptions))

    pipeline_proto = pipeline.to_runner_api(
        default_environment=self._default_environment)
    self._latest_run_result = self.run_via_runner_api(pipeline_proto)
    return self._latest_run_result
def test_experiments_setup(self):
    """Check that runtime ``experiments`` are exposed as a set.

    Before any runtime options are set, ``feature_1`` must not be among
    ``RuntimeValueProvider.experiments``. After setting a list of two
    experiments, the attribute must be a ``set`` containing both.
    """
    # runtime_options is a static attribute, so it would otherwise affect
    # other test cases. Registering the reset as a cleanup (rather than as a
    # trailing statement) makes it run even when an assertion below fails.
    self.addCleanup(RuntimeValueProvider.set_runtime_options, None)

    self.assertNotIn('feature_1', RuntimeValueProvider.experiments)

    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['feature_1', 'feature_2']})

    self.assertIsInstance(RuntimeValueProvider.experiments, set)
    self.assertIn('feature_1', RuntimeValueProvider.experiments)
    self.assertIn('feature_2', RuntimeValueProvider.experiments)
def test_set_runtime_option(self):
    """Check value-provider options before and after runtime options are set.

    Options not supplied at construction time must be inaccessible until
    ``RuntimeValueProvider.set_runtime_options`` is called. Afterwards they
    resolve to the runtime value, their default, or ``None``. The positional
    option supplied at construction time keeps its construction-time value.
    """

    # Value-provider options, with and without default values.
    class UserDefinedOptions1(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # Set at runtime.
            parser.add_value_provider_argument(
                '--vpt_vp_arg6',
                help='This keyword argument is a value provider')
            # Not set, has an int default; declared with a short form.
            parser.add_value_provider_argument(
                '-v', '--vpt_vp_arg7', default=123, type=int)
            # Not set, has a str default; declared with a dash in the name.
            parser.add_value_provider_argument(
                '--vpt_vp-arg8', default='123', type=str)
            # Not set and no default.
            parser.add_value_provider_argument('--vpt_vp_arg9', type=float)
            # Positional argument, set at construction time; its default
            # and the runtime value are ignored.
            parser.add_value_provider_argument(
                'vpt_vp_arg10',
                help='This positional argument is a value provider',
                type=float,
                default=5.4)

    def check_resolved(provider, expected_value):
        # An accessible provider must yield exactly the expected value.
        self.assertTrue(provider.is_accessible())
        self.assertEqual(provider.get(), expected_value)

    # Values provided at graph-construction time. Options not provided here
    # become of the type RuntimeValueProvider.
    options = UserDefinedOptions1(['1.2'])
    self.assertFalse(options.vpt_vp_arg6.is_accessible())
    self.assertFalse(options.vpt_vp_arg7.is_accessible())
    self.assertFalse(options.vpt_vp_arg8.is_accessible())
    self.assertFalse(options.vpt_vp_arg9.is_accessible())
    self.assertTrue(options.vpt_vp_arg10.is_accessible())

    # Values provided at job-execution time. Options not provided here use
    # their default, if they have one.
    RuntimeValueProvider.set_runtime_options(
        {'vpt_vp_arg6': 'abc', 'vpt_vp_arg10': '3.2'})

    check_resolved(options.vpt_vp_arg6, 'abc')
    check_resolved(options.vpt_vp_arg7, 123)
    check_resolved(options.vpt_vp_arg8, '123')

    self.assertTrue(options.vpt_vp_arg9.is_accessible())
    self.assertIsNone(options.vpt_vp_arg9.get())

    check_resolved(options.vpt_vp_arg10, 1.2)
def run_pipeline(self, pipeline):
    """Run ``pipeline`` with the direct runner and return its result handle.

    The runner's configured PTransform overrides are applied first. The clock
    is a ``TestClock`` when ``self._use_test_clock`` is truthy, a
    ``RealClock`` otherwise. The executor is started without blocking, but
    when ``self._cache`` is truthy the call waits for the pipeline to finish
    and finalizes the cache before returning the ``DirectPipelineResult``.
    """
    # Performing configured PTransform overrides.
    pipeline.replace_all(self._ptransform_overrides)

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')

    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(tracker)

    if self._use_test_clock:
        clock = TestClock()
    else:
        clock = RealClock()

    pipeline_opts = pipeline._options
    bundle_factory = BundleFactory(
        stacked=pipeline_opts.view_as(
            DirectOptions).direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        pipeline_opts,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views,
        clock)
    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(
        tracker.value_to_consumers,
        TransformEvaluatorRegistry(evaluation_context),
        evaluation_context)

    # DirectRunner does not support injecting PipelineOptions values at
    # runtime, so the runtime options are an empty mapping.
    RuntimeValueProvider.set_runtime_options({})

    # Non-blocking: execution proceeds in background threads after this call.
    executor.start(tracker.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if not self._cache:
        return result

    # We are running in eager mode, block until the pipeline execution
    # completes in order to have full results in the cache.
    result.wait_until_finish()
    self._cache.finalize()
    return result
def run_pipeline(self, pipeline, options):
    """Configure this runner from ``options`` and run ``pipeline``.

    Metrics support is switched off and the runtime options are reset to an
    empty mapping. The bundle repeat and profiler factory are derived from
    ``options``. The pipeline is then converted to its runner API form and
    the result of ``run_via_runner_api`` is returned.
    """
    MetricsEnvironment.set_metrics_supported(False)
    RuntimeValueProvider.set_runtime_options({})

    # This is sometimes needed if type checking is disabled
    # to enforce that the inputs (and outputs) of GroupByKey operations
    # are known to be KVs.
    from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
    gbk_visitor = DataflowRunner.group_by_key_input_visitor()
    pipeline.visit(gbk_visitor)

    # A bundle repeat already set on the runner wins over the option.
    if not self._bundle_repeat:
        self._bundle_repeat = options.view_as(
            pipeline_options.DirectOptions).direct_runner_bundle_repeat

    profiling_options = options.view_as(pipeline_options.ProfilingOptions)
    self._profiler_factory = profiler.Profile.factory_from_options(
        profiling_options)

    pipeline_proto = pipeline.to_runner_api(
        default_environment=self._default_environment)
    return self.run_via_runner_api(pipeline_proto)
def run_pipeline(self, pipeline, options):
    """Configure this runner from ``options`` and run ``pipeline``.

    Metrics support is switched off and the runtime options are reset to an
    empty mapping. The bundle repeat and profiler factory are derived from
    ``options``. The pipeline is then converted to its runner API form and
    the result of ``run_via_runner_api`` is returned.
    """
    MetricsEnvironment.set_metrics_supported(False)
    RuntimeValueProvider.set_runtime_options({})

    # This is sometimes needed if type checking is disabled
    # to enforce that the inputs (and outputs) of GroupByKey operations
    # are known to be KVs.
    from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
    gbk_visitor = DataflowRunner.group_by_key_input_visitor()
    pipeline.visit(gbk_visitor)

    # A bundle repeat already set on the runner wins over the option.
    if not self._bundle_repeat:
        self._bundle_repeat = options.view_as(
            pipeline_options.DirectOptions).direct_runner_bundle_repeat

    profiling_options = options.view_as(pipeline_options.ProfilingOptions)
    self._profiler_factory = profiler.Profile.factory_from_options(
        profiling_options)

    pipeline_proto = pipeline.to_runner_api(
        default_environment=self._default_environment)
    return self.run_via_runner_api(pipeline_proto)
def test_get_destination_uri_fallback_temp_location(self):
    """Check the debug log emitted when falling back to the temp location.

    With empty runtime options and ``gs://bucket`` passed as the temp
    location, exactly one DEBUG record stating that ``gcs_location`` is
    empty must be logged by the ``bigquery_read_internal`` logger.
    """
    # Don't provide any runtime values.
    RuntimeValueProvider.set_runtime_options({})
    user_options = self.UserDefinedOptions()

    logger_name = 'apache_beam.io.gcp.bigquery_read_internal'
    with self.assertLogs(logger_name, level='DEBUG') as context:
        bigquery_export_destination_uri(
            user_options.gcs_location, 'gs://bucket', uuid.uuid4().hex)

    expected_records = [
        'DEBUG:apache_beam.io.gcp.bigquery_read_internal:gcs_location is '
        'empty, using temp_location instead'
    ]
    self.assertEqual(context.output, expected_records)
def test_bytes_read_are_reported(self):
    """Check that bytes read from side-input sources reach the read counter.

    With the ``sideinput_io_metrics_v2`` experiment enabled through the
    runtime options, iterating over a single observer-notifying
    ``FakeSource`` must yield its records and end with the mock counter's
    ``add_bytes_read`` having been called with ``4``.
    """
    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['sideinput_io_metrics_v2', 'other']})

    read_counter = mock.MagicMock()
    records = ['a', 'b', 'c', 'd']
    fake_sources = [FakeSource(records, notify_observers=True)]

    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        fake_sources, max_reader_threads=3, read_counter=read_counter)
    values_read = list(strip_windows(iterator_fn()))

    assert values_read == records
    read_counter.add_bytes_read.assert_called_with(4)

    # Remove runtime options from the runtime value provider.
    RuntimeValueProvider.set_runtime_options({})
def test_bytes_read_are_reported(self):
    """Check that bytes read from side-input sources reach the read counter.

    With ``experiments`` set to the comma-separated string
    ``sideinput_io_metrics,other`` through the runtime options, iterating
    over a single observer-notifying ``FakeSource`` must yield its records
    and end with the mock counter's ``add_bytes_read`` having been called
    with ``4``.
    """
    RuntimeValueProvider.set_runtime_options(
        {'experiments': 'sideinput_io_metrics,other'})

    read_counter = mock.MagicMock()
    records = ['a', 'b', 'c', 'd']
    fake_sources = [FakeSource(records, notify_observers=True)]

    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        fake_sources, max_reader_threads=3, read_counter=read_counter)
    values_read = list(strip_windows(iterator_fn()))

    assert values_read == records
    read_counter.add_bytes_read.assert_called_with(4)

    # Remove runtime options from the runtime value provider.
    RuntimeValueProvider.set_runtime_options({})
def test_nested_value_provider_wrap_runtime(self):
    """Check a ``NestedValueProvider`` wrapping a runtime-supplied option.

    The nested provider must be inaccessible until the runtime options are
    set, and must then return the wrapped value passed through the
    translator (here, the value concatenated with itself).
    """

    class UserDefinedOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # Set at runtime.
            parser.add_value_provider_argument(
                '--vpt_vp_arg15',
                help='This keyword argument is a value provider')

    def _double(x):
        return x + x

    options = UserDefinedOptions([])
    nested = NestedValueProvider(options.vpt_vp_arg15, _double)
    self.assertFalse(nested.is_accessible())

    RuntimeValueProvider.set_runtime_options({'vpt_vp_arg15': 'abc'})

    self.assertTrue(nested.is_accessible())
    self.assertEqual(nested.get(), 'abcabc')
def test_runtime_serialized_file_list_is_deserialized_and_processed_by_insertion_order(
        cloudstorage):
    """Check ``FileListIteratorTransform`` with runtime-provided arguments.

    Three blobs are uploaded to the CDC imports bucket. Only the two whose
    names start with ``vibe-tree-user-statuses-final`` are expected in the
    output, ordered by the numeric suffix extracted by the serialized sort
    key.

    The expected URIs are derived from ``bucket`` rather than a fixed bucket
    name, so the test does not depend on one particular project id. The
    runtime options are static state on ``RuntimeValueProvider``; they are
    reset in a ``finally`` clause so the reset happens after the pipeline has
    run, and still happens when the pipeline or its assertion fails.
    """
    try:
        with TestPipeline() as p:
            bucket = f'{project_id}-cdc-imports'

            # Update sort_key based on the filename format: sort by the
            # numeric text after the last '-' when there is one.
            def _sort_key(f):
                delimiter = '-'
                ts = f[f.rfind(delimiter) + 1:]
                return int(ts) if ts.isdigit() else f

            # The transform receives the sort key as hex-encoded dill bytes.
            _sort_key = bytes.hex(dill.dumps(_sort_key))

            runtime_env = RuntimeValueProvider(
                option_name='env', value_type=str, default_value='local')
            runtime_bucket = RuntimeValueProvider(
                option_name='bucket', value_type=str, default_value=bucket)
            runtime_startswith = RuntimeValueProvider(
                option_name='files_startwith',
                value_type=str,
                default_value='vibe-tree-user-statuses-final')
            runtime_sort_key = RuntimeValueProvider(
                option_name='sort_key',
                value_type=str,
                default_value=_sort_key)

            # Start from an empty bucket.
            for blob in cloudstorage.client.list_blobs(bucket):
                blob.delete()

            file_paths = [
                'vibe-tree-user-statuses-final-0083c-1987582612499',
                'vibe-tree-user-statuses-final-003c-1587582612405',
                'vibe-order-items-final-0030dd8697-1588231505823'
            ]
            expected_output = [
                f'gs://{bucket}/'
                'vibe-tree-user-statuses-final-003c-1587582612405',
                f'gs://{bucket}/'
                'vibe-tree-user-statuses-final-0083c-1987582612499'
            ]

            # Each blob's content is simply its own name.
            for f in file_paths:
                cloudstorage.client.upload_blob_from_string(bucket, f, f)

            p_paths = p | FileListIteratorTransform(
                env=runtime_env,
                bucket=runtime_bucket,
                files_startwith=runtime_startswith,
                sort_key=runtime_sort_key)
            assert_that(p_paths, equal_to(expected_output))
    finally:
        # Do not leak runtime options into other tests.
        RuntimeValueProvider.set_runtime_options(None)
def test_string_or_value_provider_only(self):
    """Check which file-pattern types ``FileBasedSource`` accepts.

    A plain string is wrapped so that ``_pattern.value`` equals it. A
    ``StaticValueProvider`` and a ``RuntimeValueProvider`` are stored as
    given. Any other type (here an int) raises ``TypeError``.

    The temporary file is closed as soon as its name is known and removed
    when the test ends, so neither an open handle nor the file is leaked.
    """
    import os  # Local import: only needed for the temp-file cleanup.

    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        str_file_pattern = temp_file.name
    self.addCleanup(os.remove, str_file_pattern)

    self.assertEqual(
        str_file_pattern, FileBasedSource(str_file_pattern)._pattern.value)

    static_vp_file_pattern = StaticValueProvider(
        value_type=str, value=str_file_pattern)
    self.assertEqual(
        static_vp_file_pattern,
        FileBasedSource(static_vp_file_pattern)._pattern)

    runtime_vp_file_pattern = RuntimeValueProvider(
        option_name='arg', value_type=str, default_value=str_file_pattern)
    self.assertEqual(
        runtime_vp_file_pattern,
        FileBasedSource(runtime_vp_file_pattern)._pattern)
    # Reset runtime options to avoid side-effects in other tests.
    RuntimeValueProvider.set_runtime_options(None)

    invalid_file_pattern = 123
    with self.assertRaises(TypeError):
        FileBasedSource(invalid_file_pattern)
def ndjson(env, cloudstorage, record_testsuite_property):
    """Run the export pipeline and yield the decoded rows it wrote.

    The destination blob is deleted first and asserted absent. A three-row
    UNION ALL query (typed literals, all NULLs, a second row of literals) is
    then run through ``Runner._run``. The resulting blob is asserted to
    exist, downloaded, gunzipped and parsed as newline-delimited JSON. The
    generator yields a list with one parsed object per line.
    """
    gcs = cloudstorage.client

    # Make sure a stale export cannot satisfy the existence check below.
    gcs.delete_blob(bucket, dest_blob_name)
    assert gcs.blob_exists(bucket, dest_blob_name) is False

    typed_row = BigQuery.querybuilder(
        select=[
            ('NULL', 'none'),
            ('True', 'true_bool'),
            ('False', 'false_bool'),
            ('"2020-04-03"', 'date'),
            ('"2020-04-03 13:45:00"', 'datetime'),
            ('"1966-06-06 06:06:06.666666 UTC"', 'timestamp'),
            ('"STRING"', 'string'),
            ('234', 'integer'),
            ('123.54', 'float'),
        ])
    null_row = BigQuery.querybuilder(select=['NULL'] * 9)
    literal_row = BigQuery.querybuilder(select=[
        '"String"', 'False', 'True', '"1993-09-03"', '"1993-09-03 03:44:00"',
        '"1993-09-03 03:44:00.777555 UTC"', '"Not String"', '567', '456'
    ])
    sql = BigQuery.querybuilder(
        union=('all', [typed_row, null_row, literal_row]))

    RuntimeValueProvider.set_runtime_options(None)
    options = RuntimeOptions([
        '--env', env['env'],
        '--query', str(sql),
        '--destination', f'gs://{bucket}/{blob_name}',
    ])
    Runner._run(TestPipeline(options=options), options)

    assert gcs.blob_exists(bucket, dest_blob_name) is True

    compressed = gcs.download_blob_as_string(bucket, dest_blob_name)
    raw = gzip.decompress(compressed)
    text_lines = raw.decode('utf8').rstrip().split('\n')
    yield [json.loads(text_line) for text_line in text_lines]
def test_runtime_value_provider_to(self):
    """Check the JSON form of a ``RuntimeValueProvider`` with no runtime options.

    With the runtime options reset to ``None``, ``to_json_value`` must turn
    the provider into a null ``JsonValue``.
    """
    RuntimeValueProvider.set_runtime_options(None)
    provider = RuntimeValueProvider('arg', 123, int)

    null_json = JsonValue(is_null=True)
    converted = to_json_value(provider)
    self.assertEqual(null_json, converted)

    # Reset runtime options to avoid side-effects in other tests.
    RuntimeValueProvider.set_runtime_options(None)
def tearDown(self):
    """Reset the runtime options so this test cannot affect later ones."""
    RuntimeValueProvider.set_runtime_options(None)
def setUp(self):
    """Reset the runtime options left behind by other tests.

    The is_accessible assertions require runtime_options to be
    uninitialized.
    """
    RuntimeValueProvider.set_runtime_options(None)
def main(unused_argv):
    """Main entry point for SDK Fn Harness.

    Configuration is read from environment variables:
    ``LOGGING_API_SERVICE_DESCRIPTOR`` (optional, enables forwarding logs to
    the runner), ``PIPELINE_OPTIONS`` (optional), ``SEMI_PERSISTENT_DIRECTORY``
    (optional), ``WORKER_ID`` (optional) and ``CONTROL_API_SERVICE_DESCRIPTOR``
    (read unconditionally). A status HTTP server is started on a daemon
    thread, the main session is loaded on a best-effort basis, and an
    ``SdkHarness`` is run. Any failure while running the harness is logged
    and re-raised; the log handler, if one was created, is always closed.
    """
    fn_log_handler = None
    if 'LOGGING_API_SERVICE_DESCRIPTOR' in os.environ:
        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
            text_format.Merge(
                os.environ['LOGGING_API_SERVICE_DESCRIPTOR'],
                logging_service_descriptor)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            # TODO(BEAM-5468): This should be picked up from pipeline options.
            root_logger = logging.getLogger()
            root_logger.setLevel(logging.DEBUG)
            root_logger.addHandler(fn_log_handler)
            logging.info('Logging handler created.')
        except Exception:
            logging.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            # The handler may have been created before the failure; drop it.
            fn_log_handler = None

    # Start status HTTP server thread.
    thread = threading.Thread(
        name='status_http_server', target=StatusServer().start)
    thread.daemon = True
    thread.setName('status-server-demon')
    thread.start()

    if 'PIPELINE_OPTIONS' in os.environ:
        sdk_pipeline_options = _parse_pipeline_options(
            os.environ['PIPELINE_OPTIONS'])
    else:
        sdk_pipeline_options = PipelineOptions.from_dictionary({})

    semi_persistent_directory = os.environ.get('SEMI_PERSISTENT_DIRECTORY')
    logging.info('semi_persistent_directory: %s', semi_persistent_directory)
    _worker_id = os.environ.get('WORKER_ID')

    # A main session that fails to load is logged but does not stop startup.
    try:
        _load_main_session(semi_persistent_directory)
    except Exception:  # pylint: disable=broad-except
        logging.error(
            'Could not load main session: %s',
            traceback.format_exc(),
            exc_info=True)

    try:
        logging.info(
            'Python sdk harness started with pipeline_options: %s',
            sdk_pipeline_options.get_all_options(drop_default=True))
        hadoop_options = sdk_pipeline_options.view_as(
            pipeline_options.HadoopFileSystemOptions)
        RuntimeValueProvider.set_runtime_options(
            hadoop_options.get_all_options())

        service_descriptor = endpoints_pb2.ApiServiceDescriptor()
        text_format.Merge(
            os.environ['CONTROL_API_SERVICE_DESCRIPTOR'], service_descriptor)
        # TODO(robertwb): Support credentials.
        assert not service_descriptor.oauth2_client_credentials_grant.url

        harness = SdkHarness(
            control_address=service_descriptor.url,
            worker_count=_get_worker_count(sdk_pipeline_options),
            worker_id=_worker_id,
            profiler_factory=profiler.Profile.factory_from_options(
                sdk_pipeline_options.view_as(
                    pipeline_options.ProfilingOptions)))
        harness.run()
        logging.info('Python sdk harness exiting.')
    except:  # pylint: disable=broad-except
        logging.exception('Python sdk harness failed: ')
        raise
    finally:
        if fn_log_handler:
            fn_log_handler.close()
def setUp(self):
    """Reset the runtime options before each test.

    The is_accessible assertions require them to be uninitialized.
    """
    RuntimeValueProvider.set_runtime_options(None)
def create_harness(environment, dry_run=False):
    """Creates SDK Fn Harness.

    Args:
      environment: mapping of configuration values. The keys read are
        ``LOGGING_API_SERVICE_DESCRIPTOR``, ``PIPELINE_OPTIONS``,
        ``SEMI_PERSISTENT_DIRECTORY``, ``WORKER_ID`` and
        ``STATUS_API_SERVICE_DESCRIPTOR`` (all optional) and
        ``CONTROL_API_SERVICE_DESCRIPTOR`` (read unconditionally).
      dry_run: when true, all setup is performed but no ``SdkHarness`` is
        constructed and ``None`` is returned.

    Returns:
      A ``(fn_log_handler, sdk_harness)`` tuple, where ``fn_log_handler`` is
      ``None`` if log forwarding was not configured or failed to set up.
      ``None`` when ``dry_run`` is true.

    Raises:
      CorruptMainSessionException: re-raised after logging when loading the
        main session fails with it. Other main-session failures are logged
        only.
    """
    fn_log_handler = None
    if 'LOGGING_API_SERVICE_DESCRIPTOR' in environment:
        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
            text_format.Merge(
                environment['LOGGING_API_SERVICE_DESCRIPTOR'],
                logging_service_descriptor)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            # TODO(BEAM-5468): This should be picked up from pipeline options.
            root_logger = logging.getLogger()
            root_logger.setLevel(logging.INFO)
            root_logger.addHandler(fn_log_handler)
            _LOGGER.info('Logging handler created.')
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            # The handler may have been created before the failure; drop it.
            fn_log_handler = None

    pipeline_options_dict = _load_pipeline_options(
        environment.get('PIPELINE_OPTIONS'))
    # These are used for dataflow templates.
    RuntimeValueProvider.set_runtime_options(pipeline_options_dict)
    sdk_pipeline_options = PipelineOptions.from_dictionary(
        pipeline_options_dict)
    filesystems.FileSystems.set_options(sdk_pipeline_options)

    semi_persistent_directory = environment.get('SEMI_PERSISTENT_DIRECTORY')
    _LOGGER.info('semi_persistent_directory: %s', semi_persistent_directory)
    _worker_id = environment.get('WORKER_ID')

    try:
        _load_main_session(semi_persistent_directory)
    except CorruptMainSessionException:
        # A corrupt main session is fatal: log it and propagate.
        _LOGGER.error(
            'Could not load main session: %s',
            traceback.format_exc(),
            exc_info=True)
        raise
    except Exception:  # pylint: disable=broad-except
        # Any other failure is logged and startup continues.
        _LOGGER.error(
            'Could not load main session: %s',
            traceback.format_exc(),
            exc_info=True)

    _LOGGER.info(
        'Pipeline_options: %s',
        sdk_pipeline_options.get_all_options(drop_default=True))

    control_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(
        environment['CONTROL_API_SERVICE_DESCRIPTOR'],
        control_service_descriptor)
    if 'STATUS_API_SERVICE_DESCRIPTOR' in environment:
        text_format.Merge(
            environment['STATUS_API_SERVICE_DESCRIPTOR'],
            status_service_descriptor)
    # TODO(robertwb): Support authentication.
    assert not control_service_descriptor.HasField('authentication')

    experiments = sdk_pipeline_options.view_as(DebugOptions).experiments
    if not experiments:
        experiments = []
    enable_heap_dump = 'enable_heap_dump' in experiments

    if dry_run:
        return

    profiler_factory = profiler.Profile.factory_from_options(
        sdk_pipeline_options.view_as(ProfilingOptions))
    sdk_harness = SdkHarness(
        control_address=control_service_descriptor.url,
        status_address=status_service_descriptor.url,
        worker_id=_worker_id,
        state_cache_size=_get_state_cache_size(experiments),
        data_buffer_time_limit_ms=_get_data_buffer_time_limit_ms(experiments),
        profiler_factory=profiler_factory,
        enable_heap_dump=enable_heap_dump)
    return fn_log_handler, sdk_harness