def run_pipeline(
    image_tag, direct_runner, update, klio_config, config_meta, blocking
):
    """Build a ``KlioPipeline`` from ``klio_config`` and run it.

    ``update`` and ``blocking`` fall back to the values stored on
    ``klio_config`` when they are ``None`` (not explicitly set in the CLI).
    A warning is logged when the runtime/buildtime config comparison
    returns ``False``.
    """
    # Warn if the runtime config file is not the same as the buildtime
    # config file used for the Docker image.
    config_matches = _compare_runtime_to_buildtime_config(klio_config)
    if config_matches is False:
        warning_template = (
            "The Klio config file '{}' at runtime differs from the config "
            "file used when building this Docker image. If this is unexpected "
            "behavior, please double check your runtime config, or rebuild "
            "your Docker image with the correct config file."
        )
        logging.warning(warning_template.format(config_meta.config_path))

    # RunConfig ensures config is pickled and sent to worker. Note this
    # depends on save_main_session being True
    klio_transforms_core.RunConfig.set(klio_config)

    # This can only be imported after RunConfig is set since it will end up
    # importing classes that may (or do) attempt to read it
    from klio_exec.commands import run

    # Values not explicitly set in the CLI are looked up in the config.
    if update is None:
        update = klio_config.pipeline_options.update
    if blocking is None:
        blocking = klio_config.job_config.blocking

    runtime_settings = RuntimeConfig(
        image_tag, direct_runner, update, blocking
    )
    pipeline = run.KlioPipeline(
        klio_config.job_name, klio_config, runtime_settings
    )
    pipeline.run()
def test_verify_packaging(exp, setup_file, requirements_file, streaming, mocker):
    """Check ``_verify_packaging`` exits or passes, and when it writes config.

    A batch job with no requirements file, setup file or experiment is
    expected to raise ``SystemExit``; every other combination should pass.
    The run-effective config is expected to be written only when a setup
    file is given without an experiment.
    """
    pipeline_opts = mocker.Mock()
    pipeline_opts.experiments = [exp]
    pipeline_opts.setup_file = setup_file
    pipeline_opts.requirements_file = requirements_file
    pipeline_opts.streaming = streaming

    mock_config = mocker.Mock()
    mock_config.pipeline_options = pipeline_opts

    # Pretend every packaging file exists on disk.
    mocker.patch.object(os.path, "exists", return_value=True)
    write_effective_config = mocker.patch.object(
        run.KlioPipeline, "_write_run_effective_config"
    )

    kpipe = run.KlioPipeline("test-job", mock_config, mocker.Mock())

    has_packaging = any([requirements_file, setup_file, exp])
    if streaming or has_packaging:
        kpipe._verify_packaging()
    else:
        with pytest.raises(SystemExit):
            kpipe._verify_packaging()

    if setup_file and not exp:
        write_effective_config.assert_called_once()
    else:
        write_effective_config.assert_not_called()
def run_pipeline(image_tag, direct_runner, update, config_file, blocking):
    """Load the job config from disk, then build and run a ``KlioPipeline``.

    ``config_file`` defaults to ``"klio-job.yaml"`` when falsy. ``update``
    and ``blocking`` fall back to the loaded config when they are ``None``.
    When ``direct_runner`` is truthy the loaded config's runner is
    overridden with ``"direct"``.
    """
    config_path = config_file if config_file else "klio-job.yaml"
    raw_config = _get_config(config_path)

    # Warn if the runtime config file is not the same as the buildtime
    # config file. Do this after _get_config since that will prompt the
    # user if their config file doesn't even exist first.
    configs_match = _compare_runtime_to_buildtime_config(config_path)
    if configs_match is False:
        warning_template = (
            "The Klio config file '{}' at runtime differs from the config "
            "file used when building this Docker image. If this is unexpected "
            "behavior, please double check your runtime config, or rebuild "
            "your Docker image with the correct config file."
        )
        logging.warning(warning_template.format(config_path))

    if direct_runner:
        raw_config["pipeline_options"]["runner"] = "direct"

    # Read the job name from the raw data before building the config object.
    job_name = raw_config["job_name"]
    klio_conf = config.KlioConfig(raw_config)

    # Values not explicitly set in the CLI are looked up in the config.
    if update is None:
        update = klio_conf.pipeline_options.update
    if blocking is None:
        blocking = klio_conf.job_config.blocking

    runtime_settings = RuntimeConfig(
        image_tag, direct_runner, update, blocking
    )
    pipeline = run.KlioPipeline(job_name, klio_conf, runtime_settings)
    pipeline.run()
def test_set_setup_options(
    all_options, config, setup_file, reqs_file, mocker, monkeypatch
):
    """Check ``_set_setup_options`` resolves packaging files to absolute paths.

    When a setup/requirements file is supplied, the corresponding setup
    option should be that file joined onto the current working directory;
    otherwise the option should remain ``None``.
    """
    cwd = os.path.abspath(".")

    if reqs_file:
        all_options["requirements_file"] = reqs_file
    if setup_file:
        all_options["setup_file"] = setup_file

    options = pipeline_options.PipelineOptions().from_dictionary(all_options)
    setup_view = options.view_as(pipeline_options.SetupOptions)

    kpipe = run.KlioPipeline("test-job", config, mocker.Mock())
    kpipe._set_setup_options(options)

    if not setup_file:
        assert setup_view.setup_file is None
    else:
        assert os.path.join(cwd, setup_file) == setup_view.setup_file

    if not reqs_file:
        assert setup_view.requirements_file is None
    else:
        assert os.path.join(cwd, reqs_file) == setup_view.requirements_file
def test_verify_packaging_for_batch_raises(
    mocker,
    caplog,
    has_setup_file,
    setup_file_exists,
    has_reqs_file,
    reqs_file_exists,
):
    """Check a batch job with unusable packaging exits and logs one record."""
    pipeline_opts = mocker.Mock()
    pipeline_opts.streaming = False
    pipeline_opts.experiments = []
    # NOTE(review): when a flag is falsy the attribute is left as an
    # auto-created Mock rather than None - confirm that is intended.
    if has_setup_file:
        pipeline_opts.setup_file = "setup.py"
    if has_reqs_file:
        pipeline_opts.requirements_file = "requirements.txt"

    mock_config = mocker.Mock()
    mock_config.pipeline_options = pipeline_opts

    # Successive os.path.exists calls answer for setup file, then reqs file.
    exists_patch = mocker.patch.object(os.path, "exists")
    exists_patch.side_effect = [setup_file_exists, reqs_file_exists]

    kpipe = run.KlioPipeline("test-job", mock_config, mocker.Mock())

    with pytest.raises(SystemExit):
        kpipe._verify_packaging()

    assert 1 == len(caplog.records)
def test_get_pipeline_options(
    has_none_values,
    config,
    options,
    worker_options,
    gcp_options,
    mocker,
    monkeypatch,
):
    """Check ``_get_pipeline_options`` delegates to every option setter.

    All of the ``KlioPipeline`` helper methods are replaced with mocks, and
    ``PipelineOptions`` is replaced with a factory returning a mock. The
    test then asserts that the config parser is called once with no
    arguments and that each ``_set_*_options`` helper is called once with
    the object returned by ``PipelineOptions().from_dictionary``.
    """
    mock_parse_config_pipeline_opts = mocker.Mock()
    mock_set_gcp_opts = mocker.Mock()
    mock_set_worker_opts = mocker.Mock()
    mock_set_std_opts = mocker.Mock()
    mock_set_debug_opts = mocker.Mock()
    mock_set_setup_opts = mocker.Mock()
    mock_opts = mocker.Mock()

    monkeypatch.setattr(
        run.KlioPipeline,
        "_parse_config_pipeline_options",
        mock_parse_config_pipeline_opts,
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_set_google_cloud_options", mock_set_gcp_opts
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_set_worker_options", mock_set_worker_opts
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_set_standard_options", mock_set_std_opts
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_set_debug_options", mock_set_debug_opts
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_set_setup_options", mock_set_setup_opts
    )
    monkeypatch.setattr(
        run.pipeline_options, "PipelineOptions", lambda: mock_opts
    )

    # The object every setter is expected to receive.
    mock_opts_from_dict = mock_opts.from_dictionary.return_value
    mock_runtime_conf = mocker.Mock()

    kpipe = run.KlioPipeline("test-job", config, mock_runtime_conf)
    kpipe._get_pipeline_options()

    mock_parse_config_pipeline_opts.assert_called_once_with()
    mock_set_gcp_opts.assert_called_once_with(mock_opts_from_dict)
    mock_set_worker_opts.assert_called_once_with(mock_opts_from_dict)
    mock_set_std_opts.assert_called_once_with(mock_opts_from_dict)
    mock_set_debug_opts.assert_called_once_with(mock_opts_from_dict)
    mock_set_setup_opts.assert_called_once_with(mock_opts_from_dict)
def test_verify_packaging_with_both_packagaing_systems_raises(mocker):
    """Check that combining ``beam_fn_api`` with a setup file exits."""
    pipeline_opts = mocker.Mock()
    pipeline_opts.experiments = ["beam_fn_api"]
    pipeline_opts.setup_file = "setup.py"

    mock_config = mocker.Mock()
    mock_config.pipeline_options = pipeline_opts

    kpipe = run.KlioPipeline("test-job", mock_config, mocker.Mock())

    with pytest.raises(SystemExit):
        kpipe._verify_packaging()
def test_verify_packaging(exp, setup_file, requirements_file, mocker):
    """Smoke test: ``_verify_packaging`` runs without raising."""
    pipeline_opts = mocker.Mock()
    pipeline_opts.experiments = [exp]
    pipeline_opts.setup_file = setup_file
    pipeline_opts.requirements_file = requirements_file

    mock_config = mocker.Mock()
    mock_config.pipeline_options = pipeline_opts

    runtime_conf = mocker.Mock()
    kpipe = run.KlioPipeline("test-job", mock_config, runtime_conf)
    kpipe._verify_packaging()
def test_run_pipeline_raises(
    update, value_err_msg, config, mocker, monkeypatch, caplog
):
    """Check ``KlioPipeline.run`` exits when the Beam pipeline run fails.

    The mocked ``beam.Pipeline().run`` raises ``ValueError(value_err_msg)``.
    The test expects ``SystemExit``, a single ``run`` call, and exactly one
    log record at ERROR level.
    """
    job_name = "my-job"
    mock_runtime_config = mocker.Mock(update=update)
    mock_verify_packaging = mocker.Mock()
    mock_get_run_callable = mocker.Mock()
    # Transforms are chained with ``|``, so each mocked return value is
    # given an ``__or__`` attribute.
    mock_run_callable = mocker.Mock()
    mock_run_callable.return_value.__or__ = mocker.Mock()
    mock_get_run_callable.return_value = mock_run_callable
    mock_get_pipeline_options = mocker.Mock()
    mock_pipeline = mocker.Mock()
    mock_pipeline.return_value.__or__ = mocker.Mock()
    mock_read_from_pubsub = mocker.Mock()
    mock_read_from_pubsub.return_value.__or__ = mocker.Mock()
    mock_read_from_file = mocker.Mock()
    mock_read_from_file.return_value.__or__ = mocker.Mock()
    mock_write_to_file = mocker.Mock()
    mock_write_to_file.return_value.__or__ = mocker.Mock()
    mock_write_to_pubsub = mocker.Mock()
    mock_write_to_pubsub.return_value.__or__ = mocker.Mock()

    # Replace the KlioPipeline helpers that run() relies on.
    monkeypatch.setattr(
        run.KlioPipeline, "_verify_packaging", mock_verify_packaging
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_get_run_callable", mock_get_run_callable
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_get_pipeline_options", mock_get_pipeline_options
    )
    # Replace the Beam pipeline and the IO transforms.
    monkeypatch.setattr(run.beam, "Pipeline", mock_pipeline)
    monkeypatch.setattr(run.beam.io, "ReadFromPubSub", mock_read_from_pubsub)
    monkeypatch.setattr(
        run.transforms, "KlioReadFromText", mock_read_from_file
    )
    monkeypatch.setattr(run.transforms, "KlioWriteToText", mock_write_to_file)
    monkeypatch.setattr(run.beam.io, "WriteToPubSub", mock_write_to_pubsub)
    # Point the batch event mapper's "file" entries at the mocked transforms.
    monkeypatch.setattr(
        run.BatchEventMapper, "input", {"file": mock_read_from_file},
    )
    monkeypatch.setattr(
        run.BatchEventMapper, "output", {"file": mock_write_to_file},
    )

    # Batch mode, with the pipeline run failing.
    config.pipeline_options.streaming = False
    mock_pipeline.return_value.run.side_effect = ValueError(value_err_msg)

    kpipe = run.KlioPipeline(job_name, config, mock_runtime_config)

    with pytest.raises(SystemExit):
        kpipe.run()

    mock_pipeline.return_value.run.assert_called_once_with()
    assert 1 == len(caplog.records)
    assert "ERROR" == caplog.records[0].levelname
def test_set_worker_options(
    pipeline_options_from_dict,
    worker_options,
    config,
    fn_api_enabled,
    mocker,
    monkeypatch,
):
    """Check ``_set_worker_options`` copies worker settings onto the options.

    With ``fn_api_enabled`` the ``beam_fn_api`` experiment is switched on and
    the worker harness image is expected to carry the runtime image tag.
    """
    option_names = (
        "subnetwork",
        "disk_size_gb",
        "autoscaling_algorithm",
        "num_workers",
        "max_num_workers",
        "use_public_ips",
        "min_cpu_platform",
        "dataflow_worker_jar",
    )

    if fn_api_enabled:
        monkeypatch.setattr(
            config.pipeline_options, "experiments", ["beam_fn_api"]
        )

    runtime_conf = mocker.Mock(image_tag="foo")
    kpipe = run.KlioPipeline("test-job", config, runtime_conf)
    kpipe._set_worker_options(pipeline_options_from_dict)

    worker_view = pipeline_options_from_dict.view_as(
        pipeline_options.WorkerOptions
    )

    for name in option_names:
        # The True/False values in worker opts represent flags in the options.
        # These values are set to None when you set the PipelineOptions from
        # a dictionary. Since beam uses argparse to set values, None represents
        # False for these flags.
        # https://docs.python.org/2/howto/argparse.html
        # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py#L723
        expected_value = worker_options[name] or None
        # getattr should explode when not setting a default value
        assert expected_value == getattr(worker_view, name)

    assert worker_options["worker_machine_type"] == worker_view.machine_type
    assert worker_options["worker_disk_type"] == worker_view.disk_type

    if not fn_api_enabled:
        assert (
            "gcr.io/example/image"
            == worker_view.worker_harness_container_image
        )
    else:
        assert (
            "gcr.io/example/image:foo"
            == worker_view.worker_harness_container_image
        )
def test_verify_packaging_for_batch_warns(mocker, caplog):
    """Check a batch job using ``beam_fn_api`` alone logs one WARNING."""
    pipeline_opts = mocker.Mock()
    pipeline_opts.streaming = False
    pipeline_opts.experiments = ["beam_fn_api"]
    pipeline_opts.setup_file = None
    pipeline_opts.requirements_file = None

    mock_config = mocker.Mock()
    mock_config.pipeline_options = pipeline_opts

    kpipe = run.KlioPipeline("test-job", mock_config, mocker.Mock())
    kpipe._verify_packaging()

    records = caplog.records
    assert 1 == len(records)
    assert "WARNING" == records[0].levelname
def test_set_standard_options(all_options, config, direct_runner, mocker, monkeypatch):
    """Check ``_set_standard_options`` sets streaming and picks the runner.

    The runner should be ``"direct"`` when the runtime config asks for the
    direct runner, and ``"dataflow"`` otherwise.
    """
    all_options["runner"] = "dataflow"
    options = pipeline_options.PipelineOptions().from_dictionary(all_options)
    std_view = options.view_as(pipeline_options.StandardOptions)

    runtime_conf = mocker.Mock(direct_runner=direct_runner)
    kpipe = run.KlioPipeline("test-job", config, runtime_conf)
    kpipe._set_standard_options(options)

    assert std_view.streaming is True

    if direct_runner:
        expected_runner = "direct"
    else:
        expected_runner = "dataflow"
    assert expected_runner == std_view.runner
def test_verify_packaging(exp, setup_file, requirements_file, streaming, mocker):
    """Smoke test: ``_verify_packaging`` passes when all files exist."""
    pipeline_opts = mocker.Mock()
    pipeline_opts.experiments = [exp]
    pipeline_opts.setup_file = setup_file
    pipeline_opts.requirements_file = requirements_file
    pipeline_opts.streaming = streaming

    mock_config = mocker.Mock()
    mock_config.pipeline_options = pipeline_opts

    # Pretend every packaging file exists on disk.
    mocker.patch.object(os.path, "exists", return_value=True)

    kpipe = run.KlioPipeline("test-job", mock_config, mocker.Mock())
    kpipe._verify_packaging()
def test_parse_config_pipeline_options(setup_file, reqs_file, all_options, config, mocker, monkeypatch):
    """Check ``_parse_config_pipeline_options`` against ``all_options``.

    When a setup file and/or requirements file is configured, the expected
    result gains that entry and no longer contains
    ``worker_harness_container_image``. The ``none_param`` entry is never
    expected in the parsed result.
    """
    as_dict_ret = config.pipeline_options.as_dict.return_value

    if setup_file:
        monkeypatch.setitem(as_dict_ret, "setup_file", setup_file)
        all_options["setup_file"] = setup_file
    if reqs_file:
        monkeypatch.setitem(as_dict_ret, "requirements_file", reqs_file)
        all_options["requirements_file"] = reqs_file

    # Pop exactly once: popping in each branch above raised KeyError when
    # both a setup file and a requirements file were supplied.
    if setup_file or reqs_file:
        all_options.pop("worker_harness_container_image")

    kpipe = run.KlioPipeline("test-job", config, mocker.Mock())
    actual = kpipe._parse_config_pipeline_options()

    all_options.pop("none_param")
    assert all_options == actual
def test_write_run_effective_config(mocker, direct_runner):
    """Check where ``_write_run_effective_config`` writes the config.

    The file is expected under ``/usr/local`` for the direct runner and
    under ``/usr/src/app`` otherwise, opened for writing and handed to
    ``write_to_file`` on the config.
    """
    expected_path = (
        "/usr/local/klio-job-run-effective.yaml"
        if direct_runner
        else "/usr/src/app/klio-job-run-effective.yaml"
    )

    mock_config = mocker.Mock()
    mock_runtime_conf = mocker.Mock()
    mock_runtime_conf.direct_runner = direct_runner

    # Intercept the ``open`` name as looked up inside the run module.
    open_stub = mocker.mock_open()
    patched_open = mocker.patch("klio_exec.commands.run.open", open_stub)

    kpipe = run.KlioPipeline("test-job", mock_config, mock_runtime_conf)
    kpipe._write_run_effective_config()

    patched_open.assert_called_once_with(expected_path, "w")
    mock_config.write_to_file.assert_called_once_with(
        patched_open.return_value
    )
def test_get_run_callable_raises(mocker, monkeypatch, caplog, exception):
    """Check ``_get_run_callable`` exits and logs once on failure.

    Failure is either the module loader raising ``exception`` or, when no
    exception is given, a loaded module whose ``run`` and ``run_basic``
    are both ``None``.
    """
    load_source_stub = mocker.Mock()
    if exception:
        load_source_stub.side_effect = exception
    else:
        loaded_module = load_source_stub.return_value
        loaded_module.run = None
        loaded_module.run_basic = None

    monkeypatch.setattr(run.imp, "load_source", load_source_stub)

    kpipe = run.KlioPipeline("my-job", mocker.Mock(), mocker.Mock())

    with pytest.raises(SystemExit):
        kpipe._get_run_callable()

    assert 1 == len(caplog.records)
def test_get_run_callable(monkeypatch, mocker, run_callable):
    """Check ``_get_run_callable`` returns whichever entry point is defined.

    The loaded module exposes the expected callable as ``run`` when
    ``run_callable == "run"`` and as ``run_basic`` otherwise; the other
    attribute is ``None``.
    """
    load_source_stub = mocker.Mock()
    expected_run_callable = mocker.Mock()

    loaded_module = load_source_stub.return_value
    loaded_module.run = None
    loaded_module.run_basic = None
    entry_point = "run" if run_callable == "run" else "run_basic"
    setattr(loaded_module, entry_point, expected_run_callable)

    monkeypatch.setattr(run.imp, "load_source", load_source_stub)

    kpipe = run.KlioPipeline("my-job", mocker.Mock(), mocker.Mock())
    actual_callable = kpipe._get_run_callable()

    assert expected_run_callable == actual_callable
def run_pipeline(image_tag, direct_runner, update, klio_config, config_meta, blocking):
    """Build a ``KlioPipeline`` from ``klio_config`` and run it.

    ``update`` and ``blocking`` fall back to the values stored on
    ``klio_config`` when they are ``None`` (not explicitly set in the CLI).
    ``config_meta`` is accepted but not used by this function.
    """
    # RunConfig ensures config is pickled and sent to worker. Note this
    # depends on save_main_session being True
    # Notice, this is currently unused due to dataflow pickling issues, leaving
    # in for compatibility until a fix/alternative solution is in place
    klio_transforms_core.RunConfig.set(klio_config)

    # This can only be imported after RunConfig is set since it will end up
    # importing classes that may (or do) attempt to read it
    from klio_exec.commands import run

    # Values not explicitly set in the CLI are looked up in the config.
    if update is None:
        update = klio_config.pipeline_options.update
    if blocking is None:
        blocking = klio_config.job_config.blocking

    runtime_settings = RuntimeConfig(
        image_tag, direct_runner, update, blocking
    )
    pipeline = run.KlioPipeline(
        klio_config.job_name, klio_config, runtime_settings
    )
    pipeline.run()
def test_run_pipeline(
    streaming,
    blocking,
    direct_runner,
    run_error,
    exp_call_count,
    config,
    mocker,
    monkeypatch,
):
    """Check ``KlioPipeline.run`` wires up and runs the Beam pipeline.

    All packaging, option and IO helpers are mocked. The test asserts each
    helper and the pipeline itself are invoked ``exp_call_count`` times, and
    that the run result is waited on when using the direct runner or when
    ``blocking`` is set. When ``run_error`` is given, the first
    ``Pipeline().run()`` call raises it and the second succeeds.
    """
    job_name = "my-job"
    mock_runtime_config = mocker.Mock(
        direct_runner=direct_runner, update=True, blocking=blocking
    )
    mock_verify_packaging = mocker.Mock()
    mock_get_run_callable = mocker.Mock()
    # Transforms are chained with ``|``, so mocked return values are given
    # an ``__or__`` attribute.
    mock_run_callable = mocker.Mock()
    mock_run_callable.return_value.__or__ = mocker.Mock()
    mock_get_run_callable.return_value = mock_run_callable
    mock_get_pipeline_options = mocker.Mock()
    mock_pipeline = mocker.Mock()
    mock_pipeline.return_value.__or__ = mocker.Mock()
    mock_read_from_pubsub = mocker.Mock()
    mock_read_from_pubsub.return_value.__or__ = mocker.Mock()
    mock_read_from_file = mocker.Mock()
    mock_write_to_file = mocker.Mock()
    mock_write_to_pubsub = mocker.Mock()
    mock_write_to_pubsub.return_value.__or__ = mocker.Mock()

    # Point both event mappers at the mocked IO transforms.
    monkeypatch.setattr(
        run.BatchEventMapper, "input", {"file": mock_read_from_file},
    )
    monkeypatch.setattr(
        run.BatchEventMapper, "output", {"file": mock_write_to_file},
    )
    monkeypatch.setattr(
        run.StreamingEventMapper, "input", {"pubsub": mock_read_from_pubsub},
    )
    monkeypatch.setattr(
        run.StreamingEventMapper, "output", {"pubsub": mock_write_to_pubsub},
    )

    # Replace the KlioPipeline helpers that run() relies on.
    monkeypatch.setattr(
        run.KlioPipeline, "_verify_packaging", mock_verify_packaging
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_get_run_callable", mock_get_run_callable
    )
    monkeypatch.setattr(
        run.KlioPipeline, "_get_pipeline_options", mock_get_pipeline_options
    )
    monkeypatch.setattr(run.beam, "Pipeline", mock_pipeline)
    monkeypatch.setattr(run.beam.io, "ReadFromPubSub", mock_read_from_pubsub)
    monkeypatch.setattr(run.beam.io, "WriteToPubSub", mock_write_to_pubsub)

    if streaming:
        # Streaming jobs get a single pubsub input and output event config.
        mock_input = mocker.Mock()
        mock_input.name = "pubsub"
        mock_input.to_io_kwargs.return_value = {
            "subscription": "projects/foo/subscriptions/bar",
        }
        mock_output = mocker.Mock()
        mock_output.name = "pubsub"
        mock_output.to_io_kwargs.return_value = {
            "topic": "projects/foo/topics/bar",
        }
        config.job_config.events.inputs = [mock_input]
        config.job_config.events.outputs = [mock_output]

    if run_error:
        # First run() call raises, the second returns normally.
        mock_pipeline.return_value.run.side_effect = [
            run_error,
            mock_pipeline.return_value.run.return_value,
        ]

    config.pipeline_options.streaming = streaming

    kpipe = run.KlioPipeline(job_name, config, mock_runtime_config)

    kpipe.run()

    assert exp_call_count == mock_verify_packaging.call_count
    mock_verify_packaging.assert_called_with()
    assert exp_call_count == mock_get_run_callable.call_count
    mock_get_run_callable.assert_called_with()
    assert exp_call_count == mock_get_pipeline_options.call_count
    mock_get_pipeline_options.assert_called_with()
    assert (
        exp_call_count
        == mock_get_pipeline_options.return_value.view_as.call_count
    )
    mock_get_pipeline_options.return_value.view_as.assert_called_with(
        pipeline_options.SetupOptions
    )
    assert exp_call_count == mock_pipeline.call_count
    mock_pipeline.assert_called_with(
        options=mock_get_pipeline_options.return_value
    )
    assert exp_call_count == mock_pipeline.return_value.run.call_count

    if direct_runner or blocking:
        result = mock_pipeline.return_value.run.return_value
        result.wait_until_finish.assert_called_once_with()
def test_set_google_cloud_options(
    all_options,
    config,
    update,
    exp_update,
    dataflow_endpoint,
    klio_cli_version,
    deployed_ci,
    user_env,
    gcp_options,
    mocker,
    monkeypatch,
):
    """Check ``_set_google_cloud_options`` populates the GCP options view.

    Verifies the plain option values, the ``update`` flag, the Dataflow
    endpoint (falling back to ``https://dataflow.googleapis.com``), and the
    job labels, which include klio package versions plus optional
    ``deployed_by`` and ``klio-cli`` entries.
    """
    expected_opts = [
        "project",
        "region",
        "temp_location",
        "staging_location",
        "service_account_email",
        "no_auth",
        "template_location",
        "enable_streaming_engine",
        "dataflow_kms_key",
        "flexrs_goal",
    ]
    # this is to be changed when running `tox`; remove when no longer
    # supporting beam 2.14.0
    if dataflow_endpoint:
        all_options["dataflow_endpoint"] = dataflow_endpoint
    else:
        all_options.pop("dataflow_endpoint", None)

    options = pipeline_options.PipelineOptions().from_dictionary(all_options)

    actual_gcp_opts = options.view_as(pipeline_options.GoogleCloudOptions)

    monkeypatch.setattr(
        config.pipeline_options, "dataflow_endpoint", dataflow_endpoint
    )
    # Set up the environment variables used by the cases under test.
    if klio_cli_version:
        monkeypatch.setenv("KLIO_CLI_VERSION", klio_cli_version)
        # Expected label values use dashes in place of dots.
        klio_cli_version_clean = klio_cli_version.replace(".", "-")
    if deployed_ci:
        monkeypatch.setenv("CI", "TRUE")
    if not user_env:
        monkeypatch.delenv("USER", raising=False)

    kpipe = run.KlioPipeline("test-job", config, mocker.Mock(update=update))
    kpipe._set_google_cloud_options(options)

    for opt in expected_opts:
        expected_value = gcp_options[opt]
        # getattr should explode when not setting a default value
        assert expected_value == getattr(actual_gcp_opts, opt)

    assert exp_update == actual_gcp_opts.update
    if dataflow_endpoint:
        assert dataflow_endpoint == actual_gcp_opts.dataflow_endpoint
    else:
        assert (
            "https://dataflow.googleapis.com"
            == actual_gcp_opts.dataflow_endpoint
        )
    # Work out who the ``deployed_by`` label should name, if anyone.
    user = None
    if deployed_ci:
        user = "******"
    elif user_env:
        user = os.environ["USER"]

    klio_exec_version_clean = klio_exec_version.replace(".", "-")
    klio_core_version_clean = klio_core_version.replace(".", "-")
    klio_lib_version_clean = klio_lib_version.replace(".", "-")
    exp_labels = [
        "foo=bar",
        "baz=bla",
        "klio-exec={}".format(klio_exec_version_clean),
        "klio-core={}".format(klio_core_version_clean),
        "klio={}".format(klio_lib_version_clean),
    ]
    if user:
        exp_labels.append("deployed_by={}".format(user).lower())
    if klio_cli_version:
        exp_labels.append("klio-cli={}".format(klio_cli_version_clean))
    # Compare sorted copies so label order does not matter.
    assert sorted(exp_labels) == sorted(actual_gcp_opts.labels)
def test_set_debug_options(config, mocker):
    """Smoke test: ``_set_debug_options`` accepts a mock options object."""
    runtime_conf = mocker.Mock()
    options_stub = mocker.Mock()

    kpipe = run.KlioPipeline("test-job", config, runtime_conf)
    kpipe._set_debug_options(options_stub)