def test_launch_slurm_cluster_orc(fileutils, wlmutils):
    """Launch a clustered 3-node orchestrator on an existing Slurm allocation.

    Skipped unless the test launcher is Slurm. The orchestrator must not
    report a failed status after starting, and every status must be
    cancelled once it has been stopped.
    """
    # TODO detect number of nodes in allocation and skip if not sufficient
    launcher = wlmutils.get_test_launcher()
    if launcher != "slurm":
        pytest.skip("Test only runs on systems with Slurm as WLM")

    exp_name = "test-launch-slurm-cluster-orc"
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    # batch=False launches on the existing allocation
    orc = SlurmOrchestrator(6780, db_nodes=3, batch=False)
    orc.set_path(test_dir)

    exp.start(orc, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(orc)

    # Stop first, assert afterwards: a failed launch must never leave an
    # orphan orchestrator process behind.
    exp.stop(orc)
    assert not launch_failed

    final_statuses = exp.get_status(orc)
    assert all(stat == constants.STATUS_CANCELLED for stat in final_statuses)
def test_launch_pbs_orc(fileutils, wlmutils):
    """Launch a single-node orchestrator on an existing PBS allocation.

    Skipped unless the test launcher is PBS. The orchestrator must not
    report a failed status after starting, and every status must be
    cancelled once it has been stopped.
    """
    launcher = wlmutils.get_test_launcher()
    if launcher != "pbs":
        pytest.skip("Test only runs on systems with PBSPro as WLM")

    exp_name = "test-launch-pbs-orc"
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    # batch=False launches on the existing allocation
    orc = PBSOrchestrator(6780, batch=False)
    orc.set_path(test_dir)

    exp.start(orc, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(orc)

    # Stop first, assert afterwards: a failed launch must never leave an
    # orphan orchestrator process behind.
    exp.stop(orc)
    assert not launch_failed

    final_statuses = exp.get_status(orc)
    assert all(stat == constants.STATUS_CANCELLED for stat in final_statuses)
def test_launch_pbs_cluster_orc(fileutils, wlmutils):
    """Launch a clustered 3-node orchestrator on an existing PBS allocation.

    This test will fail if the PBS allocation is not obtained with
    `-l place=scatter`. It will also fail if there are not enough nodes
    in the allocation to support a 3 node deployment.
    """
    launcher = wlmutils.get_test_launcher()
    if launcher != "pbs":
        pytest.skip("Test only runs on systems with PBSPro as WLM")

    exp_name = "test-launch-pbs-cluster-orc"
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    # batch=False launches on the existing allocation
    orc = PBSOrchestrator(6780, db_nodes=3, batch=False, inter_op_threads=4)
    orc.set_path(test_dir)

    exp.start(orc, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(orc)

    # Stop first, assert afterwards: a failed launch must never leave an
    # orphan orchestrator process behind.
    exp.stop(orc)
    assert not launch_failed

    final_statuses = exp.get_status(orc)
    assert all(stat == constants.STATUS_CANCELLED for stat in final_statuses)
def test_bad_run_command_args(fileutils, wlmutils):
    """Starting a model with a bogus run-command argument must raise.

    Only exercised with the Slurm launcher; the test is skipped for any
    other launcher. It ensures that immediate launch failures are caught.
    """
    launcher = wlmutils.get_test_launcher()
    if launcher != "slurm":
        pytest.skip(f"Only fails with slurm. Launcher is {launcher}")

    exp_name = "test-bad-run-command-args"
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)
    script = fileutils.get_test_conf_path("bad.py")

    # ``badarg`` is turned into an argument for the run command of the
    # system's WLM, which is expected to reject it.
    exe_args = f"{script} --time=5"
    bad_settings = wlmutils.get_run_settings("python", exe_args, badarg="bad-arg")
    bad_model = exp.create_model("bad-model", path=test_dir, run_settings=bad_settings)

    with pytest.raises(SmartSimError):
        exp.start(bad_model)
def test_ensemble(fileutils):
    """Generate an ensemble and check its member directories exist.

    Two parameters with three values each are expected to yield nine
    members, each with its own ``test_<i>`` directory under ``test``.
    """
    exp = Experiment("gen-test", launcher="local")
    test_dir = fileutils.get_test_dir("gen_ensemble_test")
    gen = Generator(test_dir)

    values = [10, 20, 30]
    params = {"THERMO": list(values), "STEPS": list(values)}
    ensemble = exp.create_ensemble("test", params=params, run_settings=rs)

    config = fileutils.get_test_conf_path("in.atm")
    ensemble.attach_generator_files(to_configure=config)
    gen.generate_experiment(ensemble)

    assert len(ensemble) == 9
    assert osp.isdir(osp.join(test_dir, "test"))
    for index in range(9):
        member_dir = osp.join(test_dir, f"test/test_{index}")
        assert osp.isdir(member_dir)
def test_ensemble_overwrite_error(fileutils):
    """Generating the same ensemble twice must raise ``FileExistsError``."""
    exp = Experiment("gen-test-overwrite-error", launcher="local")
    test_dir = fileutils.get_test_dir("test_gen_overwrite_error")
    gen = Generator(test_dir)

    values = [10, 20, 30]
    params = {"THERMO": list(values), "STEPS": list(values)}
    ensemble = exp.create_ensemble("test", params=params, run_settings=rs)

    # first generation succeeds
    first_config = fileutils.get_test_conf_path("in.atm")
    ensemble.attach_generator_files(to_configure=[first_config])
    gen.generate_experiment(ensemble)

    # generating again over the existing output must be refused
    second_config = fileutils.get_test_conf_path("in.atm")
    ensemble.attach_generator_files(to_configure=[second_config])
    with pytest.raises(FileExistsError):
        gen.generate_experiment(ensemble)
def test_local_orchestrator(fileutils):
    """Test launching orchestrator locally.

    The test directory is stored in the module-level ``first_dir`` and the
    orchestrator is not stopped here; monitoring is switched off instead to
    simulate the user's main thread going away.
    """
    global first_dir
    exp_name = "test-orc-launch-local"
    exp = Experiment(exp_name, launcher="local")
    test_dir = fileutils.make_test_dir(exp_name)
    first_dir = test_dir

    orc = Orchestrator(port=6780)
    orc.set_path(test_dir)

    exp.start(orc)
    statuses = exp.get_status(orc)
    # Asserting on the bare list would always pass (a non-empty list is
    # truthy); ``all`` makes the check actually inspect each status.
    assert all(stat != constants.STATUS_FAILED for stat in statuses)

    # simulate user shutting down main thread
    exp._control._jobs.actively_monitoring = False
    exp._control._launcher.task_manager.actively_monitoring = False
def test_dir_files(fileutils):
    """Generate an ensemble whose copied input is a directory tree.

    Every generated member directory must contain the ``test_dir_1``
    subdirectory and the ``test.py`` file.
    """
    test_dir = fileutils.make_test_dir("gen_dir_test")
    exp = Experiment("gen-test", test_dir, launcher="local")

    values = [10, 20, 30]
    params = {"THERMO": list(values), "STEPS": list(values)}
    ensemble = exp.create_ensemble("dir_test", params=params, run_settings=rs)

    conf_dir = fileutils.get_test_dir_path("test_dir")
    ensemble.attach_generator_files(to_copy=conf_dir)
    exp.generate(ensemble)

    assert osp.isdir(osp.join(test_dir, "dir_test/"))
    for index in range(9):
        member_dir = osp.join(test_dir, f"dir_test/dir_test_{index}")
        assert osp.isdir(member_dir)
        assert osp.isdir(osp.join(member_dir, "test_dir_1"))
        assert osp.isfile(osp.join(member_dir, "test.py"))
def test_launch_slurm_cluster_orc(fileutils):
    """Launch a clustered 3-node orchestrator with ``batch=True`` on Slurm.

    The orchestrator must not report a failed status after starting, and
    every status must be cancelled once it has been stopped.
    """
    exp_name = "test-launch-slurm-cluster-orc-batch"
    exp = Experiment(exp_name, launcher="slurm")
    test_dir = fileutils.make_test_dir(exp_name)

    # batch=True: the orchestrator is launched as a batch workload
    orc = SlurmOrchestrator(6780, db_nodes=3, batch=True)
    orc.set_path(test_dir)

    exp.start(orc, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(orc)

    # Stop first, assert afterwards: a failed launch must never leave an
    # orphan orchestrator process behind.
    exp.stop(orc)
    assert not launch_failed

    final_statuses = exp.get_status(orc)
    assert all(stat == constants.STATUS_CANCELLED for stat in final_statuses)
def test_full_exp(fileutils):
    """Generate an orchestrator, an ensemble and a model in one call.

    Checks the directory layout produced under the experiment path.
    """
    test_dir = fileutils.make_test_dir("gen_full_test")
    exp = Experiment("gen-test", test_dir, launcher="local")

    model = exp.create_model("model", run_settings=rs)
    script = fileutils.get_test_conf_path("sleep.py")
    model.attach_generator_files(to_copy=script)

    orc = Orchestrator(6780)

    values = [10, 20, 30]
    params = {"THERMO": list(values), "STEPS": list(values)}
    ensemble = exp.create_ensemble("test_ens", params=params, run_settings=rs)
    config = fileutils.get_test_conf_path("in.atm")
    ensemble.attach_generator_files(to_configure=config)

    exp.generate(orc, ensemble, model)

    # ensemble directory and one directory per member
    assert osp.isdir(osp.join(test_dir, "test_ens/"))
    for index in range(9):
        assert osp.isdir(osp.join(test_dir, f"test_ens/test_ens_{index}"))

    # orchestrator directory
    assert osp.isdir(osp.join(test_dir, "database"))

    # model directory and its copied script
    assert osp.isdir(osp.join(test_dir, "model"))
    assert osp.isfile(osp.join(test_dir, "model/sleep.py"))
def test_stop_entity(fileutils, wlmutils):
    """Stop a running model and check it is recorded as cancelled."""
    exp_name = "test-launch-stop-model"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=10")
    model = exp.create_model("m1", path=test_dir, run_settings=settings)

    # launch without blocking, then stop the model while it is still sleeping
    exp.start(model, block=False)
    time.sleep(5)
    exp.stop(model)

    assert model.name in exp._control._jobs.completed
    model_status = exp.get_status(model)[0]
    assert model_status == constants.STATUS_CANCELLED
def test_restart(fileutils, wlmutils):
    """Run two models to completion, then start the same models again."""
    exp_name = "test-restart"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=5")
    first = exp.create_model("m1", path=test_dir, run_settings=settings)
    second = exp.create_model("m2", path=test_dir, run_settings=settings)

    # initial run
    exp.start(first, second, block=True)
    initial_statuses = exp.get_status(first, second)
    assert all(stat == constants.STATUS_COMPLETED for stat in initial_statuses)

    # restart of the very same entities
    exp.start(first, second, block=True)
    restart_statuses = exp.get_status(first, second)
    assert all(stat == constants.STATUS_COMPLETED for stat in restart_statuses)
def test_orchestrator_relaunch(fileutils):
    """Test error when users try to launch second orchestrator.

    A first orchestrator is started on port 6780; starting a second one
    (port 6790) from the same experiment must raise ``SmartSimError``.
    """
    exp_name = "test-orc-error-on-relaunch"
    exp = Experiment(exp_name, launcher="local")
    test_dir = fileutils.make_test_dir(exp_name)

    orc = Orchestrator(port=6780)
    orc.set_path(test_dir)
    orc_1 = Orchestrator(port=6790)
    orc_1.set_path(test_dir)

    exp.start(orc)
    try:
        with pytest.raises(SmartSimError):
            exp.start(orc_1)
    finally:
        # Stop the first orchestrator even when the expectation above is
        # not met, so a failing test does not leave an orphan process.
        exp.stop(orc)
def test_stop_entity_list(fileutils, wlmutils):
    """Stop a running two-replica ensemble and check every member is cancelled."""
    exp_name = "test-launch-stop-ensemble"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=10")
    ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2)
    ensemble.set_path(test_dir)

    # launch without blocking, then stop the members while they still sleep
    exp.start(ensemble, block=False)
    time.sleep(5)
    exp.stop(ensemble)

    member_statuses = exp.get_status(ensemble)
    assert all(stat == constants.STATUS_CANCELLED for stat in member_statuses)

    completed_jobs = exp._control._jobs.completed
    assert all(member.name in completed_jobs for member in ensemble)
def test_reconnect_local_orc():
    """Reconnect a second experiment to the orchestrator of the first one.

    The checkpoint file ``smartsim_db.dat`` is looked up in the directory
    stored in the module-level ``first_dir``. The reconnected orchestrator
    is always stopped, and the test fails if any status is failed.
    """
    global first_dir

    # start new experiment
    exp_name = "test-orc-local-reconnect-2nd"
    exp_2 = Experiment(exp_name, launcher="local")

    checkpoint = osp.join(first_dir, "smartsim_db.dat")
    reloaded_orc = exp_2.reconnect_orchestrator(checkpoint)

    # let statuses update once
    time.sleep(5)

    statuses = exp_2.get_status(reloaded_orc)
    any_failed = any(stat == constants.STATUS_FAILED for stat in statuses)

    # stop before asserting so the orchestrator is shut down either way
    exp_2.stop(reloaded_orc)
    assert not any_failed
def test_summary(fileutils, wlmutils):
    """Fairly rudimentary test of the summary dataframe.

    Runs one model that completes and one that fails, then checks the
    first two rows of the summary against them.
    """
    exp_name = "test-launch-summary"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    bad_script = fileutils.get_test_conf_path("bad.py")
    sleep_settings = wlmutils.get_run_settings("python", f"{sleep_script} --time=3")
    bad_settings = wlmutils.get_run_settings("python", f"{bad_script} --time=6")

    sleep_model = exp.create_model("sleep", path=test_dir, run_settings=sleep_settings)
    bad_model = exp.create_model("bad", path=test_dir, run_settings=bad_settings)

    # start and poll
    exp.start(sleep_model, bad_model)
    assert exp.get_status(bad_model)[0] == constants.STATUS_FAILED
    assert exp.get_status(sleep_model)[0] == constants.STATUS_COMPLETED

    summary_df = exp.summary()
    print(summary_df)

    # row 0: the model that completed
    sleep_row = summary_df.loc[0]
    assert sleep_model.name == sleep_row["Name"]
    assert sleep_model.type == sleep_row["Entity-Type"]
    assert 0 == int(sleep_row["RunID"])
    assert 0 == int(sleep_row["Returncode"])

    # row 1: the model that failed, so a non-zero return code is expected
    bad_row = summary_df.loc[1]
    assert bad_model.name == bad_row["Name"]
    assert bad_model.type == bad_row["Entity-Type"]
    assert 0 == int(bad_row["RunID"])
    assert 0 != int(bad_row["Returncode"])
def test_model_failure(fileutils):
    """A locally launched model running ``bad.py`` must end up failed."""
    exp_name = "test-model-failure"
    exp = Experiment(exp_name, launcher="local")
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("bad.py")
    settings = RunSettings("python", f"{script} --time=3")
    failing_model = exp.create_model("m1", path=test_dir, run_settings=settings)

    exp.start(failing_model, block=True)

    model_statuses = exp.get_status(failing_model)
    assert all(stat == constants.STATUS_FAILED for stat in model_statuses)
def test_ensemble(fileutils, wlmutils):
    """Launch a two-replica ensemble and check every member completes."""
    exp_name = "test-ensemble-launch"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=5")
    ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2)
    ensemble.set_path(test_dir)

    exp.start(ensemble, block=True)

    member_statuses = exp.get_status(ensemble)
    assert all(stat == constants.STATUS_COMPLETED for stat in member_statuses)
def test_failed_status(fileutils, wlmutils):
    """Test when a failure occurs deep into model execution.

    The model is started without blocking and polled until the experiment
    reports it finished; its status must then be failed.
    """
    exp_name = "test-report-failure"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("bad.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=7")
    bad_model = exp.create_model("bad-model", path=test_dir, run_settings=settings)

    exp.start(bad_model, block=False)

    # poll every two seconds until the model is done
    while True:
        if exp.finished(bad_model):
            break
        time.sleep(2)

    final_status = exp.get_status(bad_model)[0]
    assert final_status == constants.STATUS_FAILED
def test_models(fileutils):
    """Launch two models locally and check both complete."""
    exp_name = "test-models-local-launch"
    exp = Experiment(exp_name, launcher="local")
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = RunSettings("python", f"{script} --time=3")
    first = exp.create_model("m1", path=test_dir, run_settings=settings)
    second = exp.create_model("m2", path=test_dir, run_settings=settings)

    exp.start(first, second, block=True, summary=True)

    model_statuses = exp.get_status(first, second)
    assert all(stat == constants.STATUS_COMPLETED for stat in model_statuses)
def test_batch_ensemble_replicas(fileutils, wlmutils):
    """Launch a two-replica ensemble built with ``SbatchSettings`` batch settings."""
    exp_name = "test-slurm-batch-ensemble-replicas"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=5")

    batch = SbatchSettings(nodes=2, time="00:01:00")
    ensemble = exp.create_ensemble(
        "batch-ens-replicas",
        batch_settings=batch,
        run_settings=settings,
        replicas=2,
    )
    ensemble.set_path(test_dir)

    exp.start(ensemble, block=True)

    member_statuses = exp.get_status(ensemble)
    assert all(stat == constants.STATUS_COMPLETED for stat in member_statuses)
def test_batch_ensemble(fileutils, wlmutils):
    """Test the launch of a manually constructed batch ensemble.

    Two models are created first and then added one by one to an ensemble
    that only carries batch settings.
    """
    exp_name = "test-slurm-batch-ensemble"
    launcher = wlmutils.get_test_launcher()
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_run_settings("python", f"{script} --time=5")
    first = exp.create_model("m1", path=test_dir, run_settings=settings)
    second = exp.create_model("m2", path=test_dir, run_settings=settings)

    batch = SbatchSettings(nodes=2, time="00:01:00")
    ensemble = exp.create_ensemble("batch-ens", batch_settings=batch)
    for member in (first, second):
        ensemble.add_model(member)
    ensemble.set_path(test_dir)

    exp.start(ensemble, block=True)

    member_statuses = exp.get_status(ensemble)
    assert all(stat == constants.STATUS_COMPLETED for stat in member_statuses)
def test_launch_pbs_mpmd():
    """Test the launch of an aprun MPMD workload.

    This test will obtain an allocation as a batch workload.
    Aprun MPMD workloads share an output file for all processes
    and they share MPI_COMM_WORLDs.

    Prior to running this test, hw_mpi.c in test_configs needs
    to be compiled. #TODO write a script for this.
    """
    exp = Experiment("pbs-test", launcher="pbs")

    # both programs of the MPMD workload use the same run arguments
    run_args = {"pes": 1, "pes-per-node": 1}
    primary = AprunSettings("./hellow", run_args=run_args)
    secondary = AprunSettings("./hellow", run_args=run_args)
    primary.make_mpmd(secondary)

    model = exp.create_model("hello_world", run_settings=primary)

    qsub = QsubBatchSettings(nodes=2, ppn=1, time="1:00:00")
    ensemble = exp.create_ensemble("ensemble", batch_settings=qsub)
    ensemble.add_model(model)

    exp.start(ensemble)
def test_consumer(fileutils):
    """Run three processes: two producers and one consumer.

    Each of the first two processes puts a tensor on the DB; the third
    process accesses the tensors put by the two producers. Finally, the
    tensor is used to run a model by each producer and the consumer
    accesses the two results.
    """
    test_dir = fileutils.make_test_dir("smartredis_ensemble_consumer_test")
    exp = Experiment("smartredis_ensemble_consumer", exp_path=test_dir, launcher="local")

    # create and start a database
    orc = Orchestrator(port=REDIS_PORT)
    exp.generate(orc)
    exp.start(orc, block=False)

    try:
        rs_prod = RunSettings("python", "producer.py")
        rs_consumer = RunSettings("python", "consumer.py")
        params = {"mult": [1, -10]}
        ensemble = Ensemble(
            name="producer",
            params=params,
            run_settings=rs_prod,
            perm_strat="step",
        )

        consumer = Model("consumer", params={}, path=ensemble.path, run_settings=rs_consumer)
        ensemble.add_model(consumer)

        # register the two producers as incoming entities of the ensemble
        ensemble.register_incoming_entity(ensemble[0])
        ensemble.register_incoming_entity(ensemble[1])

        config = fileutils.get_test_conf_path("smartredis")
        ensemble.attach_generator_files(to_copy=[config])

        exp.generate(ensemble)

        # start the models
        exp.start(ensemble, summary=False)

        # get and confirm statuses
        statuses = exp.get_status(ensemble)
        assert all(stat == constants.STATUS_COMPLETED for stat in statuses)
    finally:
        # Stop the orchestrator even when setup, launch or the status check
        # fails, so a failing test does not leave an orphan database.
        exp.stop(orc)

    print(exp.summary())
def test_exchange(fileutils):
    """Run two processes that exchange tensors through the DB.

    Each process puts a tensor on the DB, then accesses the other
    process's tensor. Finally, the tensor is used to run a model.
    """
    test_dir = fileutils.make_test_dir("smartredis_ensemble_exchange_test")
    exp = Experiment("smartredis_ensemble_exchange", exp_path=test_dir, launcher="local")

    # create and start a database
    orc = Orchestrator(port=REDIS_PORT)
    exp.generate(orc)
    exp.start(orc, block=False)

    try:
        rs = RunSettings("python", "producer.py --exchange")
        params = {"mult": [1, -10]}
        ensemble = Ensemble(
            name="producer",
            params=params,
            run_settings=rs,
            perm_strat="step",
        )

        # register both members as incoming entities of the ensemble
        ensemble.register_incoming_entity(ensemble[0])
        ensemble.register_incoming_entity(ensemble[1])

        config = fileutils.get_test_conf_path("smartredis")
        ensemble.attach_generator_files(to_copy=[config])

        exp.generate(ensemble)

        # start the models
        exp.start(ensemble, summary=False)

        # get and confirm statuses
        statuses = exp.get_status(ensemble)
        assert all(stat == constants.STATUS_COMPLETED for stat in statuses)
    finally:
        # Stop the orchestrator even when setup, launch or the status check
        # fails, so a failing test does not leave an orphan database.
        exp.stop(orc)

    print(exp.summary())
def mom6_colocated_driver(
    walltime="02:00:00",
    ensemble_size=1,
    nodes_per_member=15,
    tasks_per_node=17,
    mom6_exe_path=(
        "/lus/cls01029/shao/dev/gfdl/MOM6-examples/build/gnu/"
        "ice_ocean_SIS2/repro/MOM6"
    ),
    ensemble_node_features="P100",
    mask_table="mask_table.33.16x18",
    domain_layout="16,18",
    eke_model_name="ncar_ml_eke.gpu.pt",
    eke_backend="GPU",
    orchestrator_port=6780,
    orchestrator_interface="ipogif0",
    colocated_stride=18,
    orchestrator_cpus=4,
    limit_orchestrator_cpus=False,
):
    """Run a MOM6 OM4_025 simulation using a colocated deployment for
    online machine-learning inference

    :param walltime: how long to allocate for the run, "hh:mm:ss"
    :type walltime: str, optional
    :param ensemble_size: number of members in the ensemble
    :type ensemble_size: int, optional
    :param nodes_per_member: number of nodes allocated to each ensemble
                             member
    :type nodes_per_member: int, optional
    :param tasks_per_node: how many MPI ranks to be run per node
    :type tasks_per_node: int, optional
    :param mom6_exe_path: full path to the compiled MOM6 executable
    :type mom6_exe_path: str, optional
    :param ensemble_node_features: (Slurm-only) Constraints/features for
                                   the node
    :type ensemble_node_features: str, optional
    :param mask_table: the file to use for the specified layout
                       eliminating land domains
    :type mask_table: str, optional
    :param domain_layout: the particular domain decomposition
    :type domain_layout: str, optional
    :param eke_model_name: file containing the saved machine-learning model
    :type eke_model_name: str, optional
    :param eke_backend: (CPU or GPU), sets whether the ML-EKE model will be
                        run on CPU or GPU
    :type eke_backend: str, optional
    :param orchestrator_port: port that the database will listen on
    :type orchestrator_port: int, optional
    :param orchestrator_interface: network interface bound to the
                                   orchestrator
    :type orchestrator_interface: str, optional
    :param colocated_stride: forwarded unchanged to
                             ``configure_mom_ensemble`` as
                             ``colocated_stride`` (exact meaning is defined
                             by that helper -- TODO confirm)
    :type colocated_stride: int, optional
    :param orchestrator_cpus: Specify the number of cores that the
                              orchestrator can use to handle requests
    :type orchestrator_cpus: int, optional
    :param limit_orchestrator_cpus: Limit the number of CPUs that the
                                    orchestrator can use to handle requests
    :type limit_orchestrator_cpus: bool, optional
    """
    exp = Experiment("AI-EKE-MOM6", launcher="auto")

    # build the ensemble of MOM6 members
    ensemble = create_mom_ensemble(
        exp,
        walltime,
        ensemble_size,
        nodes_per_member,
        tasks_per_node,
        mom6_exe_path,
        ensemble_node_features,
    )

    # configure the members; the two boolean flags are passed positionally
    configure_mom_ensemble(
        ensemble,
        True,
        False,
        mask_table,
        domain_layout,
        eke_model_name,
        eke_backend,
        colocated_stride=colocated_stride,
    )

    # attach the colocated orchestrator to the ensemble
    add_colocated_orchestrator(
        ensemble,
        orchestrator_port,
        orchestrator_interface,
        orchestrator_cpus,
        limit_orchestrator_cpus,
    )

    exp.generate(ensemble, overwrite=True)
    exp.start(ensemble, summary=True)
    exp.stop()
from copy import deepcopy import pytest from smartsim import Experiment from smartsim.database import Orchestrator from smartsim.error import SmartSimError from smartsim.settings import RunSettings from smartsim.utils.entityutils import separate_entities # ---- create entities for testing -------- rs = RunSettings("python", "sleep.py") exp = Experiment("util-test", launcher="local") model = exp.create_model("model_1", run_settings=rs) model_2 = exp.create_model("model_1", run_settings=rs) ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1) orc = Orchestrator() orc_1 = deepcopy(orc) def test_separate(): ent, ent_list, _orc = separate_entities([model, ensemble, orc]) assert ent[0] == model assert ent_list[0] == ensemble assert _orc == orc def test_two_orc(): with pytest.raises(SmartSimError):
def test_model_prefix():
    """A model created with ``enable_key_prefixing=True`` has key prefixing enabled."""
    exp = Experiment("test")
    model = exp.create_model("model", RunSettings("python"), enable_key_prefixing=True)
    # rely on truthiness rather than comparing a boolean to True with ``==``
    assert model._key_prefixing_enabled
def mom6_clustered_driver(
    walltime="02:00:00",
    ensemble_size=1,
    nodes_per_member=25,
    tasks_per_node=45,
    mom6_exe_path=(
        "/lus/cls01029/shao/dev/gfdl/MOM6-examples/build/gnu/"
        "ice_ocean_SIS2/repro/MOM6"
    ),
    ensemble_node_features="[CL48|SK48|SK56]",
    mask_table="mask_table.315.32x45",
    domain_layout="32,45",
    eke_model_name="ncar_ml_eke.gpu.pt",
    eke_backend="GPU",
    orchestrator_port=6780,
    orchestrator_interface="ipogif0",
    orchestrator_nodes=3,
    orchestrator_node_features="P100",
    configure_only=False,
):
    """Run a MOM6 OM4_025 simulation with a cluster of databases used for
    machine-learning inference

    :param walltime: how long to allocate for the run, "hh:mm:ss"
    :type walltime: str, optional
    :param ensemble_size: number of members in the ensemble
    :type ensemble_size: int, optional
    :param nodes_per_member: number of nodes allocated to each ensemble
                             member
    :type nodes_per_member: int, optional
    :param tasks_per_node: how many MPI ranks to be run per node
    :type tasks_per_node: int, optional
    :param mom6_exe_path: full path to the compiled MOM6 executable
    :type mom6_exe_path: str, optional
    :param ensemble_node_features: (Slurm-only) Constraints/features for
                                   the node
    :type ensemble_node_features: str, optional
    :param mask_table: the file to use for the specified layout
                       eliminating land domains
    :type mask_table: str, optional
    :param domain_layout: the particular domain decomposition
    :type domain_layout: str, optional
    :param eke_model_name: file containing the saved machine-learning model
    :type eke_model_name: str, optional
    :param eke_backend: (CPU or GPU), sets whether the ML-EKE model will be
                        run on CPU or GPU
    :type eke_backend: str, optional
    :param orchestrator_port: port that the database will listen on
    :type orchestrator_port: int, optional
    :param orchestrator_interface: network interface bound to the database
    :type orchestrator_interface: str, optional
    :param orchestrator_nodes: number of orchestrator nodes to use
    :type orchestrator_nodes: int, optional
    :param orchestrator_node_features: (Slurm-only) node features requested
                                       for the orchestrator nodes
    :type orchestrator_node_features: str, optional
    :param configure_only: If True, only configure and generate the
                           experiment, then return the experiment, the
                           ensemble and the orchestrator without starting
                           anything
    :type configure_only: bool, optional
    :return: ``(experiment, mom_ensemble, orchestrator)`` when
             ``configure_only`` is True, otherwise None
    """
    exp = Experiment("AI-EKE-MOM6", launcher="auto")

    # build the ensemble of MOM6 members
    ensemble = create_mom_ensemble(
        exp,
        walltime,
        ensemble_size,
        nodes_per_member,
        tasks_per_node,
        mom6_exe_path,
        ensemble_node_features,
    )

    # the third positional flag is only true for three or more database nodes
    configure_mom_ensemble(
        ensemble,
        False,
        orchestrator_nodes >= 3,
        mask_table,
        domain_layout,
        eke_model_name,
        eke_backend,
    )

    # create the separate database deployment
    database = create_distributed_orchestrator(
        exp,
        orchestrator_port,
        orchestrator_interface,
        orchestrator_nodes,
        orchestrator_node_features,
        walltime,
    )

    exp.generate(ensemble, database, overwrite=True)

    # hand the configured objects back without launching anything
    if configure_only:
        return exp, ensemble, database

    exp.start(ensemble, database, summary=True)
    exp.stop(database)
def test_bad_ensemble_init_no_rs_bs():
    """Creating an ensemble with neither run settings nor batch settings must raise."""
    experiment = Experiment("test")
    with pytest.raises(SmartSimError):
        experiment.create_ensemble("name")