def test_launch_slurm_cluster_orc_reconnect(fileutils): """test reconnecting to clustered 3-node orchestrator""" exp_name = "test-launch-slurm-cluster-orc-batch-reconect" exp = Experiment(exp_name, launcher="slurm") test_dir = fileutils.make_test_dir(exp_name) # batch = False to launch on existing allocation orc = SlurmOrchestrator(6780, db_nodes=3, batch=True) orc.set_path(test_dir) exp.start(orc, block=True) status = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process if constants.STATUS_FAILED in status: exp.stop(orc) assert False exp_name = "test-orc-slurm-cluster-orc-batch-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="slurm") checkpoint = osp.join(test_dir, "smartsim_db.dat") reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) # let statuses update once time.sleep(5) statuses = exp_2.get_status(reloaded_orc) for stat in statuses: if stat == constants.STATUS_FAILED: exp_2.stop(reloaded_orc) assert False exp_2.stop(reloaded_orc)
def test_reconnect_local_orc(): """Test reconnecting to orchestrator from first experiment""" global first_dir # start new experiment exp_name = "test-orc-local-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="local") checkpoint = osp.join(first_dir, "smartsim_db.dat") reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) # let statuses update once time.sleep(5) statuses = exp_2.get_status(reloaded_orc) for stat in statuses: if stat == constants.STATUS_FAILED: exp_2.stop(reloaded_orc) assert False exp_2.stop(reloaded_orc)