def cleanup():
    """Fixture: build an 8-command config whose jobs 3-8 form a blocking chain.

    The second command ("ls invalid-path") is expected to fail; the config is
    created with cancel_on_blocking_job_failure=True so downstream jobs cancel.
    """
    _do_cleanup()
    commands = ['echo "hello"', "ls invalid-path"] + ['echo "hello"'] * 6
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(
        inputs, cancel_on_blocking_job_failure=True
    )
    # Chain jobs 3..8 so that each one is blocked by its predecessor.
    for job_index in range(3, 9):
        config.get_job(str(job_index)).set_blocking_jobs({job_index - 1})
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()
def create(
    filename,
    append_job_name,
    append_output_dir,
    config_file,
    cancel_on_blocking_job_failure,
    minutes_per_job,
    shuffle,
    strip_whitespace,
    verbose,
):
    """Create a config file from a filename with a list of executable commands."""
    console_level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("auto_config", None, console_level=console_level)
    config = GenericCommandConfiguration.auto_config(
        filename,
        cancel_on_blocking_job_failure=cancel_on_blocking_job_failure,
        minutes_per_job=minutes_per_job,
        append_job_name=append_job_name,
        append_output_dir=append_output_dir,
    )
    if shuffle:
        config.shuffle_jobs()
    print(f"Created configuration with {config.get_num_jobs()} jobs.")
    # indent=None produces a compact dump; 2 produces human-readable output.
    config.dump(config_file, indent=None if strip_whitespace else 2)
    print(f"Dumped configuration to {config_file}.\n")
def test_job_configuration__shuffle_jobs(job_fixture):
    """Verify shuffle_jobs changes the job iteration order."""
    num_jobs = 10
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n" * num_jobs)
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    sequential_names = [str(x) for x in range(1, num_jobs + 1)]
    assert [job.name for job in config.iter_jobs()] == sequential_names
    config.shuffle_jobs()
    assert [job.name for job in config.iter_jobs()] != sequential_names
def test_run_generic_commands(generic_command_fixture):
    """Submit a two-command config, then verify prune-files removes .sh files."""
    commands = ["ls .", "ls invalid-file-path"]
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("\n".join(commands) + "\n")
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2
    config.dump(CONFIG_FILE)
    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -h {FAKE_HPC_CONFIG}",
        # Test with higher queue depth. This exercises the code paths but
        # doesn't actually verify the functionality.
        # The infrastructure to do that is currently lacking. TODO
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32 -h {FAKE_HPC_CONFIG}",
    )
    for cmd in cmds:
        check_run_command(cmd)
        check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")
    assert list(Path(OUTPUT).glob("*.sh"))
    check_run_command(f"jade prune-files {OUTPUT}")
    assert not list(Path(OUTPUT).glob("*.sh"))
def test_config__assign_blocked_by(cleanup):
    """Exercise `jade config assign-blocked-by` for valid and invalid job indexes."""
    config = GenericCommandConfiguration()
    base_cmd = "bash my_script.sh"
    regular_job_names = []
    for i in range(1, 4):
        name = f"job_{i}"
        config.add_job(
            GenericCommandParameters(
                command=f"{base_cmd} {i}",
                name=name,
                append_job_name=True,
                append_output_dir=True,
            )
        )
        regular_job_names.append(name)
    pp_name = "post_process"
    config.add_job(
        GenericCommandParameters(
            command="bash run_post_process.sh",
            name=pp_name,
            append_job_name=True,
            append_output_dir=True,
        )
    )
    config.dump(CONFIG1, indent=2)

    # With no explicit indexes, job 3 becomes blocked by all regular jobs.
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 3 -o {CONFIG2}") == 0
    assert os.path.exists(CONFIG2)
    config_data = load_data(CONFIG2)
    assert sorted(config_data["jobs"][3]["blocked_by"]) == sorted(regular_job_names)
    os.remove(CONFIG2)

    # Explicit blocking-job indexes 1 and 2.
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 3 1 2 -o {CONFIG2}") == 0
    assert os.path.exists(CONFIG2)
    config_data = load_data(CONFIG2)
    expected = [regular_job_names[1], regular_job_names[2]]
    assert sorted(config_data["jobs"][3]["blocked_by"]) == sorted(expected)

    # Include the pp job in blocking-job-indexes.
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 3 1 2 3 -o {CONFIG2}") != 0
    # Invalid job index
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 47 1 2 -o {CONFIG2}") != 0
def cleanup():
    """Fixture: auto-config NUM_COMMANDS echo jobs; clean up before and after."""
    _do_cleanup()
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write('echo "hello world"\n' * NUM_COMMANDS)
    inputs = GenericCommandInputs(TEST_FILENAME)
    GenericCommandConfiguration.auto_config(inputs).dump(CONFIG_FILE)
    yield
    _do_cleanup()
def create_config():
    """Build a 5-job config split across two submission groups.

    Jobs 0-2 go to "group1" (walltime 1:00:00, batch size 3); jobs 3-4 go to
    "group2" (walltime 5:00:00, batch size 1).
    """
    num_commands = 5
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write('echo "hello world"\n' * num_commands)
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for index, job_param in enumerate(inputs.iter_jobs()):
        job_param.submission_group = "group1" if index < 3 else "group2"
        config.add_job(job_param)
    hpc_config1 = load_data(FAKE_HPC_CONFIG)
    hpc_config2 = copy.deepcopy(hpc_config1)
    hpc_config1["hpc"]["walltime"] = "1:00:00"
    hpc_config2["hpc"]["walltime"] = "5:00:00"
    params1 = SubmitterParams(hpc_config=hpc_config1, per_node_batch_size=3)
    params2 = SubmitterParams(hpc_config=hpc_config2, per_node_batch_size=1)
    config.append_submission_group(SubmissionGroup(name="group1", submitter_params=params1))
    config.append_submission_group(SubmissionGroup(name="group2", submitter_params=params2))
    return config
def test_job_configuration__check_job_dependencies_blocking(job_fixture):
    """A job blocked by a nonexistent job must fail the dependency check."""
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n")
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 1
    hpc_config = HpcConfig(**load_data(FAKE_HPC_CONFIG))
    params = SubmitterParams(hpc_config=hpc_config)
    # Job "10" does not exist, so check_job_dependencies must raise.
    config.get_job("1").blocked_by.add("10")
    with pytest.raises(InvalidConfiguration):
        config.check_job_dependencies(params)
    # While we have this setup, verify that submit-jobs calls this function.
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} " "--poll-interval=.1 "
    assert run_command(cmd) != 0
def auto_config(inputs, **kwargs):
    """Create a configuration for generic_command jobs.

    Parameters
    ----------
    inputs : str
        Input file containing commands, one line per command
    kwargs : dict
        Forwarded to GenericCommandConfiguration.auto_config.

    Returns
    -------
    GenericCommandConfiguration

    Raises
    ------
    FileNotFoundError
        Raised if the inputs path does not exist. FileNotFoundError is a
        subclass of OSError, so callers catching OSError still work.
    """
    if not os.path.exists(inputs):
        # More specific than the previous bare OSError; backward compatible.
        raise FileNotFoundError(f"Inputs path '{inputs}' does not exist.")
    return GenericCommandConfiguration.auto_config(inputs, **kwargs)
def test_job_configuration__check_job_dependencies_estimate(job_fixture):
    """check_job_dependencies must raise when per_node_batch_size is 0."""
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n")
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 1
    hpc_config = HpcConfig(**load_data(FAKE_HPC_CONFIG))
    submitter_params = SubmitterParams(hpc_config=hpc_config, per_node_batch_size=0)
    with pytest.raises(InvalidConfiguration):
        config.check_job_dependencies(submitter_params)
def job_too_long():
    """Fixture: config where the second job's estimated runtime is inflated."""
    _do_cleanup()
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write('echo "hello world"\n' * NUM_COMMANDS)
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration.auto_config(inputs, minutes_per_job=10)
    # Override the estimate on the second job only (index 1).
    for index, job in enumerate(config.iter_jobs()):
        if index == 1:
            job.estimated_run_minutes = 1000
            break
    config.dump(CONFIG_FILE)
    yield
    _do_cleanup()
def test_job_configuration__custom_names(job_fixture):
    """Custom job names are accepted; duplicate names are rejected."""
    num_jobs = 3
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n" * num_jobs)
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for index, job_param in enumerate(inputs.iter_jobs()):
        job_param.name = f"job_{index}"
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    # "job_2" already exists, so adding another job with that name must raise.
    duplicate = GenericCommandParameters(command="echo hello world", name="job_2")
    with pytest.raises(InvalidConfiguration):
        config.add_job(duplicate)
def cluster():
    """Fixture: create a Cluster over a two-command config inside OUTPUT.

    Yields the created Cluster and removes OUTPUT on teardown.
    """
    os.makedirs(OUTPUT, exist_ok=True)
    commands = ["echo 'hello'"] * 2
    cmd_file = os.path.join(OUTPUT, "commands.txt")
    with open(cmd_file, "w") as f_out:
        for cmd in commands:
            f_out.write(cmd + "\n")
    jade_config = GenericCommandConfiguration.auto_config(cmd_file)
    config_file = os.path.join(OUTPUT, CONFIG_FILE)
    jade_config.dump(config_file)
    # NOTE(review): the original built HpcConfig(hpc_type="slurm",
    # hpc=SlurmConfig(account="abc")) here but never used it; removed as
    # dead code.
    cluster = Cluster.create(OUTPUT, jade_config)
    yield cluster
    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
def test_sorted_order(generic_command_fixture):
    """Job IDs are assigned sequentially starting at 1."""
    # Create (truncate) the test file; its contents are not used here.
    with open(TEST_FILENAME, "w"):
        pass
    config = GenericCommandConfiguration()
    num_jobs = 20
    for _ in range(num_jobs):
        config.add_job(GenericCommandParameters(command="echo hello"))
    assert config.get_num_jobs() == num_jobs
    assert [job.job_id for job in config.iter_jobs()] == list(range(1, num_jobs + 1))
def test_try_add_blocked_jobs(cleanup):
    """Verify batch composition with and without --try-add-blocked-jobs.

    With the option enabled, the blocked final job is pulled into the single
    batch; with it disabled, it is submitted in a second, separate batch.
    """
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for i, job_param in enumerate(inputs.iter_jobs()):
        # The last job is blocked by all of the earlier jobs.
        if i == num_commands - 1:
            job_param.blocked_by = set([1, 2, 3, 4])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    for option in ("--try-add-blocked-jobs", "--no-try-add-blocked-jobs"):
        cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1 {option}"
        ret = run_command(cmd)
        assert ret == 0
        ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
        assert ret == 0
        # NOTE(review): the original assigned events_file =
        # os.path.join(OUTPUT, "submit_jobs_events.log") but never used it;
        # removed as dead code.
        events_summary = EventsSummary(OUTPUT, preload=True)
        submit_events = events_summary.list_events(EVENT_NAME_HPC_SUBMIT)
        if option == "--try-add-blocked-jobs":
            # All jobs fit in one submission.
            assert len(submit_events) == 1
            event = submit_events[0]
            assert event.data["batch_size"] == num_commands
            shutil.rmtree(OUTPUT)
        else:
            # The blocked job is deferred to a second submission.
            assert len(submit_events) == 2
            event1, event2 = submit_events
            assert event1.data["batch_size"] == num_commands - 1
            assert event2.data["batch_size"] == 1
def test_job_order(generic_command_fixture):
    """Blocked jobs must complete after the jobs that block them."""
    num_jobs = 50
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("echo hello world\n" * num_jobs)
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    first_job = config.get_job("1")
    # NOTE(review): ints are added here while strings are used below;
    # presumably the config normalizes both forms — confirm in the project.
    for blocker in range(10, 15):
        first_job.blocked_by.add(blocker)
    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)
    cmd = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
        "--per-node-batch-size=10 "
        "--max-nodes=4 "
        "--poll-interval=0.1 "
        f"--hpc-config {FAKE_HPC_CONFIG} "
        "--num-processes=10"
    )
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")
    results = ResultsSummary(OUTPUT).list_results()
    assert len(results) == num_jobs
    tracker = {result.name: result for result in results}
    for blocker in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(blocker)].completion_time
    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]
def test_resubmit_with_blocking_jobs(basic_setup):
    """Verify resubmission after a job is dropped from a config with blockers."""
    num_commands = 7
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write('echo "hello world"\n' * num_commands)
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    blocked_by_index = {3: {5}, 4: {7}, 6: {6}}
    for index, job_param in enumerate(inputs.iter_jobs()):
        if index in blocked_by_index:
            job_param.blocked_by = blocked_by_index[index]
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0

    # Remove job "7" from the aggregated results to simulate a missing job.
    found = False
    for index, result in enumerate(results):
        if result.name == "7":
            results.pop(index)
            found = True
            break
    assert found
    agg._write_results(results)

    # Rewrite the results file so job "7" is recorded as missing.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for index, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(index)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    # Resubmit; the missing job plus the jobs it blocks should run again.
    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    assert len(load_data(second_batch_file)["jobs"]) == 3