def test_resubmit_missing(cleanup):
    """Remove one result from a finished run and check that resubmit restores it.

    After a normal submit/wait cycle, the last result is popped from both the
    aggregator's results and the results file, and the file's summary is
    edited to record it as a missing job. Resubmitting and waiting again must
    yield NUM_COMMANDS successful results.
    """
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    job_results = aggregator.get_results_unsafe()
    assert job_results
    assert all(entry.return_code == 0 for entry in job_results)

    # Drop the last result from the aggregator's records.
    job_results.pop()
    aggregator._write_results(job_results)

    # Apply the same removal to the results file and mark the job as missing.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    report = load_data(results_path)
    dropped = report["results"].pop()
    counts = report["results_summary"]
    counts["num_missing"] = 1
    counts["num_successful"] -= 1
    report["missing_jobs"] = [dropped["name"]]
    dump_data(report, results_path)

    before = ResultsSummary(OUTPUT)
    assert len(before.get_failed_results()) == 0
    assert len(before.get_successful_results()) == NUM_COMMANDS - 1

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    after = ResultsSummary(OUTPUT)
    assert len(after.get_successful_results()) == NUM_COMMANDS
def test_resubmit_successful(cleanup):
    """Resubmit successful jobs with an edited submission-groups file.

    The saved submission groups are rewritten so that per_node_batch_size
    equals NUM_COMMANDS, the jobs are resubmitted with ``--successful``, and
    the groups are saved again to confirm that the edited value is the one
    recorded in the output directory.
    """

    def assert_all_successful():
        # Every job must be reported as successful, none as failed.
        current = ResultsSummary(OUTPUT)
        assert len(current.get_failed_results()) == 0
        assert len(current.get_successful_results()) == NUM_COMMANDS

    def first_group_batch_size():
        return load_data(SG_FILE)[0]["submitter_params"]["per_node_batch_size"]

    check_run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01 -t2")
    assert_all_successful()

    check_run_command(f"jade config save-submission-groups {OUTPUT} -c {SG_FILE}")
    submission_groups = load_data(SG_FILE)
    submitter_params = submission_groups[0]["submitter_params"]
    assert submitter_params["per_node_batch_size"] > NUM_COMMANDS
    submitter_params["per_node_batch_size"] = NUM_COMMANDS
    dump_data(submission_groups, SG_FILE)

    check_run_command(f"{RESUBMIT_JOBS} {OUTPUT} -s {SG_FILE} --successful")
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert_all_successful()

    # Re-export the groups (overwriting SG_FILE) and check the edited value.
    check_run_command(f"jade config save-submission-groups {OUTPUT} --force -c {SG_FILE}")
    assert first_group_batch_size() == NUM_COMMANDS
def test_resubmit_failed(cleanup):
    """Mark the first result as failed and check that resubmit reruns it.

    After a normal submit/wait cycle, the first result is rewritten with
    return code 1 in both the aggregator's results and the results file.
    Resubmitting and waiting again must yield NUM_COMMANDS successful results.
    """
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    job_results = aggregator.get_results_unsafe()
    assert job_results
    assert all(entry.return_code == 0 for entry in job_results)

    # Replace the first result with a copy whose return code is 1.
    original = job_results[0]
    job_results[0] = Result(
        original.name,
        1,
        original.status,
        original.exec_time_s,
        original.completion_time,
    )
    aggregator._write_results(job_results)

    # Apply the same change to the results file and adjust its counters.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    report = load_data(results_path)
    report["results"][0]["return_code"] = 1
    counts = report["results_summary"]
    counts["num_failed"] = 1
    counts["num_successful"] -= 1
    dump_data(report, results_path)

    before = ResultsSummary(OUTPUT)
    assert before.get_failed_results()[0].name == "1"

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    after = ResultsSummary(OUTPUT)
    assert len(after.get_successful_results()) == NUM_COMMANDS
def create_config_from_previous_run(config_file, output, result_type="successful", **kwargs):
    """Build a JobConfiguration containing only jobs with a given prior outcome.

    The configuration in ``config_file`` is loaded, reduced to the jobs whose
    results in ``output`` match ``result_type``, and then round-tripped through
    serialize/deserialize to produce the returned instance.

    Parameters
    ----------
    config_file : str
        location of config
    output : str
        location of previous results
    result_type : str
        type of results to keep: "successful", "failed", or "missing"
    **kwargs
        forwarded to ``deserialize_config`` when building the returned config

    Returns
    -------
    JobConfiguration

    Raises
    ------
    InvalidParameter
        Raised if result_type is not "successful", "failed", or "missing"

    """
    if result_type not in ("successful", "failed", "missing"):
        raise InvalidParameter(f"given result type invalid: {result_type}")

    config = deserialize_config(load_data(config_file))
    summary = ResultsSummary(output)

    # Lazy lookups so that only the query for the requested type is run.
    selectors = {
        "successful": lambda: summary.get_successful_results(),
        "failed": lambda: summary.get_failed_results(),
        "missing": lambda: summary.get_missing_jobs(config.iter_jobs()),
    }
    selected = selectors[result_type]()

    # Results and jobs both expose `.name`, so either can be used for lookup.
    kept_jobs = [config.get_job(item.name) for item in selected]
    config.reconfigure_jobs(kept_jobs)
    return deserialize_config(config.serialize(), **kwargs)
def test_cancel_on_failure_detect_by_submitter(cleanup):
    """Check result counts when cancellation is handled by the submitter.

    Expects 1 successful, 1 failed, and 6 canceled results, both from the
    individual getters and from get_results_by_type().
    """
    # HpcSubmitter handles the cancellation because the blocked job will be in the 2nd batch.
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b2") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    outcome = ResultsSummary(OUTPUT)
    assert len(outcome.get_successful_results()) == 1
    assert len(outcome.get_failed_results()) == 1
    assert len(outcome.get_canceled_results()) == 6

    by_type = outcome.get_results_by_type()
    for kind, expected in (("successful", 1), ("failed", 1), ("canceled", 6)):
        assert len(by_type[kind]) == expected
def test_cancel_on_failure_detect_by_runner(cleanup):
    """Check result counts when cancellation is handled by the job runner.

    Expects 1 successful, 1 failed, and 6 canceled results, both from the
    individual getters and from get_results_by_type().
    """
    # JobRunner handles the cancellation in JobQueue because the blocked job is in the batch
    # along with the blocking job.
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b8") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    outcome = ResultsSummary(OUTPUT)
    assert len(outcome.get_successful_results()) == 1
    assert len(outcome.get_failed_results()) == 1
    assert len(outcome.get_canceled_results()) == 6

    by_type = outcome.get_results_by_type()
    for kind, expected in (("successful", 1), ("failed", 1), ("canceled", 6)):
        assert len(by_type[kind]) == expected
def test_resubmit_with_blocking_jobs(basic_setup):
    """Resubmit a missing job in a config that has blocking dependencies.

    Seven echo jobs are configured with three ``blocked_by`` relations. After
    a successful run, the result named "7" is removed from the aggregator's
    results and from the results file and recorded as missing. The resubmit
    must end with all jobs successful and a second batch config of 3 jobs.
    """
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        f_out.writelines(command + "\n" for command in commands)

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)

    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    # Keys are positions in the job list; values are the blocking job ids.
    blockers_by_index = {3: {5}, 4: {7}, 6: {6}}
    for position, job_param in enumerate(list(inputs.iter_jobs())):
        if position in blockers_by_index:
            job_param.blocked_by = blockers_by_index[position]
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    job_results = aggregator.get_results_unsafe()
    assert job_results
    assert all(entry.return_code == 0 for entry in job_results)

    # Remove the result named "7" from the aggregator's records.
    target_index = next(
        (idx for idx, entry in enumerate(job_results) if entry.name == "7"),
        None,
    )
    assert target_index is not None
    job_results.pop(target_index)
    aggregator._write_results(job_results)

    # Remove the same entry from the results file and record it as missing.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    report = load_data(results_path)
    file_index = next(
        (idx for idx, entry in enumerate(report["results"]) if entry["name"] == "7"),
        None,
    )
    assert file_index is not None
    dropped = report["results"].pop(file_index)
    counts = report["results_summary"]
    counts["num_missing"] = 1
    counts["num_successful"] -= 1
    report["missing_jobs"] = [dropped["name"]]
    dump_data(report, results_path)

    before = ResultsSummary(OUTPUT)
    assert len(before.get_failed_results()) == 0
    assert len(before.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    after = ResultsSummary(OUTPUT)
    assert len(after.get_successful_results()) == num_commands

    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch_jobs = load_data(second_batch_file)["jobs"]
    assert len(second_batch_jobs) == 3