def test_resubmit_failed(cleanup):
    """Verify that a job rewritten as failed is resubmitted and succeeds."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    results = aggregator.get_results_unsafe()
    assert results
    assert all(r.return_code == 0 for r in results)

    # Rewrite the first result as a failure so it becomes eligible for resubmission.
    first = results[0]
    results[0] = Result(first.name, 1, first.status, first.exec_time_s, first.completion_time)
    aggregator._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
def test_resubmit_missing(cleanup):
    """Verify that a job recorded as missing is resubmitted and completes."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    results = aggregator.get_results_unsafe()
    assert results
    assert all(r.return_code == 0 for r in results)

    # Drop the last result so the job appears to have never finished.
    results.pop()
    aggregator._write_results(results)

    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert not summary.get_failed_results()
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
def _handle_completion(self, cluster):
    """Finalize a completed submission: verify result totals, write the
    results summary, log resource events, optionally generate reports, and
    kick off the next pipeline stage if one is configured.

    Parameters
    ----------
    cluster
        Cluster tracking this submission; marked complete here.

    Returns
    -------
    Status
        Status.GOOD unless some jobs never produced results.
    """
    result = Status.GOOD
    self._results = ResultsAggregator.list_results(self._output)
    if len(self._results) != self._config.get_num_jobs():
        # Some jobs never reported a result; identify them by name.
        finished_jobs = {x.name for x in self._results}
        all_jobs = {x.name for x in self._config.iter_jobs()}
        missing_jobs = sorted(all_jobs.difference(finished_jobs))
        logger.error(
            "Error in result totals. num_results=%s total_num_jobs=%s",
            len(self._results),
            self._config.get_num_jobs(),
        )
        logger.error(
            "These jobs did not finish: %s. Check for process crashes or HPC timeouts.",
            missing_jobs,
        )
        result = Status.ERROR
    else:
        missing_jobs = []
    self.write_results_summary(RESULTS_FILE, missing_jobs)
    self._log_error_log_messages(self._output)
    # Record how much disk space the top-level output directory consumed.
    bytes_consumed = get_directory_size_bytes(self._output, recursive=False)
    event = StructuredLogEvent(
        source="submitter",
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="main output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)
    event = StructuredLogEvent(
        source="submitter",
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_SUBMIT_COMPLETED,
        message="job submission completed",
        num_jobs=self.get_num_jobs(),
    )
    log_event(event)
    group = self._config.get_default_submission_group()
    if group.submitter_params.generate_reports:
        self.generate_reports(self._output, group.submitter_params.resource_monitor_type)
    cluster.mark_complete()
    if cluster.config.pipeline_stage_num is not None:
        # The pipeline directory must be the one above this one.
        pipeline_dir = os.path.dirname(self._output)
        next_stage = cluster.config.pipeline_stage_num + 1
        # Report this stage's return code so the pipeline can decide how to proceed.
        cmd = (f"jade pipeline submit-next-stage {pipeline_dir} "
               f"--stage-num={next_stage} "
               f"--return-code={result.value}")
        run_command(cmd)
    return result
def test_demo_extension(test_data_dir):
    """Should create a config.json file"""
    config_file = os.path.join(test_data_dir, "config.json")
    if os.path.exists(config_file):
        os.remove(config_file)
    base = os.path.join(JADE_PATH, "extensions", "demo")
    create_demo_config = os.path.join(base, "create_demo_config.sh")
    create_merge_config = os.path.join(base, "create_merge_pred_gdp.py")
    # NOTE(review): config_file is re-pointed at the pipeline config here; the
    # config.json path above was only used for pre-test cleanup.
    config_file = os.path.join(test_data_dir, "pipeline.json")
    output = os.path.join(test_data_dir, "output")
    if os.path.exists(output):
        shutil.rmtree(output)
    try:
        # Create a two-stage pipeline config (-l runs in local mode).
        cmd = (
            f"jade pipeline create {create_demo_config} {create_merge_config} -c {config_file} -l"
        )
        returncode = run_command(cmd=cmd)
        assert returncode == 0
        assert os.path.exists(config_file)
        returncode = run_command(
            f"jade pipeline submit {config_file} -o {output}")
        assert returncode == 0
        output_stage1 = os.path.join(output, "output-stage1")
        output_stage2 = os.path.join(output, "output-stage2")
        assert os.path.exists(output)
        assert os.path.exists(output_stage1)
        assert os.path.exists(output_stage2)
        # Stage 1 should produce per-country result artifacts.
        job_outputs = os.path.join(output_stage1, "job-outputs")
        for country in ("australia", "brazil", "united_states"):
            results = os.listdir(os.path.join(job_outputs, country))
            assert "result.csv" in results
            assert "result.png" in results
            assert "summary.toml" in results
        # Stage 2 merges per-country predictions into one CSV with a column
        # per country plus a year column.
        pred_gdp_file = os.path.join(output_stage2, "pred_gdp.csv")
        assert os.path.exists(pred_gdp_file)
        df = pd.read_csv(pred_gdp_file)
        assert "year" in df.columns
        assert "brazil" in df.columns
        assert "united_states" in df.columns
        assert "australia" in df.columns
    finally:
        # Always clean up generated outputs and configs.
        if os.path.exists(output):
            shutil.rmtree(output)
        if os.path.exists(config_file):
            os.remove(config_file)
        if os.path.exists(PRED_GDP_COMMANDS_FILE):
            os.remove(PRED_GDP_COMMANDS_FILE)
def test_stats__bytes_consumed(example_output):
    """The bytes-consumed command should print a non-empty, parseable value."""
    captured = {}
    assert run_command(f"jade stats bytes-consumed -o {example_output}", captured) == 0
    assert captured["stdout"]
    assert run_command(f"jade stats bytes-consumed --no-human-readable -o {example_output}", captured) == 0
    assert int(captured["stdout"].strip()) > 0
def test_stats__exec_time(example_output):
    """The exec-time command should print a non-empty, parseable value."""
    captured = {}
    assert run_command(f"jade stats exec-time -o {example_output}", captured) == 0
    assert captured["stdout"]
    assert run_command(f"jade stats exec-time --no-human-readable -o {example_output}", captured) == 0
    assert float(captured["stdout"].strip()) > 0
def test_config__show(cleanup):
    """``jade config show`` should list every demo country."""
    assert run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}") == 0
    assert os.path.exists(CONFIG1)
    captured = {}
    assert run_command(f"jade config show {CONFIG1}", output=captured) == 0
    for country in ("australia", "brazil", "united_states"):
        assert country in captured["stdout"]
def test_config__filter_show_only(cleanup):
    """Filtering without an output file should only display the matches."""
    assert run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}") == 0
    assert os.path.exists(CONFIG1)
    captured = {}
    assert run_command(f"jade config filter {CONFIG1} -f country brazil", output=captured) == 0
    # No output file was requested, so nothing should be written to disk.
    assert not os.path.exists(CONFIG2)
    assert "brazil" in captured["stdout"]
def test_config__assign_blocked_by(cleanup):
    """Verify assign-blocked-by wires job dependencies correctly."""
    config = GenericCommandConfiguration()
    base_cmd = "bash my_script.sh"
    regular_job_names = []
    for i in range(1, 4):
        name = f"job_{i}"
        config.add_job(
            GenericCommandParameters(
                command=base_cmd + " " + str(i),
                name=name,
                append_job_name=True,
                append_output_dir=True,
            )
        )
        regular_job_names.append(name)
    post_process_job = GenericCommandParameters(
        command="bash run_post_process.sh",
        name="post_process",
        append_job_name=True,
        append_output_dir=True,
    )
    config.add_job(post_process_job)
    config.dump(CONFIG1, indent=2)

    # No explicit indexes: job 3 becomes blocked by every regular job.
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 3 -o {CONFIG2}") == 0
    assert os.path.exists(CONFIG2)
    assert sorted(load_data(CONFIG2)["jobs"][3]["blocked_by"]) == sorted(regular_job_names)
    os.remove(CONFIG2)

    # Explicit indexes: only jobs 1 and 2 block job 3.
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 3 1 2 -o {CONFIG2}") == 0
    assert os.path.exists(CONFIG2)
    expected = [regular_job_names[1], regular_job_names[2]]
    assert sorted(load_data(CONFIG2)["jobs"][3]["blocked_by"]) == sorted(expected)

    # Include the pp job in blocking-job-indexes.
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 3 1 2 3 -o {CONFIG2}") != 0

    # Invalid job index
    assert run_command(f"jade config assign-blocked-by {CONFIG1} 47 1 2 -o {CONFIG2}") != 0
def test_config__filter_copy(cleanup):
    """Filtering with no criteria should produce an identical copy."""
    assert run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}") == 0
    assert os.path.exists(CONFIG1)
    assert run_command(f"jade config filter {CONFIG1} -o {CONFIG2}") == 0
    assert os.path.exists(CONFIG2)
    assert load_data(CONFIG1) == load_data(CONFIG2)
def test_config__filter_range(cleanup):
    """Filtering by index range should keep only the selected jobs."""
    assert run_command(f"jade auto-config demo tests/data/demo -c {CONFIG1}") == 0
    assert os.path.exists(CONFIG1)
    assert run_command(f"jade config filter {CONFIG1} -o {CONFIG2} 0 1") == 0
    assert os.path.exists(CONFIG2)
    all_jobs = load_data(CONFIG1)["jobs"]
    assert load_data(CONFIG2)["jobs"] == all_jobs[:2]
def test_collect_stats():
    """Collect resource stats briefly and confirm the show command reports them."""
    output_dir = os.path.join(tempfile.gettempdir(), "test-stats-output")
    try:
        assert run_command(f"jade stats collect -i1 -o {output_dir} -d 1 -f") == 0
        captured = {}
        assert run_command(f"jade stats show -o {output_dir} cpu disk mem net", output=captured) == 0
        for term in ("IOPS", "read_bytes", "bytes_recv", "idle"):
            assert term in captured["stdout"]
    finally:
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
def run_jobs(config_file, distributed_submitter, output, num_processes, verbose):
    """Starts jobs on HPC.

    Parameters
    ----------
    config_file : str
        Path to a batch config file named like ``batch_<id>.json``.
    distributed_submitter : bool
        If True, attempt to submit follow-on jobs after a successful run.
    output : str
        Output directory; created if it does not exist.
    num_processes : int
        Number of processes with which to run jobs.
    verbose : bool
        Enable debug-level logging.

    Notes
    -----
    Exits the process with the run status (or the setup script's failure code).
    """
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)
    # Logging has to get enabled after the JobRunner is created because the
    # node ID is what makes the log filename unique.
    filename = os.path.join(output, f"run_jobs_batch_{batch_id}_{mgr.node_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_event_logging(mgr.event_filename)
    logger = setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info(get_cli_string())

    group = mgr.config.get_default_submission_group()
    if group.submitter_params.node_setup_script:
        # Run the optional per-node setup script; abort on failure.
        cmd = f"{group.submitter_params.node_setup_script} {config_file} {output}"
        ret = run_command(cmd)
        if ret != 0:
            logger.error("Failed to run node setup script %s: %s", cmd, ret)
            sys.exit(ret)

    status = mgr.run_jobs(distributed_submitter=distributed_submitter, verbose=verbose,
                          num_processes=num_processes)
    ret = status.value

    if group.submitter_params.node_shutdown_script:
        # Shutdown script failures are logged but do not change the exit code.
        cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {output}"
        ret2 = run_command(cmd)
        if ret2 != 0:
            logger.error("Failed to run node shutdown script %s: %s", cmd, ret2)

    if status == Status.GOOD and distributed_submitter:
        # Give blocked jobs a chance to be submitted from this node.
        start = time.time()
        _try_submit_jobs(output, verbose=verbose)
        logger.info("try-submit-jobs took %s seconds", time.time() - start)

    sys.exit(ret)
def _submit(self, verbose):
    """Submit each pipeline stage sequentially, recording per-stage results.

    Parameters
    ----------
    verbose : bool
        Passed through to each stage's submit command.

    Raises
    ------
    ExecutionError
        If any stage's submit command returns a non-zero exit code.
    """
    for stage in self._stages:
        # Expose the current stage ID to child processes via the environment.
        os.environ["JADE_PIPELINE_STAGE_ID"] = str(self._cur_stage_id)
        stage_info = {
            "stage_id": self._cur_stage_id,
            "output_directory": self.get_stage_output_path(self._output, self._cur_stage_id)
        }
        self._status_info["stages"].append(stage_info)
        self._dump_status()
        self._run_auto_config(stage)
        cmd = self._make_submit_cmd(stage, verbose)
        start = time.time()
        ret = run_command(cmd)
        end = time.time()
        exec_time = end - start
        # Persist the stage result before checking for failure so the status
        # file reflects the outcome even when we raise below.
        result = Result(str(self._cur_stage_id), ret, "finished", exec_time, end)
        self._status_info["stages"][-1]["result"] = serialize_result(result)
        self._dump_status()
        if ret != 0:
            raise ExecutionError(f"stage {self._cur_stage_id} failed")
        self._cur_stage_id += 1
    logger.info("Finished execution pipeline")
def generate_reports(directory):
    """Create reports summarizing the output results of a set of jobs.

    Parameters
    ----------
    directory : str
        output directory

    Returns
    -------
    int
        0 on success, otherwise the failing command's return code.
    """
    commands = (
        (f"jade show-results -o {directory}", "results.txt"),
        (f"jade show-events -o {directory} --categories Error", "errors.txt"),
        (f"jade stats show -o {directory}", "stats.txt"),
    )

    reports = []
    for command, report_name in commands:
        captured = {}
        ret = run_command(command, output=captured)
        if ret != 0:
            return ret
        report_path = os.path.join(directory, report_name)
        # Each report starts with the command that produced it.
        with open(report_path, "w") as f_out:
            f_out.write(command + "\n\n")
            f_out.write(captured["stdout"])
        reports.append(report_path)

    logger.info("Generated reports %s.", " ".join(reports))
    return 0
def check_status(self, name=None, job_id=None):
    """Query squeue for a single job by name or job ID.

    Parameters
    ----------
    name : str, optional
        Job name to query (mutually exclusive with job_id).
    job_id : str, optional
        Job ID to query (mutually exclusive with name).

    Returns
    -------
    HpcJobInfo
        Info for the matching job, or an empty record with status NONE when
        squeue reports no jobs.

    Raises
    ------
    ExecutionError
        If the squeue command fails.
    """
    field_names = ("jobid", "name", "state")
    cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
    if name is not None:
        cmd += f" -n {name}"
    elif job_id is not None:
        cmd += f" -j {job_id}"
    else:
        # Mutual exclusivity should be handled in HpcManager.
        assert False
    output = {}
    ret = run_command(cmd, output)
    if ret != 0:
        logger.error("Failed to run squeue command=[%s] ret=%s err=%s", cmd, ret, output["stderr"])
        raise ExecutionError(f"squeue command failed: {ret}")
    stdout = output["stdout"]
    logger.debug("squeue output: [%s]", stdout)
    fields = stdout.split()
    if not fields:
        # No jobs are currently running.
        return HpcJobInfo("", "", HpcJobStatus.NONE)
    # Exactly one row (jobid, name, state) is expected for a single job query.
    assert len(fields) == len(field_names)
    # Unknown state strings map to HpcJobStatus.UNKNOWN.
    job_info = HpcJobInfo(fields[0], fields[1], self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN))
    return job_info
def test_cancel_on_failure_detect_by_submitter(cleanup):
    # HpcSubmitter handles the cancellation because the blocked job will be in the 2nd batch.
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b2") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    summary = ResultsSummary(OUTPUT)
    expected_counts = {"successful": 1, "failed": 1, "canceled": 6}
    assert len(summary.get_successful_results()) == expected_counts["successful"]
    assert len(summary.get_failed_results()) == expected_counts["failed"]
    assert len(summary.get_canceled_results()) == expected_counts["canceled"]
    by_type = summary.get_results_by_type()
    for kind, count in expected_counts.items():
        assert len(by_type[kind]) == count
def test_run_generic_commands(generic_command_fixture):
    """Submit a small set of generic commands with default and high queue depth."""
    commands = [
        "ls .",
        "ls invalid-file-path",
    ]
    with open(TEST_FILENAME, "w") as f_out:
        f_out.write("\n".join(commands) + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2
    config.dump(CONFIG_FILE)

    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1",
        # Test with higher queue depth. This exercises the code paths but
        # doesn't actually verify the functionality.
        # The infrastructure to do that is currently lacking. TODO
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32",
    )
    for cmd in cmds:
        assert run_command(cmd) == 0
def test_run_command__stdout():
    """Should run a command as a subprocess"""
    captured = {}
    assert run_command("echo 'Hello Disco'", captured) == 0
    assert "stdout" in captured
    assert "Hello Disco" in captured["stdout"]
def test_cancel_on_failure_detect_by_runner(cleanup):
    # JobRunner handles the cancellation in JobQueue because the blocked job is in the batch
    # along with the blocking job.
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -n2 -b8") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    summary = ResultsSummary(OUTPUT)
    expected_counts = {"successful": 1, "failed": 1, "canceled": 6}
    assert len(summary.get_successful_results()) == expected_counts["successful"]
    assert len(summary.get_failed_results()) == expected_counts["failed"]
    assert len(summary.get_canceled_results()) == expected_counts["canceled"]
    by_type = summary.get_results_by_type()
    for kind, count in expected_counts.items():
        assert len(by_type[kind]) == count
def test_stats__show(example_output):
    """The show command should print each resource stat section."""
    captured = {}
    assert run_command(f"jade stats show -o {example_output}", captured) == 0
    assert captured["stdout"]
    for text in ("Network stat", "Memory stat", "Disk stat", "CPU stat"):
        assert text in captured["stdout"]
def _try_submit_jobs(output, verbose):
    """Invoke ``jade try-submit-jobs`` on the output directory, logging failures."""
    try_submit_cmd = f"jade try-submit-jobs {output}"
    if verbose:
        try_submit_cmd += " --verbose"
    if run_command(try_submit_cmd) != 0:
        # Best-effort: a failure here is logged but not propagated.
        logging.getLogger(__name__).error(
            "Failed to run '%s' ret=%s", try_submit_cmd, run_command(try_submit_cmd)
        )
def test_run_command():
    """Should run a command as a subprocess"""
    captured = {}
    assert run_command("ls -l /dirnotexit", captured) != 0
    assert "stderr" in captured
    assert "No such file or directory" in captured["stderr"]
def _qstat(self):
    """Run the PBS qstat command and return the stdout split to rows.

    Returns
    -------
    qstat_rows : list | None
        List of strings where each string is a row in the qstat printout.
        Returns None if qstat is empty.
    """
    output = {}
    run_command(f"qstat -u {self.USER}", output)
    stdout = output["stdout"]
    if not stdout:
        # No jobs are currently running.
        return None
    return stdout.split("\n")
def submit(self, filename):
    """Submit the script via qsub and return (Status, job_id, stderr)."""
    output = {}
    ret = run_command(f"qsub {filename}", output)
    if ret == 0:
        # qsub prints the new job ID on stdout.
        return Status.GOOD, output["stdout"], output["stderr"]
    return Status.ERROR, None, output["stderr"]
def test_stats__plot(example_output):
    """Plotting should emit one HTML viewer file per resource type."""
    path = os.path.join(example_output, "stats")
    try:
        assert run_command(f"jade stats plot -o {example_output}") == 0
        for stat in ("Cpu", "Disk", "Memory", "Network"):
            viewer = os.path.join(path, stat + "StatsViewer__resource_monitor_batch_0.html")
            assert os.path.exists(viewer)
    finally:
        if os.path.exists(path):
            shutil.rmtree(path)
def main():
    """Write the command that merges GDP predictions from the previous stage,
    then build the stage-2 config from it."""
    status = load_data(os.environ["JADE_PIPELINE_STATUS_FILE"])
    cur_stage_output = status["stages"][-1]["output_directory"]
    previous_stage_output = status["stages"][-2]["output_directory"]
    script = "jade/extensions/demo/merge_pred_gdp.py"
    with open(PRED_GDP_COMMANDS_FILE, "w") as f_out:
        f_out.write(f"python {script} run {previous_stage_output} {cur_stage_output}" + "\n")
    sys.exit(run_command("jade config create pred_gdp_commands.txt -c config-stage2.json"))
def _run_command(self, cmd):
    """Run ``cmd`` from within ``self._path`` and return its stripped stdout.

    Raises
    ------
    ExecutionError
        If the command exits non-zero.
    """
    saved_cwd = os.getcwd()
    os.chdir(self._path)
    try:
        captured = {}
        returncode = run_command(cmd, output=captured)
        if returncode != 0:
            raise ExecutionError(f"[{cmd}] failed: {returncode}: {captured['stderr']}")
        return captured["stdout"].strip()
    finally:
        # Always restore the working directory, even on failure.
        os.chdir(saved_cwd)
def main():
    """Write the command that merges GDP predictions from the previous stage,
    then build the stage-2 config from it."""
    config = PipelineConfig(**load_data(os.environ["JADE_PIPELINE_STATUS_FILE"]))
    cur_stage_output = config.stages[-1].path
    previous_stage_output = config.stages[-2].path
    script = "jade/extensions/demo/merge_pred_gdp.py"
    with open(PRED_GDP_COMMANDS_FILE, "w") as f_out:
        f_out.write(f"python {script} run {previous_stage_output} {cur_stage_output}" + "\n")
    sys.exit(run_command("jade config create pred_gdp_commands.txt -c config-stage2.json"))
def test_try_add_blocked_jobs(cleanup):
    """Verify that --try-add-blocked-jobs packs the blocked job into the same
    batch while --no-try-add-blocked-jobs forces a second submission.

    Fix: removed the unused local ``events_file`` (assigned but never read).
    """
    num_commands = 5
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    for i, job_param in enumerate(jobs):
        if i == num_commands - 1:
            # The last job is blocked by all of the others.
            job_param.blocked_by = set([1, 2, 3, 4])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    for option in ("--try-add-blocked-jobs", "--no-try-add-blocked-jobs"):
        cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1 {option}"
        ret = run_command(cmd)
        assert ret == 0
        ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
        assert ret == 0
        events_summary = EventsSummary(OUTPUT, preload=True)
        submit_events = events_summary.list_events(EVENT_NAME_HPC_SUBMIT)
        if option == "--try-add-blocked-jobs":
            # All jobs fit in one batch because the blocked job is added eagerly.
            assert len(submit_events) == 1
            event = submit_events[0]
            assert event.data["batch_size"] == num_commands
            shutil.rmtree(OUTPUT)
        else:
            # The blocked job could not join the first batch, so a second
            # submission of size 1 is required.
            assert len(submit_events) == 2
            event1 = submit_events[0]
            event2 = submit_events[1]
            assert event1.data["batch_size"] == num_commands - 1
            assert event2.data["batch_size"] == 1