def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(output_dir, f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)
    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]), workers_dir / hostname)
    return 0

def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir, f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(), get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)
    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)
    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret

def test_submission_groups_mixed_hpc_types(cleanup):
    config = create_config()
    config.submission_groups[0].submitter_params.hpc_config.hpc_type = HpcType.SLURM
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    assert run_command(cmd) != 0

def test_run_command__stdout():
    """Should capture stdout when an output dict is passed."""
    command = "echo 'Hello Disco'"
    output = {}
    ret = run_command(command, output)
    assert ret == 0
    assert "stdout" in output
    assert "Hello Disco" in output["stdout"]

def test_run_command():
    """Should capture stderr and return a nonzero code when the command fails."""
    command = "ls -l /dirnotexist"
    output = {}
    ret = run_command(command, output)
    assert ret != 0
    assert "stderr" in output
    assert "No such file or directory" in output["stderr"]

def test_run_command_skip_retries():
    """Should skip retries when an error string matches stderr."""
    command = "jade bad-command"
    output = {}
    errors = ["No such command"]

    # Make sure that we get the expected return.
    ret = run_command(command, output)
    assert ret != 0
    assert "stderr" in output
    assert errors[0] in output["stderr"]

    # Now make it hang if it doesn't skip retries.
    ret = run_command(command, output, error_strings=errors, num_retries=sys.maxsize, retry_delay_s=100000)
    assert ret != 0
    assert "stderr" in output
    assert "No such command" in output["stderr"]

def check_statuses(self):
    field_names = ("jobid", "state")
    cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
    output = {}
    # Transient failures could be costly. Retry for up to one minute.
    ret = run_command(cmd, output, num_retries=6, retry_delay_s=10)
    if ret != 0:
        logger.error("Failed to run squeue command=[%s] ret=%s err=%s", cmd, ret, output["stderr"])
        raise ExecutionError(f"squeue command failed: {ret}")

    return self._get_statuses_from_output(output["stdout"])

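# The helper _get_statuses_from_output is referenced above but not shown. A minimal sketch
# of what such a parser might do, assuming `squeue --Format "jobid,state" -h` emits
# whitespace-separated jobid/state pairs, one job per line. The function name and return
# type below are illustrative, not the project's actual implementation.
def _get_statuses_from_output_sketch(stdout):
    statuses = {}
    fields = stdout.split()
    # Pair consecutive tokens: a job ID followed by its state string.
    for job_id, state in zip(fields[::2], fields[1::2]):
        statuses[job_id] = state
    return statuses
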
def test_run_command_with_retries():
    """Test that a retry works."""
    with tempfile.TemporaryDirectory() as tmpdir:
        script = Path(tmpdir) / "read_input.py"
        input_file = Path(tmpdir) / "inputs.txt"
        input_file.write_text("2")
        content = f"""import sys
from pathlib import Path

input_file = Path("{input_file}")
cur_val = int(input_file.read_text())
input_file.write_text(str(cur_val - 1))
sys.exit(cur_val)
"""
        script.write_text(content)
        command = f"python {script}"
        ret = run_command(command, num_retries=2, retry_delay_s=0.1)
        assert ret == 0

def check_status(self, name=None, job_id=None):
    field_names = ("jobid", "name", "state")
    cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
    if name is not None:
        cmd += f" -n {name}"
    elif job_id is not None:
        cmd += f" -j {job_id}"
    else:
        # Mutual exclusivity should be handled in HpcManager.
        assert False

    output = {}
    # Transient failures could be costly. Retry for up to one minute.
    errors = ["Invalid job id specified"]
    ret = run_command(cmd, output, num_retries=6, retry_delay_s=10, error_strings=errors)
    if ret != 0:
        if "Invalid job id specified" in output["stderr"]:
            return HpcJobInfo("", "", HpcJobStatus.NONE)
        logger.error("Failed to run squeue command=[%s] ret=%s err=%s", cmd, ret, output["stderr"])
        raise ExecutionError(f"squeue command failed: {ret}")

    stdout = output["stdout"]
    logger.debug("squeue output: [%s]", stdout)
    fields = stdout.split()
    if not fields:
        # No jobs are currently running.
        return HpcJobInfo("", "", HpcJobStatus.NONE)

    assert len(fields) == len(field_names)
    job_info = HpcJobInfo(
        fields[0],
        fields[1],
        self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN),
    )
    return job_info

def cancel_jobs(output, complete, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        cluster.demote_from_submitter()
        ret = 0
        if complete:
            delay = 15
            print(f"Delaying {delay} seconds to let the nodes complete.")
            time.sleep(delay)
            ret = run_command(f"jade try-submit-jobs {output}")
        sys.exit(ret)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)

def submit(self, filename):
    job_id = None
    output = {}
    # Transient failures could be costly. Retry for up to one minute.
    # TODO: Some errors are not transient. We could detect those and skip the retries.
    ret = run_command("sbatch {}".format(filename), output, num_retries=6, retry_delay_s=10)
    if ret == 0:
        result = Status.GOOD
        stdout = output["stdout"]
        match = self._REGEX_SBATCH_OUTPUT.search(stdout)
        if match:
            job_id = match.group(1)
            result = Status.GOOD
        else:
            logger.error("Failed to interpret sbatch output [%s]", stdout)
            result = Status.ERROR
    else:
        result = Status.ERROR

    return result, job_id, output["stderr"]

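# _REGEX_SBATCH_OUTPUT is defined elsewhere in the class. A minimal sketch, assuming the
# standard Slurm acknowledgement line "Submitted batch job <id>"; the pattern below is an
# assumption for illustration, not necessarily the class's exact regex.
import re

_REGEX_SBATCH_OUTPUT = re.compile(r"Submitted batch job (\d+)")

match = _REGEX_SBATCH_OUTPUT.search("Submitted batch job 123456")
assert match is not None
assert match.group(1) == "123456"
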
def test_submission_groups_duplicate_name(cleanup):
    config = create_config()
    config.submission_groups[0].name = config.submission_groups[1].name
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    assert run_command(cmd) != 0

def _run_cluster_master(job, manager_node, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(), job.name, get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require that
    # Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)
    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers.
    # TODO: find a way to check programmatically with the rest api or parse the logs.
    time.sleep(15)

    args = list(manager_script_and_args) + [_get_cluster(manager_node), str(job_output)]
    if job.model.spark_config.run_user_script_inside_container:
        user_cmd = str(job.model.spark_config.get_run_user_script()) + " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)

    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]), workers_dir / socket.gethostname())
    return ret

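# _get_cluster is referenced above but not shown. A minimal sketch of what it might return,
# assuming the manager node hosts the Spark master on Spark's default port (7077). The helper
# name below is hypothetical and only illustrates the expected master-URL shape passed to the
# user script.
def _get_cluster_sketch(manager_node):
    return f"spark://{manager_node}:7077"
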
def cancel_job(self, job_id):
    return run_command(f"scancel {job_id}")

def run_jobs(self, distributed_submitter=True, verbose=False, num_processes=None):
    """Run the jobs.

    Parameters
    ----------
    distributed_submitter : bool
        If True, make cluster updates.
    verbose : bool
        If True, enable debug logging.
    num_processes : int
        Number of processes to run in parallel; defaults to num CPUs.

    Returns
    -------
    Status

    """
    logger.info("Run jobs.")
    scratch_dir = self._create_local_scratch()
    are_inputs_local = self._intf_type == HpcType.LOCAL

    try:
        config_file = self._config.serialize_for_execution(scratch_dir, are_inputs_local)
        jobs = self._generate_jobs(config_file, verbose)
        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        os.environ["JADE_SUBMISSION_GROUP"] = self._config.get_default_submission_group().name

        # Setting node_setup_script and node_shutdown_script is obsolete and will
        # eventually be deleted.
        group = self._config.get_default_submission_group()
        if group.submitter_params.node_setup_script is not None:
            cmd = f"{group.submitter_params.node_setup_script} {config_file} {self._output}"
            check_run_command(cmd)
        elif self._config.node_setup_command is not None:
            check_run_command(self._config.node_setup_command)

        result = self._run_jobs(jobs, num_processes=num_processes)

        if group.submitter_params.node_shutdown_script:
            cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {self._output}"
            ret2 = run_command(cmd)
            if ret2 != 0:
                logger.error("Failed to run node shutdown script %s: %s", cmd, ret2)
        elif self._config.node_teardown_command is not None:
            start = time.time()
            ret2 = run_command(self._config.node_teardown_command)
            if ret2 != 0:
                logger.error(
                    "Failed to run node teardown command %s: %s",
                    self._config.node_teardown_command,
                    ret2,
                )
            logger.info("Node teardown command duration = %s seconds", time.time() - start)

        logger.info("Completed %s jobs", len(jobs))
    finally:
        shutil.rmtree(scratch_dir)

    if distributed_submitter and are_inputs_local:
        self._complete_hpc_job()

    return result

def test_run_command__on_output():
    """Should return 0 when the command succeeds and no output dict is passed."""
    command = "echo 'Hello World'"
    ret = run_command(command)
    assert ret == 0

def test_run_command_retries_exhausted():
    """Test retries that never work."""
    command = "ls invalid_test_file"
    ret = run_command(command, num_retries=3, retry_delay_s=0.1)
    assert ret != 0