def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir, f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(), get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)

    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)
    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret
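
# Hedged example (not part of the original module): a minimal sketch of a user-supplied
# manager script that consumes the environment variables exported by _run_manager above.
# The function name and behavior are illustrative only; any real manager script is
# user-defined.
def _example_manager_script():
    """Illustrative only: read the variables set by _run_manager."""
    output_dir = os.environ["JADE_OUTPUT_DIR"]
    # Hostnames are space-delimited, matching the join in _run_manager.
    compute_nodes = os.environ["JADE_COMPUTE_NODE_NAMES"].split()
    print(f"Managing {len(compute_nodes)} compute nodes; output in {output_dir}")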
def submit(config_file, output, force, verbose=False):
    """Submit the pipeline for execution."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)

    os.makedirs(output, exist_ok=True)

    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    mgr = PipelineManager.create(config_file, output)
    try:
        mgr.submit_next_stage(1)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    logging.shutdown()
    sys.exit(0)
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes, output,
                poll_interval, num_processes, rotate_logs, verbose, restart_failed,
                restart_missing, reports, try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC."""
    os.makedirs(output, exist_ok=True)

    previous_results = []
    if restart_failed:
        failed_job_config = create_config_from_previous_run(
            config_file, output, result_type="failed")
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        failed_job_config.dump(config_file)

    if restart_missing:
        missing_job_config = create_config_from_previous_run(
            config_file, output, result_type="missing")
        config_file = "missing_job_inputs.json"
        missing_job_config.dump(config_file)
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    event_file = os.path.join(output, "submit_jobs_events.log")
    # This effectively means no console logging.
    setup_logging("event", event_file, console_level=logging.ERROR,
                  file_level=logging.INFO)

    mgr = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    ret = mgr.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )
    sys.exit(ret.value)
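
# Hedged sketch (assumption, not the real jade utility): rotate_filenames is assumed to
# move existing files with the given suffix aside so that a resubmission does not clobber
# logs from earlier runs. The renaming scheme below is illustrative only.
def _example_rotate_filenames(directory, suffix):
    """Illustrative only: rename existing '<name><suffix>' files before resubmitting."""
    for path in Path(directory).glob(f"*{suffix}"):
        timestamp = int(time.time())
        path.rename(path.parent / f"{path.name}.{timestamp}")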
def run_jobs(config_file, distributed_submitter, output, num_processes, verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)

    # Logging has to be enabled after the JobRunner is created because the node ID
    # is what makes the filename unique.
    filename = os.path.join(output, f"run_jobs_batch_{batch_id}_{mgr.node_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_event_logging(mgr.event_filename)
    logger = setup_logging(__name__, filename, file_level=level, console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    group = mgr.config.get_default_submission_group()
    if group.submitter_params.node_setup_script:
        cmd = f"{group.submitter_params.node_setup_script} {config_file} {output}"
        ret = run_command(cmd)
        if ret != 0:
            logger.error("Failed to run node setup script %s: %s", cmd, ret)
            sys.exit(ret)

    status = mgr.run_jobs(distributed_submitter=distributed_submitter, verbose=verbose,
                          num_processes=num_processes)
    ret = status.value

    if group.submitter_params.node_shutdown_script:
        cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {output}"
        ret2 = run_command(cmd)
        if ret2 != 0:
            logger.error("Failed to run node shutdown script %s: %s", cmd, ret2)

    if status == Status.GOOD and distributed_submitter:
        start = time.time()
        _try_submit_jobs(output, verbose=verbose)
        logger.info("try-submit-jobs took %s seconds", time.time() - start)

    sys.exit(ret)
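
# Hedged example (user-supplied, not part of this module): node_setup_script and
# node_shutdown_script are each invoked above as "<script> <config_file> <output>", so a
# script only needs to read those two positional arguments. Everything below is
# illustrative.
def _example_node_setup_script(argv):
    """Illustrative only: entry point for a hypothetical node setup script."""
    config_file, output = argv[1], argv[2]
    print(f"Preparing node for {config_file}; staging data under {output}")
    return 0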
def run_worker(job_name, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    hostname = socket.gethostname()
    filename = os.path.join(output_dir,
                            f"run_multi_node_job_worker__{job_name}__{hostname}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run worker: %s", get_cli_string())

    shutdown_file = _get_shutdown_file(job_name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    return 0
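
# Hedged sketch (assumption, not the real helper): based on its usage above,
# _get_shutdown_file is assumed to return a per-job sentinel path inside output_dir that
# the manager creates to signal workers to exit. The exact filename is illustrative.
def _example_get_shutdown_file(job_name, output_dir):
    """Illustrative only: sentinel file polled by run_worker."""
    return Path(output_dir) / f"shutdown__{job_name}"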
def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(output_dir,
                            f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)

    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]), workers_dir / hostname)
    return 0
def run_jobs(config_file, output, num_processes, verbose):
    """Starts jobs on HPC."""
    match = re.search(r"batch_(\d+)\.json", config_file)
    assert match
    batch_id = match.group(1)
    os.makedirs(output, exist_ok=True)

    filename = os.path.join(output, f"run_jobs_batch_{batch_id}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=logging.ERROR)
    logger.info(get_cli_string())

    mgr = JobRunner(config_file, output=output, batch_id=batch_id)
    ret = mgr.run_jobs(verbose=verbose, num_processes=num_processes)
    sys.exit(ret.value)
def submit(config_file, output, verbose=False):
    """Submit the pipeline for execution."""
    global logger
    os.makedirs(output, exist_ok=True)

    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    logger = setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    mgr = PipelineManager(config_file, output)
    try:
        mgr.submit(verbose=verbose)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    sys.exit(0)
def submit_next_stage(output, stage_num, return_code, verbose=False):
    """Internal command to submit the next stage of the pipeline for execution."""
    filename = os.path.join(output, "pipeline_submit.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")
    logger.info(get_cli_string())

    mgr = PipelineManager.load(output)
    try:
        mgr.submit_next_stage(stage_num, return_code=return_code)
    except Exception:
        logger.exception("Pipeline execution failed")
        raise

    logging.shutdown()
    sys.exit(0)
def _run_cluster_master(job, manager_node, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(), job.name,
                get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be
    # per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require
    # that Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)
    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers to come up.
    # TODO: find a way to check programmatically with the REST API or by parsing the logs.
    time.sleep(15)

    args = list(manager_script_and_args) + [_get_cluster(manager_node), str(job_output)]
    if job.model.spark_config.run_user_script_inside_container:
        user_cmd = str(job.model.spark_config.get_run_user_script()) + " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)
    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / socket.gethostname())
    return ret
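
# Hedged sketch (assumption, not the real helper): _get_cluster is assumed to build the
# Spark master URL appended to the user script's arguments above. 7077 is Spark's default
# master port, but the real helper may use a different port or scheme.
def _example_get_cluster(manager_node):
    """Illustrative only: Spark master URL for the given manager hostname."""
    return f"spark://{manager_node}:7077"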
def submit_jobs(
    config_file=None,
    per_node_batch_size=None,
    dry_run=None,
    force=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    output=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    submitter_params=None,
    no_distributed_submitter=None,
):
    """Submits jobs for execution, locally or on HPC."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)

    if submitter_params is not None:
        params = SubmitterParams(**load_data(submitter_params))
    else:
        params = make_submitter_params(
            per_node_batch_size=per_node_batch_size,
            dry_run=dry_run,
            hpc_config=hpc_config,
            local=local,
            max_nodes=max_nodes,
            poll_interval=poll_interval,
            resource_monitor_interval=resource_monitor_interval,
            resource_monitor_type=resource_monitor_type,
            num_processes=num_processes,
            verbose=verbose,
            reports=reports,
            enable_singularity=enable_singularity,
            container=container,
            try_add_blocked_jobs=try_add_blocked_jobs,
            time_based_batching=time_based_batching,
            node_setup_script=node_setup_script,
            node_shutdown_script=node_shutdown_script,
            no_distributed_submitter=no_distributed_submitter,
        )

    if params.time_based_batching and params.num_processes is None:
        print("Error: num_processes must be set with time-based batching",
              file=sys.stderr)
        sys.exit(1)

    os.makedirs(output)

    filename = os.path.join(output, "submit_jobs.log")
    event_filename = os.path.join(output, "submit_jobs_events.log")
    level = logging.DEBUG if verbose else logging.INFO
    # For some reason event logging must be set up before general logging.
    # Otherwise, the first event doesn't show up in the log.
    setup_event_logging(event_filename)
    logger = setup_logging(__name__, filename, file_level=level, console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    try:
        ret = JobSubmitter.run_submit_jobs(config_file, output, params)
        sys.exit(ret)
    except Exception:
        logger.exception("Failed to run submit_jobs")
        raise
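
# Hedged example (field names are assumptions): submitter_params points at a file whose
# contents are unpacked directly into SubmitterParams. Mirroring the keyword arguments
# accepted by make_submitter_params above, a minimal parameters file might look like the
# dict below; the authoritative schema is defined by SubmitterParams, not by this sketch.
_EXAMPLE_SUBMITTER_PARAMS = {
    "per_node_batch_size": 500,
    "max_nodes": 4,
    "poll_interval": 60,
    "try_add_blocked_jobs": True,
}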
def run(extension, **kwargs):
    """Runs individual job."""
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    # Parse arguments.
    config_file = kwargs["config_file"]
    name = kwargs["name"]
    output = kwargs["output"]
    output_format = kwargs["output_format"]
    verbose = kwargs["verbose"]
    level = logging.DEBUG if verbose else logging.INFO

    # Create directory for the current job.
    job_dir = os.path.join(output, name)
    os.makedirs(job_dir, exist_ok=True)

    # Structured event logging setup.
    event_file = os.path.join(job_dir, "events.log")
    setup_event_logging(event_file)

    # General logging setup.
    log_file = os.path.join(job_dir, "run.log")
    general_logger = setup_logging(
        extension,
        log_file,
        console_level=logging.ERROR,
        file_level=level,
    )
    general_logger.info(get_cli_string())

    # Run the job through the extension's CLI class.
    try:
        cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
        ret = cli.run(config_file, name, output, output_format, verbose)
    except Exception as err:
        msg = f"unexpected exception in run '{extension}' job={name} - {err}"
        general_logger.exception(msg)
        event = StructuredErrorLogEvent(
            source=name,
            category=EVENT_CATEGORY_ERROR,
            name=EVENT_NAME_UNHANDLED_ERROR,
            message=msg,
        )
        log_event(event)
        ret = 1

    if ret == 0:
        try:
            config = load_data(config_file)
            if "job_post_process_config" in config:
                post_process = JobPostProcess(
                    module_name=config["job_post_process_config"]["module"],
                    class_name=config["job_post_process_config"]["class"],
                    data=config["job_post_process_config"]["data"],
                    job_name=name,
                    output=output,
                )
                post_process.run(config_file=config_file, output=output)
        except Exception as err:
            msg = f"unexpected exception in post-process '{extension}' job={name} - {err}"
            general_logger.exception(msg)
            event = StructuredErrorLogEvent(
                source=name,
                category=EVENT_CATEGORY_ERROR,
                name=EVENT_NAME_UNHANDLED_ERROR,
                message=msg,
            )
            log_event(event)
            ret = 1

    sys.exit(ret)
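
# Hedged example (values are illustrative): the post-process hook above expects the job
# config to carry a "job_post_process_config" section with "module", "class", and "data"
# keys, shaped roughly like the dict below. The module path and class name are
# hypothetical placeholders.
_EXAMPLE_JOB_POST_PROCESS_CONFIG = {
    "job_post_process_config": {
        "module": "my_package.post_process",  # hypothetical module path
        "class": "MyPostProcess",              # hypothetical class name
        "data": {"threshold": 0.95},           # arbitrary data passed to the class
    }
}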