Example #1
def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(
        output_dir, f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)
    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)
    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / hostname)
    return 0
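
The worker above keeps running until a per-job shutdown file appears, so some other process must create that file to stop it. A minimal sketch of the signaling side, assuming _get_shutdown_file simply builds a per-job marker path under output_dir (the file name used here is hypothetical):

from pathlib import Path


def _get_shutdown_file(job_name, output_dir):
    # Hypothetical: per-job marker file polled by run_worker's loop above.
    return Path(output_dir) / f"shutdown__{job_name}"


def signal_worker_shutdown(job_name, output_dir):
    # Creating the file makes run_worker exit its polling loop, stop the
    # Spark worker, and optionally collect the worker logs.
    _get_shutdown_file(job_name, output_dir).touch()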
Example #2
def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir,
                            f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(),
                get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)
    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)

    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret
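
The manager script given in manager_script_and_args is expected to read its inputs from the environment variables set above rather than from command-line options. A minimal sketch of such a script (the dispatch logic here is hypothetical, not part of JADE):

import os


def main():
    output_dir = os.environ["JADE_OUTPUT_DIR"]
    hostnames = os.environ["JADE_COMPUTE_NODE_NAMES"].split()
    # Fan work out to the allocated compute nodes, e.g. via ssh or MPI.
    for hostname in hostnames:
        print(f"would dispatch work to {hostname}; results go under {output_dir}")


if __name__ == "__main__":
    main()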
Example #3
def test_submission_groups_mixed_hpc_types(cleanup):
    config = create_config()
    config.submission_groups[
        0].submitter_params.hpc_config.hpc_type = HpcType.SLURM
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    assert run_command(cmd) != 0
Example #4
def test_run_command__stdout():
    """Should run a command as a subprocess"""
    command = "echo 'Hello Disco'"
    output = {}
    ret = run_command(command, output)
    assert ret == 0
    assert "stdout" in output
    assert "Hello Disco" in output["stdout"]
Example #5
def test_run_command():
    """Should run a command as a subprocess"""
    command = "ls -l /dirnotexist"
    output = {}
    ret = run_command(command, output)
    assert ret != 0
    assert "stderr" in output
    assert "No such file or directory" in output["stderr"]
Example #6
def test_run_command_skip_retries():
    """Should run a command as a subprocess"""
    command = "jade bad-command"
    output = {}
    errors = ["No such command"]
    # Make sure that we get the expected return.
    ret = run_command(command, output)
    assert ret != 0
    assert "stderr" in output
    assert errors[0] in output["stderr"]

    # Now make it hang if it doesn't skip retries.
    ret = run_command(command,
                      output,
                      error_strings=errors,
                      num_retries=sys.maxsize,
                      retry_delay_s=100000)
    assert ret != 0
    assert "stderr" in output
    assert "No such command" in output["stderr"]
Example #7
    def check_statuses(self):
        field_names = ("jobid", "state")
        cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"

        output = {}
        # Transient failures could be costly. Retry for up to one minute.
        ret = run_command(cmd, output, num_retries=6, retry_delay_s=10)
        if ret != 0:
            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, output["stderr"])
            raise ExecutionError(f"squeue command failed: {ret}")

        return self._get_statuses_from_output(output["stdout"])
Example #8
def test_run_command_with_retries():
    """Test that a retry works."""
    with tempfile.TemporaryDirectory() as tmpdir:
        script = Path(tmpdir) / "read_input.py"
        input_file = Path(tmpdir) / "inputs.txt"
        input_file.write_text("2")
        content = f"""import sys
from pathlib import Path
input_file = Path("{input_file}")
cur_val = int(input_file.read_text())
input_file.write_text(str(cur_val - 1))
sys.exit(cur_val)
"""
        script.write_text(content)
        # The script exits with 2, then 1, then 0 on successive runs, so the
        # command succeeds only on the third attempt (the initial run plus two retries).
        command = f"python {script}"
        ret = run_command(command, num_retries=2, retry_delay_s=0.1)
        assert ret == 0
Example #9
    def check_status(self, name=None, job_id=None):
        field_names = ("jobid", "name", "state")
        cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
        if name is not None:
            cmd += f" -n {name}"
        elif job_id is not None:
            cmd += f" -j {job_id}"
        else:
            # Mutual exclusivity should be handled in HpcManager.
            assert False

        output = {}
        # Transient failures could be costly. Retry for up to one minute.
        errors = ["Invalid job id specified"]
        ret = run_command(cmd,
                          output,
                          num_retries=6,
                          retry_delay_s=10,
                          error_strings=errors)
        if ret != 0:
            if "Invalid job id specified" in output["stderr"]:
                return HpcJobInfo("", "", HpcJobStatus.NONE)

            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, output["stderr"])
            raise ExecutionError(f"squeue command failed: {ret}")

        stdout = output["stdout"]
        logger.debug("squeue output:  [%s]", stdout)
        fields = stdout.split()
        if not fields:
            # No jobs are currently running.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        assert len(fields) == len(field_names)
        job_info = HpcJobInfo(
            fields[0], fields[1],
            self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN))
        return job_info
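
For reference, with --Format "jobid,name,state" and -h (no header), squeue prints one whitespace-separated line per job. A small illustration of how the parsing above handles such a line (the job values are made up):

sample_stdout = "1234567             my_job              RUNNING"
fields = sample_stdout.split()
assert len(fields) == 3          # jobid, name, state
job_id, name, state = fields     # the state string is then looked up in
                                 # _STATUSES, falling back to HpcJobStatus.UNKNOWN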
Example #10
def cancel_jobs(output, complete, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        cluster.demote_from_submitter()
        ret = 0
        if complete:
            delay = 15
            print(f"Delaying {delay} seconds to let the nodes complete.")
            time.sleep(delay)
            ret = run_command(f"jade try-submit-jobs {output}")
        sys.exit(ret)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
Example #11
    def submit(self, filename):
        job_id = None
        output = {}
        # Transient failures could be costly. Retry for up to one minute.
        # TODO: Some errors are not transient. We could detect those and skip the retries.
        ret = run_command("sbatch {}".format(filename),
                          output,
                          num_retries=6,
                          retry_delay_s=10)
        if ret == 0:
            result = Status.GOOD
            stdout = output["stdout"]
            match = self._REGEX_SBATCH_OUTPUT.search(stdout)
            if match:
                job_id = match.group(1)
                result = Status.GOOD
            else:
                logger.error("Failed to interpret sbatch output [%s]", stdout)
                result = Status.ERROR
        else:
            result = Status.ERROR

        return result, job_id, output["stderr"]
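
The class-level regex referenced above is not shown in this snippet. On success, sbatch reports "Submitted batch job <id>", so a pattern along these lines would capture the job id (the exact definition in JADE may differ):

import re

# Assumed definition of the pattern used in submit() above.
_REGEX_SBATCH_OUTPUT = re.compile(r"Submitted batch job (\d+)")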
Example #12
def test_submission_groups_duplicate_name(cleanup):
    config = create_config()
    config.submission_groups[0].name = config.submission_groups[1].name
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    assert run_command(cmd) != 0
Example #13
def _run_cluster_master(job, manager_node, output_dir, verbose,
                        manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(),
                job.name, get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(
        Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require that
    # Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)

    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers.
    # TODO: find a way to check programmatically with the rest api
    # or parse the logs
    time.sleep(15)
    args = list(manager_script_and_args) + [
        _get_cluster(manager_node),
        str(job_output)
    ]
    if job.model.spark_config.run_user_script_inside_container:
        user_cmd = str(job.model.spark_config.get_run_user_script()) + " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)

    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / socket.gethostname())
    return ret
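
The user script launched above receives the Spark cluster address and the job output directory as its last two arguments. A minimal sketch of such a script (the use of pyspark here is illustrative only, not something JADE requires):

import sys

from pyspark.sql import SparkSession


def main():
    # The last two arguments are appended by _run_cluster_master above.
    cluster, job_output = sys.argv[-2], sys.argv[-1]
    spark = SparkSession.builder.master(cluster).appName("example").getOrCreate()
    df = spark.range(1000)
    df.write.mode("overwrite").parquet(f"{job_output}/example_output.parquet")
    spark.stop()


if __name__ == "__main__":
    main()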
Example #14
    def cancel_job(self, job_id):
        return run_command(f"scancel {job_id}")
Example #15
File: job_runner.py  Project: NREL/jade
    def run_jobs(self,
                 distributed_submitter=True,
                 verbose=False,
                 num_processes=None):
        """Run the jobs.

        Parameters
        ----------
        distributed_submitter : bool
            If True, make cluster updates.
        verbose : bool
            If True, enable debug logging.
        num_processes : int
            Number of processes to run in parallel; defaults to the number of CPUs.

        Returns
        -------
        Status

        """
        logger.info("Run jobs.")
        scratch_dir = self._create_local_scratch()
        are_inputs_local = self._intf_type == HpcType.LOCAL

        try:
            config_file = self._config.serialize_for_execution(
                scratch_dir, are_inputs_local)
            jobs = self._generate_jobs(config_file, verbose)

            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            os.environ["JADE_SUBMISSION_GROUP"] = (
                self._config.get_default_submission_group().name
            )
            # node_setup_script and node_shutdown_script are obsolete and will
            # eventually be deleted.
            group = self._config.get_default_submission_group()
            if group.submitter_params.node_setup_script is not None:
                cmd = f"{group.submitter_params.node_setup_script} {config_file} {self._output}"
                check_run_command(cmd)
            elif self._config.node_setup_command is not None:
                check_run_command(self._config.node_setup_command)

            result = self._run_jobs(jobs, num_processes=num_processes)

            if group.submitter_params.node_shutdown_script:
                cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {self._output}"
                ret2 = run_command(cmd)
                if ret2 != 0:
                    logger.error("Failed to run node shutdown script %s: %s",
                                 cmd, ret2)
            elif self._config.node_teardown_command is not None:
                start = time.time()
                ret2 = run_command(self._config.node_teardown_command)
                if ret2 != 0:
                    logger.error(
                        "Failed to run node shutdown script %s: %s",
                        self._config.node_teardown_command,
                        ret2,
                    )
                logger.info("Node teardown script duration = %s seconds",
                            time.time() - start)

            logger.info("Completed %s jobs", len(jobs))
        finally:
            shutil.rmtree(scratch_dir)
            if distributed_submitter and are_inputs_local:
                self._complete_hpc_job()

        return result
Example #16
def test_run_command__on_output():
    """Should run a command as a subprocess"""
    command = "echo 'Hello World'"
    ret = run_command(command)
    assert ret == 0
Example #17
def test_run_command_retries_exhausted():
    """Test retries that never work."""
    command = "ls invalid_test_file"
    ret = run_command(command, num_retries=3, retry_delay_s=0.1)
    assert ret != 0