Example #1
def _get_manager_node_name(output_dir):
    output = {}
    job_id = os.environ["SLURM_JOB_ID"]  # TODO: needs to be agnostic to HPC type
    check_run_command(f"jade cluster manager-node {output_dir} {job_id}",
                      output)
    return output["stdout"].strip()
Example #2
def _run_manager(job_name, output_dir, verbose, manager_script_and_args):
    filename = os.path.join(output_dir,
                            f"run_multi_node_job_manager__{job_name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run manager on %s: %s", socket.gethostname(),
                get_cli_string())

    # Note that the manager receives its own hostname.
    output = {}
    check_run_command(f"jade cluster hostnames {output_dir}", output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Manager found %s hostnames: %s", len(hostnames), hostnames)
    cmd = " ".join(manager_script_and_args)
    logger.info("Run manager script [%s]", cmd)

    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    start = time.time()
    ret = run_command(cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)
    return ret
Example #3
def test_submission_groups_per_node_setup(cleanup):
    config = create_config()
    config.submission_groups[1].submitter_params.node_setup_script = "node.sh"
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    check_run_command(cmd)
    config = create_config_from_file(Path(OUTPUT) / "config_batch_2.json")
    group = config.get_default_submission_group()
    assert group.submitter_params.node_setup_script == "node.sh"
Example #4
def _set_hostnames(output_dir):
    output = {}
    job_id = os.environ["SLURM_JOB_ID"]  # TODO: needs to be agnostic to HPC type
    check_run_command(f"jade cluster hostnames -j {job_id} {output_dir}",
                      output)
    hostnames = [x for x in output["stdout"].split() if x != ""]
    logger.info("Found %s hostnames: %s", len(hostnames), hostnames)
    os.environ["JADE_OUTPUT_DIR"] = output_dir
    os.environ["JADE_COMPUTE_NODE_NAMES"] = " ".join(hostnames)
    return hostnames
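
The manager script launched later reads these environment variables back. A hypothetical consumer is sketched below; the variable names are exactly the ones exported above, everything else is illustrative.

import os


def load_compute_nodes():
    """Read the node list exported by _set_hostnames (illustrative helper)."""
    output_dir = os.environ["JADE_OUTPUT_DIR"]
    hostnames = os.environ["JADE_COMPUTE_NODE_NAMES"].split()
    return output_dir, hostnames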
Example #5
def _get_tmpfs_size_gb():
    output = {}
    check_run_command("df -h", output=output)
    # Output looks like this:
    # Filesystem                                  Size  Used Avail Use% Mounted on
    # tmpfs                                       378G  4.0K  378G   1% /dev/shm
    for line in output["stdout"].splitlines():
        if line.endswith(TMPFS_MOUNT):
            return _parse_tmpfs_size_str(line)
    raise Exception(
        f"Did not find {TMPFS_MOUNT} in 'df -h' output: {output['stdout']}")
Example #6
def test_submission_groups_per_node_setup(cleanup):
    # TODO: this test is no longer in the right place. Belongs in file testing job_config.
    config = create_config()
    config.node_setup_command = "node_setup.sh"
    config.node_teardown_command = "node_teardown.sh"
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} --dry-run"
    check_run_command(cmd)
    config = create_config_from_file(Path(OUTPUT) / "config_batch_2.json")
    assert config.node_setup_command == "node_setup.sh"
    assert config.node_teardown_command == "node_teardown.sh"
Example #7
def test_job_order(generic_command_fixture):
    num_jobs = 50
    commands = ["echo hello world"] * num_jobs

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == num_jobs
    job = config.get_job("1")
    for i in range(10, 15):
        job.blocked_by.add(i)

    config.get_job("2").blocked_by.add("1")
    config.get_job("21").blocked_by.add("30")
    config.get_job("41").blocked_by.add("50")
    config.dump(CONFIG_FILE)

    cmd = (f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} "
           "--per-node-batch-size=10 "
           "--max-nodes=4 "
           "--poll-interval=0.1 "
           f"--hpc-config {FAKE_HPC_CONFIG} "
           "--num-processes=10")
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    result_summary = ResultsSummary(OUTPUT)
    results = result_summary.list_results()
    assert len(results) == num_jobs
    tracker = {x.name: x for x in results}

    for i in range(10, 15):
        assert tracker["1"].completion_time > tracker[str(i)].completion_time

    assert tracker["2"].completion_time > tracker["1"].completion_time
    assert tracker["21"].completion_time > tracker["30"].completion_time
    assert tracker["41"].completion_time > tracker["50"].completion_time

    # Verify that stats are summarized correctly with aggregation mode.
    stats_text = Path(OUTPUT) / "stats.txt"
    assert stats_text.exists()
    assert "Average" in stats_text.read_text()
    stats_json = Path(OUTPUT) / "stats_summary.json"
    assert stats_json.exists()
    stats = load_data(stats_json)
    assert stats
    assert "batch" in stats[0]
Example #8
def run_worker(job, manager_node, output_dir, verbose, poll_interval=60):
    """Run a worker instance."""
    logger.error("in worker manager_node=%s job=%s", manager_node, job.name)
    hostname = socket.gethostname()
    filename = os.path.join(
        output_dir, f"run_spark_job_worker__{hostname}__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run worker: %s", get_cli_string())

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())

    # Give the master a head start.
    time.sleep(10)
    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    logs_dir = job_output / "spark" / "logs"
    job_conf_dir = job_output / "spark" / "conf"
    workers_dir = job_output / "spark" / "workers"
    _set_env_variables(job, job_conf_dir, logs_dir)
    worker_memory = _get_worker_memory_str(job, is_master=False)
    cmd = _get_worker_command(job, manager_node, worker_memory)
    ret = 1
    output = {}
    for _ in range(5):
        output.clear()
        logger.info("Run spark worker: [%s]", cmd)
        ret = run_command(cmd, output=output)
        if ret == 0:
            break
    if ret != 0:
        logger.error("Failed to start spark worker: %s: %s", ret, output)

    shutdown_file = _get_shutdown_file(job.name, output_dir)
    while not shutdown_file.exists():
        logger.debug("sleep for %s seconds", poll_interval)
        time.sleep(poll_interval)

    logger.info("Detected shutdown.")
    check_run_command(job.model.spark_config.get_stop_worker())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / hostname)
    return 0
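
The worker polls for a sentinel file that the master creates when the user script finishes. _get_shutdown_file is not shown; a minimal sketch of the helper and the master-side signal, assuming a simple touch-file convention, could be:

from pathlib import Path


def _get_shutdown_file(job_name, output_dir):
    """Path of the sentinel file the worker polls (assumed naming convention)."""
    return Path(output_dir) / f"shutdown__{job_name}"


def signal_shutdown(job_name, output_dir):
    """Master side: tell the workers to stop (illustrative only)."""
    _get_shutdown_file(job_name, output_dir).touch()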
Example #9
def run_multi_node_job(job_name, jade_runtime_output, verbose, manager_script_and_args):
    """Run a job across multiple nodes. The manager node will invoke manager_script_and_args."""
    output = {}
    check_run_command(f"jade cluster am-i-manager {jade_runtime_output}", output)
    result = output["stdout"].strip()
    if result == "true":
        ret = run_manager(job_name, jade_runtime_output, verbose, manager_script_and_args)
    else:
        assert result == "false", result
        # The only purpose of this worker function is to keep the node allocation
        # alive. There are more efficient ways of doing this with HPC commands.
        # However, this procedure allows us to run the JADE JobRunner in the
        # background on each node and collect resource utilization statistics.
        ret = run_worker(job_name, jade_runtime_output, verbose)

    return ret
Example #10
def test_submission_groups(cleanup):
    config = create_config()
    config.dump(CONFIG_FILE)

    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -h {FAKE_HPC_CONFIG} -p 0.1"
    check_run_command(cmd)

    output_path = Path(OUTPUT)
    config_batch_files = list(output_path.glob("config_batch*.json"))
    assert len(config_batch_files) == 3
    batch1 = load_data(output_path / "config_batch_1.json")
    assert len(batch1["jobs"]) == 3
    batch2 = load_data(output_path / "config_batch_2.json")
    assert len(batch2["jobs"]) == 1
    assert batch2["jobs"][0]["job_id"] == 4
    batch3 = load_data(output_path / "config_batch_3.json")
    assert len(batch3["jobs"]) == 1
    assert batch3["jobs"][0]["job_id"] == 5
Example #11
def test_estimated_run_time(cleanup):
    # walltime is 240 minutes
    # 10-minute jobs
    # Each of the 4 cores can complete 24 jobs. 4 * 24 = 96 jobs
    # 100 jobs will take two batches.
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -t -n2 -q4"
    check_run_command(cmd)
    check_run_command(f"{WAIT} --output={OUTPUT} -p 0.01")

    batch_config_1 = Path(OUTPUT) / "config_batch_1.json"
    assert os.path.exists(batch_config_1)
    batch_config_2 = Path(OUTPUT) / "config_batch_2.json"
    assert os.path.exists(batch_config_2)

    config1 = load_data(batch_config_1)
    assert len(config1["jobs"]) == 96
    config2 = load_data(batch_config_2)
    assert len(config2["jobs"]) == 4
Example #12
File: slurm_manager.py  Project: jgu2/jade
    def list_active_nodes(self, job_id):
        out1 = {}
        # It's possible that 500 characters won't be enough, even with the compact format.
        # Compare the node count against the result to make sure we got all nodes.
        # There should be a better way to get this.
        check_run_command(f'squeue -j {job_id} --format="%5D %500N" -h', out1)
        result = out1["stdout"].strip().split()
        assert len(result) == 2, str(result)
        num_nodes = int(result[0])
        nodes_compact = result[1]
        out2 = {}
        check_run_command(f'scontrol show hostnames "{nodes_compact}"', out2)
        nodes = [x for x in out2["stdout"].split("\n") if x != ""]
        if len(nodes) != num_nodes:
            raise Exception(
                f"Bug in parsing node names. Found={len(nodes)} Actual={num_nodes}"
            )
        return nodes
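
In squeue's format language %D is the allocated node count and %N the compact node list, so the first command prints something like "3 r104u[21-23]" for a three-node job, and scontrol show hostnames expands that to one hostname per line. The parsing step can be exercised without SLURM by feeding it canned output; the hostnames below are hypothetical.

def _parse_squeue_node_summary(stdout):
    """Split 'num_nodes nodes_compact' the same way list_active_nodes does (sketch)."""
    tokens = stdout.strip().split()
    assert len(tokens) == 2, str(tokens)
    return int(tokens[0]), tokens[1]


# Hypothetical squeue output for a 3-node allocation.
assert _parse_squeue_node_summary("3     r104u[21-23]\n") == (3, "r104u[21-23]")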
Example #13
def run_spark_cluster(job_name, jade_runtime_output, verbose,
                      manager_script_and_args):
    """Create a Spark cluster across multiple nodes. The manager node will invoke the script."""
    config = create_config_from_file(Path(jade_runtime_output) / CONFIG_FILE)
    job = config.get_job(job_name)
    _set_hostnames(jade_runtime_output)
    output = {}
    check_run_command(f"jade cluster am-i-manager {jade_runtime_output}",
                      output)
    result = output["stdout"].strip()
    manager_node = _get_manager_node_name(jade_runtime_output)
    if result == "true":
        ret = run_cluster_master(job, manager_node, jade_runtime_output,
                                 verbose, manager_script_and_args)
    else:
        assert result == "false", result
        ret = run_worker(job, manager_node, jade_runtime_output, verbose)

    return ret
Example #14
    def get_job_stats(self, job_id):
        cmd = (
            f"sacct -j {job_id} --format=JobID,JobName%20,state,start,end,Account,Partition%15,QOS"
        )
        output = {}
        check_run_command(cmd, output=output)
        result = output["stdout"].strip().split("\n")
        if len(result) != 6:
            raise Exception(
                f"Unknown output for sacct: {result} length={len(result)}")

        # 8165902       COMPLETED 2022-01-16T12:10:37 2022-01-17T04:04:34
        fields = result[2].split()
        if fields[0] != job_id:
            raise Exception(f"sacct returned unexpected job_id={fields[0]}")

        state = self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN)
        fmt = "%Y-%m-%dT%H:%M:%S"
        try:
            start = datetime.strptime(fields[3], fmt)
        except ValueError:
            logger.exception("Failed to parse start_time=%s", fields[3])
            raise
        try:
            if fields[4] == "Unknown":
                end = fields[4]
            else:
                end = datetime.strptime(fields[4], fmt)
        except ValueError:
            logger.exception("Failed to parse end_time=%s", fields[4])
            raise
        stats = HpcJobStats(
            hpc_job_id=job_id,
            name=fields[1],
            state=state,
            start=start,
            end=end,
            account=fields[5],
            partition=fields[6],
            qos=fields[7],
        )
        return stats
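
HpcJobStats and the _STATUSES mapping come from JADE and are not shown. A stand-in consistent with the fields used above is sketched here; apart from UNKNOWN, which the code references, the member and field names are assumptions.

import enum
from dataclasses import dataclass
from datetime import datetime
from typing import Union


class HpcJobStatus(enum.Enum):
    """Assumed stand-in; only UNKNOWN is confirmed by the code above."""
    UNKNOWN = "unknown"
    QUEUED = "queued"
    RUNNING = "running"
    COMPLETE = "complete"


@dataclass
class HpcJobStats:
    """Field names match the constructor call above; the definition itself is assumed."""
    hpc_job_id: str
    name: str
    state: HpcJobStatus
    start: datetime
    end: Union[datetime, str]  # "Unknown" is passed through when sacct has no end time
    account: str
    partition: str
    qos: str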
Example #15
def test_run_generic_commands(generic_command_fixture):
    commands = [
        "ls .",
        "ls invalid-file-path",
    ]

    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration()
    for job_param in inputs.iter_jobs():
        config.add_job(job_param)
    assert config.get_num_jobs() == 2

    config.dump(CONFIG_FILE)

    cmds = (
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -h {FAKE_HPC_CONFIG}",
        # Test with higher queue depth. This exercises the code paths but
        # doesn't actually verify the functionality.
        # The infrastructure to do that is currently lacking. TODO
        f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT} -p 0.1 -q 32 -h {FAKE_HPC_CONFIG}",
    )

    for cmd in cmds:
        check_run_command(cmd)
        check_run_command(f"{WAIT} --output={OUTPUT} --poll-interval=0.01")

    assert list(Path(OUTPUT).glob("*.sh"))
    check_run_command(f"jade prune-files {OUTPUT}")
    assert not list(Path(OUTPUT).glob("*.sh"))
Example #16
File: job_runner.py  Project: NREL/jade
    def run_jobs(self,
                 distributed_submitter=True,
                 verbose=False,
                 num_processes=None):
        """Run the jobs.

        Parameters
        ----------
        distributed_submitter : bool
            If True, make cluster updates.
        verbose : bool
            If True, enable debug logging.
        num_processes : int
            Number of processes to run in parallel; defaults to num CPUs

        Returns
        -------
        Status

        """
        logger.info("Run jobs.")
        scratch_dir = self._create_local_scratch()
        are_inputs_local = self._intf_type == HpcType.LOCAL

        try:
            config_file = self._config.serialize_for_execution(
                scratch_dir, are_inputs_local)
            jobs = self._generate_jobs(config_file, verbose)

            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            group = self._config.get_default_submission_group()
            os.environ["JADE_SUBMISSION_GROUP"] = group.name
            # node_setup_script and node_shutdown_script are obsolete and will
            # eventually be deleted.
            if group.submitter_params.node_setup_script is not None:
                cmd = f"{group.submitter_params.node_setup_script} {config_file} {self._output}"
                check_run_command(cmd)
            elif self._config.node_setup_command is not None:
                check_run_command(self._config.node_setup_command)

            result = self._run_jobs(jobs, num_processes=num_processes)

            if group.submitter_params.node_shutdown_script:
                cmd = f"{group.submitter_params.node_shutdown_script} {config_file} {self._output}"
                ret2 = run_command(cmd)
                if ret2 != 0:
                    logger.error("Failed to run node shutdown script %s: %s",
                                 cmd, ret2)
            elif self._config.node_teardown_command is not None:
                start = time.time()
                ret2 = run_command(self._config.node_teardown_command)
                if ret2 != 0:
                    logger.error(
                        "Failed to run node teardown command %s: %s",
                        self._config.node_teardown_command,
                        ret2,
                    )
                logger.info("Node teardown script duration = %s seconds",
                            time.time() - start)

            logger.info("Completed %s jobs", len(jobs))
        finally:
            shutil.rmtree(scratch_dir)
            if distributed_submitter and are_inputs_local:
                self._complete_hpc_job()

        return result
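
A node_setup_script is invoked with the serialized config file and the output directory as arguments, and it can also read the environment variables exported above. The script below is purely hypothetical (JADE does not ship one); it only illustrates the calling convention.

"""Hypothetical node_setup script, invoked as: node_setup.py <config_file> <output_dir>"""
import os
import sys


def main():
    config_file, output_dir = sys.argv[1], sys.argv[2]
    runtime_output = os.environ.get("JADE_RUNTIME_OUTPUT", output_dir)
    group = os.environ.get("JADE_SUBMISSION_GROUP", "")
    print(f"Setting up node: config={config_file} output={runtime_output} group={group}")
    return 0


if __name__ == "__main__":
    sys.exit(main())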
Example #17
def test_estimated_run_time_too_long(job_too_long):
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    with pytest.raises(ExecutionError):
        check_run_command(cmd)
Example #18
def _run_cluster_master(job, manager_node, output_dir, verbose,
                        manager_script_and_args):
    filename = os.path.join(output_dir, f"run_spark_cluster__{job.name}.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__,
                  filename,
                  file_level=level,
                  console_level=level,
                  mode="w")
    logger.info("Run cluster master on %s job=%s: %s", socket.gethostname(),
                job.name, get_cli_string())

    job_output = Path(output_dir) / JOBS_OUTPUT_DIR / job.name
    if job_output.exists():
        shutil.rmtree(job_output)
    job_output.mkdir(parents=True)
    events_dir = job_output / "spark" / "events"
    events_dir.mkdir(parents=True)
    logs_dir = job_output / "spark" / "logs"
    logs_dir.mkdir()
    workers_dir = job_output / "spark" / "workers"
    workers_dir.mkdir()

    # Make a job-specific conf directory because the log and event files need to be per-job.
    job_conf_dir = job_output / "spark" / "conf"
    shutil.copytree(
        Path(job.model.spark_config.conf_dir) / "conf", job_conf_dir)
    _fix_spark_conf_file(job_conf_dir, events_dir)
    _set_env_variables(job, job_conf_dir, logs_dir)

    # Ignore errors. Spark may not be running.
    run_command(job.model.spark_config.get_stop_worker())
    run_command(job.model.spark_config.get_stop_history_server())
    run_command(job.model.spark_config.get_stop_master())

    # It would be better to start all workers from the master. Doing so would require that
    # Spark processes on the master node be able to ssh into the worker nodes.
    # I haven't spent the time to figure out how to do that inside Singularity containers.
    master_cmd = job.model.spark_config.get_start_master()
    logger.info("Run spark master: [%s]", master_cmd)
    check_run_command(master_cmd)
    history_cmd = job.model.spark_config.get_start_history_server()
    logger.info("Run spark history server: [%s]", history_cmd)
    check_run_command(history_cmd)
    worker_memory = _get_worker_memory_str(job, is_master=True)

    worker_cmd = _get_worker_command(job, manager_node, memory=worker_memory)
    logger.info("Run spark worker: [%s]", worker_cmd)
    check_run_command(worker_cmd)

    # Wait for workers.
    # TODO: find a way to check programmatically with the rest api
    # or parse the logs
    time.sleep(15)
    args = list(manager_script_and_args) + [
        _get_cluster(manager_node),
        str(job_output)
    ]
    if job.model.spark_config.run_user_script_inside_container:
        user_script = str(job.model.spark_config.get_run_user_script())
        user_cmd = user_script + " " + " ".join(args)
    else:
        user_cmd = " ".join(args)
    logger.info("Run user script [%s]", user_cmd)

    start = time.time()
    ret = run_command(user_cmd)
    logger.info("Finished job. duration = %s seconds", time.time() - start)

    # Delay to ensure the history is saved.
    time.sleep(10)
    metrics = SparkMetrics("localhost", history=True)
    try:
        metrics.generate_metrics(job_output / "spark_metrics")
    except Exception:
        logger.exception("Failed to generate metrics")

    check_run_command(job.model.spark_config.get_stop_worker())
    check_run_command(job.model.spark_config.get_stop_history_server())
    check_run_command(job.model.spark_config.get_stop_master())
    if job.model.spark_config.collect_worker_logs:
        shutil.copytree(Path(os.environ["SPARK_WORKER_DIR"]),
                        workers_dir / socket.gethostname())
    return ret
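
_get_cluster is not shown; the user script receives the cluster address and its job output directory as trailing arguments. Assuming the standard Spark master URL scheme and the default master port, the helper might be as simple as:

def _get_cluster(manager_node):
    """Spark master URL for the manager node (assumes the default 7077 port)."""
    return f"spark://{manager_node}:7077"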
Example #19
def test_dry_run(cleanup):
    cmd = f"jade submit-jobs --dry-run -h {SLURM_HPC_CONFIG} {CONFIG_FILE} --output={OUTPUT}"
    check_run_command(cmd)
Example #20
def test_check_run_command():
    """Test that check_run_command raises an exception."""
    with pytest.raises(ExecutionError):
        check_run_command("ls invalid_test_file")