Example #1
def test_slurm_max_array_size():
    max_array_size = 2

    executor = cluster_tools.get_executor("slurm", debug=True)
    original_max_array_size = executor.get_max_array_size()

    command = f"MaxArraySize={max_array_size}"
    _, _, exit_code = call(
        f"echo -e '{command}' >> /etc/slurm/slurm.conf && scontrol reconfigure"
    )

    try:
        assert exit_code == 0

        new_max_array_size = executor.get_max_array_size()
        assert new_max_array_size == max_array_size

        with executor:
            futures = executor.map_to_futures(square, range(6))
            concurrent.futures.wait(futures)
            job_ids = [fut.cluster_jobid for fut in futures]

            # Count how often each job_id occurs; that count corresponds to the array size of the job
            occurrences = list(Counter(job_ids).values())

            assert all(array_size <= max_array_size
                       for array_size in occurrences)
    finally:
        _, _, exit_code = call(
            f"sed -i 's/{command}//g' /etc/slurm/slurm.conf && scontrol reconfigure"
        )
        assert exit_code == 0
        reset_max_array_size = executor.get_max_array_size()
        assert reset_max_array_size == original_max_array_size
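The `Counter` check in the test above can also be run in isolation. The job ids below are made up for illustration; the point is only that each distinct `cluster_jobid` may occur at most `MaxArraySize` times:

from collections import Counter

# With MaxArraySize=2, the six tasks should be split into three array jobs,
# so each (hypothetical) job id appears at most twice.
job_ids = ["100", "100", "101", "101", "102", "102"]
occurrences = list(Counter(job_ids).values())
assert all(array_size <= 2 for array_size in occurrences)
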
Example #2
def test_slurm_max_submit_user():
    max_submit_jobs = 6

    # MaxSubmitJobs can either be defined at the user or at the qos level
    for command in ["user root", "qos normal"]:
        executor = cluster_tools.get_executor("slurm", debug=True)
        original_max_submit_jobs = executor.get_max_submit_jobs()

        _, _, exit_code = call(
            f"echo y | sacctmgr modify {command} set MaxSubmitJobs={max_submit_jobs}"
        )
        try:
            assert exit_code == 0

            new_max_submit_jobs = executor.get_max_submit_jobs()
            assert new_max_submit_jobs == max_submit_jobs

            with executor:
                futures = executor.map_to_futures(square, range(10))

                result = [fut.result() for fut in futures]
                assert result == [i**2 for i in range(10)]

                job_ids = {fut.cluster_jobid for fut in futures}
                # The 10 work packages should have been scheduled as 5 separate jobs,
                # because cluster_tools schedules at most 1/3 of MaxSubmitJobs at once.
                assert len(job_ids) == 5
        finally:
            _, _, exit_code = call(
                f"echo y | sacctmgr modify {command} set MaxSubmitJobs=-1")
            assert exit_code == 0
            reset_max_submit_jobs = executor.get_max_submit_jobs()
            assert reset_max_submit_jobs == original_max_submit_jobs
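The "5 separate jobs" assertion above follows from simple arithmetic. A self-contained illustration of the batching rule stated in the comment (the 1/3 factor is taken from that comment, not verified here):

import math

# cluster_tools is said to schedule at most one third of MaxSubmitJobs at once.
max_submit_jobs = 6
work_packages = 10
batch_size = max_submit_jobs // 3                  # 2 futures per submitted job
assert math.ceil(work_packages / batch_size) == 5  # hence 5 separate job ids
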
Example #3
def test_slurm_deferred_submit():
    max_submit_jobs = 1

    # Only one job can be scheduled at a time
    _, _, exit_code = call(
        f"echo y | sacctmgr modify qos normal set MaxSubmitJobs={max_submit_jobs}"
    )
    executor = cluster_tools.get_executor("slurm", debug=True)

    try:
        with executor:
            time_of_start = time.time()
            futures = executor.map_to_futures(sleep, [0.5, 0.5])
            time_of_futures = time.time()
            concurrent.futures.wait(futures)
            time_of_result = time.time()

            # The futures should be returned before all jobs were scheduled
            assert time_of_futures - time_of_start < 0.5

            # Computing the results should have taken at least one second,
            # since only one job is scheduled at a time and each job takes 0.5 seconds
            assert time_of_result - time_of_start > 1
    finally:
        _, _, exit_code = call(
            "echo y | sacctmgr modify qos normal set MaxSubmitJobs=-1")
Example #4
    def get_max_submit_jobs():
        max_submit_jobs_env = os.environ.get("SLURM_MAX_SUBMIT_JOBS", None)
        if max_submit_jobs_env is not None:
            logging.debug(
                f"The SLURM_MAX_SUBMIT_JOBS env variable is set to {max_submit_jobs_env}."
            )
            return int(max_submit_jobs_env)

        max_submit_jobs = 2**32
        # Check whether there is a limit per user
        stdout_user, stderr_user, _ = call(
            "sacctmgr list -n user $USER withassoc format=maxsubmitjobsperuser"
        )
        try:
            max_submit_jobs = int(stdout_user.decode("utf8"))
        except ValueError:
            # If there is no limit per user check whether there is a general limit
            stdout_qos, stderr_qos, _ = call(
                "sacctmgr list -n qos normal format=maxsubmitjobsperuser")
            try:
                max_submit_jobs = int(stdout_qos.decode("utf8"))
            except ValueError:
                logging.warning(
                    f"Slurm's MaxSubmitJobsPerUser couldn't be determined. Reason: {stderr_user}\n{stderr_qos}"
                )
                return max_submit_jobs
        logging.debug(f"Slurm MaxSubmitJobsPerUser is {max_submit_jobs}.")
        return max_submit_jobs
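All of these snippets rely on a `call` helper which returns a `(stdout, stderr, exit_code)` triple, with byte strings for the output streams. Its implementation is not part of the examples; a minimal sketch, assuming it simply wraps `subprocess.run` with `shell=True`, could look like this:

import subprocess
from typing import Tuple


def call(command: str) -> Tuple[bytes, bytes, int]:
    # Hypothetical stand-in for the call() helper assumed by the examples:
    # run the command through a shell and return stdout, stderr and the exit code.
    completed = subprocess.run(command, shell=True, capture_output=True)
    return completed.stdout, completed.stderr, completed.returncode
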
Example #5
    def check_job_state(
            self,
            job_id_with_index) -> Literal["failed", "ignore", "completed"]:

        job_states = []

        # If the output file was not found, we determine the job status so that
        # we can recognize jobs which failed hard (in this case, they don't produce output files)
        stdout, _, exit_code = call(
            "scontrol show job {}".format(job_id_with_index))
        stdout = stdout.decode("utf8")

        if exit_code == 0:
            job_state_search = re.search("JobState=([a-zA-Z_]*)", str(stdout))
            if job_state_search:
                job_states = [job_state_search.group(1)]
            else:
                logging.error("Could not extract slurm job state? {}".format(
                    stdout[0:10]))
        else:
            stdout, _, exit_code = call(
                "sacct -j {} -o State -P".format(job_id_with_index))
            stdout = stdout.decode("utf8")

            if exit_code == 0:
                job_states = stdout.split("\n")[1:]

        if len(job_states) == 0:
            logging.error(
                "Couldn't determine the job's status via scontrol or sacct. Continuing to poll for the output file. This could be an indicator of a failed job which was already cleaned up from the slurm db. If this is the case, the process will hang forever."
            )
            return "ignore"

        def matches_states(slurm_states):
            return len(list(set(job_states) & set(slurm_states))) > 0

        if matches_states(SLURM_STATES["Failure"]):
            return "failed"
        elif matches_states(SLURM_STATES["Ignore"]):
            return "ignore"
        elif matches_states(SLURM_STATES["Unclear"]):
            logging.warning(
                "The job state for {} is {}. It's unclear whether the job will recover. Will wait further"
                .format(job_id_with_index, job_states))
            return "ignore"
        elif matches_states(SLURM_STATES["Success"]):
            return "completed"
        else:
            logging.error("Unhandled slurm job state for job id {}? {}".format(
                job_id_with_index, job_states))
            return "ignore"
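The `SLURM_STATES` lookup table used above is not included in the snippet. A plausible sketch of such a grouping, assuming standard Slurm job state names (the actual table in cluster_tools may group them differently):

# Hypothetical SLURM_STATES grouping assumed by check_job_state above.
SLURM_STATES = {
    "Failure": [
        "FAILED", "BOOT_FAIL", "CANCELLED", "DEADLINE",
        "NODE_FAIL", "OUT_OF_MEMORY", "TIMEOUT",
    ],
    "Ignore": [
        "RUNNING", "PENDING", "CONFIGURING", "COMPLETING",
        "RESIZING", "SUSPENDED",
    ],
    "Unclear": ["PREEMPTED", "REQUEUED", "REVOKED", "SPECIAL_EXIT", "STOPPED"],
    "Success": ["COMPLETED"],
}
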
Example #6
def test_slurm_deferred_submit_shutdown():
    # Test that the SlurmExecutor stops scheduling jobs in its separate submission
    # threads once it is killed, even if the executor was used multiple times and
    # therefore started multiple job submission threads
    max_submit_jobs = 1

    # Only one job can be scheduled at a time
    _, _, exit_code = call(
        f"echo y | sacctmgr modify qos normal set MaxSubmitJobs={max_submit_jobs}"
    )
    executor = cluster_tools.get_executor("slurm", debug=True)

    try:
        # Use the executor twice to start multiple job submission threads
        executor.map_to_futures(sleep, [0.5] * 10)
        executor.map_to_futures(sleep, [0.5] * 10)

        wait_until_first_job_was_submitted(executor)

        for submit_thread in executor.submit_threads:
            assert submit_thread.is_alive()

        with pytest.raises(SystemExit) as pytest_wrapped_e:
            executor.handle_kill(None, None)
        assert pytest_wrapped_e.type == SystemExit
        assert pytest_wrapped_e.value.code == 130

        # Wait for the threads to shut down, but for less time than it would take to
        # submit all jobs (~5 seconds, since only one job is scheduled at a time)
        for submit_thread in executor.submit_threads:
            submit_thread.join(1)
            assert not submit_thread.is_alive()

        # Wait for scheduled jobs to finish, so that the queue is empty again
        while executor.get_number_of_submitted_jobs() > 0:
            time.sleep(0.5)

    finally:
        _, _, exit_code = call(
            "echo y | sacctmgr modify qos normal set MaxSubmitJobs=-1"
        )
Example #7
    def get_max_submit_jobs():
        max_submit_jobs = 2**32
        # Check whether there is a limit per user
        stdout_user, stderr_user, _ = call(
            "sacctmgr list -n user $USER withassoc format=maxsubmitjobsperuser"
        )
        try:
            max_submit_jobs = int(stdout_user.decode("utf8"))
        except ValueError:
            # If there is no limit per user check whether there is a general limit
            stdout_qos, stderr_qos, _ = call(
                "sacctmgr list -n qos normal format=maxsubmitjobsperuser")
            try:
                max_submit_jobs = int(stdout_qos.decode("utf8"))
            except ValueError:
                logging.warning(
                    f"Slurm's MaxSubmitJobsPerUser couldn't be determined. Reason: {stderr_user}\n{stderr_qos}"
                )
                return max_submit_jobs
        logging.debug(f"Slurm MaxSubmitJobsPerUser is {max_submit_jobs}.")
        return max_submit_jobs
Example #8
    def get_max_array_size():
        max_array_size = 2**32
        # See https://unix.stackexchange.com/a/364615
        stdout, stderr, exit_code = call(
            "scontrol show config | sed -n '/^MaxArraySize/s/.*= *//p'")
        if exit_code == 0:
            max_array_size = int(stdout.decode("utf8"))
            logging.debug(f"Slurm MaxArraySize is {max_array_size}.")
        else:
            logging.warning(
                f"Slurm's MaxArraySize couldn't be determined. Reason: {stderr}"
            )
        return max_array_size
Example #9
    def get_number_of_submitted_jobs():
        number_of_submitted_jobs = 0
        # --array so that each job array element is displayed on a separate line and -h to hide the header
        stdout, stderr, exit_code = call("squeue --array -u $USER -h | wc -l")
        if exit_code == 0:
            number_of_submitted_jobs = int(stdout.decode("utf8"))
            logging.debug(
                f"Number of currently submitted jobs is {number_of_submitted_jobs}."
            )
        else:
            logging.warning(
                f"Number of currently submitted jobs couldn't be determined. Reason: {stderr}"
            )
        return number_of_submitted_jobs
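Example #6 also calls a `wait_until_first_job_was_submitted` helper that is not listed here. Assuming it merely polls `get_number_of_submitted_jobs` until the queue is non-empty, a minimal sketch could be:

import time


def wait_until_first_job_was_submitted(executor, poll_interval=0.1):
    # Hypothetical helper for Example #6: block until squeue reports at least
    # one submitted job for the current user.
    while executor.get_number_of_submitted_jobs() <= 0:
        time.sleep(poll_interval)
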
Example #10
    def check_job_state(
            self,
            job_id_with_index) -> Literal["failed", "ignore", "completed"]:
        if len(str(job_id_with_index).split("_")) >= 2:
            a, b = job_id_with_index.split("_")
            job_id_with_index = f"{a}[{b}]"

        # If the output file was not found, we determine the job status so that
        # we can recognize jobs which failed hard (in this case, they don't produce output files)
        stdout, _, exit_code = call("qstat -f {}".format(job_id_with_index))

        if exit_code != 0:
            logging.error(
                "Couldn't call qstat to determine the status of job {}. Continuing to poll for the output file. This could be an indicator of a failed job which was already cleaned up from the pbs db. If this is the case, the process will hang forever."
                .format(job_id_with_index))
            return "ignore"
        else:
            job_state_search = re.search("job_state = ([a-zA-Z_]*)",
                                         str(stdout))
            if job_state_search:
                job_state = job_state_search.group(1)

                if job_state in PBS_STATES["Failure"]:
                    return "failed"
                elif job_state in PBS_STATES["Ignore"]:
                    return "ignore"
                elif job_state in PBS_STATES["Unclear"]:
                    logging.warning(
                        "The job state for {} is {}. It's unclear whether the job will recover. Will wait further"
                        .format(job_id_with_index, job_state))
                    return "ignore"
                elif job_state in PBS_STATES["Success"]:
                    return "completed"
                else:
                    logging.error(
                        "Unhandled pbs job state? {}".format(job_state))
                    return "ignore"
            else:
                logging.error("Could not extract pbs job state? {}...".format(
                    stdout[0:10]))
                return "ignore"
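Analogously to the Slurm variant, the PBS version relies on a `PBS_STATES` table that is not shown. A sketch under the assumption that it groups the single-letter `job_state` codes reported by `qstat` (the real grouping in cluster_tools may differ):

# Hypothetical PBS_STATES grouping assumed by check_job_state above.
PBS_STATES = {
    "Failure": [],  # qstat's state letters do not encode failure directly
    "Ignore": [
        "E",  # exiting after having run
        "H",  # held
        "Q",  # queued
        "R",  # running
        "T",  # being moved to a new location
        "W",  # waiting for its execution time
        "S",  # suspended
    ],
    "Unclear": [],
    "Success": [
        "C",  # completed (TORQUE)
        "F",  # finished (PBS Pro)
    ],
}
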
Example #11
    def get_max_array_size():
        max_array_size_env = os.environ.get("SLURM_MAX_ARRAY_SIZE", None)
        if max_array_size_env is not None:
            logging.debug(
                f"The SLURM_MAX_ARRAY_SIZE env variable is set to {max_array_size_env}."
            )
            return int(max_array_size_env)

        max_array_size = 2**32
        # See https://unix.stackexchange.com/a/364615
        stdout, stderr, exit_code = call(
            "scontrol show config | sed -n '/^MaxArraySize/s/.*= *//p'")
        if exit_code == 0:
            max_array_size = int(stdout.decode("utf8"))
            logging.debug(f"Slurm MaxArraySize is {max_array_size}.")
        else:
            logging.warning(
                f"Slurm's MaxArraySize couldn't be determined. Reason: {stderr}"
            )
        return max_array_size
Example #12
    def investigate_failed_job(
            self,
            job_id_with_index) -> Optional[Tuple[str, Type[RemoteException]]]:
        # We call `seff job_id` which should return some output including a line,
        # such as: "Memory Efficiency: 25019.18% of 1.00 GB"

        stdout, _, exit_code = call("seff {}".format(job_id_with_index))
        if exit_code != 0:
            return None

        # Parse stdout into a key-value object
        properties = {}
        stdout = stdout.decode("utf8")
        for line in stdout.split("\n"):
            if ":" not in line:
                continue
            key, value = line.split(":", 1)
            properties[key.strip()] = value.strip()

        def investigate_memory_consumption():
            if not properties.get("Memory Efficiency", None):
                return None

            # Extract the "25019.18% of 1.00 GB" part of the line
            efficiency_note = properties["Memory Efficiency"]
            PERCENTAGE_REGEX = r"([0-9]+(\.[0-9]+)?)%"

            # Extract the percentage to see whether it exceeds 100%.
            match = re.search(PERCENTAGE_REGEX, efficiency_note)
            if match is None:
                return None

            try:
                percentage = float(match.group(1))
            except ValueError:
                return None

            if percentage < 100:
                return None

            reason = f"The job was probably terminated because it consumed too much memory ({efficiency_note})."
            return (reason, RemoteOutOfMemoryException)

        def investigate_exit_code():
            if not properties.get("State", None):
                return None
            if "exit code 137" not in properties["State"]:
                return None
            reason = (
                "The job was probably terminated because it consumed too "
                "much memory (at least, the exit code 137 suggests this). Please "
                "use the `seff` utility to inspect the failed job and its potential "
                "job siblings (in case of an array job) to doublecheck the memory "
                "consumption.")
            return (reason, RemoteOutOfMemoryException)

        investigation = investigate_memory_consumption()
        if investigation:
            return investigation

        return investigate_exit_code()
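
To make the memory-efficiency parsing above concrete, here is a small, self-contained run of the same regex against the sample value quoted in the comment ("25019.18% of 1.00 GB"):

import re

PERCENTAGE_REGEX = r"([0-9]+(\.[0-9]+)?)%"
efficiency_note = "25019.18% of 1.00 GB"

match = re.search(PERCENTAGE_REGEX, efficiency_note)
assert match is not None
percentage = float(match.group(1))
assert percentage >= 100  # far above 100%, so the job likely exceeded its memory limit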