示例#1
0
def test_get_pending_jobs_info(pending_jobs, max_slots_filter,
                               max_nodes_filter, filter_by_pending_reasons,
                               expected_output, mocker):
    """Check get_pending_jobs_info against a patched get_jobs_info.

    ``get_jobs_info`` is patched to return ``pending_jobs``. The test then
    verifies that it was called with the "PD" job state filter and that the
    value returned by ``get_pending_jobs_info`` equals ``expected_output``.
    """
    get_jobs_info_mock = mocker.patch(
        "common.schedulers.slurm_commands.get_jobs_info",
        return_value=pending_jobs,
        autospec=True,
    )

    # Keep the result in its own name instead of rebinding the fixture value.
    actual_output = get_pending_jobs_info(
        max_slots_filter,
        max_nodes_filter,
        filter_by_pending_reasons,
    )

    get_jobs_info_mock.assert_called_with(job_state_filter="PD")
    assert_that(actual_output).is_equal_to(expected_output)
示例#2
0
def hasPendingJobs(instance_properties, max_size):
    """
    Check if there is any pending job in the queue.

    :param instance_properties: object exposing ``get``; its "slots" entry is
                                passed as the max slots filter.
    :param max_size: value passed as the max nodes filter.
    :return: a pair (has_pending_job, has_error) where has_error communicates if there was
             an error when checking for pending jobs.
    """
    try:
        pending_jobs = get_pending_jobs_info(
            max_slots_filter=instance_properties.get("slots"),
            max_nodes_filter=max_size,
            filter_by_pending_reasons=PENDING_RESOURCES_REASONS,
        )
        return len(pending_jobs) > 0, False
    except Exception as e:
        # Best-effort check: any failure is reported through the has_error
        # flag instead of being propagated to the caller.
        log.error(
            "Failed when checking for pending jobs with exception %s. Reporting no pending jobs.",
            e)
        return False, True
示例#3
0
def get_required_nodes(instance_properties, max_size):
    """Compute the number of nodes required by the pending jobs.

    Pending jobs are fetched with ``get_pending_jobs_info`` (filtered by the
    "slots" entry of ``instance_properties``, by ``max_size`` and by
    ``PENDING_RESOURCES_REASONS``); their per-job node and slot requests are
    then handed to ``get_optimal_nodes``, whose result is returned.
    """
    log.info("Computing number of required nodes for submitted jobs")
    jobs = get_pending_jobs_info(
        max_slots_filter=instance_properties.get("slots"),
        max_nodes_filter=max_size,
        filter_by_pending_reasons=PENDING_RESOURCES_REASONS,
    )

    # Single pass over the jobs, reading cpus_total before nodes for each job.
    requests = [(job.cpus_total, job.nodes) for job in jobs]
    slots_requested = [slots for slots, _ in requests]
    nodes_requested = [nodes for _, nodes in requests]

    return get_optimal_nodes(nodes_requested, slots_requested,
                             instance_properties)
示例#4
0
def hasPendingJobs(instance_properties, max_size):
    """
    Check if there is any pending job in the queue.

    :param instance_properties: object exposing ``get``; its "slots" entry is
                                passed as the max slots filter.
    :param max_size: value passed as the max nodes filter.
    :return: a pair (has_pending_job, has_error) where has_error communicates if there was
             an error when checking for pending jobs.
    """
    try:
        pending_jobs = get_pending_jobs_info(
            max_slots_filter=instance_properties.get("slots"),
            max_nodes_filter=max_size,
            filter_by_pending_reasons=PENDING_RESOURCES_REASONS,
        )
        return len(pending_jobs) > 0, False
    except Exception as e:
        # Best-effort check: any failure is reported through the has_error
        # flag instead of being propagated to the caller.
        log.error("Failed when checking for pending jobs with exception %s. Reporting no pending jobs.", e)
        return False, True
def get_required_nodes(instance_properties, max_size):
    """
    Compute the number of nodes required by the pending jobs.

    Pending jobs are fetched with ``get_pending_jobs_info``; for each job a
    resources dict with "gpus" and "slots" entries is built, and these are
    passed together with the per-job node counts to ``get_optimal_nodes``.

    :param instance_properties: forwarded to ``get_pending_jobs_info`` and
                                ``get_optimal_nodes``.
    :param max_size: value passed as the max nodes filter.
    :return: whatever ``get_optimal_nodes`` returns for the collected requests.
    """
    log.info("Computing number of required nodes for submitted jobs")
    pending_jobs = get_pending_jobs_info(
        instance_properties=instance_properties,
        max_nodes_filter=max_size,
        filter_by_pending_reasons=PENDING_RESOURCES_REASONS,
    )
    # Log through the same logger as the message above so both records share
    # one logger name and configuration.
    log.info("Found the following pending jobs:\n%s", pending_jobs)

    resources_requested = []
    nodes_requested = []
    for job in pending_jobs:
        resources_requested.append(
            {
                "gpus": process_gpus_total_for_job(job),
                "slots": job.cpus_total,
            }
        )
        nodes_requested.append(job.nodes)

    return get_optimal_nodes(nodes_requested, resources_requested, instance_properties)