# Example 1
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    """Verify a job held on a dependency stays pending while its parent runs.

    Submits a parent job, then a child job held on the parent via qsub
    -hold_jid, asserts the child sits in the "hqw" state, checks the cluster
    only scales to one node (and back to zero), and finally confirms both
    jobs succeeded.
    """
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    scheduler = SgeCommands(remote_command_executor)

    # Parent job first, then a child that holds on the parent's completion.
    parent_job_id = scheduler.assert_job_submitted(
        scheduler.submit_command("sleep 60", nodes=1).stdout)
    hold_submission = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -hold_jid {0}".format(parent_job_id),
        raise_on_error=False)
    child_job_id = scheduler.assert_job_submitted(hold_submission.stdout)

    # The dependent job must be in hold-queued-waiting while the parent runs.
    child_state = _get_job_state(remote_command_executor, child_job_id)
    assert_that(child_state).is_equal_to("hqw")

    # Only one node should ever come up, and it should scale away when idle.
    assert_scaling_worked(scheduler,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Both jobs eventually run to completion.
    scheduler.assert_job_succeeded(parent_job_id)
    scheduler.assert_job_succeeded(child_job_id)
# Example 2
def _test_job_arrays_and_parallel_jobs(remote_command_executor, region,
                                       stack_name, scaledown_idletime,
                                       max_slots):
    """Verify scaling for an array job (5 tasks) plus a 4-slot parallel job.

    Expected node count is ceil(total requested slots / slots per node);
    both jobs must finish successfully and the cluster must scale back to 0.
    """
    logging.info(
        "Testing cluster scales correctly with array jobs and parallel jobs")
    scheduler = SgeCommands(remote_command_executor)

    # Five-task array job.
    array_submission = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -t 1-5", raise_on_error=False)
    array_job_id = scheduler.assert_job_submitted(array_submission.stdout,
                                                  is_array=True)

    # Four-slot MPI parallel job.
    parallel_submission = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -pe mpi 4", raise_on_error=False)
    parallel_job_id = scheduler.assert_job_submitted(
        parallel_submission.stdout)

    # 5 array slots + 4 parallel slots; nodes needed = ceil(9 / max_slots).
    required_nodes = math.ceil(float(5 + 4) / max_slots)
    assert_scaling_worked(scheduler,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=required_nodes,
                          expected_final=0)
    # Both jobs must have completed successfully.
    scheduler.assert_job_succeeded(array_job_id)
    scheduler.assert_job_succeeded(parallel_job_id)
# Example 3
def _test_fsx_lustre_correctly_shared(remote_command_executor, mount_dir):
    """Verify the FSx Lustre mount is shared between head and compute nodes.

    Writes a file from the head node, then runs a compute-node job that reads
    both the S3-imported file and the head-node file and writes a marker;
    finally the head node reads the marker back.
    """
    logging.info("Testing fsx lustre correctly mounted on compute nodes")
    scheduler = SgeCommands(remote_command_executor)

    # Create a file on the head node first.
    remote_command_executor.run_remote_command(
        "touch {mount_dir}/test_file".format(mount_dir=mount_dir))

    # From a compute node: read both files, then drop a marker file.
    compute_check = (
        "cat {mount_dir}/s3_test_file "
        "&& cat {mount_dir}/test_file "
        "&& touch {mount_dir}/compute_output".format(mount_dir=mount_dir)
    )
    job_id = scheduler.assert_job_submitted(
        scheduler.submit_command(compute_check).stdout)
    scheduler.wait_job_completed(job_id)
    scheduler.assert_job_succeeded(job_id)

    # The head node must see the file written by the compute node.
    remote_command_executor.run_remote_command(
        "cat {mount_dir}/compute_output".format(mount_dir=mount_dir))
# Example 4
def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots,
                            region, cluster, scaledown_idletime):
    """Verify unschedulable jobs don't keep the cluster scaled up.

    Submits a job requesting more slots than the whole cluster can offer and
    a job placed on hold, asserts both stay pending, and checks the cluster
    scales down to zero with the jobs still in the pending queue.
    """
    logging.info("Testing jobs that violate scheduling requirements")
    scheduler = SgeCommands(remote_command_executor)

    # Ensure at least one compute node exists so scale-down can be observed.
    if scheduler.compute_nodes_count() == 0:
        warmup_result = scheduler.submit_command("sleep 1")
        warmup_job_id = scheduler.assert_job_submitted(warmup_result.stdout)
        scheduler.wait_job_completed(warmup_job_id)
    assert_that(scheduler.compute_nodes_count()).is_greater_than(0)

    logging.info(
        "Testing cluster doesn't scale when job requires a capacity that is higher than the max available"
    )
    # One slot more than the cluster could ever provide.
    oversized_slots = (max_slots * max_queue_size) + 1
    oversized_result = scheduler.submit_command("sleep 1000",
                                                slots=oversized_slots)
    max_slots_job_id = scheduler.assert_job_submitted(oversized_result.stdout)
    oversized_state = _get_job_state(remote_command_executor, max_slots_job_id)
    assert_that(oversized_state).is_equal_to("qw")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    held_result = scheduler.submit_command("sleep 1000", hold=True)
    hold_job_id = scheduler.assert_job_submitted(held_result.stdout)
    held_state = _get_job_state(remote_command_executor, hold_job_id)
    assert_that(held_state).is_equal_to("hqw")

    logging.info(
        "Testing cluster scales down when pending jobs cannot be submitted")
    assert_scaling_worked(scheduler,
                          region,
                          cluster.cfn_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Both unrunnable jobs must still be listed as pending after scale-down.
    pending_output = remote_command_executor.run_remote_command(
        "qstat -s p | tail -n +3 | awk '{ print $1 }'").stdout
    assert_that(pending_output.splitlines()).contains(max_slots_job_id,
                                                      hold_job_id)