# Example 1 (score: 0)
def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_slots, region, asg_name):
    """Verify that scheduler limits follow ASG max-size changes made at runtime.

    Bumps the ASG MaxSize beyond the configured queue size, waits for the
    cluster daemons to pick up the new value, then restores the original
    maximum and checks the scheduler configuration again.
    """
    logging.info("Testing cluster limits are dynamically updated")
    torque_commands = TorqueCommands(remote_command_executor)

    # The test relies on starting from an empty compute fleet.
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(0)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)

    # Kick off a single-node job so the cluster scales while the new limit is active.
    submission = torque_commands.submit_command("sleep 1", nodes=1)
    job_id = torque_commands.assert_job_submitted(submission.stdout)

    # Grow the ASG maximum by one node beyond the configured queue size.
    asg_client = boto3.client("autoscaling", region_name=region)
    increased_max_size = max_queue_size + 1
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=increased_max_size)
    # Daemons fetch the ASG data every 3 minutes; give them time to refresh.
    time.sleep(200)
    # Job completion implies the cluster actually scaled.
    torque_commands.wait_job_completed(job_id)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, increased_max_size)

    # Roll the ASG back to its original maximum and re-check the limits.
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=max_queue_size)
    # Daemons fetch the ASG data every 3 minutes; give them time to refresh.
    time.sleep(200)
    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)
# Example 2 (score: 0)
def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots,
                            region, cluster, scaledown_idletime):
    """Check that jobs violating scheduling limits never trigger scale-up.

    Submits jobs that exceed the nodes / ppn / ncpus limits plus a job on
    hold, then asserts the cluster scales down to zero while those jobs
    remain pending.
    """
    logging.info("Testing jobs that violate scheduling requirements")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure at least one compute node exists so scale-down can be observed.
    if torque_commands.compute_nodes_count() == 0:
        warmup = torque_commands.submit_command("sleep 1")
        warmup_job_id = torque_commands.assert_job_submitted(warmup.stdout)
        torque_commands.wait_job_completed(warmup_job_id)
    assert_that(torque_commands.compute_nodes_count()).is_greater_than(0)

    logging.info(
        "Testing cluster doesn't scale when job requires a capacity that is higher than the max available"
    )
    # The scheduler rejects requests for more nodes than the queue allows.
    nodes_result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes={0}".format(max_queue_size + 1),
        raise_on_error=False,
    )
    assert_that(nodes_result.stdout).contains("Job exceeds queue resource limits")
    # A per-node ppn over the slot count is accepted but never runs (daemons enforce it).
    ppn_result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format(max_slots + 1),
        raise_on_error=False,
    )
    ppn_job_id = torque_commands.assert_job_submitted(ppn_result.stdout)
    # A ppn request above the queue-wide slot total is rejected by the scheduler.
    total_ppn_result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format((max_slots * max_queue_size) + 1),
        raise_on_error=False,
    )
    assert_that(total_ppn_result.stdout).contains("Job exceeds queue resource limits")
    # Over-limit ncpus requests are likewise rejected by the scheduler.
    ncpus_result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l ncpus={0}".format(max_slots + 1),
        raise_on_error=False,
    )
    assert_that(ncpus_result.stdout).contains("Job exceeds queue resource limits")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    hold_result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1 -h", raise_on_error=False
    )
    hold_job_id = torque_commands.assert_job_submitted(hold_result.stdout)

    logging.info(
        "Testing cluster scales down when pending jobs cannot be submitted")
    assert_scaling_worked(
        torque_commands,
        region,
        cluster.cfn_name,
        scaledown_idletime,
        expected_max=1,
        expected_final=0,
    )
    # The non-runnable jobs must still be queued (Q) / held (H) after scale-down.
    assert_that(_get_job_state(remote_command_executor, ppn_job_id)).is_equal_to("Q")
    assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("H")