Example #1
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime, max_queue_size):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(slurm_commands.get_job_info(job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(slurm_commands.get_job_info(dependent_job_id)).contains(
        "JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Assert scheduler configuration is correct
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    assert_that(
        _retrieve_slurm_compute_nodes_from_config(
            remote_command_executor)).is_empty()
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)
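Example #1 ends by calling an `_assert_job_completed` helper that is not shown here. A minimal sketch, assuming it simply reads the job record with `scontrol` through the same remote executor and checks the final state (the real implementation may differ):

from assertpy import assert_that

def _assert_job_completed(remote_command_executor, job_id):
    # Hypothetical sketch: fetch the one-line scontrol record for the job and assert it finished.
    result = remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
    assert_that(result.stdout).contains("JobState=COMPLETED")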
Example #2
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
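Example #2 (and several examples below) relies on two helpers that are not shown: `minutes()`, used to build `max_monitoring_time`, and `_get_job_info()`, used to read the job record. Minimal sketches under the assumption that they are thin wrappers around `datetime.timedelta` and `scontrol show jobs -o`:

from datetime import timedelta

def minutes(value):
    # Hypothetical convenience wrapper: express a monitoring window in minutes.
    return timedelta(minutes=value)

def _get_job_info(remote_command_executor, job_id):
    # Hypothetical sketch: return the one-line scontrol record for the job,
    # e.g. "JobId=2 ... JobState=PENDING Reason=Dependency ...".
    return remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout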
Example #3
def _gpu_test_scaleup(remote_command_executor, region, asg_name, stack_name,
                      scaledown_idletime, num_gpus):
    """Test cluster is scaling up correctly and GPU jobs are not aborted on slurmctld restart."""
    logging.info("Testing cluster scales correctly with GPU jobs")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)
    # g3.8xlarge has 32 vCPUs and 2 GPUs; the checks below are hardcoded for g3.8xlarge
    job_ids = []

    # sbatch --wrap 'sleep 10' -G 3
    result = slurm_commands.submit_command(command="sleep 10",
                                           nodes=-1,
                                           other_options="-G 3")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}]

    # sbatch --wrap 'sleep 10' --cpus-per-gpu=10 --gpus-per-task=1
    result = slurm_commands.submit_command(
        command="sleep 10",
        nodes=-1,
        other_options="--cpus-per-gpu=10 --gpus-per-task=1")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}]

    # sbatch --wrap 'sleep 10' -N 1 --gpus-per-node=1 -c 23 -n 1
    result = slurm_commands.submit_command(
        command="sleep 10",
        nodes=1,
        slots=1,
        other_options="--gpus-per-node=1 -c 23")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:9, gpu:1}]

    # sbatch --wrap 'sleep 10' -c 31 -n 1
    result = slurm_commands.submit_command(command="sleep 10",
                                           nodes=-1,
                                           slots=1,
                                           other_options="-c 31")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:0, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:9, gpu:1}]

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=4,
                          expected_final=0)
    # Assert jobs were completed
    for job_id in job_ids:
        slurm_commands.assert_job_succeeded(job_id)
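Example #3 (and Example #11) asserts its initial conditions through `_assert_asg_has_no_node` and `_assert_no_nodes_in_scheduler`, neither of which is shown. A sketch, assuming the first mirrors the inline boto3 check from Example #9 and the second uses the `compute_nodes_count()` method seen in the MPI termination test further down:

import boto3
from assertpy import assert_that

def _assert_asg_has_no_node(region, asg_name):
    # Hypothetical sketch: the Auto Scaling group should have a desired capacity of 0
    # before any job is submitted.
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)

def _assert_no_nodes_in_scheduler(scheduler_commands):
    # Hypothetical sketch: no compute nodes should be registered with Slurm yet.
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(0)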
Example #4
def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -a 1-5")
    array_job_id = slurm_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -c 3 -n 2")
    parallel_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, array_job_id)
    _assert_job_completed(remote_command_executor, parallel_job_id)
Example #5
def _gpu_resource_check(remote_command_executor):
    """Test GPU related resources are correctly allocated."""
    logging.info("Testing number of GPU/CPU resources allocated to job")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command(
        "sbatch -G 1 --cpus-per-gpu 5 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerJob=gpu:1", "CpusPerTres=gpu:5")

    result = remote_command_executor.run_remote_command(
        "sbatch --gres=gpu:2 --cpus-per-gpu 6 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerNode=gpu:2", "CpusPerTres=gpu:6")
Example #6
def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size):
    logging.info("Testing dummy nodes are automatically reconfigured based on actual compute nodes")
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1)
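Examples #1, #6 and #11 all call `_assert_dummy_nodes`, which is not shown. A minimal sketch, assuming placeholder ("dummy") compute nodes can be counted in the Slurm node configuration generated by ParallelCluster; the file path, the node-name prefix, and the unused `slots`/`gpus` parameters are assumptions, not confirmed by the examples:

from assertpy import assert_that

def _assert_dummy_nodes(remote_command_executor, expected_count, slots=4, gpus=0):
    # Hypothetical sketch: count placeholder node entries in the generated Slurm node config
    # and compare against the expected remaining queue capacity.
    result = remote_command_executor.run_remote_command(
        "grep -c 'NodeName=dummy-compute' /opt/slurm/etc/slurm_parallelcluster_nodes.conf")
    assert_that(int(result.stdout.strip())).is_equal_to(expected_count)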
Example #8
def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1000", nodes=max_queue_size + 1)
    max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5")
    max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check we are not scaling
    time.sleep(60)
    assert_asg_desired_capacity(region, asg_name, expected=0)
    assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains(
        "JobState=PENDING Reason=PartitionNodeLimit"
    )
    assert_that(_get_job_info(remote_command_executor, max_cpu_job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
Example #9
def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)
Example #10
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(_get_job_info(
        remote_command_executor,
        dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
Example #11
def _test_dynamic_dummy_nodes(remote_command_executor,
                              region,
                              asg_name,
                              max_queue_size,
                              slots=4,
                              gpus=0):
    logging.info(
        "Testing dummy nodes are automatically reconfigured based on actual compute nodes"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)

    _assert_dummy_nodes(remote_command_executor, max_queue_size, slots, gpus)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1, slots,
                        gpus)
Example #12
def _test_cluster_limits(remote_command_executor, max_queue_size, region,
                         asg_name):
    logging.info(
        "Testing cluster doesn't scale when job requires a capacity that is higher than the max available"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(
        remote_command_executor,
        job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(
        AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)
Example #13
def _test_mpi_job_termination(remote_command_executor, test_datadir):
    """
    Test that canceling an mpirun job does not leave stray processes.

    IntelMPI is known to leave stray processes after job termination if Slurm process tracking is not set up
    correctly, i.e. when using ProctrackType=proctrack/pgid.
    Run the IntelMPI test script and make sure no stray processes remain after the job is cancelled.
    This bug cannot be reproduced using OpenMPI.
    """
    logging.info(
        "Testing no stray process left behind after mpirun job is terminated")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial condition
    assert_that(slurm_commands.compute_nodes_count()).is_equal_to(2)

    # Submit mpi_job, which runs Intel MPI benchmarks with intelmpi
    # Leaving 1 vcpu on each node idle so that the process check job can run while mpi_job is running
    result = slurm_commands.submit_script(str(test_datadir / "mpi_job.sh"))
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check that mpi processes are started
    _assert_job_state(slurm_commands, job_id, job_state="RUNNING")
    _check_mpi_process(remote_command_executor,
                       slurm_commands,
                       test_datadir,
                       num_nodes=2,
                       after_completion=False)
    slurm_commands.cancel_job(job_id)

    # Make sure mpirun job is cancelled
    _assert_job_state(slurm_commands, job_id, job_state="CANCELLED")

    # Check that mpi processes are terminated
    _check_mpi_process(remote_command_executor,
                       slurm_commands,
                       test_datadir,
                       num_nodes=2,
                       after_completion=True)
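The MPI termination test above asserts job states through `_assert_job_state`, which is not shown. A minimal sketch, assuming it checks the `JobState=` field of the record returned by `SlurmCommands.get_job_info()` (used elsewhere in these examples):

from assertpy import assert_that

def _assert_job_state(scheduler_commands, job_id, job_state):
    # Hypothetical sketch: the scontrol record should report the expected state,
    # e.g. "JobState=RUNNING" or "JobState=CANCELLED".
    assert_that(scheduler_commands.get_job_info(job_id)).contains("JobState={0}".format(job_state))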
Example #14
def _gpu_test_cluster_limits(remote_command_executor, max_queue_size,
                             num_gpus):
    """Test edge cases regarding the number of GPUs."""
    logging.info(
        "Testing scheduler does not accept jobs when requesting for more GPUs than available"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    # Expect commands below to fail with exit 1
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -N 1 --wrap='sleep 1' --gpus-per-task {0}".format(num_gpus +
                                                                  1))
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -N 1 --wrap='sleep 1' --gres=gpu:{0}".format(num_gpus + 1))
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -G {0} --wrap='sleep 1'".format(num_gpus * max_queue_size + 1))

    # Commands below should be correctly submitted
    result = slurm_commands.submit_command(
        "sleep 1",
        nodes=1,
        slots=num_gpus,
        other_options="-G {0} --gpus-per-task=1".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    result = slurm_commands.submit_command(
        "sleep 1", nodes=1, other_options="--gres=gpu:{0}".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    # Submit job without the '-N' option (nodes=-1)
    result = slurm_commands.submit_command(
        "sleep 1",
        nodes=-1,
        other_options="-G {0} --gpus-per-node={1}".format(
            num_gpus * max_queue_size, num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
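Example #14 expects the first three submissions to be rejected via `_submit_and_assert_job_rejected_node_config`, which is not shown. A sketch under the assumption that the executor can run a command without raising on a non-zero exit (here via a `raise_on_error` flag), that the error text ends up in the captured stdout, and that Slurm rejects the job with its usual node-configuration error; all three details are assumptions:

from assertpy import assert_that

def _submit_and_assert_job_rejected_node_config(remote_command_executor, command):
    # Hypothetical sketch: sbatch should refuse the submission because the requested
    # GPU/node configuration cannot be satisfied by the cluster.
    result = remote_command_executor.run_remote_command(command, raise_on_error=False)
    assert_that(result.stdout).contains("Requested node configuration is not available")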
Example #15
def test_update_hit(region, scheduler, pcluster_config_reader,
                    clusters_factory, test_datadir, s3_bucket_factory):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "preinstall.sh"),
                       "scripts/preinstall.sh")
    bucket.upload_file(str(test_datadir / "postinstall.sh"),
                       "scripts/postinstall.sh")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if not using force update
    cluster.config_file = str(init_config_file)
    cluster.update(force=True)

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1_i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.ini",
        bucket=bucket_name,
        resource_bucket=bucket_name)
    cluster.config_file = str(updated_config_file)
    cluster.update()

    # Here is the expected list of nodes. Note that queue1-dy-t2micro-1 comes from the initial_count set when creating
    # the cluster:
    # queue1-dy-t2micro-1
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 1, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances":
                    1,  # This comes from initial_count before update
                    "expected_power_saved_instances": 9,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                },
            },
            "compute_type": "ondemand",
        },
        "queue3": {
            "compute_resources": {
                "queue3_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3_i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    updated_config = configparser.ConfigParser()
    updated_config.read(updated_config_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_resource"))
    check_s3_read_write_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_write_resource"))

    # Check new Additional IAM policies
    _check_role_attached_policy(
        region, cluster,
        updated_config.get("cluster default", "additional_iam_policies"))

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")
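Examples #15 and #16 both verify IAM changes with `_check_role_attached_policy`, which is not shown. A heavily hedged sketch, assuming the cluster object exposes the stack's CloudFormation resources as a mapping (the `cfn_resources` attribute and the role-name lookup are assumptions):

import boto3
from assertpy import assert_that

def _check_role_attached_policy(region, cluster, policy_arn):
    # Hypothetical sketch: every IAM role created by the cluster stack should have the
    # additional policy attached after the update.
    iam_client = boto3.client("iam", region_name=region)
    for logical_id, physical_id in cluster.cfn_resources.items():  # assumed attribute
        if logical_id.startswith("Role"):
            attached = iam_client.list_attached_role_policies(RoleName=physical_id)["AttachedPolicies"]
            assert_that([policy["PolicyArn"] for policy in attached]).contains(policy_arn)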
Example #16
def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory,
                      clusters_factory, test_datadir):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in [
            "preinstall.sh", "postinstall.sh", "updated_preinstall.sh",
            "updated_postinstall.sh"
    ]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name,
                                              bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if not using force update
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # submit job in queue1 to verify original pre/post-install script execution
    initial_compute_nodes = slurm_commands.get_compute_nodes(
        filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Here is the expected list of nodes:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 and test updated pre/post install script execution and extra json update
    # Add a new dynamic node t2.micro to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")

    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_postinstall", "DEF")

    # check new extra json
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0],
                      "test_value")