def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime, max_queue_size):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(slurm_commands.get_job_info(job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(slurm_commands.get_job_info(dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0)
    # Assert scheduler configuration is correct
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    assert_that(_retrieve_slurm_compute_nodes_from_config(remote_command_executor)).is_empty()
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)


def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)


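# The tests above call a _get_job_info helper that is not defined in this excerpt.
# A minimal sketch, assuming job details are read via "scontrol show jobs -o <job_id>"
# on the head node; the real helper may differ.
def _get_job_info(remote_command_executor, job_id):
    # -o prints the whole job record on one line, which makes .contains() checks on
    # "JobState=... Reason=..." substrings straightforward.
    return remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout

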
def _gpu_test_scaleup(remote_command_executor, region, asg_name, stack_name, scaledown_idletime, num_gpus):
    """Test cluster is scaling up correctly and GPU jobs are not aborted on slurmctld restart."""
    logging.info("Testing cluster scales correctly with GPU jobs")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)

    # g3.8xlarge has 32 vcpus and 2 GPUs, hardcoding tests for g3.8xlarge
    job_ids = []

    # sbatch --wrap 'sleep 10' -G 3
    result = slurm_commands.submit_command(command="sleep 10", nodes=-1, other_options="-G 3")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}]

    # sbatch --wrap 'sleep 10' --cpus-per-gpu=10 --gpus-per-task=1
    result = slurm_commands.submit_command(
        command="sleep 10", nodes=-1, other_options="--cpus-per-gpu=10 --gpus-per-task=1"
    )
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}]

    # sbatch --wrap 'sleep 10' -N 1 --gpus-per-node=1 -c 22 -n 1
    result = slurm_commands.submit_command(command="sleep 10", nodes=1, slots=1, other_options="--gpus-per-node=1 -c 23")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:19, gpu:1}]

    # sbatch --wrap 'sleep 10' -c 31 -n 1
    result = slurm_commands.submit_command(command="sleep 10", nodes=-1, slots=1, other_options="-c 31")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:0, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:19, gpu:1}]

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=4, expected_final=0)
    # Assert jobs were completed
    for job_id in job_ids:
        slurm_commands.assert_job_succeeded(job_id)


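# _assert_asg_has_no_node and _assert_no_nodes_in_scheduler are used for the initial-condition
# checks but are not defined in this excerpt. A possible sketch, reusing helpers that appear
# elsewhere in these tests (assert_asg_desired_capacity, SlurmCommands.compute_nodes_count);
# the real implementations may differ.
def _assert_asg_has_no_node(region, asg_name):
    # The ASG should not be requesting any compute instances before jobs are submitted.
    assert_asg_desired_capacity(region, asg_name, expected=0)


def _assert_no_nodes_in_scheduler(scheduler_commands):
    # No compute nodes should be registered with slurmctld before jobs are submitted.
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(0)

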
def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -a 1-5")
    array_job_id = slurm_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -c 3 -n 2")
    parallel_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, array_job_id)
    _assert_job_completed(remote_command_executor, parallel_job_id)


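# _assert_job_completed is referenced above but not defined in this excerpt. A hedged sketch,
# assuming it just checks the JobState reported by scontrol once scaling has settled; the
# actual helper may add retries or other checks.
def _assert_job_completed(remote_command_executor, job_id):
    # Completed jobs stay visible to scontrol for a short while, so a plain state check
    # right after assert_scaling_worked is usually sufficient.
    assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=COMPLETED")

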
def _gpu_resource_check(remote_command_executor):
    """Test GPU related resources are correctly allocated."""
    logging.info("Testing number of GPU/CPU resources allocated to job")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command("sbatch -G 1 --cpus-per-gpu 5 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerJob=gpu:1", "CpusPerTres=gpu:5")

    result = remote_command_executor.run_remote_command("sbatch --gres=gpu:2 --cpus-per-gpu 6 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerNode=gpu:2", "CpusPerTres=gpu:6")


def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size):
    logging.info("Testing dummy nodes are automatically reconfigured based on actual compute nodes")
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1)


def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1000", nodes=max_queue_size + 1)
    max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5")
    max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check we are not scaling
    time.sleep(60)
    assert_asg_desired_capacity(region, asg_name, expected=0)
    assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains(
        "JobState=PENDING Reason=PartitionNodeLimit"
    )
    assert_that(_get_job_info(remote_command_executor, max_cpu_job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )


def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)


def _test_dynamic_dummy_nodes(remote_command_executor, region, asg_name, max_queue_size, slots=4, gpus=0):
    logging.info("Testing dummy nodes are automatically reconfigured based on actual compute nodes")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)

    _assert_dummy_nodes(remote_command_executor, max_queue_size, slots, gpus)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1, slots, gpus)


def _test_mpi_job_termination(remote_command_executor, test_datadir):
    """
    Test canceling mpirun job will not leave stray processes.

    IntelMPI is known to leave stray processes after job termination if slurm process tracking is not setup correctly,
    i.e. using ProctrackType=proctrack/pgid
    Test IntelMPI script to make sure no stray processes after the job is cancelled
    This bug cannot be reproduced using OpenMPI
    """
    logging.info("Testing no stray process left behind after mpirun job is terminated")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial condition
    assert_that(slurm_commands.compute_nodes_count()).is_equal_to(2)

    # Submit mpi_job, which runs Intel MPI benchmarks with intelmpi
    # Leaving 1 vcpu on each node idle so that the process check job can run while mpi_job is running
    result = slurm_commands.submit_script(str(test_datadir / "mpi_job.sh"))
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check that mpi processes are started
    _assert_job_state(slurm_commands, job_id, job_state="RUNNING")
    _check_mpi_process(remote_command_executor, slurm_commands, test_datadir, num_nodes=2, after_completion=False)
    slurm_commands.cancel_job(job_id)

    # Make sure mpirun job is cancelled
    _assert_job_state(slurm_commands, job_id, job_state="CANCELLED")

    # Check that mpi processes are terminated
    _check_mpi_process(remote_command_executor, slurm_commands, test_datadir, num_nodes=2, after_completion=True)


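# _assert_job_state and _check_mpi_process are not part of this excerpt. A minimal sketch of the
# former, assuming the retrying decorator and time_utils seconds() helper used elsewhere in these
# tests; the retry window and exact implementation are assumptions.
@retry(wait_fixed=seconds(3), stop_max_delay=seconds(30))
def _assert_job_state(slurm_commands, job_id, job_state):
    # Retry because the scheduler needs a few seconds to transition the job
    # (e.g. PENDING -> RUNNING after scale-up, or RUNNING -> CANCELLED after scancel).
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState={0}".format(job_state))

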
def _gpu_test_cluster_limits(remote_command_executor, max_queue_size, num_gpus):
    """Test edge cases regarding the number of GPUs."""
    logging.info("Testing scheduler does not accept jobs when requesting for more GPUs than available")
    slurm_commands = SlurmCommands(remote_command_executor)

    # Expect commands below to fail with exit 1
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor, "sbatch -N 1 --wrap='sleep 1' --gpus-per-task {0}".format(num_gpus + 1)
    )
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor, "sbatch -N 1 --wrap='sleep 1' --gres=gpu:{0}".format(num_gpus + 1)
    )
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor, "sbatch -G {0} --wrap='sleep 1'".format(num_gpus * max_queue_size + 1)
    )

    # Commands below should be correctly submitted
    result = slurm_commands.submit_command(
        "sleep 1", nodes=1, slots=num_gpus, other_options="-G {0} --gpus-per-task=1".format(num_gpus)
    )
    slurm_commands.assert_job_submitted(result.stdout)
    result = slurm_commands.submit_command("sleep 1", nodes=1, other_options="--gres=gpu:{0}".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    # Submit job without '-N' option (nodes=-1)
    result = slurm_commands.submit_command(
        "sleep 1", nodes=-1, other_options="-G {0} --gpus-per-node={1}".format(num_gpus * max_queue_size, num_gpus)
    )
    slurm_commands.assert_job_submitted(result.stdout)


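# _submit_and_assert_job_rejected_node_config is referenced above but not defined in this
# excerpt. A hedged sketch: it is assumed to run the sbatch command, tolerate the non-zero
# exit code, and assert on sbatch's rejection message; the exact error text checked by the
# real helper may differ.
def _submit_and_assert_job_rejected_node_config(remote_command_executor, command):
    result = remote_command_executor.run_remote_command(command, raise_on_error=False)
    # sbatch may print the rejection on stdout or stderr depending on how the executor
    # captures output; checking both keeps the sketch robust.
    assert_that(result.stdout + result.stderr).contains("Requested node configuration is not available")

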
def test_update_hit(region, scheduler, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "preinstall.sh"), "scripts/preinstall.sh")
    bucket.upload_file(str(test_datadir / "postinstall.sh"), "scripts/postinstall.sh")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even without force update
    cluster.config_file = str(init_config_file)
    cluster.update(force=True)

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1_i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
    }
    _assert_scheduler_nodes(queues_config=initial_queues_config, slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config, cluster_name=cluster.name, region=region)

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity", constraint="static")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.ini", bucket=bucket_name, resource_bucket=bucket_name
    )
    cluster.config_file = str(updated_config_file)
    cluster.update()

    # Here is the expected list of nodes.
    # Note that queue1-dy-t2micro-1 comes from the initial_count set when creating
    # the cluster:
    # queue1-dy-t2micro-1
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 1, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,  # This comes from initial_count before update
                    "expected_power_saved_instances": 9,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                },
            },
            "compute_type": "ondemand",
        },
        "queue3": {
            "compute_resources": {
                "queue3_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3_i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
        },
    }
    _assert_scheduler_nodes(queues_config=updated_queues_config, slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config, cluster_name=cluster.name, region=region)

    # Read updated configuration
    updated_config = configparser.ConfigParser()
    updated_config.read(updated_config_file)

    # Check new S3 resources
    check_s3_read_resource(region, cluster, updated_config.get("cluster default", "s3_read_resource"))
    check_s3_read_write_resource(region, cluster, updated_config.get("cluster default", "s3_read_write_resource"))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, updated_config.get("cluster default", "additional_iam_policies"))

    # Assert that the job submitted before the update is still running
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")


def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in ["preinstall.sh", "postinstall.sh", "updated_preinstall.sh", "updated_postinstall.sh"]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name, bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even without force update
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }
    _assert_scheduler_nodes(queues_config=initial_queues_config, slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config, cluster_name=cluster.name, region=region)

    # Submit a job in queue1 to verify original pre/post-install script execution
    initial_compute_nodes = slurm_commands.get_compute_nodes(filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0], "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0], "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity", constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Here is the expected list of nodes.
    # the cluster:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {"placement_group": {"enabled": False}},
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {"placement_group": {"enabled": False}},
        },
    }
    _assert_scheduler_nodes(queues_config=updated_queues_config, slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config, cluster_name=cluster.name, region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(region, cluster, get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(region, cluster, get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 and test updated pre/post install script execution and extra json update
    # Add a new dynamic node t2.micro to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")
    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0], "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0], "updated_postinstall", "DEF")

    # Check new extra json
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0], "test_value")