def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime, max_queue_size):
    """Verify scaling behaviour when one job depends on another.

    A single-node ``sleep 60`` job is submitted, followed by a second job that
    declares an ``afterok`` dependency on the first. Both jobs are expected to
    be pending shortly after submission, the second one because of its
    dependency. Scaling is then checked with ``expected_max=1`` and
    ``expected_final=0``, the scheduler configuration is checked, and finally
    both jobs are asserted to have completed.
    """
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm = SlurmCommands(remote_command_executor)

    # Parent job: one node, sleeps for a minute.
    parent_submission = slurm.submit_command("sleep 60", nodes=1)
    job_id = slurm.assert_job_submitted(parent_submission.stdout)

    # Child job: submitted with an afterok dependency on the parent job id.
    dependent_sbatch = "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id)
    child_submission = remote_command_executor.run_remote_command(dependent_sbatch)
    dependent_job_id = slurm.assert_job_submitted(child_submission.stdout)

    # Give the scheduler a moment to compute the pending reason of both jobs.
    time.sleep(3)
    parent_pending_state = (
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    child_pending_state = "JobState=PENDING Reason=Dependency"
    assert_that(slurm.get_job_info(job_id)).contains(parent_pending_state)
    assert_that(slurm.get_job_info(dependent_job_id)).contains(child_pending_state)

    assert_scaling_worked(
        slurm,
        region,
        stack_name,
        scaledown_idletime,
        expected_max=1,
        expected_final=0,
    )

    # The scheduler configuration must be correct once scaling has been verified.
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    configured_compute_nodes = _retrieve_slurm_compute_nodes_from_config(remote_command_executor)
    assert_that(configured_compute_nodes).is_empty()

    # Both the parent and the dependent job must have completed.
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)
def _gpu_resource_check(remote_command_executor):
    """Test GPU related resources are correctly allocated.

    Two jobs are submitted with different GPU request flags (``-G`` and
    ``--gres``); for each one the job info reported by the scheduler must
    contain the matching GPU and CPU-per-GPU fragments.
    """
    logging.info("Testing number of GPU/CPU resources allocated to job")
    slurm = SlurmCommands(remote_command_executor)

    # (sbatch command line, fragments that must show up in the job info)
    gpu_cases = (
        (
            "sbatch -G 1 --cpus-per-gpu 5 --wrap='sleep 1'",
            ("TresPerJob=gpu:1", "CpusPerTres=gpu:5"),
        ),
        (
            "sbatch --gres=gpu:2 --cpus-per-gpu 6 --wrap='sleep 1'",
            ("TresPerNode=gpu:2", "CpusPerTres=gpu:6"),
        ),
    )

    for sbatch_command, expected_fragments in gpu_cases:
        submission = remote_command_executor.run_remote_command(sbatch_command)
        submitted_job_id = slurm.assert_job_submitted(submission.stdout)
        reported_info = slurm.get_job_info(submitted_job_id)
        assert_that(reported_info).contains(*expected_fragments)
def test_update_hit(region, scheduler, pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory):
    """Update a cluster and verify its queues, launch templates, S3/IAM settings and a running job.

    Flow: upload the install scripts to a fresh bucket, create the cluster,
    re-apply the same configuration, verify the initial queue layout, submit a
    long-running job, apply ``pcluster.config.update.ini`` and verify the new
    queue layout, the S3 and IAM settings read from the updated configuration,
    and that the job submitted before the update is still running.
    """
    # Bucket holding the pre/post install scripts
    bucket_name = s3_bucket_factory()
    s3_bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    install_scripts = (
        ("preinstall.sh", "scripts/preinstall.sh"),
        ("postinstall.sh", "scripts/postinstall.sh"),
    )
    for local_name, object_key in install_scripts:
        s3_bucket.upload_file(str(test_datadir / local_name), object_key)

    # Cluster created from the initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Re-apply the very same configuration: the update command is expected to succeed
    cluster.config_file = str(init_config_file)
    cluster.update(force=True)

    # Remote executors used throughout the test
    command_executor = RemoteCommandExecutor(cluster)
    slurm = SlurmCommands(command_executor)

    # Shared directory where scripts drop their results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    # Expected layout before the update
    initial_queue1 = {
        "compute_resources": {
            "queue1_i1": {
                "instance_type": "c5.xlarge",
                "expected_running_instances": 1,
                "expected_power_saved_instances": 1,
                "enable_efa": False,
                "disable_hyperthreading": False,
            },
            "queue1_i2": {
                "instance_type": "t2.micro",
                "expected_running_instances": 1,
                "expected_power_saved_instances": 9,
                "enable_efa": False,
                "disable_hyperthreading": False,
            },
        },
        "compute_type": "ondemand",
    }
    initial_queue2 = {
        "compute_resources": {
            "queue2_i1": {
                "instance_type": "c5n.18xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "enable_efa": False,
                "disable_hyperthreading": False,
            },
        },
        "compute_type": "ondemand",
    }
    initial_queues_config = {"queue1": initial_queue1, "queue2": initial_queue2}

    _assert_scheduler_nodes(queues_config=initial_queues_config, slurm_commands=slurm)
    _assert_launch_templates_config(
        queues_config=initial_queues_config,
        cluster_name=cluster.name,
        region=region,
    )

    # A long-running job lets us verify that jobs survive a queue size update
    submission = slurm.submit_command("sleep infinity", constraint="static")
    job_id = slurm.assert_job_submitted(submission.stdout)

    # Apply the new configuration
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.ini",
        bucket=bucket_name,
        resource_bucket=bucket_name,
    )
    cluster.config_file = str(updated_config_file)
    cluster.update()

    # Expected nodes after the update (queue1-dy-t2micro-1 comes from the
    # initial_count set when the cluster was created):
    #   queue1-dy-t2micro-1
    #   queue1-st-c5xlarge-1
    #   queue1-st-c5xlarge-2
    assert_initial_conditions(slurm, 2, 1, partition="queue1")

    # Expected layout after the update
    updated_queue1 = {
        "compute_resources": {
            "queue1_i1": {
                "instance_type": "c5.xlarge",
                "expected_running_instances": 2,
                "expected_power_saved_instances": 2,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
            "queue1_i2": {
                "instance_type": "c5.2xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
            "queue1_i3": {
                "instance_type": "t2.micro",
                # This comes from initial_count before update
                "expected_running_instances": 1,
                "expected_power_saved_instances": 9,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
        },
        "compute_type": "spot",
    }
    updated_queue2 = {
        "compute_resources": {
            "queue2_i1": {
                "instance_type": "c5n.18xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 1,
                "enable_efa": True,
                "disable_hyperthreading": True,
            },
        },
        "compute_type": "ondemand",
    }
    updated_queue3 = {
        "compute_resources": {
            "queue3_i1": {
                "instance_type": "c5n.18xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": True,
                "enable_efa": True,
            },
            "queue3_i2": {
                "instance_type": "t2.xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
        },
        "compute_type": "ondemand",
    }
    updated_queues_config = {
        "queue1": updated_queue1,
        "queue2": updated_queue2,
        "queue3": updated_queue3,
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config, slurm_commands=slurm)
    _assert_launch_templates_config(
        queues_config=updated_queues_config,
        cluster_name=cluster.name,
        region=region,
    )

    # Load the updated configuration to read the values checked below
    updated_config = configparser.ConfigParser()
    updated_config.read(updated_config_file)
    cluster_section = "cluster default"

    # New S3 resources
    read_only_resource = updated_config.get(cluster_section, "s3_read_resource")
    check_s3_read_resource(region, cluster, read_only_resource)
    read_write_resource = updated_config.get(cluster_section, "s3_read_write_resource")
    check_s3_read_write_resource(region, cluster, read_write_resource)

    # New additional IAM policies
    extra_policies = updated_config.get(cluster_section, "additional_iam_policies")
    _check_role_attached_policy(region, cluster, extra_policies)

    # The job submitted before the update must still be running
    assert_that(slurm.get_job_info(job_id)).contains("JobState=RUNNING")
def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir):
    """Update a Slurm cluster and verify queues, scripts, S3/IAM settings, volumes and a running job.

    Flow: upload the original and updated install scripts, create the cluster,
    re-apply the same configuration, verify the initial queue layout and the
    original pre/post install scripts, submit a long-running job, apply
    ``pcluster.config.update.yaml`` and verify the new queue layout, S3 and IAM
    settings, the still-running job and the volume. Finally add a new
    ``t2.micro`` node to queue1 and check the updated scripts and extra json
    on it.
    """
    # Bucket holding the pre/post install scripts (original and updated versions)
    bucket_name = s3_bucket_factory()
    s3_bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    script_names = [
        "preinstall.sh",
        "postinstall.sh",
        "updated_preinstall.sh",
        "updated_postinstall.sh",
    ]
    for script in script_names:
        local_path = str(test_datadir / script)
        s3_bucket.upload_file(local_path, f"scripts/{script}")

    # Cluster created from the initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name, bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Re-apply the very same configuration: the update command is expected to succeed
    cluster.update(str(init_config_file), force_update="true")

    # Remote executors used throughout the test
    command_executor = RemoteCommandExecutor(cluster)
    slurm = SlurmCommands(command_executor)

    # Shared directory where scripts drop their results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    # Expected layout before the update
    initial_queue1 = {
        "compute_resources": {
            "queue1-i1": {
                "instance_type": "c5.xlarge",
                "expected_running_instances": 1,
                "expected_power_saved_instances": 1,
                "enable_efa": False,
                "disable_hyperthreading": False,
            },
            "queue1-i2": {
                "instance_type": "t2.micro",
                "expected_running_instances": 1,
                "expected_power_saved_instances": 9,
                "enable_efa": False,
                "disable_hyperthreading": False,
            },
        },
        "compute_type": "ondemand",
    }
    initial_queue2 = {
        "compute_resources": {
            "queue2-i1": {
                "instance_type": "c5n.18xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "enable_efa": False,
                "disable_hyperthreading": False,
            }
        },
        "compute_type": "ondemand",
    }
    initial_queues_config = {"queue1": initial_queue1, "queue2": initial_queue2}

    _assert_scheduler_nodes(queues_config=initial_queues_config, slurm_commands=slurm)
    _assert_launch_templates_config(
        queues_config=initial_queues_config,
        cluster_name=cluster.name,
        region=region,
    )

    # Verify the original pre/post-install scripts on the first queue1 compute node
    initial_compute_nodes = slurm.get_compute_nodes(filter_by_partition="queue1")
    first_initial_node = initial_compute_nodes[0]
    _check_script(command_executor, slurm, first_initial_node, "preinstall", "QWE")
    _check_script(command_executor, slurm, first_initial_node, "postinstall", "RTY")

    # A long-running job lets us verify that jobs survive a queue size update
    submission = slurm.submit_command("sleep infinity", constraint="static&c5.xlarge")
    job_id = slurm.assert_job_submitted(submission.stdout)

    # Apply the new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Expected nodes after the update:
    #   queue1-st-c5xlarge-1
    #   queue1-st-c5xlarge-2
    assert_initial_conditions(slurm, 2, 0, partition="queue1")

    # Expected layout after the update
    updated_queue1 = {
        "compute_resources": {
            "queue1-i1": {
                "instance_type": "c5.xlarge",
                "expected_running_instances": 2,
                "expected_power_saved_instances": 2,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
            "queue1-i2": {
                "instance_type": "c5.2xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
            "queue1-i3": {
                "instance_type": "t2.micro",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
        },
        "compute_type": "spot",
    }
    updated_queue2 = {
        "compute_resources": {
            "queue2-i1": {
                "instance_type": "c5n.18xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 1,
                "enable_efa": True,
                "disable_hyperthreading": True,
            }
        },
        "compute_type": "ondemand",
        "networking": {"placement_group": {"enabled": False}},
    }
    updated_queue3 = {
        "compute_resources": {
            "queue3-i1": {
                "instance_type": "c5n.18xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": True,
                "enable_efa": True,
            },
            "queue3-i2": {
                "instance_type": "t2.xlarge",
                "expected_running_instances": 0,
                "expected_power_saved_instances": 10,
                "disable_hyperthreading": False,
                "enable_efa": False,
            },
        },
        "compute_type": "ondemand",
        "networking": {"placement_group": {"enabled": False}},
    }
    updated_queues_config = {
        "queue1": updated_queue1,
        "queue2": updated_queue2,
        "queue3": updated_queue3,
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config, slurm_commands=slurm)
    _assert_launch_templates_config(
        queues_config=updated_queues_config,
        cluster_name=cluster.name,
        region=region,
    )

    # Load the updated configuration to derive the values checked below
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # New S3 resources
    read_only_resources = get_policy_resources(updated_config, enable_write_access=False)
    check_s3_read_resource(region, cluster, read_only_resources)
    read_write_resources = get_policy_resources(updated_config, enable_write_access=True)
    check_s3_read_write_resource(region, cluster, read_write_resources)

    # New additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # The job submitted before the update must still be running
    assert_that(slurm.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 to exercise the updated pre/post install
    # scripts and the extra json update: add a new dynamic t2.micro node (queue1-i3)
    new_compute_node = _add_compute_nodes(slurm, "queue1", "t2.micro")

    assert_that(len(new_compute_node)).is_equal_to(1)
    added_node = new_compute_node[0]
    _check_script(command_executor, slurm, added_node, "updated_preinstall", "ABC")
    _check_script(command_executor, slurm, added_node, "updated_postinstall", "DEF")

    # New extra json must be visible on the freshly added node
    _check_extra_json(command_executor, slurm, added_node, "test_value")