def test_update(instance, region, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Test 'pcluster update' command.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    s3_arn = "arn:aws:s3:::fake_bucket/*"
    init_config = PClusterConfig(
        max_queue_size=5,
        compute_instance_type=instance,
        compute_root_volume_size=30,
        s3_read_resource=s3_arn,
        s3_read_write_resource=s3_arn,
    )
    cluster = _init_cluster(clusters_factory, pcluster_config_reader, init_config)
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    _verify_initialization(command_executor, slurm_commands, region, test_datadir, cluster, init_config)

    s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*"
    updated_config = PClusterConfig(
        max_queue_size=10,
        compute_instance_type="c4.xlarge",
        compute_root_volume_size=40,
        s3_read_resource=s3_arn_updated,
        s3_read_write_resource=s3_arn_updated,
    )
    _update_cluster(cluster, updated_config)

    # verify updated parameters
    _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
    _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
    _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource)

    # verify params that are NOT updated in OLD compute nodes
    compute_nodes = slurm_commands.get_compute_nodes()
    _test_compute_instance_type(region, cluster.cfn_name, init_config.compute_instance_type, compute_nodes[0])
    _test_compute_root_volume_size(
        command_executor, slurm_commands, test_datadir, init_config.compute_root_volume_size, compute_nodes[0]
    )

    # add compute nodes and verify updated params in NEW compute nodes
    new_compute_nodes = _add_compute_nodes(slurm_commands)
    _test_compute_instance_type(region, cluster.cfn_name, updated_config.compute_instance_type, new_compute_nodes[0])
    _test_compute_root_volume_size(
        command_executor, slurm_commands, test_datadir, updated_config.compute_root_volume_size, new_compute_nodes[0]
    )
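
# NOTE: a minimal, hypothetical sketch of what a helper like _add_compute_nodes (used
# above) could look like; it is NOT the module's actual implementation. The idea: ask
# the scheduler for more nodes than are currently running so that new instances get
# launched, then return only the nodes that appeared after the scale-out. The 'nodes'
# keyword of submit_command and the wait_job_completed method are assumed to be
# available in the SlurmCommands test wrapper.
def _add_compute_nodes_sketch(slurm_commands, number_of_nodes=1):
    initial_nodes = set(slurm_commands.get_compute_nodes())
    # Request more nodes than are currently up to force a scale-out.
    result = slurm_commands.submit_command("sleep 1", nodes=len(initial_nodes) + number_of_nodes)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    # Only the newly launched nodes are returned.
    return [node for node in slurm_commands.get_compute_nodes() if node not in initial_nodes]
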
def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in ["preinstall.sh", "postinstall.sh", "updated_preinstall.sh", "updated_postinstall.sh"]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name, bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update the cluster with the same configuration; the command should not result in any error
    # even if not using force update
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config, slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config, cluster_name=cluster.name, region=region)

    # Submit a job in queue1 to verify the original pre/post-install script execution
    initial_compute_nodes = slurm_commands.get_compute_nodes(filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0], "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0], "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity", constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Here is the expected list of nodes in the cluster:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {"placement_group": {"enabled": False}},
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {"placement_group": {"enabled": False}},
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config, slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config, cluster_name=cluster.name, region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(region, cluster, get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(region, cluster, get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 and test updated pre/post install script execution and extra_json update
    # Add a new dynamic node t2.micro to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")
    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0], "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0], "updated_postinstall", "DEF")

    # Check new extra_json
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0], "test_value")
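
# NOTE: a minimal, hypothetical sketch of what a check like _check_script (used above)
# could look like; it is NOT the module's actual implementation. Assumptions: the
# pre/post-install scripts write a marker file named <host>_<script_name> under
# /shared/script_results (created at the top of test_update_slurm) containing the
# argument they were invoked with, and submit_command accepts a 'host' keyword to pin
# the job to a specific compute node.
def _check_script_sketch(command_executor, slurm_commands, host, script_name, expected_arg):
    marker = f"/shared/script_results/{host}_{script_name}"
    # Run a trivial job pinned to the target node; it only succeeds if the marker file
    # was written by the script on that node.
    result = slurm_commands.submit_command(f"test -f {marker}", host=host)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    # Read the marker file through the head node and verify the expected argument.
    output = command_executor.run_remote_command(f"cat {marker}").stdout
    assert_that(output).contains(expected_arg)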