Example #1
def test_update(instance, region, pcluster_config_reader, clusters_factory,
                test_datadir):
    """
    Test 'pcluster update' command.

    Grouped all tests in a single function so that the cluster can be reused for all of them.
    """
    s3_arn = "arn:aws:s3:::fake_bucket/*"
    init_config = PClusterConfig(
        max_queue_size=5,
        compute_instance_type=instance,
        compute_root_volume_size=30,
        s3_read_resource=s3_arn,
        s3_read_write_resource=s3_arn,
    )
    cluster = _init_cluster(clusters_factory, pcluster_config_reader,
                            init_config)
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    _verify_initialization(command_executor, slurm_commands, region,
                           test_datadir, cluster, init_config)

    s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*"
    updated_config = PClusterConfig(
        max_queue_size=10,
        compute_instance_type="c4.xlarge",
        compute_root_volume_size=40,
        s3_read_resource=s3_arn_updated,
        s3_read_write_resource=s3_arn_updated,
    )
    _update_cluster(cluster, updated_config)

    # verify updated parameters
    _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
    _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
    _test_s3_read_write_resource(region, cluster,
                                 updated_config.s3_read_write_resource)

    # verify params that are NOT updated in OLD compute nodes
    compute_nodes = slurm_commands.get_compute_nodes()
    _test_compute_instance_type(region, cluster.cfn_name,
                                init_config.compute_instance_type,
                                compute_nodes[0])
    _test_compute_root_volume_size(command_executor, slurm_commands,
                                   test_datadir,
                                   init_config.compute_root_volume_size,
                                   compute_nodes[0])

    # add compute nodes and verify updated params in NEW compute nodes
    new_compute_nodes = _add_compute_nodes(slurm_commands)
    _test_compute_instance_type(region, cluster.cfn_name,
                                updated_config.compute_instance_type,
                                new_compute_nodes[0])
    _test_compute_root_volume_size(command_executor, slurm_commands,
                                   test_datadir,
                                   updated_config.compute_root_volume_size,
                                   new_compute_nodes[0])
Example #2
import boto3
import yaml
from assertpy import assert_that

# RemoteCommandExecutor, SlurmCommands and the _-prefixed helpers used below are defined in the
# surrounding integration-test module; the function arguments are pytest fixtures.


def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory,
                      clusters_factory, test_datadir):
    """Test 'pcluster update' on a Slurm cluster: queues, install scripts, S3/IAM policies and volumes."""
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in [
            "preinstall.sh", "postinstall.sh", "updated_preinstall.sh",
            "updated_postinstall.sh"
    ]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name,
                                              bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if force update is not used
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Submit a job in queue1 to verify original pre/post-install script execution
    initial_compute_nodes = slurm_commands.get_compute_nodes(
        filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Expected running nodes in queue1 after the update:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance in queue1 and verify updated pre/post-install script execution and the extra_json update
    # Add a new dynamic node t2.micro to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")

    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_postinstall", "DEF")

    # Check new extra_json
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0],
                      "test_value")