Example #1
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
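Examples #1, #2 and #12 rely on a _get_job_info helper that is not shown in this listing. A minimal sketch of what it could look like, assuming (as in the examples) that run_remote_command returns an object with a stdout attribute and that scontrol is available on the head node; the suite's real helper may differ:

def _get_job_info(remote_command_executor, job_id):
    """Hypothetical sketch: return the single-line scontrol description of a job."""
    return remote_command_executor.run_remote_command(
        "scontrol show jobs -o {0}".format(job_id)
    ).stdout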
Example #2
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime, max_queue_size):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(_get_job_info(
        remote_command_executor,
        dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Assert scheduler configuration is correct
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    assert_that(
        _retrieve_slurm_compute_nodes_from_config(
            remote_command_executor)).is_empty()
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)
Example #3
def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size):
    logging.info("Testing dummy nodes are automatically reconfigured based on actual compute nodes")
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1)
Example #4
def test_update(instance, region, pcluster_config_reader, clusters_factory,
                test_datadir):
    """
    Test 'pcluster update' command.

    Grouped all tests in a single function so that the cluster can be reused for all of them.
    """
    s3_arn = "arn:aws:s3:::fake_bucket/*"
    init_config = PClusterConfig(
        max_queue_size=5,
        compute_instance_type=instance,
        compute_root_volume_size=30,
        s3_read_resource=s3_arn,
        s3_read_write_resource=s3_arn,
    )
    cluster = _init_cluster(clusters_factory, pcluster_config_reader,
                            init_config)
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    _verify_initialization(command_executor, slurm_commands, region,
                           test_datadir, cluster, init_config)

    s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*"
    updated_config = PClusterConfig(
        max_queue_size=10,
        compute_instance_type="c4.xlarge",
        compute_root_volume_size=40,
        s3_read_resource=s3_arn_updated,
        s3_read_write_resource=s3_arn_updated,
    )
    _update_cluster(cluster, updated_config)

    # verify updated parameters
    _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
    _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
    _test_s3_read_write_resource(region, cluster,
                                 updated_config.s3_read_write_resource)

    # verify params that are NOT updated in OLD compute nodes
    compute_nodes = slurm_commands.get_compute_nodes()
    _test_compute_instance_type(region, cluster.cfn_name,
                                init_config.compute_instance_type,
                                compute_nodes[0])
    _test_compute_root_volume_size(command_executor, slurm_commands,
                                   test_datadir,
                                   init_config.compute_root_volume_size,
                                   compute_nodes[0])
    # add compute nodes and verify updated params in NEW compute nodes
    new_compute_nodes = _add_compute_nodes(slurm_commands)
    _test_compute_instance_type(region, cluster.cfn_name,
                                updated_config.compute_instance_type,
                                new_compute_nodes[0])
    _test_compute_root_volume_size(command_executor, slurm_commands,
                                   test_datadir,
                                   updated_config.compute_root_volume_size,
                                   new_compute_nodes[0])
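_add_compute_nodes is referenced above but not defined in this listing. A hedged sketch of one way it could behave, reusing only scheduler-commands methods that appear elsewhere in these examples (get_compute_nodes, submit_command, assert_job_submitted, wait_job_completed); the real helper may differ:

def _add_compute_nodes(slurm_commands, number_of_nodes=1):
    """Hypothetical sketch: scale the cluster up and return the names of the new compute nodes."""
    initial_nodes = set(slurm_commands.get_compute_nodes())
    # Ask for more nodes than are currently available so the scheduler has to scale up.
    result = slurm_commands.submit_command("sleep 1", nodes=len(initial_nodes) + number_of_nodes)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    return [node for node in slurm_commands.get_compute_nodes() if node not in initial_nodes]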
Example #5
def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)
Example #6
def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -a 1-5")
    array_job_id = slurm_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -c 3 -n 2")
    parallel_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, array_job_id)
    _assert_job_completed(remote_command_executor, parallel_job_id)
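_assert_job_completed is used in Examples #2 and #6 without a definition. A plausible sketch, assuming the final job state can still be read back with scontrol after completion (an assumption, not the suite's verbatim helper):

from assertpy import assert_that


def _assert_job_completed(remote_command_executor, job_id):
    """Hypothetical sketch: assert that a finished job reports JobState=COMPLETED."""
    job_info = remote_command_executor.run_remote_command(
        "scontrol show jobs -o {0}".format(job_id)
    ).stdout
    assert_that(job_info).contains("JobState=COMPLETED")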
Example #7
def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1000", nodes=max_queue_size + 1)
    max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5")
    max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check we are not scaling
    time.sleep(60)
    assert_asg_desired_capacity(region, asg_name, expected=0)
    assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains(
        "JobState=PENDING Reason=PartitionNodeLimit"
    )
    assert_that(_get_job_info(remote_command_executor, max_cpu_job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
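Example #7 delegates the capacity check to assert_asg_desired_capacity. A sketch consistent with the inline boto3 code shown in Examples #5 and #8 (the helper's exact signature is an assumption):

import boto3
from assertpy import assert_that


def assert_asg_desired_capacity(region, asg_name, expected):
    """Assert that the Auto Scaling group's DesiredCapacity matches the expected value."""
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get(
        "AutoScalingGroups"
    )[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(expected)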
Example #8
def _test_cluster_limits(remote_command_executor, max_queue_size, region,
                         asg_name):
    logging.info(
        "Testing cluster doesn't scale when job requires a capacity that is higher than the max available"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(
        remote_command_executor,
        job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(
        AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)
Example #9
def test_slurm(region, pcluster_config_reader, clusters_factory, test_datadir,
               architecture):
    """
    Test all AWS Slurm related features.

    Grouped all tests in a single function so that the cluster can be reused for all of them.
    """
    scaledown_idletime = 3
    # When IntelMPI is supported, spin up 2 compute nodes at cluster creation so _test_mpi_job_termination can run
    # Otherwise do not spin up compute nodes and go straight to the regular slurm tests
    supports_impi = architecture == "x86_64"
    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(remote_command_executor)
    _test_slurm_version(remote_command_executor)

    if supports_impi:
        _test_mpi_job_termination(remote_command_executor, test_datadir)

    _assert_no_node_in_cluster(region, cluster.cfn_name, slurm_commands)
    _test_job_dependencies(slurm_commands, region, cluster.cfn_name,
                           scaledown_idletime)
    _test_job_arrays_and_parallel_jobs(
        slurm_commands,
        region,
        cluster.cfn_name,
        scaledown_idletime,
        partition="ondemand",
        instance_type="c5.xlarge",
        cpu_per_instance=4,
    )
    _gpu_resource_check(slurm_commands,
                        partition="gpu",
                        instance_type="g3.8xlarge")
    _test_cluster_limits(slurm_commands,
                         partition="ondemand",
                         instance_type="c5.xlarge",
                         max_count=5,
                         cpu_per_instance=4)
    _test_cluster_gpu_limits(
        slurm_commands,
        partition="gpu",
        instance_type="g3.8xlarge",
        max_count=5,
        gpu_per_instance=2,
        gpu_type="m60",
    )
    # Test torque command wrapper
    _test_torque_job_submit(remote_command_executor, test_datadir)
    assert_no_errors_in_logs(remote_command_executor, "slurm")
Example #10
def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size):
    logging.info("Testing dummy nodes are automatically reconfigured based on actual compute nodes")
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1)
Example #11
def test_replace_compute_on_failure(region, pcluster_config_reader, clusters_factory, s3_bucket_factory, test_datadir):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    """
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")
    cluster_config = pcluster_config_reader(bucket_name=bucket_name)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # submit a job to spin up a compute node that will fail due to post_install script
    slurm_commands = SlurmCommands(remote_command_executor)
    slurm_commands.submit_command("sleep 1")
    instance_id = wait_compute_log(remote_command_executor)

    # extract logs and check one of them
    _assert_compute_logs(remote_command_executor, instance_id)

    # check that instance got already replaced or is marked as Unhealthy
    assert_instance_replaced_or_terminating(instance_id, region)
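wait_compute_log and _assert_compute_logs are not shown here. As a rough sketch of the latter, under the assumption that the failing compute node uploads an <instance_id>.tar.gz log archive to a shared directory (the /home/logs/compute/ path is an assumption for illustration):

def _assert_compute_logs(remote_command_executor, instance_id):
    """Hypothetical sketch: verify the failed node's log archive landed in the shared directory and is readable."""
    remote_command_executor.run_remote_command(
        "tar -tzf /home/logs/compute/{0}.tar.gz".format(instance_id)
    )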
Example #12
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(_get_job_info(
        remote_command_executor,
        dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
Example #13
def test_cluster_in_no_internet_subnet(
    region,
    scheduler,
    pcluster_config_reader,
    vpc_stack,
    s3_bucket_factory,
    clusters_factory,
    test_datadir,
    architecture,
    os,
    mpi_variants,
    bastion_instance,
):
    """This test creates a cluster in a subnet with no internet, run osu latency and checks that no failures occur."""
    bucket_name = s3_bucket_factory()
    _upload_pre_install_script(bucket_name, test_datadir)

    vpc_default_security_group_id = get_default_vpc_security_group(
        vpc_stack.cfn_outputs["VpcId"], region)
    cluster_config = pcluster_config_reader(
        vpc_default_security_group_id=vpc_default_security_group_id,
        bucket_name=bucket_name,
        architecture=architecture)
    cluster = clusters_factory(cluster_config)

    logging.info("Checking cluster has one static node")
    assert_that(len(get_compute_nodes_instance_ids(cluster.cfn_name,
                                                   region))).is_equal_to(1)

    remote_command_executor = RemoteCommandExecutor(cluster,
                                                    bastion=bastion_instance)
    slurm_commands = SlurmCommands(remote_command_executor)

    _check_no_internet_access(remote_command_executor)
    _check_hostname(remote_command_executor)
    _run_mpi_jobs(mpi_variants, remote_command_executor, test_datadir,
                  slurm_commands, cluster, region)
    utils.check_pcluster_list_cluster_log_streams(cluster, os)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
    logging.info(
        "Checking compute node is scaled down after scaledown idle time")
    wait_for_num_instances_in_cluster(cluster.cfn_name, region, 1)
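_check_no_internet_access is also referenced without a definition. One hedged way to implement it, assuming run_remote_command accepts raise_on_error=False and returns a result with a failed flag (both assumptions here):

import logging

from assertpy import assert_that


def _check_no_internet_access(remote_command_executor):
    """Hypothetical sketch: probing a public endpoint from the head node should fail in a no-internet subnet."""
    logging.info("Checking cluster has no internet access")
    result = remote_command_executor.run_remote_command(
        "curl --connect-timeout 10 -I https://aws.amazon.com", raise_on_error=False
    )
    assert_that(result.failed).is_true()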
Example #14
def _gpu_resource_check(remote_command_executor):
    """Test GPU related resources are correctly allocated."""
    logging.info("Testing number of GPU/CPU resources allocated to job")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command(
        "sbatch -G 1 --cpus-per-gpu 5 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerJob=gpu:1", "CpusPerTres=gpu:5")

    result = remote_command_executor.run_remote_command(
        "sbatch --gres=gpu:2 --cpus-per-gpu 6 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerNode=gpu:2", "CpusPerTres=gpu:6")
Example #15
def _test_mpi_job_termination(remote_command_executor, test_datadir):
    """
    Test canceling mpirun job will not leave stray processes.

    IntelMPI is known to leave stray processes after job termination if slurm process tracking is not setup correctly,
    i.e. using ProctrackType=proctrack/pgid
    Test IntelMPI script to make sure no stray processes after the job is cancelled
    This bug cannot be reproduced using OpenMPI
    """
    logging.info(
        "Testing no stray process left behind after mpirun job is terminated")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial condition
    assert_that(slurm_commands.compute_nodes_count()).is_equal_to(2)

    # Submit mpi_job, which runs Intel MPI benchmarks with intelmpi
    # Leaving 1 vcpu on each node idle so that the process check job can run while mpi_job is running
    result = slurm_commands.submit_script(str(test_datadir / "mpi_job.sh"))
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check that mpi processes are started
    _assert_job_state(slurm_commands, job_id, job_state="RUNNING")
    _check_mpi_process(remote_command_executor,
                       slurm_commands,
                       test_datadir,
                       num_nodes=2,
                       after_completion=False)
    slurm_commands.cancel_job(job_id)

    # Make sure mpirun job is cancelled
    _assert_job_state(slurm_commands, job_id, job_state="CANCELLED")

    # Check that mpi processes are terminated
    _check_mpi_process(remote_command_executor,
                       slurm_commands,
                       test_datadir,
                       num_nodes=2,
                       after_completion=True)
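_assert_job_state has to poll, because the job needs a moment to reach RUNNING (or CANCELLED) after the submit/cancel command returns. A sketch using the retrying decorator; the polling interval and timeout are assumptions:

from assertpy import assert_that
from retrying import retry


@retry(wait_fixed=10 * 1000, stop_max_delay=5 * 60 * 1000)  # poll every 10s, give up after 5 minutes
def _assert_job_state(slurm_commands, job_id, job_state):
    """Hypothetical sketch: keep re-checking the job until it reports the expected JobState."""
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState={0}".format(job_state))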
Example #16
def _test_dynamic_dummy_nodes(remote_command_executor,
                              region,
                              asg_name,
                              max_queue_size,
                              slots=4,
                              gpus=0):
    logging.info(
        "Testing dummy nodes are automatically reconfigured based on actual compute nodes"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)

    _assert_dummy_nodes(remote_command_executor, max_queue_size, slots, gpus)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1, slots,
                        gpus)
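Examples #16 and #18 assert their initial conditions with _assert_asg_has_no_node and _assert_no_nodes_in_scheduler, which are not shown. Plausible sketches, built only from calls that appear elsewhere in this listing (describe_auto_scaling_groups in Examples #5/#8, compute_nodes_count in Example #15):

import boto3
from assertpy import assert_that


def _assert_asg_has_no_node(region, asg_name):
    """Hypothetical sketch: the ASG backing the cluster should have zero desired capacity before the test."""
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get(
        "AutoScalingGroups"
    )[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)


def _assert_no_nodes_in_scheduler(slurm_commands):
    """Hypothetical sketch: the scheduler should report no compute nodes before the test."""
    assert_that(slurm_commands.compute_nodes_count()).is_equal_to(0)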
Example #17
def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory,
                      clusters_factory, test_datadir):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in [
            "preinstall.sh", "postinstall.sh", "updated_preinstall.sh",
            "updated_postinstall.sh"
    ]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name,
                                              bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if not using force update
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # submit job in queue1 to verify original pre/post-install script execution
    initial_compute_nodes = slurm_commands.get_compute_nodes(
        filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Here is the expected list of nodes in the cluster:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 and test updated pre/post install script execution and extra json update
    # Add a new dynamic node t2.micro to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")

    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_postinstall", "DEF")

    # check new extra json
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0],
                      "test_value")
Example #18
def _gpu_test_scaleup(remote_command_executor, region, asg_name, stack_name,
                      scaledown_idletime, num_gpus):
    """Test cluster is scaling up correctly and GPU jobs are not aborted on slurmctld restart."""
    logging.info("Testing cluster scales correctly with GPU jobs")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)
    # g3.8xlarge has 32 vcpus and 2 GPUs, hardcoding tests for g3.8xlarge
    job_ids = []

    # sbatch --wrap 'sleep 10' -G 3
    result = slurm_commands.submit_command(command="sleep 10",
                                           nodes=-1,
                                           other_options="-G 3")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}]

    # sbatch --wrap 'sleep 10' --cpus-per-gpu=10 --gpus-per-task=1
    result = slurm_commands.submit_command(
        command="sleep 10",
        nodes=-1,
        other_options="--cpus-per-gpu=10 --gpus-per-task=1")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}]

    # sbatch --wrap 'sleep 10' -N 1 --gpus-per-node=1 -c 23 -n 1
    result = slurm_commands.submit_command(
        command="sleep 10",
        nodes=1,
        slots=1,
        other_options="--gpus-per-node=1 -c 23")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:19, gpu:1}]

    # sbatch --wrap 'sleep 10' -c 31 -n 1
    result = slurm_commands.submit_command(command="sleep 10",
                                           nodes=-1,
                                           slots=1,
                                           other_options="-c 31")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:0, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:19, gpu:1}]

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=4,
                          expected_final=0)
    # Assert jobs were completed
    for job_id in job_ids:
        slurm_commands.assert_job_succeeded(job_id)
Example #19
def _gpu_test_cluster_limits(remote_command_executor, max_queue_size,
                             num_gpus):
    """Test edge cases regarding the number of GPUs."""
    logging.info(
        "Testing scheduler does not accept jobs when requesting for more GPUs than available"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    # Expect commands below to fail with exit 1
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -N 1 --wrap='sleep 1' --gpus-per-task {0}".format(num_gpus +
                                                                  1))
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -N 1 --wrap='sleep 1' --gres=gpu:{0}".format(num_gpus + 1))
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -G {0} --wrap='sleep 1'".format(num_gpus * max_queue_size + 1))

    # Commands below should be correctly submitted
    result = slurm_commands.submit_command(
        "sleep 1",
        nodes=1,
        slots=num_gpus,
        other_options="-G {0} --gpus-per-task=1".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    result = slurm_commands.submit_command(
        "sleep 1", nodes=1, other_options="--gres=gpu:{0}".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    # Submit job without '-N' option(nodes=-1)
    result = slurm_commands.submit_command(
        "sleep 1",
        nodes=-1,
        other_options="-G {0} --gpus-per-node={1}".format(
            num_gpus * max_queue_size, num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
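_submit_and_assert_job_rejected_node_config is expected to see the sbatch submissions above fail, per the "exit 1" comment. A hedged sketch, assuming run_remote_command accepts raise_on_error=False and that sbatch reports the standard "Requested node configuration is not available" error for these cases (the exact message is an assumption):

from assertpy import assert_that


def _submit_and_assert_job_rejected_node_config(remote_command_executor, submit_command):
    """Hypothetical sketch: the submission should be rejected because no node can satisfy the request."""
    result = remote_command_executor.run_remote_command(submit_command, raise_on_error=False)
    assert_that(result.stdout + result.stderr).contains("Requested node configuration is not available")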
Example #20
def test_update_hit(region, scheduler, pcluster_config_reader,
                    clusters_factory, test_datadir, s3_bucket_factory):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "preinstall.sh"),
                       "scripts/preinstall.sh")
    bucket.upload_file(str(test_datadir / "postinstall.sh"),
                       "scripts/postinstall.sh")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if not using force update
    cluster.config_file = str(init_config_file)
    cluster.update(force=True)

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1_i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.ini",
        bucket=bucket_name,
        resource_bucket=bucket_name)
    cluster.config_file = str(updated_config_file)
    cluster.update()

    # Here is the expected list of nodes. Note that queue1-dy-t2micro-1 comes from the initial_count set when creating
    # the cluster:
    # queue1-dy-t2micro-1
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 1, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances":
                    1,  # This comes from initial_count before update
                    "expected_power_saved_instances": 9,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                },
            },
            "compute_type": "ondemand",
        },
        "queue3": {
            "compute_resources": {
                "queue3_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3_i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    updated_config = configparser.ConfigParser()
    updated_config.read(updated_config_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_resource"))
    check_s3_read_write_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_write_resource"))

    # Check new Additional IAM policies
    _check_role_attached_policy(
        region, cluster,
        updated_config.get("cluster default", "additional_iam_policies"))

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")