Example #1
def test_head_node_stop(scheduler, pcluster_config_reader, clusters_factory,
                        region, os):
    head_ephemeral_mount = "/scratch_head"
    compute_ephemeral_mount = "/scratch_compute"
    folder = "myFolder"
    filename = "myFile"
    cluster_config = pcluster_config_reader(
        head_ephemeral_mount=head_ephemeral_mount,
        compute_ephemeral_mount=compute_ephemeral_mount)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    _test_head_ephemeral_setup(remote_command_executor, head_ephemeral_mount,
                               folder, filename)

    # reboot headnode (instance store is preserved)
    reboot_head_node(cluster, remote_command_executor)
    _test_head_ephemeral_preserved(remote_command_executor,
                                   head_ephemeral_mount, folder, filename)

    # stop/start headnode (instance store is recreated)
    restart_head_node(cluster)
    # RemoteCommandExecutor needs to be re-initialized because HeadNode changed public IP
    new_remote_command_executor = RemoteCommandExecutor(cluster)
    _test_head_ephemeral_recreated(new_remote_command_executor,
                                   head_ephemeral_mount, folder, filename)
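# A minimal, hypothetical sketch of the kind of check the _test_head_ephemeral_* helpers
# above could perform: look for the marker file under the ephemeral mount. Only
# RemoteCommandExecutor.run_remote_command and assertpy are taken from this listing;
# _check_marker_file and its signature are assumptions for illustration.
from assertpy import assert_that


def _check_marker_file(remote_command_executor, mount_dir, folder, filename, should_exist):
    # The shell fallback keeps the exit code at 0 so the remote command itself never fails.
    result = remote_command_executor.run_remote_command(
        "test -e {0}/{1}/{2} && echo present || echo absent".format(mount_dir, folder, filename)
    )
    assert_that(result.stdout.strip()).is_equal_to("present" if should_exist else "absent")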
Example #2
def test_torque(region, pcluster_config_reader, clusters_factory):
    """
    Test all AWS Torque related features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    scaledown_idletime = 2
    max_queue_size = 5
    max_slots = 4
    initial_queue_size = 3  # in order to speed-up _test_jobs_executed_concurrently test
    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size, initial_queue_size=initial_queue_size
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    _test_torque_version(remote_command_executor)
    _test_jobs_executed_concurrently(remote_command_executor, max_slots)
    _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime)
    _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime)
    _test_job_arrays_and_parallel_jobs(remote_command_executor, region, cluster.cfn_name, scaledown_idletime, max_slots)
    _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_slots, region, cluster.asg)
    assert_overscaling_when_job_submitted_during_scaledown(
        remote_command_executor, "torque", region, cluster.cfn_name, scaledown_idletime
    )

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #3
def test_create_wrong_pcluster_version(region, pcluster_config_reader,
                                       clusters_factory,
                                       pcluster_ami_without_standard_naming):
    """Test error message when AMI provided was baked by a pcluster whose version is different from current version"""
    current_version = get_installed_parallelcluster_version()
    wrong_version = "2.8.1"
    logging.info("Asserting wrong_version is different from current_version")
    assert_that(current_version != wrong_version).is_true()
    # Retrieve an AMI without 'aws-parallelcluster-<version>' in its name.
    # This lets us bypass the version check in the CLI and test the version check of the .bootstrapped file in the Cookbook.
    wrong_ami = pcluster_ami_without_standard_naming(wrong_version)
    cluster_config = pcluster_config_reader(custom_ami=wrong_ami)
    cluster = clusters_factory(cluster_config, raise_on_error=False)

    _assert_head_node_is_running(region, cluster)
    remote_command_executor = RemoteCommandExecutor(cluster)

    logging.info("Verifying error in logs")
    assert_errors_in_logs(
        remote_command_executor,
        ["/var/log/cloud-init-output.log"],
        [
            "error_exit",
            fr"AMI was created.+{wrong_version}.+is.+used.+{current_version}"
        ],
    )
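# Hypothetical sketch of what assert_errors_in_logs might boil down to (the helper itself
# is not shown in this listing): read each log file over SSH and require every expected
# error pattern to appear. The name and signature below are assumptions.
import re

from assertpy import assert_that


def _grep_remote_logs(remote_command_executor, log_files, patterns):
    for log_file in log_files:
        content = remote_command_executor.run_remote_command("sudo cat {0}".format(log_file)).stdout
        for pattern in patterns:
            # every expected pattern must be present in the log
            assert_that(re.search(pattern, content)).is_not_none()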
Example #4
def test_create_wrong_os(region, os, pcluster_config_reader, clusters_factory,
                         architecture):
    """Test error message when os provide is different from the os of custom AMI"""
    # ubuntu1804 is specified in the config file but an AMI of centos7 is provided
    wrong_os = "centos7"
    logging.info("Asserting os fixture is different from wrong_os variable")
    assert_that(os != wrong_os).is_true()
    custom_ami = retrieve_latest_ami(region,
                                     wrong_os,
                                     ami_type="pcluster",
                                     architecture=architecture)
    cluster_config = pcluster_config_reader(custom_ami=custom_ami)
    cluster = clusters_factory(cluster_config, raise_on_error=False)

    _assert_head_node_is_running(region, cluster)
    username = get_username_for_os(wrong_os)
    remote_command_executor = RemoteCommandExecutor(cluster, username=username)

    logging.info("Verifying error in logs")
    assert_errors_in_logs(
        remote_command_executor,
        ["/var/log/cfn-init.log"],
        [
            "RuntimeError",
            fr"custom AMI.+{wrong_os}.+base.+os.+config file.+{os}"
        ],
    )
Example #5
def test_slurm_cli_commands(
    request, scheduler, region, os, pcluster_config_reader, clusters_factory, s3_bucket_factory
):
    """Test pcluster cli commands are working."""
    # Use long scale down idle time so we know nodes are terminated by pcluster stop
    cluster_config = pcluster_config_reader(scaledown_idletime=60)
    # Using a custom AMI not tagged by pcluster will generate a warning
    custom_ami = retrieve_latest_ami(region, os, ami_type="official", architecture="x86_64")
    config_file = "pcluster.config.with.warnings.yaml"
    cluster_config_with_warning = pcluster_config_reader(config_file=config_file, custom_ami=custom_ami)

    # Test below is not compatible with `--cluster` flag. Therefore, skip it if the flag is provided.
    if not request.config.getoption("cluster"):
        _test_create_with_warnings(cluster_config_with_warning, clusters_factory)

    cluster = _test_create_cluster(clusters_factory, cluster_config, request)
    _test_describe_cluster(cluster)
    _test_list_cluster(cluster.name, "CREATE_COMPLETE")

    _test_update_with_warnings(cluster_config_with_warning, cluster)
    check_status(cluster, "CREATE_COMPLETE", "running", "RUNNING")

    filters = [{}, {"node_type": "HeadNode"}, {"node_type": "Compute"}, {"queue_name": "ondemand1"}]
    for filter_ in filters:
        _test_describe_instances(cluster, **filter_)
    _test_pcluster_export_cluster_logs(s3_bucket_factory, cluster)
    check_pcluster_list_cluster_log_streams(cluster, os)
    _test_pcluster_get_cluster_log_events(cluster)
    _test_pcluster_get_cluster_stack_events(cluster)
    _test_pcluster_compute_fleet(cluster, expected_num_nodes=2)

    remote_command_executor = RemoteCommandExecutor(cluster)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Example #6
def test_arm_pl(region, scheduler, instance, os, pcluster_config_reader,
                clusters_factory, test_datadir):
    """Test Arm Performance Library"""
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # arm performance library version and gcc version
    armpl_version = "21.0.0"
    gcc_version = "9.3"

    # loading module armpl/{armpl_version} will load module armpl/gcc-{gcc_version}
    # and armpl/{armpl_version}_gcc-{gcc_version} sequentially
    armpl_module_general_name = f"armpl/{armpl_version}"
    armpl_module_name = f"armpl/{armpl_version}_gcc-{gcc_version}"
    gcc_module_name = f"armpl/gcc-{gcc_version}"
    _test_armpl_examples(
        os,
        remote_command_executor,
        armpl_module_general_name,
        armpl_module_name,
        gcc_module_name,
        armpl_version,
        gcc_version,
    )
Example #7
def test_efa(region, scheduler, instance, os, pcluster_config_reader,
             clusters_factory, test_datadir):
    """
    Test all EFA Features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    _test_efa_installed(scheduler_commands, remote_command_executor)
    _test_mpi(remote_command_executor, slots_per_instance, scheduler, os)
    logging.info("Running on Instances: {0}".format(
        get_compute_nodes_instance_ids(cluster.cfn_name, region)))
    _test_osu_benchmarks("openmpi", remote_command_executor,
                         scheduler_commands, test_datadir, slots_per_instance)
    _test_osu_benchmarks("intelmpi", remote_command_executor,
                         scheduler_commands, test_datadir, slots_per_instance)
    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #8
def test_sge(region, pcluster_config_reader, clusters_factory):
    """
    Test all AWS SGE related features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    scaledown_idletime = 3
    max_queue_size = 5
    max_slots = 4
    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    _test_sge_version(remote_command_executor)
    _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots,
                            region, cluster, scaledown_idletime)
    _test_job_dependencies(remote_command_executor, region, cluster.cfn_name,
                           scaledown_idletime)
    _test_job_arrays_and_parallel_jobs(remote_command_executor, region,
                                       cluster.cfn_name, scaledown_idletime,
                                       max_slots)
    # TODO: _test_dynamic_max_cluster_size

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #9
def test_dynamic_placement_group_in_cluster(region, scheduler,
                                            pcluster_config_reader,
                                            clusters_factory, instance):
    """Test the case when placement_group is set to DYNAMIC. This test is not for awsbatch scheduler."""
    cluster_config = pcluster_config_reader(placement="cluster")
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # For a traditional scheduler, the placement group name can be retrieved from the main stack; for slurm, it can be
    # retrieved from the ComputeFleetHITSubstack
    if scheduler == "slurm":
        placement_group = _get_slurm_placement_group_from_stack(
            cluster, region)
        # for slurm, the placement type can only be compute
        _check_head_node_placement_group(remote_command_executor, region, None)
    else:
        placement_group = utils.retrieve_cfn_resources(
            cluster.cfn_name, region)["DynamicPlacementGroup"]
        _check_head_node_placement_group(remote_command_executor, region,
                                         placement_group)
    # check the placement_group of compute nodes
    _assert_placement_group(cluster, scheduler, region, placement_group, None,
                            instance)

    # need to delete the cluster before deleting placement group
    cluster.delete()
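# Hypothetical sketch of the stack-resource lookup that utils.retrieve_cfn_resources
# appears to perform above: map each logical resource id to its physical id with boto3.
# The real helper's name and signature may differ.
import boto3


def _cfn_resources(stack_name, region):
    cfn = boto3.client("cloudformation", region_name=region)
    resources = cfn.describe_stack_resources(StackName=stack_name)["StackResources"]
    return {res["LogicalResourceId"]: res["PhysicalResourceId"] for res in resources}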
Example #10
def test_ebs_existing(request, vpc_stacks, region, scheduler,
                      pcluster_config_reader, snapshots_factory,
                      clusters_factory):
    logging.info("Testing ebs existing")
    existing_mount_dir = "existing_mount_dir"

    logging.info("Creating volume")

    volume_id = snapshots_factory.create_existing_volume(
        request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)

    logging.info("Existing Volume id: %s" % volume_id)
    cluster_config = pcluster_config_reader(
        volume_id=volume_id, existing_mount_dir=existing_mount_dir)

    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)
    existing_mount_dir = "/" + existing_mount_dir
    _test_ebs_correctly_mounted(remote_command_executor,
                                existing_mount_dir,
                                volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, existing_mount_dir,
                               scheduler_commands)
    # Checks for test data
    result = remote_command_executor.run_remote_command(
        "cat {}/test.txt".format(existing_mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")

    # delete the cluster before detaching the EBS volume
    cluster.delete()
    # check the volume still exists after deleting the cluster
    _assert_volume_exist(volume_id, region)
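# Hypothetical sketch of the _assert_volume_exist check used above, assuming it relies on
# EC2 DescribeVolumes; the helper's actual implementation is not part of this listing.
import boto3

from assertpy import assert_that


def _check_volume_exists(volume_id, region):
    volumes = boto3.client("ec2", region_name=region).describe_volumes(VolumeIds=[volume_id])["Volumes"]
    assert_that(volumes).is_length(1)
    # after cluster deletion the external volume should be detached but still present
    assert_that(volumes[0]["State"]).is_in("available", "in-use")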
Example #11
def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir,
                  caplog, region):
    """
    Test all AWS Batch related features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    # Needed for checks in _assert_compute_instance_type_validation_successful
    caplog.set_level(logging.DEBUG)
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    _assert_compute_instance_type_validation_successful(caplog)
    remote_command_executor = RemoteCommandExecutor(cluster)

    min_vcpus = cluster.config["Scheduling"]["AwsBatchQueues"][0][
        "ComputeResources"][0]["MinvCpus"]
    max_vcpus = cluster.config["Scheduling"]["AwsBatchQueues"][0][
        "ComputeResources"][0]["MaxvCpus"]
    assert_that(get_batch_ce_min_size(cluster.cfn_name,
                                      region)).is_equal_to(int(min_vcpus))
    assert_that(get_batch_ce_max_size(cluster.cfn_name,
                                      region)).is_equal_to(int(max_vcpus))

    # Longer timeout in China regions due to less reliable networking
    timeout = 120 if region.startswith("cn-") else 60
    _test_simple_job_submission(remote_command_executor, test_datadir, timeout)
    _test_array_submission(remote_command_executor)
    _test_mnp_submission(remote_command_executor, test_datadir)
    _test_job_kill(remote_command_executor, timeout)
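# Hypothetical sketch of how the Batch compute environment min/max size could be read with
# boto3, assuming the compute environment name or ARN has already been resolved (for
# example from the cluster's CloudFormation stack); get_batch_ce_min_size and
# get_batch_ce_max_size themselves are not shown in this listing.
import boto3


def _batch_ce_vcpu_bounds(compute_env, region):
    ce = boto3.client("batch", region_name=region).describe_compute_environments(
        computeEnvironments=[compute_env]
    )["computeEnvironments"][0]
    return ce["computeResources"]["minvCpus"], ce["computeResources"]["maxvCpus"]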
Example #12
def test_replace_compute_on_failure(
    region, scheduler, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir
):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    """
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")
    cluster_config = pcluster_config_reader(bucket_name=bucket_name)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # submit a job to spin up a compute node that will fail due to post_install script
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    scheduler_commands.submit_command("sleep 1")
    instance_id = wait_compute_log(remote_command_executor)[0]

    # extract logs and check one of them
    _assert_compute_logs(remote_command_executor, instance_id)

    # check that instance got already replaced or is marked as Unhealthy
    time.sleep(25)  # Instance waits for 10 seconds before terminating to allow logs to propagate to CloudWatch
    assert_instance_replaced_or_terminating(instance_id, region)
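# Hypothetical sketch of assert_instance_replaced_or_terminating, assuming it only checks
# the EC2 instance state; the real assertion may also consult the Auto Scaling group or
# instance health status.
import boto3

from assertpy import assert_that


def _check_instance_replaced_or_terminating(instance_id, region):
    reservations = boto3.client("ec2", region_name=region).describe_instances(
        InstanceIds=[instance_id]
    )["Reservations"]
    state = reservations[0]["Instances"][0]["State"]["Name"]
    assert_that(state).is_in("shutting-down", "terminated")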
Example #13
def test_hit_disable_hyperthreading(
    region, scheduler, instance, os, pcluster_config_reader, clusters_factory, default_threads_per_core
):
    """Test Disable Hyperthreading for HIT clusters."""
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    _test_disable_hyperthreading_settings(
        remote_command_executor,
        scheduler_commands,
        slots_per_instance,
        scheduler,
        hyperthreading_disabled=False,
        partition="ht-enabled",
        default_threads_per_core=default_threads_per_core,
    )
    _test_disable_hyperthreading_settings(
        remote_command_executor,
        scheduler_commands,
        slots_per_instance,
        scheduler,
        hyperthreading_disabled=True,
        partition="ht-disabled",
        default_threads_per_core=default_threads_per_core,
    )

    assert_no_errors_in_logs(remote_command_executor, scheduler)
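# Hypothetical sketch of a head-node hyperthreading check based on `lscpu`; the real
# _test_disable_hyperthreading_settings also validates scheduler-side slot counts, which
# is omitted here.
from assertpy import assert_that


def _check_threads_per_core(remote_command_executor, expected_threads_per_core):
    lscpu_output = remote_command_executor.run_remote_command("lscpu").stdout
    threads_line = next(line for line in lscpu_output.splitlines() if "Thread(s) per core" in line)
    assert_that(int(threads_line.split(":")[1].strip())).is_equal_to(expected_threads_per_core)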
Example #14
def test_mpi(scheduler, region, instance, pcluster_config_reader,
             clusters_factory):
    scaledown_idletime = 3
    max_queue_size = 3
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # This verifies that the job completes correctly
    _test_mpi(
        remote_command_executor,
        slots_per_instance,
        scheduler,
        region,
        cluster.cfn_name,
        scaledown_idletime,
        verify_scaling=False,
    )

    # This verifies that scaling worked
    _test_mpi(
        remote_command_executor,
        slots_per_instance,
        scheduler,
        region,
        cluster.cfn_name,
        scaledown_idletime,
        verify_scaling=True,
    )
Example #15
def test_mpi_ssh(scheduler, pcluster_config_reader, clusters_factory,
                 test_datadir):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    _test_mpi_ssh(remote_command_executor, scheduler, test_datadir)
Example #16
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
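# Hypothetical sketch of the scaling assertion, assuming _assert_scaling_works simply
# compares the observed minimum/maximum of both time series with the expected
# (min, max) tuples passed above.
from assertpy import assert_that


def _check_scaling_bounds(asg_capacity_time_series, compute_nodes_time_series,
                          expected_asg_capacity, expected_compute_nodes):
    assert_that(min(asg_capacity_time_series)).is_equal_to(expected_asg_capacity[0])
    assert_that(max(asg_capacity_time_series)).is_equal_to(expected_asg_capacity[1])
    assert_that(min(compute_nodes_time_series)).is_equal_to(expected_compute_nodes[0])
    assert_that(max(compute_nodes_time_series)).is_equal_to(expected_compute_nodes[1])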
Example #17
def _test_fsx_lustre_configuration_options(
    cluster,
    region,
    scheduler,
    os,
    mount_dir,
    bucket_name,
    storage_type,
    auto_import_policy,
    deployment_type,
    data_compression_type,
    weekly_maintenance_start_time,
    imported_file_chunk_size,
    storage_capacity,
):
    _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name)
    remote_command_executor = RemoteCommandExecutor(cluster)
    fsx_fs_id = get_fsx_fs_id(cluster, region)
    fsx = boto3.client(
        "fsx",
        region_name=region).describe_file_systems(FileSystemIds=[fsx_fs_id])

    _test_storage_type(storage_type, fsx)
    _test_deployment_type(deployment_type, fsx)
    _test_auto_import(auto_import_policy, remote_command_executor, mount_dir,
                      bucket_name, region)
    _test_storage_capacity(remote_command_executor, mount_dir,
                           storage_capacity)
    _test_weekly_maintenance_start_time(weekly_maintenance_start_time, fsx)
    _test_imported_file_chunch_size(imported_file_chunk_size, fsx)
    _test_data_compression_type(data_compression_type, fsx)
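# Hypothetical sketch of one of the FSx checks above, based on the shape of the boto3
# DescribeFileSystems response already fetched into `fsx`; the real _test_deployment_type
# may perform additional validation.
from assertpy import assert_that


def _check_deployment_type(deployment_type, fsx):
    lustre_configuration = fsx["FileSystems"][0]["LustreConfiguration"]
    assert_that(lustre_configuration["DeploymentType"]).is_equal_to(deployment_type)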
Example #18
def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory,
                    kms_key_factory, region, os):
    mount_dir = "ebs_mount_dir"
    kms_key_id = kms_key_factory.create_kms_key(region)
    cluster_config = pcluster_config_reader(
        mount_dir=mount_dir,
        ec2_iam_role=kms_key_factory.iam_role_arn,
        ebs_kms_key_id=kms_key_id)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    mount_dir = "/" + mount_dir
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)
    volume_id = get_ebs_volume_ids(cluster, region)[0]

    _test_ebs_correctly_mounted(remote_command_executor,
                                mount_dir,
                                volume_size=35)
    _test_ebs_correctly_shared(remote_command_executor, mount_dir,
                               scheduler_commands)
    _test_ebs_encrypted_with_kms(volume_id,
                                 region,
                                 encrypted=True,
                                 kms_key_id=kms_key_id)

    _test_root_volume_encryption(cluster,
                                 os,
                                 region,
                                 scheduler,
                                 encrypted=True)
Example #19
def test_slurm_pmix(pcluster_config_reader, clusters_factory):
    """Test interactive job submission using PMIx."""
    num_computes = 2
    cluster_config = pcluster_config_reader(queue_size=num_computes)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # Ensure the expected PMIx version is listed when running `srun --mpi=list`.
    # Since we're installing PMIx v3.1.5, we expect to see pmix and pmix_v3 in the output.
    # Sample output:
    # [ec2-user@ip-172-31-33-187 ~]$ srun 2>&1 --mpi=list
    # srun: MPI types are...
    # srun: none
    # srun: openmpi
    # srun: pmi2
    # srun: pmix
    # srun: pmix_v3
    mpi_list_output = remote_command_executor.run_remote_command(
        "srun 2>&1 --mpi=list").stdout
    assert_that(mpi_list_output).matches(r"\s+pmix($|\s+)")
    assert_that(mpi_list_output).matches(r"\s+pmix_v3($|\s+)")

    # Compile and run an MPI program interactively
    mpi_module = "openmpi"
    binary_path = "/shared/ring"
    compile_mpi_ring(mpi_module,
                     remote_command_executor,
                     binary_path=binary_path)
    interactive_command = f"module load {mpi_module} && srun --mpi=pmix -N {num_computes} {binary_path}"
    remote_command_executor.run_remote_command(interactive_command)
Example #20
def test_cloudwatch_logging(region, scheduler, instance, os,
                            pcluster_config_reader, test_datadir,
                            clusters_factory):
    """
    Test all CloudWatch logging features.

    All tests are grouped in a single function so that the cluster can be reused for all of them.
    """
    # Set the default region so it doesn't have to be passed to boto3 calls
    environ["AWS_DEFAULT_REGION"] = region
    config_params = get_config_param_vals()
    cluster_config = pcluster_config_reader(**config_params)
    cluster = clusters_factory(cluster_config)

    # change CW agent to debug mode
    remote_command_executor = RemoteCommandExecutor(cluster)
    remote_command_executor.run_remote_script(str(test_datadir /
                                                  "cw_agent_debug.sh"),
                                              run_as_root=True)
    test_runner = CloudWatchLoggingTestRunner(
        log_group_name=_get_log_group_name_for_cluster(cluster.name),
        enabled=True,
        retention_days=config_params.get("retention_days"),
        logs_persist_after_delete=True,
    )
    cluster_logs_state = CloudWatchLoggingClusterState(
        scheduler, os, cluster).get_logs_state()
    _test_cw_logs_before_after_delete(cluster, cluster_logs_state, test_runner,
                                      remote_command_executor)
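# Hypothetical sketch of the retention-days portion of CloudWatchLoggingTestRunner,
# assuming it reads the log group configuration with the CloudWatch Logs API; the real
# runner bundles this with several other assertions.
import boto3

from assertpy import assert_that


def _check_log_group_retention(log_group_name, expected_retention_days, region):
    logs = boto3.client("logs", region_name=region)
    log_group = logs.describe_log_groups(logGroupNamePrefix=log_group_name)["logGroups"][0]
    assert_that(log_group.get("retentionInDays")).is_equal_to(expected_retention_days)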
Example #21
def test_slurm_gpu(region, pcluster_config_reader, clusters_factory):
    """
    Test Slurm GPU related features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    scaledown_idletime = 3
    max_queue_size = 4
    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    _gpu_test_scaleup(remote_command_executor,
                      region,
                      cluster.asg,
                      cluster.cfn_name,
                      scaledown_idletime,
                      num_gpus=2)
    _test_dynamic_dummy_nodes(remote_command_executor,
                              region,
                              cluster.asg,
                              max_queue_size,
                              slots=32,
                              gpus=2)
    _gpu_test_cluster_limits(remote_command_executor, max_queue_size, 2)
    _gpu_resource_check(remote_command_executor)
    _gpu_test_conflicting_options(remote_command_executor, 2)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #22
def test_ebs_snapshot(request, vpc_stacks, region, scheduler,
                      pcluster_config_reader, clusters_factory,
                      snapshots_factory):
    logging.info("Testing ebs snapshot")
    mount_dir = "ebs_mount_dir"
    volume_size = 10

    logging.info("Creating snapshot")

    snapshot_id = snapshots_factory.create_snapshot(
        request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)

    logging.info("Snapshot id: %s" % snapshot_id)
    cluster_config = pcluster_config_reader(mount_dir=mount_dir,
                                            volume_size=volume_size,
                                            snapshot_id=snapshot_id)

    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    mount_dir = "/" + mount_dir
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)
    _test_ebs_correctly_mounted(remote_command_executor,
                                mount_dir,
                                volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, mount_dir,
                               scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command(
        "cat {}/test.txt".format(mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")
Example #23
def test_slurm(region, pcluster_config_reader, clusters_factory):
    """
    Test all AWS Slurm related features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    scaledown_idletime = 3
    max_queue_size = 5
    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    _test_slurm_version(remote_command_executor)
    _test_dynamic_max_cluster_size(remote_command_executor,
                                   region,
                                   cluster.asg,
                                   max_queue_size=max_queue_size)
    _test_cluster_limits(remote_command_executor, max_queue_size)
    _test_job_dependencies(remote_command_executor, region, cluster.cfn_name,
                           scaledown_idletime, max_queue_size)
    _test_job_arrays_and_parallel_jobs(remote_command_executor, region,
                                       cluster.cfn_name, scaledown_idletime)
    assert_overscaling_when_job_submitted_during_scaledown(
        remote_command_executor, "slurm", region, cluster.cfn_name,
        scaledown_idletime)
    _test_dynamic_dummy_nodes(remote_command_executor, region, cluster.asg,
                              max_queue_size)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #24
def test_nodewatcher_terminates_failing_node(scheduler, region,
                                             pcluster_config_reader,
                                             clusters_factory, test_datadir):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    compute_nodes = scheduler_commands.get_compute_nodes()

    # submit a job that kills the slurm daemon so that the node enters a failing state
    scheduler_commands.submit_script(
        str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)))
    instance_id = wait_compute_log(remote_command_executor)

    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # verify that desired capacity is still 1
    assert_that(get_desired_asg_capacity(region,
                                         cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #25
def test_multiple_nics(scheduler, region, pcluster_config_reader, clusters_factory):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_head_node_nics(remote_command_executor, region)
    _test_compute_node_nics(cluster, region, remote_command_executor, scheduler_commands)
Example #26
def test_spot_default(scheduler, pcluster_config_reader, clusters_factory):
    """Test that a cluster with spot instances can be created with default spot_price_value."""
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(1)
Example #27
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """Test Intel Cluster Checker"""
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    _test_intel_clck(remote_command_executor, scheduler_commands, test_datadir, os)

    assert_no_errors_in_logs(remote_command_executor, scheduler)
Example #28
def test_update(instance, region, pcluster_config_reader, clusters_factory,
                test_datadir):
    """
    Test 'pcluster update' command.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    s3_arn = "arn:aws:s3:::fake_bucket/*"
    init_config = PClusterConfig(
        max_queue_size=5,
        compute_instance_type=instance,
        compute_root_volume_size=30,
        s3_read_resource=s3_arn,
        s3_read_write_resource=s3_arn,
    )
    cluster = _init_cluster(clusters_factory, pcluster_config_reader,
                            init_config)
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    _verify_initialization(command_executor, slurm_commands, region,
                           test_datadir, cluster, init_config)

    s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*"
    updated_config = PClusterConfig(
        max_queue_size=10,
        compute_instance_type="c4.xlarge",
        compute_root_volume_size=40,
        s3_read_resource=s3_arn_updated,
        s3_read_write_resource=s3_arn_updated,
    )
    _update_cluster(cluster, updated_config)

    # verify updated parameters
    _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
    _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
    _test_s3_read_write_resource(region, cluster,
                                 updated_config.s3_read_write_resource)

    # verify params that are NOT updated in OLD compute nodes
    compute_nodes = slurm_commands.get_compute_nodes()
    _test_compute_instance_type(region, cluster.cfn_name,
                                init_config.compute_instance_type,
                                compute_nodes[0])
    _test_compute_root_volume_size(command_executor, slurm_commands,
                                   test_datadir,
                                   init_config.compute_root_volume_size,
                                   compute_nodes[0])
    # add compute nodes and verify updated params in NEW compute nodes
    new_compute_nodes = _add_compute_nodes(slurm_commands)
    _test_compute_instance_type(region, cluster.cfn_name,
                                updated_config.compute_instance_type,
                                new_compute_nodes[0])
    _test_compute_root_volume_size(command_executor, slurm_commands,
                                   test_datadir,
                                   updated_config.compute_root_volume_size,
                                   new_compute_nodes[0])
Example #29
def test_scaling_performance(region, scheduler, os, instance,
                             pcluster_config_reader, clusters_factory,
                             request):
    """The test runs benchmarks for the scaling logic."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target":
        request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"],
        scaling_target=benchmark_params["scaling_target"])
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    logging.info("Starting benchmark with following parameters: %s",
                 benchmark_params)
    start_time = datetime.datetime.utcnow()
    kwargs = {"nodes": benchmark_params["scaling_target"]}
    result = scheduler_commands.submit_command(
        "sleep {0}".format(benchmark_params["job_duration"]), **kwargs)
    scheduler_commands.assert_job_submitted(result.stdout)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info(
        "Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(
        int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(
        benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
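# Hypothetical sketch of the `minutes` helper used for the monitoring window above,
# presumably a thin wrapper around datetime.timedelta.
import datetime


def _minutes(value):
    return datetime.timedelta(minutes=value)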
Example #30
def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory,
                      region, os):
    mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)]
    volume_sizes = [15 + 5 * i for i in range(0, 5)]

    # for volume types sc1 and st1, the minimum volume size is 500G
    volume_sizes[3] = 500
    volume_sizes[4] = 500
    cluster_config = pcluster_config_reader(mount_dirs=mount_dirs,
                                            volume_sizes=volume_sizes)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)
    for mount_dir, volume_size in zip(mount_dirs, volume_sizes):
        # for volume size equal to 500G, the filesystem size is only about 492G
        # This is because the file systems use some of the total space available on a device for storing internal
        # structures and data (the file system's metadata). The overhead of the XFS filesystem is around 0.5%.
        # If we test with a small volume size (e.g., 40G), the overhead is not large enough to show the gap between the
        # partition size and the filesystem size. For sc1 and st1, the minimum size is 500G, so there will be a size
        # difference.
        _test_ebs_correctly_mounted(
            remote_command_executor, mount_dir,
            volume_size if volume_size != 500 else "49[0-9]")
        _test_ebs_correctly_shared(remote_command_executor, mount_dir,
                                   scheduler_commands)

    volume_ids = get_ebs_volume_ids(cluster, region)
    for i in range(len(volume_ids)):
        # test different volume types
        volume_id = volume_ids[i]
        ebs_settings = _get_ebs_settings_by_name(cluster.config, f"ebs{i+1}")
        volume_type = ebs_settings["VolumeType"]
        volume = describe_volume(volume_id, region)
        assert_that(volume[0]).is_equal_to(volume_type)
        encrypted = ebs_settings.get("Encrypted")
        if encrypted is None:
            # Default encryption if not specified
            encrypted = True
        _test_ebs_encrypted_with_kms(volume_id,
                                     region,
                                     encrypted=encrypted,
                                     kms_key_id=ebs_settings.get("KmsKeyId"))
        # test different iops
        # only io1, io2, gp3 can configure iops
        if volume_type in ["io1", "io2", "gp3"]:
            volume_iops = ebs_settings["Iops"]
            assert_that(volume[1]).is_equal_to(int(volume_iops))

    _test_root_volume_encryption(cluster,
                                 os,
                                 region,
                                 scheduler,
                                 encrypted=False)
    _assert_root_volume_configuration(cluster, os, region, scheduler)
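# Hypothetical sketch of the mount check used throughout these examples: run `df -h` on
# the head node and match the expected size against the mount directory. The expected
# size may itself be a regex (e.g. "49[0-9]" above for the 500G sc1/st1 volumes).
import re

from assertpy import assert_that


def _check_ebs_mounted(remote_command_executor, mount_dir, volume_size):
    df_output = remote_command_executor.run_remote_command("df -h").stdout
    assert_that(re.search("{0}G.*{1}".format(volume_size, mount_dir), df_output)).is_not_none()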