def test_head_node_stop(scheduler, pcluster_config_reader, clusters_factory, region, os):
    """Check head node ephemeral storage across a reboot and across a stop/start cycle."""
    head_ephemeral_mount = "/scratch_head"
    compute_ephemeral_mount = "/scratch_compute"
    folder, filename = "myFolder", "myFile"
    # Trailing arguments shared by every ephemeral-storage check below.
    ephemeral_args = (head_ephemeral_mount, folder, filename)

    cluster = clusters_factory(
        pcluster_config_reader(
            head_ephemeral_mount=head_ephemeral_mount,
            compute_ephemeral_mount=compute_ephemeral_mount,
        )
    )
    executor = RemoteCommandExecutor(cluster)
    _test_head_ephemeral_setup(executor, *ephemeral_args)

    # Reboot: the instance store is preserved.
    reboot_head_node(cluster, executor)
    _test_head_ephemeral_preserved(executor, *ephemeral_args)

    # Stop/start: the instance store is recreated.
    restart_head_node(cluster)
    # The head node changed public IP, so a fresh executor has to be built.
    executor_after_restart = RemoteCommandExecutor(cluster)
    _test_head_ephemeral_recreated(executor_after_restart, *ephemeral_args)
def test_torque(region, pcluster_config_reader, clusters_factory):
    """
    Run every Torque-related check against a single cluster.

    The checks share one test function so that the cluster is created only once.
    """
    scaledown_idletime = 2
    max_queue_size = 5
    max_slots = 4
    # A non-zero initial queue speeds up _test_jobs_executed_concurrently.
    initial_queue_size = 3

    cluster = clusters_factory(
        pcluster_config_reader(
            scaledown_idletime=scaledown_idletime,
            max_queue_size=max_queue_size,
            initial_queue_size=initial_queue_size,
        )
    )
    executor = RemoteCommandExecutor(cluster)

    _test_torque_version(executor)
    _test_jobs_executed_concurrently(executor, max_slots)
    _test_non_runnable_jobs(executor, max_queue_size, max_slots, region, cluster, scaledown_idletime)
    _test_job_dependencies(executor, region, cluster.cfn_name, scaledown_idletime)
    _test_job_arrays_and_parallel_jobs(executor, region, cluster.cfn_name, scaledown_idletime, max_slots)
    _test_dynamic_cluster_limits(executor, max_queue_size, max_slots, region, cluster.asg)
    assert_overscaling_when_job_submitted_during_scaledown(
        executor, "torque", region, cluster.cfn_name, scaledown_idletime
    )

    # Final sanity check on the daemon logs.
    daemon_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(executor, daemon_logs)
def test_create_wrong_pcluster_version(
    region, pcluster_config_reader, clusters_factory, pcluster_ami_without_standard_naming
):
    """
    Test error message when AMI provided was baked by a pcluster whose version is different from current version.

    The cluster is created with ``raise_on_error=False``; once the head node is confirmed
    running, ``/var/log/cloud-init-output.log`` is searched for the version-mismatch error.
    """
    # Imported locally because the module-level import block is not guaranteed to provide `re`.
    import re

    current_version = get_installed_parallelcluster_version()
    wrong_version = "2.8.1"
    logging.info("Asserting wrong_version is different from current_version")
    assert_that(current_version != wrong_version).is_true()

    # Retrieve an AMI without 'aws-parallelcluster-<version>' in its name.
    # Therefore, we can bypass the version check in CLI and test version check of .bootstrapped file in Cookbook.
    wrong_ami = pcluster_ami_without_standard_naming(wrong_version)
    cluster_config = pcluster_config_reader(custom_ami=wrong_ami)
    cluster = clusters_factory(cluster_config, raise_on_error=False)
    _assert_head_node_is_running(region, cluster)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # Escape the versions so that their dots match literally instead of matching any character
    # (unescaped, "2.8.1" would also match e.g. "2x8y1").
    escaped_wrong_version = re.escape(str(wrong_version))
    escaped_current_version = re.escape(str(current_version))

    logging.info("Verifying error in logs")
    assert_errors_in_logs(
        remote_command_executor,
        ["/var/log/cloud-init-output.log"],
        [
            "error_exit",
            fr"AMI was created.+{escaped_wrong_version}.+is.+used.+{escaped_current_version}",
        ],
    )
def test_create_wrong_os(region, os, pcluster_config_reader, clusters_factory, architecture):
    """Check the error logged when the configured os differs from the os of the custom AMI."""
    # The config file is rendered for the `os` fixture, but a centos7 AMI is supplied.
    wrong_os = "centos7"
    logging.info("Asserting os fixture is different from wrong_os variable")
    assert_that(os != wrong_os).is_true()

    mismatched_ami = retrieve_latest_ami(region, wrong_os, ami_type="pcluster", architecture=architecture)
    cluster = clusters_factory(pcluster_config_reader(custom_ami=mismatched_ami), raise_on_error=False)
    _assert_head_node_is_running(region, cluster)

    # Connect with the user name associated with the AMI's os rather than the configured one.
    executor = RemoteCommandExecutor(cluster, username=get_username_for_os(wrong_os))

    logging.info("Verifying error in logs")
    log_files = ["/var/log/cfn-init.log"]
    expected_errors = [
        "RuntimeError",
        fr"custom AMI.+{wrong_os}.+base.+os.+config file.+{os}",
    ]
    assert_errors_in_logs(executor, log_files, expected_errors)
def test_slurm_cli_commands(
    request, scheduler, region, os, pcluster_config_reader, clusters_factory, s3_bucket_factory
):
    """Test pcluster cli commands are working."""
    # A long scale-down idle time ensures nodes are terminated by pcluster stop.
    base_config = pcluster_config_reader(scaledown_idletime=60)

    # A custom AMI that is not tagged by pcluster makes the CLI emit a warning.
    official_ami = retrieve_latest_ami(region, os, ami_type="official", architecture="x86_64")
    warning_config = pcluster_config_reader(
        config_file="pcluster.config.with.warnings.yaml", custom_ami=official_ami
    )

    # The create-with-warnings check is not compatible with the `--cluster` flag: skip it when provided.
    reuse_existing_cluster = request.config.getoption("cluster")
    if not reuse_existing_cluster:
        _test_create_with_warnings(warning_config, clusters_factory)

    cluster = _test_create_cluster(clusters_factory, base_config, request)
    _test_describe_cluster(cluster)
    _test_list_cluster(cluster.name, "CREATE_COMPLETE")

    _test_update_with_warnings(warning_config, cluster)
    check_status(cluster, "CREATE_COMPLETE", "running", "RUNNING")

    # Describe instances without a filter and then with each supported filter.
    instance_filters = [
        {},
        {"node_type": "HeadNode"},
        {"node_type": "Compute"},
        {"queue_name": "ondemand1"},
    ]
    for instance_filter in instance_filters:
        _test_describe_instances(cluster, **instance_filter)

    _test_pcluster_export_cluster_logs(s3_bucket_factory, cluster)
    check_pcluster_list_cluster_log_streams(cluster, os)
    _test_pcluster_get_cluster_log_events(cluster)
    _test_pcluster_get_cluster_stack_events(cluster)
    _test_pcluster_compute_fleet(cluster, expected_num_nodes=2)

    assert_no_errors_in_logs(RemoteCommandExecutor(cluster), scheduler)
def test_arm_pl(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """Run the Arm Performance Library examples on a freshly created cluster."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)

    # Versions of the Arm Performance Library and of gcc under test.
    armpl_version = "21.0.0"
    gcc_version = "9.3"

    # Loading module armpl/{armpl_version} will load module armpl/gcc-{gcc_version}
    # and armpl/{armpl_version}_gcc-{gcc_version} sequentially.
    general_module = f"armpl/{armpl_version}"
    versioned_module = f"armpl/{armpl_version}_gcc-{gcc_version}"
    gcc_module = f"armpl/gcc-{gcc_version}"

    _test_armpl_examples(
        os,
        executor,
        general_module,
        versioned_module,
        gcc_module,
        armpl_version,
        gcc_version,
    )
def test_efa(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Test all EFA Features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_efa_installed(scheduler_commands, remote_command_executor)
    _test_mpi(remote_command_executor, slots_per_instance, scheduler, os)
    # Pass the value as a logging argument so the message is only formatted if the record is emitted.
    logging.info("Running on Instances: %s", get_compute_nodes_instance_ids(cluster.cfn_name, region))
    # Run the OSU benchmarks once per MPI flavour.
    _test_osu_benchmarks("openmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance)
    _test_osu_benchmarks("intelmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance)
    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor)

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
def test_sge(region, pcluster_config_reader, clusters_factory):
    """
    Run every SGE-related check against a single cluster.

    The checks share one test function so that the cluster is created only once.
    """
    scaledown_idletime = 3
    max_queue_size = 5
    max_slots = 4

    cluster = clusters_factory(
        pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    )
    executor = RemoteCommandExecutor(cluster)

    _test_sge_version(executor)
    _test_non_runnable_jobs(executor, max_queue_size, max_slots, region, cluster, scaledown_idletime)
    _test_job_dependencies(executor, region, cluster.cfn_name, scaledown_idletime)
    _test_job_arrays_and_parallel_jobs(executor, region, cluster.cfn_name, scaledown_idletime, max_slots)
    # TODO: _test_dynamic_max_cluster_size

    daemon_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(executor, daemon_logs)
def test_dynamic_placement_group_in_cluster(region, scheduler, pcluster_config_reader, clusters_factory, instance):
    """Test the case when placement_group is set to DYNAMIC. This test is not for awsbatch scheduler."""
    cluster = clusters_factory(pcluster_config_reader(placement="cluster"))
    executor = RemoteCommandExecutor(cluster)

    # For traditional schedulers the placement group name is read from the main stack;
    # for slurm it is read from the ComputeFleetHITSubstack.
    if scheduler != "slurm":
        placement_group = utils.retrieve_cfn_resources(cluster.cfn_name, region)["DynamicPlacementGroup"]
        head_node_placement_group = placement_group
    else:
        placement_group = _get_slurm_placement_group_from_stack(cluster, region)
        # For slurm the placement type can only be compute, so the head node has no group.
        head_node_placement_group = None
    _check_head_node_placement_group(executor, region, head_node_placement_group)

    # Check the placement group of the compute nodes.
    _assert_placement_group(cluster, scheduler, region, placement_group, None, instance)

    # The cluster has to be deleted before the placement group can be deleted.
    cluster.delete()
def test_ebs_existing(
    request, vpc_stacks, region, scheduler, pcluster_config_reader, snapshots_factory, clusters_factory
):
    """
    Test that a pre-existing EBS volume is mounted, shared, and survives cluster deletion.

    A volume is created up front, attached through the cluster config, checked for its test
    data, and finally verified to still exist once the cluster has been deleted.
    """
    logging.info("Testing ebs existing")
    existing_mount_dir = "existing_mount_dir"

    logging.info("Creating volume")
    volume_id = snapshots_factory.create_existing_volume(
        request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region
    )
    # Lazy logging args: formatting is deferred to the logging framework and cannot fail
    # on tuple-valued arguments the way eager `%` interpolation does.
    logging.info("Existing Volume id: %s", volume_id)

    cluster_config = pcluster_config_reader(volume_id=volume_id, existing_mount_dir=existing_mount_dir)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    # The config takes the directory name; the checks below use the absolute path.
    existing_mount_dir = "/" + existing_mount_dir
    _test_ebs_correctly_mounted(remote_command_executor, existing_mount_dir, volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, existing_mount_dir, scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command("cat {}/test.txt".format(existing_mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")

    # delete the cluster before detaching the EBS volume
    cluster.delete()
    # check the volume still exists after deleting the cluster
    _assert_volume_exist(volume_id, region)
def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog, region):
    """
    Run every AWS Batch-related check against a single cluster.

    The checks share one test function so that the cluster is created only once.
    """
    # DEBUG records are needed by _assert_compute_instance_type_validation_successful.
    caplog.set_level(logging.DEBUG)

    cluster = clusters_factory(pcluster_config_reader())
    _assert_compute_instance_type_validation_successful(caplog)
    executor = RemoteCommandExecutor(cluster)

    # Compare the configured vCPU bounds with the sizes reported for the compute environment.
    compute_resource = cluster.config["Scheduling"]["AwsBatchQueues"][0]["ComputeResources"][0]
    min_vcpus = compute_resource["MinvCpus"]
    max_vcpus = compute_resource["MaxvCpus"]
    assert_that(get_batch_ce_min_size(cluster.cfn_name, region)).is_equal_to(int(min_vcpus))
    assert_that(get_batch_ce_max_size(cluster.cfn_name, region)).is_equal_to(int(max_vcpus))

    # Longer timeout in china regions due to less reliable networking.
    if region.startswith("cn-"):
        timeout = 120
    else:
        timeout = 60

    _test_simple_job_submission(executor, test_datadir, timeout)
    _test_array_submission(executor)
    _test_mnp_submission(executor, test_datadir)
    _test_job_kill(executor, timeout)
def test_replace_compute_on_failure(
    region, scheduler, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir
):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is triggered by a post_install script that exits with errors on compute nodes.
    """
    # Publish the post_install script to a fresh bucket referenced by the cluster config.
    bucket_name = s3_bucket_factory()
    s3 = boto3.resource("s3", region_name=region)
    s3.Bucket(bucket_name).upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")

    cluster = clusters_factory(pcluster_config_reader(bucket_name=bucket_name))
    executor = RemoteCommandExecutor(cluster)

    # Submit a job to spin up a compute node that will fail because of the post_install script.
    commands = get_scheduler_commands(scheduler, executor)
    commands.submit_command("sleep 1")
    failed_instance_id = wait_compute_log(executor)[0]

    # Extract the logs and check one of them.
    _assert_compute_logs(executor, failed_instance_id)

    # Check that the instance has already been replaced or is marked as Unhealthy.
    # Instance waits for 10 seconds before terminating to allow logs to propagate to CloudWatch.
    time.sleep(25)
    assert_instance_replaced_or_terminating(failed_instance_id, region)
def test_hit_disable_hyperthreading(
    region, scheduler, instance, os, pcluster_config_reader, clusters_factory, default_threads_per_core
):
    """Check hyperthreading settings on both the ht-enabled and ht-disabled partitions of a HIT cluster."""
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    # (hyperthreading_disabled, partition) pairs, checked in this order.
    partition_cases = (
        (False, "ht-enabled"),
        (True, "ht-disabled"),
    )
    for hyperthreading_disabled, partition in partition_cases:
        _test_disable_hyperthreading_settings(
            executor,
            commands,
            slots_per_instance,
            scheduler,
            hyperthreading_disabled=hyperthreading_disabled,
            partition=partition,
            default_threads_per_core=default_threads_per_core,
        )

    assert_no_errors_in_logs(executor, scheduler)
def test_mpi(scheduler, region, instance, pcluster_config_reader, clusters_factory):
    """Run the MPI check twice: once without and once with scaling verification."""
    scaledown_idletime = 3
    max_queue_size = 3
    slots_per_instance = fetch_instance_slots(region, instance)

    cluster = clusters_factory(
        pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    )
    executor = RemoteCommandExecutor(cluster)

    # First pass (verify_scaling=False) verifies that the job completes correctly;
    # second pass (verify_scaling=True) verifies that scaling worked.
    for verify_scaling in (False, True):
        _test_mpi(
            executor,
            slots_per_instance,
            scheduler,
            region,
            cluster.cfn_name,
            scaledown_idletime,
            verify_scaling=verify_scaling,
        )
def test_mpi_ssh(scheduler, pcluster_config_reader, clusters_factory, test_datadir):
    """Create a cluster from the default config and run the MPI-over-ssh check on it."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    _test_mpi_ssh(executor, scheduler, test_datadir)
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    """Submit the cluster-check jobs, then verify their completion time and the observed scaling."""
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This bound guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster = clusters_factory(pcluster_config_reader(scaledown_idletime=scaledown_idletime))
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    logging.info("Executing test jobs on cluster")
    executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    # Monitor for the job window, plus the scale-down idle time, plus five extra minutes.
    monitoring_window = minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5)
    asg_capacity_series, compute_nodes_series, _ = get_compute_nodes_allocation(
        scheduler_commands=commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=monitoring_window,
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_series,
        compute_nodes_time_series=compute_nodes_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
def _test_fsx_lustre_configuration_options(
    cluster,
    region,
    scheduler,
    os,
    mount_dir,
    bucket_name,
    storage_type,
    auto_import_policy,
    deployment_type,
    data_compression_type,
    weekly_maintenance_start_time,
    imported_file_chunk_size,
    storage_capacity,
):
    """
    Run the base FSx Lustre check, then verify each configuration option individually.

    The file system description is fetched once via ``describe_file_systems`` and handed to
    the option checks that need it; the remaining checks go through a remote command executor.
    """
    _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name)

    executor = RemoteCommandExecutor(cluster)
    file_system_id = get_fsx_fs_id(cluster, region)
    fsx_client = boto3.client("fsx", region_name=region)
    fsx_description = fsx_client.describe_file_systems(FileSystemIds=[file_system_id])

    _test_storage_type(storage_type, fsx_description)
    _test_deployment_type(deployment_type, fsx_description)
    _test_auto_import(auto_import_policy, executor, mount_dir, bucket_name, region)
    _test_storage_capacity(executor, mount_dir, storage_capacity)
    _test_weekly_maintenance_start_time(weekly_maintenance_start_time, fsx_description)
    _test_imported_file_chunch_size(imported_file_chunk_size, fsx_description)
    _test_data_compression_type(data_compression_type, fsx_description)
def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory, kms_key_factory, region, os):
    """Check mounting, sharing and KMS encryption of a single EBS volume, plus root volume encryption."""
    mount_dir = "ebs_mount_dir"
    kms_key_id = kms_key_factory.create_kms_key(region)

    cluster = clusters_factory(
        pcluster_config_reader(
            mount_dir=mount_dir,
            ec2_iam_role=kms_key_factory.iam_role_arn,
            ebs_kms_key_id=kms_key_id,
        )
    )
    executor = RemoteCommandExecutor(cluster)

    # The config takes the directory name; the checks below use the absolute path.
    mount_dir = "/" + mount_dir
    commands = get_scheduler_commands(scheduler, executor)
    first_volume_id = get_ebs_volume_ids(cluster, region)[0]

    _test_ebs_correctly_mounted(executor, mount_dir, volume_size=35)
    _test_ebs_correctly_shared(executor, mount_dir, commands)
    _test_ebs_encrypted_with_kms(first_volume_id, region, encrypted=True, kms_key_id=kms_key_id)

    _test_root_volume_encryption(cluster, os, region, scheduler, encrypted=True)
def test_slurm_pmix(pcluster_config_reader, clusters_factory):
    """Test interactive job submission using PMIx."""
    num_computes = 2
    cluster = clusters_factory(pcluster_config_reader(queue_size=num_computes))
    executor = RemoteCommandExecutor(cluster)

    # Ensure the expected PMIx version is listed when running `srun --mpi=list`.
    # Since we're installing PMIx v3.1.5, we expect to see pmix and pmix_v3 in the output.
    # Sample output:
    #   [ec2-user@ip-172-31-33-187 ~]$ srun 2>&1 --mpi=list
    #   srun: MPI types are...
    #   srun: none
    #   srun: openmpi
    #   srun: pmi2
    #   srun: pmix
    #   srun: pmix_v3
    mpi_list_output = executor.run_remote_command("srun 2>&1 --mpi=list").stdout
    for expected_pattern in (r"\s+pmix($|\s+)", r"\s+pmix_v3($|\s+)"):
        assert_that(mpi_list_output).matches(expected_pattern)

    # Compile an MPI program and run it interactively across all compute nodes.
    mpi_module = "openmpi"
    binary_path = "/shared/ring"
    compile_mpi_ring(mpi_module, executor, binary_path=binary_path)
    interactive_command = f"module load {mpi_module} && srun --mpi=pmix -N {num_computes} {binary_path}"
    executor.run_remote_command(interactive_command)
def test_cloudwatch_logging(region, scheduler, instance, os, pcluster_config_reader, test_datadir, clusters_factory):
    """
    Run every CloudWatch logging check against a single cluster.

    The checks share one test function so that the cluster is created only once.
    """
    # Exported so that the region doesn't have to be passed to boto3 calls.
    environ["AWS_DEFAULT_REGION"] = region

    config_params = get_config_param_vals()
    cluster = clusters_factory(pcluster_config_reader(**config_params))

    # Switch the CW agent to debug mode.
    executor = RemoteCommandExecutor(cluster)
    debug_script = str(test_datadir / "cw_agent_debug.sh")
    executor.run_remote_script(debug_script, run_as_root=True)

    runner = CloudWatchLoggingTestRunner(
        log_group_name=_get_log_group_name_for_cluster(cluster.name),
        enabled=True,
        retention_days=config_params.get("retention_days"),
        logs_persist_after_delete=True,
    )
    logs_state = CloudWatchLoggingClusterState(scheduler, os, cluster).get_logs_state()
    _test_cw_logs_before_after_delete(cluster, logs_state, runner, executor)
def test_slurm_gpu(region, pcluster_config_reader, clusters_factory):
    """
    Run every Slurm GPU-related check against a single cluster.

    The checks share one test function so that the cluster is created only once.
    """
    scaledown_idletime = 3
    max_queue_size = 4

    cluster = clusters_factory(
        pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    )
    executor = RemoteCommandExecutor(cluster)

    _gpu_test_scaleup(executor, region, cluster.asg, cluster.cfn_name, scaledown_idletime, num_gpus=2)
    _test_dynamic_dummy_nodes(executor, region, cluster.asg, max_queue_size, slots=32, gpus=2)
    _gpu_test_cluster_limits(executor, max_queue_size, 2)
    _gpu_resource_check(executor)
    _gpu_test_conflicting_options(executor, 2)

    daemon_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(executor, daemon_logs)
def test_ebs_snapshot(
    request, vpc_stacks, region, scheduler, pcluster_config_reader, clusters_factory, snapshots_factory
):
    """
    Test that an EBS volume created from a snapshot is mounted, shared, and carries the test data.

    A snapshot is created up front and referenced from the cluster config together with the
    mount directory and the requested volume size.
    """
    logging.info("Testing ebs snapshot")
    mount_dir = "ebs_mount_dir"
    volume_size = 10

    logging.info("Creating snapshot")
    snapshot_id = snapshots_factory.create_snapshot(request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)
    # Lazy logging args: formatting is deferred to the logging framework and cannot fail
    # on tuple-valued arguments the way eager `%` interpolation does.
    logging.info("Snapshot id: %s", snapshot_id)

    cluster_config = pcluster_config_reader(mount_dir=mount_dir, volume_size=volume_size, snapshot_id=snapshot_id)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # The config takes the directory name; the checks below use the absolute path.
    mount_dir = "/" + mount_dir
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command("cat {}/test.txt".format(mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")
def test_slurm(region, pcluster_config_reader, clusters_factory):
    """
    Run every Slurm-related check against a single cluster.

    The checks share one test function so that the cluster is created only once.
    """
    scaledown_idletime = 3
    max_queue_size = 5

    cluster = clusters_factory(
        pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    )
    executor = RemoteCommandExecutor(cluster)

    _test_slurm_version(executor)
    _test_dynamic_max_cluster_size(executor, region, cluster.asg, max_queue_size=max_queue_size)
    _test_cluster_limits(executor, max_queue_size)
    _test_job_dependencies(executor, region, cluster.cfn_name, scaledown_idletime, max_queue_size)
    _test_job_arrays_and_parallel_jobs(executor, region, cluster.cfn_name, scaledown_idletime)
    assert_overscaling_when_job_submitted_during_scaledown(
        executor, "slurm", region, cluster.cfn_name, scaledown_idletime
    )
    _test_dynamic_dummy_nodes(executor, region, cluster.asg, max_queue_size)

    daemon_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(executor, daemon_logs)
def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    """Put a compute node into a failing state and check that it is replaced or terminating."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    # Nodes known to the scheduler before the failure is provoked.
    initial_compute_nodes = commands.get_compute_nodes()

    # Submit the scheduler-specific job that kills the scheduler daemon so that the node enters a failing state.
    kill_script = str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler))
    commands.submit_script(kill_script)
    instance_id = wait_compute_log(executor)

    _assert_compute_logs(executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # Verify that the desired capacity is still 1.
    assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(commands, initial_compute_nodes)

    daemon_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(executor, daemon_logs)
def test_multiple_nics(scheduler, region, pcluster_config_reader, clusters_factory):
    """Create a cluster and run the NIC checks for the head node and then for the compute nodes."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    _test_head_node_nics(executor, region)
    _test_compute_node_nics(cluster, region, executor, commands)
def test_spot_default(scheduler, pcluster_config_reader, clusters_factory):
    """Test that a cluster with spot instances can be created with default spot_price_value."""
    cluster = clusters_factory(pcluster_config_reader())
    commands = get_scheduler_commands(scheduler, RemoteCommandExecutor(cluster))

    # Exactly one compute node is expected to be reported by the scheduler.
    compute_node_count = commands.compute_nodes_count()
    assert_that(compute_node_count).is_equal_to(1)
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """Run the Intel Cluster Checker test and confirm the logs contain no errors."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    _test_intel_clck(executor, commands, test_datadir, os)

    assert_no_errors_in_logs(executor, scheduler)
def test_update(instance, region, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Exercise the 'pcluster update' command against a single cluster.

    The cluster is created with an initial configuration, updated to a second one, and the
    resulting parameters are verified on the stack, on old compute nodes and on new ones.
    """
    initial_s3_arn = "arn:aws:s3:::fake_bucket/*"
    init_config = PClusterConfig(
        max_queue_size=5,
        compute_instance_type=instance,
        compute_root_volume_size=30,
        s3_read_resource=initial_s3_arn,
        s3_read_write_resource=initial_s3_arn,
    )
    cluster = _init_cluster(clusters_factory, pcluster_config_reader, init_config)
    executor = RemoteCommandExecutor(cluster)
    slurm = SlurmCommands(executor)

    _verify_initialization(executor, slurm, region, test_datadir, cluster, init_config)

    updated_s3_arn = "arn:aws:s3:::fake_bucket/fake_folder/*"
    updated_config = PClusterConfig(
        max_queue_size=10,
        compute_instance_type="c4.xlarge",
        compute_root_volume_size=40,
        s3_read_resource=updated_s3_arn,
        s3_read_write_resource=updated_s3_arn,
    )
    _update_cluster(cluster, updated_config)

    # Parameters that must reflect the update.
    _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
    _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
    _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource)

    # OLD compute nodes keep the parameters they were launched with.
    old_nodes = slurm.get_compute_nodes()
    _test_compute_instance_type(region, cluster.cfn_name, init_config.compute_instance_type, old_nodes[0])
    _test_compute_root_volume_size(
        executor, slurm, test_datadir, init_config.compute_root_volume_size, old_nodes[0]
    )

    # NEW compute nodes pick up the updated parameters.
    new_nodes = _add_compute_nodes(slurm)
    _test_compute_instance_type(region, cluster.cfn_name, updated_config.compute_instance_type, new_nodes[0])
    _test_compute_root_volume_size(
        executor, slurm, test_datadir, updated_config.compute_root_volume_size, new_nodes[0]
    )
def test_scaling_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """Run the scaling-logic benchmark, publish its report, and assert on the observed node counts."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
    }

    cluster = clusters_factory(
        pcluster_config_reader(
            scaledown_idletime=benchmark_params["scaledown_idletime"],
            scaling_target=benchmark_params["scaling_target"],
        )
    )
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()

    # One sleep job spanning as many nodes as the scaling target.
    submission = commands.submit_command(
        "sleep {0}".format(benchmark_params["job_duration"]),
        nodes=benchmark_params["scaling_target"],
    )
    commands.assert_job_submitted(submission.stdout)

    compute_nodes_series, _, end_time = publish_compute_nodes_metric(
        commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    elapsed_seconds = int((end_time - start_time).total_seconds())
    benchmark_params["total_time"] = "{0}seconds".format(elapsed_seconds)

    # The report takes ISO timestamps tagged as UTC.
    utc = datetime.timezone.utc
    start_iso = start_time.replace(tzinfo=utc).isoformat()
    end_iso = end_time.replace(tzinfo=utc).isoformat()
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        start_iso,
        end_iso,
        benchmark_params["scaling_target"],
        request,
    )

    # The fleet must have peaked at the scaling target and ended at zero nodes.
    assert_that(max(compute_nodes_series)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(compute_nodes_series[-1]).is_equal_to(0)
    assert_no_errors_in_logs(executor, scheduler)
def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory, region, os):
    """
    Test a cluster with five EBS volumes of different sizes and types.

    Each volume is checked for correct mounting and sharing, then its type, encryption and
    (where configurable) iops are compared with the cluster configuration. Finally the root
    volume encryption and configuration are verified.
    """
    mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)]
    volume_sizes = [15 + 5 * i for i in range(0, 5)]

    # for volume type sc1 and st1, the minimum volume sizes are 500G
    volume_sizes[3] = 500
    volume_sizes[4] = 500
    cluster_config = pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    for mount_dir, volume_size in zip(mount_dirs, volume_sizes):
        # for volume size equal to 500G, the filesystem size is only about 492G
        # This is because the file systems use some of the total space available on a device for storing internal
        # structures and data (the file system's metadata). The overhead of the XFS filesystem is around 0.5%.
        # If we test with small volume size(eg: 40G), the number is not large enough to show the gap between the
        # partition size and the filesystem size. For sc1 and st1, the minimum size is 500G, so there will be a size
        # difference.
        _test_ebs_correctly_mounted(
            remote_command_executor, mount_dir, volume_size if volume_size != 500 else "49[0-9]"
        )
        _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

    volume_ids = get_ebs_volume_ids(cluster, region)
    # Settings are named ebs1, ebs2, ... so the enumeration starts at 1.
    for ebs_index, volume_id in enumerate(volume_ids, start=1):
        # test different volume types
        ebs_settings = _get_ebs_settings_by_name(cluster.config, f"ebs{ebs_index}")
        volume_type = ebs_settings["VolumeType"]
        volume = describe_volume(volume_id, region)
        assert_that(volume[0]).is_equal_to(volume_type)
        encrypted = ebs_settings.get("Encrypted")
        if encrypted is None:
            # Default encryption if not specified
            encrypted = True
        _test_ebs_encrypted_with_kms(volume_id, region, encrypted=encrypted, kms_key_id=ebs_settings.get("KmsKeyId"))
        # test different iops
        # only io1, io2, gp3 can configure iops
        if volume_type in ["io1", "io2", "gp3"]:
            volume_iops = ebs_settings["Iops"]
            assert_that(volume[1]).is_equal_to(int(volume_iops))

    _test_root_volume_encryption(cluster, os, region, scheduler, encrypted=False)
    _assert_root_volume_configuration(cluster, os, region, scheduler)