def _test_mpi_ssh(remote_command_executor, scheduler, os, test_datadir):
    logging.info("Testing mpi SSH")
    mpi_module = OS_TO_OPENMPI_MODULE_MAP[os]
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    compute_node = scheduler_commands.get_compute_nodes()
    assert_that(len(compute_node)).is_equal_to(1)
    remote_host = compute_node[0]

    # Gets remote host ip from hostname
    remote_host_ip = remote_command_executor.run_remote_command(
        "getent hosts {0} | cut -d' ' -f1".format(remote_host)
    ).stdout

    # Below job will timeout if the IP address is not in known_hosts
    mpirun_out_ip = remote_command_executor.run_remote_script(
        str(test_datadir / "mpi_ssh.sh"), args=[mpi_module, remote_host_ip]
    ).stdout.splitlines()

    # mpirun_out_ip = "ip-10-0-127-71"
    assert_that(len(mpirun_out_ip)).is_equal_to(1)
    assert_that(mpirun_out_ip[-1]).is_equal_to(remote_host)

    mpirun_out = remote_command_executor.run_remote_script(
        str(test_datadir / "mpi_ssh.sh"), args=[mpi_module, remote_host]
    ).stdout.splitlines()

    # mpirun_out = "ip-10-0-127-71"
    assert_that(len(mpirun_out)).is_equal_to(1)
    assert_that(mpirun_out[-1]).is_equal_to(remote_host)

def test_efa(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Test all EFA features.

    Grouped all tests in a single function so that the cluster can be reused for all of them.
    """
    max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_efa_installed(scheduler_commands, remote_command_executor)
    _test_mpi(remote_command_executor, slots_per_instance, scheduler, os)
    logging.info("Running on Instances: {0}".format(get_compute_nodes_instance_ids(cluster.cfn_name, region)))
    _test_osu_benchmarks("openmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance)
    _test_osu_benchmarks("intelmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance)
    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor)

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])

def test_replace_compute_on_failure(
    region, scheduler, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir
):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    """
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")
    cluster_config = pcluster_config_reader(bucket_name=bucket_name)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # submit a job to spin up a compute node that will fail due to post_install script
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    scheduler_commands.submit_command("sleep 1")
    instance_id = wait_compute_log(remote_command_executor)[0]

    # extract logs and check one of them
    _assert_compute_logs(remote_command_executor, instance_id)

    # check that instance got already replaced or is marked as Unhealthy
    time.sleep(25)  # Instance waits for 10 seconds before terminating to allow logs to propagate to CloudWatch
    assert_instance_replaced_or_terminating(instance_id, region)

def _test_mpi_ssh(remote_command_executor, scheduler, test_datadir):
    logging.info("Testing mpi SSH")
    mpi_module = "openmpi"
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    compute_node = scheduler_commands.get_compute_nodes()
    assert_that(len(compute_node)).is_equal_to(1)
    remote_host = compute_node[0]

    # Gets remote host ip from hostname
    remote_host_ip = remote_command_executor.run_remote_command(
        "getent hosts {0} | cut -d' ' -f1".format(remote_host), timeout=10
    ).stdout

    # Below job will timeout if the IP address is not in known_hosts
    mpirun_out_ip = remote_command_executor.run_remote_script(
        str(test_datadir / "mpi_ssh.sh"), args=[mpi_module, remote_host_ip], timeout=10
    ).stdout.splitlines()

    # mpirun_out_ip = ["Warning: Permanently added '192.168.60.89' (ECDSA) to the list of known hosts.",
    #                  '', 'ip-192-168-60-89']
    assert_that(len(mpirun_out_ip)).is_equal_to(3)
    assert_that(mpirun_out_ip[-1]).is_equal_to(remote_host)

    mpirun_out = remote_command_executor.run_remote_script(
        str(test_datadir / "mpi_ssh.sh"), args=[mpi_module, remote_host], timeout=10
    ).stdout.splitlines()

    # mpirun_out = ["Warning: Permanently added 'ip-192-168-60-89,192.168.60.89' (ECDSA) to the list of known hosts.",
    #               '', 'ip-192-168-60-89']
    assert_that(len(mpirun_out)).is_equal_to(3)
    assert_that(mpirun_out[-1]).is_equal_to(remote_host)

def test_ebs_snapshot(request, vpc_stacks, region, scheduler, pcluster_config_reader, clusters_factory, snapshots_factory):
    logging.info("Testing ebs snapshot")
    mount_dir = "ebs_mount_dir"
    volume_size = 10

    logging.info("Creating snapshot")
    snapshot_id = snapshots_factory.create_snapshot(request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)
    logging.info("Snapshot id: %s", snapshot_id)

    cluster_config = pcluster_config_reader(mount_dir=mount_dir, volume_size=volume_size, snapshot_id=snapshot_id)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    mount_dir = "/" + mount_dir
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command("cat {}/test.txt".format(mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")

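# The volume_size="9.8" above reflects filesystem overhead: df -h reports the usable
# filesystem size, not the raw 10 GiB volume. A plausible sketch of _test_ebs_correctly_mounted,
# assuming it greps df -h output and /etc/fstab (the real helper body is not shown in this section):
def _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size):
    result = remote_command_executor.run_remote_command("df -h | grep {0}".format(mount_dir))
    assert_that(result.stdout).matches(r"{size}G.*{mount_dir}".format(size=volume_size, mount_dir=mount_dir))
    # the mount should also be persisted in /etc/fstab so it survives reboots
    result = remote_command_executor.run_remote_command("cat /etc/fstab")
    assert_that(result.stdout).contains(mount_dir)
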
def test_ebs_existing(request, vpc_stacks, region, scheduler, pcluster_config_reader, snapshots_factory, clusters_factory):
    logging.info("Testing ebs existing")
    existing_mount_dir = "existing_mount_dir"

    logging.info("Creating volume")
    volume_id = snapshots_factory.create_existing_volume(request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)
    logging.info("Existing Volume id: %s", volume_id)

    cluster_config = pcluster_config_reader(volume_id=volume_id, existing_mount_dir=existing_mount_dir)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    existing_mount_dir = "/" + existing_mount_dir
    _test_ebs_correctly_mounted(remote_command_executor, existing_mount_dir, volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, existing_mount_dir, scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command("cat {}/test.txt".format(existing_mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")

    # delete the cluster before detaching the EBS volume
    cluster.delete()

    # check the volume still exists after deleting the cluster
    _assert_volume_exist(volume_id, region)

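# _assert_volume_exist is not defined in this section; a minimal sketch, assuming the external
# volume should return to the "available" state once the cluster that attached it is deleted:
def _assert_volume_exist(volume_id, region):
    volume = boto3.client("ec2", region_name=region).describe_volumes(VolumeIds=[volume_id])["Volumes"][0]
    # describe_volumes raises if the volume was deleted together with the cluster
    assert_that(volume["State"]).is_equal_to("available")
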
def assert_overscaling_when_job_submitted_during_scaledown(
    remote_command_executor, scheduler, region, stack_name, scaledown_idletime
):
    """Test that if a job gets submitted when a node is locked the cluster does not overscale."""
    logging.info("Testing cluster does not overscale when a job is submitted and a node is being terminated.")
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    if scheduler_commands.compute_nodes_count() == 0:
        result = scheduler_commands.submit_command("sleep 1")
        job_id = scheduler_commands.assert_job_submitted(result.stdout)
        scheduler_commands.wait_job_completed(job_id)
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(1)
    scheduler_commands.wait_for_locked_node()

    result = scheduler_commands.submit_command("sleep 1")
    scheduler_commands.assert_job_submitted(result.stdout)

    # do not check scheduler scaling but only ASG
    assert_scaling_worked(
        scheduler_commands,
        region,
        stack_name,
        scaledown_idletime,
        expected_max=1,
        expected_final=0,
        assert_scheduler=False,
    )

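# assert_scaling_worked is defined elsewhere; with assert_scheduler=False it only validates
# capacity, not scheduler state. A rough, hypothetical sketch of that ASG-only check, reusing
# the get_compute_nodes_allocation helper seen in the tests below (name and body are assumptions):
def _assert_asg_scaling_only(scheduler_commands, region, stack_name, scaledown_idletime, expected_max, expected_final):
    asg_capacity_time_series, _, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(scaledown_idletime) + minutes(5),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max)  # never overscaled past 1
    assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final)  # scaled back down to 0
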
def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory, kms_key_factory, region, os):
    mount_dir = "ebs_mount_dir"
    kms_key_id = kms_key_factory.create_kms_key(region)
    cluster_config = pcluster_config_reader(
        mount_dir=mount_dir, ec2_iam_role=kms_key_factory.iam_role_arn, ebs_kms_key_id=kms_key_id
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    mount_dir = "/" + mount_dir
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    volume_id = get_ebs_volume_ids(cluster, region)[0]

    _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size=35)
    _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)
    _test_ebs_encrypted_with_kms(volume_id, region, encrypted=True, kms_key_id=kms_key_id)

    _test_root_volume_encryption(cluster, os, region, scheduler, encrypted=True)

def test_hit_disable_hyperthreading(
    region, scheduler, instance, os, pcluster_config_reader, clusters_factory, default_threads_per_core
):
    """Test Disable Hyperthreading for HIT clusters."""
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_disable_hyperthreading_settings(
        remote_command_executor,
        scheduler_commands,
        slots_per_instance,
        scheduler,
        hyperthreading_disabled=False,
        partition="ht-enabled",
        default_threads_per_core=default_threads_per_core,
    )
    _test_disable_hyperthreading_settings(
        remote_command_executor,
        scheduler_commands,
        slots_per_instance,
        scheduler,
        hyperthreading_disabled=True,
        partition="ht-disabled",
        default_threads_per_core=default_threads_per_core,
    )

    assert_no_errors_in_logs(remote_command_executor, scheduler)

def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    compute_nodes = scheduler_commands.get_compute_nodes()

    # submit a job that kills the slurm daemon so that the node enters a failing state
    scheduler_commands.submit_script(str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)))
    instance_id = wait_compute_log(remote_command_executor)

    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # verify that desired capacity is still 1
    assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes)

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])

def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

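# minutes and seconds come from a shared time-utils module not shown here. A plausible sketch,
# assuming millisecond units (the convention of the retrying library used elsewhere in this suite):
def seconds(sec):
    return sec * 1000


def minutes(mins):
    return seconds(mins * 60)
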
def test_multiple_nics(scheduler, region, pcluster_config_reader, clusters_factory):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_head_node_nics(remote_command_executor, region)
    _test_compute_node_nics(cluster, region, remote_command_executor, scheduler_commands)

def test_spot_default(scheduler, pcluster_config_reader, clusters_factory):
    """Test that a cluster with spot instances can be created with default spot_price_value."""
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(1)

def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir): """Test Intel Cluster Checker""" cluster_config = pcluster_config_reader() cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) _test_intel_clck(remote_command_executor, scheduler_commands, test_datadir, os) assert_no_errors_in_logs(remote_command_executor, scheduler)
def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    mount_dir = "/shared"
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size=20)
    _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

def test_scaling_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """The test runs benchmarks for the scaling logic."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"], scaling_target=benchmark_params["scaling_target"]
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    kwargs = {"nodes": benchmark_params["scaling_target"]}
    result = scheduler_commands.submit_command("sleep {0}".format(benchmark_params["job_duration"]), **kwargs)
    scheduler_commands.assert_job_submitted(result.stdout)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    assert_no_errors_in_logs(remote_command_executor, scheduler)

def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory, region, os):
    mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)]
    volume_sizes = [15 + 5 * i for i in range(0, 5)]

    # for volume types sc1 and st1, the minimum volume size is 500 GiB
    volume_sizes[3] = 500
    volume_sizes[4] = 500

    cluster_config = pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    for mount_dir, volume_size in zip(mount_dirs, volume_sizes):
        # For a 500 GiB volume the filesystem size is only about 492 GiB, because the filesystem
        # reserves part of the device for internal structures (its metadata); XFS overhead is
        # around 0.5%. With small volumes (e.g. 40 GiB) the number is not large enough to show
        # the gap between the partition size and the filesystem size, but sc1 and st1 require
        # at least 500 GiB, so there the difference is visible.
        _test_ebs_correctly_mounted(
            remote_command_executor, mount_dir, volume_size if volume_size != 500 else "49[0-9]"
        )
        _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

    volume_ids = get_ebs_volume_ids(cluster, region)
    for i in range(len(volume_ids)):
        # test different volume types
        volume_id = volume_ids[i]
        ebs_settings = _get_ebs_settings_by_name(cluster.config, f"ebs{i + 1}")
        volume_type = ebs_settings["VolumeType"]
        volume = describe_volume(volume_id, region)
        assert_that(volume[0]).is_equal_to(volume_type)
        encrypted = ebs_settings.get("Encrypted")
        if encrypted is None:
            # Default encryption if not specified
            encrypted = True
        _test_ebs_encrypted_with_kms(volume_id, region, encrypted=encrypted, kms_key_id=ebs_settings.get("KmsKeyId"))

        # test different iops: only io1, io2 and gp3 can configure iops
        if volume_type in ["io1", "io2", "gp3"]:
            volume_iops = ebs_settings["Iops"]
            assert_that(volume[1]).is_equal_to(int(volume_iops))

    _test_root_volume_encryption(cluster, os, region, scheduler, encrypted=False)
    _assert_root_volume_configuration(cluster, os, region, scheduler)

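# _get_ebs_settings_by_name is referenced above but not shown. A minimal sketch, assuming
# cluster.config is the parsed ParallelCluster 3 YAML, where each SharedStorage entry
# carries a Name and an EbsSettings block:
def _get_ebs_settings_by_name(config, name):
    for storage in config["SharedStorage"]:
        if storage["Name"] == name:
            return storage["EbsSettings"]
    raise ValueError("No shared storage named {0} in cluster config".format(name))
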
def test_raid_fault_tolerance_mode(scheduler, pcluster_config_reader, clusters_factory):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    mount_dir = "/raid_dir"
    _test_raid_correctly_configured(remote_command_executor, raid_type="1", volume_size=20, raid_devices=2)
    _test_raid_correctly_mounted(remote_command_executor, mount_dir, volume_size=20)
    _test_raid_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

def _submit_one_slot_job():
    if not hasattr(local_data, "scheduler_commands"):
        local_data.scheduler_commands = get_scheduler_commands(
            benchmark_params["scheduler"], RemoteCommandExecutor(cluster)
        )
    local_data.scheduler_commands.submit_command(
        "sleep {0}; mkdir -p /shared/job-results; mktemp /shared/job-results/job.XXXXXXXX".format(
            benchmark_params["job_duration"]
        ),
        slots=1,
        after_ok=job_id,
    )

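# The hasattr check on local_data suggests this closure runs on a pool of worker threads, with
# local_data a threading.local() so each thread keeps its own SSH connection. A minimal sketch
# of that fan-out; _submit_jobs_in_parallel and max_workers are hypothetical names:
import threading
from concurrent.futures import ThreadPoolExecutor

local_data = threading.local()  # one RemoteCommandExecutor/scheduler_commands per worker thread


def _submit_jobs_in_parallel(jobs_to_submit, max_workers=10):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_submit_one_slot_job) for _ in range(jobs_to_submit)]
        for future in futures:
            future.result()  # re-raise any submission failure in the main thread
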
def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory):
    mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)]
    volume_sizes = [15 + 5 * i for i in range(0, 5)]
    cluster_config = pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    for mount_dir, volume_size in zip(mount_dirs, volume_sizes):
        _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size)
        _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

def __init__(self, scheduler, os, cluster, feature_key=None, shared_dir=DEFAULT_SHARED_DIR):
    """Get the state of the cluster as it pertains to the CloudWatch logging feature."""
    self.scheduler = scheduler
    self.platform = self._base_os_to_platform(os)
    self.cluster = cluster
    self.feature_key = feature_key
    self.shared_dir = self._get_shared_dir(shared_dir)
    self.remote_command_executor = RemoteCommandExecutor(self.cluster)
    self.scheduler_commands = get_scheduler_commands(self.scheduler, self.remote_command_executor)
    self._relevant_logs = {HEAD_NODE_ROLE_NAME: [], COMPUTE_NODE_ROLE_NAME: []}
    self._cluster_log_state = {HEAD_NODE_ROLE_NAME: {}, COMPUTE_NODE_ROLE_NAME: {}}
    self._set_cluster_log_state()

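# A plausible sketch of the _base_os_to_platform helper used above; the exact normalization
# ("centos7" -> "centos", "alinux2" -> "amazon") is an assumption, not the confirmed mapping:
import string


def _base_os_to_platform(self, base_os):
    platform = base_os.rstrip(string.digits)  # drop the version suffix, e.g. "ubuntu1804" -> "ubuntu"
    return "amazon" if platform == "alinux" else platform
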
def reset_stateful_connection_objects(self, default_user_remote_command_executor):
    """Reset objects that might maintain an open SSH connection."""
    del self._default_user_remote_command_executor
    del self._personalized_remote_command_executor
    del self._personalized_scheduler_commands
    self._default_user_remote_command_executor = default_user_remote_command_executor
    self._personalized_remote_command_executor = RemoteCommandExecutor(
        self.cluster, username=self.alias, alternate_ssh_key=self.ssh_private_key_path
    )
    self._personalized_scheduler_commands = get_scheduler_commands(
        self.scheduler, self._personalized_remote_command_executor
    )

def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    # slurm tests use more nodes because of an internal request to test in multi-node settings
    initial_queue_size = 1
    maintain_initial_size = "true"
    environ["AWS_DEFAULT_REGION"] = region
    cluster_config = pcluster_config_reader(
        initial_queue_size=initial_queue_size, maintain_initial_size=maintain_initial_size
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    compute_nodes = scheduler_commands.get_compute_nodes()
    instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    hostname_to_instance_id = get_instance_ids_compute_hostnames_conversion_dict(instance_ids, id_to_hostname=False)

    logging.info("Testing that nodewatcher will terminate a node in failing state")
    # submit a job to run on all nodes
    scheduler_commands.submit_command("sleep infinity", nodes=initial_queue_size)
    expected_num_nodes_killed = 1
    # simulate an unexpected hardware failure by killing the first x nodes
    nodes_to_remove = compute_nodes[:expected_num_nodes_killed]
    for node in nodes_to_remove:
        remote_command_executor.run_remote_script(
            str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)), args=[node]
        )

    # assert failing nodes are terminated according to ASG
    _assert_failing_nodes_terminated(nodes_to_remove, hostname_to_instance_id, region)
    nodes_to_retain = [compute for compute in compute_nodes if compute not in nodes_to_remove]
    # verify that desired capacity is still the initial_queue_size
    assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(initial_queue_size)
    # assert failing nodes are removed from scheduler config
    _assert_nodes_removed_and_replaced_in_scheduler(
        scheduler_commands, nodes_to_remove, nodes_to_retain, desired_capacity=initial_queue_size
    )

    assert_no_errors_in_logs(remote_command_executor, scheduler)

    test_maintain_initial_size(cluster.cfn_name, region, maintain_initial_size, initial_queue_size)

def test_slurm_scaling(scheduler, region, instance, pcluster_config_reader, clusters_factory, test_datadir):
    """Test that slurm-specific scaling logic is behaving as expected for normal actions and failures."""
    cluster_config = pcluster_config_reader(scaledown_idletime=3)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _assert_cluster_initial_conditions(scheduler_commands, instance, 20, 20, 4, 1)
    _test_partition_states(
        scheduler_commands,
        cluster.cfn_name,
        region,
        active_partition="ondemand1",
        inactive_partition="ondemand2",
        num_static_nodes=2,
        num_dynamic_nodes=3,
        dynamic_instance_type=instance,
    )
    _test_reset_terminated_nodes(
        scheduler_commands,
        cluster.cfn_name,
        region,
        partition="ondemand1",
        num_static_nodes=2,
        num_dynamic_nodes=3,
        dynamic_instance_type=instance,
    )
    _test_replace_down_nodes(
        remote_command_executor,
        scheduler_commands,
        test_datadir,
        cluster.cfn_name,
        region,
        partition="ondemand1",
        num_static_nodes=2,
        num_dynamic_nodes=3,
        dynamic_instance_type=instance,
    )
    _test_keep_or_replace_suspended_nodes(
        scheduler_commands,
        cluster.cfn_name,
        region,
        partition="ondemand1",
        num_static_nodes=2,
        num_dynamic_nodes=3,
        dynamic_instance_type=instance,
    )

    assert_no_errors_in_logs(remote_command_executor, scheduler)

def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing sleep job to start a dynamic node")
    result = scheduler_commands.submit_command("sleep 1")
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    retry(wait_fixed=seconds(30), stop_max_delay=seconds(500))(_assert_job_state)(
        scheduler_commands, job_id, job_state="COMPLETED"
    )

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring ec2 capacity and compute nodes")
    ec2_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        ec2_capacity_time_series=ec2_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_ec2_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

    logging.info("Verifying no error in logs")
    assert_no_errors_in_logs(remote_command_executor, scheduler)

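# _assert_job_state (retried above) is not defined in this section. A minimal sketch, assuming
# scheduler_commands.get_job_info returns the scheduler's textual job description
# (e.g. `scontrol show job` output for Slurm):
def _assert_job_state(scheduler_commands, job_id, job_state):
    result = scheduler_commands.get_job_info(job_id)
    assert_that(result).contains("JobState={0}".format(job_state))
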
def test_disable_hyperthreading(region, scheduler, instance, os, pcluster_config_reader, clusters_factory):
    """Test Disable Hyperthreading."""
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_disable_hyperthreading(remote_command_executor, scheduler_commands, slots_per_instance, scheduler)

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])

def _test_mpi(
    remote_command_executor,
    slots_per_instance,
    scheduler,
    os,
    region=None,
    stack_name=None,
    scaledown_idletime=None,
    verify_scaling=False,
):
    logging.info("Testing mpi job")
    datadir = pathlib.Path(__file__).parent / "data/mpi/"
    mpi_module = OS_TO_OPENMPI_MODULE_MAP[os]

    # Compile mpi script
    command = "mpicc -o mpi_hello_world mpi_hello_world.c"
    if mpi_module != "no_module_available":
        command = "module load {0} && {1}".format(mpi_module, command)
    remote_command_executor.run_remote_command(command, additional_files=[str(datadir / "mpi_hello_world.c")])

    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    # submit script using additional files
    result = scheduler_commands.submit_script(
        str(datadir / "mpi_submit_{0}.sh".format(mpi_module)), slots=2 * slots_per_instance
    )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    if verify_scaling:
        assert_scaling_worked(
            scheduler_commands, region, stack_name, scaledown_idletime, expected_max=2, expected_final=0
        )
        # not checking assert_job_succeeded after cluster scale-down because the scheduler history might be gone
    else:
        scheduler_commands.wait_job_completed(job_id)
        scheduler_commands.assert_job_succeeded(job_id)

    mpi_out = remote_command_executor.run_remote_command("cat /shared/mpi.out").stdout
    assert_that(mpi_out.splitlines()).is_length(2)
    assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 0 out of 2 processors")
    assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 1 out of 2 processors")

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])

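# OS_TO_OPENMPI_MODULE_MAP (also used by _test_mpi_ssh above) is defined elsewhere.
# A shape-only sketch; the keys and module names below are illustrative, not the real mapping:
OS_TO_OPENMPI_MODULE_MAP = {
    "alinux2": "openmpi",  # hypothetical module name
    "centos7": "mpi/openmpi-x86_64",  # hypothetical module name
    "ubuntu1804": "no_module_available",  # makes _test_mpi fall back to plain mpicc, per the branch above
}
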
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir): """Test Intel Cluster Checker""" slots_per_instance = fetch_instance_slots(region, instance) cluster_config = pcluster_config_reader() cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) _test_intel_clck(remote_command_executor, scheduler_commands, slots_per_instance, test_datadir) assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
def test_scheduler_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """The test runs a stress test to verify scheduler behaviour with many submitted jobs."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")
    instance_slots = get_instance_vcpus(region, instance)

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
        "jobs_to_submit": 2 * instance_slots * request.config.getoption("benchmarks_target_capacity"),
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"], scaling_target=benchmark_params["scaling_target"]
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    enable_asg_metrics(region, cluster)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    _submit_jobs(benchmark_params, scheduler_commands, instance_slots, cluster)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        cluster.asg,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    _assert_jobs_completed(remote_command_executor, benchmark_params["jobs_to_submit"])
    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])

def test_sit_cli_commands(scheduler, region, pcluster_config_reader, clusters_factory):
    """Test pcluster cli commands are working."""
    cluster_config = pcluster_config_reader(scaledown_idletime=60)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_pcluster_instances_and_status(cluster, region)
    _test_pcluster_stop_and_start(scheduler_commands, cluster, region, expected_num_nodes=1)

    assert_no_errors_in_logs(remote_command_executor, scheduler)
