def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    """Submit a batch of jobs and verify they run in parallel while the ASG scales up and back down.

    :param scheduler: scheduler name fixture, forwarded to the cluster-check script and command factory.
    :param region: AWS region fixture used when polling the stack's capacity.
    :param pcluster_config_reader: fixture rendering the cluster config template.
    :param clusters_factory: fixture creating the cluster from the rendered config.
    :param test_datadir: fixture pointing at this test's data directory (holds cluster-check.sh).
    """
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9
    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    # Monitor for job runtime + idle scaledown window + 5 minutes of slack.
    # Timestamps returned by the helper are not needed here.
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    # Expect the cluster to peak at 3 nodes and return to 0 after scaledown.
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    """Verify the cluster scales only for the runnable job, not for a job blocked on a dependency.

    Submits a 60s job plus a second job with `-d afterok:<first>`; asserts the dependent job
    stays PENDING for the right reason, then checks that at most one node ever scales up and
    that capacity returns to zero at the end.

    :param remote_command_executor: executor connected to the cluster head node.
    :param region: AWS region used when polling the stack's capacity.
    :param stack_name: CloudFormation stack name of the cluster.
    :param scaledown_idletime: idle minutes before the scaledown kicks in.
    """
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    # Timestamps returned by the helper are not needed here.
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    # Only the independent job should have triggered a (single-node) scale-up.
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
def assert_scaling_worked(
    scheduler_commands,
    region,
    stack_name,
    scaledown_idletime,
    expected_max,
    expected_final,
    assert_asg=True,
    assert_scheduler=True,
):
    """Monitor the cluster's capacity curve and softly assert its peak and final values.

    Polls ASG capacity and scheduler compute-node counts for a fixed time budget, then
    checks (inside a ``soft_assertions`` context, so all failures are reported together)
    that each selected series peaks at ``expected_max`` and ends at ``expected_final``.

    :param assert_asg: when True, check the ASG capacity series.
    :param assert_scheduler: when True, check the scheduler compute-node series.
    """
    # Time budget: 1 min of job runtime + idle window + ~5 min scale-up + up to 10 min scale-down.
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    monitoring_window = (
        minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(max_scaledown_time)
    )
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=monitoring_window,
    )

    with soft_assertions():
        if assert_asg:
            description = f"asg_capacity_time_series={asg_capacity_time_series}"
            assert_that(max(asg_capacity_time_series)).described_as(description).is_equal_to(expected_max)
            assert_that(asg_capacity_time_series[-1]).described_as(description).is_equal_to(expected_final)
        if assert_scheduler:
            description = f"compute_nodes_time_series={compute_nodes_time_series}"
            assert_that(max(compute_nodes_time_series)).described_as(description).is_equal_to(expected_max)
            assert_that(compute_nodes_time_series[-1]).described_as(description).is_equal_to(expected_final)
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    """Submit a batch of jobs and verify they run in parallel while EC2 capacity scales up and back down.

    Warms up a dynamic node with a trivial job first, then runs the cluster-check script,
    monitors capacity, and asserts job completion, scaling behavior, and clean logs.

    :param scheduler: scheduler name fixture, forwarded to the cluster-check script and command factory.
    :param region: AWS region fixture used when polling the stack's capacity.
    :param pcluster_config_reader: fixture rendering the cluster config template.
    :param clusters_factory: fixture creating the cluster from the rendered config.
    :param test_datadir: fixture pointing at this test's data directory (holds cluster-check.sh).
    """
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9
    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing sleep job to start a dynamic node")
    result = scheduler_commands.submit_command("sleep 1")
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    # Poll every 30s, up to 500s, for the warm-up job to finish.
    retry(wait_fixed=seconds(30), stop_max_delay=seconds(500))(_assert_job_state)(
        scheduler_commands, job_id, job_state="COMPLETED"
    )

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring ec2 capacity and compute nodes")
    # Monitor for job runtime + idle scaledown window + 5 minutes of slack.
    # Timestamps returned by the helper are not needed here.
    ec2_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    # Expect the cluster to peak at 3 nodes and return to 0 after scaledown.
    _assert_scaling_works(
        ec2_capacity_time_series=ec2_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_ec2_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

    logging.info("Verifying no error in logs")
    assert_no_errors_in_logs(remote_command_executor, scheduler)
def assert_scaling_worked(scheduler_commands, region, stack_name, scaledown_idletime, expected_max, expected_final):
    """Monitor the cluster's capacity curve and assert its peak and final values.

    Polls ASG capacity and scheduler compute-node counts for a fixed time budget, then
    checks that both series peak at ``expected_max`` and end at ``expected_final``.

    :param scheduler_commands: scheduler command helper used to count compute nodes.
    :param region: AWS region used when polling the stack's capacity.
    :param stack_name: CloudFormation stack name of the cluster.
    :param scaledown_idletime: idle minutes before the scaledown kicks in.
    :param expected_max: expected peak of both capacity series.
    :param expected_final: expected last sample of both capacity series.
    """
    # Time budget: 1 min of job runtime + idle window + ~5 min scale-up + up to 10 min scale-down.
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    monitoring_budget = (
        minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(max_scaledown_time)
    )
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=monitoring_budget,
    )

    assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max)
    assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(expected_final)
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    """Verify the cluster scales only for the runnable job, not for a job blocked on a dependency.

    Submits a 60s job plus a second job with `-d afterok:<first>`; asserts the dependent job
    stays PENDING for the right reason, then checks that at most one node ever scales up and
    that capacity returns to zero at the end.

    :param remote_command_executor: executor connected to the cluster head node.
    :param region: AWS region used when polling the stack's capacity.
    :param stack_name: CloudFormation stack name of the cluster.
    :param scaledown_idletime: idle minutes before the scaledown kicks in.
    """
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    # Timestamps returned by the helper are not needed here.
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    # Only the independent job should have triggered a (single-node) scale-up.
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)