import logging
import math

from assertpy import assert_that

# SgeCommands and assert_scaling_worked come from the surrounding test
# framework's common modules (not shown in this excerpt); _get_job_state is
# sketched at the bottom of this file.


def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    sge_commands = SgeCommands(remote_command_executor)
    result = sge_commands.submit_command("sleep 60", nodes=1)
    job_id = sge_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -hold_jid {0}".format(job_id), raise_on_error=False
    )
    dependent_job_id = sge_commands.assert_job_submitted(result.stdout)
    # The dependent job must sit in the hold + queued-waiting ("hqw") state
    # until the job it depends on completes.
    assert_that(_get_job_state(remote_command_executor, dependent_job_id)).is_equal_to("hqw")

    # Assert scaling worked as expected
    assert_scaling_worked(
        sge_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0
    )
    # Assert jobs were completed
    sge_commands.assert_job_succeeded(job_id)
    sge_commands.assert_job_succeeded(dependent_job_id)
def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime, max_slots):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    sge_commands = SgeCommands(remote_command_executor)
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -t 1-5", raise_on_error=False
    )
    array_job_id = sge_commands.assert_job_submitted(result.stdout, is_array=True)
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -pe mpi 4", raise_on_error=False
    )
    parallel_job_id = sge_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected: the 5 array tasks plus the 4-slot
    # parallel job request 9 slots in total, spread over ceil(9 / max_slots) nodes.
    expected_max = math.ceil(float(5 + 4) / max_slots)
    assert_scaling_worked(
        sge_commands, region, stack_name, scaledown_idletime, expected_max=expected_max, expected_final=0
    )
    # Assert jobs were completed
    sge_commands.assert_job_succeeded(array_job_id)
    sge_commands.assert_job_succeeded(parallel_job_id)
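# Worked example for expected_max above (illustrative numbers, not values taken
# from the test itself): with max_slots=4 slots per compute node, the array
# job's 5 tasks plus the 4-slot parallel job need 9 slots, so the cluster
# should peak at math.ceil(9 / 4) = 3 nodes before scaling back down to 0.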
def _test_fsx_lustre_correctly_shared(remote_command_executor, mount_dir):
    logging.info("Testing FSx Lustre is correctly mounted on compute nodes")
    sge_commands = SgeCommands(remote_command_executor)
    remote_command_executor.run_remote_command("touch {mount_dir}/test_file".format(mount_dir=mount_dir))
    # s3_test_file is assumed to have been staged into the file system by
    # earlier test setup; the job checks both files are visible from a compute node.
    job_command = (
        "cat {mount_dir}/s3_test_file "
        "&& cat {mount_dir}/test_file "
        "&& touch {mount_dir}/compute_output".format(mount_dir=mount_dir)
    )
    result = sge_commands.submit_command(job_command)
    job_id = sge_commands.assert_job_submitted(result.stdout)
    sge_commands.wait_job_completed(job_id)
    sge_commands.assert_job_succeeded(job_id)
    # Verify the file written by the compute node is visible from the head node.
    remote_command_executor.run_remote_command("cat {mount_dir}/compute_output".format(mount_dir=mount_dir))
def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime):
    logging.info("Testing jobs that violate scheduling requirements")
    sge_commands = SgeCommands(remote_command_executor)

    # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly
    if sge_commands.compute_nodes_count() == 0:
        result = sge_commands.submit_command("sleep 1")
        job_id = sge_commands.assert_job_submitted(result.stdout)
        sge_commands.wait_job_completed(job_id)
    assert_that(sge_commands.compute_nodes_count()).is_greater_than(0)

    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    result = sge_commands.submit_command("sleep 1000", slots=(max_slots * max_queue_size) + 1)
    max_slots_job_id = sge_commands.assert_job_submitted(result.stdout)
    assert_that(_get_job_state(remote_command_executor, max_slots_job_id)).is_equal_to("qw")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    result = sge_commands.submit_command("sleep 1000", hold=True)
    hold_job_id = sge_commands.assert_job_submitted(result.stdout)
    assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("hqw")

    logging.info("Testing cluster scales down when pending jobs cannot be submitted")
    assert_scaling_worked(
        sge_commands, region, cluster.cfn_name, scaledown_idletime, expected_max=1, expected_final=0
    )
    # Assert jobs are still pending
    pending_jobs = remote_command_executor.run_remote_command("qstat -s p | tail -n +3 | awk '{ print $1 }'").stdout
    pending_jobs = pending_jobs.splitlines()
    assert_that(pending_jobs).contains(max_slots_job_id, hold_job_id)
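# _get_job_state is called by the tests above but is not defined in this
# excerpt. A minimal sketch, assuming qstat's default tabular output (job id
# in the first column, state in the fifth, two header lines); the real helper
# in the test suite may parse the output differently.
def _get_job_state(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command("qstat")
    for line in result.stdout.splitlines()[2:]:  # skip the two qstat header lines
        fields = line.split()
        if fields and fields[0] == job_id:
            return fields[4]  # state column, e.g. "qw", "hqw" or "r"
    return None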