def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Test that nodewatcher terminates a compute node whose scheduler daemon has been killed.

    Submits a job that kills the scheduler daemon on the compute node, then verifies that:
    the failing instance is replaced or terminating, the ASG desired capacity is unchanged,
    the node is removed from the scheduler, and the watcher logs contain no errors.
    """
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    compute_nodes = scheduler_commands.get_compute_nodes()

    # submit a job that kills the slurm daemon so that the node enters a failing state
    scheduler_commands.submit_script(str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)))

    # wait_compute_log returns a list of instance ids (see test_replace_compute_on_failure,
    # which indexes the same helper's result) — take the single expected id
    instance_id = wait_compute_log(remote_command_executor)[0]
    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # verify that desired capacity is still 1
    assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes)

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
def test_replace_compute_on_failure(
    region, scheduler, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir
):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    """
    # stage the failing post_install script in a dedicated S3 bucket
    bucket_name = s3_bucket_factory()
    boto3.resource("s3", region_name=region).Bucket(bucket_name).upload_file(
        str(test_datadir / "post_install.sh"), "post_install.sh"
    )

    cluster = clusters_factory(pcluster_config_reader(bucket_name=bucket_name))
    executor = RemoteCommandExecutor(cluster)

    # submit a job to spin up a compute node that will fail due to post_install script
    get_scheduler_commands(scheduler, executor).submit_command("sleep 1")
    failing_instance = wait_compute_log(executor)[0]

    # extract logs and check one of them
    _assert_compute_logs(executor, failing_instance)

    # check that instance got already replaced or is marked as Unhealthy
    time.sleep(25)  # Instance waits for 10 seconds before terminating to allow logs to propagate to CloudWatch
    assert_instance_replaced_or_terminating(failing_instance, region)