Example #1
def test_nodewatcher_terminates_failing_node(scheduler, region,
                                             pcluster_config_reader,
                                             clusters_factory, test_datadir):
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    compute_nodes = scheduler_commands.get_compute_nodes()

    # submit a job that kills the slurm daemon so that the node enters a failing state
    scheduler_commands.submit_script(
        str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)))
    instance_id = wait_compute_log(remote_command_executor)

    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # verify that desired capacity is still 1
    assert_that(get_desired_asg_capacity(region,
                                         cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Example #2
def test_replace_compute_on_failure(
    region, scheduler, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir
):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    """
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")
    cluster_config = pcluster_config_reader(bucket_name=bucket_name)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # submit a job to spin up a compute node that will fail due to post_install script
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    scheduler_commands.submit_command("sleep 1")
    instance_id = wait_compute_log(remote_command_executor)[0]

    # extract logs and check one of them
    _assert_compute_logs(remote_command_executor, instance_id)

    # check that instance got already replaced or is marked as Unhealthy
    time.sleep(25)  # Instance waits for 10 seconds before terminating to allow logs to propagate to CloudWatch
    assert_instance_replaced_or_terminating(instance_id, region)
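
Both examples end by calling assert_instance_replaced_or_terminating to confirm that the broken instance is on its way out. A minimal sketch of such a check, assuming it only inspects the EC2 instance state (the accepted state names are an assumption; the real helper may also verify that the ASG launched a replacement):

import boto3
from assertpy import assert_that


def assert_instance_replaced_or_terminating(instance_id, region):
    # Sketch only: once the watcher/ASG has declared the node unhealthy, the
    # failed instance should already be shutting down or terminated.
    ec2 = boto3.client("ec2", region_name=region)
    reservations = ec2.describe_instances(InstanceIds=[instance_id])["Reservations"]
    state = reservations[0]["Instances"][0]["State"]["Name"]
    assert_that(state).is_in("shutting-down", "terminated")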
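
Both examples also block on wait_compute_log before making any assertions. The helper is not shown here; the sketch below illustrates the idea, assuming the failing compute node copies its logs into a shared directory on the head node before terminating (the /home/logs/compute path, the file naming, and the retry timings are assumptions). Note that in Example #1 the helper returns a single instance id, while in Example #2 it returns a list; the sketch follows the list-returning variant:

from assertpy import assert_that
from retrying import retry


@retry(wait_fixed=10 * 1000, stop_max_delay=5 * 60 * 1000)
def wait_compute_log(remote_command_executor):
    # Sketch only: poll the shared log directory on the head node until the
    # failing compute node has dropped its log archive there, then return the
    # instance ids encoded in the file names (e.g. i-0123456789abcdef0.tar.gz).
    output = remote_command_executor.run_remote_command("ls /home/logs/compute/").stdout
    assert_that(output).is_not_empty()
    return [filename.split(".")[0] for filename in output.split()]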