Example No. 1
def backup_test(clients, volume_name, size, base_image=""):  # NOQA
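    """
    Run a backup sanity check against every configured backupstore: create
    and attach a volume, point the backup target (and the credential secret,
    for S3) at each backupstore, then run backupstore_test against it.
    """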
    for host_id, client in clients.items():
        break

    volume = create_and_check_volume(client, volume_name, 2, size, base_image)

    lht_hostId = get_self_host_id()
    volume = volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, volume_name)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting["value"] == backupsettings[0]

            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value=backupsettings[1])
            assert credential["value"] == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting["value"] == backupstore
            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential["value"] == ""

        backupstore_test(client, lht_hostId, volume_name, size)

    cleanup_volume(client, volume)
Example No. 2
def recurring_job_labels_test(client,
                              labels,
                              volume_name,
                              size=SIZE,
                              backing_image=""):  # NOQA
    host_id = get_self_host_id()
    client.create_volume(name=volume_name,
                         size=size,
                         numberOfReplicas=2,
                         backingImage=backing_image)
    volume = common.wait_for_volume_detached(client, volume_name)

    # Simple Backup Job that runs every 1 minute, retains 1.
    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/1 * * * *",
        "task": "backup",
        "retain": 1,
        "labels": labels
    }]
    volume.recurringUpdate(jobs=jobs)
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    write_volume_random_data(volume)

    # 1 minute 15s
    time.sleep(75)
    labels["we-added-this-label"] = "definitely"
    jobs[0]["labels"] = labels
    volume = volume.recurringUpdate(jobs=jobs)
    volume = wait_for_volume_healthy(client, volume_name)
    write_volume_random_data(volume)

    # 2 minutes 15s
    time.sleep(135)
    snapshots = volume.snapshotList()
    count = 0
    for snapshot in snapshots:
        if snapshot.removed is False:
            count += 1
    # 1 from Backup, 1 from Volume Head.
    assert count == 2

    # Verify the Labels on the actual Backup.
    bv = client.by_id_backupVolume(volume_name)
    backups = bv.backupList().data
    assert len(backups) == 1

    b = bv.backupGet(name=backups[0].name)
    for key, val in labels.items():
        assert b.labels.get(key) == val
    assert b.labels.get(RECURRING_JOB_LABEL) == RECURRING_JOB_NAME
    # One extra Label from RecurringJob.
    assert len(b.labels) == len(labels) + 1
    if backing_image:
        assert b.volumeBackingImageName == backing_image
        assert b.volumeBackingImageURL != ""

    cleanup_volume(client, volume)
Example No. 3
def test_empty_backup_volume(clients):  # NOQA
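    """
    Test that a backup volume left empty after deleting its only backup can
    still create new backups and can itself be deleted.
    """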
    for host_id, client in clients.items():
        break
    lht_hostId = get_self_host_id()

    volName = generate_volume_name()
    volume = create_and_check_volume(client, volName)

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, volName)

    bv, b1, snap1, _ = create_backup(client, volName)
    bv.backupDelete(name=b1["name"])
    common.wait_for_backup_delete(b1["name"], bv)

    backup_list = bv.backupList()
    assert len(backup_list) == 0

    # test the empty backup volume can recreate backup
    _, b2, snap2, _ = create_backup(client, volName)

    # test the empty backup volume is still deletable
    bv.backupDelete(name=b2["name"])
    common.wait_for_backup_delete(b2["name"], bv)
    bv = client.by_id_backupVolume(volName)
    client.delete(bv)
    common.wait_for_backup_volume_delete(client, volName)
    cleanup_volume(client, volume)
Example No. 4
def volume_iscsi_basic_test(clients, volume_name, base_image=""):  # NOQA
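    """
    Test basic volume operations with the iSCSI frontend: create and attach
    a volume, verify the reported iscsi:// endpoint, then log in and run
    read/write checks against the device.
    """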
    # get a random client
    for host_id, client in clients.items():
        break

    volume = create_and_check_volume(client, volume_name, 3, SIZE, base_image,
                                     "iscsi")
    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    volumes = client.list_volume()
    assert len(volumes) == 1
    assert volumes[0]["name"] == volume["name"]
    assert volumes[0]["size"] == volume["size"]
    assert volumes[0]["numberOfReplicas"] == volume["numberOfReplicas"]
    assert volumes[0]["state"] == volume["state"]
    assert volumes[0]["created"] == volume["created"]
    assert volumes[0]["frontend"] == "iscsi"
    endpoint = get_volume_endpoint(volumes[0])
    assert endpoint.startswith("iscsi://")

    try:
        dev = iscsi_login(endpoint)
        volume_rw_test(dev)
    finally:
        iscsi_logout(endpoint)

    cleanup_volume(client, volume)
Example No. 5
def test_hard_anti_affinity_live_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas live once
    a valid node is available.

    If no nodes without existing replicas are available, the volume should
    remain in "Degraded" state. However, once one is available, the replica
    should now be scheduled successfully, with the volume returning to
    "Healthy" state.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(
        map(lambda replica: replica.name, volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # Allow scheduling on host node again
    client.update(node, allowScheduling=True)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 6
def test_soft_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity can detach and reattach to a
    node properly.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(
        map(lambda replica: replica.name, volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume["replicas"]) == 3

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 7
def test_soft_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity work as expected.

    With Soft Anti-Affinity, a new replica should still be scheduled on a node
    with an existing replica, which will result in "Healthy" state but limited
    redundancy.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(
        map(lambda replica: replica.name, volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 8
def test_hard_anti_affinity_offline_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas during
    the attaching process once a valid node is available.

    Once a new replica has been built as part of the attaching process, the
    volume should be Healthy again.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(
        map(lambda replica: replica.name, volume["replicas"]))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    client.update(node, allowScheduling=True)
    volume.attach(hostId=host_id)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 9
def test_hard_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity are still able to detach and
    reattach to a node properly, even in degraded state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Detach the volume.
    7. Verify that the volume only has 2 replicas
        1. Unhealthy replica will be removed upon detach.
    8. Attach the volume again.
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
        3. Verify only two of three replicas of volume are healthy.
        4. Verify the remaining replica doesn't have `replica.HostID`, meaning
        it's unscheduled
    9. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 2

    volume.attach(hostId=host_id)
    # Make sure we're still not getting another successful replica.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    assert sum([
        1 for replica in volume.replicas
        if replica.running and replica.mode == "RW"
    ]) == 2
    assert sum([1 for replica in volume.replicas if not replica.hostId]) == 1
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 10
def ha_simple_recovery_test(client, volume_name, size, base_image=""):  # NOQA
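    """
    Test simple HA recovery: create and attach a two-replica volume, then
    run the shared replica rebuild check against it.
    """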
    volume = create_and_check_volume(client, volume_name, 2, size, base_image)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    ha_rebuild_replica_test(client, volume_name)

    cleanup_volume(client, volume)
Example No. 11
def test_hard_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity work as expected.

    With Hard Anti-Affinity, scheduling on nodes with existing replicas should
    be forbidden, resulting in "Degraded" state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
        3. Verify only two of three replicas of volume are healthy.
        4. Verify the remaining replica doesn't have `replica.HostID`, meaning
        it's unscheduled
    6. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    # Instead of waiting for a timeout and lengthening the test significantly,
    # we can confirm that scheduling isn't working by checking that the volume
    # becomes Degraded and reports a scheduling error.
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # While there are three replicas that should exist to meet the Volume's
    # request, only two of those replicas should actually be Healthy.
    volume = client.by_id_volume(volume_name)
    assert sum([
        1 for replica in volume.replicas
        if replica.running and replica.mode == "RW"
    ]) == 2
    # Confirm that the remaining replica is unscheduled.
    assert sum([1 for replica in volume.replicas if not replica.hostId]) == 1
    # Three replicas in total should still exist.
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 12
def recurring_job_labels_test(client,
                              labels,
                              volume_name,
                              size=SIZE,
                              base_image=""):  # NOQA
    host_id = get_self_host_id()
    client.create_volume(name=volume_name, size=size, numberOfReplicas=2,
                         baseImage=base_image)
    volume = common.wait_for_volume_detached(client, volume_name)

    # Simple Backup Job that runs every 2 minutes, retains 1.
    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1,
        "labels": labels
    }]
    volume.recurringUpdate(jobs=jobs)
    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    # 5 minutes
    time.sleep(300)
    snapshots = volume.snapshotList()
    count = 0
    for snapshot in snapshots:
        if snapshot["removed"] is False:
            count += 1
    # 1 from Backup, 1 from Volume Head.
    assert count == 2

    # Verify the Labels on the actual Backup.
    bv = client.by_id_backupVolume(volume_name)
    backups = bv.backupList()
    assert len(backups) == 1

    b = bv.backupGet(name=backups[0]["name"])
    for key, val in labels.items():
        assert b["labels"].get(key) == val
    assert b["labels"].get(RECURRING_JOB_LABEL) == RECURRING_JOB_NAME
    if base_image:
        assert b["labels"].get(BASE_IMAGE_LABEL) == base_image
        # One extra Label from the BaseImage being set.
        assert len(b["labels"]) == len(labels) + 2
    else:
        # One extra Label from RecurringJob.
        assert len(b["labels"]) == len(labels) + 1

    cleanup_volume(client, volume)
Example No. 13
def test_ha_prohibit_deleting_last_replica(client, volume_name):  # NOQA
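    """
    Test that removing the last healthy replica of a volume is prohibited.
    """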
    volume = create_and_check_volume(client, volume_name, 1)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    assert len(volume["replicas"]) == 1
    replica0 = volume["replicas"][0]

    with pytest.raises(Exception) as e:
        volume.replicaRemove(name=replica0["name"])
    assert "no other healthy replica available" in str(e.value)

    cleanup_volume(client, volume)
Example No. 14
def test_hard_anti_affinity_offline_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas during
    the attaching process once a valid node is available.

    Once a new replica has been built as part of the attaching process, the
    volume should be Healthy again.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Detach the volume.
    7. Enable current node's scheduling.
    8. Attach the volume again.
    9. Wait for volume to become healthy with 3 replicas
    10. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    client.update(node, allowScheduling=True)
    volume.attach(hostId=host_id)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 15
def test_tag_scheduling(client, node_default_tags):  # NOQA
    """
    Test that scheduling succeeds if there are available Nodes/Disks with the
    requested Tags.
    """
    host_id = get_self_host_id()
    tag_specs = [
        # Select all Nodes.
        {
            "disk": [],
            "expected": 3,
            "node": []
        },
        # Selector works with AND on Disk Tags.
        {
            "disk": ["ssd", "nvme"],
            "expected": 2,
            "node": []
        },
        # Selector works with AND on Node Tags.
        {
            "disk": [],
            "expected": 2,
            "node": ["main", "storage"]
        },
        # Selector works based on combined Disk AND Node selector.
        {
            "disk": ["ssd", "nvme"],
            "expected": 1,
            "node": ["storage", "main"]
        }
    ]
    for specs in tag_specs:
        volume_name = generate_volume_name()  # NOQA
        client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=3,
                             diskSelector=specs["disk"],
                             nodeSelector=specs["node"])
        volume = wait_for_volume_detached(client, volume_name)
        assert volume["diskSelector"] == specs["disk"]
        assert volume["nodeSelector"] == specs["node"]

        volume.attach(hostId=host_id)
        volume = wait_for_volume_healthy(client, volume_name)
        assert len(volume["replicas"]) == 3
        check_volume_replicas(volume, specs, node_default_tags)

        cleanup_volume(client, volume)
Example No. 16
def backup_labels_test(clients,
                       random_labels,
                       volume_name,
                       size=SIZE,
                       base_image=""):  # NOQA
    for _, client in clients.items():
        break
    host_id = get_self_host_id()

    volume = create_and_check_volume(client, volume_name, 2, size, base_image)

    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting["value"] == backupsettings[0]

            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value=backupsettings[1])
            assert credential["value"] == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting["value"] == backupstore
            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential["value"] == ""

        bv, b, _, _ = create_backup(client, volume_name, labels=random_labels)
        # If we're running the test with a BaseImage, check that this Label is
        # set properly.
        backup = bv.backupGet(name=b["name"])
        if base_image:
            assert backup["labels"].get(common.BASE_IMAGE_LABEL) == base_image
            # One extra Label from the BaseImage being set.
            assert len(backup["labels"]) == len(random_labels) + 1
        else:
            assert len(backup["labels"]) == len(random_labels)

    cleanup_volume(client, volume)
Example No. 17
def test_tag_scheduling_on_update(client, node_default_tags, volume_name):  # NOQA
    """
    Test that Replicas get scheduled once a Node/Disk is updated with the
    proper Tags.
    """
    tag_spec = {
        "disk": ["ssd", "m2"],
        "expected": 1,
        "node": ["main", "fallback"]
    }
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=3,
                         diskSelector=tag_spec["disk"],
                         nodeSelector=tag_spec["node"])
    volume = wait_for_volume_detached(client, volume_name)
    assert volume["diskSelector"] == tag_spec["disk"]
    assert volume["nodeSelector"] == tag_spec["node"]

    wait_scheduling_failure(client, volume_name)

    host_id = get_self_host_id()
    node = client.by_id_node(host_id)
    update_disks = get_update_disks(node["disks"])
    update_disks[0]["tags"] = tag_spec["disk"]
    node = node.diskUpdate(disks=update_disks)
    set_node_tags(client, node, tag_spec["node"])
    scheduled = False
    for i in range(RETRY_COUNTS):
        v = client.by_id_volume(volume_name)
        if v["conditions"]["scheduled"]["status"] == "True":
            scheduled = True
        if scheduled:
            break
        sleep(RETRY_INTERVAL)
    assert scheduled

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    nodes = client.list_node()
    node_mapping = {node["id"]: {
        "disk": get_update_disks(node["disks"])[0]["tags"],
        "node": node["tags"]
    } for node in nodes}
    assert len(volume["replicas"]) == 3
    check_volume_replicas(volume, tag_spec, node_mapping)

    cleanup_volume(client, volume)
Example No. 18
def test_hard_anti_affinity_live_rebuild(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity can build new replicas live once
    a valid node is available.

    If no nodes without existing replicas are available, the volume should
    remain in "Degraded" state. However, once one is available, the replica
    should now be scheduled successfully, with the volume returning to
    "Healthy" state.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to false
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
        1. Verify volume will be in degraded state.
        2. Verify volume reports condition `scheduled == false`
    6. Enable the current node's scheduling
    7. Wait for volume to start rebuilding and become healthy again
    8. Check volume `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # Allow scheduling on host node again
    client.update(node, allowScheduling=True)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 19
def test_deleting_backup_volume(clients):  # NOQA
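    """
    Test that a backup volume with existing backups can be deleted.
    """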
    for host_id, client in clients.items():
        break
    lht_hostId = get_self_host_id()

    volName = generate_volume_name()
    volume = create_and_check_volume(client, volName)

    volume.attach(hostId=lht_hostId)
    volume = common.wait_for_volume_healthy(client, volName)

    bv, _, snap1, _ = create_backup(client, volName)
    _, _, snap2, _ = create_backup(client, volName)

    bv = client.by_id_backupVolume(volName)
    client.delete(bv)
    common.wait_for_backup_volume_delete(client, volName)
    cleanup_volume(client, volume)
Example No. 20
def test_soft_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity can detach and reattach to a
    node properly.

    1. Create a volume and attach to the current node.
    2. Generate and write `data` to the volume
    3. Set `soft anti-affinity` to true
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
    6. Wait for the new replica to be rebuilt
    7. Detach the volume.
    8. Verify there are 3 replicas
    9. Attach the volume again. Verify there are still 3 replicas
    10. Verify the `data`.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume.replicas) == 3

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 21
def test_hard_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity work as expected.

    With Hard Anti-Affinity, scheduling on nodes with existing replicas should
    be forbidden, resulting in "Degraded" state.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    # Instead of waiting for a timeout and lengthening the test significantly,
    # we can confirm that scheduling isn't working by checking that the volume
    # becomes Degraded and reports a scheduling error.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    # While there are three replicas that should exist to meet the Volume's
    # request, only two of those replicas should actually be Healthy.
    assert sum([
        1 for replica in volume["replicas"]
        if replica["running"] and replica["mode"] == "RW"
    ]) == 2
    # Confirm that the remaining replica is unscheduled.
    assert sum([1 for replica in volume["replicas"]
                if not replica["hostId"]]) == 1
    # Three replicas in total should still exist.
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 22
def test_hard_anti_affinity_detach(client, volume_name):  # NOQA
    """
    Test that volumes with Hard Anti-Affinity are still able to detach and
    reattach to a node properly, even in degraded state.
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume["replicas"]) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_SOFT_ANTI_AFFINITY)
    client.update(setting, value="false")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica["name"])
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    volume.detach()
    volume = wait_for_volume_detached(client, volume_name)
    assert len(volume["replicas"]) == 2

    volume.attach(hostId=host_id)
    # Make sure we're still not getting another successful replica.
    volume = wait_for_volume_degraded(client, volume_name)
    wait_scheduling_failure(client, volume_name)
    assert sum([
        1 for replica in volume["replicas"]
        if replica["running"] and replica["mode"] == "RW"
    ]) == 2
    assert sum([1 for replica in volume["replicas"]
                if not replica["hostId"]]) == 1
    assert len(volume["replicas"]) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 23
def test_soft_anti_affinity_scheduling(client, volume_name):  # NOQA
    """
    Test that volumes with Soft Anti-Affinity work as expected.

    With Soft Anti-Affinity, a new replica should still be scheduled on a node
    with an existing replica, which will result in "Healthy" state but limited
    redundancy.

    1. Create a volume and attach to the current node
    2. Generate and write `data` to the volume.
    3. Set `soft anti-affinity` to true
    4. Disable current node's scheduling.
    5. Remove the replica on the current node
    6. Wait for the volume to complete rebuild. Volume should have 3 replicas.
    7. Verify `data`
    """
    volume = create_and_check_volume(client, volume_name)
    host_id = get_self_host_id()
    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    setting = client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY)
    client.update(setting, value="true")
    node = client.by_id_node(host_id)
    client.update(node, allowScheduling=False)
    replica_names = list(map(lambda replica: replica.name, volume.replicas))
    host_replica = get_host_replica(volume, host_id)

    volume.replicaRemove(name=host_replica.name)
    wait_new_replica_ready(client, volume_name, replica_names)
    volume = wait_for_volume_healthy(client, volume_name)
    assert len(volume.replicas) == 3
    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 24
def ha_salvage_test(client, volume_name, base_image=""):  # NOQA
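    """
    Test volume salvage: fail both replicas by deleting their pods, salvage
    them, then verify the volume reattaches with its data intact.
    """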
    volume = create_and_check_volume(client,
                                     volume_name,
                                     2,
                                     base_image=base_image)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    assert len(volume["replicas"]) == 2
    replica0_name = volume["replicas"][0]["name"]
    replica1_name = volume["replicas"][1]["name"]

    data = write_volume_random_data(volume)

    common.k8s_delete_replica_pods_for_volume(volume_name)

    volume = common.wait_for_volume_faulted(client, volume_name)
    assert len(volume["replicas"]) == 2
    assert volume["replicas"][0]["failedAt"] != ""
    assert volume["replicas"][1]["failedAt"] != ""

    volume.salvage(names=[replica0_name, replica1_name])

    volume = common.wait_for_volume_detached(client, volume_name)
    assert len(volume["replicas"]) == 2
    assert volume["replicas"][0]["failedAt"] == ""
    assert volume["replicas"][1]["failedAt"] == ""

    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    check_volume_data(volume, data)

    cleanup_volume(client, volume)
Example No. 25
def test_backup_kubernetes_status(set_random_backupstore, client, core_api,
                                  pod):  # NOQA
    """
    Test that Backups have KubernetesStatus stored properly when there is an
    associated PersistentVolumeClaim and Pod.

    1. Setup a random backupstore
    2. Set settings Longhorn Static StorageClass to `longhorn-static-test`
    3. Create a volume and PV/PVC. Verify the StorageClass of PVC
    4. Create a Pod using the PVC.
    5. Check volume's Kubernetes status to reflect PV/PVC/Pod correctly.
    6. Create a backup for the volume.
    7. Verify the labels of created backup reflect PV/PVC/Pod status.
    8. Restore the backup to a volume. Wait for restoration to complete.
    9. Check the volume's Kubernetes Status
        1. Make sure the `lastPodRefAt` and `lastPVCRefAt` are the snapshot
           created time
    10. Delete the backup and restored volume.
    11. Delete PV/PVC/Pod.
    12. Verify volume's Kubernetes Status updated to reflect history data.
    13. Attach the volume and create another backup. Verify the labels
    14. Verify the volume's Kubernetes status.
    15. Restore the previous backup to a new volume. Wait for restoration.
    16. Verify the restored volume's Kubernetes status.
        1. Make sure `lastPodRefAt` and `lastPVCRefAt` match the volume from
           step 12
    """

    host_id = get_self_host_id()
    static_sc_name = "longhorn-static-test"
    setting = client.by_id_setting(SETTING_DEFAULT_LONGHORN_STATIC_SC)
    setting = client.update(setting, value=static_sc_name)
    assert setting.value == static_sc_name

    volume_name = "test-backup-kubernetes-status-pod"  # NOQA
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = wait_for_volume_detached(client, volume_name)

    pod_name = "pod-" + volume_name
    pv_name = "pv-" + volume_name
    pvc_name = "pvc-" + volume_name
    create_pv_for_volume(client, core_api, volume, pv_name)
    create_pvc_for_volume(client, core_api, volume, pvc_name)
    ret = core_api.list_namespaced_persistent_volume_claim(namespace='default')
    pvc_found = False
    for item in ret.items:
        if item.metadata.name == pvc_name:
            pvc_found = item
            break
    assert pvc_found
    assert pvc_found.spec.storage_class_name == static_sc_name

    pod['metadata']['name'] = pod_name
    pod['spec']['volumes'] = [{
        'name':
        pod['spec']['containers'][0]['volumeMounts'][0]['name'],
        'persistentVolumeClaim': {
            'claimName': pvc_name,
        },
    }]
    create_and_wait_pod(core_api, pod)

    ks = {
        'lastPodRefAt': '',
        'lastPVCRefAt': '',
        'namespace': 'default',
        'pvcName': pvc_name,
        'pvName': pv_name,
        'pvStatus': 'Bound',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_healthy(client, volume_name)

    # Create Backup manually instead of calling create_backup since Kubernetes
    # is not guaranteed to mount our Volume to the test host.
    snap = create_snapshot(client, volume_name)
    volume.snapshotBackup(name=snap.name)
    wait_for_backup_completion(client, volume_name, snap.name)
    _, b = find_backup(client, volume_name, snap.name)
    # Check backup label
    status = loads(b.labels.get(KUBERNETES_STATUS_LABEL))
    assert status == ks
    # Check backup volume label
    for _ in range(RETRY_COUNTS):
        bv = client.by_id_backupVolume(volume_name)
        if bv is not None and bv.labels is not None:
            break
        time.sleep(RETRY_INTERVAL)
    assert bv is not None and bv.labels is not None
    status = loads(bv.labels.get(KUBERNETES_STATUS_LABEL))
    assert status == ks

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name,
                         size=SIZE,
                         numberOfReplicas=2,
                         fromBackup=b.url)
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    snapshot_created = b.snapshotCreated
    ks = {
        'lastPodRefAt': b.snapshotCreated,
        'lastPVCRefAt': b.snapshotCreated,
        'namespace': 'default',
        'pvcName': pvc_name,
        # Restoration should not apply PersistentVolume data.
        'pvName': '',
        'pvStatus': '',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    # We need to compare LastPodRefAt and LastPVCRefAt manually since
    # wait_volume_kubernetes_status only checks for empty or non-empty state.
    assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"]
    assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"]

    delete_backup(client, bv.name, b.name)
    client.delete(restore)
    wait_for_volume_delete(client, restore_name)
    delete_and_wait_pod(core_api, pod_name)
    delete_and_wait_pvc(core_api, pvc_name)
    delete_and_wait_pv(core_api, pv_name)

    # With the Pod, PVC, and PV deleted, the Volume should have both Ref
    # fields set. Check that a new Backup and Restore will use this instead of
    # manually populating the Ref fields.
    ks = {
        'lastPodRefAt': 'NOT NULL',
        'lastPVCRefAt': 'NOT NULL',
        'namespace': 'default',
        'pvcName': pvc_name,
        'pvName': '',
        'pvStatus': '',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, volume_name, ks)
    volume = wait_for_volume_detached(client, volume_name)

    volume.attach(hostId=host_id)
    volume = wait_for_volume_healthy(client, volume_name)

    snap = create_snapshot(client, volume_name)
    volume.snapshotBackup(name=snap.name)
    volume = wait_for_backup_completion(client, volume_name, snap.name)
    bv, b = find_backup(client, volume_name, snap.name)
    new_b = bv.backupGet(name=b.name)
    status = loads(new_b.labels.get(KUBERNETES_STATUS_LABEL))
    # Check each field manually; we don't know what the LastPodRefAt or the
    # LastPVCRefAt will be. We just know it shouldn't be SnapshotCreated.
    assert status['lastPodRefAt'] != snapshot_created
    assert status['lastPVCRefAt'] != snapshot_created
    assert status['namespace'] == "default"
    assert status['pvcName'] == pvc_name
    assert status['pvName'] == ""
    assert status['pvStatus'] == ""
    assert status['workloadsStatus'] == [{
        'podName': pod_name,
        'podStatus': 'Running',
        'workloadName': '',
        'workloadType': ''
    }]

    restore_name = generate_volume_name()
    client.create_volume(name=restore_name,
                         size=SIZE,
                         numberOfReplicas=2,
                         fromBackup=b.url)
    wait_for_volume_restoration_completed(client, restore_name)
    wait_for_volume_detached(client, restore_name)

    ks = {
        'lastPodRefAt': status['lastPodRefAt'],
        'lastPVCRefAt': status['lastPVCRefAt'],
        'namespace': 'default',
        'pvcName': pvc_name,
        'pvName': '',
        'pvStatus': '',
        'workloadsStatus': [{
            'podName': pod_name,
            'podStatus': 'Running',
            'workloadName': '',
            'workloadType': ''
        }]
    }
    wait_volume_kubernetes_status(client, restore_name, ks)
    restore = client.by_id_volume(restore_name)
    assert restore.kubernetesStatus.lastPodRefAt == ks["lastPodRefAt"]
    assert restore.kubernetesStatus.lastPVCRefAt == ks["lastPVCRefAt"]

    # cleanup
    backupstore_cleanup(client)
    client.delete(restore)
    cleanup_volume(client, volume)
Example No. 26
def test_setting_toleration():
    """
    Test toleration setting

    1.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect".
    2.  Verify the request fails.
    3.  Create a volume and attach it.
    4.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    5.  Verify that the toleration setting cannot be updated while any volume
        is attached.
    6.  Generate and write `data1` into the volume.
    7.  Detach the volume.
    8.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    9.  Wait for all the Longhorn system components to restart with new
        toleration.
    10. Verify that UI, manager, and driver deployer don't restart and
        don't have the new toleration.
    11. Attach the volume again and verify the volume `data1`.
    12. Generate and write `data2` to the volume.
    13. Detach the volume.
    14. Clean the `toleration` setting.
    15. Wait for all the Longhorn system components to restart with no
        toleration.
    16. Attach the volume and validate `data2`.
    17. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    setting_value_dicts = [
        {
            "key": "key1",
            "value": "value1",
            "operator": "Equal",
            "effect": "NoSchedule"
        },
        {
            "key": "key2",
            "value": None,
            "operator": "Exists",
            "effect": "NoExecute"
        },
    ]
    with pytest.raises(Exception) as e:
        client.update(setting, value=setting_value_str)
    assert 'cannot modify toleration setting before all volumes are detached' \
           in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    # cleanup
    setting_value_str = ""
    setting_value_dicts = []
    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)
    setting = client.update(setting, value=setting_value_str)
    assert setting.value == setting_value_str
    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
Example No. 27
def test_setting_priority_class(core_api, apps_api, scheduling_api,
                                priority_class, volume_name):  # NOQA
    """
    Test that the Priority Class setting is validated and utilized correctly.

    1. Verify that the name of a non-existent Priority Class cannot be used
    for the Setting.
    2. Create a new Priority Class in Kubernetes.
    3. Create and attach a Volume.
    4. Verify that the Priority Class Setting cannot be updated with an
    attached Volume.
    5. Generate and write `data1`.
    6. Detach the Volume.
    7. Update the Priority Class Setting to the new Priority Class.
    8. Wait for all the Longhorn system components to restart with the new
       Priority Class.
    9. Verify that UI, manager, and driver deployer don't have the Priority
    Class
    10. Attach the Volume and verify `data1`.
    11. Generate and write `data2`.
    12. Unset the Priority Class Setting.
    13. Wait for all the Longhorn system components to restart with the new
        Priority Class.
    14. Verify that UI, manager, and driver deployer don't have the Priority
        Class
    15. Attach the Volume and verify `data2`.
    16. Generate and write `data3`.

    Note: system components are workloads other than UI, manager, driver
     deployer
    """
    client = get_longhorn_api_client()  # NOQA
    count = len(client.list_node())
    name = priority_class['metadata']['name']
    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'failed to get priority class ' in str(e.value)

    scheduling_api.create_priority_class(priority_class)

    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'cannot modify priority class setting before all volumes are ' \
           'detached' in str(e.value)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.update(setting, value=name)
    assert setting.value == name

    wait_for_priority_class_update(core_api, apps_api, count, priority_class)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach(hostId="")
    wait_for_volume_detached(client, volume_name)

    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)
    setting = client.update(setting, value='')
    assert setting.value == ''
    wait_for_priority_class_update(core_api, apps_api, count)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
Example No. 28
def test_instance_manager_cpu_reservation(client, core_api):  # NOQA
    """
    Test if the CPU requests of instance manager pods are controlled by
    the settings and the node specs correctly.

    1. Try to change the deprecated setting `Guaranteed Engine CPU`.
       --> The setting update should fail.
    2. Pick node 1, set `node.engineManagerCPURequest` and
       `node.replicaManagerCPURequest` to 150 and 250, respectively.
       --> The IM pods on this node will be restarted, and the CPU requests
       of these IM pods match the above milli values.
    3. Change the new settings `Guaranteed Engine Manager CPU` and
       `Guaranteed Replica Manager CPU` to 10 and 20, respectively.
       Then wait for all IM pods except for the pods on node 1 restarting.
       --> The CPU requests of the restarted IM pods equal the new setting
           value multiplied by the kube node's allocatable CPU.
    4. Set both new settings to 0.
       --> All IM pods except for the pods on node 1 will be restarted without
        CPU requests.
    5. Set the fields on node 1 to 0.
       --> The IM pods on node 1 will be restarted without CPU requests.
    6. Set both new settings to 2 random values whose sum is smaller than 40.
       Then wait for all IM pods to restart.
       --> The CPU requests of all IM pods equal the new setting value
           multiplied by the kube node's allocatable CPU.
    7. Set both new settings to 2 random values such that a single value or
       the sum of the 2 values is greater than 40.
       --> The setting update should fail.
    8. Create a volume and verify everything works as normal

    Note: use a fixture to restore the settings to their original state
    """

    instance_managers = client.list_instance_manager()
    deprecated_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_CPU)
    with pytest.raises(Exception) as e:
        client.update(deprecated_setting, value="0.1")

    host_node_name = get_self_host_id()
    host_node = client.by_id_node(host_node_name)
    other_ems, other_rms = [], []
    for im in instance_managers:
        if im.managerType == "engine":
            if im.nodeID == host_node_name:
                em_on_host = im
            else:
                other_ems.append(im)
        else:
            if im.nodeID == host_node_name:
                rm_on_host = im
            else:
                other_rms.append(im)
    assert em_on_host and rm_on_host
    host_kb_node = core_api.read_node(host_node_name)
    if host_kb_node.status.allocatable["cpu"].endswith('m'):
        allocatable_millicpu = int(host_kb_node.status.allocatable["cpu"][:-1])
    else:
        allocatable_millicpu = int(
            host_kb_node.status.allocatable["cpu"]) * 1000

    client.update(host_node,
                  allowScheduling=True,
                  engineManagerCPURequest=150,
                  replicaManagerCPURequest=250)
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(client, core_api, [em_on_host],
                                        "Running", True, "150m")
    guaranteed_engine_cpu_setting_check(client, core_api, [rm_on_host],
                                        "Running", True, "250m")

    em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU)
    client.update(em_setting, value="10")
    rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU)
    client.update(rm_setting, value="20")
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(
        client, core_api, other_ems, "Running", True,
        str(int(allocatable_millicpu * 10 / 100)) + "m")
    guaranteed_engine_cpu_setting_check(
        client, core_api, other_rms, "Running", True,
        str(int(allocatable_millicpu * 20 / 100)) + "m")

    em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU)
    client.update(em_setting, value="0")
    rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU)
    client.update(rm_setting, value="0")
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(client, core_api, other_ems, "Running",
                                        True, "")
    guaranteed_engine_cpu_setting_check(client, core_api, other_rms, "Running",
                                        True, "")

    ems, rms = other_ems, other_rms
    ems.append(em_on_host)
    rms.append(rm_on_host)

    host_node = client.by_id_node(host_node_name)
    client.update(host_node,
                  allowScheduling=True,
                  engineManagerCPURequest=0,
                  replicaManagerCPURequest=0)
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(client, core_api, ems, "Running", True,
                                        "")
    guaranteed_engine_cpu_setting_check(client, core_api, rms, "Running", True,
                                        "")

    client.update(em_setting, value="20")
    rm_setting = client.by_id_setting(SETTING_GUARANTEED_REPLICA_MANAGER_CPU)
    client.update(rm_setting, value="15")
    time.sleep(5)
    guaranteed_engine_cpu_setting_check(
        client, core_api, ems, "Running", True,
        str(int(allocatable_millicpu * 20 / 100)) + "m")
    guaranteed_engine_cpu_setting_check(
        client, core_api, rms, "Running", True,
        str(int(allocatable_millicpu * 15 / 100)) + "m")

    with pytest.raises(Exception) as e:
        client.update(em_setting, value="41")
    assert "should be between 0 to 40" in \
           str(e.value)

    em_setting = client.by_id_setting(SETTING_GUARANTEED_ENGINE_MANAGER_CPU)
    with pytest.raises(Exception) as e:
        client.update(em_setting, value="35")
    assert "The sum should not be smaller than 0% or greater than 40%" in \
           str(e.value)

    # Create a volume to test
    vol_name = generate_volume_name()
    volume = create_and_check_volume(client, vol_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, vol_name)
    assert len(volume.replicas) == 3
    data = write_volume_random_data(volume)
    check_volume_data(volume, data)
    cleanup_volume(client, volume)
Example No. 29
def ha_backup_deletion_recovery_test(client,
                                     volume_name,
                                     size,
                                     base_image=""):  # NOQA
    volume = client.create_volume(name=volume_name,
                                  size=size,
                                  numberOfReplicas=2,
                                  baseImage=base_image)
    volume = common.wait_for_volume_detached(client, volume_name)

    host_id = get_self_host_id()
    volume = volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    setting = client.by_id_setting(common.SETTING_BACKUP_TARGET)
    # test backupTarget for multiple settings
    backupstores = common.get_backupstore_url()
    for backupstore in backupstores:
        if common.is_backupTarget_s3(backupstore):
            backupsettings = backupstore.split("$")
            setting = client.update(setting, value=backupsettings[0])
            assert setting["value"] == backupsettings[0]

            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value=backupsettings[1])
            assert credential["value"] == backupsettings[1]
        else:
            setting = client.update(setting, value=backupstore)
            assert setting["value"] == backupstore
            credential = client.by_id_setting(
                common.SETTING_BACKUP_TARGET_CREDENTIAL_SECRET)
            credential = client.update(credential, value="")
            assert credential["value"] == ""

        data = write_volume_random_data(volume)
        snap2 = volume.snapshotCreate()
        volume.snapshotCreate()

        volume.snapshotBackup(name=snap2["name"])

        _, b = common.find_backup(client, volume_name, snap2["name"])

        res_name = common.generate_volume_name()
        res_volume = client.create_volume(name=res_name,
                                          size=size,
                                          numberOfReplicas=2,
                                          fromBackup=b["url"])
        res_volume = common.wait_for_volume_restoration_completed(
            client, res_name)
        res_volume = common.wait_for_volume_detached(client, res_name)
        res_volume = res_volume.attach(hostId=host_id)
        res_volume = common.wait_for_volume_healthy(client, res_name)
        check_volume_data(res_volume, data)

        snapshots = res_volume.snapshotList()
        # only the backup snapshot + volume-head
        assert len(snapshots) == 2
        backup_snapshot = ""
        for snap in snapshots:
            if snap["name"] != "volume-head":
                backup_snapshot = snap["name"]
        assert backup_snapshot != ""

        res_volume.snapshotCreate()
        snapshots = res_volume.snapshotList()
        assert len(snapshots) == 3

        res_volume.snapshotDelete(name=backup_snapshot)
        res_volume.snapshotPurge()
        snapshots = res_volume.snapshotList()
        assert len(snapshots) == 2

        ha_rebuild_replica_test(client, res_name)

        res_volume = res_volume.detach()
        res_volume = common.wait_for_volume_detached(client, res_name)

        client.delete(res_volume)
        common.wait_for_volume_delete(client, res_name)

    cleanup_volume(client, volume)
Example No. 30
def test_recurring_job_kubernetes_status(client, core_api,
                                         volume_name):  # NOQA
    """
    Test RecurringJob properly backs up the KubernetesStatus

    1. Setup a random backupstore.
    2. Create a volume.
    3. Create a PV from the volume, and verify the PV status.
    4. Create a backup recurring job to run every 2 minutes.
    5. Verify the recurring job runs correctly.
    6. Verify the backup contains the Kubernetes Status labels
    """
    set_random_backupstore(client)
    host_id = get_self_host_id()
    client.create_volume(name=volume_name, size=SIZE, numberOfReplicas=2)
    volume = common.wait_for_volume_detached(client, volume_name)

    pv_name = "pv-" + volume_name
    create_pv_for_volume(client, core_api, volume, pv_name)
    ks = {
        'pvName': pv_name,
        'pvStatus': 'Available',
        'namespace': '',
        'pvcName': '',
        'lastPVCRefAt': '',
        'lastPodRefAt': '',
    }
    wait_volume_kubernetes_status(client, volume_name, ks)

    # Simple Backup Job that runs every 2 minutes, retains 1.
    jobs = [{
        "name": RECURRING_JOB_NAME,
        "cron": "*/2 * * * *",
        "task": "backup",
        "retain": 1
    }]
    volume.recurringUpdate(jobs=jobs)
    volume.attach(hostId=host_id)
    volume = common.wait_for_volume_healthy(client, volume_name)

    # 5 minutes
    time.sleep(300)
    snapshots = volume.snapshotList()
    count = 0
    for snapshot in snapshots:
        if snapshot.removed is False:
            count += 1
    # 1 from Backup, 1 from Volume Head.
    assert count == 2

    # Verify the Labels on the actual Backup.
    bv = client.by_id_backupVolume(volume_name)
    backups = bv.backupList().data
    assert len(backups) == 1

    b = bv.backupGet(name=backups[0].name)
    status = json.loads(b.labels.get(KUBERNETES_STATUS_LABEL))
    assert b.labels.get(RECURRING_JOB_LABEL) == RECURRING_JOB_NAME
    assert status == {
        'lastPodRefAt': '',
        'lastPVCRefAt': '',
        'namespace': '',
        'pvcName': '',
        'pvName': pv_name,
        'pvStatus': 'Available',
        'workloadsStatus': None
    }
    # Two Labels: KubernetesStatus and RecurringJob.
    assert len(b.labels) == 2

    cleanup_volume(client, volume)
    delete_and_wait_pv(core_api, pv_name)