Example No. 1
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set {acting_pg_set} does not contain k+m OSDs"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info("Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info("The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']},"
                f" checking status again in 1 minute"
            )
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set {acting_pg_set} does not contain k+m OSDs"
            )
            return 1
        log.info(f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error("The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0
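
Note: the active + clean polling loop above recurs in several of these examples. The snippet below is a minimal, self-contained sketch of the same pattern; it only assumes a run_ceph_command callable that returns the parsed JSON of `ceph report`, as RadosOrchestrator.run_ceph_command does above. The helper name wait_for_active_clean is illustrative, not part of the test library.

import datetime
import time


def wait_for_active_clean(run_ceph_command, timeout=9000, interval=60):
    """Poll `ceph report` until no PG reports a recovery-related state.

    Returns True if the cluster reached active + clean within `timeout`
    seconds, False otherwise.
    """
    recovery_states = (
        "backfilling", "degraded", "incomplete", "recovering",
        "recovery_wait", "backfilling_wait", "peered", "undersized",
    )
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
    while datetime.datetime.now() < end_time:
        report = run_ceph_command(cmd="ceph report")
        busy = any(
            state in recovery_states
            for entry in report["num_pg_by_state"]
            for state in entry["state"].split("+")
        )
        if not busy:
            return True
        time.sleep(interval)
    return False
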
def verify_mon_db_trim(ceph_cluster, node: CephAdmin, **kwargs):
    """
    The Mon DB size should be reduced by regularly removing the old mappings. To verify this behaviour,
    we create various scenarios where the DB is updated with new mappings and verify that the DB is getting trimmed.
    Verifies BZ:
    https://bugzilla.redhat.com/show_bug.cgi?id=1905339
    https://bugzilla.redhat.com/show_bug.cgi?id=1829646
    https://bugzilla.redhat.com/show_bug.cgi?id=1972281
    https://bugzilla.redhat.com/show_bug.cgi?id=1943357
    https://bugzilla.redhat.com/show_bug.cgi?id=1766702
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        node: Cephadm node where the commands need to be executed
        kwargs: Any other KV pairs that need to be passed for testing
    Raises: TestCaseFailureException for failure conditions.
    """

    # Creating rados object to run rados commands
    rados_obj = RadosOrchestrator(node=node)
    mon_nodes = ceph_cluster.get_nodes(role="mon")
    osd_nodes = ceph_cluster.get_nodes(role="osd")
    client_node = ceph_cluster.get_nodes(role="client")[0]
    installer_node = ceph_cluster.get_nodes(role="installer")[0]
    daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
    mon_daemons = [
        entry for entry in daemon_info if entry["daemon_type"] == "mon"
    ]

    # Duration to sleep after the mon DB changes are made, by which time the mon should have begun trimming old mappings
    mon_db_trim_wait_dur = 1200

    # List to capture the mon db size throughout the duration of the test to check the variations in DB size
    mon_db_size_list = list()
    mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

    # Collecting first and last commits to osdmap
    status = rados_obj.run_ceph_command(cmd="ceph report")
    init_commmits = {
        "osdmap_first_committed": float(status["osdmap_first_committed"]),
        "osdmap_last_committed": float(status["osdmap_last_committed"]),
    }

    # creating scenarios where the mon db would be updated with new info
    change_config_for_slow_ops(rados_obj=rados_obj, action="set", **kwargs)
    mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

    # Starting read and write workloads by creating a test pool.
    pool_name = "test_pool_ops"
    if not rados_obj.create_pool(pool_name=pool_name,
                                 crush_rule="stretch_rule"):
        error = "failed to create pool to run IO"
        raise TestCaseFailureException(error)
    cmd = f"rados --no-log-to-stderr -b 1024 -p {pool_name} bench 400 write --no-cleanup &"
    client_node.exec_command(sudo=True, cmd=cmd)

    mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

    # deleting a previously created pool to increase OSD operations and map changes
    # Pool created as part of suite set-up workflow.
    rados_obj.detete_pool(pool="delete_pool")

    # Proceeding to reboot 1 OSD from each host to trigger rebalance & Backfill
    cluster_fsid = rados_obj.run_ceph_command(cmd="ceph fsid")["fsid"]
    daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
    osd_daemons = [
        entry for entry in daemon_info if entry["daemon_type"] == "osd"
    ]
    for onode in osd_nodes:
        for osd in osd_daemons:
            if re.search(osd["hostname"], onode.hostname):
                # Not using the container ID's provided in ceph orch ps command.
                # Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1943494
                # cmd = f"podman restart {osd['container_id']}"
                cmd = f"systemctl restart ceph-{cluster_fsid}@osd.{osd['daemon_id']}.service"
                log.info(
                    f"rebooting osd-{osd['daemon_id']} on host {osd['hostname']}. Command {cmd}"
                )
                onode.exec_command(sudo=True, cmd=cmd)
                # Sleeping for 5 seconds for status to be updated
                time.sleep(5)
                break

    # Re-weighting the OSD's based on usage to trigger rebalance
    # todo: Verify re-balancing process on OSD's ( PG movement across cluster)
    # todo: Add re-balancing based on crush item provided
    # BZ: https://bugzilla.redhat.com/show_bug.cgi?id=1766702
    rados_obj.reweight_crush_items()
    """
    Waiting for 2 hours for cluster to get to active + clean state.
    Rationale: during cluster activities like backfill, rebalance, osd map change cause Mon DB to be updated.
    Hence, we can wait till mon DB updates are completed, after which DB size should be reduced,
    by trimming the old mappings once the new mappings are added.
    If cluster healthy state is reached within 2 hours, we exit the loop earlier, without waiting for stipulated time,
    But if cluster is still performing operations for long time,
     we would need at-least some data to make sure DB is not just increasing.
      ( DB is expected to increase when operations are in progress.
       Old mappings are removed when operations/ new updates are completed. )
    """
    end_time = datetime.datetime.now() + datetime.timedelta(seconds=7200)
    while end_time > datetime.datetime.now():
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        ceph_health_status = status_report["health"]
        recovery_tuple = ("OSD_DOWN", "PG_AVAILABILITY", "PG_DEGRADED",
                          "SLOW_OPS")
        daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
        mon_daemons = [
            entry for entry in daemon_info if entry["daemon_type"] == "mon"
        ]
        mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))

        # Checking for any health warnings that increase db size
        flag = not any(key in ceph_health_status["checks"].keys()
                       for key in recovery_tuple)

        # Proceeding to check if all PG's are in active + clean
        if flag:
            for entry in status_report["num_pg_by_state"]:
                rec = ("remapped", "backfilling", "degraded")
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

        if flag:
            log.info(
                "The recovery and back-filling of the OSD is completed. "
                f"Sleeping for {mon_db_trim_wait_dur} seconds after clean for mon to remove old mappings"
            )
            time.sleep(mon_db_trim_wait_dur / 2)
            mon_db_size_list.append(get_mondb_size(mon_nodes[0], mon_daemons))
            time.sleep(mon_db_trim_wait_dur)
            break
        log.info(
            f"Waiting for active + clean. Active alerts: {ceph_health_status['checks'].keys()},"
            f" checking status again in 1 minute")
        time.sleep(60)

    # Checking if any slow operations are reported and waiting up to 'mon_db_trim_wait_dur' seconds for the slow_ops to be cleared
    end_time = datetime.datetime.now() + datetime.timedelta(
        seconds=mon_db_trim_wait_dur)
    while end_time > datetime.datetime.now():
        if not get_slow_ops_data(
                node=node, installer=installer_node, action="current"):
            log.info("Operations in progress, checking again in 30 seconds")
            time.sleep(30)
            continue
        # Logging all the historic operations for reference / future enhancement
        get_slow_ops_data(node=node,
                          installer=installer_node,
                          action="historic")
        break

    # collecting the final size of the Mon DB and the OSD map epoch times
    daemon_info = rados_obj.run_ceph_command(cmd="ceph orch ps")
    mon_daemons = [
        entry for entry in daemon_info if entry["daemon_type"] == "mon"
    ]
    final_db_size = get_mondb_size(mon_nodes[0], mon_daemons)
    final_status = rados_obj.run_ceph_command(cmd="ceph report")
    final_commmits = {
        "osdmap_first_committed":
        float(final_status["osdmap_first_committed"]),
        "osdmap_last_committed": float(final_status["osdmap_last_committed"]),
    }

    mon_db_max_size = max(mon_db_size_list)
    # Getting the trend of mon DB size when the operations were running on cluster.
    mon_db_size_size_change = list()
    for i in range(len(mon_db_size_list) - 1):
        mon_db_size_size_change.append(mon_db_size_list[i + 1] -
                                       mon_db_size_list[i])

    # Reverting the config changes made for generating slow_ops
    change_config_for_slow_ops(rados_obj=rados_obj, action="rm", **kwargs)

    # Checking the final results
    if not any(x <= 0 for x in mon_db_size_size_change):
        error = f"The mon DB has only been increasing since the test began. DB sizes {mon_db_size_list}"
        raise TestCaseFailureException(error)

    if final_db_size > mon_db_max_size:
        error = (
            f"The mon DB size after cluster clean is higher than when operations were being performed.\n"
            f"max size during operations : {mon_db_max_size} , final DB size after clean {final_db_size}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # Initial update of OSD maps can be the same at the beginning and end of test
    if (final_commmits["osdmap_first_committed"] <
            init_commmits["osdmap_first_committed"]):
        error = (
            f"The osdmap first_committed value went backwards during the test\n"
            f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # Final updates need to be more than initial updates as there are OSD map changes during the duration of test
    if (final_commmits["osdmap_last_committed"] <=
            init_commmits["osdmap_last_committed"]):
        error = (
            f"The osdmap last_committed value has not advanced during the test\n"
            f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # The number of OSD mappings present on cluster should not exceed 800 in total
    # https://tracker.ceph.com/issues/37875#note-1
    if (final_commmits["osdmap_last_committed"] -
            final_commmits["osdmap_first_committed"]) > 800:
        error = (
            f"There are still too many old commits in the Mon DB. OSD maps not trimmed as needed\n"
            f"The commits are initial commits : {init_commmits}, final commits : {final_commmits}"
        )
        log.error(error)
        raise TestCaseFailureException()

    # Checking the paxos trimming sizes
    if (int(final_status["paxos"]["last_committed"]) -
            int(final_status["paxos"]["first_committed"])) > 1000:
        error = (
            f"There are still too many old commits in the Mon DB.\n"
            f"paxos first_committed : {final_status['paxos']['first_committed']},"
            f" paxos last_committed : {final_status['paxos']['last_committed']}")
        log.error(error)
        raise TestCaseFailureException()

    log.info("mon DB was trimmed successfully")
Example No. 3
def run(ceph_cluster, **kw):
    """
    Test to create a pool, then add, get, and delete objects & snapshots.
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = rados_obj.ceph_cluster.get_nodes(role="client")[0]
    pool_target_configs = config["verify_client_pg_access"]["configurations"]
    num_snaps = config["verify_client_pg_access"]["num_snapshots"]
    log.debug(
        "Verifying the effects of rados put, get, snap & delete on pool with single PG"
    )

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        pool_name = entry["pool_name"]
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {pool_name}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=pool_name,
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        # Creating and reading objects
        with parallel() as p:
            p.spawn(do_rados_put, client_node, pool_name, 500)
            p.spawn(do_rados_get, client_node, pool_name, 1)

        # Creating and deleting snapshots on the pool
        snapshots = []
        for _ in range(num_snaps):
            snap = pool_obj.create_pool_snap(pool_name=pool_name)
            if snap:
                snapshots.append(snap)
            else:
                log.error("Could not create snapshot on the pool")
                return 1

        if not pool_obj.delete_pool_snap(pool_name=pool_name):
            log.error("Could not delete the snapshots created")
            return 1

        # Deleting the objects created on the pool
        if not pool_obj.do_rados_delete(pool_name=pool_name):
            log.error("Could not delete the objects present on pool")
            return 1

        rados_obj.detete_pool(pool=pool_name)
        log.info(f"Completed all operations on pool {pool_name}")

    log.info(
        "Completed testing effects of rados put, get, snap & delete on pool with single PG"
    )
    return 0
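
The snapshot helpers used above (create_pool_snap / delete_pool_snap) wrap the plain `rados` CLI. A hedged sketch of the equivalent raw calls follows; mksnap, lssnap and rmsnap are standard `rados` subcommands, while the pool and snapshot names are placeholders and the function name is illustrative.

import subprocess


def exercise_pool_snapshot(pool_name, snap_name):
    """Create, list and remove a pool snapshot using the rados CLI."""
    # Create a snapshot of the whole pool
    subprocess.run(["rados", "-p", pool_name, "mksnap", snap_name], check=True)
    # List the snapshots currently present on the pool
    out = subprocess.run(["rados", "-p", pool_name, "lssnap"],
                         check=True, capture_output=True, text=True)
    print(out.stdout)
    # Remove the snapshot again
    subprocess.run(["rados", "-p", pool_name, "rmsnap", snap_name], check=True)


# exercise_pool_snapshot("test_pool", "snap-1")
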
Example No. 4
def run(ceph_cluster, **kw):
    """
    Prepares the cluster to run rados tests.
    Actions Performed:
    1. Create Replicated and Erasure coded pools and write objects into the pools
    2. Setup email alerts for sending errors/warnings on the cluster.
        Verifies Bugs:
        https://bugzilla.redhat.com/show_bug.cgi?id=1849894
        https://bugzilla.redhat.com/show_bug.cgi?id=1878145
    3. Enable logging into file and check file permissions
        Verifies Bug : https://bugzilla.redhat.com/show_bug.cgi?id=1884469
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization

    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    out, err = cephadm.shell(["uuidgen"])
    uuid = out.split("-")[0]

    if config.get("ec_pool"):
        ec_config = config.get("ec_pool")
        ec_config.setdefault("pool_name", f"ecpool_{uuid}")
        if not rados_obj.create_erasure_pool(name=uuid, **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if ec_config.get("test_overwrites_pool"):
            if not rados_obj.verify_ec_overwrites(**ec_config):
                log.error("Failed to create the EC Pool")
                return 1
        else:
            if not rados_obj.bench_write(**ec_config):
                log.error("Failed to write objects into the EC Pool")
                return 1
            rados_obj.bench_read(**ec_config)
            log.info(
                "Created the EC Pool, Finished writing data into the pool")

        if ec_config.get("delete_pool"):
            if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
                log.error("Failed to delete EC Pool")
                return 1

    if config.get("replicated_pool"):
        rep_config = config.get("replicated_pool")
        rep_config.setdefault("pool_name", f"repool_{uuid}")
        if not rados_obj.create_pool(**rep_config):
            log.error("Failed to create the replicated Pool")
            return 1
        if not rados_obj.bench_write(**rep_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**rep_config)
        log.info(
            "Created the replicated Pool, Finished writing data into the pool")
        if rep_config.get("delete_pool"):
            if not rados_obj.detete_pool(pool=rep_config["pool_name"]):
                log.error("Failed to delete replicated Pool")
                return 1

    if config.get("set_pool_configs"):
        changes = config["set_pool_configs"]
        pool_name = changes["pool_name"]
        configurations = changes["configurations"]
        for conf in configurations.keys():
            if not rados_obj.set_pool_property(
                    pool=pool_name, props=conf, value=configurations[conf]):
                log.error(f"failed to set property {conf} on the cluster")
                return 1
        log.info(f"made the config changes on the pool {pool_name}")

    if config.get("email_alerts"):
        alert_config = config.get("email_alerts")
        if not rados_obj.enable_email_alerts(**alert_config):
            log.error("Error while configuring email alerts")
            return 1
        log.info("email alerts configured")

    if config.get("Verify_config_parameters"):
        test_config = config.get("Verify_config_parameters")
        test_node = ceph_cluster.get_nodes(role="osd")[0]
        for conf in test_config["configurations"]:
            for entry in conf.values():
                if entry.get("location_type") == "host":
                    entry["location_value"] = test_node.hostname
                if not mon_obj.set_config(**entry):
                    log.error(f"Error setting config {conf}")
                    return 1
        log.info("done")
        pool_name = "test_pool_1"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        rados_obj.bench_write(pool_name=pool_name, rados_write_duration=50)

        # Removing test configurations
        for conf in test_config["configurations"]:
            for entry in conf.values():
                if entry.get("location_type") == "host":
                    entry["location_value"] = test_node.hostname
                if not mon_obj.remove_config(**entry):
                    log.error(f"Error setting config {conf}")
                    return 1
        log.info("finished removing values, passed")

    if config.get("log_to_file"):
        if not rados_obj.enable_file_logging():
            log.error("Error while setting config to enable logging into file")
            return 1
        log.info("Logging to file configured")

    if config.get("cluster_configuration_checks"):
        cls_config = config.get("cluster_configuration_checks")
        if not rados_obj.set_cluster_configuration_checks(**cls_config):
            log.error("Error while setting Cluster config checks")
            return 1
        log.info("Set up cluster configuration checks")

    if config.get("configure_balancer"):
        balancer_config = config.get("configure_balancer")
        if not rados_obj.enable_balancer(**balancer_config):
            log.error("Error while setting up balancer on the Cluster")
            return 1
        log.info("Set up Balancer on the cluster")

    if config.get("configure_pg_autoscaler"):
        autoscaler_config = config.get("configure_pg_autoscaler")
        if not rados_obj.configure_pg_autoscaler(**autoscaler_config):
            log.error("Error while setting up pg_autoscaler on the Cluster")
            return 1
        log.info("Set up pg_autoscaler on the cluster")

    if config.get("enable_compression"):
        compression_conf = config["enable_compression"]
        pool_name = compression_conf["pool_name"]
        for conf in compression_conf["configurations"]:
            for entry in conf.values():
                if not rados_obj.pool_inline_compression(pool_name=pool_name,
                                                         **entry):
                    log.error(
                        f"Error setting compression on pool : {pool_name} for config {conf}"
                    )
                    return 1
                if not rados_obj.bench_write(**compression_conf):
                    log.error("Failed to write objects into Pool")
                    return 1
                rados_obj.bench_read(**compression_conf)
                log.info(
                    "Finished writing data into the compression-enabled pool"
                )
        log.info("Completed compression tests")

    if config.get("delete_pools"):
        for name in config["delete_pools"]:
            if not rados_obj.detete_pool(name):
                log.error(f"the pool {name} could not be deleted")
                return 1
        log.info("deleted all the given pools successfully")

    log.info("All Pre-requisites completed to run Rados suite")
    return 0
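
The set_pool_configs branch above applies a dict of key/value pairs to a pool. As a rough standalone sketch, the same thing can be done directly with `ceph osd pool set`; the helper name is illustrative and the property names passed in must be valid pool settings on the cluster.

import subprocess


def set_pool_properties(pool_name, configurations):
    """Apply a dict of pool properties via `ceph osd pool set`."""
    for prop, value in configurations.items():
        # e.g. ceph osd pool set repool_1 pg_num 32
        subprocess.run(
            ["ceph", "osd", "pool", "set", pool_name, prop, str(value)],
            check=True,
        )


# set_pool_properties("repool_1", {"size": 3, "pg_num": 32})
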
def run(ceph_cluster, **kw) -> int:
    """
    Test to copy data from one pool to another
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)
    client_node = ceph_cluster.get_nodes(role="client")[0]

    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_orig = pool_configs[config["pool-1-type"]][config["pool-1-conf"]]
    pool_target = pool_configs[config["pool-2-type"]][config["pool-2-conf"]]
    create_given_pool(rados_obj, pool_orig)
    create_given_pool(rados_obj, pool_target)

    # Writing objects with huge omap entries
    if not pool_obj.fill_omap_entries(pool_name=pool_orig["pool_name"],
                                      obj_end=500):
        log.error(
            f"Omap entries not generated on pool {pool_orig['pool_name']}")
        return 1

    do_rados_put(mon=client_node, pool=pool_orig["pool_name"], nobj=1000)

    snapshots = []
    for _ in range(5):
        snap = pool_obj.create_pool_snap(pool_name=pool_orig["pool_name"])
        if snap:
            snapshots.append(snap)
        else:
            log.error("Could not create snapshot on the pool")
            return 1

    # Using cppool to copy contents b/w the pools
    cmd = f"rados cppool {pool_orig['pool_name']} {pool_target['pool_name']}"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of the pool were also copied
    # Snapshots of the pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Target pool created after cppool
    rados_obj.detete_pool(pool=pool_target["pool_name"])

    # Creating new target pool to test import/export
    create_given_pool(rados_obj, pool_target)

    # Creating temp file to hold pool info
    client_node.exec_command(cmd="touch /tmp/file")

    # creating an export of the data in the old pool
    cmd = f"rados export -p {pool_orig['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Importing the file into the new pool
    cmd = f"rados import -p {pool_target['pool_name']} /tmp/file"
    client_node.exec_command(sudo=True, cmd=cmd, long_running=True)

    # Sleeping for 2 seconds after copy to perform get operations
    time.sleep(2)

    do_rados_get(client_node, pool_target["pool_name"], 1)

    # Checking if the snapshots of the pool were also copied
    # Snapshots of the pool should not be copied
    for snap_name in snapshots:
        if pool_obj.check_snap_exists(snap_name=snap_name,
                                      pool_name=pool_target["pool_name"]):
            log.error("Snapshot of pool exists")
            return 1

    # deleting the Original & Target pools used for the copy tests
    rados_obj.detete_pool(pool=pool_target["pool_name"])
    rados_obj.detete_pool(pool=pool_orig["pool_name"])
    return 0
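
The copy test above relies on three stock `rados` subcommands. The sketch below strings just those calls together with placeholder pool names and a temporary dump file, mirroring the commands built in the test; the helper name is illustrative.

import subprocess


def copy_pool_contents(src_pool, dst_pool, dump_file="/tmp/pool_dump"):
    """Copy objects between pools via cppool and via export/import."""
    # Direct pool-to-pool copy (pool snapshots are not carried over)
    subprocess.run(["rados", "cppool", src_pool, dst_pool], check=True)
    # Export the source pool to a local file, then import it into the target
    subprocess.run(["rados", "export", "-p", src_pool, dump_file], check=True)
    subprocess.run(["rados", "import", "-p", dst_pool, dump_file], check=True)
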
Example No. 6
def run(ceph_cluster, **kw):
    """
    Test to Verify the pg-autoscale flag.
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_configs_path = "conf/pacific/rados/test-confs/pool-configurations.yaml"

    regex = r"\s*(\d.\d)-rhel-\d"
    build = (re.search(regex, config.get("build",
                                         config.get("rhbuild")))).groups()[0]
    if not float(build) > 5.0:
        log.info(
            "Test running on version less than 5.1, skipping verifying autoscaler flags"
        )
        return 0

    # Setting the no-autoscale flag
    cmd = "ceph osd pool set noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Getting the autoscale configurations after setting the flag
    # all the pools should have autoscale set to off
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "on":
            log.error(
                f"Pg autoscaler not turned off for pool : {entry['pool_name']}"
            )
            return 1

    if not mon_obj.verify_set_config(section="global",
                                     name="osd_pool_default_pg_autoscale_mode",
                                     value="off"):
        log.error(
            "Default autoscale mode not set to off upon setting the no-autoscale flag"
        )
        return 1

    if not mon_obj.verify_set_config(
            section="mgr", name="mgr/pg_autoscaler/noautoscale", value="true"):
        log.error(
            "autoscale Flag not set to true upon setting the no-autoscale flag"
        )
        return 1

    # Creating a new pool, with the flag off, new pool should be created with autoscaler profile turned off
    with open(pool_configs_path, "r") as fd:
        pool_configs = yaml.safe_load(fd)

    pool_conf = pool_configs["replicated"]["sample-pool-2"]
    create_given_pool(rados_obj, pool_conf)

    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pool_name"] == pool_conf["pool_name"]:
            if entry["pg_autoscale_mode"] == "on":
                log.error(
                    f"Pg autoscaler not turned off for the new pool : {entry['pool_name']} "
                    f"created with flag turned off")
                return 1

    # Turning the autoscale flag back on. All the settings made earlier should be reverted
    cmd = "ceph osd pool unset noautoscale"
    rados_obj.run_ceph_command(cmd=cmd)

    # sleeping for 5 seconds as the command takes some time to affect the status of pools
    time.sleep(5)

    # Re-fetching the autoscale status after removing the flag
    cmd = "ceph osd pool autoscale-status"
    pool_status = rados_obj.run_ceph_command(cmd=cmd)

    for entry in pool_status:
        if entry["pg_autoscale_mode"] == "off":
            log.error(
                f"Pg autoscaler not turned on for pool : {entry['pool_name']}")
            return 1

    if not mon_obj.verify_set_config(section="global",
                                     name="osd_pool_default_pg_autoscale_mode",
                                     value="on"):
        log.error(
            "Default autoscale mode not set back to on upon removing the no-autoscale flag"
        )
        return 1

    if not mon_obj.verify_set_config(section="mgr",
                                     name="mgr/pg_autoscaler/noautoscale",
                                     value="false"):
        log.error(
            "autoscale Flag not set to false upon removing the no-autoscale flag"
        )
        return 1

    # Deleting the pool created earlier
    if not rados_obj.detete_pool(pool=pool_conf["pool_name"]):
        log.error(f"the pool {pool_conf['pool_name']} could not be deleted")
        return 1

    log.info("Autoscale flag is working as expected.")
    return 0
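
A compact sketch of the flag handling exercised above: set or unset the cluster-wide noautoscale flag, wait briefly, and read back the per-pool autoscale status. It assumes the standard `--format json` switch on the ceph CLI; the helper name is illustrative.

import json
import subprocess
import time


def toggle_noautoscale(enable=True, settle_time=5):
    """Toggle the noautoscale flag and return the pool autoscale status."""
    action = "set" if enable else "unset"
    subprocess.run(["ceph", "osd", "pool", action, "noautoscale"], check=True)
    # Give the flag a few seconds to be reflected in autoscale-status
    time.sleep(settle_time)
    out = subprocess.run(
        ["ceph", "osd", "pool", "autoscale-status", "--format", "json"],
        check=True, capture_output=True, text=True,
    )
    return json.loads(out.stdout)


# for pool in toggle_noautoscale(enable=True):
#     assert pool["pg_autoscale_mode"] == "off", pool["pool_name"]
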
Example No. 7
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    pool_obj = PoolFunctions(node=cephadm)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set {acting_pg_set} does not contain k+m OSDs"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' OSDs from the acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active alerts: {status_report['health']['checks'].keys()}, "
                f"PG States : {status_report['num_pg_by_state']},"
                f" checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"Acting set {acting_pg_set} does not contain k+m OSDs"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data to the 2 pools, the pool with compression enabled should consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] < 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("test_autoscaler_bulk_feature"):
        """
        Tests to verify the autoscaler bulk flag, which allows pools to make use of the
        scale-down profile, making those pools start with a full complement of PGs.
        Tests include
        1. creating new pools with bulk,
        2. enabling/disabling bulk flag on existing pools
        3. Verify the PG changes when the flag is set/unset
        Verifies bugs : https://bugzilla.redhat.com/show_bug.cgi?id=2049851
        """
        regex = r"\s*(\d.\d)-rhel-\d"
        build = (re.search(regex,
                           config.get("build",
                                      config.get("rhbuild")))).groups()[0]
        if not float(build) > 5.0:
            log.info(
                "Test running on version less than 5.1, skipping verifying bulk flags"
            )
            return 0

        # Creating a pool with bulk feature
        pool_name = config.get("pool_name")
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to create a pool with bulk features")
            return 1

        # Checking the autoscaler status, final PG counts, bulk flags
        pg_target_init = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # Unsetting the bulk flag and checking the change in the PG counts
        if not pool_obj.rm_bulk_flag(pool_name=pool_name):
            log.error("Failed to remove the bulk flag from the pool")
            return 1

        # Sleeping for 5 seconds for new PG num to be set
        time.sleep(5)
        pg_target_interim = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG's once the flag is disabled must be lesser than when enabled
        if pg_target_interim >= pg_target_init:
            log.error("PG's not reduced after bulk flag disabled")
            return 1

        # Setting the bulk flag on pool again and checking the change in the PG counts
        if not pool_obj.set_bulk_flag(pool_name=pool_name):
            log.error("Failed to re-enable the bulk flag on the pool")
            return 1

        # Sleeping for 5 seconds for new PG num to be set
        time.sleep(5)

        pg_target_final = pool_obj.get_target_pg_num_bulk_flag(
            pool_name=pool_name)

        # The target PG's once the flag is re-enabled must be greater than when disabled
        if pg_target_interim >= pg_target_final:
            log.error("PG's not increased after bulk flag enabled")
            return 1

        if config.get("delete_pool"):
            rados_obj.detete_pool(pool=pool_name)
        log.info("Verified the workings of bulk flag")
        return 0

    if config.get("verify_pool_target_ratio"):
        log.debug("Verifying target size ratio on pools")
        target_configs = config["verify_pool_target_ratio"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"],
                    ratio=entry["target_size_ratio"]):
                log.error(
                    f"Could not change the target ratio on the pool: {entry['pool_name']}"
                )
                return 1
            log.debug("Set the ratio. getting the projected pg's")

            rados_obj.change_recover_threads(config=config, action="set")
            log.debug(
                "Waiting for the rebalancing to complete on the cluster after the change"
            )
            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)

            new_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if new_pg_count <= entry["pg_num"]:
                log.error(
                    f"Count of PG's not increased on the pool: {entry['pool_name']}. "
                    f"Initial creation count : {entry['pg_num']}, "
                    f"new count after setting the target ratio : {new_pg_count}")
                return 1

            res = wait_for_clean_pg_sets(rados_obj)
            if not res:
                log.error(
                    "PG's in cluster are not active + Clean after the ratio change"
                )
                return 1
            if not pool_obj.verify_target_ratio_set(
                    pool_name=entry["pool_name"], ratio=0.0):
                log.error(
                    f"Could not remove the target ratio on the pool: {entry['pool_name']}"
                )
                return 1

            # Sleeping for 2 minutes for rebalancing to start & for new PG count to be updated.
            time.sleep(120)
            # Checking if after the removal of ratio, the PG count has reduced
            end_pg_count = int(
                pool_obj.get_pg_autoscaler_value(pool_name=entry["pool_name"],
                                                 item="pg_num_target"))
            if end_pg_count >= new_pg_count:
                log.error(
                    f"Count of PG's not changed/ reverted on the pool: {entry['pool_name']}"
                    f" after removing the target ratios")
                return 1
            rados_obj.change_recover_threads(config=config, action="rm")
            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of target ratio on pool: {entry['pool_name']} "
            )
        log.info("Target ratio tests completed")
        return 0

    if config.get("verify_mon_target_pg_per_osd"):
        pg_conf = config.get("verify_mon_target_pg_per_osd")
        if not mon_obj.set_config(**pg_conf):
            log.error("Could not set the value for mon_target_pg_per_osd ")
            return 1
        mon_obj.remove_config(**pg_conf)
        log.info("Set and verified the value for mon_target_pg_per_osd ")
        return 0

    if config.get("verify_pg_num_min"):
        log.debug("Verifying pg_num_min on pools")
        target_configs = config["verify_pg_num_min"]["configurations"]
        # Creating pools and starting the test
        for entry in target_configs.values():
            log.debug(f"Creating {entry['pool_type']} pool on the cluster")
            if entry.get("pool_type", "replicated") == "erasure":
                method_should_succeed(rados_obj.create_erasure_pool,
                                      name=entry["pool_name"],
                                      **entry)
            else:
                method_should_succeed(
                    rados_obj.create_pool,
                    **entry,
                )
            rados_obj.bench_write(**entry)

            if not rados_obj.set_pool_property(pool=entry["pool_name"],
                                               props="pg_num_min",
                                               value=entry["pg_num_min"]):
                log.error("Could not set the pg_min_size on the pool")
                return 1

            if entry.get("delete_pool", False):
                rados_obj.detete_pool(pool=entry["pool_name"])
            log.info(
                f"Completed the test of pg_num_min on pool: {entry['pool_name']} "
            )
        log.info("pg_num_min tests completed")
        return 0
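
The Compression_tests branch above compares two pools through `ceph df detail`. Below is a minimal standalone sketch of that comparison, assuming the same JSON layout the test reads (pools -> name/stats with compress_bytes_used and kb_used); the function name is illustrative.

import json
import subprocess


def compression_effective(pool_with_compression, pool_without_compression):
    """Return True if the compressed pool stores data in less space."""
    out = subprocess.run(
        ["ceph", "df", "detail", "--format", "json"],
        check=True, capture_output=True, text=True,
    )
    pools = {p["name"]: p["stats"] for p in json.loads(out.stdout)["pools"]}
    stats_c = pools[pool_with_compression]
    stats_n = pools[pool_without_compression]
    # Some bytes must actually be compressed, and the footprint must be smaller
    return stats_c["compress_bytes_used"] > 0 and stats_c["kb_used"] < stats_n["kb_used"]
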
Example No. 8
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node and not tiebreaker_node:
        log.error(
            "Admin client and tie breaker node not configured, Cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    if config.get("verify_forced_recovery"):
        log.info("Verifying forced recovery and healthy in stretch environment")

        pool_name = "stretch_pool_recovery"
        if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
            log.error("Failed to create the replicated Pool")
            return 1

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(pool_name=pool_name)

        log.info(
            f"Killing 2 OSD's from acting set : {acting_pg_set} to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(2)]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        log.info("Stopped 2 OSD's from acting set, starting to wait for recovery")
        rados_obj.change_recover_threads(config=config, action="set")

        if not rados_obj.bench_write(pool_name=pool_name, **config):
            log.error("Failed to write objects into the Pool")
            return 1

        log.debug("Triggering forced recovery in stretch mode")
        cmd = "ceph osd force_recovery_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)
        log.info("Triggered the recovery in stretch mode")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # there was data written into pool when the OSD's were down.
        # Verifying if data is recovered and placed into the OSD's after bringing them back
        res = wait_for_clean_pg_sets(rados_obj)
        if not res:
            log.error("PG's in cluster are not active + Clean ")
            return 1

        log.debug("Forcing the stretch cluster into healthy mode")
        cmd = "ceph osd force_healthy_stretch_mode --yes-i-really-mean-it"
        rados_obj.run_ceph_command(cmd)

        log.info("Cluster has successfully recovered and is in healthy state")
        return 0

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(f"the pool {entry['pool_name']} could not be deleted")
                return 1
        log.debug("No pools other than replicated found on cluster")

    # disabling automatic crush update
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting osd details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][: (len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][: (len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
        node=cephadm,
        osds=site_a_osds,
        site=site1,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
        node=cephadm,
        osds=site_b_osds,
        site=site2,
        all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # collecting mon map to be compared after stretch cluster deployment
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(
        node=client_node, rule_name=stretch_rule_name, site1=site1, site2=site2
    ):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after setting the connectivity mode.")
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(
        node=cephadm, tiebreaker_node=tiebreaker_node, site1=site1, site2=site2
    ):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] <= init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after enabling stretch mode")
        return 1

    # Increasing backfill/rebalance threads so that cluster will re-balance it faster
    rados_obj.change_recover_threads(config=config, action="set")

    # wait for active + clean after deployment of stretch mode
    # checking the state after deployment because of BZ : https://bugzilla.redhat.com/show_bug.cgi?id=2025800
    res = wait_for_clean_pg_sets(rados_obj)
    if not res:
        status_report = rados_obj.run_ceph_command(cmd="ceph report")
        # Proceeding to check if all PG's are in active + clean
        for entry in status_report["num_pg_by_state"]:
            rec = ("remapped", "peering")
            if any(key in rec for key in entry["state"].split("+")):
                log.error(
                    "PG's in cluster are stuck in remapped+peering after stretch deployment."
                )
                return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
            pool_name=pool_name,
            crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=100)

        log.info("Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        flag = wait_for_clean_pg_sets(rados_obj)
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)
        log.info("Successfully completed Add Capacity scenario")

    rados_obj.change_recover_threads(config=config, action="rm")

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
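
The 4-OSD acting-set check above goes through the RadosOrchestrator helper. As a rough, standalone illustration (not the framework's implementation), the same spot-check could be done straight from the ceph CLI; the object name below is a placeholder and the "acting" key of the JSON output is an assumption about `ceph osd map --format json`.

import json
import subprocess


def acting_set_for_object(pool: str, obj: str) -> list:
    """Return the acting OSD set of the PG that would hold `obj` in `pool`."""
    out = subprocess.check_output(
        ["ceph", "osd", "map", pool, obj, "--format", "json"]
    )
    return json.loads(out)["acting"]


if __name__ == "__main__":
    acting = acting_set_for_object("test_stretch_pool", "dummy_object")
    # A healthy stretch cluster places every PG on 4 OSD's, 2 per datacenter
    assert len(acting) == 4, f"expected 4 OSD's in the acting set, got {acting}"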
Example #9
def run(ceph_cluster, **kw):
    """
    Test to create a large number of omap entries on a single-PG pool and test OSD resiliency
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    pool_obj = PoolFunctions(node=cephadm)

    pool_target_configs = config["verify_osd_omap_entries"]["configurations"]
    omap_target_configs = config["verify_osd_omap_entries"]["omap_config"]

    # Creating pools and starting the test
    for entry in pool_target_configs.values():
        log.debug(
            f"Creating {entry['pool_type']} pool on the cluster with name {entry['pool_name']}"
        )
        if entry.get("pool_type", "replicated") == "erasure":
            method_should_succeed(rados_obj.create_erasure_pool,
                                  name=entry["pool_name"],
                                  **entry)
        else:
            method_should_succeed(
                rados_obj.create_pool,
                **entry,
            )

        log.debug(
            "Created the pool. beginning to create large number of omap entries on the pool"
        )
        if not pool_obj.fill_omap_entries(pool_name=entry["pool_name"],
                                          **omap_target_configs):
            log.error(
                f"Omap entries not generated on pool {entry['pool_name']}")
            return 1

        # Fetching the current acting set for the pool
        acting_set = rados_obj.get_pg_acting_set(pool_name=entry["pool_name"])
        rados_obj.change_recover_threads(config={}, action="set")
        log.debug(
            f"Proceeding to restart OSd's from the acting set {acting_set}")
        for osd_id in acting_set:
            rados_obj.change_osd_state(action="stop", target=osd_id)
            # sleeping for 5 seconds for re-balancing to begin
            time.sleep(5)

            # Waiting for cluster to get clean state after OSD stopped
            if not wait_for_clean_pg_sets(rados_obj):
                log.error("PG's in cluster are not active + Clean state.. ")
                return 1
            rados_obj.change_osd_state(action="restart", target=osd_id)
            log.debug(
                f"Cluster reached clean state after osd {osd_id} stop and restart"
            )

        rados_obj.change_recover_threads(config={}, action="rm")
        # deleting the pool created after the test
        rados_obj.detete_pool(pool=entry["pool_name"])

        log.info(
            f"All the OSD's from the acting set {acting_set} were restarted "
            f"and object movement completed for pool {entry['pool_name']}")

    log.info(
        "Completed testing the effects of a large number of omap entries on pools")
    return 0
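
PoolFunctions.fill_omap_entries() is assumed to generate the omap workload above. A minimal sketch of the same idea, using only the rados CLI (far slower than a librados-based helper; pool/object names illustrative), would look like this:

import subprocess


def fill_omap_with_cli(pool: str, obj: str, num_keys: int) -> None:
    """Write `num_keys` omap key/value pairs onto a single object, one call per key."""
    for i in range(num_keys):
        subprocess.check_call(
            ["rados", "-p", pool, "setomapval", obj, f"key_{i}", f"value_{i}"]
        )


def count_omap_keys(pool: str, obj: str) -> int:
    """Rough verification: `rados listomapkeys` prints one key per line; count them."""
    out = subprocess.check_output(["rados", "-p", pool, "listomapkeys", obj])
    return len(out.splitlines())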
Example #10
def run(ceph_cluster, **kw):
    """
    Performs various pool related validation tests
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    if config.get("ec_pool_recovery_improvement"):
        ec_config = config.get("ec_pool_recovery_improvement")
        if not rados_obj.create_erasure_pool(name="recovery", **ec_config):
            log.error("Failed to create the EC Pool")
            return 1

        if not rados_obj.bench_write(**ec_config):
            log.error("Failed to write objects into the EC Pool")
            return 1
        rados_obj.bench_read(**ec_config)
        log.info("Created the EC Pool, Finished writing data into the pool")

        # getting the acting set for the created pool
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        log.info(
            f"Killing m, i.e {ec_config['m']} OSD's from acting set to verify recovery"
        )
        stop_osds = [acting_pg_set.pop() for _ in range(ec_config["m"])]
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="stop", target=osd_id):
                log.error(f"Unable to stop the OSD : {osd_id}")
                return 1

        log.info(
            "Stopped 'm' number of OSD's from, starting to wait for recovery")
        rados_obj.change_recover_threads(config=ec_config, action="set")

        # Sleeping for 25 seconds ( "osd_heartbeat_grace": "20" ) for osd's to be marked down
        time.sleep(25)

        # Waiting for up to 2.5 hours for the recovery to complete and PG's to enter active + Clean state
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "recovering",
                    "recovery_wait",
                    "backfilling_wait",
                    "peered",
                    "undersized",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
                f"PG States : {status_report['num_pg_by_state']}"
                f" checking status again in 1 minute")
            time.sleep(60)

        # getting the acting set for the created pool after recovery
        acting_pg_set = rados_obj.get_pg_acting_set(
            pool_name=ec_config["pool_name"])
        if len(acting_pg_set) != ec_config["k"] + ec_config["m"]:
            log.error(
                f"acting set consists of only these : {acting_pg_set} OSD's, less than k+m"
            )
            return 1
        log.info(
            f" Acting set of the pool consists of OSD's : {acting_pg_set}")
        # Changing recovery threads back to default
        rados_obj.change_recover_threads(config=ec_config, action="rm")

        log.debug("Starting the stopped OSD's")
        for osd_id in stop_osds:
            if not rados_obj.change_osd_state(action="restart", target=osd_id):
                log.error(f"Unable to restart the OSD : {osd_id}")
                return 1

        # Sleep for 5 seconds for OSD's to join the cluster
        time.sleep(5)

        if not flag:
            log.error(
                "The pool did not reach active + Clean state after recovery")
            return 1

        # Deleting the pool created
        if not rados_obj.detete_pool(pool=ec_config["pool_name"]):
            log.error(
                f"the pool {ec_config['pool_name']} could not be deleted")
            return 1

        log.info("Successfully tested EC pool recovery with K osd's surviving")
        return 0

    if config.get("Compression_tests"):
        """
        Create a 2 replicated pools:
        1. Pool_1 : enable any compression algorithm(def snappy) and compression mode(aggressive/force).
        2. Pool_2 : set compression mode to none
        Writing the same amount of data on 2 pools, size of pool with compression on would consume less space
        """
        pool_config = config["Compression_tests"]["pool_config"]
        compression_config = config["Compression_tests"]["compression_config"]
        pool_1 = pool_config["pool-1"]
        pool_2 = pool_config["pool-2"]

        if config["Compression_tests"]["pool_type"] == "replicated":
            if not rados_obj.create_pool(pool_name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            if not rados_obj.create_pool(pool_name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
        elif config["Compression_tests"]["pool_type"] == "erasure":
            pool_config["pool_name"] = pool_1
            if not rados_obj.create_erasure_pool(name=pool_1, **pool_config):
                log.error("could not create pool-1")
                return 1
            pool_config["pool_name"] = pool_2
            if not rados_obj.create_erasure_pool(name=pool_2, **pool_config):
                log.error("could not create pool-2")
                return 1
            del pool_config["pool_name"]

        log.debug("Created two pools to test compression")

        # Enabling compression on pool-1
        if not rados_obj.pool_inline_compression(pool_name=pool_1,
                                                 **compression_config):
            log.error(
                f"Error setting compression on pool : {pool_1} for config {compression_config}"
            )
            return 1

        # Writing the same amount of data into two pools
        if not rados_obj.bench_write(pool_name=pool_1, **pool_config):
            log.error(
                "Failed to write objects into Pool-1, with compression enabled"
            )
            return 1

        if not rados_obj.bench_write(pool_name=pool_2, **pool_config):
            log.error(
                "Failed to write objects into Pool-2, without compression enabled"
            )
            return 1
        # Sleeping for 5 seconds for status to be updated.
        time.sleep(5)

        log.debug(
            "Finished writing data into the two pools. Checking pool stats")
        try:
            pool_stats = rados_obj.run_ceph_command(
                cmd="ceph df detail")["pools"]
            pool_1_stats = [
                detail for detail in pool_stats if detail["name"] == pool_1
            ][0]["stats"]
            pool_2_stats = [
                detail for detail in pool_stats if detail["name"] == pool_2
            ][0]["stats"]
        except KeyError:
            log.error(
                "No stats about the pools requested found on the cluster")
            return 1

        log.debug(f"Pool-1 stats: {pool_1_stats}")
        log.debug(f"Pool-2 stats: {pool_2_stats}")
        if pool_1_stats["compress_bytes_used"] <= 0:
            log.error("No data stored under pool-1 is compressed")
            return 1

        if pool_1_stats["kb_used"] >= pool_2_stats["kb_used"]:
            log.error("Compression has no effect on the pool size...")
            return 1

        if config["Compression_tests"].get("verify_compression_ratio_set"):
            # added verification for test: CEPH-83571672
            if not rados_obj.check_compression_size(pool_name=pool_1,
                                                    **compression_config):
                log.error("data not compressed in accordance to ratio set")
                return 1

        log.info("Pool size is less when compression is enabled")
        return 0

    if config.get("check_autoscaler_profile"):
        """
        Verifies that the default auto-scaler profile on 5.1 builds is scale-up
        Verifies bugs : 1. https://bugzilla.redhat.com/show_bug.cgi?id=2021738
        """
        build = config.get("build", config.get("rhbuild"))
        autoscale_conf = config.get("check_autoscaler_profile")
        regex = r"5.1-rhel-\d{1}"
        if re.search(regex, build):
            log.info(
                "Test running on 5.1 builds, checking the default autoscaler profile"
            )
            if not mon_obj.verify_set_config(**autoscale_conf):
                log.error(
                    f"The default value for autoscaler profile is not scale-up in buld {build}"
                )
                return 1
            log.info(f"Autoscale profile is scale-up in release : {build}")
        else:
            log.debug(
                f"The profile is already scale-up by default in release : {build}"
            )
        return 0
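
The compression scenario above delegates the pool settings to rados_obj.pool_inline_compression(). A hedged sketch of the standard BlueStore pool options that call is assumed to set follows; the values shown are illustrative, not the helper's actual defaults.

import subprocess


def enable_pool_compression(pool: str,
                            algorithm: str = "snappy",
                            mode: str = "aggressive",
                            required_ratio: float = 0.7) -> None:
    """Enable inline compression on `pool` through `ceph osd pool set`."""
    settings = {
        "compression_algorithm": algorithm,        # snappy / zlib / zstd / lz4
        "compression_mode": mode,                  # none / passive / aggressive / force
        "compression_required_ratio": str(required_ratio),
    }
    for key, value in settings.items():
        subprocess.check_call(["ceph", "osd", "pool", "set", pool, key, str(value)])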
Example #11
def run(ceph_cluster, **kw):
    """
    enables connectivity mode and deploys stretch cluster with arbiter mon node
    Actions Performed:
    1. Disables the automatic crush map update
    2. Collects the OSD daemons in the cluster and split them into 2 sites.
    3. If add capacity is selected, only half of the OSD's will be added to various sites initially.
    4. Adds the stretch rule into crush map.
    5. Adding monitors into the 2 sites.
    6. Create a replicated pool and deploy stretch mode.
    7. Create a test pool, write some data and perform add capacity. ( add osd nodes into two sites )
    8. Check for the bump in election epochs throughout.
    9. Check the acting set in PG for 4 OSD's. 2 from each site.
    Verifies bugs:
    [1]. https://bugzilla.redhat.com/show_bug.cgi?id=1937088
    [2]. https://bugzilla.redhat.com/show_bug.cgi?id=1952763
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Deploying stretch cluster with arbiter mon node")
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonElectionStrategies(rados_obj=rados_obj)
    client_node = ceph_cluster.get_nodes(role="client")[0]
    tiebreaker_node = ceph_cluster.get_nodes(role="installer")[0]

    if not client_node or not tiebreaker_node:
        log.error(
            "Admin client or tie breaker node not configured, cannot modify crush rules for stretch cluster"
        )
        return 1
    mon_state = get_mon_details(node=cephadm)
    if len(list(mon_state["monitors"])) < 5:
        log.error(
            f"Minimum of 5 Mon daemons needed to deploy a stretch cluster, found : {len(mon_state['monitors'])}"
        )
        return 1
    osd_details = get_osd_details(node=cephadm)
    if len(osd_details.keys()) < 4:
        log.error(
            f"Minimum of 4 osd daemons needed to deploy a stretch cluster, found : {len(osd_details.keys())}"
        )
        return 1

    # Finding and Deleting any stray EC pools that might have been left on cluster
    pool_dump = rados_obj.run_ceph_command(cmd="ceph osd dump")
    for entry in pool_dump["pools"]:
        if entry["type"] != 1 and entry["crush_rule"] != 0:
            log.info(
                f"A non-replicated pool found : {entry['pool_name']}, proceeding to delete pool"
            )
            if not rados_obj.detete_pool(pool=entry["pool_name"]):
                log.error(
                    f"the pool {entry['pool_name']} could not be deleted")
                return 1
        log.debug("No pools other than replicated found on cluster")

    # Disabling automatic crush updates so that the manual placement of OSD's
    # into site buckets below is not overwritten when the OSD's restart
    cmd = "ceph config set osd osd_crush_update_on_start false"
    cephadm.shell([cmd])

    site1 = config.get("site1", "site1")
    site2 = config.get("site2", "site2")

    # Collecting OSD details and splitting them into Site A and Site B
    sorted_osds = sort_osd_sites(all_osd_details=osd_details)
    site_a_osds = sorted_osds[0]
    site_b_osds = sorted_osds[1]
    if config.get("perform_add_capacity"):
        site_a_osds = sorted_osds[0][:(len(sorted_osds[0]) // 2)]
        site_b_osds = sorted_osds[1][:(len(sorted_osds[1]) // 2)]

    if not set_osd_sites(
            node=cephadm,
            osds=site_a_osds,
            site=site1,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    if not set_osd_sites(
            node=cephadm,
            osds=site_b_osds,
            site=site2,
            all_osd_details=osd_details,
    ):
        log.error("Failed to move the OSD's into sites")
        return 1

    # Adding the stretch rule into the crush map
    stretch_rule_name = "stretch_rule"
    if not setup_crush_rule(node=client_node,
                            rule_name=stretch_rule_name,
                            site1=site1,
                            site2=site2):
        log.error("Failed to Add crush rules in the crush map")
        return 1

    # Setting the election strategy to connectivity mode
    if not mon_obj.set_election_strategy(mode="connectivity"):
        log.error("could not set election strategy to connectivity mode")
        return 1

    # Sleeping for 5 sec for the strategy to be active
    time.sleep(5)
    init_mon_state = get_mon_details(node=cephadm)

    # Checking if mon elections happened after changing election strategy
    if mon_state["epoch"] > init_mon_state["epoch"]:
        log.error(
            "Election epoch not bumped up after setting the connectivity mode."
        )
        return 1

    # Checking updated election strategy in mon map
    strategy = mon_obj.get_election_strategy()
    if strategy != 3:
        log.error(
            f"cluster created election strategy other than connectivity, i.e {strategy}"
        )
        return 1
    log.info("Enabled connectivity mode on the cluster")

    log.info(
        f"selecting mon : {tiebreaker_node} as tie breaker monitor on site 3")
    if not set_mon_sites(node=cephadm,
                         tiebreaker_node=tiebreaker_node,
                         site1=site1,
                         site2=site2):
        log.error("Failed to ad monitors into respective sites")
        return 1

    # All the existing pools should be automatically changed with stretch rule. Creating a test pool
    pool_name = "test_pool_1"
    if not rados_obj.create_pool(pool_name=pool_name, pg_num=16):
        log.error("Failed to create the replicated Pool")
        return 1

    log.info("Monitors added to respective sites. enabling stretch rule")
    cmd = f"/bin/ceph mon enable_stretch_mode {tiebreaker_node.hostname} {stretch_rule_name} datacenter"
    try:
        cephadm.shell([cmd])
    except Exception as err:
        log.error(
            f"Error while enabling stretch rule on the datacenter. Command : {cmd}"
        )
        log.error(err)
        return 1

    if get_mon_details(node=cephadm)["epoch"] <= init_mon_state["epoch"]:
        log.error("Election epoch not bumped up after enabling stretch mode")
        return 1

    if config.get("perform_add_capacity"):
        pool_name = "test_stretch_pool"
        if not rados_obj.create_pool(
                pool_name=pool_name,
                crush_rule=stretch_rule_name,
        ):
            log.error("Failed to create the replicated Pool")
            return 1
        do_rados_put(mon=client_node, pool=pool_name, nobj=1000)

        # Increasing backfill/rebalance threads so that cluster will re-balance it faster after add capacity
        rados_obj.change_recover_threads(config=config, action="set")

        log.info(
            "Performing add Capacity after the deployment of stretch cluster")
        site_a_osds = [osd for osd in sorted_osds[0] if osd not in site_a_osds]
        site_b_osds = [osd for osd in sorted_osds[1] if osd not in site_b_osds]

        if not set_osd_sites(
                node=cephadm,
                osds=site_a_osds,
                site=site1,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1
        if not set_osd_sites(
                node=cephadm,
                osds=site_b_osds,
                site=site2,
                all_osd_details=osd_details,
        ):
            log.error("Failed to move the OSD's into sites")
            return 1

        # Waiting for up to 2.5 hours for the PG's to enter active + Clean state after add capacity
        # Automation for bug : [1] & [2]
        end_time = datetime.datetime.now() + datetime.timedelta(seconds=9000)
        while end_time > datetime.datetime.now():
            flag = True
            status_report = rados_obj.run_ceph_command(cmd="ceph report")

            # Proceeding to check if all PG's are in active + clean
            for entry in status_report["num_pg_by_state"]:
                rec = (
                    "remapped",
                    "backfilling",
                    "degraded",
                    "incomplete",
                    "peering",
                    "recovering",
                    "recovery_wait",
                    "undersized",
                    "backfilling_wait",
                )
                if any(key in rec for key in entry["state"].split("+")):
                    flag = False

            if flag:
                log.info(
                    "The recovery and back-filling of the OSD is completed")
                break
            log.info(
                f"Waiting for active + clean. Active aletrs: {status_report['health']['checks'].keys()},"
                f"PG States : {status_report['num_pg_by_state']}"
                f" checking status again in 2 minutes")
            time.sleep(120)
        rados_obj.change_recover_threads(config=config, action="rm")
        if not flag:
            log.error(
                "The cluster did not reach active + Clean state after add capacity"
            )
            return 1

        with parallel() as p:
            p.spawn(do_rados_get, client_node, pool_name, 10)
            for res in p:
                log.info(res)

    # Checking if the pools have been updated with the new crush rules
    acting_set = rados_obj.get_pg_acting_set(pool_name=pool_name)
    if len(acting_set) != 4:
        log.error(
            f"There are {len(acting_set)} OSD's in PG. OSDs: {acting_set}. Stretch cluster requires 4"
        )
        return 1
    log.info(f"Acting set : {acting_set} Consists of 4 OSD's per PG")
    log.info("Stretch rule with arbiter monitor node set up successfully")
    return 0
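
setup_crush_rule() is assumed to inject a two-site replicated rule along the lines of the one in the Ceph stretch-mode documentation. The sketch below only renders such a rule's text (the rule id is illustrative, and the bucket names come from the site1/site2 config above); it does not compile or set the crush map itself.

# The rendered text would be appended to a decompiled crush map, e.g.:
#   ceph osd getcrushmap -o crush.bin && crushtool -d crush.bin -o crush.txt
#   (append the rule) && crushtool -c crush.txt -o crush.new && ceph osd setcrushmap -i crush.new
STRETCH_RULE_TEMPLATE = """
rule {rule_name} {{
        id 111
        type replicated
        min_size 1
        max_size 10
        step take {site1}
        step chooseleaf firstn 2 type host
        step emit
        step take {site2}
        step chooseleaf firstn 2 type host
        step emit
}}
"""


def render_stretch_rule(rule_name: str, site1: str, site2: str) -> str:
    """Render the CRUSH rule text placing 2 copies in each of the two datacenters."""
    return STRETCH_RULE_TEMPLATE.format(rule_name=rule_name, site1=site1, site2=site2)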