Example #1
def change_config_for_slow_ops(node: CephAdmin, action: str, **kwargs):
    """
    Changes a few config values on the Ceph cluster to intentionally increase the chances of hitting slow_ops on
    the cluster network.

    Actions performed and rationale:
    * paxos_service_trim_min & paxos_service_trim_max set as mentioned in
    bz : https://bugzilla.redhat.com/show_bug.cgi?id=1943357#c0
    * osd_op_complaint_time -> reducing the time threshold within which an OSD should respond to requests
    * osd_max_backfills & osd_recovery_max_active -> Increasing the number of threads for recovery &
    backfill so that the network bandwidth available for client IO operations is reduced

    Args:
        node: Cephadm node where the commands need to be executed
        action: whether to set the config values or to remove them from the cluster
                Values : "set" -> to set the config values
                         "rm" -> to remove the config changes made
        kwargs: Any other optional args that need to be passed
    Raises: TestBedSetupFailure if the values set do not match the requested values

    """
    value_map = {
        "paxos_service_trim_min": kwargs.get("paxos_service_trim_min", 10),
        "paxos_service_trim_max": kwargs.get("paxos_service_trim_max", 100),
        "osd_op_complaint_time": kwargs.get("osd_op_complaint_time", 0.000001),
        "osd_max_backfills": kwargs.get("osd_max_backfills", 8),
        "osd_recovery_max_active": kwargs.get("osd_recovery_max_active", 10),
    }
    cmd_map = {
        "paxos_service_trim_min":
        f"ceph config {action} mon paxos_service_trim_min",
        "paxos_service_trim_max":
        f"ceph config {action} mon paxos_service_trim_max",
        "osd_op_complaint_time":
        f"ceph config {action} osd osd_op_complaint_time",
        "osd_max_backfills":
        f"ceph config {action} osd osd_max_backfills",
        "osd_recovery_max_active":
        f"ceph config {action} osd osd_recovery_max_active",
    }

    # Removing the config values set when action is to remove
    if action == "rm":
        for cmd in cmd_map.keys():
            node.shell([cmd_map[cmd]])
        return

    # Adding the config values
    for val in cmd_map.keys():
        cmd = f"{cmd_map[val]} {value_map[val]}"
        node.shell([cmd])

    # Verifying the values set in the config
    config_dump = run_ceph_command(node, cmd="ceph config dump")
    for val in cmd_map.keys():
        for conf in config_dump:
            if conf["name"] == val:
                if float(conf["value"]) != float(value_map[val]):
                    error = f"Values do not match for config {conf['name']}"
                    raise TestBedSetupFailure(error)
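A minimal usage sketch for the helper above, assuming `cephadm` is an already initialized CephAdmin instance and `run_workload` is a hypothetical stand-in for the test's IO generator:

# Tighten the thresholds to provoke slow_ops, run the workload, then revert the overrides
change_config_for_slow_ops(node=cephadm, action="set", osd_op_complaint_time=0.1)
try:
    run_workload()  # hypothetical: drives client IO against the cluster
finally:
    change_config_for_slow_ops(node=cephadm, action="rm")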
Example #2
def unmute_health_alert(alert: str, node: CephAdmin) -> bool:
    """
    Un-Mutes the health alert on the cluster
    Args:
        alert: Name of the alert to be un-muted
        node: node on which command should be executed

    Returns: True -> pass, False -> failure

    """
    all_alerts = get_alerts(node)
    if alert not in all_alerts["muted_alerts"] + all_alerts["active_alerts"]:
        log.info(
            f"The alert: {alert} is not present on the cluster, nothing to un-mute")
        return True
    if alert in all_alerts["active_alerts"]:
        log.info(f"the alert: {alert} is already un-muted")
        return True

    # Un-Muting the given alert
    cmd = f"ceph health unmute {alert}"
    node.shell([cmd])
    # Sleeping for 2 sec for the alert to be logged
    time.sleep(2)
    all_alerts = get_alerts(node)
    log.info(
        f"Un-Muted the alert : {alert}. All the Un-muted alerts : {all_alerts['active_alerts']}"
    )
    return alert in all_alerts["active_alerts"]
Example #3
def set_osd_sites(
    node: CephAdmin, osds: list, site: str, all_osd_details: dict
) -> bool:
    """
    Collects all the details about the OSDs present on the cluster and distributes them among the two sites
    Args:
        node: Cephadm node where the commands need to be executed
        osds: list of OSD's to be added to the given site
        site: the name of the site.
        all_osd_details: dictionary of OSD's containing the details
            eg : {'2': {'weight': 0.01459, 'state': 'up', 'name': 'osd.2'},
                '7': {'weight': 0.01459, 'state': 'up', 'name': 'osd.7'}}

    Returns: True -> pass, False -> fail
    """
    # adding the identified OSDs into the given site
    # Note: only 2 data sites are supported with stretch clusters at present,
    # so this method is expected to be called once per site with a valid site name
    try:
        for osd in osds:
            cmd = f"ceph osd crush move {all_osd_details[osd]['name']} host=host-{site}-{osd} datacenter={site}"
            node.shell([cmd])
            # sleeping for 20 seconds for osd to be moved
            time.sleep(20)
    except Exception:
        log.error("Failed to move the OSD's into Site A and Site B")
        return False

    cmd = "ceph osd tree"
    log.info(node.shell([cmd]))
    return True
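A sketch of how this could pair with get_osd_details (Example #14) to split all OSDs across two sites; `cephadm` and the site names are assumptions:

all_osds = get_osd_details(node=cephadm)
osd_ids = list(all_osds.keys())
half = len(osd_ids) // 2
# Move the first half of the OSDs into DC1 and the rest into DC2
if not set_osd_sites(node=cephadm, osds=osd_ids[:half], site="DC1", all_osd_details=all_osds):
    log.error("Failed to populate site DC1")
if not set_osd_sites(node=cephadm, osds=osd_ids[half:], site="DC2", all_osd_details=all_osds):
    log.error("Failed to populate site DC2")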
Example #4
def get_pg_acting_set(node: CephAdmin, pool_name: str) -> list:
    """
    Fetches the PG details of the given pool and returns the acting set of OSDs from a sample PG of the pool
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: name of the pool whose one of the acting OSD set is needed

    Returns: list of OSDs in the acting set
    eg : [3, 15, 20]

    """
    # Collecting details about the cluster
    cmd = "ceph osd dump --format=json"
    out, err = node.shell([cmd])
    res = json.loads(out)
    pool_id = None
    for val in res["pools"]:
        if val["pool_name"] == pool_name:
            pool_id = val["pool"]
            break
    if pool_id is None:
        log.error(f"Pool {pool_name} does not exist in the OSD dump")
        return []
    # Collecting the details of the 1st PG in the pool <ID>.0
    pg_num = f"{pool_id}.0"
    cmd = f"ceph pg map {pg_num} --format=json"
    out, err = node.shell([cmd])
    res = json.loads(out)
    return res["up"]
Example #5
def set_mon_sites(node: CephAdmin, tiebreaker_node, site1: str, site2: str) -> bool:
    """
    Adds the mon daemons into the two sites with arbiter node at site 3 as a tie breaker
    Args:
        node: Cephadm node where the commands need to be executed
        tiebreaker_node: name of the monitor to be added as the tie breaker (site 3)
        site1: Name of the 1st site
        site2: Name of the 2nd site
    Returns: True -> pass, False -> fail

    """
    # Collecting the mon details
    mon_state = get_mon_details(node=node)
    monitors = list(mon_state["monitors"])
    monitors.remove(tiebreaker_node.hostname)
    commands = [
        f"/bin/ceph mon set_location {tiebreaker_node.hostname} datacenter=arbiter",
        f"/bin/ceph mon set_location {monitors[0]} datacenter={site1}",
        f"/bin/ceph mon set_location {monitors[1]} datacenter={site1}",
        f"/bin/ceph mon set_location {monitors[2]} datacenter={site2}",
        f"/bin/ceph mon set_location {monitors[3]} datacenter={site2}",
    ]
    for cmd in commands:
        try:
            node.shell([cmd])
        except Exception as err:
            log.error(err)
            return False
        # Waiting till the mon restarts with the new site info and rejoins the mon quorum
        if not wait_for_alert(node=node, alert="MON_DOWN", duration=180):
            log.error("mon down even after waiting 180 seconds after adding it to the site")
            return False
    log.info("Added all the mon nodes into respective sites")
    return True
Example #6
def detete_pool(node: CephAdmin, pool: str) -> bool:
    """
    Deletes the given pool from the cluster
    Args:
        node: Cephadm node where the commands need to be executed
        pool: name of the pool to be deleted

    Returns: True -> pass, False -> fail
    """
    # Checking if config is set to allow pool deletion
    config_dump = run_ceph_command(node, cmd="ceph config dump")
    if "mon_allow_pool_delete" not in [conf["name"] for conf in config_dump]:
        cmd = "ceph config set mon mon_allow_pool_delete true"
        node.shell([cmd])

    existing_pools = run_ceph_command(node, cmd="ceph df")
    if pool not in [ele["name"] for ele in existing_pools["pools"]]:
        log.error(f"Pool:{pool} does not exist on cluster, cannot delete")
        return True

    cmd = f"ceph osd pool delete {pool} {pool} --yes-i-really-really-mean-it"
    node.shell([cmd])

    existing_pools = run_ceph_command(node, cmd="ceph df")
    if pool not in [ele["name"] for ele in existing_pools["pools"]]:
        log.info(f"Pool:{pool} deleted Successfully")
        return True
    log.error(f"Pool:{pool} could not be deleted on cluster")
    return False
Example #7
def reweight_crush_items(node: CephAdmin, **kwargs) -> bool:
    """
    Performs Re-weight of various CRUSH items, based on key-value pairs sent
    Args:
        node: Cephadm node where the commands need to be executed
        **kwargs: Arguments for the commands

    Returns: True -> pass, False -> fail

    """
    """Not returning false as I am not verifying the result of this.
    ( PG would be redistributed based whether we increase or decrease the weight. )
    PG movement is slow and will take time based on no of objects.
    Need to implement this check. So for now assuming it worked and returning true."""
    if kwargs.get("name"):
        name = kwargs["name"]
        weight = kwargs["weight"]
        cmd = f"ceph osd crush reweight {name} {weight}"
        node.shell([cmd])
        return True

    # if no params are provided, doing the re-balance by utilization
    # TODO: implement checks to verify the behaviour of the re-weight commands
    cmd = r"ceph osd reweight-by-utilization"
    node.shell([cmd])
    return True
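A hedged sketch of both call styles, assuming `cephadm` is a connected CephAdmin instance and osd.2 exists on the cluster:

# Re-weight a single CRUSH item by name ...
reweight_crush_items(node=cephadm, name="osd.2", weight=0.05)
# ... or, with no kwargs, fall back to reweight-by-utilization
reweight_crush_items(node=cephadm)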
Example #8
def generate_health_alert(alert: str, node: CephAdmin, **kwargs) -> bool:
    """
    Method to generate various health alerts
    Args:
        alert: name of the alert to be generated
        node: name of the installer node
        clear: Bool value which specifies if the given alert should be cleared
        kwargs: any other params that need to be sent for a particular alert

    Returns: True -> pass, False -> failure

    """
    clear = kwargs.get("clear")
    if alert == "OSDMAP_FLAGS":
        flag = kwargs.get("flag")
        if not flag:
            # kwargs.get() never raises KeyError, so check for a missing flag explicitly
            log.error(f"Flag not provided to generate health alert : {alert}")
            return False
        cmd = f"ceph osd set {flag}"
        if clear:
            cmd = f"ceph osd unset {flag}"
        try:
            node.shell([cmd])
            log.debug(f"{flag} set")
        except Exception:
            log.error(f"Failed to set the osd flag {flag}")
            log.error(traceback.format_exc())
            return False
        # Sleeping for 5 seconds for the error to be logged by the cluster
        time.sleep(5)
        return True

    if alert == "MON_DISK_BIG":
        cmd = "ceph config set global mon_data_size_warn 2500000"
        if clear:
            cmd = "ceph config set global mon_data_size_warn 16106127360"
        try:
            node.shell([cmd])
            log.debug("changed the mon data warn size param")
        except Exception:
            log.error("Failed to change the mon data warn size")
            log.error(traceback.format_exc())
            return False
        # Sleeping for 5 seconds for the error to be logged by the cluster
        time.sleep(5)
        return True

    log.error(f"method not implemented to generate the alert : {alert}")
    return False
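A usage sketch combining this with get_alerts (Example #28); `cephadm` and the chosen flag are assumptions:

# Raise OSDMAP_FLAGS by setting the noscrub flag, verify it, then clear it again
if generate_health_alert(alert="OSDMAP_FLAGS", node=cephadm, flag="noscrub"):
    assert "OSDMAP_FLAGS" in get_alerts(cephadm)["active_alerts"]
    generate_health_alert(alert="OSDMAP_FLAGS", node=cephadm, flag="noscrub", clear=True)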
Example #9
def set_cluster_configuration_checks(node: CephAdmin, **kwargs) -> bool:
    """
    Sets up Cephadm to periodically scan each of the hosts in the cluster, and to understand the state of the OS,
     disks, NICs etc
     ref doc : https://docs.ceph.com/en/latest/cephadm/operations/#cluster-configuration-checks
    Args:
        node: Cephadm node where the commands need to be executed
        kwargs: Any other param that needs to be passed

    Returns: True -> pass, False -> fail

    """
    cmd = "ceph config set mgr mgr/cephadm/config_checks_enabled true"
    node.shell([cmd])

    # Checking if the checks are enabled on cluster
    cmd = "ceph cephadm config-check status"
    out, err = node.shell([cmd])
    if not re.search("Enabled", out):
        log.error("Cluster config checks no t enabled")
        return False

    if kwargs.get("disable_check_list"):
        for check in kwargs.get("disable_check_list"):
            cmd = f"ceph cephadm config-check disable {check}"
            node.shell([cmd])

    if kwargs.get("enable_check_list"):
        for check in kwargs.get("enable_check_list"):
            cmd = f"ceph cephadm config-check enable {check}"
            node.shell([cmd])

    cmd = "ceph cephadm config-check ls"
    log.info(node.shell([cmd]))
    return True
Example #10
def run(ceph_cluster, **kw):
    """
    Cephadm Bootstrap

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object
        kw: test data

    - Bootstraps the cluster with the default or a custom image and
      returns after cephadm bootstrap. To use the default image, set 'registry'.

        Example:
            config:
                command: bootstrap
                base_cmd_args:
                    verbose: true
                args:
                    custom_image: true | false
                    mon-ip: <node_name>
                    mgr-id: <mgr_id>
                    fsid: <id>
    """
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build
    config["overrides"] = kw.get("test_data", {}).get("custom-config")

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    service = config.pop("service", "")
    log.info("Executing %s %s" % (service, command))

    instance = CephAdmin(cluster=ceph_cluster, **config)
    if "shell" in command:
        instance.shell(args=config["args"])
        return 0
    try:
        method = fetch_method(instance, command)
        out, err = method(config)

        # Verification of arguments
        # bootstrap response through stdout & stderr are combined here
        # currently console response coming through stderr.
        args = config.get("args", {})
        verify_bootstrap(instance, args, out + err)
    finally:
        # Get cluster state
        get_cluster_state(instance)
    return 0
Example #11
def get_slow_ops_data(node: CephAdmin, installer, action) -> bool:
    """
    Checks the operations running on the cluster
    Args:
        node: Cephadm node where the commands need to be executed
        installer: Name of the node where the cephadm shell / Mon daemon are collocated
        action: Specifies whether to check for current operations or historic operations
                Values : "current" -> Checks for operations that are on going in cluster.
                         "historic" -> Operations that are completed and marked done by Monitor
    Returns: False if there are any running ops on the cluster, else True

    """

    # checking if any ops are currently present
    if action == "current":
        cmd = f" ceph daemon mon.{installer.hostname} ops -f json"
        out, err = node.shell([cmd])
        status = json.loads(out)
        log.info(status)
        if status["num_ops"] >= 1:
            log.error(
                f"There are operations on going on the cluster. Number : {status['num_ops']}"
            )
            for op in status["ops"]:
                log.error(
                    f"{op['description']} generated : {(op['type_data']['info'])}"
                )
            return False

    # Checking all the ops reports historically
    elif action == "historic":
        cmd = f"ceph daemon mon.{installer.hostname} dump_historic_ops -f json"
        out, err = node.shell([cmd])
        details = json.loads(out)
        size = details["size"]
        if size < 1:
            log.error("No slow operations generated on the cluster")
            return True

        total_dur = details["duration"]
        ops = details["ops"]
        log.info(
            f"No of slow_ops recorded : {size} for a total duration of {total_dur}\n"
            f"Slow ops generated for below items : \n")
        for op in ops:
            log.info(
                f"{op['description']} generated : {(op['type_data']['info'])}")

    return True
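A minimal sketch, assuming `cephadm` and `installer_node` (the host where the mon daemon is collocated) are provided by the test framework:

# If no ops are currently in flight, dump whatever was recorded historically
if get_slow_ops_data(node=cephadm, installer=installer_node, action="current"):
    get_slow_ops_data(node=cephadm, installer=installer_node, action="historic")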
Example #12
def run(ceph_cluster, **kw):
    """Ceph-admin module to manage ceph-dashboard service.

    check ceph.ceph_admin.dashboard for test config.

    Args:
        ceph_cluster (ceph.ceph.Ceph): Ceph cluster object.
        kw: keyword arguments from test data.

    Returns:
        value 0 on success.

    """
    log.info("Running Ceph-admin Dashboard test")
    config = kw.get("config")

    build = config.get("build", config.get("rhbuild"))
    ceph_cluster.rhcs_version = build

    # Manage Ceph using ceph-admin orchestration
    command = config.pop("command")
    log.info("Executing dashboard %s operation" % command)
    instance = CephAdmin(cluster=ceph_cluster, **config)

    try:
        method = fetch_method(dashboard, command)
        method(instance, config.get("args"))
    finally:
        # Get cluster state
        get_cluster_state(instance)
    return 0
Example #13
def run(ceph_cluster, **kw):
    """
    Prepares the cluster & runs rados Customer Scenarios.
    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
        kw: Args that need to be passed to the test for initialization
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw.get("config")
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    if config.get("mondb_trim_config"):
        db_config = config.get("mondb_trim_config")
        try:
            verify_mon_db_trim(ceph_cluster=ceph_cluster,
                               node=cephadm,
                               **db_config)
            log.info("Mon DB is getting trimmed regularly")
        except (TestCaseFailureException, TestBedSetupFailure):
            log.error("Failed to verify mon db trimming")
            return 1

    log.info("Completed running the customer Scenario(s)")
    return 0
Example #14
def get_osd_details(node: CephAdmin) -> dict:
    """
    Collects details such as the weight and state of all OSDs on the cluster
    Args:
        node: Cephadm node where the commands need to be executed
    Returns: dict with the OSD details (empty dict if no OSDs are found)
            dict eg : {'2': {'weight': 0.01459, 'state': 'up', 'name': 'osd.2'},
                        '7': {'weight': 0.01459, 'state': 'up', 'name': 'osd.7'}}
    """
    # Collecting all the OSD details
    cmd = "ceph osd tree"
    out, err = node.shell([cmd])
    log.info(out)
    regex = r"(\d{1,})\s+[\w]*\s+([.\d]*)\s+(osd.\d{1,})\s+(\w*)"
    osd_dict = {}
    if re.search(regex, out):
        osds = re.findall(regex, out)
        for osd in osds:
            osd_dict[osd[0]] = {
                "weight": float(osd[1]),
                "state": osd[3],
                "name": osd[2],
            }
    else:
        log.error("No osd's were found on the system")
    return osd_dict
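To see what the parsing above extracts, here is a standalone sketch that runs the same regex against a fabricated two-OSD `ceph osd tree` snippet:

import re

sample_out = (
    "ID  CLASS  WEIGHT   TYPE NAME   STATUS  REWEIGHT  PRI-AFF\n"
    " 2    hdd  0.01459      osd.2       up   1.00000  1.00000\n"
    " 7    hdd  0.01459      osd.7       up   1.00000  1.00000\n"
)
regex = r"(\d{1,})\s+[\w]*\s+([.\d]*)\s+(osd.\d{1,})\s+(\w*)"
print(re.findall(regex, sample_out))
# [('2', '0.01459', 'osd.2', 'up'), ('7', '0.01459', 'osd.7', 'up')]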
Example #15
def configure_pg_autoscaler(node: CephAdmin, **kwargs) -> bool:
    """
    Configures the pg_autoscaler as a global parameter and on individual pools
    Args:
        node: Cephadm node where the commands need to be executed
        **kwargs: Any other param that needs to be set

    Returns: True -> pass, False -> fail
    """

    if kwargs.get("enable"):
        mgr_modules = run_ceph_command(node, cmd="ceph mgr module ls")
        if "pg_autoscaler" not in mgr_modules["enabled_modules"]:
            cmd = "ceph mgr module enable pg_autoscaler"
            node.shell([cmd])

    if kwargs.get("pool_name"):
        pool_name = kwargs.get("pool_name")
        pg_scale_value = kwargs.get("pg_autoscale_value", "on")
        cmd = f"ceph osd pool set {pool_name} pg_autoscale_mode {pg_scale_value}"
        node.shell([cmd])

    if kwargs.get("default_mode"):
        default_mode = kwargs.get("default_mode")
        cmd = (
            f"ceph config set global osd_pool_default_pg_autoscale_mode {default_mode}"
        )
        node.shell([cmd])

    cmd = "ceph osd pool autoscale-status -f json"
    log.info(node.shell([cmd]))
    return True
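A hedged usage sketch; `cephadm`, the pool name, and the chosen modes are assumptions:

# Enable the autoscaler module, set one pool to "warn", and default new pools to "on"
configure_pg_autoscaler(
    node=cephadm,
    enable=True,
    pool_name="test_pool",
    pg_autoscale_value="warn",
    default_mode="on",
)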
Example #16
def move_crush_item(node: CephAdmin, crush_obj: str, name: str,
                    value: str) -> None:
    """
    Moves the specified crush object to the given location, provided by name/value
    Args:
        node: node where the commands need to be executed
        crush_obj: Name of the CRUSH object to be moved
        name: type of the destination CRUSH bucket (eg : datacenter)
        value: name of the destination bucket within that type

    Returns: None
    """
    cmd = f"ceph osd crush move {crush_obj} {name}={value}"
    try:
        node.shell([cmd])
        time.sleep(2)
    except Exception as err:
        log.error(err)
Example #17
def create_erasure_pool(node: CephAdmin, name: str, **kwargs) -> bool:
    """
    Creates an erasure code profile and then creates a pool with the same
    Args:
        node: Cephadm node where the commands need to be executed
        name: Name of the profile to create
        **kwargs: Any other param that needs to be set in the EC profile
    Returns: True -> pass, False -> fail

    """
    failure_domain = kwargs.get("crush-failure-domain", "osd")
    k = kwargs.get("k", 3)
    m = kwargs.get("m", 2)
    plugin = kwargs.get("plugin", "jerasure")
    pool_name = kwargs.get("pool_name")
    profile_name = f"ecprofile_{name}"

    # Creating an erasure coded profile with the options provided
    cmd = (
        f"ceph osd erasure-code-profile set {profile_name}"
        f" crush-failure-domain={failure_domain} k={k} m={m} plugin={plugin}")
    try:
        node.shell([cmd])
    except Exception as err:
        log.error(f"Failed to create ec profile : {profile_name}")
        log.error(err)
        return False

    cmd = f"ceph osd erasure-code-profile get {profile_name}"
    log.info(node.shell([cmd]))
    # Creating the pool with the profile created
    if not create_pool(
            node=node,
            ec_profile_name=profile_name,
            disable_pg_autoscale=True,
            **kwargs,
    ):
        log.error(f"Failed to create Pool {pool_name}")
        return False
    log.info(f"Created the ec profile : {profile_name} and pool : {pool_name}")
    return True
Example #18
def mute_health_alert(alert: str,
                      node: CephAdmin,
                      duration: str = None,
                      sticky: bool = False) -> bool:
    """
    Mutes the health alert generated on the cluster
    Args:
        alert: Name of the alert to be muted
        node: node on which command should be executed
        duration: duration for which the alert should be muted.
                Allowed Values: None -> mutes the specified alert indefinitely until the same alert is raised again
                                5m, 1h -> mutes the specified alert for specified duration
        sticky: makes use of the "--sticky" param to mute specified alert indefinitely

    Returns: True -> pass, False -> failure

    """
    all_alerts = get_alerts(node)
    if alert not in all_alerts["active_alerts"] + all_alerts["muted_alerts"]:
        log.info(
            f"The alert: {alert} is not present on the cluster, nothing to mute")
        return True
    if alert in all_alerts["muted_alerts"]:
        log.info(f"the alert: {alert} is already muted")
        return True

    # Muting the given alert along with specified duration
    cmd = f"ceph health mute {alert}"
    if duration:
        cmd += f" {duration}"
    if sticky:
        cmd += " --sticky"
    node.shell([cmd])

    # Sleeping for 5 sec for the alert to be logged
    time.sleep(5)
    all_alerts = get_alerts(node)
    log.info(
        f"Muted the alert : {alert}. All the muted alerts : {all_alerts['muted_alerts']}"
    )
    return alert in all_alerts["muted_alerts"]
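A short sketch pairing this with unmute_health_alert (Example #2); `cephadm` and the alert name are assumptions:

# Mute MON_DISK_BIG for an hour, do the disruptive work, then un-mute it
if mute_health_alert(alert="MON_DISK_BIG", node=cephadm, duration="1h"):
    # ... perform the checks that would otherwise flood the health log ...
    unmute_health_alert(alert="MON_DISK_BIG", node=cephadm)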
Example #19
def run_rados_bench_write(node: CephAdmin, pool_name: str, **kwargs) -> bool:
    """
    Method to trigger Write operations via the Rados Bench tool
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: pool on which the operation will be performed
        kwargs: Any other param that needs to be passed

    Returns: True -> pass, False -> fail

    """
    duration = kwargs.get("rados_write_duration", 200)
    byte_size = kwargs.get("byte_size", 4096)
    cmd = f"sudo rados --no-log-to-stderr -b {byte_size} -p {pool_name} bench {duration} write --no-cleanup"
    try:
        node.shell([cmd])
        return True
    except Exception as err:
        log.error(f"Error running rados bench write on pool : {pool_name}")
        log.error(err)
        return False
Example #20
def set_logging_to_file(node: CephAdmin) -> bool:
    """
    Enables the cluster logging into files at /var/log/ceph and checks file permissions
    Args:
        node: Cephadm node where the commands need to be executed

    Returns: True -> pass, False -> fail
    """
    try:
        cmd = "ceph config set global log_to_file true"
        node.shell([cmd])
        cmd = "ceph config set global mon_cluster_log_to_file true"
        node.shell([cmd])
    except Exception:
        log.error("Error while enabling config to log into file")
        return False

    # Sleeping for 10 seconds for files to be generated
    time.sleep(10)

    cmd = "ls -ll /var/log/ceph"
    out, err = node.shell([cmd])
    log.info(out)
    regex = r"\s*([-rwx]*)\.\s+\d\s+([\w]*)\s+([\w]*)\s+[\w\s:]*(ceph[\w.]*log)"
    perm = "-rw-------"
    user = "******"
    files = ["ceph.log", "ceph.audit.log"]
    if re.search(regex, out):
        match = re.findall(regex, out)
        for val in match:
            if not (val[0] == perm and val[1] == user and val[2] == user):
                log.error(
                    f"file permissions are not correct for file : {val[3]}")
                return False
            if val[3] in files:
                files.remove(val[3])
    if files:
        log.error(f"Did not find the log files : {files}")
        return False
    return True
Example #21
def change_recover_threads(node: CephAdmin, config: dict, action: str):
    """
    increases or decreases the recovery threads based on the action sent
    Args:
        node: Cephadm node where the commands need to be executed
        config: Config from the suite file for the run
        action: whether to set or to remove the backfill / recovery thread configs
            Values : "set" -> set the threads to specified value
                     "rm" -> remove the config changes made

    """

    cfg_map = {
        "osd_max_backfills": f"ceph config {action} osd osd_max_backfills",
        "osd_recovery_max_active": f"ceph config {action} osd osd_recovery_max_active",
    }
    for cmd in cfg_map:
        if action == "set":
            command = f"{cfg_map[cmd]} {config.get(cmd, 8)}"
        else:
            command = cfg_map[cmd]
        node.shell([command])
Example #22
def run_rados_bench_read(node: CephAdmin, pool_name: str, **kwargs) -> bool:
    """
    Method to trigger Read operations via the Rados Bench tool
    Args:
        node: Cephadm node where the commands need to be executed
        pool_name: pool on which the operation will be performed
        kwargs: Any other param that needs to be passed

    Returns: True -> pass, False -> fail

    """
    duration = kwargs.get("rados_read_duration", 80)
    try:
        cmd = f"rados --no-log-to-stderr -p {pool_name} bench {duration} seq"
        node.shell([cmd])
        cmd = f"rados --no-log-to-stderr -p {pool_name} bench {duration} rand"
        node.shell([cmd])
        return True
    except Exception as err:
        log.error(f"Error running rados bench write on pool : {pool_name}")
        log.error(err)
        return False
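A combined write/read sketch using the two bench helpers (Examples #19 and #22); `cephadm` and the pool name are assumptions:

# Write for 100 seconds without cleanup so the read phase has objects to fetch
if run_rados_bench_write(node=cephadm, pool_name="test_pool",
                         rados_write_duration=100, byte_size=4096):
    run_rados_bench_read(node=cephadm, pool_name="test_pool",
                         rados_read_duration=50)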
Example #23
def create_pool(node: CephAdmin,
                pool_name: str,
                pg_num: int = 64,
                **kwargs) -> bool:
    """
    Create a pool named from the pool_name parameter.
     Args:
        node: Cephadm node where the commands need to be executed
        pool_name: name of the pool being created.
        pg_num: initial number of pgs.
        kwargs: Any other args that need to be passed

     Returns: True -> pass, False -> fail
    """

    log.info(f"creating pool_name {pool_name}")
    cmd = f"ceph osd pool create {pool_name} {pg_num} {pg_num}"
    if kwargs.get("ec_profile_name"):
        cmd = f"{cmd} erasure {kwargs['ec_profile_name']}"
    try:
        node.shell([cmd])
    except Exception as err:
        log.error(f"Error creating pool : {pool_name}")
        log.error(err)
        return False

    # Enabling rados application on the pool
    enable_app_cmd = f"sudo ceph osd pool application enable {pool_name} {kwargs.get('app_name','rados')}"
    node.shell([enable_app_cmd])

    cmd_map = {
        "min_size":
        f" ceph osd pool set {pool_name} min_size {kwargs.get('min_size')}",
        "size":
        f" ceph osd pool set {pool_name} size {kwargs.get('min_size')}",
        "erasure_code_use_overwrites":
        f"ceph osd pool set {pool_name} "
        f"allow_ec_overwrites {kwargs.get('erasure_code_use_overwrites')}",
        "disable_pg_autoscale":
        f"ceph osd pool set {pool_name} pg_autoscale_mode off",
        "crush_rule":
        f"sudo ceph osd pool set {pool_name} crush_rule {kwargs.get('crush_rule')}",
        "pool_quota":
        f"ceph osd pool set-quota {pool_name} {kwargs.get('pool_quota')}",
    }
    for key in kwargs:
        if cmd_map.get(key):
            try:
                node.shell([cmd_map[key]])
            except Exception as err:
                log.error(
                    f"Error setting the property : {key} for pool : {pool_name}"
                )
                log.error(err)
                return False

    log.info(f"Created pool {pool_name} successfully")
    return True
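A hedged sketch of a pool created with several of the optional kwargs; all names and values are assumptions:

create_pool(
    node=cephadm,
    pool_name="test_pool",
    pg_num=32,
    app_name="rados",
    disable_pg_autoscale=True,
    pool_quota="max_objects 10000",  # passed through to `ceph osd pool set-quota`
)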
Example #24
def run(ceph_cluster, **kw):
    """
    Verifies the config change history in the monitor configuration database
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    # getting the last config change, to which we will roll back later
    init_config = mon_obj.get_ceph_log(count=1)[0]
    log.info("Config at the beginning of test. \n"
             f"Version: {init_config['version']}"
             f"Changes made: {init_config['changes']}")

    log.info(
        "Setting new changes and verifying if the changes are reflected in the log"
    )
    if not mon_obj.set_config(section="osd", name="osd_max_scrubs", value="8"):
        log.error("Error setting config ")
        return 1

    # Checking the versions and changes made.
    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info("Config changes made for test. \n"
             f"Version: {test_config['version']}"
             f"Changes made: {test_config['changes']}")

    if not test_config["version"] > init_config["version"]:
        log.error(f"The log is not updated with new config changes."
                  f"Version: {test_config['version']}")
        return 1
    try:
        name = test_config["changes"][0].get("name")
        value = str(test_config["changes"][0].get("new_value"))
        if not (name == "osd/osd_max_scrubs" and value == "8"):
            log.error(f"The log is not updated with new config changes."
                      f"Changes made: {test_config['changes']}")
            return 1
    except Exception:
        log.error(
            "The log collected does not contain the value and changes made")
        return 1

    log.info("The ceph config log is successfully updated after changes ")
    return 0
Example #25
def operator(test_config, step_config, **kw):
    """
    Using the provided test config file, this method triggers SDK calls of RBD
    of that specific scenario

    Arguments:
        test_config: containing the key/value pairs passed from the test-suite
        step_config: arguments required for a specific operation
        kw: test data

    Returns:
        0 on success or 1 for failures
    """
    if step_config.get("method") == "shell":
        cephadm = CephAdmin(kw["ceph_cluster_dict"], test_config)
        cephadm.shell(args=step_config["args"])
    else:
        # maintain dictionary to map to classes based on service
        # instantiate class
        instance = CLASS_MAP[step_config["class"]](nodes=kw["ceph_nodes"])
        method = getattr(instance, step_config["method"])
        log.info(method)
        method(step_config["args"])
    return 0
Example #26
def run_ceph_command(node: CephAdmin, cmd: str) -> dict:
    """
    Runs the given ceph command with the -f json flag appended and returns the
    parsed output as a dictionary
    Args:
        node: Cephadm node where the commands need to be executed
        cmd: Command that needs to be run

    Returns: dictionary of the output
    """

    cmd = f"{cmd} -f json"
    out, err = node.shell([cmd])
    status = json.loads(out)
    return status
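A minimal sketch, assuming `cephadm` is a connected CephAdmin instance and that the json OSD dump exposes the OSD list under the "osds" key:

osd_dump = run_ceph_command(node=cephadm, cmd="ceph osd dump")
log.info(f"OSDs in the osdmap : {len(osd_dump['osds'])}")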
Example #27
def enable_balancer(node: CephAdmin, **kwargs) -> bool:
    """
    Enables the balancer module with the given mode
    Args:
        node: Cephadm node where the commands need to be executed
        kwargs: Any other args that need to be passed
    Returns: True -> pass, False -> fail
    """
    # balancer is always enabled module, There is no need to enable the module via mgr.
    # To verify the same run ` ceph mgr module ls `, which would list all modules.
    # if found to be disabled, can be enabled by ` ceph mgr module enable balancer `
    mgr_modules = run_ceph_command(node, cmd="ceph mgr module ls")
    if not ("balancer" in mgr_modules["always_on_modules"]
            or "balancer" in mgr_modules["enabled_modules"]):
        log.error(f"Balancer is not enabled. Enabled modules on cluster are:"
                  f"{mgr_modules['always_on_modules']} & "
                  f"{mgr_modules['enabled_modules']}")

    # Setting the mode for the balancer. Available modes: none|crush-compat|upmap
    balancer_mode = kwargs.get("balancer_mode", "upmap")
    cmd = f"ceph balancer mode {balancer_mode}"
    node.shell([cmd])
    # Turning on the balancer on the system
    cmd = "ceph balancer on"
    node.shell([cmd])

    # Sleeping for 10 seconds after enabling balancer and then collecting the evaluation status
    time.sleep(10)
    cmd = "ceph balancer status"
    try:
        op, err = node.shell([cmd])
        log.info(op)
        return True
    except Exception:
        log.error("Exception hit while checking balancer status")
        return False
Example #28
def get_alerts(node: CephAdmin) -> dict:
    """
    Fetches all the current health alerts codes that are generated on the ceph cluster
    Args:
            node: node on which command should be executed

    Returns: dict of the alerts present
            alert dictionary :
            { "active_alerts" : ['CEPHADM_REFRESH_FAILED', 'OSDMAP_FLAGS'],
             "muted_alerts" : ['MON_DISK_BIG'] }
    """
    cmd = "ceph health detail"
    all_alerts = {}
    out, err = node.shell([cmd])
    regex = r"(\(MUTED[\w\s,-]*\))?\s*\[\w{3}\]\s([\w_]*):"
    alerts = re.findall(regex, out)
    all_alerts["active_alerts"] = [alert[1] for alert in alerts if not alert[0]]
    all_alerts["muted_alerts"] = [alert[1] for alert in alerts if alert[0]]
    return all_alerts
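To see what the regex above captures, here is a standalone sketch run against a fabricated `ceph health detail` snippet:

import re

sample_health = (
    "HEALTH_WARN noscrub flag(s) set\n"
    "[WRN] OSDMAP_FLAGS: noscrub flag(s) set\n"
    "(MUTED, STICKY) [WRN] MON_DISK_BIG: mons are using a lot of disk space\n"
)
regex = r"(\(MUTED[\w\s,-]*\))?\s*\[\w{3}\]\s([\w_]*):"
alerts = re.findall(regex, sample_health)
print([a[1] for a in alerts if not a[0]])  # ['OSDMAP_FLAGS'] -> active
print([a[1] for a in alerts if a[0]])      # ['MON_DISK_BIG'] -> muted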
Example #29
def run(ceph_cluster, **kw):
    """
    Verifies reverting a config change in the monitor configuration database, using a version taken from the config logs
    Returns:
        1 -> Fail, 0 -> Pass
    """
    log.info(run.__doc__)
    config = kw["config"]
    cephadm = CephAdmin(cluster=ceph_cluster, **config)
    rados_obj = RadosOrchestrator(node=cephadm)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)

    init_config = mon_obj.get_ceph_log(count=1)[0]
    if not mon_obj.set_config(
            section="mon", name="mon_max_log_epochs", value="1000"):
        log.error("Error setting config ")
        return 1
    log.info(
        f"Proceeding with reverting the last config change, selecting version: {init_config['version']}"
    )
    if not mon_obj.ceph_config_reset(version=init_config["version"]):
        log.error(
            f"Could not revert to the selected version : {init_config['version']}"
        )
        return 1

    log.info(
        "Reverted to selected version. Checking if the config value is removed"
    )
    if mon_obj.verify_set_config(section="mon",
                                 name="mon_max_log_epochs",
                                 value="1000"):
        log.error("Config is still set after the reset")
        return 1

    test_config = mon_obj.get_ceph_log(count=1)[0]
    log.info(
        f"reverted successfully to previous versions. config log : {test_config}"
    )

    log.info("The ceph config log is successfully updated after changes ")
    return 0
Example #30
def get_mon_details(node: CephAdmin) -> dict:
    """
    Collects the mon map details like election epoch, election strategy, active mons and fsid
    Args:
        node: Cephadm node where the commands need to be executed
    Returns: dict with the mon map details (the regex captures hostnames without the "mon." prefix)
            dict eg : { 'epoch': 6, 'fsid': '00206990-70fb-11eb-a425-f0d4e2ebeb54', 'election_strategy': 1,
            'monitors': ['dell-r640-016.dsal.lab.eng.tlv2.redhat.com', 'dell-r640-019'] }
    """
    cmd = "ceph mon dump"
    mon_details = {}
    out, err = node.shell([cmd])
    log.info(out)
    regex_details = (
        r"\s*epoch\s+(\d{1,})\s+fsid\s+([\w-]*)[\w\W]*election_strategy:\s+(\d{1})"
    )
    regex_mon = r"\d{1}\:\s+[\[\]\w\:\./,]*\s+mon\.([\w\-_\.]*)"
    details = re.search(regex_details, out).groups()
    mon_details["epoch"] = int(details[0])
    mon_details["fsid"] = details[1]
    mon_details["election_strategy"] = int(details[2])
    mon_details["monitors"] = re.findall(regex_mon, out)
    return mon_details
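Closing with a quick sketch of the helper above; `cephadm` is assumed to be a connected CephAdmin instance:

mon_details = get_mon_details(node=cephadm)
log.info(f"fsid : {mon_details['fsid']}, election epoch : {mon_details['epoch']}, "
         f"mons : {mon_details['monitors']}")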