def run(ceph_cluster, **kw):
    """
    BZ https://bugzilla.redhat.com/show_bug.cgi?id=1754078 :
    Run scrub/deep scrub and check osd memory usage
    1. Run deep scrub on the cluster in parallel while IOs are running
    2. Check that the memory usage of the osd daemons does not cross the
       'osd memory target' value in ceph.conf on each osd node

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running bz-1754078")
    log.info(run.__doc__)
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    pg_count = config.get("pg_count", 8)
    timeout = config.get("timeout", 10)
    mons = []
    role = "mon"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)
    with parallel() as p:
        helper = RadosHelper(mons[0], config, log)
        p.spawn(helper.run_radosbench, pg_count, timeout)
        helper = RadosHelper(mons[1], config, log)
        p.spawn(helper.run_scrub)
        helper = RadosHelper(mons[2], config, log)
        p.spawn(helper.run_deep_scrub)
        time.sleep(10)

        osd_nodes = []
        role = "osd"
        with parallel() as p:
            for ceph in ceph_nodes:
                if ceph.role == role:
                    osd_nodes.append(ceph)
                    out, err = ceph.exec_command(
                        cmd="sudo ceph osd ls-tree {host}".format(host=ceph.hostname))
                    osd_id_list_on_node = out.split()
                    log.info("osds on node {}".format(ceph.hostname))
                    log.info(osd_id_list_on_node)
                    osd_mem_target = check_osd_memory_target_of_node(
                        ceph, osd_id_list_on_node)
                    log.info("Node {a} osd_memory_target in bytes: {b}".format(
                        a=ceph.hostname, b=osd_mem_target))
                    p.spawn(
                        check_osd_daemon_memory_usage,
                        ceph,
                        osd_id_list_on_node,
                        osd_mem_target,
                    )
                    time.sleep(1)
    return 0
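
# A minimal, self-contained sketch of the kind of check the
# check_osd_daemon_memory_usage() helper above is expected to perform:
# read `osd_memory_target` over the admin socket and compare it with the
# daemon's resident set size. The use of `systemctl show` for the PID and
# the admin-socket `config get` call are illustrative assumptions, not the
# framework's actual implementation.
import json
import subprocess


def osd_within_memory_target(osd_id: int) -> bool:
    """Return True if the RSS of osd.<osd_id> is below its osd_memory_target."""
    target = int(json.loads(subprocess.check_output(
        ["ceph", "daemon", f"osd.{osd_id}", "config", "get", "osd_memory_target"]
    ))["osd_memory_target"])
    # Resolve the daemon PID via its systemd unit (assumed unit name).
    pid = subprocess.check_output(
        ["systemctl", "show", "--property", "MainPID", "--value", f"ceph-osd@{osd_id}"]
    ).decode().strip()
    with open(f"/proc/{pid}/status") as status:
        rss_kb = next(
            int(line.split()[1]) for line in status if line.startswith("VmRSS:")
        )
    return rss_kb * 1024 <= target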
def run(ceph_cluster, **kw):
    """
    1. Create an LRC profile and then create an ec pool
        # ceph osd erasure-code-profile set $profile \
          plugin=lrc k=4 m=2 l=3 ruleset-failure-domain=osd
        # ceph osd pool create $poolname 1 1 erasure $profile
    2. Start writing a large object so that there is some time to fail the
       osd while reads and writes are in progress on the object
        # rados put -p lrcpool obj1 /src/path
        # rados get -p lrcpool obj1 /tmp/obj1
       While the above commands are in progress, kill the primary osd
       responsible for the PG (the primary can be found from `ceph pg dump`).
    3. Bring back the primary.
    4. Repeat step 2, but this time kill some secondary osds.

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))
    mons = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    """create LRC profile"""
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    if build.startswith("4"):
        profile = "osd erasure-code-profile set {LRCprofile} plugin=lrc k=4 m=2 l=3 \
            crush-failure-domain=osd".format(LRCprofile=prof_name)
    else:
        profile = "osd erasure-code-profile set {LRCprofile} plugin=lrc k=4 m=2 l=3 \
            ruleset-failure-domain=osd crush-failure-domain=osd".format(
            LRCprofile=prof_name)
    try:
        (outbuf, err) = helper.raw_cluster_cmd(profile)
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1

    """create LRC ec pool"""
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1

    """rados put and get in a parallel task"""
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)
        for res in p:
            log.info(res)

    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(pri_osd_id).get("hostname")
    pri_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=pri_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)

    log.info("Reviving osd {osd}".format(osd=pri_osd_id))
    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(pri_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(
        f"Revival of Primary OSD : {pri_osd_id} is complete\n Killing random OSD")
    time.sleep(10)

    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting random osd failed")
        log.error(traceback.format_exc())
        return 1
    log.info("SIGTERM osd")
    target_osd_hostname = ceph_cluster.get_osd_metadata(rand_osd_id).get("hostname")
    rand_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    if not helper.wait_until_osd_state(osd_id=rand_osd_id, down=True):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)

    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    if not helper.wait_until_osd_state(rand_osd_id):
        log.error("osd is DOWN")
        return 1
    log.info(f"Revival of Random OSD : {rand_osd_id} is complete")
    return 0
def run(ceph_cluster, **kw):
    """
    CEPH-9928 RADOS:
    Corrupt snap info of an object and run list-inconsistent-snapset

    Steps:
        1. create a replica 3 pool
        2. take a few pool snaps with writes on objects b/w every snap
        3. choose the primary osd and bring it down
        4. go to the backend and, using ceph-objectstore-tool, corrupt the
           snapset of the object
        5. run deep-scrub on the pg
        6. check rados list-inconsistent-pg <pool>
        7. rados list-inconsistent-snapset <pg>

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-9928")
    log.info(run.__doc__)
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    mons = []
    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    """create a replica pool"""
    pname = "snapcorrupt_{rand}".format(rand=random.randint(0, 10000))
    try:
        helper.create_pool(pname, 1)
        log.info("Pool {pname} created".format(pname=pname))
    except Exception:
        log.error("Failed to create pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(5)

    """Get the target PG and osd for the corruption operation"""
    oname = "UNIQUEOBJECT{i}".format(i=random.randint(0, 10000))
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout['pgid']
    targt_osd_id = cmdout['up'][0]

    '''write data and take snaps'''
    putobj = "sudo rados -p {pool} put {obj} {path}".format(
        pool=pname, obj=oname, path="/etc/hosts")
    for i in range(10):
        (out, err) = ctrlr.exec_command(cmd=putobj)
        snapcmd = "sudo rados mksnap -p {pool} {sname}".format(
            pool=pname, sname="snap" + str(i))
        (out, err) = ctrlr.exec_command(cmd=snapcmd)
        log.info("put {obj}, snap {snap}".format(obj=oname, snap="snap" + str(i)))

    '''
    Go to the destination osd, stop the osd and
    use ceph-objectstore-tool to corrupt the snap info
    '''
    # target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    # target_osd_node = target_osd.node
    target_osd_hostname = ceph_cluster.get_osd_metadata(targt_osd_id).get('hostname')
    log.info(target_osd_hostname)
    target_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_metadata(targt_osd_id).get('osd_data')
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get('osd_data')
    osd_journal = osd_metadata.get('osd_journal')
    if ceph_cluster.containerized:
        docker_image_string = '{docker_registry}/{docker_image}:{docker_tag}'.format(
            docker_registry=ceph_cluster.ansible_config.get('ceph_docker_registry'),
            docker_image=ceph_cluster.ansible_config.get('ceph_docker_image'),
            docker_tag=ceph_cluster.ansible_config.get('ceph_docker_image_tag'))
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        out, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False)
        device_mount_data = out.read().decode()  # type: str
        if not device_mount_data:
            cot_environment.exec_command(
                cmd='sudo mount {partition_path} {directory}'.format(
                    partition_path=partition_path, directory=osd_data))

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --head --op list {obj}".format(osd_data=osd_data,
                                           osd_journal=osd_journal,
                                           obj=oname)
    (out, err) = cot_environment.exec_command(cmd=slist_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            {outbuf} clear-snapset \
            corrupt".format(osd_data=osd_data,
                            osd_journal=osd_journal,
                            outbuf="'" + (outbuf) + "'")
    (out, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (out, err) = helper.raw_cluster_cmd(run_scrub)
    outbuf = out.read().decode()
    log.info(outbuf)

    while 'HEALTH_ERR' not in outbuf or 'active+clean+inconsistent' not in outbuf:
        status = "-s --format json"
        (out, err) = helper.raw_cluster_cmd(status)
        outbuf = out.read().decode()
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent found as expected")

    timeout = 300
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg {pname}".format(pname=pname)
        (out, err) = ctrlr.exec_command(cmd=incon_pg)
        outbuf = out.read().decode()
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 300
    found = 0
    while timeout:
        incon_snap = "sudo rados list-inconsistent-snapset {pg}".format(pg=targt_pg)
        (out, err) = ctrlr.exec_command(cmd=incon_snap)
        outbuf = out.read().decode()
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent snap")
        return 1
    return 0
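
# The two polling loops above share one pattern: re-run a command until its
# output contains an expected token or a timeout expires. A small sketch of
# that pattern as a reusable helper (a refactoring aid and an assumption, not
# part of the original test), using the plain `rados` CLI via subprocess.
import subprocess
import time


def wait_for_output(cmd: list, token: str, timeout: int = 300) -> bool:
    """Re-run `cmd` once per second until `token` shows up in its stdout."""
    end = time.time() + timeout
    while time.time() < end:
        out = subprocess.run(cmd, capture_output=True, text=True).stdout
        if token in out:
            return True
        time.sleep(1)
    return False


# Example usage (pool, pg and obj stand for the values computed in the test):
# assert wait_for_output(["rados", "list-inconsistent-pg", pool], pg)
# assert wait_for_output(["rados", "list-inconsistent-snapset", pg], obj)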
def run(ceph_cluster, **kw):
    """
    CEPH-9311 - RADOS: Pyramid erasure codes (Locally Repairable erasure
    codes): Bring down 2 osds (in case of k=4) from 2 localities so that
    recovery happens from the local repair code.

    1. Create an LRC profile and then create an ec pool
        # ceph osd erasure-code-profile set $profile \
          plugin=lrc k=4 m=2 l=3 ruleset-failure-domain=osd
        # ceph osd pool create $poolname 1 1 erasure $profile
    2. Start writing objects to the pool
        # rados -p poolname bench 1000 write --no-cleanup
    3. Bring down 2 osds from 2 different localities which contain a data
       chunk (for this we need to figure out the mapping). For example, with
       k=4, m=2, l=3 the mapping looks like:
           chunk nr    01234567
           step 1      _cDD_cDD   (Here DD are data chunks)
           step 2      cDDD____
           step 3      ____cDDD
       From "step 1" in the above mapping we can see that the data chunks are
       divided into 2 localities, which is analogous to 2 data centers. So in
       our case we have to bring down, for example, (3,7) OR (2,7) OR (2,6)
       OR (3,6).

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running test ceph-9311")
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    mons = []
    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    '''Create an LRC profile'''
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    profile = "osd erasure-code-profile set {LRCprofile} \
        plugin=lrc \
        k=4 m=2 l=3 \
        ruleset-failure-domain=osd \
        crush-failure-domain=osd".format(LRCprofile=prof_name)
    try:
        (out, err) = helper.raw_cluster_cmd(profile)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1

    '''create LRC ec pool'''
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1

    '''
    Bring down 2 osds which contain a 'D' from both localities;
    we will be choosing the osds at positions 2 and 7 of the acting set
    '''
    oname = "UNIQUEOBJECT{i}".format(i=random.randint(0, 10000))
    cmd = "osd map {pname} {obj} --format json".format(pname=pool_name, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    # targt_pg = cmdout['pgid']
    target_osds_ids = []
    for i in [2, 7]:
        target_osds_ids.append(cmdout['up'][i])

    for i in range(10):
        putobj = "sudo rados -p {pool} put {obj} {path}".format(
            pool=pool_name, obj="{oname}{i}".format(oname=oname, i=i),
            path="/etc/hosts")
        (out, err) = ctrlr.exec_command(cmd=putobj)

    '''Bring down the target osds'''
    osd_service_map_list = []
    for osd_id in target_osds_ids:
        target_osd_node = ceph_cluster.get_osd_by_id(osd_id).node
        osd_service = ceph_cluster.get_osd_service_name(osd_id)
        osd_service_map_list.append({
            'osd_node': target_osd_node,
            'osd_service': osd_service
        })
        helper.kill_osd(target_osd_node, osd_service)
        time.sleep(5)

    outbuf = "degrade"
    timeout = 10
    found = 0
    status = '-s --format json'
    while timeout:
        if 'active' not in outbuf:
            (out, err) = helper.raw_cluster_cmd(status)
            outbuf = out.read().decode()
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("cluster didn't become active+clean..timeout")
        return 1

    '''check whether read/write can be done on the pool'''
    for i in range(10):
        putobj = "sudo rados -p {pool} put {obj} {path}".format(
            pool=pool_name, obj="{oname}{i}".format(oname=oname, i=i),
            path="/etc/hosts")
        (out, err) = ctrlr.exec_command(cmd=putobj)
        log.info(out.read().decode())
    for i in range(10):
        getobj = "sudo rados -p {pool} get {obj} {path}".format(
            pool=pool_name, obj="{oname}{i}".format(oname=oname, i=i),
            path="/tmp/{obj}{i}".format(obj=oname, i=i))
        (out, err) = ctrlr.exec_command(cmd=getobj)
        log.info(out.read().decode())

    '''done with the test, revive the osds'''
    for osd_service_map in osd_service_map_list:
        helper.revive_osd(osd_service_map.get('osd_node'),
                          osd_service_map.get('osd_service'))
    return 0
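
# A small illustrative sketch (an assumption, not framework code) of the
# locality reasoning in the docstring above: for k=4, m=2, l=3 the lrc layers
# are "_cDD_cDD" (global) plus "cDDD____" and "____cDDD" (the two local
# groups). Taking one data chunk position from each local group yields the
# OSD pairs -- e.g. (2, 7) -- that the test brings down.
def locality_data_chunks(global_layer: str, local_layer: str) -> list:
    """Chunk positions that hold data ('D') in the global layer and belong to
    the locality covered by local_layer (any position that is not '_')."""
    return [
        i for i, (g, loc) in enumerate(zip(global_layer, local_layer))
        if g == "D" and loc != "_"
    ]


if __name__ == "__main__":
    step1, step2, step3 = "_cDD_cDD", "cDDD____", "____cDDD"
    print(locality_data_chunks(step1, step2))  # [2, 3]
    print(locality_data_chunks(step1, step3))  # [6, 7]
    # Any pair with one element from each list, e.g. (2, 7) or (3, 6),
    # forces recovery to use the local repair code of both localities.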
def run(ceph_cluster, **kw):
    """
    CEPH-83571453-RADOS:
    Corrupt an object in an ec pool followed by list-inconsistent-* commands
    1. create a jerasure ec pool with k=4, m=2
    2. create an object in the pool
    3. choose the primary osd from the acting set and go to the backend
    4. corrupt the object attrib from the backend
    5. run deep-scrub on the pool
    6. rados list-inconsistent-pg <pool>
    7. rados list-inconsistent-obj <pg>

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-83571453")
    log.info(run.__doc__)
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    build = config.get("build", config.get("rhbuild"))
    mons = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    """create ec pool with k=4, m=2"""
    k = 4
    m = 2
    pname = "eccorrupt_{rand}_{k}_{m}".format(rand=random.randint(0, 10000), k=k, m=m)
    profile = pname
    if build.startswith("4"):
        prof_cmd = "osd erasure-code-profile set {profile} k={k} m={m} \
            crush-failure-domain=osd".format(profile=profile, k=k, m=m)
    else:
        prof_cmd = "osd erasure-code-profile set {profile} k={k} m={m} \
            ruleset-failure-domain=osd crush-failure-domain=osd".format(
            profile=profile, k=k, m=m)
    try:
        (outbuf, err) = helper.raw_cluster_cmd(prof_cmd)
        log.info(outbuf)
        log.info("created profile {ec}".format(ec=profile))
    except Exception:
        log.error("ec profile creation failed")
        log.error(traceback.format_exc())
        return 1

    """create ec pool"""
    try:
        helper.create_pool(pname, 1, profile)
        log.info("Pool {pname} is created".format(pname=pname))
    except Exception:
        log.error("failed to create pool")
        log.error(traceback.format_exc())
        return 1

    """check whether pool exists"""
    try:
        helper.get_pool_num(pname)
    except Exception:
        log.error("Unable to find pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)

    oname = "OBJ_{pname}".format(pname=pname)
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (outbuf, err) = helper.raw_cluster_cmd(cmd)
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout["pgid"]
    """considering primary only as of now because of bug 1544680"""
    targt_osd_id = cmdout["up"][0]

    """write data and take snaps"""
    putobj = "sudo rados -p {pool} put {obj} {path}".format(
        pool=pname, obj=oname, path="/etc/hosts")
    for i in range(10):
        (out, err) = ctrlr.exec_command(cmd=putobj)
        snapcmd = "sudo rados mksnap -p {pool} {sname}".format(
            pool=pname, sname="snap" + str(i))
        (out, err) = ctrlr.exec_command(cmd=snapcmd)
        log.info("put {obj}, snap {snap}".format(obj=oname, snap="snap" + str(i)))

    """
    Go to the destination osd, stop the osd and
    use ceph-objectstore-tool to corrupt the snap info
    """
    # target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    # target_osd_node = target_osd.node
    target_osd_hostname = ceph_cluster.get_osd_metadata(targt_osd_id).get("hostname")
    log.info(target_osd_hostname)
    target_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_metadata(targt_osd_id).get("osd_data")
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get("osd_data")
    osd_journal = osd_metadata.get("osd_journal")
    if ceph_cluster.containerized:
        docker_image_string = "{docker_registry}/{docker_image}:{docker_tag}".format(
            docker_registry=ceph_cluster.ansible_config.get("ceph_docker_registry"),
            docker_image=ceph_cluster.ansible_config.get("ceph_docker_image"),
            docker_tag=ceph_cluster.ansible_config.get("ceph_docker_image_tag"),
        )
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        device_mount_data, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False,
        )
        if not device_mount_data:
            cot_environment.exec_command(
                cmd="sudo mount {partition_path} {directory}".format(
                    partition_path=partition_path, directory=osd_data))

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --head --op list {obj}".format(osd_data=osd_data,
                                           osd_journal=osd_journal,
                                           obj=oname)
    (outbuf, err) = cot_environment.exec_command(cmd=slist_cmd)
    log.info(outbuf)

    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            {outbuf} clear-snapset \
            corrupt".format(osd_data=osd_data,
                            osd_journal=osd_journal,
                            outbuf="'" + (outbuf) + "'")
    (outbuf, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (outbuf, err) = helper.raw_cluster_cmd(run_scrub)
    log.info(outbuf)

    while "HEALTH_ERR" not in outbuf or "active+clean+inconsistent" not in outbuf:
        status = "-s --format json"
        (outbuf, err) = helper.raw_cluster_cmd(status)
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent found as expected")

    timeout = 300
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg {pname}".format(pname=pname)
        (outbuf, err) = ctrlr.exec_command(cmd=incon_pg)
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 300
    found = 0
    while timeout:
        incon_obj = "sudo rados list-inconsistent-obj {pg}".format(pg=targt_pg)
        (outbuf, err) = ctrlr.exec_command(cmd=incon_obj)
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent obj")
        return 1
    return 0
def run(**kw):
    """
    CEPH-9322:
    1. Create an LRC profile and then create an ec pool
        # ceph osd erasure-code-profile set $profile \
          plugin=lrc k=<k> m=<m> l=<l> ruleset-failure-domain=osd
       try different values for k, m and l
        # ceph osd pool create $poolname 1 1 erasure $profile
    2. perform I/O
        # rados put -p poolname obj /path/
    """
    log.info("Running CEPH-9322")
    log.info(run.__doc__)
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    build = config.get('build', config.get('rhbuild'))
    mons = []
    osds = []
    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    role = 'osd'
    for osd in ceph_nodes:
        if osd.role == role:
            osds.append(osd)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    '''because of limited machines, resorting to the following configs'''
    lrc_config = [(4, 2, 3), (2, 1, 3), (2, 2, 2)]
    for conf in lrc_config:
        (k, m, l) = conf
        suffix = "{k}_{m}_{l}".format(k=k, m=m, l=l)
        prof_name = "LRCprofile{suf}".format(suf=suffix)
        if build.startswith('4'):
            profile = "osd erasure-code-profile set {LRC} plugin=lrc k={k} m={m} l={l} \
                crush-failure-domain=osd".format(LRC=prof_name, k=k, m=m, l=l)
        else:
            profile = "osd erasure-code-profile set {LRC} plugin=lrc k={k} m={m} l={l} \
                ruleset-failure-domain=osd crush-failure-domain=osd".format(
                LRC=prof_name, k=k, m=m, l=l)
        try:
            (out, err) = helper.raw_cluster_cmd(profile)
            outbuf = out.read().decode()
            log.info(outbuf)
            log.info("created profile {LRC}".format(LRC=prof_name))
        except Exception:
            log.error("LRC profile creation failed")
            log.error(traceback.format_exc())
            return 1

        '''create LRC ec pool'''
        pname = "lrcpool{rand}{suf}".format(rand=random.randint(0, 10000), suf=suffix)
        try:
            helper.create_pool(pname, 1, prof_name)
            log.info("Pool {pname} created".format(pname=pname))
        except Exception:
            log.error("failed to create lrcpool")
            log.error(traceback.format_exc())
            return 1

        '''check whether pool exists'''
        try:
            helper.get_pool_num(pname)
        except Exception:
            log.error("Unable to find pool")
            log.error(traceback.format_exc())
            return 1
    return 0
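
# The three (k, m, l) tuples above all allow the k + m chunks to be grouped
# into equal locality sets of size l. A tiny validation sketch; the
# divisibility rule is stated here as an assumption based on the lrc plugin's
# documented simple-form behaviour, not something this test encodes.
def valid_lrc_config(k: int, m: int, locality: int) -> bool:
    """True if k+m chunks split into equal locality groups of size `locality`."""
    return k > 0 and m > 0 and locality > 1 and (k + m) % locality == 0


assert all(valid_lrc_config(*cfg) for cfg in [(4, 2, 3), (2, 1, 3), (2, 2, 2)])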
def run(ceph_cluster, **kw):
    """
    BZ https://bugzilla.redhat.com/show_bug.cgi?id=1829646 :
    1. Check the mon memory usage to store the osd map in the DB and the OSD map epoch time.
    2. Bring down OSD(s) and check the memory usage.
    3. The OSD map should be trimmed even when the OSD is down.
    4. The DB size should be reduced by removing the old mappings once the new mappings are added.

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster

    Note: the test cannot be run on a cluster that was just created, as the DB size keeps
    increasing with new mappings for around the next hour or so. Please run the test on a
    cluster whose age is at least 1.5 hours; by then the MonDB will be updated with the mappings.
    """
    log.info("Running bz-1829646")
    log.info(run.__doc__)
    ceph_nodes = kw.get("ceph_nodes")
    mons = []
    osds = []
    osd_list_dict = dict()
    config = kw.get("config")
    # The OSD back-filling/recovery can take up a lot of time...
    # time the method waits for the recovery to complete
    time_limit_for_recovery = 60 * 60 * 4
    # time the method waits for the trimming of the monDB to complete once the recovery is done
    mon_db_trim_time = 60 * 15
    # time interval at which the status of the cluster is checked regularly for recovery completion
    recovery_wait_time = 60 * 3
    # number of OSDs to be brought down during test execution
    osd_down_no = config.get("osd_count", 1)
    # selection of node or daemon to be brought down
    osd_node_bring_down = config.get("non_scale_setup", True)

    for node in ceph_nodes:
        if node.role == "mon":
            mons.append(node)
        if node.role == "osd":
            osds.append(node)
    controller = mons[0]
    log.info(f"choosing mon {controller.hostname} as Control monitor")
    helper = RadosHelper(controller, config, log)

    # collecting the osd daemons present on the OSD nodes
    for node in osds:
        osd_list_dict[node] = helper.collect_osd_daemon_ids(
            mon_node=controller, osd_node=node)

    # collecting the initial size of the MonDB and the OSD map epoch times
    mon_db_initial_size = get_mon_db_size(mon_node=controller)
    osd_map_initial_epoch_times = get_status_from_ceph_report(
        mon_node=controller, operation="osdmap")
    log.info(
        f"Size of the MonDB before bringing down OSDs is {mon_db_initial_size}")
    log.info(
        f"the first and last commits to DB : {osd_map_initial_epoch_times}")

    # stopping the OSD daemon on one of the OSD nodes.
    # Randomly selecting an OSD node and an OSD daemon from that node
    osd_down_dictionary = {}
    random_osd_nodes = random.sample(osds, osd_down_no)
    if osd_node_bring_down:
        for node in random_osd_nodes:
            log.info(f"Randomly selected node : {node.hostname} ")
            change_osd_daemon_status(osd_node=node, task="stop")
    else:
        for node in random_osd_nodes:
            random_osd_daemon = random.choice(osd_list_dict[node])
            log.info(f"Randomly selected node : {node.hostname} "
                     f"from which OSD ID :{random_osd_daemon} will be stopped")
            change_osd_daemon_status(
                osd_node=node, osd_number=random_osd_daemon, task="stop")
            osd_down_dictionary[node] = random_osd_daemon

    log.info(
        "sleeping for 2 minutes so that the OSD down is recorded and the recovery process is started"
    )
    time.sleep(120)

    recovery_start_time = time.time()
    mon_db_size_list = []
    while time_limit_for_recovery:
        # collecting the health status to check the status of the recovery process
        ceph_health_status = get_status_from_ceph_report(
            mon_node=controller, operation="health")
        recovery_tuple = ("OSD_DOWN", "PG_AVAILABILITY", "PG_DEGRADED")
        mon_db_size_list.append(get_mon_db_size(mon_node=controller))
        if not any(key in ceph_health_status["checks"].keys()
                   for key in recovery_tuple):
            log.info("The recovery and back-filling of the OSD is completed")
            log.info(
                f"Sleeping {mon_db_trim_time / 60} minutes after the recovery for trimming of the MonDB to complete"
            )
            time.sleep(mon_db_trim_time)
            mon_db_size_list.append(get_mon_db_size(mon_node=controller))
            time.sleep(mon_db_trim_time)
            break
        time_limit_for_recovery -= recovery_wait_time
        log.info(
            f"The recovery and back-filling of the OSD is not completed / In-progress \n"
            f"Time elapsed since recovery start : {(time.time() - recovery_start_time) / 60} Minutes\n"
            f"checking the status of cluster recovery again in {recovery_wait_time / 60} minutes\n"
            f"Time remaining for process completion : {time_limit_for_recovery / 60} minutes"
        )
        time.sleep(recovery_wait_time)

    # collecting the final size of the MonDB and the OSD map epoch times
    mon_db_final_size = get_mon_db_size(mon_node=controller)
    log.info(
        f"the size of the cluster DB after the OSD recovery : {mon_db_final_size} ")
    osd_map_final_epoch_times = get_status_from_ceph_report(
        mon_node=controller, operation="osdmap")
    max_mon_db_size_reached = max(mon_db_size_list)
    log.info(
        f"the Maximum size of the cluster DB during the OSD recovery : {max_mon_db_size_reached} "
    )
    log.debug(
        f"the first and last commits to DB after recovery: {osd_map_final_epoch_times}"
    )

    # starting the stopped OSD(s)
    if osd_node_bring_down:
        for node in random_osd_nodes:
            log.info(f"starting the OSD node {node.hostname}")
            change_osd_daemon_status(osd_node=node, task="start")
            time.sleep(5)
    else:
        for node in osd_down_dictionary.keys():
            log.info(
                f"starting the OSD ID : {osd_down_dictionary[node]} on node {node.hostname}"
            )
            change_osd_daemon_status(
                osd_node=node, osd_number=osd_down_dictionary[node], task="start")
            time.sleep(5)

    flag_db_size = 0
    # checking the monDB size and the OSD map trimming
    max_size_increase = abs(max_mon_db_size_reached - mon_db_initial_size)
    final_size_change = abs(mon_db_final_size - mon_db_initial_size)
    if max_size_increase > final_size_change:
        log.info(
            f"The monDB map was trimmed by : {abs(max_size_increase - final_size_change)}"
        )
    else:
        log.error(
            f"The monDB was not trimmed. The size is equal or more :{abs(max_size_increase - final_size_change)}"
        )
        flag_db_size = 1

    # checking the OSD map, i.e. whether the old mappings were trimmed
    initial_epoch_time_difference = (
        osd_map_initial_epoch_times["osdmap_last_committed"]
        - osd_map_initial_epoch_times["osdmap_first_committed"])
    log.debug(
        f"The initial difference in the osd maps is : {initial_epoch_time_difference}"
    )
    final_epoch_time_difference = (
        osd_map_final_epoch_times["osdmap_last_committed"]
        - osd_map_final_epoch_times["osdmap_first_committed"])
    log.debug(
        f"The Final difference in the osd maps is : {final_epoch_time_difference}"
    )
    flag_osd_map = 1 if final_epoch_time_difference > 800 else 0

    if flag_osd_map == 0 and flag_db_size == 0:
        return 0
    return 1
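
# A sketch of what the two helpers used above could look like on a plain
# (non-containerized) deployment: read the MonDB size with `du` and pull the
# osdmap commit epochs out of `ceph report`. The store path and the exact
# report keys are assumptions for illustration, not the framework's helpers.
import json
import subprocess


def mon_db_size_bytes(store_path: str = "/var/lib/ceph/mon") -> int:
    """Total on-disk size (bytes) of the monitor store under store_path."""
    out = subprocess.check_output(["sudo", "du", "-sb", store_path]).decode()
    return int(out.split()[0])


def osdmap_commit_epochs() -> dict:
    """First/last committed osdmap epochs as reported by `ceph report`."""
    report = json.loads(subprocess.check_output(["ceph", "report"]))
    return {
        "osdmap_first_committed": report["osdmap_first_committed"],
        "osdmap_last_committed": report["osdmap_last_committed"],
    }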
def run(**kw):
    log.info("Running radoslib test")
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    mons = []
    osds = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    for osd in ceph_nodes:
        if osd.role == "osd":
            osds.append(osd)
    idx = 0
    mon = mons[idx]
    print(mon.hostname)
    helper = RadosHelper(mon, config, log)
    """
    try:
        helper.create_pool("blabla1", 4)
        log.info("pool created successfully")
    except Exception:
        log.error("pool creation failed")
        return 1
    try:
        pri_osd = helper.get_pg_primary("new", 0)
        print(pri_osd)
    except Exception:
        return 1
    try:
        osdhost = helper.get_osd_host(0)
        print(osdhost)
    except Exception:
        log.error("getting osd host failed")
        return 1
    ret = 1
    try:
        log.info("TRYING KILL")
        ret = helper.kill_osd(1, osds)
        log.info("ret={ret}".format(ret=ret))
    finally:
        return ret
    try:
        ret = helper.is_up(1)
        if ret:
            log.info("UP")
        else:
            log.info("DOWN")
        return ret
    except Exception:
        log.error("status check failed")
        return 1
    """
    try:
        ret = helper.revive_osd(1, osds)
        return ret
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
def run(ceph_cluster, **kw):
    """
    CEPH-11538:
    Check the default messenger (i.e. async messenger) and switch b/w simple
    and async messenger
    1. By default 3.x will have the async messenger; anything below will have
       the simple messenger
    2. add ms_type = async for enabling async and check io
    3. add ms_type = simple for enabling simple and check io

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-11538")
    log.info(run.__doc__)
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    mons = []
    osds = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    role = "osd"
    for osd in ceph_nodes:
        if osd.role == role:
            osds.append(osd)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    """create a pool for io"""
    pname = "mscheck_{rand}".format(rand=random.randint(0, 10000))
    helper.create_pool(pname, 1)
    log.info("pool {pname} created".format(pname=pname))
    time.sleep(5)

    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj="obj1")
    (outbuf, err) = helper.raw_cluster_cmd(cmd)
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_osd = cmdout["up"][0]

    """check what the default messenger is"""
    mstype = get_ms_type(targt_osd, osds, ceph_cluster)
    if mstype != "async+posix":
        log.error("default on luminous should be async but \
            we have {mstype}".format(mstype=mstype))
        return 1

    """switch to simple and do IO"""
    inject_osd = "tell osd.* injectargs --ms_type simple"
    (out, err) = helper.raw_cluster_cmd(inject_osd)
    log.info(out)
    time.sleep(4)
    """check whether ms_type changed"""
    mstype = get_ms_type(targt_osd, osds, ceph_cluster)
    if "simple" == mstype:
        log.info("successfully changed to simple")
    else:
        log.error("failed to change the ms_type to simple")
        return 1

    """change ms_type back to async"""
    inject_osd = "tell osd.* injectargs --ms_type async+posix"
    (out, err) = helper.raw_cluster_cmd(inject_osd)
    log.info(out)
    time.sleep(4)
    """check whether ms_type changed"""
    mstype = get_ms_type(targt_osd, osds, ceph_cluster)
    if "async+posix" == mstype:
        log.info("successfully changed to async+posix")
    else:
        log.error("failed to change the ms_type to async")
        return 1
    return 0
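
# A minimal sketch of what a get_ms_type()-style check can do: query the
# messenger type of a given OSD over its admin socket with
# `ceph daemon osd.<id> config get ms_type`. Running this on the node that
# hosts the OSD is assumed; this is illustrative and its signature differs
# from the framework helper used above.
import json
import subprocess


def query_ms_type(osd_id: int) -> str:
    """Return the ms_type currently configured on osd.<osd_id>."""
    out = subprocess.check_output(
        ["ceph", "daemon", f"osd.{osd_id}", "config", "get", "ms_type"]
    )
    return json.loads(out)["ms_type"]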
def run(ceph_cluster, **kw):
    """
    CEPH-9925 - [RADOS]:
    Rewrite a known omap item of a replica and run list-inconsistent-obj

    Steps:
        1. create an object in a replica pool
        2. add some omap keys and corresponding values to the object
        3. choose one of the replicas and, using ceph-objectstore-tool,
           corrupt an omap key or value
        4. Run deep-scrub - scrub should report inconsistency
        5. run rados list-inconsistent-pg <pool> - should list the pg in
           which the object is inconsistent
        6. Run rados list-inconsistent-obj <pg> - should report an omap
           digest mismatch error

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-9925")
    log.info(run.__doc__)
    ceph_nodes = kw.get("ceph_nodes")
    config = kw.get("config")
    mons = []
    role = "client"
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    """create a replica pool"""
    pname = "replica_pool_{rand}".format(rand=random.randint(0, 10000))
    try:
        helper.create_pool(pname, 128)
        log.info("Pool {pname} is created".format(pname=pname))
    except Exception:
        log.error("failed to create pool")
        log.error(traceback.format_exc())
        return 1

    """check whether pool exists"""
    try:
        helper.get_pool_num(pname)
    except Exception:
        log.error("Unable to find pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)

    oname = "OBJ_{pname}".format(pname=pname)
    putobj = "sudo rados -p {pool} put {obj} {path}".format(
        pool=pname, obj=oname, path="/etc/hosts")
    (out, err) = ctrlr.exec_command(cmd=putobj)

    """creating omap key/value pairs for the object"""
    for i in range(4):
        omapcmd = "sudo rados -p {pool} setomapval {obj} {keey} {valu}".format(
            pool=pname, obj=oname, keey="key" + str(i), valu="value" + str(i))
        (out, err) = ctrlr.exec_command(cmd=omapcmd)
        log.info("put {obj}, omap key {keey} value {valu}".format(
            obj=oname, keey="key" + str(i), valu="value" + str(i)))

    """
    Go to the destination osd and stop the osd service, to use
    ceph-objectstore-tool to corrupt the omap keys
    """
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout["pgid"]
    """Considering a non-primary osd"""
    targt_osd_id = cmdout["up"][1]
    # target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    # target_osd_node = target_osd.node
    target_osd_hostname = ceph_cluster.get_osd_metadata(targt_osd_id).get("hostname")
    log.info(target_osd_hostname)
    target_osd_node = ceph_cluster.get_node_by_hostname(target_osd_hostname)
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_metadata(targt_osd_id).get("osd_data")
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get("osd_data")
    osd_journal = osd_metadata.get("osd_journal")
    if ceph_cluster.containerized:
        # target_osd_node.exec_command(cmd='sudo yum install -y ceph-osd', check_ec=False)
        docker_image_string = "{docker_registry}/{docker_image}:{docker_tag}".format(
            docker_registry=ceph_cluster.ansible_config.get("ceph_docker_registry"),
            docker_image=ceph_cluster.ansible_config.get("ceph_docker_image"),
            docker_tag=ceph_cluster.ansible_config.get("ceph_docker_image_tag"),
        )
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        out, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False,
        )
        device_mount_data = out.read().decode()  # type: str
        if not device_mount_data:
            cot_environment.exec_command(
                cmd="sudo mount {partition_path} {directory}".format(
                    partition_path=partition_path, directory=osd_data))

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --pgid {pgid} {obj} list-omap".format(osd_data=osd_data,
                                                  osd_journal=osd_journal,
                                                  obj=oname, pgid=targt_pg)
    (out, err) = cot_environment.exec_command(cmd=slist_cmd)
    outbuf = out.read().decode()
    keylist = outbuf.split()
    log.info(outbuf)

    """corrupt an omap key by rewriting it with a different value"""
    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --pgid {pgid} {obj} set-omap \
            {outbuf} {path}".format(
        osd_data=osd_data,
        osd_journal=osd_journal,
        obj=oname,
        pgid=targt_pg,
        outbuf=keylist[0],
        path="/etc/hosts",
    )
    (out, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (out, err) = helper.raw_cluster_cmd(run_scrub)
    outbuf = out.read().decode()
    log.info(outbuf)

    while "HEALTH_ERR" not in outbuf or "active+clean+inconsistent" not in outbuf:
        status = "-s --format json"
        (out, err) = helper.raw_cluster_cmd(status)
        outbuf = out.read().decode()
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent found as expected")

    timeout = 100
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg {pname}".format(pname=pname)
        (out, err) = ctrlr.exec_command(cmd=incon_pg)
        outbuf = out.read().decode()
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 100
    found = 0
    while timeout:
        incon_obj = "sudo rados list-inconsistent-obj {pg}".format(pg=targt_pg)
        (out, err) = ctrlr.exec_command(cmd=incon_obj)
        outbuf = out.read().decode()
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent obj")
        return 1
    return 0
def run(ceph_cluster, **kw):
    """
    CEPH-9939:
    Delete snapset objects in an ec pool followed by list-inconsistent-* commands
    1. create a jerasure ec pool with k=4, m=2
    2. create an object in the pool
    3. choose any of the osds from the acting set and go to the backend
    4. delete the snap object from the backend
    5. run deep-scrub on the pool
    6. rados list-inconsistent-pg <pool>
    7. rados list-inconsistent-obj <pg>

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running CEPH-9939")
    log.info(run.__doc__)
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    mons = []
    osds = []
    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    role = 'osd'
    for osd in ceph_nodes:
        if osd.role == role:
            osds.append(osd)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    """create ec pool with k=4, m=2"""
    k = 4
    m = 2
    pname = "ecsnapdelete_{rand}_{k}_{m}".format(rand=random.randint(0, 1000), k=k, m=m)
    profile = pname
    prof_cmd = "osd erasure-code-profile set {profile} \
        k={k} \
        m={m} \
        ruleset-failure-domain=osd \
        crush-failure-domain=osd".format(profile=profile, k=k, m=m)
    try:
        (out, err) = helper.raw_cluster_cmd(prof_cmd)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {ec}".format(ec=profile))
    except Exception:
        log.error("ec profile creation failed")
        log.error(traceback.format_exc())
        return 1

    '''create ec pool'''
    try:
        helper.create_pool(pname, 1, profile)
        log.info("Pool {pname} is created".format(pname=pname))
    except Exception:
        log.error("failed to create pool")
        log.error(traceback.format_exc())
        return 1

    '''check whether pool exists'''
    try:
        helper.get_pool_num(pname)
    except Exception:
        log.error("Unable to find pool")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)

    oname = "OBJ_{pname}".format(pname=pname)
    cmd = "osd map {pname} {obj} --format json".format(pname=pname, obj=oname)
    (out, err) = helper.raw_cluster_cmd(cmd)
    outbuf = out.read().decode()
    log.info(outbuf)
    cmdout = json.loads(outbuf)
    targt_pg = cmdout['pgid']
    '''considering primary only as of now because of bug 1544680'''
    targt_osd_id = cmdout['up'][0]

    '''write data and take snaps'''
    putobj = "sudo rados -p {pool} put {obj} {path}".format(
        pool=pname, obj=oname, path="/etc/hosts")
    for i in range(10):
        (out, err) = ctrlr.exec_command(cmd=putobj)
        snapcmd = "sudo rados mksnap -p {pool} {sname}".format(
            pool=pname, sname="snap" + str(i))
        (out, err) = ctrlr.exec_command(cmd=snapcmd)
        log.info("put {obj}, snap {snap}".format(obj=oname, snap="snap" + str(i)))

    """
    Go to the destination osd, stop the osd and
    use ceph-objectstore-tool to delete the snap
    """
    target_osd = ceph_cluster.get_osd_by_id(targt_osd_id)
    target_osd_node = target_osd.node
    cot_environment = target_osd_node
    osd_service = ceph_cluster.get_osd_service_name(targt_osd_id)
    partition_path = ceph_cluster.get_osd_data_partition_path(targt_osd_id)
    helper.kill_osd(target_osd_node, osd_service)
    time.sleep(10)
    osd_metadata = ceph_cluster.get_osd_metadata(targt_osd_id)
    osd_data = osd_metadata.get('osd_data')
    osd_journal = osd_metadata.get('osd_journal')
    if ceph_cluster.containerized:
        docker_image_string = '{docker_registry}/{docker_image}:{docker_tag}'.format(
            docker_registry=ceph_cluster.ansible_config.get('ceph_docker_registry'),
            docker_image=ceph_cluster.ansible_config.get('ceph_docker_image'),
            docker_tag=ceph_cluster.ansible_config.get('ceph_docker_image_tag'))
        cot_environment = helper.get_mgr_proxy_container(
            target_osd_node, docker_image_string)
        out, err = cot_environment.exec_command(
            cmd='mount | grep "{partition_path} "'.format(
                partition_path=partition_path),
            check_ec=False)
        device_mount_data = out.read().decode()  # type: str
        if not device_mount_data:
            cot_environment.exec_command(
                cmd='sudo mount {partition_path} {directory}'.format(
                    partition_path=partition_path, directory=osd_data))

    slist_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            --op list \
            {obj}|grep \\\"snapid\\\":1".format(osd_data=osd_data,
                                                osd_journal=osd_journal,
                                                obj=oname)
    (out, err) = cot_environment.exec_command(cmd=slist_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    corrupt_cmd = "sudo ceph-objectstore-tool --data-path \
            {osd_data} --journal-path \
            {osd_journal} \
            {outbuf} remove".format(osd_data=osd_data,
                                    osd_journal=osd_journal,
                                    outbuf="'" + (outbuf) + "'")
    (out, err) = cot_environment.exec_command(cmd=corrupt_cmd)
    outbuf = out.read().decode()
    log.info(outbuf)

    helper.revive_osd(target_osd_node, osd_service)
    time.sleep(10)
    run_scrub = "pg deep-scrub {pgid}".format(pgid=targt_pg)
    (out, err) = helper.raw_cluster_cmd(run_scrub)
    outbuf = out.read().decode()
    log.info(outbuf)

    while 'HEALTH_ERR' not in outbuf or 'active+clean+inconsistent' not in outbuf:
        status = "-s --format json"
        (out, err) = helper.raw_cluster_cmd(status)
        outbuf = out.read().decode()
    log.info("HEALTH_ERR found as expected")
    log.info("inconsistent found as expected")

    timeout = 300
    found = 0
    while timeout:
        incon_pg = "sudo rados list-inconsistent-pg {pname}".format(pname=pname)
        (out, err) = ctrlr.exec_command(cmd=incon_pg)
        outbuf = out.read().decode()
        log.info(outbuf)
        if targt_pg not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("pg not listed as inconsistent")
        return 1

    timeout = 300
    found = 0
    while timeout:
        incon_obj = "sudo rados list-inconsistent-obj {pg}".format(pg=targt_pg)
        (out, err) = ctrlr.exec_command(cmd=incon_obj)
        outbuf = out.read().decode()
        log.info(outbuf)
        if oname not in outbuf:
            time.sleep(1)
            timeout = timeout - 1
        else:
            found = 1
            break
    if timeout == 0 and found == 0:
        log.error("object is not listed in inconsistent obj")
        return 1
    return 0
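
# A condensed sketch of the backend manipulation performed above with
# ceph-objectstore-tool on the stopped OSD: list the object's entries, find
# the clone with the wanted snapid and remove it. The paths and the
# filestore-style flags mirror the invocation used in this test and are
# placeholders; run such commands only against a stopped OSD.
import subprocess


def remove_snap_clone(osd_data: str, osd_journal: str, obj: str, snapid: int = 1) -> None:
    """Find the JSON listing entry for obj with the given snapid and remove it."""
    listing = subprocess.check_output(
        ["sudo", "ceph-objectstore-tool", "--data-path", osd_data,
         "--journal-path", osd_journal, "--op", "list", obj]
    ).decode()
    for line in listing.splitlines():
        if f'"snapid":{snapid}' in line.replace(" ", ""):
            subprocess.run(
                ["sudo", "ceph-objectstore-tool", "--data-path", osd_data,
                 "--journal-path", osd_journal, line, "remove"],
                check=True,
            )
            break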
def run(ceph_cluster, **kw):
    """
    1. Create an LRC profile and then create an ec pool
        # ceph osd erasure-code-profile set $profile \
          plugin=lrc k=4 m=2 l=3 ruleset-failure-domain=osd
        # ceph osd pool create $poolname 1 1 erasure $profile
    2. Start writing a large object so that there is some time to fail the
       osd while reads and writes are in progress on the object
        # rados put -p lrcpool obj1 /src/path
        # rados get -p lrcpool obj1 /tmp/obj1
       While the above commands are in progress, kill the primary osd
       responsible for the PG (the primary can be found from `ceph pg dump`).
    3. Bring back the primary.
    4. Repeat step 2, but this time kill some secondary osds.

    Args:
        ceph_cluster (ceph.ceph.Ceph): ceph cluster
    """
    log.info("Running test CEPH-9281")
    ceph_nodes = kw.get('ceph_nodes')
    config = kw.get('config')
    mons = []
    role = 'client'
    for mnode in ceph_nodes:
        if mnode.role == role:
            mons.append(mnode)
    ctrlr = mons[0]
    log.info("choosing mon {cmon} as ctrlrmon".format(cmon=ctrlr.hostname))
    helper = RadosHelper(ctrlr, config, log)

    '''create LRC profile'''
    sufix = random.randint(0, 10000)
    prof_name = "LRCprofile{suf}".format(suf=sufix)
    profile = "osd erasure-code-profile set {LRCprofile} \
        plugin=lrc \
        k=4 m=2 l=3 \
        ruleset-failure-domain=osd \
        crush-failure-domain=osd".format(LRCprofile=prof_name)
    try:
        (out, err) = helper.raw_cluster_cmd(profile)
        outbuf = out.read().decode()
        log.info(outbuf)
        log.info("created profile {LRCprofile}".format(LRCprofile=prof_name))
    except Exception:
        log.error("LRC profile creation failed")
        log.error(traceback.format_exc())
        return 1

    '''create LRC ec pool'''
    pool_name = "lrcpool{suf}".format(suf=sufix)
    try:
        helper.create_pool(pool_name, 1, prof_name)
        log.info("Pool {pname} created".format(pname=pool_name))
    except Exception:
        log.error("lrcpool create failed")
        log.error(traceback.format_exc())
        return 1

    '''rados put and get in a parallel task'''
    with parallel() as p:
        p.spawn(do_rados_put, ctrlr, pool_name, 20)
        p.spawn(do_rados_get, ctrlr, pool_name, 10)
        for res in p:
            log.info(res)

    try:
        pri_osd_id = helper.get_pg_primary(pool_name, 0)
        log.info("PRIMARY={pri}".format(pri=pri_osd_id))
    except Exception:
        log.error("getting primary failed")
        log.error(traceback.format_exc())
        return 1

    log.info("SIGTERM osd")
    pri_osd = ceph_cluster.get_osd_by_id(pri_osd_id)
    pri_osd_node = pri_osd.node
    pri_osd_service = ceph_cluster.get_osd_service_name(pri_osd_id)
    try:
        helper.kill_osd(pri_osd_node, pri_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    time.sleep(10)
    if helper.is_up(pri_osd_id):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)

    log.info("Reviving osd {osd}".format(osd=pri_osd_id))
    try:
        if helper.revive_osd(pri_osd_node, pri_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    time.sleep(10)
    if helper.is_up(pri_osd_id):
        log.info("osd is UP")
    else:
        log.error("osd is DOWN")
        return 1
    time.sleep(10)

    try:
        rand_osd_id = helper.get_pg_random(pool_name, 0)
        log.info("RANDOM OSD={rosd}".format(rosd=rand_osd_id))
    except Exception:
        log.error("getting random osd failed")
        log.error(traceback.format_exc())
        return 1
    log.info("SIGTERM osd")
    rand_osd = ceph_cluster.get_osd_by_id(rand_osd_id)
    rand_osd_node = rand_osd.node
    rand_osd_service = ceph_cluster.get_osd_service_name(rand_osd_id)
    try:
        helper.kill_osd(rand_osd_node, rand_osd_service)
        log.info("osd killed")
    except Exception:
        log.error("killing osd failed")
        log.error(traceback.format_exc())
    time.sleep(10)
    if helper.is_up(rand_osd_id):
        log.error("unexpected! osd is still up")
        return 1
    time.sleep(5)

    log.info("Reviving osd {osd}".format(osd=rand_osd_id))
    try:
        if helper.revive_osd(rand_osd_node, rand_osd_service):
            log.error("revive failed")
            return 1
    except Exception:
        log.error("revive failed")
        log.error(traceback.format_exc())
        return 1
    time.sleep(30)
    if helper.is_up(rand_osd_id):
        log.info("osd is UP")
    else:
        log.error("osd is DOWN")
        return 1
    return 0