def select_volume_bricks_to_bring_offline(mnode, volname):
    """Randomly selects bricks to bring offline without affecting the
    cluster from a non-tiered volume.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.

    Returns:
        list: On success returns list of bricks that can be brought
            offline. If volume doesn't exist or is a tiered volume
            returns empty list.
    """
    volume_bricks_to_bring_offline = []

    # Check if volume is tiered
    if is_tiered_volume(mnode, volname):
        return volume_bricks_to_bring_offline

    # get volume type
    volume_type_info = get_volume_type_info(mnode, volname)
    volume_type = volume_type_info['volume_type_info']['typeStr']

    # get subvols
    subvols_dict = get_subvols(mnode, volname)
    volume_subvols = subvols_dict['volume_subvols']

    # select bricks from distribute volume
    if volume_type == 'Distribute':
        volume_bricks_to_bring_offline = []

    # select bricks from replicated, distributed-replicated volume
    elif (volume_type == 'Replicate' or
          volume_type == 'Distributed-Replicate'):
        # Get replica count
        volume_replica_count = (
            volume_type_info['volume_type_info']['replicaCount'])

        # Get quorum info
        quorum_info = get_client_quorum_info(mnode, volname)
        volume_quorum_info = quorum_info['volume_quorum_info']

        # Get list of bricks to bring offline
        volume_bricks_to_bring_offline = (
            get_bricks_to_bring_offline_from_replicated_volume(
                volume_subvols, volume_replica_count,
                volume_quorum_info))

    # select bricks from Disperse, Distributed-Disperse volume
    elif (volume_type == 'Disperse' or
          volume_type == 'Distributed-Disperse'):
        # Get redundancy count
        volume_redundancy_count = (
            volume_type_info['volume_type_info']['redundancyCount'])

        # Get list of bricks to bring offline
        volume_bricks_to_bring_offline = (
            get_bricks_to_bring_offline_from_disperse_volume(
                volume_subvols, volume_redundancy_count))

    return volume_bricks_to_bring_offline
def select_hot_tier_bricks_to_bring_offline(mnode, volname):
    """Randomly selects bricks to bring offline without affecting the
    cluster from a hot tier.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.

    Returns:
        list: On success returns list of bricks that can be brought
            offline from the hot tier. If volume doesn't exist or is a
            non-tiered volume returns empty list.
    """
    hot_tier_bricks_to_bring_offline = []

    # Check if volume is tiered
    if not is_tiered_volume(mnode, volname):
        return hot_tier_bricks_to_bring_offline

    # get volume type
    volume_type_info = get_volume_type_info(mnode, volname)
    hot_tier_type = volume_type_info['hot_tier_type_info']['hotBrickType']

    # get subvols
    subvols_dict = get_subvols(mnode, volname)
    hot_tier_subvols = subvols_dict['hot_tier_subvols']

    # select bricks from distribute volume
    if hot_tier_type == 'Distribute':
        hot_tier_bricks_to_bring_offline = []

    # select bricks from replicated, distributed-replicated volume
    elif (hot_tier_type == 'Replicate' or
          hot_tier_type == 'Distributed-Replicate'):
        # Get replica count
        hot_tier_replica_count = (
            volume_type_info['hot_tier_type_info']['hotreplicaCount'])

        # Get quorum info
        quorum_info = get_client_quorum_info(mnode, volname)
        hot_tier_quorum_info = quorum_info['hot_tier_quorum_info']

        # Get list of bricks to bring offline
        hot_tier_bricks_to_bring_offline = (
            get_bricks_to_bring_offline_from_replicated_volume(
                hot_tier_subvols, hot_tier_replica_count,
                hot_tier_quorum_info))

    return hot_tier_bricks_to_bring_offline
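# Usage sketch (illustrative, not part of the upstream helpers): a thin
# dispatcher showing how a caller might pick between the two selectors
# above based on the volume's tiering status. Only names already used in
# this module are referenced; the function name itself is hypothetical.
def example_select_bricks_to_bring_offline(mnode, volname):
    """Illustrative only: return bricks that can safely go offline."""
    if is_tiered_volume(mnode, volname):
        # Tiered volume: pick candidates from the hot tier; a cold-tier
        # selector would be handled the same way.
        return select_hot_tier_bricks_to_bring_offline(mnode, volname)
    # Non-tiered volume: use the generic selector.
    return select_volume_bricks_to_bring_offline(mnode, volname)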
def test_manual_heal_should_trigger_heal(self):
    """
    - create a single brick volume
    - add some files and directories
    - get arequal from mountpoint
    - add-brick such that this brick makes the volume a replica vol 1x2
    - start heal
    - make sure heal is completed
    - get arequals from all bricks and compare with arequal from
      mountpoint
    """
    # pylint: disable=too-many-statements,too-many-locals
    # Start IO on mounts
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("python %s create_deep_dirs_with_files "
               "--dir-length 1 "
               "--dir-depth 1 "
               "--max-num-of-dirs 1 "
               "--num-of-files 10 %s" % (self.script_upload_path,
                                         mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   mount_obj.client_system, mount_obj.mountpoint)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get arequal for mount before adding bricks
    g.log.info('Getting arequal before adding bricks...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before adding bricks is successful')
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    # Form brick list to add
    g.log.info('Forming brick list to add...')
    bricks_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                     self.servers,
                                     self.all_servers_info)
    g.log.info('Brick list to add: %s', bricks_to_add)

    # Add bricks
    g.log.info("Start adding bricks to volume...")
    ret, _, _ = add_brick(self.mnode, self.volname, bricks_to_add,
                          force=True, replica_count=2)
    self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
    g.log.info("Adding bricks is successful on volume %s", self.volname)

    # Make sure the newly added bricks are available in the volume
    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)
    for brick in bricks_to_add:
        self.assertIn(brick, bricks_list,
                      'Brick %s is not in brick list' % brick)
    g.log.info('New bricks are present in the volume')

    # Make sure volume changed from distribute to replicate volume
    vol_info_dict = get_volume_type_info(self.mnode, self.volname)
    vol_type = vol_info_dict['volume_type_info']['typeStr']
    self.assertEqual('Replicate', vol_type,
                     'Volume type is not converted to Replicate '
                     'after adding bricks')
    g.log.info('Volume type is successfully converted to Replicate '
               'after adding bricks')

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal on bricks and compare with mount_point_total
    # It should be the same
    g.log.info('Getting arequal on bricks...')
    arequals_after_heal = {}
    for brick in bricks_list:
        g.log.info('Getting arequal on brick %s...', brick)
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        arequals_after_heal[brick] = brick_total
        self.assertEqual(mount_point_total, brick_total,
                         'Arequals for mountpoint and %s are not equal'
                         % brick)
        g.log.info('Arequals for mountpoint and %s are equal', brick)
    g.log.info('All arequals are equal for the replicated volume')
def test_glustershd_on_all_volume_types(self):
    """
    Test script to verify that the glustershd server vol file
    has entries only for replicate volumes

    * Create multiple volumes and start all volumes
    * Check the glustershd processes - only one glustershd should be listed
    * Check the glustershd server vol file - should contain entries only
      for the replicate volumes involved
    * Add bricks to the replicate volume - it should convert to
      distributed-replicate
    * Check the glustershd server vol file - newly added bricks should
      be present
    * Check the glustershd processes - only one glustershd should be listed
    """
    # pylint: disable=too-many-statements
    nodes = self.servers

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, glustershd_pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % glustershd_pids))
    g.log.info("Successful in getting single self heal daemon process"
               " on all nodes %s", nodes)

    # For all the volumes, check whether the bricks are present in the
    # glustershd server vol file
    volume_list = get_volume_list(self.mnode)
    for volume in volume_list:
        g.log.info("Volume Name: %s", volume)
        volume_type_info = get_volume_type_info(self.mnode, volume)
        volume_type = (volume_type_info['volume_type_info']['typeStr'])

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", volume)
        bricks_list = get_all_bricks(self.mnode, volume)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Start parsing file %s on "
                   "node %s", self.GLUSTERSHD, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                             bricks_list)
        if volume_type == 'Distribute':
            self.assertFalse(ret, ("Bricks exist in glustershd server "
                                   "volume file for %s Volume"
                                   % volume_type))
            g.log.info("EXPECTED : Bricks don't exist in glustershd "
                       "server volume file for %s Volume", volume_type)
        else:
            self.assertTrue(ret, ("Brick List from volume info is "
                                  "different from glustershd server "
                                  "volume file. Please check log "
                                  "file for details"))
            g.log.info("Bricks exist in glustershd server volume file "
                       "for %s Volume", volume_type)

    # expanding volume for Replicate
    for volume in volume_list:
        volume_type_info = get_volume_type_info(self.mnode, volume)
        volume_type = (volume_type_info['volume_type_info']['typeStr'])
        if volume_type == 'Replicate':
            g.log.info("Start adding bricks to volume %s", volume)
            ret = expand_volume(self.mnode, volume, self.servers,
                                self.all_servers_info)
            self.assertTrue(ret, ("Failed to add bricks to "
                                  "volume %s" % volume))
            g.log.info("Add brick successful")

            # Log Volume Info and Status after expanding the volume
            g.log.info("Logging volume info and status after "
                       "expanding volume")
            ret = log_volume_info_and_status(self.mnode, volume)
            self.assertTrue(ret, ("Logging volume info and status "
                                  "failed on volume %s" % volume))
            g.log.info("Successful in logging volume info and status "
                       "of volume %s", volume)

            # Verify all of the volume's processes are online for 60 sec
            g.log.info("Verifying volume's all process are online")
            ret = wait_for_volume_process_to_be_online(self.mnode,
                                                       volume, 60)
            self.assertTrue(ret, ("Volume %s : All processes are not "
                                  "online" % volume))
            g.log.info("Successfully verified volume %s processes "
                       "are online", volume)

            # check the type for the replicate volume
            volume_type_info_for_replicate_after_adding_bricks = \
                get_volume_type_info(self.mnode, volume)
            volume_type_for_replicate_after_adding_bricks = \
                (volume_type_info_for_replicate_after_adding_bricks
                 ['volume_type_info']['typeStr'])

            self.assertEqual(
                volume_type_for_replicate_after_adding_bricks,
                'Distributed-Replicate',
                ("Replicate volume type is not converted to "
                 "Distributed-Replicate after adding bricks"))
            g.log.info("Replicate Volume is successfully converted to"
                       " Distributed-Replicate after adding bricks")

            # get the bricks for the volume after expanding
            bricks_list_after_expanding = get_all_bricks(self.mnode,
                                                         volume)
            g.log.info("Brick List after expanding "
                       "volume: %s", bricks_list_after_expanding)

            # validate the bricks present in volume info
            # with glustershd server volume file after adding bricks
            g.log.info("Starting parsing file %s", self.GLUSTERSHD)
            ret = do_bricks_exist_in_shd_volfile(
                self.mnode, volume, bricks_list_after_expanding)
            self.assertTrue(ret, ("Brick List from volume info is "
                                  "different from glustershd server "
                                  "volume file after expanding bricks. "
                                  "Please check log file for details"))
            g.log.info("Brick List from volume info is same as from "
                       "glustershd server volume file after "
                       "expanding bricks.")

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, glustershd_pids_after_adding_bricks = \
        get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret,
                    ("Either no self heal daemon process found or "
                     "more than one self heal daemon process "
                     "found : %s" % glustershd_pids_after_adding_bricks))
    g.log.info("Successful in getting single self heal daemon process"
               " on all nodes %s", nodes)

    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_adding_bricks,
                        "Self Heal Daemon process is same before and"
                        " after adding bricks")
    g.log.info("Self Heal Daemon Process is different before and "
               "after adding bricks")
def test_no_glustershd_with_distribute(self):
    """
    Test script to verify that glustershd does not run for
    distribute-only volumes

    * Create multiple volumes and start all volumes
    * Check the glustershd processes - only one glustershd should be listed
    * Stop all volumes
    * Check the glustershd processes - no glustershd should be running
    * Start the distribute volume only
    * Check the glustershd processes - no glustershd should be running
    """
    nodes = self.servers

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting single self heal daemon process"
               " on all nodes %s", nodes)

    # stop all the volumes
    g.log.info("Going to stop all the volumes")
    volume_list = get_volume_list(self.mnode)
    for volume in volume_list:
        g.log.info("Stopping Volume : %s", volume)
        ret = volume_stop(self.mnode, volume)
        self.assertTrue(ret, ("Failed to stop volume %s" % volume))
        g.log.info("Successfully stopped volume %s", volume)
    g.log.info("Successfully stopped all the volumes")

    # check the self-heal daemon process after stopping all volumes
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertFalse(ret, ("Self heal daemon process is still running "
                           "after stopping all volumes"))
    for node in pids:
        self.assertEqual(pids[node][0], -1,
                         ("Self heal daemon is still running on node "
                          "%s even after stopping all volumes" % node))
    g.log.info("EXPECTED: No self heal daemon process is "
               "running after stopping all volumes")

    # start the distribute volume only
    for volume in volume_list:
        volume_type_info = get_volume_type_info(self.mnode, volume)
        volume_type = (volume_type_info['volume_type_info']['typeStr'])
        if volume_type == 'Distribute':
            g.log.info("Starting the distribute volume: %s", volume)
            ret = volume_start(self.mnode, volume)
            self.assertTrue(ret, ("Failed to start volume %s" % volume))
            g.log.info("Successfully started volume %s", volume)
            break

    # check the self-heal daemon process after starting distribute volume
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertFalse(ret, ("Self heal daemon process is running "
                           "after starting the distribute volume"))
    for node in pids:
        self.assertEqual(pids[node][0], -1,
                         ("Self heal daemon is running on node %s even "
                          "after starting only the distribute volume"
                          % node))
    g.log.info("EXPECTED: No self heal daemon process is running "
               "after starting only the distribute volume")
def validate_xattr_values(self, dirname, ctime=True):
    """Validate existence and consistency of a specific xattr value
    across a replica set.

    Args:
        dirname (str): parent directory name
    Kwargs:
        ctime (bool): ctime feature enablement
    """
    # pylint: disable=too-many-branches
    # Fetch all replica sets (subvols) in the volume
    ret = get_subvols(self.mnode, self.volname)
    # Iterate through each subvol (replica set)
    for subvol in ret['volume_subvols']:
        brick_host_list = {}  # Dict for storing host, brickpath pairs
        for each in subvol:  # Fetch each replica in the replica set
            # Split into host, brickpath pairs
            host, brick_path = each.split(':')
            brick_host_list[host] = brick_path
        # Fetch complete parent directory path
        directory = brick_path + '/' + dirname
        # Fetch all entries recursively in the replica set
        entry_list = get_dir_contents(host, directory, recursive=True)
        for each in entry_list:
            xattr_value = []  # list to store xattr values
            # Logic to get xattr values
            for host, brickpath in brick_host_list.items():
                # Remove the prefix brick_path from the entry name
                each = sub(brick_path, '', each)
                # Add the right brickpath name for fetching the xattr
                brick_entry_path = brickpath + each
                ret = get_extended_attributes_info(host,
                                                   [brick_entry_path],
                                                   encoding='hex',
                                                   attr_name='trusted'
                                                   '.glusterfs.'
                                                   'mdata')
                if ret:
                    ret = ret[brick_entry_path]['trusted.glusterfs.mdata']
                    g.log.info("mdata xattr value of %s is %s",
                               brick_entry_path, ret)
                else:
                    # Normalise a missing/failed lookup to None so the
                    # assertions below behave as intended
                    ret = None
                if ctime:
                    self.assertIsNotNone(ret, "glusterfs.mdata not set on"
                                         " {}".format(brick_entry_path))
                    g.log.info("mdata xattr %s is set on the back-end"
                               " bricks", ret)
                else:
                    self.assertIsNone(ret, "trusted.glusterfs.mdata seen"
                                      " on {}".format(brick_entry_path))
                    g.log.info("mdata xattr %s is not set on the "
                               "back-end bricks", ret)
                xattr_value.append(ret)
            voltype = get_volume_type_info(self.mnode, self.volname)
            if voltype['volume_type_info']['arbiterCount'] == '0':
                # Pure replica: the value must be identical on all bricks
                ret = bool(xattr_value.count(xattr_value[0]) ==
                           len(xattr_value))
            elif voltype['volume_type_info']['arbiterCount'] == '1':
                # Arbiter volume: a majority of the bricks must agree
                ret = bool(xattr_value.count(xattr_value[0]) > 1 or
                           xattr_value.count(xattr_value[1]) > 1)
            else:
                g.log.error("Arbiter value is neither 0 nor 1")

            if ctime:
                self.assertTrue(ret, 'trusted.glusterfs.mdata value not '
                                'same across bricks for entry ' + each)
            else:
                self.assertTrue(ret, 'trusted.glusterfs.mdata seems to '
                                'be set on some bricks for ' + each)
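# Usage sketch (illustrative, not part of the test class): how a ctime
# test might drive validate_xattr_values above. The method name, the
# directory name and the mkdir command are assumptions for illustration;
# g.run, self.mounts and the assertion style follow the patterns already
# used in this module.
def example_check_ctime_xattrs(self):
    """Illustrative only: create a directory and validate mdata xattrs."""
    mount_obj = self.mounts[0]
    dirname = 'ctime_dir'

    # Create a directory from the mount point so that, with the ctime
    # feature enabled, every replica records a trusted.glusterfs.mdata
    # xattr for it.
    ret, _, _ = g.run(mount_obj.client_system,
                      'mkdir -p %s/%s' % (mount_obj.mountpoint, dirname))
    self.assertEqual(ret, 0, 'Failed to create directory %s' % dirname)

    # The xattr must exist and be identical across the data bricks of
    # each replica set.
    self.validate_xattr_values(dirname, ctime=True)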