def test_server_side_healing_happens_only_when_glustershd_running(self):
    """
    Test script which verifies that server-side healing happens only if
    the self-heal daemon is running on the node where the source brick
    resides.

    * Create and start the Replicate volume
    * Check the glustershd processes - only one glustershd should be listed
    * Bring down the bricks without affecting the cluster
    * Create files on the volume
    * Kill the glustershd on the nodes where the source bricks are running
    * Bring up the bricks which were killed in the previous steps
    * Check the heal info - it must show pending heals, and no heal should
      happen since glustershd is down on the source nodes
    * Issue heal
    * Trigger client-side heal
    * Heal should complete successfully
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-lines

    # Set volume options
    options = {"metadata-self-heal": "on",
               "entry-self-heal": "on",
               "data-self-heal": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Check the self-heal daemon process
    ret, pids = get_self_heal_daemon_pid(self.servers)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in verifying self heal daemon process "
               "on all nodes %s", self.servers)

    # Select the bricks to bring offline
    bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
        self.mnode, self.volname))
    g.log.info("Brick list to bring offline : %s", bricks_to_bring_offline)

    # Bring down the selected bricks
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Brought down the brick process for %s",
               bricks_to_bring_offline)

    # Write files on all mounts
    all_mounts_procs, num_files_to_write = [], 100
    for mount_obj in self.mounts:
        cmd = ("/usr/bin/env python %s create_files "
               "-f %s --base-file-name file %s" % (self.script_upload_path,
                                                   num_files_to_write,
                                                   mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # Get the list of online bricks
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    g.log.info("Online bricks for volume %s : %s",
               self.volname, online_bricks)

    # Get the nodes where the online bricks are running
    bring_offline_glustershd_nodes = []
    for brick in online_bricks:
        bring_offline_glustershd_nodes.append(brick.split(":")[0])
    g.log.info("Self heal daemon on nodes %s to be killed",
               bring_offline_glustershd_nodes)

    # Kill the self heal daemon process on those nodes
    ret = bring_self_heal_daemon_process_offline(
        bring_offline_glustershd_nodes)
    self.assertTrue(ret, ("Unable to bring self heal daemon process "
                          "offline for nodes %s"
                          % bring_offline_glustershd_nodes))
    g.log.info("Successfully brought down self heal process for "
               "nodes %s", bring_offline_glustershd_nodes)

    # Check the heal info
    heal_info = get_heal_info_summary(self.mnode, self.volname)
    g.log.info("Successfully got heal info %s for the volume %s",
               heal_info, self.volname)

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline, 'glusterd_restart')
    self.assertTrue(ret, ("Failed to bring bricks: %s online"
                          % bricks_to_bring_offline))

    # Issue heal
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertFalse(ret, ("Able to trigger heal on volume %s where "
                           "self heal daemon is not running"
                           % self.volname))
    g.log.info("Expected : Unable to trigger heal on volume %s where "
               "self heal daemon is not running", self.volname)

    # Wait for 130 seconds for heal
    ret = monitor_heal_completion(self.mnode, self.volname, 130)
    self.assertFalse(ret, "Heal completed on volume %s" % self.volname)
    g.log.info("Expected : Heal pending on volume %s", self.volname)

    # Check the heal info
    heal_info_after_triggering_heal = get_heal_info_summary(self.mnode,
                                                            self.volname)
    g.log.info("Successfully got heal info for the volume %s",
               self.volname)

    # Compare the pending heal count with the number of files written
    for node in online_bricks:
        self.assertGreaterEqual(
            int(heal_info_after_triggering_heal[node]['numberOfEntries']),
            num_files_to_write,
            ("Some of the files are healed from source bricks %s where "
             "self heal daemon is not running" % node))
    g.log.info("EXPECTED: No files are healed from source bricks where "
               "self heal daemon is not running")

    # Unmount and mount the volume again, as the volume options were set
    # after mounting the volume
    for mount_obj in self.mounts:
        ret, _, _ = umount_volume(mount_obj.client_system,
                                  mount_obj.mountpoint)
        self.assertEqual(ret, 0, "Failed to unmount %s"
                         % mount_obj.client_system)
        ret, _, _ = mount_volume(self.volname, mtype='glusterfs',
                                 mpoint=mount_obj.mountpoint,
                                 mserver=self.mnode,
                                 mclient=mount_obj.client_system)
        self.assertEqual(ret, 0, "Failed to mount %s"
                         % mount_obj.client_system)

    # Trigger client-side heal by reading the files from the mounts
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("/usr/bin/env python %s read %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "Reads failed on some of the clients")
    g.log.info("Reads successful on all mounts")

    # Wait for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Unable to heal the pending entries")
    g.log.info("Successfully healed the pending entries for volume %s",
               self.volname)
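
# Illustrative sketch (not part of glustolibs; the pgrep pattern is an
# assumption about the daemon's command line): one way to double-check that
# glustershd is really down on the source-brick nodes before asserting that
# heal stays pending, using the same glusto g.run() transport as the tests.
def _is_glustershd_down_on_nodes(nodes):
    """Return True when no glustershd process is found on any given node."""
    for node in nodes:
        # The [g] trick stops pgrep -f from matching this command line itself.
        ret, out, _ = g.run(node, "pgrep -f '[g]lustershd' | wc -l")
        if ret != 0 or int(out.strip()) != 0:
            return False
    return True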
def test_glustershd_with_restarting_glusterd(self):
    """
    Test script to verify the self heal daemon process with restarting
    glusterd and rebooting the server

    * Stop all volumes
    * Restart glusterd - the self heal daemon process should not run
    * Start the replicate volumes involved
    * A single self heal daemon process should be running
    * Restart glusterd
    * The self heal daemon pid should change
    * Bring down a brick and restart glusterd
    * The self heal daemon pid should change again and differ from the
      previous one
    * Bring the brick back up
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers']

    # Stop the volume
    g.log.info("Stopping the volume %s", self.volname)
    ret = volume_stop(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to stop volume %s" % self.volname)
    g.log.info("Successfully stopped volume %s", self.volname)

    # Check the self heal daemon process after stopping the volume
    g.log.info("Verifying the self heal daemon process for "
               "volume %s", self.volname)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self heal daemon process is still running "
                           "even after stopping volume %s" % self.volname))
    g.log.info("Self heal daemon is not running after stopping "
               "volume %s", self.volname)

    # Restart the glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, "Failed to restart glusterd on all nodes %s"
                    % nodes)
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)
    self.assertTrue(
        wait_for_glusterd_to_start(self.servers),
        "Failed to start glusterd on %s" % self.servers)

    # Check the self heal daemon process after restarting glusterd
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self heal daemon process is running after "
                           "glusterd restart with volume %s in "
                           "stopped state" % self.volname))
    g.log.info("Self heal daemon is not running after stopping "
               "volume and restarting glusterd %s", self.volname)

    # Start the volume
    g.log.info("Starting the volume %s", self.volname)
    ret = volume_start(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to start volume %s" % self.volname)
    g.log.info("Volume %s started successfully", self.volname)

    # Verify that the glustershd process releases its parent process
    g.log.info("Checking whether the glustershd process is daemonized")
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    g.log.info("Single self heal daemon process on all nodes %s", nodes)

    # Get the self heal daemon pids after starting the volume
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    g.log.info("Successful in getting self heal daemon pids")
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list : %s", bricks_list)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file
    g.log.info("Starting to parse file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick list from volume info is different from "
                          "glustershd server volume file. "
                          "Please check log file for details."))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # Restart the glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, "Failed to restart glusterd on all nodes %s"
                    % nodes)
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify that all volume processes are online, waiting up to 60 sec
    g.log.info("Verifying that all volume processes are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All processes are not online"
                          % self.volname))
    g.log.info("Successfully verified that volume %s processes are "
               "online", self.volname)

    # Verify that the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self heal daemon process after starting the volume and
    # restarting the glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    glustershd_pids_after_glusterd_restart = pids

    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_glusterd_restart,
                        ("Self heal daemon pids are the same after "
                         "restarting the glusterd process"))
    g.log.info("Self heal daemon processes are different before and "
               "after restarting the glusterd process")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring bricks offline
    g.log.info("Going to bring down the brick process "
               "for %s", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                          "check the log file for more details."))
    g.log.info("Brought down the brick process "
               "for %s successfully", bricks_to_bring_offline)

    # Restart glusterd after bringing down the brick
    g.log.info("Restart glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, "Failed to restart glusterd on all nodes %s"
                    % nodes)
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify that all volume processes are online, waiting up to 60 sec
    g.log.info("Verifying that all volume processes are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All processes are not online"
                          % self.volname))
    g.log.info("Successfully verified that volume %s processes are "
               "online", self.volname)

    # Verify that the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self heal daemon process after killing the brick and
    # restarting the glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    glustershd_pids_after_killing_brick = pids

    self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                        glustershd_pids_after_killing_brick,
                        ("Self heal daemon process is the same before and "
                         "after killing the brick and restarting the "
                         "glusterd process"))
    g.log.info("Self heal daemon processes are different after killing "
               "the brick and restarting the glusterd process")

    # Bring the bricks back online
    g.log.info("Bringing up the bricks : %s online",
               bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring the bricks online")
    g.log.info("Successfully brought the bricks online")

    # Check that all bricks are online
    g.log.info("Verifying that all bricks are online")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, "Not all bricks are online")
    g.log.info("All bricks are online.")
def test_glustershd_on_all_volume_types(self):
    """
    Test script to verify that the glustershd server vol file has
    entries only for replicate volumes

    * Create multiple volumes and start all volumes
    * Check the glustershd processes - only one glustershd should be
      listed
    * Check the glustershd server vol file - it should contain entries
      only for the replicate volumes involved
    * Add bricks to the replicate volume - it should convert to
      distributed-replicate
    * Check the glustershd server vol file - the newly added bricks
      should be present
    * Check the glustershd processes - only one glustershd should be
      listed
    """
    # pylint: disable=too-many-statements
    nodes = self.servers

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, glustershd_pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % glustershd_pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)

    # For all the volumes, check whether the bricks are present in the
    # glustershd server vol file
    volume_list = get_volume_list(self.mnode)
    for volume in volume_list:
        g.log.info("Volume Name: %s", volume)
        volume_type_info = get_volume_type_info(self.mnode, volume)
        volume_type = (volume_type_info['volume_type_info']['typeStr'])

        # Get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", volume)
        bricks_list = get_all_bricks(self.mnode, volume)
        g.log.info("Brick list : %s", bricks_list)

        # Validate the bricks present in the volume info against the
        # glustershd server volume file
        g.log.info("Start parsing file %s on "
                   "node %s", self.GLUSTERSHD, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                             bricks_list)
        if volume_type == 'Distribute':
            self.assertFalse(ret, ("Bricks exist in glustershd server "
                                   "volume file for %s volume"
                                   % volume_type))
            g.log.info("EXPECTED : Bricks don't exist in glustershd "
                       "server volume file for %s volume", volume_type)
        else:
            self.assertTrue(ret, ("Brick list from volume info is "
                                  "different from glustershd server "
                                  "volume file. Please check log "
                                  "file for details"))
            g.log.info("Bricks exist in glustershd server volume file "
                       "for %s volume", volume_type)

    # Expand the replicate volume
    for volume in volume_list:
        volume_type_info = get_volume_type_info(self.mnode, volume)
        volume_type = (volume_type_info['volume_type_info']['typeStr'])
        if volume_type == 'Replicate':
            g.log.info("Start adding bricks to volume %s", volume)
            ret = expand_volume(self.mnode, volume, self.servers,
                                self.all_servers_info)
            self.assertTrue(ret, "Failed to add bricks to volume %s"
                            % volume)
            g.log.info("Add brick successful")

            # Log volume info and status after expanding the volume
            g.log.info("Logging volume info and status after "
                       "expanding volume")
            ret = log_volume_info_and_status(self.mnode, volume)
            self.assertTrue(ret, ("Logging volume info and status "
                                  "failed on volume %s" % volume))
            g.log.info("Successful in logging volume info and status "
                       "of volume %s", volume)

            # Verify that all volume processes are online, waiting up
            # to 60 sec
            g.log.info("Verifying that all volume processes are online")
            ret = wait_for_volume_process_to_be_online(
                self.mnode, volume, 60)
            self.assertTrue(ret, ("Volume %s : All processes are not "
                                  "online" % volume))
            g.log.info("Successfully verified that volume %s processes "
                       "are online", volume)

            # Check the type of the replicate volume after adding bricks
            volume_type_info_for_replicate_after_adding_bricks = \
                get_volume_type_info(self.mnode, volume)
            volume_type_for_replicate_after_adding_bricks = \
                (volume_type_info_for_replicate_after_adding_bricks
                 ['volume_type_info']['typeStr'])

            self.assertEqual(
                volume_type_for_replicate_after_adding_bricks,
                'Distributed-Replicate',
                ("Replicate volume type is not converted to "
                 "Distributed-Replicate after adding bricks"))
            g.log.info("Replicate volume is successfully converted to "
                       "Distributed-Replicate after adding bricks")

            # Get the bricks for the volume after expanding
            bricks_list_after_expanding = get_all_bricks(self.mnode,
                                                         volume)
            g.log.info("Brick list after expanding "
                       "volume: %s", bricks_list_after_expanding)

            # Validate the bricks present in the volume info against the
            # glustershd server volume file after adding bricks
            g.log.info("Starting to parse file %s", self.GLUSTERSHD)
            ret = do_bricks_exist_in_shd_volfile(
                self.mnode, volume, bricks_list_after_expanding)
            self.assertTrue(ret, ("Brick list from volume info is "
                                  "different from glustershd server "
                                  "volume file after expanding bricks. "
                                  "Please check log file for details"))
            g.log.info("Brick list from volume info is the same as in "
                       "the glustershd server volume file after "
                       "expanding bricks.")

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, glustershd_pids_after_adding_bricks = \
        get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s"
                          % glustershd_pids_after_adding_bricks))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)

    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_adding_bricks,
                        "Self heal daemon process is the same before "
                        "and after adding bricks")
    g.log.info("Self heal daemon process is different before and "
               "after adding bricks")
def test_impact_of_replace_brick_for_glustershd(self):
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    nodes = self.volume['servers']
    replaced_bricks = []

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list : %s", bricks_list)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file
    g.log.info("Starting to parse file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # Get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s: %s", self.volname,
               num_subvols)

    # Replace a brick from each sub-volume
    for i in range(0, num_subvols):
        subvol_brick_list = subvols_dict['volume_subvols'][i]
        g.log.info("Sub-volume %s brick list : %s", i, subvol_brick_list)
        brick_to_replace = subvol_brick_list[-1]
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume : %s",
                   brick_to_replace, self.volname)
        ret, _, err = replace_brick(self.mnode, self.volname,
                                    brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s with %s successfully',
                   brick_to_replace, new_brick)
        replaced_bricks.append(brick_to_replace)

    # Verify that all volume processes are online, waiting up to 60 sec
    g.log.info("Verifying that all volume processes are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    self.assertTrue(ret, ("Volume %s : All processes are not online"
                          % self.volname))
    g.log.info("Successfully verified that volume %s processes are "
               "online", self.volname)

    # Verify that the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on nodes "
               "%s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare the pids before and after replacing
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_replacement,
                        "Self heal daemon process is the same before and "
                        "after replacing bricks")
    g.log.info("Self heal daemon process is different before and "
               "after replacing bricks")

    # Get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after replacing "
               "bricks: %s", bricks_list_after_replacing)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file after replacing bricks
    g.log.info("Starting to parse file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file after "
                          "replacing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)
    g.log.info("Starting to delete the replaced brick directories")

    # Remove the brick directories of the replaced bricks, as this is
    # not handled by the tearDown class
    for brick in replaced_bricks:
        node, brick_path = brick.split(r':')
        cmd = "rm -rf " + brick_path
        ret, _, _ = g.run(node, cmd)
        if ret:
            raise ExecutionError("Failed to delete the brick dir for "
                                 "node %s and brick %s"
                                 % (node, brick_path))
    g.log.info("Successfully deleted the brick dirs of replaced bricks")
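
# Illustrative note (an assumption about what the library wraps, shown for
# context only): replace_brick() above corresponds to the gluster CLI
# command
#
#   gluster volume replace-brick <volname> <src_brick> <dst_brick> \
#       commit force
#
# which is the only replace-brick mode supported on current GlusterFS
# releases. A minimal sketch issuing it directly through glusto:
def _replace_brick_via_cli(mnode, volname, src_brick, dst_brick):
    """Replace src_brick with dst_brick via the gluster CLI."""
    cmd = ("gluster volume replace-brick %s %s %s commit force"
           % (volname, src_brick, dst_brick))
    ret, _, err = g.run(mnode, cmd)
    return ret == 0, err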
def test_glustershd_with_add_remove_brick(self):
    """
    Test script to verify the glustershd process when adding and
    removing bricks

    * Check the glustershd process - only one glustershd process should
      be running
    * The bricks must be present in the glustershd-server.vol file for
      the replicate volumes involved
    * Add bricks
    * Check the glustershd process - only one glustershd process should
      be running, and it should be different from the previous one
    * The bricks which were added must be present in the
      glustershd-server.vol file
    * Remove bricks
    * Check the glustershd process - only one glustershd process should
      be running, and it should be different from the previous one
    * The bricks which were removed should not be present in the
      glustershd-server.vol file
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers']

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list : %s", bricks_list)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file
    g.log.info("Starting to parse file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # Expand the volume
    g.log.info("Start adding bricks to volume %s", self.volname)
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to add bricks to volume %s"
                    % self.volname)
    g.log.info("Add brick successful")

    # Log volume info and status after expanding the volume
    g.log.info("Logging volume info and status after expanding volume")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)

    # Verify that all volume processes are online, waiting up to 60 sec
    g.log.info("Verifying that all volume processes are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All processes are not online"
                          % self.volname))
    g.log.info("Successfully verified that volume %s processes are "
               "online", self.volname)

    # Start rebalance
    g.log.info("Starting rebalance on the volume")
    ret, _, err = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start rebalance on "
                              "the volume %s with error %s"
                              % (self.volname, err)))
    g.log.info("Successfully started rebalance on the "
               "volume %s", self.volname)

    # Log rebalance status
    g.log.info("Log rebalance status")
    _, _, _ = rebalance_status(self.mnode, self.volname)

    # Wait for rebalance to complete
    g.log.info("Waiting for rebalance to complete")
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, ("Rebalance is not yet complete on the "
                          "volume %s" % self.volname))
    g.log.info("Rebalance is successfully complete on the "
               "volume %s", self.volname)

    # Check rebalance status after rebalance is complete
    g.log.info("Checking rebalance status")
    ret, _, _ = rebalance_status(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to get rebalance status for "
                              "the volume %s" % self.volname))
    g.log.info("Successfully got rebalance status of the "
               "volume %s", self.volname)

    # Check the self-heal daemon process after adding bricks
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    g.log.info("Successful in getting self-heal daemon process "
               "on nodes %s", nodes)
    glustershd_pids_after_expanding = pids
    g.log.info("Self heal daemon process IDs after expanding "
               "volume: %s", glustershd_pids_after_expanding)

    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_expanding,
                        "Self heal daemon process is the same before and "
                        "after adding bricks")
    g.log.info("Self heal daemon process is different before and "
               "after adding bricks")

    # Get the bricks for the volume after expanding
    bricks_list_after_expanding = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after expanding "
               "volume: %s", bricks_list_after_expanding)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file after adding bricks
    g.log.info("Starting to parse file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_expanding)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file after "
                          "expanding bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # Shrink the volume
    g.log.info("Starting volume shrink")
    ret = shrink_volume(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to shrink the volume %s" % self.volname)
    g.log.info("Shrinking volume is successful on "
               "volume %s", self.volname)

    # Log volume info and status after shrinking the volume
    g.log.info("Logging volume info and status after shrinking volume")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)

    # Get the bricks after shrinking the volume
    bricks_list_after_shrinking = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after shrinking "
               "volume: %s", bricks_list_after_shrinking)

    self.assertEqual(len(bricks_list_after_shrinking), len(bricks_list),
                     "Brick count is mismatched after "
                     "shrinking the volume %s" % self.volname)
    g.log.info("Brick count matched before expanding "
               "and after shrinking the volume")

    # Verify that the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self-heal daemon process after removing bricks
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))
    glustershd_pids_after_shrinking = pids

    self.assertNotEqual(glustershd_pids_after_expanding,
                        glustershd_pids_after_shrinking,
                        "Self heal daemon process is the same "
                        "after adding bricks and shrinking the volume")
    g.log.info("Self heal daemon process is different after adding "
               "bricks and shrinking the volume")

    # Validate the bricks present in the volume info against the
    # glustershd server volume file after removing bricks
    g.log.info("Starting to parse file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_shrinking)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file after "
                          "removing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)
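
# Illustrative sketch (the helper name is an assumption): after a shrink the
# shd volfile must no longer reference the removed bricks. Set arithmetic on
# the before/after brick lists gives exactly the bricks whose absence could
# be asserted in addition to the full-list check above.
def _bricks_removed_by_shrink(bricks_before, bricks_after):
    """Return the bricks present before shrinking but not after."""
    return sorted(set(bricks_before) - set(bricks_after))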
def test_impact_of_replace_brick_on_glustershd(self):
    """
    Test script to verify that the glustershd server vol file has
    entries only for replicate volumes

    1. Create multiple volumes and start all volumes
    2. Check the glustershd processes - only one glustershd should be
       listed
    3. Do replace brick on the replicate volume
    4. Confirm that the brick is replaced
    5. Check the glustershd processes - only one glustershd should be
       listed, and the pid should be different
    6. The glustershd server vol should be updated with the new bricks
    """
    # Check the self-heal daemon process
    ret, glustershd_pids = get_self_heal_daemon_pid(self.servers)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % glustershd_pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", self.servers)

    volume_list = get_volume_list(self.mnode)
    for volume in volume_list:

        # Log volume info and status before replacing the brick
        ret = log_volume_info_and_status(self.mnode, volume)
        self.assertTrue(ret, ("Logging volume info and status "
                              "failed on volume %s" % volume))
        g.log.info("Successful in logging volume info and status "
                   "of volume %s", volume)

        # Select a random source brick to replace
        src_brick = choice(get_all_bricks(self.mnode, volume))
        src_node, original_brick = src_brick.split(":")

        # Create a random destination brick in such a way that the
        # brick is selected from the same node but is always different
        # from the original brick
        list_of_bricks = [
            brick for brick in get_servers_bricks_dict(
                src_node, self.all_servers_info)[src_node]
            if brick not in original_brick]
        dst_brick = ('{}:{}/{}_replaced'.format(
            src_node, choice(list_of_bricks),
            original_brick.split('/')[::-1][0]))

        # Replace the brick for the volume
        ret, _, _ = replace_brick(self.mnode, volume,
                                  src_brick, dst_brick)
        self.assertFalse(ret, "Failed to replace brick "
                         "from the volume %s" % volume)
        g.log.info("Successfully replaced faulty brick from "
                   "the volume %s", volume)

        # Verify that all volume processes are online
        ret = wait_for_volume_process_to_be_online(self.mnode, volume)
        self.assertTrue(ret, "Volume %s : All processes are not online"
                        % volume)
        g.log.info("Volume %s : All processes are online", volume)

        # Check the self-heal daemon process after replacing the brick
        ret, pid_after_replace = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, "Either no self heal daemon process "
                        "found or more than one self heal "
                        "daemon process found : %s" % pid_after_replace)
        g.log.info("Successful in getting a single self heal daemon "
                   "process on all nodes %s", self.servers)

        # Compare the glustershd pids
        self.assertNotEqual(glustershd_pids, pid_after_replace,
                            "Self heal daemon process should be "
                            "different after replacing bricks in %s "
                            "volume" % volume)
        g.log.info("EXPECTED: Self heal daemon process should be "
                   "different after replacing bricks in a replicate "
                   "volume")

        # Get the bricks for the volume
        bricks_list = get_all_bricks(self.mnode, volume)
        g.log.info("Brick list : %s", bricks_list)

        # Validate the bricks present in the volume info against the
        # glustershd server volume file
        ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                             bricks_list)
        self.assertTrue(ret, ("Brick list from volume info is "
                              "different from glustershd server "
                              "volume file. Please check log file "
                              "for details"))
        g.log.info("Bricks in volume %s exist in the glustershd server "
                   "volume file", volume)
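
# Illustrative example (the brick paths below are hypothetical values, not
# taken from any test run): the dst_brick construction above keeps the leaf
# directory name of the source brick, appends a "_replaced" suffix, and
# places it on a different brick mount of the same node.
def _example_dst_brick():
    """Show what the dst_brick format string above produces."""
    src_node, original_brick = \
        "server1:/bricks/brick1/vol0_brick0".split(":")
    chosen_mount = "/bricks/brick3"  # an assumed pick from list_of_bricks
    dst_brick = '{}:{}/{}_replaced'.format(
        src_node, chosen_mount, original_brick.split('/')[::-1][0])
    assert dst_brick == "server1:/bricks/brick3/vol0_brick0_replaced"
    return dst_brick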
def test_existing_glustershd_should_take_care_of_self_healing(self):
    """
    Test script which verifies that the existing glustershd should take
    care of self healing

    * Create and start the Replicate volume
    * Check the glustershd processes - note the pids
    * Bring down one brick (say brick1) without affecting the cluster
    * Create 1000 files on the volume
    * Bring up brick1, which was killed in the previous steps
    * Check the heal info - proactive self healing should start
    * Bring down brick1 again
    * Wait for 60 sec and bring up brick1
    * Check the glustershd processes - the pids should be different
    * Monitor the heal till it is complete
    """
    # pylint: disable=too-many-locals,too-many-lines,too-many-statements
    nodes = self.servers

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    glustershd_pids = pids

    # Select the bricks to bring offline
    g.log.info("Selecting bricks to bring offline for volume %s",
               self.volname)
    bricks_to_bring_offline = \
        select_volume_bricks_to_bring_offline(self.mnode, self.volname)
    g.log.info("Brick list to bring offline : %s",
               bricks_to_bring_offline)

    # Bring down the selected bricks
    g.log.info("Going to bring down the brick process "
               "for %s", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                          "check the log file for more details."))
    g.log.info("Brought down the brick process "
               "for %s successfully", bricks_to_bring_offline)

    # Get the bricks which are running
    g.log.info("Getting the list of bricks which are online")
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    g.log.info("Online bricks for volume %s : %s",
               self.volname, online_bricks)

    # Write 1000 files of 1MB each to the mount
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mounts)
    all_mounts_procs = []
    cmd = ("for i in `seq 1 1000`; "
           "do dd if=/dev/urandom of=%s/file_$i "
           "bs=1M count=1; "
           "done" % self.mounts[0].mountpoint)
    g.log.info(cmd)
    proc = g.run_async(self.mounts[0].client_system, cmd,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Check the heal info
    g.log.info("Get the pending heal info for the volume %s",
               self.volname)
    heal_info = get_heal_info_summary(self.mnode, self.volname)
    g.log.info("Successfully got heal info for the volume %s",
               self.volname)
    g.log.info("Heal info for volume %s : %s", self.volname, heal_info)

    # Bring the bricks online
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline,
                              'glusterd_restart')
    self.assertTrue(ret, ("Failed to bring bricks: %s online"
                          % bricks_to_bring_offline))
    g.log.info("Successfully brought all bricks: %s online",
               bricks_to_bring_offline)

    # Wait for 90 sec for self healing to start
    g.log.info('Waiting for 90 sec for self healing to start')
    time.sleep(90)

    # Check the heal info
    g.log.info("Get the pending heal info for the volume %s",
               self.volname)
    heal_info_after_brick_online = get_heal_info_summary(self.mnode,
                                                         self.volname)
    g.log.info("Successfully got heal info for the volume %s",
               self.volname)
    g.log.info("Heal info for volume %s : %s",
               self.volname, heal_info_after_brick_online)

    # Check that the pending heal count has decreased
    flag = False
    for brick in online_bricks:
        if int(heal_info_after_brick_online[brick]['numberOfEntries']) \
                < int(heal_info[brick]['numberOfEntries']):
            flag = True
            break
    self.assertTrue(flag, "Proactive self heal has not started")
    g.log.info("Proactive self heal has started")

    # Bring down the bricks again
    g.log.info("Going to bring down the brick process "
               "for %s", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                          "check the log file for more details."))
    g.log.info("Brought down the brick process "
               "for %s successfully", bricks_to_bring_offline)

    # Wait for 60 sec and bring up the bricks again
    g.log.info('Waiting for 60 sec before bringing up the bricks again')
    time.sleep(60)
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline,
                              'glusterd_restart')
    self.assertTrue(ret, ("Failed to bring bricks: %s online"
                          % bricks_to_bring_offline))
    g.log.info("Successfully brought all bricks: %s online",
               bricks_to_bring_offline)

    # Verify that the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    shd_pids_after_bricks_online = pids

    # Compare the glustershd pids
    self.assertNotEqual(glustershd_pids, shd_pids_after_bricks_online,
                        ("Self heal daemon processes are the same before "
                         "and after bringing the bricks online"))
    g.log.info("EXPECTED : Self heal daemon processes are different "
               "before and after bringing the bricks online")

    # Wait for heal to complete
    g.log.info("Monitoring the heal.....")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Heal is not completed on volume %s"
                    % self.volname)
    g.log.info("Heal completed on volume %s", self.volname)

    # Check if heal is complete
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')
def test_glustershd_on_newly_probed_server(self):
    """
    Test script to verify the glustershd process on a newly probed server

    * Check the glustershd process - only one glustershd process should
      be running
    * Add a new node to the cluster
    * Check the glustershd process - only one glustershd process should
      be running on all servers, including the newly probed server
    * Stop the volume
    * Add another node to the cluster
    * Check the glustershd process - no glustershd process should be
      running on any server, including the newly probed ones
    * Start the volume
    * Check the glustershd process - only one glustershd process should
      be running on all servers, including the newly probed servers
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers'][:-2]

    # Check the self-heal daemon process
    g.log.info("Starting to get self heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)

    # Add a new node to the cluster
    g.log.info("Peer probe for %s", self.extra_servers[0])
    ret = peer_probe_servers(self.mnode, self.extra_servers[0])
    self.assertTrue(ret, "Failed to peer probe server : %s"
                    % self.extra_servers[0])
    g.log.info("Peer probe success for %s and all peers are in "
               "connected state", self.extra_servers[0])
    nodes.append(self.extra_servers[0])

    # Check the self-heal daemon process; it should also be running on
    # the newly probed server
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)

    # Stop the volume
    g.log.info("Stopping the volume %s", self.volname)
    ret = volume_stop(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to stop volume %s" % self.volname)
    g.log.info("Successfully stopped volume %s", self.volname)

    # Add another new node to the cluster
    g.log.info("Peer probe for %s", self.extra_servers[1])
    ret = peer_probe_servers(self.mnode, self.extra_servers[1])
    self.assertTrue(ret, "Failed to peer probe server : %s"
                    % self.extra_servers[1])
    g.log.info("Peer probe success for %s and all peers are in "
               "connected state", self.extra_servers[1])
    nodes.append(self.extra_servers[1])

    # Check the self-heal daemon process after stopping the volume; no
    # self heal daemon should be running, including on the newly probed
    # node
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertFalse(ret, ("Self heal daemon process is running even "
                           "after stopping volume %s" % self.volname))
    for node in pids:
        self.assertEqual(pids[node][0], -1, ("Self heal daemon is still "
                                             "running on node %s even "
                                             "after stopping all "
                                             "volumes" % node))
    g.log.info("Expected : No self heal daemon process is running "
               "after stopping all volumes")

    # Start the volume
    g.log.info("Starting volume %s", self.volname)
    ret = volume_start(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to start volume %s" % self.volname)
    g.log.info("Volume %s started successfully", self.volname)

    # Verify that all volume processes are online, waiting up to 60 sec
    g.log.info("Verifying that all volume processes are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All processes are not online"
                          % self.volname))
    g.log.info("Successfully verified that volume %s processes are "
               "online", self.volname)

    # Verify that the glustershd process releases its parent process
    g.log.info("Verifying that the self heal daemon process is "
               "daemonized")
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)

    # Detach the extra servers from the cluster
    g.log.info("Peer detaching extra servers %s from the cluster",
               self.extra_servers)
    ret = peer_detach_servers(self.mnode, self.extra_servers)
    self.assertTrue(ret, "Failed to peer detach extra servers : %s"
                    % self.extra_servers)
    g.log.info("Peer detach success for %s", self.extra_servers)
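
# Illustrative sketch (the -1 sentinel is taken from the assertions above;
# the helper itself is an assumption): get_self_heal_daemon_pid() reports -1
# for a node with no running glustershd, so "no shd running anywhere" can be
# checked in a single expression instead of a loop of assertEqual calls.
def _no_shd_running(pids):
    """Return True if every node reports the -1 'not running' sentinel."""
    return all(node_pids[0] == -1 for node_pids in pids.values())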
def test_no_glustershd_with_distribute(self):
    """
    Test script to verify that the glustershd server vol file has
    entries only for replicate volumes

    * Create multiple volumes and start all volumes
    * Check the glustershd processes - only one glustershd should be
      listed
    * Stop all volumes
    * Check the glustershd processes - no glustershd should be running
    * Start the distribute volume only
    * Check the glustershd processes - no glustershd should be running
    """
    nodes = self.servers

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)

    # Stop all the volumes
    g.log.info("Going to stop all the volumes")
    volume_list = get_volume_list(self.mnode)
    for volume in volume_list:
        g.log.info("Stopping Volume : %s", volume)
        ret = volume_stop(self.mnode, volume)
        self.assertTrue(ret, "Failed to stop volume %s" % volume)
        g.log.info("Successfully stopped volume %s", volume)
    g.log.info("Successfully stopped all the volumes")

    # Check the self-heal daemon process after stopping all volumes
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertFalse(ret, ("Self heal daemon process is still running "
                           "after stopping all volumes"))
    for node in pids:
        self.assertEqual(pids[node][0], -1, ("Self heal daemon is still "
                                             "running on node %s even "
                                             "after stopping all "
                                             "volumes" % node))
    g.log.info("EXPECTED: No self heal daemon process is "
               "running after stopping all volumes")

    # Start the distribute volume only
    for volume in volume_list:
        volume_type_info = get_volume_type_info(self.mnode, volume)
        volume_type = (volume_type_info['volume_type_info']['typeStr'])
        if volume_type == 'Distribute':
            g.log.info("Starting the distribute volume: %s", volume)
            ret = volume_start(self.mnode, volume)
            self.assertTrue(ret, "Failed to start volume %s" % volume)
            g.log.info("Successfully started volume %s", volume)
            break

    # Check the self-heal daemon process after starting the distribute
    # volume
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertFalse(ret, ("Self heal daemon process is running even "
                           "though only the distribute volume was "
                           "started"))
    for node in pids:
        self.assertEqual(pids[node][0], -1, ("Self heal daemon is still "
                                             "running on node %s even "
                                             "though only the distribute "
                                             "volume was started" % node))
    g.log.info("EXPECTED: No self heal daemon process is running "
               "after starting only the distribute volume")
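
# Illustrative sketch (the helper name is an assumption): the volume-type
# filtering used above can be factored into a small helper built on the same
# glustolibs calls, e.g. to fetch all pure-distribute volumes at once.
def _volumes_of_type(mnode, type_str):
    """Return volumes whose typeStr matches type_str (e.g. 'Distribute')."""
    matched = []
    for volume in get_volume_list(mnode):
        info = get_volume_type_info(mnode, volume)
        if info['volume_type_info']['typeStr'] == type_str:
            matched.append(volume)
    return matched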
def test_impact_of_replace_brick_for_glustershd(self):
    nodes = self.volume['servers']

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list : %s", bricks_list)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file
    g.log.info("Starting to parse file %s on "
               "node %s", self.GLUSTERSHD, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.GLUSTERSHD)

    # Replace a brick
    brick_to_replace = bricks_list[-1]
    new_brick = brick_to_replace + 'new'
    g.log.info("Replacing the brick %s for the volume : %s",
               brick_to_replace, self.volname)
    ret, _, err = replace_brick(self.mnode, self.volname,
                                brick_to_replace, new_brick)
    self.assertFalse(ret, err)
    g.log.info('Replaced brick %s with %s successfully',
               brick_to_replace, new_brick)

    # Check the bricks
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertEqual(bricks_list[-1], new_brick,
                     'Replaced brick and new brick are not equal')

    # Verify that all volume processes are online, waiting up to 60 sec
    g.log.info("Verifying that all volume processes are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    self.assertTrue(ret, ("Volume %s : All processes are not online"
                          % self.volname))
    g.log.info("Successfully verified that volume %s processes are "
               "online", self.volname)

    # Verify that the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process "
               "on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare the pids before and after replacing
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_replacement,
                        "Self heal daemon process is the same before and "
                        "after replacing bricks")
    g.log.info("Self heal daemon process is different before and "
               "after replacing bricks")

    # Get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after replacing "
               "the brick: %s", bricks_list_after_replacing)

    # Validate the bricks present in the volume info against the
    # glustershd server volume file after replacing bricks
    g.log.info("Starting to parse file %s", self.GLUSTERSHD)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick list from volume info is different "
                          "from glustershd server volume file after "
                          "replacing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.GLUSTERSHD)