def test_glusterd_replace_brick(self):
    """
    Create a volume and start it.
    - Get the list of all bricks which are online
    - Select a brick randomly from the online bricks
    - Form a non-existing brick path on the node where the brick
      has to be replaced
    - Perform replace brick with it and it should fail
    - Form a new brick with a valid brick path; replace brick
      should succeed
    """
    # pylint: disable=too-many-function-args
    # Getting all the bricks which are online
    bricks_online = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks_online, "Unable to get the online bricks")
    g.log.info("Got the brick list from the volume")

    # Getting one random brick from the online bricks to be replaced
    brick_to_replace = random.choice(bricks_online)
    g.log.info("Brick to replace %s", brick_to_replace)
    node_for_brick_replace = brick_to_replace.split(':')[0]
    new_brick_to_replace = form_bricks_list(self.mnode, self.volname,
                                            1, node_for_brick_replace,
                                            self.all_servers_info)

    # Performing replace brick with a non-existing brick path
    path = ":/brick/non_existing_path"
    non_existing_path = node_for_brick_replace + path

    # Replace brick for the non-existing path
    ret, _, _ = replace_brick(self.mnode, self.volname,
                              brick_to_replace, non_existing_path)
    self.assertNotEqual(ret, 0, ("Replace brick with commit force"
                                 " on a non-existing brick passed"))
    g.log.info("Replace brick with a non-existing brick with commit "
               "force failed as expected")

    # Calling replace brick by passing brick_to_replace and
    # new_brick_to_replace with a valid brick path
    ret = replace_brick_from_volume(self.mnode, self.volname,
                                    self.servers, self.all_servers_info,
                                    brick_to_replace,
                                    new_brick_to_replace[0],
                                    delete_brick=True)
    self.assertTrue(ret, "Replace brick with commit force failed")

    # Validating whether the replaced brick is online
    halt = 20
    counter = 0
    _rc = False
    g.log.info("Wait for some seconds for the replaced brick "
               "to get online")
    while counter < halt:
        ret = are_bricks_online(self.mnode, self.volname,
                                new_brick_to_replace)
        if not ret:
            g.log.info("The replaced brick isn't online, "
                       "retrying after 2 seconds ...")
            time.sleep(2)
            counter += 2
        else:
            _rc = True
            g.log.info("The replaced brick is online after being replaced")
            break
    if not _rc:
        raise ExecutionError("The replaced brick isn't online")
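# The 20-second polling loop above could be factored into a small reusable
# helper. This is a minimal sketch, reusing the same are_bricks_online() and
# time.sleep() calls already used by the test; the helper name
# wait_for_replaced_bricks_online is hypothetical and not part of glustolibs.
def wait_for_replaced_bricks_online(mnode, volname, bricks,
                                    timeout=20, interval=2):
    """Poll are_bricks_online() until the given bricks come up or the
    timeout (in seconds) expires. Returns True on success, else False."""
    elapsed = 0
    while elapsed < timeout:
        if are_bricks_online(mnode, volname, bricks):
            return True
        time.sleep(interval)
        elapsed += interval
    return False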
def test_replace_brick_quorum(self):
    '''
    -> Create volume
    -> Set quorum type
    -> Set quorum ratio to 95%
    -> Start the volume
    -> Stop the glusterd on one node
    -> Now quorum is in a not-met condition
    -> Check whether all bricks went offline
    -> Perform the replace brick operation
    -> Start glusterd on the same node which was stopped
    -> Check whether all bricks are online
    -> Verify in vol info that the old brick is not replaced
       with the new brick
    '''
    # Forming brick list: 6 bricks for creating the volume, 7th brick for
    # performing the replace brick operation
    brick_list = form_bricks_list(self.mnode, self.volname, 7,
                                  self.servers, self.all_servers_info)

    # Create Volume
    ret, _, _ = volume_create(self.mnode, self.volname,
                              brick_list[0:6], replica_count=3)
    self.assertEqual(ret, 0, "Failed to create volume %s" % self.volname)
    g.log.info("Volume created successfully %s", self.volname)

    # Enabling server quorum
    ret = set_volume_options(self.mnode, self.volname,
                             {'cluster.server-quorum-type': 'server'})
    self.assertTrue(
        ret, "Failed to set server quorum on volume %s" % self.volname)
    g.log.info("Able to set server quorum successfully on volume %s",
               self.volname)

    # Setting quorum ratio in percentage
    ret = set_volume_options(self.mnode, 'all',
                             {'cluster.server-quorum-ratio': '95%'})
    self.assertTrue(
        ret, "Failed to set server quorum ratio on %s" % self.servers)
    g.log.info("Able to set server quorum ratio successfully on %s",
               self.servers)

    # Start the volume
    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
    g.log.info("Volume started successfully %s", self.volname)

    # Stop glusterd on one of the nodes
    random_server = random.choice(self.servers[1:])
    ret = stop_glusterd(random_server)
    self.assertTrue(ret, "Failed to stop glusterd for %s" % random_server)
    g.log.info("Glusterd stopped successfully on server %s", random_server)

    # Checking whether glusterd is running or not
    ret = is_glusterd_running(random_server)
    self.assertEqual(
        ret, 1, "Glusterd is still running on the node %s "
        "where glusterd was stopped" % random_server)
    g.log.info("Glusterd is not running on the server %s", random_server)

    # Verifying node count in volume status after glusterd stopped
    # on one of the servers; it's not possible to check the brick status
    # in volume status immediately after a glusterd stop
    count = 0
    while count < 100:
        vol_status = get_volume_status(self.mnode, self.volname)
        servers_count = len(vol_status[self.volname].keys())
        if servers_count == 5:
            break
        sleep(2)
        count += 1

    # Creating brick list from volume status
    offline_bricks = []
    vol_status = get_volume_status(self.mnode, self.volname)
    for node in vol_status[self.volname]:
        for brick_path in vol_status[self.volname][node]:
            if brick_path != 'Self-heal Daemon':
                offline_bricks.append(':'.join([node, brick_path]))

    # Checking whether bricks are offline with quorum ratio (95%)
    ret = are_bricks_offline(self.mnode, self.volname, offline_bricks)
    self.assertTrue(
        ret, "Bricks are online when quorum is not met "
        "for %s" % self.volname)
    g.log.info(
        "Bricks are offline when quorum is not met "
        "for %s", self.volname)

    # Getting a random brick from the offline brick list
    self.random_brick = random.choice(offline_bricks)

    # Performing replace brick commit force when quorum is not met
    self.replace_brick_failed = False
    ret, _, _ = replace_brick(self.mnode, self.volname,
                              self.random_brick, brick_list[6])
    self.assertNotEqual(
        ret, 0, "Replace brick should fail when quorum is "
        "not met, but replace brick succeeded "
        "on %s" % self.volname)
    g.log.info(
        "Failed to replace brick when quorum is not met "
        "for %s, as expected", self.volname)
    self.replace_brick_failed = True

    # Start glusterd on the stopped node
    ret = start_glusterd(random_server)
    self.assertTrue(
        ret, "Failed to start glusterd on server %s" % random_server)
    g.log.info("Glusterd started successfully on server %s", random_server)

    # Verifying node count in volume status after glusterd started
    # on one of the servers; it's not possible to check the brick status
    # in volume status immediately after a glusterd start
    count = 0
    while count < 100:
        vol_status = get_volume_status(self.mnode, self.volname)
        servers_count = len(vol_status[self.volname].keys())
        if servers_count == 6:
            break
        sleep(2)
        count += 1

    # Checking whether bricks are online
    count = 0
    while count < 100:
        ret = are_bricks_online(self.mnode, self.volname, brick_list[0:6])
        if ret:
            break
        sleep(2)
        count += 1
    self.assertTrue(ret, "All bricks are not online for %s"
                    % self.volname)
    g.log.info("All bricks are online for volume %s", self.volname)

    # Comparing brick lists before and after performing the replace
    # brick operation
    after_brick_list = get_all_bricks(self.mnode, self.volname)
    self.assertListEqual(
        after_brick_list, brick_list[0:6],
        "Bricks are not the same before and after performing the "
        "replace brick operation for volume %s" % self.volname)
    g.log.info(
        "Bricks are the same before and after performing the replace "
        "brick operation for volume %s", self.volname)
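# Why stopping a single glusterd breaks quorum in the test above: with six
# servers and cluster.server-quorum-ratio set to 95%, the cluster needs
# ceil(0.95 * 6) = 6 active peers (assuming glusterd rounds the required
# count up), so losing any one node drops it below quorum and the volume's
# bricks are brought offline. A small sketch of that arithmetic; the helper
# name server_quorum_met is hypothetical and used for illustration only.
def server_quorum_met(active_peers, total_peers, ratio_percent=95):
    """Return True when the active peer count satisfies the quorum ratio."""
    required = -(-(ratio_percent * total_peers) // 100)  # ceiling division
    return active_peers >= required

# server_quorum_met(6, 6) -> True
# server_quorum_met(5, 6) -> False: the condition exercised by the test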
def test_impact_of_replace_brick_for_glustershd(self):
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    nodes = self.volume['servers']
    replaced_bricks = []

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info(
        "Successful in getting a single self heal daemon process"
        " on all nodes %s", nodes)
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # Validate the bricks present in volume info with the
    # glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # Get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s: %s",
               self.volname, num_subvols)

    # Replace a brick from each sub-volume
    for i in range(0, num_subvols):
        subvol_brick_list = subvols_dict['volume_subvols'][i]
        g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
        brick_to_replace = subvol_brick_list[-1]
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume : %s",
                   brick_to_replace, self.volname)
        ret, _, err = replace_brick(self.mnode, self.volname,
                                    brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s to %s successfully',
                   brick_to_replace, new_brick)
        replaced_bricks.append(brick_to_replace)

    # Verify all volume processes are online, waiting up to 60 seconds
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online" % self.volname))
    g.log.info("Successfully verified volume %s processes are online",
               self.volname)

    # Verify the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on nodes "
               "%s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or"
                          " more than one self heal daemon process"
                          " found : %s" % pids))
    g.log.info(
        "Successful in getting a single self heal daemon process"
        " on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare pids before and after replacing
    self.assertNotEqual(
        glustershd_pids, glustershd_pids_after_replacement,
        "Self heal daemon process is the same before and"
        " after replacing bricks")
    g.log.info("Self heal daemon process is different before and "
               "after replacing bricks")

    # Get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after replacing bricks in the "
               "volume: %s", bricks_list_after_replacing)

    # Validate the bricks present in volume info with the
    # glustershd server volume file after replacing bricks
    g.log.info("Starting parsing file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file after "
                          "replacing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    g.log.info("Starting to delete replaced brick dirs")
    # Remove brick directories of the replaced bricks as this is not
    # handled by the tearDown method
    for brick in replaced_bricks:
        node, brick_path = brick.split(':')
        cmd = "rm -rf " + brick_path
        ret, _, _ = g.run(node, cmd)
        if ret:
            raise ExecutionError("Failed to delete the brick dir %s on"
                                 " node %s" % (brick_path, node))
    g.log.info("Successfully deleted brick dirs of the replaced bricks")
def test_impact_of_replace_brick_on_glustershd(self):
    """
    Test script to verify that the glustershd server vol file has only
    entries for replicate volumes
    1. Create multiple volumes and start all volumes
    2. Check the glustershd processes - only 1 glustershd should be listed
    3. Do replace brick on the replicate volume
    4. Confirm that the brick is replaced
    5. Check the glustershd processes - only 1 glustershd should be listed
       and the pid should be different
    6. glustershd server vol should be updated with the new bricks
    """
    # Check the self-heal daemon process
    ret, glustershd_pids = get_self_heal_daemon_pid(self.servers)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % glustershd_pids))
    g.log.info(
        "Successful in getting a single self heal daemon process"
        " on all nodes %s", self.servers)

    volume_list = get_volume_list(self.mnode)
    for volume in volume_list:

        # Log volume info and status before replacing brick
        ret = log_volume_info_and_status(self.mnode, volume)
        self.assertTrue(ret, ("Logging volume info and status "
                              "failed on volume %s" % volume))
        g.log.info(
            "Successful in logging volume info and status "
            "of volume %s", volume)

        # Selecting a random source brick to replace
        src_brick = choice(get_all_bricks(self.mnode, volume))
        src_node, original_brick = src_brick.split(":")

        # Creating a random destination brick in such a way that it is
        # selected from the same node but always under a different brick
        # root than the original brick
        list_of_bricks = [
            brick for brick in get_servers_bricks_dict(
                src_node, self.all_servers_info)[src_node]
            if brick not in original_brick
        ]
        dst_brick = ('{}:{}/{}_replaced'.format(
            src_node, choice(list_of_bricks),
            original_brick.split('/')[-1]))

        # Replace brick for the volume
        ret, _, _ = replace_brick(self.mnode, volume, src_brick, dst_brick)
        self.assertFalse(
            ret, "Failed to replace brick "
            "from the volume %s" % volume)
        g.log.info(
            "Successfully replaced faulty brick from "
            "the volume %s", volume)

        # Verify all volume processes are online
        ret = wait_for_volume_process_to_be_online(self.mnode, volume)
        self.assertTrue(ret,
                        "Volume %s : All process are not online" % volume)
        g.log.info("Volume %s : All process are online", volume)

        # Check the self-heal daemon process after replacing brick
        ret, pid_after_replace = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(
            ret, "Either no self heal daemon process "
            "found or more than one self heal "
            "daemon process found : %s" % pid_after_replace)
        g.log.info(
            "Successful in getting a single self heal "
            "daemon process on all nodes %s", self.servers)

        # Compare the glustershd pids
        self.assertNotEqual(
            glustershd_pids, pid_after_replace,
            "Self heal daemon process should be different "
            "after replacing bricks in %s volume" % volume)
        g.log.info("EXPECTED: Self heal daemon process is different"
                   " after replacing bricks in the replicate volume")

        # Get the bricks for the volume
        bricks_list = get_all_bricks(self.mnode, volume)
        g.log.info("Brick List : %s", bricks_list)

        # Validate the bricks present in volume info with the
        # glustershd server volume file
        ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is "
                              "different from glustershd server "
                              "volume file. Please check log file "
                              "for details"))
        g.log.info(
            "Bricks in volume %s exist in the glustershd server "
            "volume file", volume)
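# A worked example of the dst_brick construction used in the loop above;
# the hostname and brick paths are purely illustrative and not taken from
# any real test setup.
def _example_dst_brick():
    """Show how the '{}:{}/{}_replaced' format builds the new brick path."""
    src_node = 'server1.example.com'
    original_brick = '/bricks/brick0/testvol_brick3'
    chosen_root = '/bricks/brick2'  # a different brick root on the same node
    # -> 'server1.example.com:/bricks/brick2/testvol_brick3_replaced'
    return '{}:{}/{}_replaced'.format(
        src_node, chosen_root, original_brick.split('/')[-1])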
def test_replacing_all_arbiters(self):
    """
    - Create an arbiter volume, 4 x (2+1) distributed-replicate
    - Start writing IO
    - While the I/Os are going on, replace all the arbiter bricks
    - Check that the new bricks are attached successfully
    - Check for heals
    - Validate IO
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)

    # Clear all brick folders. This is needed to prevent healing
    # with old files
    for brick in bricks_list:
        g.log.info('Clearing brick %s', brick)
        node, brick_path = brick.split(':')
        ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
        self.assertFalse(ret, err)
        g.log.info('Clearing brick %s is successful', brick)
    g.log.info('Clearing all bricks is successful')

    # Creating files on the client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create dirs with files
        g.log.info('Creating dirs with files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 3 -l 3 -n 3 -f 20 %s" % (
                       self.script_upload_path,
                       mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Replace the arbiter bricks
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    for subvol in subvols:
        g.log.info('Replacing arbiter brick for %s', subvol)
        brick_to_replace = subvol[-1]
        self.bricks_to_clean.append(brick_to_replace)
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume: %s",
                   brick_to_replace, self.volname)
        ret, _, err = replace_brick(self.mnode, self.volname,
                                    brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s to %s successfully',
                   brick_to_replace, new_brick)

    # Check the replaced bricks
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    for index, subvol in enumerate(subvols):
        expected_brick_path = self.bricks_to_clean[index] + 'new'
        brick_to_check = subvol[-1]
        self.assertEqual(expected_brick_path, brick_to_check,
                         'Brick %s is not the replaced brick'
                         % brick_to_check)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify all volume processes are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s: All process are online", self.volname)

    # Wait for self-heal daemons to be online
    g.log.info("Waiting for self-heal daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Some of the self-heal daemons are not online")
    g.log.info("All self-heal daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
def test_impact_of_replace_brick_for_glustershd(self):
    nodes = self.volume['servers']

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids = pids

    # Get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # Validate the bricks present in volume info with the
    # glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.GLUSTERSHD, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.GLUSTERSHD)

    # Replace brick
    brick_to_replace = bricks_list[-1]
    new_brick = brick_to_replace + 'new'
    g.log.info("Replacing the brick %s for the volume : %s",
               brick_to_replace, self.volname)
    ret, _, err = replace_brick(self.mnode, self.volname,
                                brick_to_replace, new_brick)
    self.assertFalse(ret, err)
    g.log.info('Replaced brick %s to %s successfully',
               brick_to_replace, new_brick)

    # Check bricks
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertEqual(bricks_list[-1], new_brick, 'Replaced brick and '
                     'new brick are not equal')

    # Verify all volume processes are online, waiting up to 60 seconds
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online" % self.volname))
    g.log.info("Successfully verified volume %s processes are online",
               self.volname)

    # Verify the glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process found"))

    # Check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either no self heal daemon process found or "
                          "more than one self heal daemon process "
                          "found : %s" % pids))
    g.log.info("Successful in getting a single self heal daemon process"
               " on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare pids before and after replacing
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_replacement,
                        "Self heal daemon process is the same before and"
                        " after replacing bricks")
    g.log.info("Self heal daemon process is different before and "
               "after replacing bricks")

    # Get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list after replacing the brick in the "
               "volume: %s", bricks_list_after_replacing)

    # Validate the bricks present in volume info with the
    # glustershd server volume file after replacing bricks
    g.log.info("Starting parsing file %s", self.GLUSTERSHD)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file after "
                          "replacing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.GLUSTERSHD)