def verify_all_process_of_volume_are_online(mnode, volname):
    """Verifies whether all the processes of the volume are online.

    Args:
        mnode (str): Node on which the command has to be executed.
        volname (str): Volume name.

    Returns:
        bool: True if all the processes of the volume are online.
            False otherwise.
    """
    # Importing here to avoid cyclic imports
    from glustolibs.gluster.brick_libs import (are_bricks_online,
                                               get_all_bricks)

    # Verify all the brick processes are online
    bricks_list = get_all_bricks(mnode, volname)
    if not bricks_list:
        g.log.error("Failed to get the brick list "
                    "from the volume %s", volname)
        return False

    ret = are_bricks_online(mnode, volname, bricks_list)
    if not ret:
        g.log.error("Not all bricks of the volume %s are online", volname)
        return False

    # TODO: Verify all self-heal daemons are running for
    # non-distribute volumes
    return True
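# A minimal usage sketch of the helper above, assuming `mnode`/`volname`
# as used throughout this suite; the helper only logs and returns a bool,
# so the caller decides whether to fail hard:
def _ensure_volume_processes_online(mnode, volname):
    """Fail loudly if any process of the volume is offline (sketch)."""
    if not verify_all_process_of_volume_are_online(mnode, volname):
        raise RuntimeError("Volume %s: one or more processes are offline"
                           % volname)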
def test_volume_start_force(self):

    # Get the brick list and create a volume
    num_of_bricks = len(self.servers)
    bricks_list = form_bricks_list(self.mnode, self.volname,
                                   num_of_bricks, self.servers,
                                   self.all_servers_info)

    ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
    self.assertEqual(ret, 0, "Failed to create volume")

    # Remove the brick path on one node and try to start the volume
    # with and without force
    index_of_node = random.randint(0, len(bricks_list) - 1)
    brick_node = bricks_list[index_of_node]
    node = brick_node.split(":")[0]
    brick_path = brick_node.split(":")[1]
    cmd = "rm -rf %s" % brick_path
    ret, _, _ = g.run(node, cmd)
    self.assertEqual(ret, 0, "Failed to delete the brick")
    g.log.info("Deleted the brick successfully")

    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertNotEqual(ret, 0, "Volume start succeeded")

    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Volume start with force failed")

    # Volume start force should not bring the brick online
    ret = are_bricks_online(self.mnode, self.volname,
                            [bricks_list[index_of_node]])
    self.assertFalse(ret, "Volume start force brought the bricks online")
    g.log.info("Volume start force didn't bring the brick online")
def _restart_volume_and_bring_all_offline_bricks_online(self):
    """Restart volume and bring all offline bricks online"""
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertFalse(ret, 'Heal is completed')
    g.log.info('Heal is pending')

    ret = bring_bricks_online(self.mnode, self.volname,
                              self.bricks_to_bring_offline,
                              bring_bricks_online_methods=[
                                  'volume_start_force'])
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % self.bricks_to_bring_offline)

    # Check if bricks are back online or not
    ret = are_bricks_online(self.mnode, self.volname,
                            self.bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s not online even after restart'
                    % self.bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               self.bricks_to_bring_offline)
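# A small sketch of the heal-state helpers used above (both come from
# glustolibs.gluster.heal_libs): is_heal_complete() is a point-in-time
# check, while monitor_heal_completion() polls until heal finishes or a
# timeout expires.
def _wait_for_pending_heals(mnode, volname):
    """Block until pending heal entries drain (sketch)."""
    from glustolibs.gluster.heal_libs import (is_heal_complete,
                                              monitor_heal_completion)
    if not is_heal_complete(mnode, volname):
        return monitor_heal_completion(mnode, volname)
    return True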
def _guster_volume_cleanup(self, vol_name):
    # Check brick status. Restart vol if bricks are offline
    openshift_ops.switch_oc_project(
        self._master, self._registry_project_name)
    brick_list = brick_libs.get_all_bricks(
        "auto_get_gluster_endpoint", vol_name)
    self.assertIsNotNone(brick_list, "Failed to get brick list")
    check_bricks = brick_libs.are_bricks_online(
        "auto_get_gluster_endpoint", vol_name, brick_list)
    if not check_bricks:
        start_vol, _, _ = volume_ops.volume_start(
            "auto_get_gluster_endpoint", vol_name, force=True)
        self.assertFalse(start_vol, "Failed to start volume using force")
def test_disperse_vol(self):
    bricks_list = get_all_bricks(self.mnode, self.volname)

    ret = bring_bricks_offline(self.volname, bricks_list[0:2])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:2])
    self.assertTrue(ret, "Failed to bring up the bricks")
    g.log.info("Successfully brought the bricks up")

    # Verify all bricks are online
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Not all bricks are online")

    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, "Logging volume info and status failed "
                    "on volume %s" % self.volname)
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)
def test_brickreset_ec_volume(self):
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    """
    - Start resource consumption tool
    - Create IO on dir2 of volume mountpoint
    - Reset brick start
    - Check if brick is offline
    - Reset brick with destination same as source with force, with IO running
    - Validate IO and wait for it to complete on dir2
    - Remove dir2
    - Create 5 directories and 5 files in dir1 of mountpoint
    - Rename all files inside dir1 at mountpoint
    - Create softlinks and hardlinks of files in dir1 of mountpoint
    - Delete all files in one of the dirs inside dir1
    - Change chmod, chown, chgrp
    - Create tiny, small, medium and large files
    - Create IO
    - Validate IO and wait for it to complete
    - Calculate arequal before killing brick
    - Get bricks from volume
    - Reset brick
    - Check if brick is offline
    - Reset brick by giving a different source and dst node
    - Reset brick by giving dst and source same without force
    - Obtain hostname
    - Reset brick with dst-source same with force using hostname - successful
    - Monitor heal completion
    - Bring down other bricks to max redundancy
    - Get arequal after bringing down bricks
    - Bring bricks online
    - Reset brick by giving the same source and dst brick
    - Kill brick manually
    - Check if brick is offline
    - Reset brick by giving the same source and dst brick
    - Wait for brick to come online
    - Bring down other bricks to max redundancy
    - Get arequal after bringing down bricks
    - Bring bricks online
    - Remove brick from backend
    - Check if brick is offline
    - Reset brick by giving dst and source same without force - successful
    - Monitor heal completion
    - Compare the arequals calculated
    """
    # Start resource consumption monitoring using top
    log_file_mem_monitor = getcwd() + '/mem_usage.log'
    cmd = ('for i in {1..100};do top -n 1 -b|egrep '
           '"RES|gluster" & free -h 2>&1 >> '
           + log_file_mem_monitor + ' ;sleep 10;done')
    g.log.info(cmd)
    for mount_obj in self.mounts:
        g.run_async(mount_obj.client_system, cmd)

    # Get the bricks from the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # Create directory2
    cmd = 'mkdir %s/dir2' % self.mounts[0].mountpoint
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to create directory2")
    g.log.info("Directory 2 on %s created successfully", self.mounts[0])

    # Create files on the client side in dir2
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        # Create dirs with files
        g.log.info('Creating dirs with files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 20 %s/dir2"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Reset a brick
    g.log.info('Reset of brick using start')
    brick_reset = choice(bricks_list)
    ret, _, _ = reset_brick(self.mnode, self.volname, brick_reset,
                            "start")
    self.assertEqual(ret, 0, "Failed to reset brick %s" % brick_reset)

    # Check if the brick is offline
    g.log.info("Check the brick status if it is offline")
    offline_bricks = get_offline_bricks_list(self.mnode, self.volname)
    self.assertEqual(offline_bricks[0], brick_reset, "Brick not offline")
    g.log.info("Expected : Brick is offline")

    # Reset brick with dst same as source with force while IO is running
    g.log.info('Reset of brick with same src and dst brick')
    ret, _, _ = reset_brick(self.mnode, self.volname, brick_reset,
                            "commit", brick_reset, force="true")
    self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
    g.log.info("Expected : Reset brick is successful")

    # Validate IO and wait for it to complete
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")

    # Delete dir2
    cmd = 'rm -rf %s/dir2' % self.mounts[0].mountpoint
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to delete directory2")
    g.log.info("Directory 2 deleted successfully for %s", self.mounts[0])

    del self.all_mounts_procs[:]

    # Create dir1
    cmd = 'mkdir %s/dir1' % self.mounts[0].mountpoint
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to create directory1")
    g.log.info("Directory 1 created successfully for %s", self.mounts[0])

    # Create 5 dirs and 5 files in each dir at mountpoint in dir1
    start, end = 1, 5
    for mount_obj in self.mounts:
        # Number of dirs and files to be created
        dir_range = str(start) + ".." + str(end)
        file_range = str(start) + ".." + str(end)

        # Create dirs 1-5 at mountpoint
        cmd = 'mkdir %s/dir1/dir{%s};' % (mount_obj.mountpoint, dir_range)
        g.run(mount_obj.client_system, cmd)

        # Create files inside each dir
        cmd = ('touch %s/dir1/dir{%s}/file{%s};'
               % (mount_obj.mountpoint, dir_range, file_range))
        g.run(mount_obj.client_system, cmd)

        # Increment the counter so that at the next client dirs and
        # files are created with a different offset, e.g. at the next
        # client the dirs will be named dir6, dir7...dir10. Same with
        # files.
        start += 5
        end += 5

    # Rename all files inside dir1/dir1 at mountpoint
    clients = []
    for mount_obj in self.mounts:
        clients.append(mount_obj.client_system)
        cmd = ('cd %s/dir1/dir1/; '
               'for FILENAME in *;'
               'do mv $FILENAME Unix_$FILENAME; '
               'done;' % mount_obj.mountpoint)
    g.run_parallel(clients, cmd)

    # Truncate the files in one dir inside dir1 at mountpoint.
    # start is an offset added to the dirname to act on
    # different files from different clients.
    start = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s/dir1/dir%s/; '
               'for FILENAME in *;'
               'do echo > $FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start)))
        g.run(mount_obj.client_system, cmd)

    # Create softlinks and hardlinks of files in the mountpoint. start
    # is an offset added to the dirname to act on different files from
    # different clients.
    start = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s/dir1/dir%s; '
               'for FILENAME in *; '
               'do ln -s $FILENAME softlink_$FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start)))
        g.run(mount_obj.client_system, cmd)

        cmd = ('cd %s/dir1/dir%s; '
               'for FILENAME in *; '
               'do ln $FILENAME hardlink_$FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start + 1)))
        g.run(mount_obj.client_system, cmd)
        start += 5

    # Delete all files in one of the dirs inside dir1. start is used
    # as an offset as above.
    start = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s/dir1/dir%s; '
               'for FILENAME in *; '
               'do rm -f $FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start)))
        g.run(mount_obj.client_system, cmd)
        start += 5

    # chmod, chown, chgrp inside dir1.
    # start and end are used as offsets to access different files
    # from different clients.
    start, end = 2, 5
    for mount_obj in self.mounts:
        dir_file_range = '%s..%s' % (str(start), str(end))
        cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}'
               % (mount_obj.mountpoint, dir_file_range, dir_file_range))
        g.run(mount_obj.client_system, cmd)

        cmd = ('chown root %s/dir1/dir{%s}/file{%s}'
               % (mount_obj.mountpoint, dir_file_range, dir_file_range))
        g.run(mount_obj.client_system, cmd)

        cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}'
               % (mount_obj.mountpoint, dir_file_range, dir_file_range))
        g.run(mount_obj.client_system, cmd)

        start += 5
        end += 5

    # Create tiny, small, medium and large files at the mountpoint.
    # offset is used to differ filenames at different clients.
    offset = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s; fallocate -l 100 tiny_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        g.run(mount_obj.client_system, cmd)
        cmd = ('cd %s; fallocate -l 20M small_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        g.run(mount_obj.client_system, cmd)
        cmd = ('cd %s; fallocate -l 200M medium_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        g.run(mount_obj.client_system, cmd)
        cmd = ('cd %s; fallocate -l 1G large_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        g.run(mount_obj.client_system, cmd)
        offset += 1

    # Create files on the client side in dir1
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        # Create dirs with files
        g.log.info('Creating dirs with files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 20 %s/dir1"
                   % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO and wait for it to complete
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")

    # Get arequal before killing the brick
    g.log.info('Getting arequal before killing of brick...')
    ret, result_before_killing_brick = collect_mounts_arequal(
        self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before killing of brick is successful')

    # Reset a brick
    g.log.info('Reset of brick using start')
    ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[0],
                            "start")
    self.assertEqual(ret, 0, "Failed to reset brick %s" % bricks_list[0])

    # Check if the brick is offline
    g.log.info("Check the brick status if it is offline")
    ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]])
    self.assertTrue(ret, "Brick is not offline")
    g.log.info("Expected : Brick is offline")

    # Reset brick by giving a different source and dst brick
    g.log.info('Reset of brick by giving different src and dst brick')
    ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[0],
                            "commit", bricks_list[1])
    self.assertNotEqual(ret, 0, "Not Expected: Reset brick is successful")
    g.log.info("Expected : Source and destination bricks must be the "
               "same for reset")

    # Reset brick with destination same as source, without force
    g.log.info('Reset of brick with same src and dst brick')
    ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[0],
                            "commit", bricks_list[0])
    self.assertNotEqual(ret, 0, "Not Expected : Reset brick is "
                        "successful")
    g.log.info("Expected : Reset brick failed; volume id is the same, "
               "use force")

    # Obtain hostname of node
    ret, hostname_node1, _ = g.run(self.mnode, "hostname")
    self.assertEqual(ret, 0, "Failed to obtain hostname of node %s"
                     % self.mnode)
    g.log.info("Obtained hostname of node. IP- %s, hostname- %s",
               self.mnode, hostname_node1.strip())

    # Reset brick with destination same as source, with force,
    # using the hostname
    g.log.info('Reset of brick with same src and dst brick')
    ret, _, _ = reset_brick(hostname_node1.strip(), self.volname,
                            bricks_list[0], "commit", bricks_list[0],
                            force="true")
    self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
    g.log.info("Expected : Reset brick is successful")

    # Wait for brick to come online
    g.log.info("Waiting for brick to come online")
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Bricks are not online")
    g.log.info("Expected : Bricks are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    g.log.info('Heal has completed successfully')

    # Check if bricks are online
    all_bricks = get_all_bricks(self.mnode, self.volname)
    ret = are_bricks_online(self.mnode, self.volname, all_bricks)
    self.assertTrue(ret, 'Not all bricks are online')
    g.log.info('All bricks are online')

    # Bring down other bricks to max redundancy
    ret = bring_bricks_offline(self.volname, bricks_list[1:3])
    self.assertTrue(ret, 'Bricks not offline')
    g.log.info('Bricks are offline successfully')
    sleep(2)

    # Check if the remaining 4 bricks are online
    all_bricks = [bricks_list[0], bricks_list[3], bricks_list[4],
                  bricks_list[5]]
    ret = are_bricks_online(self.mnode, self.volname, all_bricks)
    self.assertTrue(ret, 'Not all bricks are online')
    g.log.info('All bricks are online')

    # Check mount point
    cmd = 'ls -lrt /mnt'
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    g.log.info("Client mount point details")

    # Get arequal after bringing down bricks
    g.log.info('Getting arequal after bringing down bricks...')
    ret, result_offline_redundant_brick1 = collect_mounts_arequal(
        self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after bringing bricks offline '
               'is successful')

    # Bring bricks online
    list_of_bricks_to_bring_online = bricks_list[1:3]
    ret = bring_bricks_online(self.mnode, self.volname,
                              list_of_bricks_to_bring_online)
    self.assertTrue(ret, 'Bricks not brought online')
    g.log.info('Bricks are online successfully')

    # Wait for bricks to come online
    g.log.info("Waiting for bricks to come online")
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Bricks are not online")
    g.log.info("Expected : Bricks are online")

    # Check if bricks are online
    all_bricks = get_all_bricks(self.mnode, self.volname)
    ret = are_bricks_online(self.mnode, self.volname, all_bricks)
    self.assertTrue(ret, 'Not all bricks are online')
    g.log.info('All bricks are online')

    # Reset brick without bringing the brick down
    g.log.info('Reset of brick with same src and dst brick')
    ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[1],
                            "commit", bricks_list[1])
    self.assertNotEqual(ret, 0, "Not Expected: Reset brick passed")
    g.log.info("Expected : Brick reset failed as the source brick must "
               "be stopped")

    # Kill the brick manually
    ret = bring_bricks_offline(self.volname, [bricks_list[1]])
    self.assertTrue(ret, 'Brick not offline')
    g.log.info('Brick is offline successfully')

    # Check if the brick is offline
    g.log.info("Check the brick status if it is offline")
    ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]])
    self.assertTrue(ret, "Brick is not offline")
    g.log.info("Expected : Brick is offline")

    # Reset brick with dst same as source after killing brick manually
    g.log.info('Reset of brick with same src and dst brick')
    ret, _, _ = reset_brick(self.mnode, self.volname, bricks_list[1],
                            "commit", bricks_list[1], force="true")
    self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
    g.log.info("Expected : Reset brick is successful")

    # Wait for brick to come online
    g.log.info("Waiting for brick to come online")
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Bricks are not online")
    g.log.info("Expected : Bricks are online")

    # Check if bricks are online
    all_bricks = get_all_bricks(self.mnode, self.volname)
    ret = are_bricks_online(self.mnode, self.volname, all_bricks)
    self.assertTrue(ret, 'Not all bricks are online')
    g.log.info('All bricks are online')

    # Bring down other bricks to max redundancy
    ret = bring_bricks_offline(self.volname, bricks_list[2:4])
    self.assertTrue(ret, 'Bricks not offline')
    g.log.info('Bricks are offline successfully')

    # Check mount point
    cmd = 'ls -lrt /mnt'
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    g.log.info("Client mount point details")

    # Get arequal after bringing down bricks
    g.log.info('Getting arequal after bringing down redundant bricks...')
    ret, result_offline_redundant_brick2 = collect_mounts_arequal(
        self.mounts[0])
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after bringing bricks offline '
               'is successful')

    # Bring bricks online
    list_of_bricks_to_bring_online = bricks_list[2:4]
    ret = bring_bricks_online(self.mnode, self.volname,
                              list_of_bricks_to_bring_online)
    self.assertTrue(ret, 'Bricks not brought online')
    g.log.info('Bricks are online successfully')

    # Remove brick from the backend
    brick = bricks_list[0].strip().split(":")
    cmd = "rm -rf %s" % brick[1]
    ret, _, _ = g.run(self.mnode, cmd)
    self.assertEqual(ret, 0, "Failed to delete brick %s" % bricks_list[0])
    g.log.info("Removed brick %s successfully", bricks_list[0])

    # Check if the brick is offline
    count = 0
    while count <= 20:
        g.log.info("Check the brick status if it is offline")
        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[0]])
        if ret:
            break
        sleep(2)
        count += 1
    self.assertTrue(ret, "Brick is not offline")
    g.log.info("Expected : Brick is offline")

    # Reset brick with destination same as source
    g.log.info('Reset of brick with same src and dst brick')
    ret, _, _ = reset_brick(hostname_node1.strip(), self.volname,
                            bricks_list[0], "commit", bricks_list[0])
    self.assertEqual(ret, 0, "Not Expected: Reset brick failed")
    g.log.info("Expected : Reset brick is successful")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    g.log.info('Heal has completed successfully')

    # Compare arequals
    self.assertEqual(result_before_killing_brick,
                     result_offline_redundant_brick1,
                     'Arequals are not equal before killing brick '
                     'processes and after offlining redundant bricks')
    g.log.info('Arequals are equal before killing brick processes '
               'and after offlining redundant bricks')

    # Compare arequals
    self.assertEqual(result_offline_redundant_brick2,
                     result_offline_redundant_brick1,
                     'Arequals are not equal for offlining redundant '
                     'bricks')
    g.log.info('Arequals are equal for offlining redundant bricks')

    # Delete dir1
    cmd = 'rm -rf %s/dir1' % self.mounts[0].mountpoint
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to delete directory1")
    g.log.info("Directory 1 deleted successfully for %s", self.mounts[0])
def test_glusterd_replace_brick(self):
    """
    Create a volume and start it.
    - Get the list of all bricks which are online
    - Select a brick randomly from the online bricks
    - Form a non-existing brick path on the node where the brick is
      to be replaced
    - Perform replace brick; it should fail
    - Form a new brick with a valid brick path; replace brick
      should succeed
    """
    # pylint: disable=too-many-function-args
    # Getting all the bricks which are online
    bricks_online = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks_online, "Unable to get the online bricks")
    g.log.info("Got the brick list from the volume")

    # Getting one random brick from the online bricks to be replaced
    brick_to_replace = random.choice(bricks_online)
    g.log.info("Brick to replace %s", brick_to_replace)
    node_for_brick_replace = brick_to_replace.split(':')[0]
    new_brick_to_replace = form_bricks_list(self.mnode, self.volname,
                                            1, node_for_brick_replace,
                                            self.all_servers_info)

    # Performing replace brick with a non-existing brick path
    path = ":/brick/non_existing_path"
    non_existing_path = node_for_brick_replace + path

    # Replace brick for the non-existing path
    ret, _, _ = replace_brick(self.mnode, self.volname,
                              brick_to_replace, non_existing_path)
    self.assertNotEqual(ret, 0, ("Replace brick with commit force"
                                 " on a non-existing brick passed"))
    g.log.info("Replace brick with a non-existing brick with commit "
               "force failed as expected")

    # Calling replace brick by passing brick_to_replace and
    # new_brick_to_replace with a valid brick path
    ret = replace_brick_from_volume(self.mnode, self.volname,
                                    self.servers, self.all_servers_info,
                                    brick_to_replace,
                                    new_brick_to_replace[0],
                                    delete_brick=True)
    self.assertTrue(ret, "Replace brick with commit force failed")

    # Validating whether the replaced brick is online
    halt = 20
    counter = 0
    _rc = False
    g.log.info("Wait for some seconds for the replaced brick "
               "to get online")
    while counter < halt:
        ret = are_bricks_online(self.mnode, self.volname,
                                new_brick_to_replace)
        if not ret:
            g.log.info("The replaced brick isn't online, "
                       "retry after 2 seconds .......")
            time.sleep(2)
            counter = counter + 2
        else:
            _rc = True
            g.log.info("The replaced brick is online after being "
                       "replaced")
            break
    if not _rc:
        raise ExecutionError("The replaced brick isn't online")
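# The manual poll above can also be written against the suite's
# wait_for_bricks_to_be_online helper, used by other tests in this
# section; the timeout keyword below is an assumption based on how the
# helper is used here, not a confirmed signature.
def _wait_for_replaced_brick(mnode, volname, timeout=20):
    """Poll brick status until online or timeout (sketch)."""
    from glustolibs.gluster.brick_libs import wait_for_bricks_to_be_online
    return wait_for_bricks_to_be_online(mnode, volname, timeout=timeout)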
def test_create_snap_bricks(self):
    """
    1. Get brick list
    2. Check all bricks are online
    3. Select one brick at random and bring it offline
    4. Get brick list
    5. Check that not all bricks are online
    6. Get offline bricks list
    7. Get online bricks list
    8. Create snapshot of volume
    9. Snapshot create should fail
    """

    # Get the bricks from the volume
    g.log.info("Fetching bricks for the volume : %s" % self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s" % bricks_list)

    # Check all bricks are online
    g.log.info("Verifying all bricks are online or not.....")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Not all bricks are online")
    g.log.info("All bricks are online.")

    # Select one brick randomly to bring it offline
    g.log.info("Selecting one brick randomly to bring it offline")
    brick_to_bring_offline = random.choice(bricks_list)
    g.log.info("Brick to bring offline: %s" % brick_to_bring_offline)
    ret = bring_bricks_offline(self.volname, brick_to_bring_offline,
                               None)
    self.assertTrue(ret, "Failed to bring the bricks offline")
    g.log.info("Randomly selected brick: %s" % brick_to_bring_offline)

    # Get brick list
    g.log.info("Fetching bricks for the volume : %s" % self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s" % bricks_list)

    # Check that not all bricks are online
    g.log.info("Verifying all bricks are online or not.....")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertFalse(ret, "All bricks are online even after bringing "
                     "one brick offline")
    g.log.info("Not all bricks are online, as expected.")

    # Offline bricks list
    offbricks = get_offline_bricks_list(self.mnode, self.volname)
    g.log.info("Bricks Offline: %s" % offbricks)

    # Online bricks list
    onbricks = get_online_bricks_list(self.mnode, self.volname)
    g.log.info("Bricks Online: %s" % onbricks)

    # Create snapshot of volume; this should fail while a brick is down
    ret, _, _ = snap_create(self.mnode, self.volname, "snap1", False,
                            "Description with $p3c1al characters!")
    self.assertNotEqual(ret, 0, "Unexpected: snapshot snap1 of volume "
                        "%s created while a brick was offline"
                        % self.volname)
    g.log.info("Snapshot snap1 creation failed as expected for "
               "volume %s" % self.volname)

    # Volume info
    ret = get_volume_info(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to perform gluster volume info on "
                    "volume %s" % self.volname)
    g.log.info("Gluster volume info on volume %s is successful"
               % self.volname)

    # Snapshot list
    ret, _, _ = snap_list(self.mnode)
    self.assertEqual(ret, 0, "Failed to list snapshots of volume %s"
                     % self.volname)
    g.log.info("Snapshot list command for volume %s was successful"
               % self.volname)
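# For reference, the underlying CLI that the snap_create helper drives;
# with a brick down, snapshot creation is refused because the brick's
# backing LVM snapshot cannot be taken (volname/snapname below are
# placeholders):
#
#   gluster snapshot create snap1 <volname> description "some text"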
def test_self_heal(self):
    """
    Description:
    - Create files on the mount point
    - Kill one brick from the volume
    - rm -rfv on the mount point
    - Bring bricks online
    - Wait for heals
    - List
    """
    # pylint: disable=too-many-statements
    # IO on the mount point
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num %d "
               "--dir-depth 2 "
               "--dir-length 35 "
               "--max-num-of-dirs 5 "
               "--num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.counter, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        self.counter = self.counter + 10

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Kill one brick from the volume set
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring bricks: %s offline",
                          bricks_to_bring_offline))
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Not all the bricks in list: %s are offline"
                    % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Check volume status
    g.log.info("Logging volume info and status after bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s", self.volname))
    g.log.info("Successful in logging volume info and status of "
               "volume %s", self.volname)

    # Remove files from the mount point while one brick is down
    g.log.info("Removing files from the mount point")
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system
    cmd = "rm -rfv %s/*" % mountpoint
    ret, _, _ = g.run(client, cmd)
    if ret != 0:
        raise ExecutionError("Failed to delete the files")

    # Bring bricks online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Check if bricks are online
    g.log.info("Checking bricks are online or not")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not online'
                    % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Monitor heals on the volume
    g.log.info("Wait for heal completion...")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                    "for 20 minutes.")
    g.log.info("Self-heal is successful after bringing the bricks "
               "online")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")
def test_heal_client_io_hang(self):
    mountpoint = self.mounts[0].mountpoint

    # Disable server side heal
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to disable server side heal")
    g.log.info("Successfully disabled server side heal")

    # Log volume info and status after disabling server side heal
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

    # Create files
    cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
           "do touch file$i; done" % mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished creating files while all the bricks are UP')

    # Bring bricks offline
    ret = bring_bricks_offline(self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    # Start pumping IO from the client
    cmd = ("cd %s/test; for i in `seq 1 100` ;"
           "do dd if=/dev/urandom of=file$i bs=1M "
           "count=5;done" % mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished writing on files while a brick is DOWN')

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring up the bricks")
    g.log.info("Successfully brought the bricks up")

    # Verify all bricks are online
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Not all bricks are online")

    # Start client side heal by reading/writing files
    appendcmd = ("cd %s/test; for i in `seq 1 100` ;"
                 "do dd if=/dev/urandom of=file$i bs=1M "
                 "count=1 oflag=append conv=notrunc;done" % mountpoint)

    readcmd = ("cd %s/test; for i in `seq 1 100` ;"
               "do dd if=file$i of=/dev/null bs=1M "
               "count=5;done" % mountpoint)

    ret, _, err = g.run(self.mounts[0].client_system, appendcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished appending to files after bringing bricks online')

    ret, _, err = g.run(self.mounts[0].client_system, readcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished reading files after bringing bricks online')

    # Check the heal info and completion
    ec_check_heal_comp(self)

    # Log volume info and status after bringing the brick up
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)
def test_metadata_split_brain_resolution(self):
    # Setting options
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files and directories on client side
    g.log.info('Creating files and directories...')
    cmd = ("mkdir %s/test_metadata_sb && cd %s/test_metadata_sb &&"
           "for i in `seq 1 3`; do mkdir dir.$i; for j in `seq 1 5`;"
           "do dd if=/dev/urandom of=dir.$i/file.$j bs=1K count=1;"
           "done; dd if=/dev/urandom of=file.$i bs=1K count=1; done"
           % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Creating files and directories failed")
    g.log.info("Files & directories created successfully")

    # Check arequals for all the bricks
    g.log.info('Getting arequal before getting bricks offline...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Set option self-heal-daemon to OFF
    g.log.info('Setting option self-heal-daemon to off...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    bricks_list = get_all_bricks(self.mnode, self.volname)

    # Bring brick1 offline
    g.log.info('Bringing brick %s offline', bricks_list[0])
    ret = bring_bricks_offline(self.volname, bricks_list[0])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[0])

    ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]])
    self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[0])
    g.log.info('Bringing brick %s offline is successful', bricks_list[0])

    # Change metadata of some files & directories
    cmd = ("cd %s/test_metadata_sb &&"
           "for i in `seq 1 2`; do chmod -R 0555 dir.$i file.$i ; done"
           % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Updating file permissions failed")
    g.log.info("File permissions updated successfully")

    # Bring brick1 online and check the status
    g.log.info('Bringing brick %s online', bricks_list[0])
    ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[0]])
    self.assertTrue(ret, 'Failed to bring brick %s online'
                    % bricks_list[0])
    g.log.info('Bringing brick %s online is successful', bricks_list[0])

    g.log.info("Verifying if brick %s is online", bricks_list[0])
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Brick %s did not come up" % bricks_list[0])
    g.log.info("Brick %s has come online.", bricks_list[0])

    # Bring brick2 offline
    g.log.info('Bringing brick %s offline', bricks_list[1])
    ret = bring_bricks_offline(self.volname, bricks_list[1])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[1])

    ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]])
    self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[1])
    g.log.info('Bringing brick %s offline is successful', bricks_list[1])

    # Change metadata of the same files & directories as before
    cmd = ("cd %s/test_metadata_sb &&"
           "for i in `seq 1 2` ; do chmod -R 0777 dir.$i file.$i ; done"
           % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Updating file permissions failed")
    g.log.info("File permissions updated successfully")

    # Bring brick2 online and check the status
    g.log.info('Bringing brick %s online', bricks_list[1])
    ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[1]])
    self.assertTrue(ret, 'Failed to bring brick %s online'
                    % bricks_list[1])
    g.log.info('Bringing brick %s online is successful', bricks_list[1])

    g.log.info("Verifying if brick %s is online", bricks_list[1])
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Brick %s did not come up" % bricks_list[1])
    g.log.info("Brick %s has come online.", bricks_list[1])

    # Set option self-heal-daemon to ON
    g.log.info('Setting option self-heal-daemon to on...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    g.log.info("Checking if files are in split-brain")
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "Unable to create split-brain scenario")
    g.log.info("Successfully created split brain scenario")

    g.log.info("Resolving split-brain by using the source-brick option "
               "by choosing the second brick as source for all the files")
    node, _ = bricks_list[1].split(':')
    command = ("gluster v heal " + self.volname + " split-brain "
               "source-brick " + bricks_list[1])
    ret, _, _ = g.run(node, command)
    self.assertEqual(ret, 0, "Command execution not successful")

    # Wait for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Heal not completed")

    # Do lookup on the files from mount
    cmd = "ls -lR %s/test_metadata_sb" % self.mounts[0].mountpoint
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to lookup")
    g.log.info("Lookup successful")

    # Check if files are still in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, "File still in split-brain")
    g.log.info("Successfully resolved split brain situation using "
               "CLI based resolution")

    # Check arequals for all the bricks
    g.log.info('Getting arequal for all the bricks after heal...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal after heal is successful')

    # Change metadata of the same files & directories as before
    cmd = ("cd %s/test_metadata_sb &&"
           "for i in `seq 1 2` ; do chmod -R 0555 dir.$i file.$i ; done"
           % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Updating file permissions failed")
    g.log.info("File permissions updated successfully")

    # Do lookup on the mount
    cmd = "find %s | xargs stat" % self.mounts[0].mountpoint
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Lookup on the mount failed")
    g.log.info("Lookup on the mount is successful")

    # Check arequals for all the bricks
    g.log.info('Getting arequal for all the bricks...')
    self.verify_brick_arequals()
    g.log.info('Getting arequal is successful')
def test_heal_for_conservative_merge_with_two_bricks_blame(self):
    """
    1) Create a 1x3 volume and fuse mount the volume
    2) On the mount, create a dir dir1
    3) Pkill glusterfsd on node n1 (b2 on node2 and b3 on node3 up)
    4) touch f{1..10} on the mountpoint
    5) b2 and b3 xattrs would be blaming b1 as files are created while
       b1 is down
    6) Reset the b3 xattrs to NOT blame b1 by using setfattr
    7) Now pkill glusterfsd of b2 on node2
    8) Restart glusterd on node1 to bring up b1
    9) Now bricks: b1 online, b2 down, b3 online
    10) touch x{1..10} under dir1 itself
    11) Again reset the xattr on node3 of b3 so that it doesn't blame b2,
        as done for b1 in step 6
    12) Restart glusterd on node2 hosting b2 to bring all bricks online
    13) Check heal info, split-brain and arequal for the bricks
    """
    # pylint: disable=too-many-locals
    # Create dir `dir1/` on mountpoint
    path = self.mounts[0].mountpoint + "/dir1"
    ret = mkdir(self.mounts[0].client_system, path, parents=True)
    self.assertTrue(ret, "Directory {} creation failed".format(path))

    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
    brick1, brick2, brick3 = all_bricks

    # Bring the first brick offline
    self._bring_brick_offline_and_check(brick1)

    # touch f{1..10} files on the mountpoint
    cmd = ("cd {mpt}; for i in `seq 1 10`; do touch f$i"
           "; done".format(mpt=path))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to create files on mountpoint")

    # Check that the b2 and b3 xattrs are blaming b1 and are the same
    self.assertEqual(self._get_fattr_for_the_brick(brick2),
                     self._get_fattr_for_the_brick(brick3),
                     "Both the bricks' xattrs are not blaming "
                     "brick: {}".format(brick1))

    # Reset the xattrs of dir1 on b3 for brick b1
    first_xattr_to_reset = "trusted.afr.{}-client-0".format(self.volname)
    xattr_value = "0x000000000000000000000000"
    host, brick_path = brick3.split(":")
    brick_path = brick_path + "/dir1"
    ret = set_fattr(host, brick_path, first_xattr_to_reset, xattr_value)
    self.assertTrue(ret, "Unable to set xattr for the directory")

    # Kill brick2 on node2
    self._bring_brick_offline_and_check(brick2)

    # Restart glusterd on node1 to bring brick1 online
    self.assertTrue(restart_glusterd([brick1.split(":")[0]]),
                    "Unable to restart glusterd")

    # Check peer status post glusterd restart
    self._check_peers_status()

    # Check if brick b1 on node1 is online or not
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(online_bricks, "Unable to fetch online bricks")
    self.assertIn(brick1, online_bricks,
                  "Brick:{} is still offline after "
                  "glusterd restart".format(brick1))

    # Create 10 files under dir1 named x{1..10}
    cmd = ("cd {mpt}; for i in `seq 1 10`; do touch x$i"
           "; done".format(mpt=path))
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Unable to create files on mountpoint")

    # Reset the xattrs on brick3 so that it doesn't blame brick2
    second_xattr_to_reset = "trusted.afr.{}-client-1".format(self.volname)
    ret = set_fattr(host, brick_path, second_xattr_to_reset, xattr_value)
    self.assertTrue(ret, "Unable to set xattr for the directory")

    # Bring brick2 online
    self.assertTrue(restart_glusterd([brick2.split(":")[0]]),
                    "Unable to restart glusterd")
    self._check_peers_status()

    self.assertTrue(are_bricks_online(self.mnode, self.volname,
                                      [brick2]),
                    "Brick {} is not online".format(brick2))

    # Check for files in split-brain and for heal completion
    self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                     "Some files are in split brain for "
                     "volume: {}".format(self.volname))
    self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                    "Conservative merge of files failed")

    # Check that the arequal checksum of all the bricks is the same
    ret, arequal_from_the_bricks = collect_bricks_arequal(all_bricks)
    self.assertTrue(ret, "Failed to collect arequal across the bricks "
                    "in the subvol {}".format(all_bricks))
    self.assertEqual(len(set(arequal_from_the_bricks)), 1,
                     "Arequal is not the same on all the bricks "
                     "in the subvol")
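# A minimal sketch of what the xattr reset above does at the brick level.
# AFR pending counters live in trusted.afr.<volname>-client-<N> xattrs on
# the brick backend; zeroing one clears the blame towards that client.
# Paths and <volname> below are placeholders:
#
#   # Inspect blame counters on a brick directory
#   getfattr -d -m . -e hex /bricks/brick1/dir1
#
#   # Clear the blame towards client-0 (the first brick)
#   setfattr -n trusted.afr.<volname>-client-0 \
#       -v 0x000000000000000000000000 /bricks/brick1/dir1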
def test_mkdir_with_subvol_down(self):
    """
    Test mkdir hashed to a down subvol
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements
    # pylint: disable=W0212
    mount_obj = self.mounts[0]
    mountpoint = mount_obj.mountpoint

    # Directories that need to be created
    parent_dir = mountpoint + '/parent'
    child_dir = mountpoint + '/parent/child'

    # Get hashed subvol for the name "parent"
    subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
    hashed, count = find_hashed_subvol(subvols, "/", "parent")
    self.assertIsNotNone(hashed, "Could not find hashed subvol")

    # Bring target_brick offline
    bring_bricks_offline(self.volname, subvols[count])
    ret = are_bricks_offline(self.mnode, self.volname, subvols[count])
    self.assertTrue(ret, ('Error in bringing down subvolume %s',
                          subvols[count]))
    g.log.info('Target subvol is offline')

    # Create parent dir
    ret, _, err = g.run(self.clients[0], ("mkdir %s" % parent_dir))
    self.assertNotEqual(ret, 0, ('Expected mkdir of %s to fail with %s',
                                 parent_dir, err))
    g.log.info('mkdir of dir %s failed as expected', parent_dir)

    # Check that parent_dir does not exist on any brick or client
    brickobject = create_brickobjectlist(subvols, "/")
    for brickdir in brickobject:
        adp = "%s/parent" % brickdir.path
        bpath = adp.split(":")
        self.assertFalse(file_exists(brickdir._host, bpath[1]),
                         ('Expected dir %s not to exist on servers',
                          parent_dir))

    for client in self.clients:
        self.assertFalse(file_exists(client, parent_dir),
                         ('Expected dir %s not to exist on clients',
                          parent_dir))

    g.log.info('dir %s does not exist on mount as expected', parent_dir)

    # Bring up the subvols and create the parent directory
    bring_bricks_online(self.mnode, self.volname, subvols[count],
                        bring_bricks_online_methods=None)
    ret = are_bricks_online(self.mnode, self.volname, subvols[count])
    self.assertTrue(ret, ("Error in bringing back subvol %s online",
                          subvols[count]))
    g.log.info('Subvol is back online')

    ret, _, _ = g.run(self.clients[0], ("mkdir %s" % parent_dir))
    self.assertEqual(ret, 0, ('Expected mkdir of %s to succeed',
                              parent_dir))
    g.log.info('mkdir of dir %s successful', parent_dir)

    # Get hashed subvol for the name "child"
    hashed, count = find_hashed_subvol(subvols, "parent", "child")
    self.assertIsNotNone(hashed, "Could not find hashed subvol")

    # Bring target_brick offline
    bring_bricks_offline(self.volname, subvols[count])
    ret = are_bricks_offline(self.mnode, self.volname, subvols[count])
    self.assertTrue(ret, ('Error in bringing down subvolume %s',
                          subvols[count]))
    g.log.info('Target subvol is offline')

    # Create child dir
    ret, _, err = g.run(self.clients[0], ("mkdir %s" % child_dir))
    self.assertNotEqual(ret, 0, ('Expected mkdir of %s to fail with %s',
                                 child_dir, err))
    g.log.info('mkdir of dir %s failed as expected', child_dir)

    # Check that child_dir does not exist on any brick or client
    for brickdir in brickobject:
        adp = "%s/parent/child" % brickdir.path
        bpath = adp.split(":")
        self.assertFalse(file_exists(brickdir._host, bpath[1]),
                         ('Expected dir %s not to exist on servers',
                          child_dir))

    for client in self.clients:
        self.assertFalse(file_exists(client, child_dir),
                         'Expected dir %s not to exist on clients'
                         % child_dir)
    g.log.info('dir %s does not exist on mount as expected', child_dir)
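# Why the mkdir fails above: in DHT, a directory create must first
# succeed on the subvolume that the name hashes to; with that subvol
# down, the mkdir is aborted everywhere. A small sketch of locating the
# hashed subvol with the same helpers used in the test:
def _log_hashed_subvol(mnode, volname, parent, name):
    """Log which subvol index a directory name hashes to (sketch)."""
    subvols = get_subvols(mnode, volname)['volume_subvols']
    hashed, count = find_hashed_subvol(subvols, parent, name)
    g.log.info("Name %s (under %s) hashes to subvol index %s",
               name, parent, count)
    return hashed, count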
def test_ec_truncate_file_with_brick_down(self):
    """
    Test steps:
    1. Create a volume, start and mount it on a client
    2. Bring down redundant bricks in the subvol
    3. Create a file on the volume using "touch"
    4. Truncate the file using "O_TRUNC"
    5. Bring the bricks online
    6. Write data on the file and wait for heal completion
    7. Check for crashes and coredumps
    """
    # pylint: disable=unsubscriptable-object
    for restart_type in ("volume_start", "node_reboot"):

        # Time stamp from mnode for checking cores at the end of test
        ret, test_timestamp, _ = g.run(self.mnode, "date +%s")
        self.assertEqual(ret, 0, "date command failed")
        test_timestamp = test_timestamp.strip()

        # Create a file using touch
        file_name = self.mounts[0].mountpoint + "/test_1"
        ret, _, err = g.run(self.mounts[0].client_system,
                            "touch {}".format(file_name))
        self.assertEqual(ret, 0, "File creation failed")
        g.log.info("File created successfully")

        # List two bricks in each subvol
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        bricks_to_bring_offline = []
        for subvol in subvols:
            self.assertTrue(subvol, "List is empty")
            bricks_to_bring_offline.extend(sample(subvol, 2))

        # Bring two bricks of each subvol offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Bricks are still online")

        # Validate whether the bricks are offline or not
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Some of the bricks are still online: "
                        "{}".format(bricks_to_bring_offline))

        # Truncate the file
        cmd = (
            'python -c "import os, sys; fd = os.open(\'{}\', os.O_TRUNC )'
            '; os.close( fd )"').format(file_name)
        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info("File truncated successfully")

        # Bring back the bricks online
        if restart_type == "volume_start":
            # Bring back bricks online by volume start
            ret, _, err = volume_start(self.mnode, self.volname,
                                       force=True)
            self.assertEqual(ret, 0, err)
            g.log.info("All bricks are online")
        elif restart_type == "node_reboot":
            # Bring back the bricks online by node restart
            for brick in bricks_to_bring_offline:
                node_to_reboot = brick.split(":")[0]
                ret = reboot_nodes_and_wait_to_come_online(node_to_reboot)
                self.assertTrue(ret, "Reboot failed on node: "
                                "{}".format(node_to_reboot))
                g.log.info("Node: %s rebooted successfully",
                           node_to_reboot)
            time.sleep(60)

        # Check whether the bricks are online or not
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, "Bricks {} are still "
                        "offline".format(bricks_to_bring_offline))

        # Write data to the file
        cmd = ('python -c "import os, sys;fd = os.open(\'{}\', '
               'os.O_RDWR) ;'
               'os.write(fd, \'This is test after truncate\'.encode());'
               ' os.close(fd)"').format(file_name)
        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info("Data written successfully to the file")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal pending for file {}".format(file_name))

        # Check for any crashes on servers and client
        for nodes in (self.servers, [self.clients[0]]):
            ret = is_core_file_created(nodes, test_timestamp)
            self.assertTrue(ret,
                            "Cores found on the {} nodes".format(nodes))
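# The remote truncate one-liner above, as a local sketch: opening an
# existing file with only os.O_TRUNC drops its contents without writing
# any data, which is what exercises the EC truncate path. The file must
# already exist (the test creates it with "touch" first); 'test_file'
# below is a placeholder path.
def _truncate_in_place(path):
    """Truncate an existing file via os.O_TRUNC (sketch)."""
    import os
    fd = os.open(path, os.O_TRUNC)
    os.close(fd)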
def test_validate_authreject_vol(self):
    """
    - Set authentication
    - For all the clients:
      - Fetch the bricks
      - Check if bricks are online
      - Create directory
      - Mount the volume
      - Check if it is mounted
      - Check authentication logs
    - Reset the volume
    - Check if bricks are online
    - Mount the volume on client1
    """
    # pylint: disable=too-many-statements
    # Set Authentication
    option = {"auth.reject": "\"*\""}
    ret = set_volume_options(self.mnode, self.volname, option)
    self.assertTrue(ret, "Failed to set authentication")
    g.log.info("Authentication set successfully")

    for client in self.clients:
        # Fetching all the bricks
        self.mountpoint = '/mnt/testvol'
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Brick list is empty")
        g.log.info("Brick List : %s", bricks_list)

        # Check whether bricks are online
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "Not all bricks are online")
        g.log.info("All bricks are online")

        # Creating directory to mount
        cmd = "mkdir -p /mnt/testvol"
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Failed to create directory")

        # Using this way to check because of bug 1586036
        # Mounting volume
        ret, _, _ = mount_volume(self.volname, self.mount_type,
                                 self.mountpoint,
                                 self.mnode,
                                 client)

        # Checking if volume is mounted
        out = is_mounted(self.volname, self.mountpoint, self.mnode,
                         client, self.mount_type, user='******')
        if (ret == 0) and (not out):
            g.log.error("Mount executed successfully due to bug 1586036")
        elif (ret == 1) and (not out):
            g.log.info("Expected: Mounting has failed successfully")
        else:
            raise ExecutionError("Unexpected: Mounting of volume %s is "
                                 "successful" % self.volname)

        # Checking client logs for authentication error
        cmd = ("grep AUTH_FAILED /var/log/glusterfs/mnt-"
               "testvol.log")
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Mounting has not failed due to "
                         "an authentication error")
        g.log.info("Mounting has failed due to an authentication error")

    # Reset Volume
    ret, _, _ = volume_reset(mnode=self.mnode, volname=self.volname)
    self.assertEqual(ret, 0, "Failed to reset volume")
    g.log.info("Volume %s reset operation is successful", self.volname)

    # Check if bricks are online and mount the volume on client1
    # Fetching bricks
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Brick list is empty")
    g.log.info("Brick List : %s", bricks_list)

    # Checking if bricks are online
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Not all bricks are online")
    g.log.info("All bricks are online")

    # Creating directory to mount
    cmd = "mkdir -p /mnt/testvol"
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "Failed to create directory")

    # Mounting Volume
    ret, _, _ = mount_volume(self.volname, self.mount_type,
                             self.mountpoint, self.servers[0],
                             self.clients[0])
    self.assertEqual(ret, 0, "Failed to mount volume")
    g.log.info("Mounted successfully")

    # Checking if volume is mounted
    out = is_mounted(self.volname, self.mountpoint, self.servers[0],
                     self.clients[0], self.mount_type, user='******')
    self.assertTrue(out, "Volume %s has failed to mount" % self.volname)
    g.log.info("Volume is mounted successfully %s", self.volname)
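# For reference, the volume option exercised above: setting auth.reject
# to "*" denies all clients, and a volume reset restores the default
# (<volname> is a placeholder):
#
#   gluster volume set <volname> auth.reject '*'
#   gluster volume reset <volname>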
def test_glustershd_with_restarting_glusterd(self): """ Test Script to verify the self heal daemon process with restarting glusterd and rebooting the server * stop all volumes * restart glusterd - should not run self heal daemon process * start replicated involved volumes * single self heal daemon process running * restart glusterd * self heal daemon pid will change * bring down brick and restart glusterd * self heal daemon pid will change and is different from previous * bring up the brick """ # pylint: disable=too-many-statements nodes = self.volume['servers'] # stop the volume g.log.info("Stopping the volume %s", self.volname) ret = volume_stop(self.mnode, self.volname) self.assertTrue(ret, ("Failed to stop volume %s" % self.volname)) g.log.info("Successfully stopped volume %s", self.volname) # check the self heal daemon process after stopping the volume g.log.info("Verifying the self heal daemon process for " "volume %s", self.volname) ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname) self.assertFalse(ret, ("Self Heal Daemon process is still running " "even after stopping volume %s" % self.volname)) g.log.info("Self Heal Daemon is not running after stopping " "volume %s", self.volname) # restart glusterd service on all the servers g.log.info("Restarting glusterd on all servers %s", nodes) ret = restart_glusterd(nodes) self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s", nodes)) g.log.info("Successfully restarted glusterd on all nodes %s", nodes) self.assertTrue( wait_for_glusterd_to_start(self.servers), "Failed to start glusterd on %s" % self.servers) # check the self heal daemon process after restarting glusterd process g.log.info("Starting to get self-heal daemon process on" " nodes %s", nodes) ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname) self.assertFalse(ret, ("Self Heal Daemon process is running after " "glusterd restart with volume %s in " "stop state" % self.volname)) g.log.info("Self Heal Daemon is not running after stopping " "volume and restarting glusterd %s", self.volname) # start the volume g.log.info("Starting the volume %s", self.volname) ret = volume_start(self.mnode, self.volname) self.assertTrue(ret, ("Failed to start volume %s" % self.volname)) g.log.info("Volume %s started successfully", self.volname) # Verify glustershd process releases its parent process g.log.info("Checking whether glustershd process is daemonized or not") ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) g.log.info("Single self heal daemon process on all nodes %s", nodes) # get the self heal daemon pids after starting volume g.log.info("Starting to get self-heal daemon process " "on nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) g.log.info("Successful in getting self heal daemon pids") glustershd_pids = pids # get the bricks for the volume g.log.info("Fetching bricks for the volume : %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick List : %s", bricks_list) # validate the bricks present in volume info # with glustershd server volume file g.log.info("Starting parsing file %s on " "node %s", self.glustershd, self.mnode) ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick List from volume info is different from " "glustershd server volume file. " "Please check log file for details.")) g.log.info("Successfully parsed %s file", self.glustershd) # restart glusterd service on all the servers g.log.info("Restarting glusterd on all servers %s", nodes) ret = restart_glusterd(nodes) self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s", nodes)) g.log.info("Successfully restarted glusterd on all nodes %s", nodes) # Verify volume's all process are online for 60 sec g.log.info("Verifying volume's all process are online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname, 60) self.assertTrue(ret, ("Volume %s : All process are not " "online", self.volname)) g.log.info("Successfully Verified volume %s processes are online", self.volname) # Verify glustershd process releases its parent process ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) # check the self heal daemon process after starting volume and # restarting glusterd process g.log.info("Starting to get self-heal daemon process " "on nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) glustershd_pids_after_glusterd_restart = pids self.assertNotEqual(glustershd_pids, glustershd_pids_after_glusterd_restart, ("Self Heal Daemon pids are same after " "restarting glusterd process")) g.log.info("Self Heal Daemon processes are different before and " "after restarting glusterd process") # select bricks to bring offline bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.volname)) bricks_to_bring_offline = list(filter(None, ( bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) # bring bricks offline g.log.info("Going to bring down the brick process " "for %s", bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Failed to bring down the bricks. Please " "check the log file for more details.")) g.log.info("Brought down the brick process " "for %s successfully", bricks_to_bring_offline) # restart glusterd after bringing down the brick g.log.info("Restart glusterd on all servers %s", nodes) ret = restart_glusterd(nodes) self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s", nodes)) g.log.info("Successfully restarted glusterd on all nodes %s", nodes) # Verify volume's all process are online for 60 sec g.log.info("Verifying volume's all process are online") ret = wait_for_volume_process_to_be_online(self.mnode, self.volname, 60) self.assertTrue(ret, ("Volume %s : All process are not " "online", self.volname)) g.log.info("Successfully Verified volume %s processes are online", self.volname) # Verify glustershd process releases its parent process ret = is_shd_daemonized(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) # check the self heal daemon process after killing brick and # restarting glusterd process g.log.info("Starting to get self-heal daemon process " "on nodes %s", nodes) ret, pids = get_self_heal_daemon_pid(nodes) self.assertTrue(ret, ("Either No self heal daemon process found or " "more than One self heal daemon process found")) glustershd_pids_after_killing_brick = pids self.assertNotEqual(glustershd_pids_after_glusterd_restart, glustershd_pids_after_killing_brick, ("Self Heal Daemon pids are same as before " "killing the brick and restarting glusterd process")) g.log.info("Self Heal Daemon processes are different after killing the " "brick and restarting the glusterd process") # bring the bricks online g.log.info("Bringing the bricks %s online", bricks_to_bring_offline) ret = bring_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Failed to bring the bricks online")) g.log.info("Successfully brought the bricks online") # check all bricks are online g.log.info("Verifying all bricks are online or not.....") ret = are_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, ("Not all bricks are online")) g.log.info("All bricks are online.")
def test_afr_gfid_heal(self): """ Description: This test case runs split-brain resolution on 5 files in split-brain on a 1x2 volume. After resolving split-brain, it makes sure that split brain resolution doesn't work on files already in split brain. """ g.log.info("disabling the self heal daemon") ret = disable_self_heal_daemon(self.mnode, self.volname) self.assertTrue(ret, "unable to disable self heal daemon") g.log.info("Successfully disabled the self heal daemon") # getting list of all bricks all_bricks = get_all_bricks(self.mnode, self.volname) self.assertIsNotNone(all_bricks, "failed to get list of bricks") g.log.info("bringing down brick1") ret = bring_bricks_offline(self.volname, all_bricks[0:1]) self.assertTrue(ret, "unable to bring brick1 offline") g.log.info("Successfully brought the following brick offline " ": %s", str(all_bricks[0])) g.log.info("verifying if brick1 is offline") ret = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1]) self.assertTrue(ret, "brick1 is still online") g.log.info("verified: brick1 is offline") g.log.info("creating 5 files from mount point") all_mounts_procs = [] for mount_obj in self.mounts: cmd = ("python %s create_files " "-f 5 --base-file-name test_file --fixed-file-size 1k %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) # Validate I/O g.log.info("Wait for IO to complete and validate IO.....") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("IO is successful on all mounts") g.log.info("Successfully created a file from mount point") g.log.info("bringing brick1 back online") ret = bring_bricks_online(self.mnode, self.volname, all_bricks[0:1]) self.assertTrue(ret, "unable to bring brick1 online") g.log.info("Successfully brought the following brick online " ": %s", str(all_bricks[0])) g.log.info("verifying if brick1 is online") ret = are_bricks_online(self.mnode, self.volname, all_bricks[0:1]) self.assertTrue(ret, "brick1 is not online") g.log.info("verified: brick1 is online") g.log.info("bringing down brick2") ret = bring_bricks_offline(self.volname, all_bricks[1:2]) self.assertTrue(ret, "unable to bring brick2 offline") g.log.info("Successfully brought the following brick offline " ": %s", str(all_bricks[1])) g.log.info("verifying if brick2 is offline") ret = are_bricks_offline(self.mnode, self.volname, all_bricks[1:2]) self.assertTrue(ret, "brick2 is still online") g.log.info("verified: brick2 is offline") g.log.info("creating 5 new files of same name from mount point") all_mounts_procs = [] for mount_obj in self.mounts: cmd = ("python %s create_files " "-f 5 --base-file-name test_file --fixed-file-size 10k %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) # Validate I/O g.log.info("Wait for IO to complete and validate IO.....") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("IO is successful on all mounts") g.log.info("Successfully created a new file of same name " "from mount point") g.log.info("bringing brick2 back online") ret = bring_bricks_online(self.mnode, self.volname, all_bricks[1:2]) self.assertTrue(ret, "unable to bring brick2 online") g.log.info("Successfully brought the following brick online " ": %s", str(all_bricks[1])) g.log.info("verifying if brick2 is online") ret = are_bricks_online(self.mnode, self.volname, all_bricks[1:2]) self.assertTrue(ret, "brick2 is not online") g.log.info("verified: brick2 is online") g.log.info("enabling the self heal daemon") ret = enable_self_heal_daemon(self.mnode, self.volname) self.assertTrue(ret, "failed to enable self heal daemon") g.log.info("Successfully enabled the self heal daemon") g.log.info("checking if volume is in split-brain") ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertTrue(ret, "unable to create split-brain scenario") g.log.info("Successfully created split brain scenario") g.log.info("resolving split-brain by choosing first brick as " "the source brick") node, brick_path = all_bricks[0].split(':') for fcount in range(5): command = ("gluster v heal " + self.volname + " split-brain " "source-brick " + all_bricks[0] + ' /test_file' + str(fcount) + '.txt') ret, _, _ = g.run(node, command) self.assertEqual(ret, 0, "command execution not successful") # triggering heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, "heal not triggered") g.log.info("Successfully triggered heal") # waiting for heal to complete ret = monitor_heal_completion(self.mnode, self.volname, timeout_period=240) self.assertTrue(ret, "heal not completed") g.log.info("Heal completed successfully") # checking if any file is in split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, "file still in split-brain") g.log.info("Successfully resolved split brain situation using " "CLI based resolution") g.log.info("resolving split-brain on a file not in split-brain") node, brick_path = all_bricks[0].split(':') command = ("gluster v heal " + self.volname + " split-brain " "source-brick " + all_bricks[1] + " /test_file0.txt") ret, _, _ = g.run(node, command) self.assertNotEqual( ret, 0, "Unexpected: split-brain resolution " "command is successful on a file which" " is not in split-brain") g.log.info("Expected: split-brain resolution command failed on " "a file which is not in split-brain") g.log.info("checking the split-brain status of each file") for fcount in range(5): fpath = (self.mounts[0].mountpoint + '/test_file' + str(fcount) + '.txt') status = get_fattr(self.mounts[0].client_system, fpath, 'replica.split-brain-status') compare_string = ("The file is not under data or metadata " "split-brain") self.assertEqual( status.rstrip('\x00'), compare_string, "file test_file%s is under" " split-brain" % str(fcount)) g.log.info("none of the files are under split-brain")
def test_heal_when_dir_quota_exceeded(self): # Create a directory to set the quota_limit_usage path = "/dir" g.log.info("Creating a directory") self.all_mounts_procs = [] for mount_object in self.mounts: cmd = ("python %s create_deep_dir -d 0 -l 0 %s%s " % (self.script_upload_path, mount_object.mountpoint, path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertEqual(ret, 0, "Failed to create directory on mountpoint") g.log.info("Directory created successfully on mountpoint") # Enable Quota g.log.info("Enabling quota on the volume %s", self.volname) ret, _, _ = quota_enable(self.mnode, self.volname) self.assertEqual(ret, 0, ("Failed to enable quota on the volume " "%s", self.volname)) g.log.info("Successfully enabled quota on the volume %s", self.volname) # Set quota-soft-timeout to 0 g.log.info("Setting up soft timeout to 0") ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, "0") self.assertEqual(ret, 0, ("Failed to set quota-soft-timeout")) g.log.info("Successfully set the quota-soft-timeout") # Set quota-hard-timeout to 0 g.log.info("Setting up hard timeout to 0") ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, "0") self.assertEqual(ret, 0, ("Failed to set quota-hard-timeout")) g.log.info("Successfully set the quota-hard-timeout") # Set Quota limit on the newly created directory g.log.info("Set Quota Limit on the path %s of the volume %s", path, self.volname) ret, _, _ = quota_limit_usage(self.mnode, self.volname, path=path, limit="1GB") self.assertEqual(ret, 0, ("Failed to set quota limit on path %s of " "the volume %s", path, self.volname)) g.log.info("Successfully set the Quota limit on %s of the volume " "%s", path, self.volname) # Create 2 files of size 400MB inside the directory for mount_object in self.mounts: g.log.info("Creating Files on %s:%s", mount_object.client_system, path) cmd = ("cd %s%s && for i in `seq 1 2` ;" "do dd if=/dev/urandom of=file$i bs=20M " "count=20; done" % (mount_object.mountpoint, path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertEqual(ret, 0, ("Failed to create files on %s", path)) g.log.info("Files created successfully on mountpoint") bricks_list = get_all_bricks(self.mnode, self.volname) # Bring brick3 (bricks_list[2]) offline g.log.info('Bringing brick %s offline', bricks_list[2]) ret = bring_bricks_offline(self.volname, bricks_list[2]) self.assertTrue(ret, 'Failed to bring brick %s offline' % bricks_list[2]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2]) g.log.info('Bringing brick %s offline is successful', bricks_list[2]) # Create a file of size 500MB inside the directory and it should fail # as the quota limit exceeds cmd = ("cd %s%s && dd if=/dev/urandom of=file3 bs=20M count=25" % (mount_object.mountpoint, path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertEqual(ret, 1, ("Writing a file of 500MB succeeded while " "it was not supposed to.")) g.log.info("Writing a file of size 500MB failed as expected " "due to quota limit on the directory.") # Bring brick3 online and check status g.log.info('Bringing brick %s online...', bricks_list[2]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Failed to bring brick %s online' % bricks_list[2]) g.log.info('Bringing brick %s online is successful', bricks_list[2]) g.log.info("Verifying if brick %s is online", bricks_list[2]) ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick %s did not come up", bricks_list[2])) g.log.info("Brick %s has come online.", bricks_list[2]) # Trigger heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Starting heal failed') g.log.info('Index heal launched') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully')
def test_heal_when_quota_object_limit_exceeded(self): # Create a directory to set the quota_limit_objects path = "/dir" g.log.info("Creating a directory") self.all_mounts_procs = [] for mount_object in self.mounts: cmd = "/usr/bin/env python %s create_deep_dir -d 0 -l 0 %s%s" % ( self.script_upload_path, mount_object.mountpoint, path) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertEqual(ret, 0, "Failed to create directory on mountpoint") g.log.info("Directory created successfully on mountpoint") # Enable Quota g.log.info("Enabling quota on the volume %s", self.volname) ret, _, _ = quota_enable(self.mnode, self.volname) self.assertEqual(ret, 0, ("Failed to enable quota on the volume " "%s", self.volname)) g.log.info("Successfully enabled quota on the volume %s", self.volname) # Set quota-soft-timeout to 0 g.log.info("Setting up soft timeout to 0") ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, "0") self.assertEqual(ret, 0, ("Failed to set quota-soft-timeout")) g.log.info("Successfully set the quota-soft-timeout") # Set quota-hard-timeout to 0 g.log.info("Setting up hard timeout to 0") ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, "0") self.assertEqual(ret, 0, ("Failed to set quota-hard-timeout")) g.log.info("Successfully set the quota-hard-timeout") # Set Quota limit on the newly created directory g.log.info("Set Quota Limit on the path %s of the volume %s", path, self.volname) ret, _, _ = quota_limit_objects(self.mnode, self.volname, path=path, limit="5") self.assertEqual(ret, 0, ("Failed to set quota limit on path %s of " "the volume %s", path, self.volname)) g.log.info( "Successfully set the quota limit on %s of the volume " "%s", path, self.volname) # Create 3 files inside the directory for mount_object in self.mounts: g.log.info("Creating Files on %s:%s", mount_object.client_system, path) cmd = ("/usr/bin/env python %s create_files -f 3 " "--base-file-name file-0 %s%s" % (self.script_upload_path, mount_object.mountpoint, path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertEqual(ret, 0, ("Failed to create files on %s", path)) g.log.info("Files created successfully on mountpoint") bricks_list = get_all_bricks(self.mnode, self.volname) # Bring brick3 offline g.log.info('Bringing brick %s offline', bricks_list[2]) ret = bring_bricks_offline(self.volname, bricks_list[2]) self.assertTrue(ret, 'Failed to bring brick %s offline' % bricks_list[2]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2]) g.log.info('Bringing brick %s offline is successful', bricks_list[2]) # Try creating 5 more files, which should fail as the quota limit # exceeds cmd = ("/usr/bin/env python %s create_files -f 5 --base-file-name " "file-1 %s%s" % (self.script_upload_path, mount_object.mountpoint, path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertNotEqual(ret, 0, ("Creating 5 files succeeded while it was " "not supposed to.")) g.log.info("Creating 5 files failed as expected due to quota object " "limit on the directory.") # Bring brick3 online and check status g.log.info('Bringing brick %s online', bricks_list[2]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Failed to bring brick %s online' % bricks_list[2]) g.log.info('Bringing brick %s online is successful', bricks_list[2]) g.log.info("Verifying if brick %s is online", bricks_list[2]) ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick %s did not come up", bricks_list[2])) g.log.info("Brick %s has come online.", bricks_list[2]) # Trigger heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Starting heal failed') g.log.info('Index heal launched') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully')
def test_files_on_mount(self): """ Description: - I/O on the mounts - kill brick in cyclic order - list the files after healing """ # IO on the mount point # Each client will write 2 files each of 1 GB and keep # modifying the same file g.log.info("Starting IO on all mounts...") for mount_obj in self.mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("python %s " "--file-sizes-list 1G " "--chunk-sizes-list 128 " "--write-time 900 " "--num-of-files 2 " "--base-file-name test_brick_down_from_client_%s.txt " "--dir %s " % (self.script_upload_path, mount_obj.client_system, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) self.all_mounts_procs.append(proc) self.counter = self.counter + 10 self.io_validation_complete = False # Killing bricks in cyclic order bricks_list = get_all_bricks(self.mnode, self.volname) # Total number of cyclic brick-down cycles to be executed number_of_cycles = 0 while number_of_cycles < 3: number_of_cycles += 1 for brick in bricks_list: # Bring brick offline g.log.info('Bringing bricks %s offline', brick) ret = bring_bricks_offline(self.volname, [brick]) self.assertTrue(ret, ("Failed to bring bricks %s offline" % brick)) ret = are_bricks_offline(self.mnode, self.volname, [brick]) self.assertTrue(ret, 'Bricks %s are not offline' % brick) g.log.info('Bringing bricks %s offline is successful', brick) # Introducing 30 second sleep when brick is down g.log.info( "Waiting for 30 seconds, with ongoing IO while " "brick %s is offline", brick) time.sleep(30) # Bring brick online g.log.info('Bringing bricks %s online', brick) ret = bring_bricks_online(self.mnode, self.volname, [brick]) self.assertTrue(ret, ("Failed to bring bricks %s online " % brick)) g.log.info('Bricks %s are online', brick) # Check if bricks are online ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, 'Bricks %s are not online' % bricks_list) g.log.info('Bricks %s are online', bricks_list) # Check daemons g.log.info('Checking daemons...') ret = are_all_self_heal_daemons_are_online( self.mnode, self.volname) self.assertTrue(ret, ("Some of the self-heal Daemons are " "offline")) g.log.info('All self-heal Daemons are online') # Validate IO self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts), "IO failed on some of the clients") self.io_validation_complete = True # Checking volume status g.log.info( "Logging volume info and Status after bringing bricks " "offline from the volume %s", self.volname) ret = log_volume_info_and_status(self.mnode, self.volname) self.assertTrue(ret, ("Logging volume info and status failed on " "volume %s" % self.volname)) g.log.info("Successful in logging volume info and status of volume %s", self.volname) # Monitoring heals on the volume g.log.info("Wait for self-heal to complete on the volume") ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, ("Self heal didn't complete even after waiting " "for 20 minutes.")) g.log.info("self-heal is successful after the cyclic brick-down " "cycles") # List all files and dirs created g.log.info("List all files and directories:") ret = list_all_files_and_dirs_mounts(self.mounts) self.assertTrue(ret, "Failed to list all files and dirs") g.log.info("Listing all files and directories is successful")
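A compact sketch of the cyclic brick-down loop above, parameterized on cycle count and downtime. The calls are the brick_libs helpers already used in this test; the function name and defaults are illustrative:

import time

def cycle_bricks(mnode, volname, bricks, cycles=3, downtime=30):
    # Bring each brick down in turn, hold it down while IO continues,
    # bring it back, and verify the whole set is online after each cycle.
    for _ in range(cycles):
        for brick in bricks:
            assert bring_bricks_offline(volname, [brick])
            assert are_bricks_offline(mnode, volname, [brick])
            time.sleep(downtime)  # IO keeps running against the other bricks
            assert bring_bricks_online(mnode, volname, [brick])
        assert are_bricks_online(mnode, volname, bricks)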
def test_entry_heal_with_quota(self): """ - Create a 1x3 volume - Set quota object limit - Create files less than the limit - Bring down a brick and create more files until limit is hit - Delete one file so that we are below the limit, and create one more file - Bring the brick back up and launch heal - Verify that after heal is complete, the deleted file does not re-appear in any of the bricks. """ # pylint: disable=too-many-statements # Enable Quota g.log.info("Enabling quota on the volume %s", self.volname) ret, _, _ = quota_enable(self.mnode, self.volname) self.assertEqual( ret, 0, ("Failed to enable quota on the volume %s", self.volname)) g.log.info("Successfully enabled quota on the volume %s", self.volname) # Check if quota is enabled g.log.info("Validate Quota is enabled on the volume %s", self.volname) ret = is_quota_enabled(self.mnode, self.volname) self.assertTrue( ret, ("Quota is not enabled on the volume %s", self.volname)) g.log.info("Successfully Validated quota is enabled on volume %s", self.volname) # Set quota related options options = { "quota-deem-statfs": "on", "soft-timeout": "0", "hard-timeout": "0" } g.log.info("setting quota volume options %s", options) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set volume option %s for " "volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # Create directory on mount ret = mkdir(self.mounts[0].client_system, "%s/dir" % self.mounts[0].mountpoint) self.assertTrue(ret, "mkdir failed") # Set Quota limit on the directory path = "/dir" g.log.info( "Setting Quota Limit object on the path %s of the " "volume %s", path, self.volname) ret, _, _ = quota_limit_objects(self.mnode, self.volname, path=path, limit="10") self.assertEqual(ret, 0, ("Failed to set quota limit object " "on path %s of the volume %s", path, self.volname)) g.log.info( "Successfully set the Quota limit object on %s of the " "volume %s", path, self.volname) cmd = ("touch %s/dir/file{1..5}" % self.mounts[0].mountpoint) ret, _, _ = g.run(self.clients[0], cmd) self.assertEqual(ret, 0, "file creation failed") # Bring brick3 offline bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info('Bringing brick %s offline', bricks_list[2]) ret = bring_bricks_offline(self.volname, bricks_list[2]) self.assertTrue(ret, 'Failed to bring brick %s offline' % bricks_list[2]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2]) g.log.info('Bringing brick %s offline was successful', bricks_list[2]) # Create files until quota object limit cmd = ("touch %s/dir/file{6..9}" % self.mounts[0].mountpoint) ret, _, _ = g.run(self.clients[0], cmd) self.assertEqual(ret, 0, "file creation failed") # The next create must fail cmd = ("touch %s/dir/file10" % self.mounts[0].mountpoint) ret, _, _ = g.run(self.clients[0], cmd) self.assertEqual( ret, 1, ("Creation of %s/dir/file10 succeeded while " "it was not supposed to." % self.mounts[0].mountpoint)) g.log.info( "Creation of %s/dir/file10 failed as expected due to " "quota object limit.", self.mounts[0].mountpoint) # Delete one file and re-try the create to succeed. 
cmd = ("rm %s/dir/file1" % self.mounts[0].mountpoint) ret, _, _ = g.run(self.clients[0], cmd) self.assertEqual(ret, 0, "File deletion failed") cmd = ("touch %s/dir/file10" % self.mounts[0].mountpoint) ret, _, _ = g.run(self.clients[0], cmd) self.assertEqual(ret, 0, "File creation failed") # Bring brick3 online and check status g.log.info('Bringing brick %s online...', bricks_list[2]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Failed to bring brick %s online' % bricks_list[2]) g.log.info('Bringing brick %s online is successful', bricks_list[2]) g.log.info("Verifying if brick3 is online....") ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("brick3 did not come up")) g.log.info("brick3 has come online.") # Trigger heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Starting heal failed') g.log.info('Index heal launched') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully') # Verify that file10 did not get recreated on the down brick by an # accidental conservative merge. for brick in bricks_list: node, brick_path = brick.split(':') ret, _, _ = g.run(node, 'stat %s/dir/file10' % brick_path) self.assertFalse(ret, 'File present!')
def test_snap_self_heal(self): """ Steps: 1. create a volume 2. mount volume 3. create snapshot of that volume 4. Activate snapshot 5. Clone snapshot and Mount 6. Perform I/O 7. Bring Down Few bricks from volume without affecting the volume or cluster. 8. Perform I/O 9. Bring the down bricks back online 10. Validate heal is complete with arequal """ # pylint: disable=too-many-statements, too-many-locals # Creating snapshot: g.log.info("Starting to Create snapshot") ret, _, _ = snap_create(self.mnode, self.volname, self.snap) self.assertEqual( ret, 0, ("Failed to create snapshot for volume %s" % self.volname)) g.log.info("Snapshot %s created successfully for volume %s", self.snap, self.volname) # Activating snapshot g.log.info("Starting to Activate Snapshot") ret, _, _ = snap_activate(self.mnode, self.snap) self.assertEqual(ret, 0, ("Failed to Activate snapshot %s" % self.snap)) g.log.info("Snapshot %s activated successfully", self.snap) # snapshot list ret, _, _ = snap_list(self.mnode) self.assertEqual(ret, 0, ("Failed to list all the snapshots")) g.log.info("Snapshot list command was successful") # Creating a Clone volume from snapshot: g.log.info("Starting to Clone volume from Snapshot") ret, _, _ = snap_clone(self.mnode, self.snap, self.clone) self.assertEqual(ret, 0, ("Failed to clone %s from snapshot %s" % (self.clone, self.snap))) g.log.info("%s created successfully", self.clone) # start the cloned volume g.log.info("Starting the cloned volume") ret, _, _ = volume_start(self.mnode, self.clone) self.assertEqual(ret, 0, "Failed to start clone %s" % self.clone) g.log.info("clone volume %s started successfully", self.clone) # Mounting a clone volume g.log.info("Mounting a clone volume") ret, _, _ = mount_volume(self.clone, self.mount_type, self.mount1, self.mnode, self.clients[0]) self.assertEqual(ret, 0, "Failed to mount clone Volume %s" % self.clone) g.log.info("Clone volume %s mounted Successfully", self.clone) # Checking cloned volume mounted or not ret = is_mounted(self.clone, self.mount1, self.mnode, self.clients[0], self.mount_type) self.assertTrue( ret, "Failed to mount clone volume on mount point: %s" % self.mount1) g.log.info("clone Volume %s mounted on %s", self.clone, self.mount1) # write files on all mounts g.log.info("Starting IO on all mounts...") g.log.info("mounts: %s", self.mount1) all_mounts_procs = [] cmd = ("python %s create_files " "-f 10 --base-file-name file %s" % (self.script_upload_path, self.mount1)) proc = g.run(self.clients[0], cmd) all_mounts_procs.append(proc) g.log.info("Successful in creating I/O on mounts") # get the bricks from the volume g.log.info("Fetching bricks for the volume : %s", self.clone) bricks_list = get_all_bricks(self.mnode, self.clone) g.log.info("Brick List : %s", bricks_list) # Select bricks to bring offline g.log.info("Starting to bring bricks offline") bricks_to_bring_offline_dict = (select_bricks_to_bring_offline( self.mnode, self.clone)) bricks_to_bring_offline = list(filter( None, (bricks_to_bring_offline_dict['hot_tier_bricks'] + bricks_to_bring_offline_dict['cold_tier_bricks'] + bricks_to_bring_offline_dict['volume_bricks']))) g.log.info("Bricks to bring offline: %s ", bricks_to_bring_offline) ret = bring_bricks_offline(self.clone, bricks_to_bring_offline) self.assertTrue(ret, "Failed to bring the bricks offline") g.log.info("Successful in bringing bricks: %s offline", bricks_to_bring_offline) # Offline Bricks list offline_bricks = get_offline_bricks_list(self.mnode, self.clone) self.assertIsNotNone( offline_bricks, "Failed to get offline brick list " "for volume %s" % self.clone) for brick in offline_bricks: self.assertIn(brick, bricks_to_bring_offline, "Failed to validate " "bricks offline") g.log.info("Bricks Offline: %s", offline_bricks) # Online Bricks list online_bricks = get_online_bricks_list(self.mnode, self.clone) self.assertIsNotNone( online_bricks, "Failed to get online bricks" " for volume %s" % self.clone) g.log.info("Bricks Online: %s", online_bricks) # write files on mountpoint g.log.info("Starting IO on all mounts...") g.log.info("mounts: %s", self.mount1) all_mounts_procs = [] cmd = ("python %s create_files " "-f 10 --base-file-name file %s" % (self.script_upload_path, self.mount1)) proc = g.run(self.clients[0], cmd) all_mounts_procs.append(proc) g.log.info("Successful in creating I/O on mounts") # Bring all bricks online g.log.info("bring all bricks online") ret = bring_bricks_online(self.mnode, self.clone, bricks_to_bring_offline) self.assertTrue(ret, "Failed to bring bricks online") g.log.info("Successful in bringing all bricks online") # Validate Bricks are online g.log.info("Validating all bricks are online") ret = are_bricks_online(self.mnode, self.clone, bricks_list) self.assertTrue(ret, "Failed to bring all the bricks online") g.log.info("bricks online: %s", bricks_list) # Wait for volume processes to be online g.log.info("Wait for volume processes to be online") ret = wait_for_volume_process_to_be_online(self.mnode, self.clone) self.assertTrue(ret, ("Failed to wait for volume %s processes to " "be online" % self.clone)) g.log.info( "Successful in waiting for volume %s processes to be " "online", self.clone) # Verify volume's all process are online g.log.info("Verifying volume's all process are online") ret = verify_all_process_of_volume_are_online(self.mnode, self.clone) self.assertTrue( ret, ("Volume %s : All process are not online" % self.clone)) g.log.info("Volume %s : All process are online", self.clone) # wait for the heal process to complete g.log.info("waiting for heal process to complete") ret = monitor_heal_completion(self.mnode, self.clone) self.assertTrue(ret, "Failed to complete the heal process") g.log.info("Successfully completed heal process") # Check arequal # get the subvolumes g.log.info("Starting to get sub-volumes for volume %s", self.clone) subvols = get_subvols(self.mnode, self.clone) num_subvols = len(subvols['volume_subvols']) g.log.info("Number of subvolumes in volume %s: %s", self.clone, num_subvols) # Get arequals and compare g.log.info("Starting to compare arequals") for i in range(0, num_subvols): # Get arequal for first brick subvol_brick_list = subvols['volume_subvols'][i] node, brick_path = subvol_brick_list[0].split(':') command = ('arequal-checksum -p %s ' '-i .glusterfs -i .landfill -i .trashcan' % brick_path) ret, arequal, _ = g.run(node, command) first_brick_total = arequal.splitlines()[-1].split(':')[-1] # Get arequal for every brick and compare with first brick for brick in subvol_brick_list: node, brick_path = brick.split(':') command = ('arequal-checksum -p %s ' '-i .glusterfs -i .landfill -i .trashcan' % brick_path) ret, brick_arequal, _ = g.run(node, command) self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick) g.log.info('Getting arequal for %s is successful', brick) brick_total = brick_arequal.splitlines()[-1].split(':')[-1] self.assertEqual( first_brick_total, brick_total, 'Arequals for subvol and %s are not equal' % brick) g.log.info('Arequals for subvol and %s are equal', brick) g.log.info('All arequals are equal for distributed-replicated')
def test_ec_lookup_and_move_operations_few_bricks_are_offline(self): """ Test Steps: 1. Mount this volume on 3 mount points, c1, c2, and c3 2. Bring down two bricks offline in each subvol. 3. On client1: under dir1 create files f{1..10000} run in background 4. On client2: under root dir of mountpoint touch x{1..1000} 5. On client3: after step 4 action completed, start creating x{1000..10000} 6. Bring the offline bricks online (all the bricks which were down, 2 in each of the two subvols) 7. While IO on Client1 and Client3 were happening, On client2 move all the x* files into dir1 8. Perform lookup from client 3 """ # List two bricks in each subvol all_subvols_dict = get_subvols(self.mnode, self.volname) subvols = all_subvols_dict['volume_subvols'] bricks_to_bring_offline = [] for subvol in subvols: self.assertTrue(subvol, "List is empty") bricks_to_bring_offline.extend(sample(subvol, 2)) # Bring two bricks of each subvol offline ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue(ret, "Bricks are still online") g.log.info("Bricks are offline %s", bricks_to_bring_offline) # Validating the bricks are offline or not ret = are_bricks_offline(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, "Few of the bricks are still online in " "{}".format(bricks_to_bring_offline)) g.log.info("%s bricks are offline as expected", bricks_to_bring_offline) # Create directory on client1 dir_on_mount = self.mounts[0].mountpoint + '/dir1' ret = mkdir(self.mounts[0].client_system, dir_on_mount) self.assertTrue(ret, "unable to create directory on client1 " "{}".format(self.mounts[0].client_system)) g.log.info("Dir1 created on %s successfully", self.mounts[0].client_system) # Next IO to be run in the background so using mount_procs # and run_async. self.mount_procs = [] # On client1: under dir1 create files f{1..10000} run in background self._run_create_files(file_count=10000, base_name="f_", mpoint=dir_on_mount, client=self.mounts[0].client_system) # On client2: under root dir of the mountpoint touch x{1..1000} cmd = ("/usr/bin/env python {} create_files -f 1000 --fixed-file-size" " 10k --base-file-name x {}".format(self.script_upload_path, self.mounts[1].mountpoint)) ret, _, err = g.run(self.mounts[1].client_system, cmd) self.assertEqual(ret, 0, "File creation failed on {} with {}". format(self.mounts[1].client_system, err)) g.log.info("File creation successful on %s", self.mounts[1].client_system) # On client3: start creating x{1000..10000} cmd = ("cd {}; for i in `seq 1000 10000`; do touch x$i; done; " "cd -".format(self.mounts[2].mountpoint)) proc = g.run_async(self.mounts[2].client_system, cmd) self.mount_procs.append(proc) # Bring bricks online with volume start force ret, _, err = volume_start(self.mnode, self.volname, force=True) self.assertEqual(ret, 0, err) g.log.info("Volume: %s started successfully", self.volname) # Check whether bricks are online or not ret = are_bricks_online(self.mnode, self.volname, bricks_to_bring_offline) self.assertTrue(ret, "Bricks {} are still offline". format(bricks_to_bring_offline)) g.log.info("Bricks %s are online now", bricks_to_bring_offline) # From client2 move all the files with name starting with x into dir1 cmd = ("for i in `seq 0 999`; do mv {}/x$i.txt {}; " "done".format(self.mounts[1].mountpoint, dir_on_mount)) proc = g.run_async(self.mounts[1].client_system, cmd) self.mount_procs.append(proc) # Perform a lookup in loop from client3 for 20 iterations cmd = ("ls -R {}".format(self.mounts[2].mountpoint)) counter = 20 while counter: ret, _, err = g.run(self.mounts[2].client_system, cmd) self.assertEqual(ret, 0, "ls failed while the mv operation was in " "progress: {}".format(err)) g.log.debug("ls successful for the %s time", 21 - counter) counter -= 1 self.assertTrue(validate_io_procs(self.mount_procs, self.mounts), "IO failed on the clients") # Emptying mount_procs for not validating IO in tearDown self.mount_procs *= 0 # Wait for heal to complete ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, "Heal didn't complete in the expected time") g.log.info("Heal completed successfully on %s volume", self.volname)
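The 20-iteration lookup loop above, as a reusable sketch; purely illustrative:

def lookup_loop(client, mountpoint, iterations=20):
    # Repeated recursive listings must keep succeeding while the moves
    # and creates started above are still in flight.
    for i in range(iterations):
        ret, _, err = g.run(client, "ls -R %s" % mountpoint)
        assert ret == 0, "lookup %d failed: %s" % (i + 1, err)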
def test_replace_brick_quorum(self): ''' -> Create volume -> Set quorum type -> Set quorum ratio to 95% -> Start the volume -> Stop the glusterd on one node -> Now quorum is not met -> Check whether all bricks went offline -> Perform replace brick operation -> Start glusterd on same node which is already stopped -> Check whether all bricks are online -> Verify in vol info that old brick is not replaced with new brick ''' # Forming brick list, 6 bricks for creating volume, 7th brick for # performing replace brick operation brick_list = form_bricks_list(self.mnode, self.volname, 7, self.servers, self.all_servers_info) # Create Volume ret, _, _ = volume_create(self.mnode, self.volname, brick_list[0:6], replica_count=3) self.assertEqual(ret, 0, "Failed to create volume %s" % self.volname) g.log.info("Volume created successfully %s", self.volname) # Enabling server quorum ret = set_volume_options(self.mnode, self.volname, {'cluster.server-quorum-type': 'server'}) self.assertTrue( ret, "Failed to set server quorum on volume %s" % self.volname) g.log.info("Able to set server quorum successfully on volume %s", self.volname) # Setting Quorum ratio in percentage ret = set_volume_options(self.mnode, 'all', {'cluster.server-quorum-ratio': '95%'}) self.assertTrue( ret, "Failed to set server quorum ratio on %s" % self.servers) g.log.info("Able to set server quorum ratio successfully on %s", self.servers) # Start the volume ret, _, _ = volume_start(self.mnode, self.volname) self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname) g.log.info("Volume started successfully %s", self.volname) # Stop glusterd on one of the nodes random_server = random.choice(self.servers[1:]) ret = stop_glusterd(random_server) self.assertTrue(ret, "Failed to stop glusterd for %s" % random_server) g.log.info("Glusterd stopped successfully on server %s", random_server) # Checking whether glusterd is running or not ret = is_glusterd_running(random_server) self.assertEqual( ret, 1, "Glusterd is still running on the node %s " "where glusterd stopped" % random_server) g.log.info("Glusterd is not running on the server %s", random_server) # Verifying node count in volume status after glusterd stopped # on one of the servers, It's not possible to check the brick status # immediately in volume status after glusterd stop count = 0 while count < 100: vol_status = get_volume_status(self.mnode, self.volname) servers_count = len(vol_status[self.volname].keys()) if servers_count == 5: break sleep(2) count += 1 # creating brick list from volume status offline_bricks = [] vol_status = get_volume_status(self.mnode, self.volname) for node in vol_status[self.volname]: for brick_path in vol_status[self.volname][node]: if brick_path != 'Self-heal Daemon': offline_bricks.append(':'.join([node, brick_path])) # Checking bricks are offline or not with quorum ratio(95%) ret = are_bricks_offline(self.mnode, self.volname, offline_bricks) self.assertTrue( ret, "Bricks are online even when quorum is not met " "for %s" % self.volname) g.log.info( "Bricks are offline when quorum is not met " "for %s", self.volname) # Getting random brick from offline brick list self.random_brick = random.choice(offline_bricks) # Performing replace brick commit force when quorum not met self.replace_brick_failed = False ret, _, _ = replace_brick(self.mnode, self.volname, self.random_brick, brick_list[6]) self.assertNotEqual( ret, 0, "Replace brick should fail when quorum is " "not met, but replace brick succeeded on %s" % self.volname) g.log.info( "Replace brick failed as expected when quorum is not met " "on %s", self.volname) self.replace_brick_failed = True # Start glusterd on the stopped node ret = start_glusterd(random_server) self.assertTrue( ret, "Failed to start glusterd on server %s" % random_server) g.log.info("Glusterd started successfully on server %s", random_server) # Verifying node count in volume status after glusterd started # on one of the servers, It's not possible to check the brick status # immediately in volume status after glusterd start count = 0 while count < 100: vol_status = get_volume_status(self.mnode, self.volname) servers_count = len(vol_status[self.volname].keys()) if servers_count == 6: break sleep(2) count += 1 # Checking bricks are online or not count = 0 while count < 100: ret = are_bricks_online(self.mnode, self.volname, brick_list[0:6]) if ret: break sleep(2) count += 1 self.assertTrue(ret, "All bricks are not online for %s" % self.volname) g.log.info("All bricks are online for volume %s", self.volname) # Comparing brick lists of before and after performing replace brick # operation after_brick_list = get_all_bricks(self.mnode, self.volname) self.assertListEqual( after_brick_list, brick_list[0:6], "Bricks are not same before and after performing " "replace brick operation for volume %s" % self.volname) g.log.info( "Bricks are same before and after performing replace " "brick operation for volume %s", self.volname)
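The two polling loops above share one shape: poll get_volume_status until the reporting node count stabilizes. A hedged generalization, assuming the {volname: {node: ...}} structure the test itself indexes into; the function name is illustrative:

from time import sleep

def wait_for_status_node_count(mnode, volname, expected, attempts=100):
    # Poll volume status until the expected number of nodes report in;
    # returns False if the count never stabilizes.
    for _ in range(attempts):
        vol_status = get_volume_status(mnode, volname)
        if vol_status and len(vol_status[volname]) == expected:
            return True
        sleep(2)
    return False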
def test_validate_authreject_vol(self): """ -Set Authentication Reject for client1 -Check if bricks are online -Mounting the vol on client1 -Check if bricks are online -Mounting the vol on client2 -Reset the Volume -Check if bricks are online -Mounting the vol on client1 """ # pylint: disable=too-many-statements # Obtain hostname of clients ret, hostname_client1, _ = g.run(self.clients[0], "hostname") self.assertEqual( ret, 0, ("Failed to obtain hostname of client %s" % self.clients[0])) g.log.info("Obtained hostname of client. IP- %s, hostname- %s", self.clients[0], hostname_client1.strip()) # Set Authentication option = {"auth.reject": hostname_client1.strip()} ret = set_volume_options(self.mnode, self.volname, option) self.assertTrue( ret, ("Failed to set authentication with option: %s" % option)) g.log.info("Authentication Set successfully with option: %s", option) # Fetching all the bricks self.mountpoint = "/mnt/testvol" bricks_list = get_all_bricks(self.mnode, self.volname) self.assertIsNotNone(bricks_list, "Brick list is empty") g.log.info("Brick List : %s", bricks_list) # Check if bricks are online ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, "All bricks are not online") # Using this way to check because of bug 1586036 # Mounting volume ret, _, _ = mount_volume(self.volname, self.mount_type, self.mountpoint, self.mnode, self.clients[0]) # Checking if volume is mounted out = is_mounted(self.volname, self.mountpoint, self.mnode, self.clients[0], self.mount_type, user='******') if (ret == 0) & (not out): g.log.error("Mount executed successfully due to bug 1586036") elif (ret == 1) & (not out): g.log.info("Expected: Mounting has failed successfully") else: raise ExecutionError( "Unexpected Mounting of Volume %s successful" % self.volname) # Checking client logs for authentication error cmd = ("grep AUTH_FAILED /var/log/glusterfs/mnt-" "testvol.log") ret, _, _ = g.run(self.clients[0], cmd) self.assertEqual( ret, 0, "Mounting has not failed due to " "authentication error") g.log.info("Mounting has failed due to authentication error") # Mounting the vol on client2 # Check bricks are online ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, "All bricks are not online") # Mounting Volume ret, _, _ = mount_volume(self.volname, self.mount_type, self.mountpoint, self.mnode, self.clients[1]) self.assertEqual(ret, 0, "Failed to mount volume") g.log.info("Mounted Successfully") # Checking if volume is mounted out = is_mounted(self.volname, self.mountpoint, self.mnode, self.clients[1], self.mount_type, user='******') self.assertTrue(out, "Volume %s has failed to mount" % self.volname) # Reset Volume ret, _, _ = volume_reset(mnode=self.mnode, volname=self.volname) self.assertEqual(ret, 0, "Failed to reset volume") g.log.info("Volume %s reset operation is successful", self.volname) # Checking if bricks are online ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, "All bricks are not online") # Mounting Volume ret, _, _ = mount_volume(self.volname, self.mount_type, self.mountpoint, self.mnode, self.clients[0]) self.assertEqual(ret, 0, "Failed to mount volume") g.log.info("Mounted Successfully") # Checking if Volume is mounted out = is_mounted(self.volname, self.mountpoint, self.servers[0], self.clients[0], self.mount_type, user='******') self.assertTrue(out, "Volume %s has failed to mount" % self.volname) g.log.info("Volume is mounted successfully %s", self.volname)
def test_node_reboot_subdir_mounted_io_running(self): """ Verify node reboot operation when sub-dirs are mounted and IOs are running Steps: 1. Create two sub-directories on mounted volume. 2. Unmount volume from clients. 3. Set auth.allow on sub dir d1 for client1 and d2 for client2. 4. Mount sub-dir d1 on client1 and d2 on client2. 5. Perform IO on mounts. 6. Reboot the node from which sub-dirs are mounted and wait for node to come up. 7. Verify if peers are connected. 8. Check whether bricks are online. 9. Validate IO process. """ # Creating two sub directories on mounted volume ret = mkdir(self.mounts[0].client_system, "%s/d1" % self.mounts[0].mountpoint) self.assertTrue(ret, ("Failed to create directory 'd1' in volume %s " "from client %s" % (self.volname, self.mounts[0].client_system))) ret = mkdir(self.mounts[0].client_system, "%s/d2" % self.mounts[0].mountpoint) self.assertTrue(ret, ("Failed to create directory 'd2' in volume %s " "from client %s" % (self.volname, self.mounts[0].client_system))) # Unmounting volumes ret = self.unmount_volume(self.mounts) self.assertTrue(ret, "Failed to unmount one or more volumes") g.log.info("Successfully unmounted all volumes") # Setting authentication for directories auth_dict = { '/d1': [self.mounts[0].client_system], '/d2': [self.mounts[1].client_system] } ret = set_auth_allow(self.volname, self.mnode, auth_dict) self.assertTrue(ret, "Failed to set authentication") g.log.info("Successfully set authentication on sub directories") # Creating mounts list self.subdir_mounts = [ copy.deepcopy(self.mounts[0]), copy.deepcopy(self.mounts[1]) ] self.subdir_mounts[0].volname = "%s/d1" % self.volname self.subdir_mounts[1].volname = "%s/d2" % self.volname # Mounting sub directories to authenticated clients for mount_obj in self.subdir_mounts: ret = mount_obj.mount() self.assertTrue( ret, ("Failed to mount sub directory %s on client" " %s" % (mount_obj.volname, mount_obj.client_system))) g.log.info("Successfully mounted sub directory %s on client %s", mount_obj.volname, mount_obj.client_system) g.log.info("Successfully mounted sub directories to authenticated " "clients") # Start IO on all mounts. all_mounts_procs = [] count = 1 for mount_obj in self.subdir_mounts: g.log.info("Starting IO on %s:%s", mount_obj.client_system, mount_obj.mountpoint) cmd = ("/usr/bin/env python %s create_deep_dirs_with_files " "--dirname-start-num %d " "--dir-depth 2 " "--dir-length 10 " "--max-num-of-dirs 5 " "--num-of-files 5 %s" % (self.script_upload_path, count, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) count = count + 10 # Reboot node and wait for node to come up. ret, _ = reboot_nodes_and_wait_to_come_online(self.mnode) self.assertTrue( ret, "Node reboot failed. Node %s has not come up" % self.mnode) # Check whether peers are in connected state ret = self.validate_peers_are_connected() self.assertTrue(ret, "All nodes are not in connected state.") # Get the bricks list of the volume g.log.info("Fetching bricks list of the volume : %s", self.volname) bricks_list = get_all_bricks(self.mnode, self.volname) g.log.info("Brick List : %s", bricks_list) # Check whether all bricks are online g.log.info("Verifying whether all bricks are online.") ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, "All bricks are not online.") g.log.info("All bricks are online.") # Validate IO g.log.info("Validating IO") ret = validate_io_procs(all_mounts_procs, self.subdir_mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("Successfully validated all IO") # Get stat of all the files/dirs created. g.log.info("Get stat of all the files/dirs created.") ret = get_mounts_stat(self.subdir_mounts) self.assertTrue(ret, "Stat failed on some of the clients") g.log.info("Successfully got stat of all files/dirs created") # Unmount sub-directories ret = self.unmount_volume(self.subdir_mounts) self.assertTrue(ret, "Failed to unmount one or more sub-directories") g.log.info("Successfully unmounted all sub-directories")
def test_data_split_brain_resolution(self): # Setting options g.log.info('Setting options...') options = { "metadata-self-heal": "off", "entry-self-heal": "off", "data-self-heal": "off" } ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Successfully set %s for volume %s", options, self.volname) # Creating files and directories on client side g.log.info('Creating files and directories...') cmd = ("for i in `seq 1 10`; do mkdir %s/dir.$i; for j in `seq 1 5`;" "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1K count=1;" "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done" % (self.mounts[0].mountpoint, self.mounts[0].mountpoint, self.mounts[0].mountpoint)) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Creating files and directories failed") g.log.info("Files & directories created successfully") # Check arequals for all the bricks g.log.info('Getting arequal before getting bricks offline...') self.verify_brick_arequals() g.log.info('Getting arequal before getting bricks offline ' 'is successful') # Set option self-heal-daemon to OFF g.log.info('Setting option self-heal-daemon to off...') options = {"self-heal-daemon": "off"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'off' successfully") bricks_list = get_all_bricks(self.mnode, self.volname) # Bring brick1 offline g.log.info('Bringing brick %s offline', bricks_list[0]) ret = bring_bricks_offline(self.volname, bricks_list[0]) self.assertTrue(ret, 'Failed to bring brick %s offline' % bricks_list[0]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[0]]) self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[0]) g.log.info('Bringing brick %s offline is successful', bricks_list[0]) # Modify the contents of the files cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;" "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=1;" "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=1; done" % (self.mounts[0].mountpoint, self.mounts[0].mountpoint)) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Updating file contents failed") g.log.info("File contents updated successfully") # Bring brick1 online and check the status g.log.info('Bringing brick %s online', bricks_list[0]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[0]]) self.assertTrue(ret, 'Failed to bring brick %s online' % bricks_list[0]) g.log.info('Bringing brick %s online is successful', bricks_list[0]) g.log.info("Verifying if brick %s is online", bricks_list[0]) ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick %s did not come up", bricks_list[0])) g.log.info("Brick %s has come online.", bricks_list[0]) # Bring brick2 offline g.log.info('Bringing brick %s offline', bricks_list[1]) ret = bring_bricks_offline(self.volname, bricks_list[1]) self.assertTrue(ret, 'Failed to bring brick %s offline' % bricks_list[1]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[1]) g.log.info('Bringing brick %s offline is successful', bricks_list[1]) # Modify the contents of the files cmd = ("for i in `seq 1 10`; do for j in `seq 1 5`;" "do dd if=/dev/urandom of=%s/dir.$i/file.$j bs=1M count=2;" "done; dd if=/dev/urandom of=%s/file.$i bs=1K count=2; done" % (self.mounts[0].mountpoint, self.mounts[0].mountpoint)) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Updating file contents failed") g.log.info("File contents updated successfully") # Bring brick2 online and check the status g.log.info('Bringing brick %s online', bricks_list[1]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[1]]) self.assertTrue(ret, 'Failed to bring brick %s online' % bricks_list[1]) g.log.info('Bringing brick %s online is successful', bricks_list[1]) g.log.info("Verifying if brick %s is online", bricks_list[1]) ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick %s did not come up", bricks_list[1])) g.log.info("Brick %s has come online.", bricks_list[1]) # Set option self-heal-daemon to ON g.log.info('Setting option self-heal-daemon to on...') options = {"self-heal-daemon": "on"} ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, 'Failed to set options %s' % options) g.log.info("Option 'self-heal-daemon' is set to 'on' successfully") g.log.info("Checking if files are in split-brain") ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertTrue(ret, "Unable to create split-brain scenario") g.log.info("Successfully created split brain scenario") g.log.info("Resolving split-brain by using the source-brick option " "by choosing second brick as source for all the files") node, _ = bricks_list[1].split(':') command = ("gluster v heal " + self.volname + " split-brain " "source-brick " + bricks_list[1]) ret, _, _ = g.run(node, command) self.assertEqual(ret, 0, "Command execution not successful") # triggering heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, "Heal not triggered") # waiting for heal to complete ret = monitor_heal_completion(self.mnode, self.volname, timeout_period=120) self.assertTrue(ret, "Heal not completed") # Try accessing the file content from the mount cmd = ("for i in `seq 1 10`; do cat %s/file.$i > /dev/null;" "for j in `seq 1 5` ; do cat %s/dir.$i/file.$j > /dev/null;" "done ; done" % (self.mounts[0].mountpoint, self.mounts[0].mountpoint)) ret, _, _ = g.run(self.mounts[0].client_system, cmd) self.assertEqual(ret, 0, "Unable to access the file contents") g.log.info("File contents are accessible") # checking if file is in split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, "File still in split-brain") g.log.info("Successfully resolved split brain situation using " "CLI based resolution") # Check arequals for all the bricks g.log.info('Getting arequal for all the bricks after heal...') self.verify_brick_arequals() g.log.info('Getting arequal after heal is successful')
def test_ec_replace_brick(self):
    """
    - Start resource consumption tool
    - Create directory dir1
    - Create 5 directories and 5 files in dir1 of the mountpoint
    - Rename all files inside dir1 at the mountpoint
    - Create soft links and hard links of files in dir1 of the mountpoint
    - Truncate all files in one of the dirs inside dir1
    - Change chmod, chown, chgrp
    - Create tiny, small, medium and large files
    - Get arequal before replacing brick
    - Replace brick
    - Get arequal after replacing brick
    - Compare arequals
    - Create IO
    - Replace brick while IO is going on
    - Validate IO and wait for it to complete
    """
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    # Start resource consumption logging using top and free
    log_file_mem_monitor = '/var/log/glusterfs/mem_usage.log'
    cmd = ("for i in {1..20}; do top -n 1 -b | egrep 'RES|gluster' "
           ">> %s; free -h >> %s; sleep 10; done"
           % (log_file_mem_monitor, log_file_mem_monitor))
    g.log.info(cmd)
    cmd_list_procs = []
    for server in self.servers:
        proc = g.run_async(server, cmd)
        cmd_list_procs.append(proc)

    # Create dir1
    ret = mkdir(self.mounts[0].client_system,
                "%s/dir1" % self.mounts[0].mountpoint)
    self.assertTrue(ret, "Failed to create dir1")
    g.log.info("Directory dir1 on %s created successfully",
               self.mounts[0])

    # Create 5 dirs and 5 files in each dir under dir1 at the mountpoint
    start, end = 1, 5
    for mount_obj in self.mounts:
        # Number of dirs and files to be created
        dir_range = ("%s..%s" % (str(start), str(end)))
        file_range = ("%s..%s" % (str(start), str(end)))
        # Create dirs 1-5 at the mountpoint
        ret = mkdir(mount_obj.client_system,
                    "%s/dir1/dir{%s}" % (mount_obj.mountpoint, dir_range))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info("Directory created successfully")

        # Create files inside each dir
        cmd = ('touch %s/dir1/dir{%s}/file{%s};'
               % (mount_obj.mountpoint, dir_range, file_range))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "File creation failed")
        g.log.info("Files created successfully")

        # Increment the counters so that the next client creates dirs and
        # files with a different offset, e.g. dir6, dir7...dir10. Same
        # with files.
        start += 5
        end += 5

    # Rename all files inside dir1/dir1 at the mountpoint
    cmd = ('cd %s/dir1/dir1/; '
           'for FILENAME in *; '
           'do mv $FILENAME Unix_$FILENAME; '
           'done;' % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, "Failed to rename files on client")
    g.log.info("Successfully renamed files on client")

    # Truncate files in one dir inside dir1 per client; start is an
    # offset added to the dirname so different clients act on
    # different files.
    start = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s/dir1/dir%s/; '
               'for FILENAME in *; '
               'do echo > $FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Truncate failed")
        g.log.info("Truncate of files successful")
        start += 5

    # Create soft links and hard links of files at the mountpoint.
    # start is an offset added to the dirname so different clients act
    # on different files.
    start = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s/dir1/dir%s; '
               'for FILENAME in *; '
               'do ln -s $FILENAME softlink_$FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Creating soft links failed")
        g.log.info("Soft links of files created successfully")

        cmd = ('cd %s/dir1/dir%s; '
               'for FILENAME in *; '
               'do ln $FILENAME hardlink_$FILENAME; '
               'done;' % (mount_obj.mountpoint, str(start + 1)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Creating hard links failed")
        g.log.info("Hard links of files created successfully")
        start += 5

    # chmod, chown, chgrp inside dir1; start and end are offsets used to
    # access different files on different clients.
    start, end = 2, 5
    for mount_obj in self.mounts:
        dir_file_range = '%s..%s' % (str(start), str(end))
        cmd = ('chmod 777 %s/dir1/dir{%s}/file{%s}'
               % (mount_obj.mountpoint, dir_file_range, dir_file_range))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Changing mode of files failed")
        g.log.info("Mode of files changed successfully")

        cmd = ('chown root %s/dir1/dir{%s}/file{%s}'
               % (mount_obj.mountpoint, dir_file_range, dir_file_range))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Changing owner of files failed")
        g.log.info("Owner of files changed successfully")

        cmd = ('chgrp root %s/dir1/dir{%s}/file{%s}'
               % (mount_obj.mountpoint, dir_file_range, dir_file_range))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Changing group of files failed")
        g.log.info("Group of files changed successfully")
        start += 5
        end += 5

    # Create tiny, small, medium and large files at the mountpoint;
    # offset differentiates filenames across clients.
    offset = 1
    for mount_obj in self.mounts:
        cmd = ('cd %s; fallocate -l 100 tiny_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Fallocate for tiny files failed")
        g.log.info("Fallocate for tiny files successful")

        cmd = ('cd %s; fallocate -l 20M small_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Fallocate for small files failed")
        g.log.info("Fallocate for small files successful")

        cmd = ('cd %s; fallocate -l 200M medium_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Fallocate for medium files failed")
        g.log.info("Fallocate for medium files successful")

        cmd = ('cd %s; fallocate -l 1G large_file%s.txt'
               % (mount_obj.mountpoint, str(offset)))
        ret, _, _ = g.run(mount_obj.client_system, cmd)
        self.assertFalse(ret, "Fallocate for large files failed")
        g.log.info("Fallocate for large files successful")
        offset += 1

    # Get arequal before replacing the brick
    ret, result_before_replacing_brick = (
        collect_mounts_arequal(self.mounts[0]))
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before replacing the brick '
               'is successful')

    # Replace a brick of random choice
    ret = replace_brick_from_volume(self.mnode, self.volname,
                                    self.servers, self.all_servers_info)
    self.assertTrue(ret, "Unexpected: Replace brick is not successful")
    g.log.info("Expected: Replace brick is successful")

    # Wait for the bricks to come online
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Unexpected: Bricks are not online")
    g.log.info("Expected: Bricks are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Unexpected: Heal has not yet completed')
    g.log.info('Heal has completed successfully')

    # Check if bricks are online
    all_bricks = get_all_bricks(self.mnode, self.volname)
    ret = are_bricks_online(self.mnode, self.volname, all_bricks)
    self.assertTrue(ret, 'Unexpected: All bricks are not online')
    g.log.info('All bricks are online')

    # Get arequal after replacing the brick
    ret, result_after_replacing_brick = (
        collect_mounts_arequal(self.mounts[0]))
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after replacing the brick '
               'is successful')

    # Compare arequals
    self.assertEqual(result_before_replacing_brick,
                     result_after_replacing_brick,
                     'Arequals are not equal before and after '
                     'replacing the brick')
    g.log.info('Arequals are equal before and after replacing the brick')

    # Create files on the client side in dir1
    # Write IO
    all_mounts_procs, count = [], 1
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num %d "
               "--dir-depth 2 "
               "--dir-length 10 "
               "--max-num-of-dirs 5 "
               "--num-of-files 5 %s/dir1" % (
                   self.script_upload_path1,
                   count, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)
        count += 10

    # Replace a brick while IO is going on
    ret = replace_brick_from_volume(self.mnode, self.volname,
                                    self.servers, self.all_servers_info)
    self.assertTrue(ret, "Unexpected: Replace brick is not successful")
    g.log.info("Expected: Replace brick is successful")

    # Wait for the bricks to come online
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Unexpected: Bricks are not online")
    g.log.info("Expected: Bricks are online")

    # Validate IO and wait for it to complete
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("Successfully validated all IO")

    # Create 2 directories and start IO that keeps FDs open
    ret = mkdir(self.mounts[0].client_system,
                "%s/count{1..2}" % self.mounts[0].mountpoint)
    self.assertTrue(ret, "Failed to create directories")
    g.log.info("Directories created on %s successfully", self.mounts[0])

    all_fd_procs, count = [], 1
    for mount_obj in self.mounts:
        cmd = ("cd %s; /usr/bin/env python %s -n 10 -t 120 "
               "-d 5 -c 16 --dir count%s" % (
                   mount_obj.mountpoint,
                   self.script_upload_path2, count))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_fd_procs.append(proc)
        count += 1

    # Replace a brick while open-FD IO is going on
    ret = replace_brick_from_volume(self.mnode, self.volname,
                                    self.servers, self.all_servers_info)
    self.assertTrue(ret, "Unexpected: Replace brick is not successful")
    g.log.info("Expected: Replace brick is successful")

    # Wait for the bricks to come online
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Unexpected: Bricks are not online")
    g.log.info("Expected: Bricks are online")

    # Validate IO and wait for it to complete
    ret = validate_io_procs(all_fd_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("Successfully validated all IO")

    # Wait for the memory loggers to finish and check that the log exists
    ret = file_exists(self.mnode, '/var/log/glusterfs/mem_usage.log')
    self.assertTrue(ret, "Unexpected: Memory log file does not exist")
    g.log.info("Memory log file exists")
    for proc in cmd_list_procs:
        ret, _, _ = proc.async_communicate()
        self.assertEqual(ret, 0, "Memory logging failed")
    g.log.info("Memory logging is successful")
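# ``replace_brick_from_volume`` above picks the source and destination
# bricks internally. When a test needs to control exactly which brick is
# swapped, the underlying CLI can be invoked directly; the sketch below is
# illustrative only. ``replace_specific_brick`` is a hypothetical helper
# name; src_brick/dst_brick are 'host:/brick/path' strings supplied by the
# caller, and only the 'replace-brick ... commit force' syntax comes from
# the gluster CLI itself.
def replace_specific_brick(mnode, volname, src_brick, dst_brick):
    """Replace src_brick with dst_brick using 'commit force'."""
    cmd = ("gluster volume replace-brick %s %s %s commit force"
           % (volname, src_brick, dst_brick))
    ret, _, err = g.run(mnode, cmd)
    if ret:
        g.log.error("replace-brick %s -> %s failed: %s",
                    src_brick, dst_brick, err)
        return False
    return True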
def _perform_brick_ops_and_enable_self_heal(self, op_type):
    '''Refactor of steps common to all tests: bring a brick down and
    perform metadata/data operations'''
    # The first brick in the subvol will always be online and used for
    # self-heal, so make the keys match the brick index
    self.op_cmd = {
        # The operation with key `4` in every op_type will be used for
        # the final data consistency check
        # Metadata operations (owner and permission changes)
        'metadata': {
            2:
            '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
            3:
            '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            4:
            '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
        },
        # Data operations (append data to the files)
        'data': {
            2:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K >> file.$i;
            for j in `seq 1 3`;
            do {1} 2K >> dir.$i/file.$j; done;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 3K >> file.$i;
            for j in `seq 1 3`;
            do {1} 3K >> dir.$i/file.$j; done;
            done;''',
            4:
            '''cd {0}; for i in `seq 1 6`;
            do {1} 4K >> file.$i;
            for j in `seq 1 6`;
            do {1} 4K >> dir.$i/file.$j; done;
            done;''',
        },
        # Create files and directories when a brick is down, with no
        # initial IO
        'gfid': {
            2:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K > file.2.$i; mkdir dir.2.$i;
            for j in `seq 1 3`;
            do {1} 2K > dir.2.$i/file.2.$j; done;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 3`;
            do {1} 2K > file.3.$i; mkdir dir.3.$i;
            for j in `seq 1 3`;
            do {1} 2K > dir.3.$i/file.3.$j; done;
            done;''',
            4:
            '''cd {0}; for i in `seq 4 6`;
            do {1} 2K > file.$i; mkdir dir.$i;
            for j in `seq 4 6`;
            do {1} 2K > dir.$i/file.$j; done;
            done;''',
        },
        # Create a different file type with the same name while a brick
        # was down, with no initial IO, and validate the failure
        'file_type': {
            2:
            'cd {0}; for i in `seq 1 6`; do {1} 2K > notype.$i; done;',
            3:
            'cd {0}; for i in `seq 1 6`; do mkdir -p notype.$i; done;',
            4:
            '''cd {0}; for i in `seq 1 6`;
            do {1} 2K > file.$i;
            for j in `seq 1 6`;
            do mkdir -p dir.$i; {1} 2K > dir.$i/file.$j; done;
            done;''',
        },
        # Create symlinks for files and directories while a brick was down.
        # Out of 6 files, 6 dirs and 6 files in each dir: symlink the
        # outer 2 files, the inner 2 files in each dir and 2 dirs, and
        # verify each is a symlink (-L) and the linked file exists (-e)
        'symlink': {
            2:
            '''cd {0}; for i in `seq 1 2`;
            do ln -sr file.$i sl_file.2.$i;
            [ -L sl_file.2.$i ] && [ -e sl_file.2.$i ] || exit -1;
            for j in `seq 1 2`;
            do ln -sr dir.$i/file.$j dir.$i/sl_file.2.$j; done;
            [ -L dir.$i/sl_file.2.$j ] && [ -e dir.$i/sl_file.2.$j ] \
|| exit -1;
            done;
            for k in `seq 3 4`; do ln -sr dir.$k sl_dir.2.$k;
            [ -L sl_dir.2.$k ] && [ -e sl_dir.2.$k ] || exit -1;
            done;''',
            3:
            '''cd {0}; for i in `seq 1 2`;
            do ln -sr file.$i sl_file.3.$i;
            [ -L sl_file.3.$i ] && [ -e sl_file.3.$i ] || exit -1;
            for j in `seq 1 2`;
            do ln -sr dir.$i/file.$j dir.$i/sl_file.3.$j; done;
            [ -L dir.$i/sl_file.3.$j ] && [ -e dir.$i/sl_file.3.$j ] \
|| exit -1;
            done;
            for k in `seq 3 4`; do ln -sr dir.$k sl_dir.3.$k;
            [ -L sl_dir.3.$k ] && [ -e sl_dir.3.$k ] || exit -1;
            done;''',
            4:
            '''cd {0}; ln -sr dir.4 sl_dir_new.4; mkdir sl_dir_new.4/dir.1;
            {1} 4K >> sl_dir_new.4/dir.1/test_file;
            {1} 4K >> sl_dir_new.4/test_file;
            ''',
        },
    }
    bricks = get_online_bricks_list(self.mnode, self.volname)
    self.assertIsNotNone(bricks,
                         'Not able to get the list of bricks in the volume')

    # Keep the first brick always online and start operations from the
    # second brick
    for index, brick in enumerate(bricks[1:], start=2):
        # Bring the brick offline
        ret = bring_bricks_offline(self.volname, brick)
        self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
        self.assertTrue(
            are_bricks_offline(self.mnode, self.volname, [brick]),
            'Brick {} is not offline'.format(brick))

        # Perform the file/dir operation
        cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
        ret, _, err = g.run(self.client, cmd)
        if op_type == 'file_type' and index == 3:
            # Should fail with ENOTCONN: with one brick down the lookup
            # can't happen and quorum is not met
            self.assertNotEqual(
                ret, 0, '{0} should fail as lookup fails and quorum is '
                'not met'.format(cmd))
            self.assertIn(
                'Transport', err,
                '{0} should fail with ENOTCONN error'.format(cmd))
        else:
            self.assertEqual(ret, 0,
                             '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

        # Bring the brick online
        ret = bring_bricks_online(
            self.mnode,
            self.volname,
            brick,
            bring_bricks_online_methods='volume_start_force')
        self.assertTrue(ret, 'Unable to bring {} online'.format(brick))
        self.assertTrue(
            are_bricks_online(self.mnode, self.volname, [brick]),
            'Brick {} is not online'.format(brick))

    # Assert that the metadata/data operations resulted in pending heals
    self.assertFalse(is_heal_complete(self.mnode, self.volname))

    # Enable the self-heal daemon and wait for it to be online
    self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                    'Enabling self heal daemon failed')
    self.assertTrue(
        wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
        'Not all self heal daemons are online')
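# Illustrative only: how individual tests would typically consume the
# refactored helper above. The method names here are hypothetical; the real
# suite defines its own entry points, but each one reduces to a single call
# with a different op_type key.
def test_metadata_heal_with_bricks_cycled(self):
    """Sketch: pending metadata heals complete once shd is re-enabled."""
    self._perform_brick_ops_and_enable_self_heal(op_type='metadata')

def test_data_heal_with_bricks_cycled(self):
    """Sketch: pending data heals complete once shd is re-enabled."""
    self._perform_brick_ops_and_enable_self_heal(op_type='data')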
def test_volume_status_show_brick_online_though_brickpath_deleted(self):
    """
    Test Case:
    1) Create a volume and start it.
    2) Fetch the brick list.
    3) Bring any one brick down and umount the brick.
    4) Force start the volume and check that not all bricks are online.
    5) Remount the removed brick and bring the brick back online.
    6) Force start the volume and check if all bricks are online.
    """
    # Fetch the brick list
    brick_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(brick_list,
                         "Failed to get the bricks in the volume")

    # Bring one brick down
    random_brick = random.choice(brick_list)
    ret = bring_bricks_offline(self.volname, random_brick)
    self.assertTrue(ret, "Failed to bring brick %s offline" % random_brick)

    # List of bricks to be brought back online later
    remove_bricks_list = [random_brick]

    # Check if the brick is offline
    ret = are_bricks_offline(self.mnode, self.volname, remove_bricks_list)
    self.assertTrue(ret, 'Brick %s is not offline' % random_brick)
    g.log.info('Brick %s is offline as expected', random_brick)

    # Umount the brick which was brought offline
    self.brick_node, volume_brick = random_brick.split(':')
    self.node_brick = '/'.join(volume_brick.split('/')[0:3])
    g.log.info('Start umount brick %s...', self.node_brick)
    ret, _, _ = g.run(self.brick_node, 'umount %s' % self.node_brick)
    self.assertFalse(ret, 'Failed to umount brick %s' % self.node_brick)
    g.log.info('Successfully umounted brick %s', self.node_brick)
    self.check_for_remount = True

    # Force start the volume
    ret, _, _ = volume_start(self.mnode, self.volname, True)
    self.assertEqual(ret, 0, "Failed to force start volume")
    g.log.info("Successfully force started volume")

    # Remount the offline brick
    g.log.info('Start remount brick %s with read-write option...',
               self.node_brick)
    ret, _, _ = g.run(self.brick_node, 'mount %s' % self.node_brick)
    self.assertFalse(ret, 'Failed to remount brick %s' % self.node_brick)
    g.log.info('Successfully remounted %s with read-write option',
               self.node_brick)
    self.check_for_remount = False

    # Check that not all bricks are online
    ret = are_bricks_online(self.mnode, self.volname, brick_list)
    self.assertFalse(ret, "Unexpected: All the bricks are online")
    g.log.info("Expected: Not all the bricks are online")

    # Bring the offline brick back online
    ret = bring_bricks_online(self.mnode, self.volname,
                              remove_bricks_list)
    self.assertTrue(ret, "Failed to bring bricks online")
    g.log.info("Successfully brought bricks online")

    # Force start the volume
    ret, _, _ = volume_start(self.mnode, self.volname, True)
    self.assertEqual(ret, 0, "Failed to force start volume")
    g.log.info("Successfully force started volume")

    # Check if all bricks are online
    ret = are_bricks_online(self.mnode, self.volname, brick_list)
    self.assertTrue(ret, "Unexpected: All the bricks are not online")
    g.log.info("Expected: All the bricks are online")
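# The ``self.check_for_remount`` flag set above implies a tearDown that
# restores the brick mount if the test aborts between the umount and the
# remount. The sketch below is a minimal illustration of such a tearDown,
# assuming (as the test body does) that the brick path is listed in
# /etc/fstab so a bare ``mount <path>`` suffices; it is not the suite's
# actual cleanup code.
def tearDown(self):
    if getattr(self, 'check_for_remount', False):
        ret, _, _ = g.run(self.brick_node, 'mount %s' % self.node_brick)
        if ret:
            g.log.error("Failed to remount brick %s on %s",
                        self.node_brick, self.brick_node)
        else:
            g.log.info("Remounted brick %s during cleanup",
                       self.node_brick)
    # The suite's usual volume cleanup would follow here.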