def test_rename_files_with_brick_down(self):
    """
    Verify there is no data loss when rename is performed while a brick
    of the volume is down.

    Steps:
    1) Create a volume.
    2) Mount the volume using FUSE.
    3) Create 1000 files on the mount point.
    4) Create the soft-link for file{1..100}
    5) Create the hard-link for file{101..200}
    6) Check for the file count on the mount point.
    7) Begin renaming the files, in multiple iterations.
    8) Let few iterations of the rename complete successfully.
    9) Then while rename is still in progress, kill a brick part of the
       volume.
    10) Let the brick be down for sometime, such that a couple of rename
        iterations are completed.
    11) Bring the brick back online.
    12) Wait for the IO to complete.
    13) Check if there is any data loss.
    14) Check if all the files are renamed properly.
    """
    # Creating 1000 files on volume root
    m_point = self.mounts[0].mountpoint
    command = 'touch ' + m_point + '/file{1..1000}_0'
    ret, _, _ = g.run(self.clients[0], command)
    self.assertEqual(ret, 0, "File creation failed on %s" % m_point)
    g.log.info("Files successfully created on the mount point")

    # Create soft links for files 1..100 inclusive.
    # BUGFIX: range(1, 100) stopped at file99_0 and missed file100_0,
    # contradicting the documented step 4.
    for i in range(1, 101):
        ret = create_link_file(self.clients[0],
                               '{}/file{}_0'.format(m_point, i),
                               '{}/soft_link_file{}_0'.format(m_point, i),
                               soft=True)
        self.assertTrue(ret, "Failed to create soft links for files")
    g.log.info("Created soft links for files successfully")

    # Create hard links for files 101..200 inclusive.
    # BUGFIX: range(101, 200) stopped at file199_0 and missed file200_0,
    # contradicting the documented step 5.
    for i in range(101, 201):
        ret = create_link_file(self.clients[0],
                               '{}/file{}_0'.format(m_point, i),
                               '{}/hard_link_file{}_0'.format(m_point, i),
                               soft=False)
        self.assertTrue(ret, "Failed to create hard links for files")
    g.log.info("Created hard links for files successfully")

    # Calculate file count for the mount-point (baseline for data-loss
    # comparison after the rename/brick-down cycle)
    cmd = ("ls -lR %s/ | wc -l" % m_point)
    ret, count_before, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "Failed to get file count")
    g.log.info("File count before rename is:%s", count_before)

    # Start renaming the files in multiple iterations (each file is moved
    # through suffixes _0 -> _1 -> ... -> _6 asynchronously)
    g.log.info("Starting to rename the files")
    all_mounts_procs = []
    cmd = ('for i in `seq 1 1000`; do for j in `seq 0 5`;do mv -f '
           '%s/file$i\\_$j %s/file$i\\_$(expr $j + 1); done; done'
           % (m_point, m_point))
    proc = g.run_async(self.mounts[0].client_system, cmd,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Waiting for some time so a few rename iterations complete before
    # the brick is killed
    g.log.info("Waiting for few rename iterations to complete")
    sleep(120)

    # Get the information about the bricks part of the volume
    brick_list = get_all_bricks(self.mnode, self.volname)

    # Kill a randomly chosen brick of the volume while rename is running
    ret = bring_bricks_offline(self.volname, choice(brick_list))
    self.assertTrue(ret, "Failed to bring brick offline")
    g.log.info("Successfully brought brick offline")

    # Let the brick be down for some time so more rename iterations run
    # against the degraded volume
    g.log.info("Keeping brick down for few minutes")
    sleep(60)

    # Bring the brick online using gluster v start force
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Volume start with force failed")
    g.log.info("Volume start with force successful")

    # Close connection and check if rename has completed
    ret, _, _ = proc.async_communicate()
    self.assertEqual(ret, 0, "Rename is not completed")
    g.log.info("Rename is completed")

    # Do lookup on the files
    # Calculate file count from mount
    cmd = ("ls -lR %s/ | wc -l" % m_point)
    ret, count_after, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "Failed to do lookup and"
                     "get file count")
    g.log.info("Lookup successful. File count after"
               " rename is:%s", count_after)

    # Check if there is any data loss
    self.assertEqual(int(count_before), int(count_after),
                     "The file count before and after"
                     " rename is not same. There is data loss.")
    g.log.info("The file count before and after rename is same."
               " No data loss occurred.")

    # Checking if all files were renamed successfully.
    # NOTE(review): the spellings below mix "Distributed-Replicate" and
    # "Distribute-Disperse"/"Distribute-Arbiter" -- confirm these match
    # the exact strings returned by get_volume_type().
    ret = get_volume_type(brick_list[0] + "/")
    if ret in ("Replicate", "Disperse", "Arbiter", "Distributed-Replicate",
               "Distribute-Disperse", "Distribute-Arbiter"):
        cmd = ("ls -lR %s/file*_6 | wc -l" % m_point)
        ret, out, _ = g.run(self.clients[0], cmd)
        self.assertEqual(int(out), 1000, "Rename failed on some files")
        g.log.info("All the files are renamed successfully")
def test_self_heal_differing_in_file_type(self):
    """
    Test self-heal of entries whose file type differs between bricks,
    with the default volume configuration.

    Description:
    - create IO (nested dirs with files of varying sizes)
    - calculate arequal
    - bring down all bricks processes from selected set
    - calculate arequal and compare with arequal before getting
      bricks offline
    - modify the data (replace each file with a directory of the
      same name, so file type differs across bricks)
    - arequal before getting bricks online
    - bring bricks online
    - check daemons and healing completion
    - start healing
    - calculate arequal and compare with arequal before bringing
      bricks online and after bringing bricks online
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    all_mounts_procs = []
    test_file_type_differs_self_heal_folder = \
        'test_file_type_differs_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Creating files: 10 x 5 nested dirs, each holding 10 files of
    # k*1k bytes
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 10` ; "
               "do mkdir l1_dir.$i ; "
               "for j in `seq 1 5` ; "
               "do mkdir l1_dir.$i/l2_dir.$j ; "
               "for k in `seq 1 10` ; "
               "do dd if=/dev/urandom of=l1_dir.$i/l2_dir.$j/test.$k "
               "bs=1k count=$k ; "
               "done ; "
               "done ; "
               "done ; "
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Get arequal before getting bricks offline
    g.log.info('Getting arequal before getting bricks offline...')
    ret, result_before_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks offline '
               'is successful')

    # Select bricks to bring offline (drop empty tier entries)
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Get arequal after getting bricks offline
    g.log.info('Getting arequal after getting bricks offline...')
    ret, result_after_offline = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks offline '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks offline: taking bricks down must not
    # change the data visible on the mount
    self.assertItemsEqual(result_before_offline, result_after_offline,
                          'Checksums before and after '
                          'bringing bricks offline are not equal')
    g.log.info('Checksums before and after '
               'bringing bricks offline are equal')

    # Modify the data: swap every file for a directory of the same name
    # so the offline bricks end up with a differing file type
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("cd %s/%s/ ; "
               "for i in `seq 1 10` ; "
               "do for j in `seq 1 5` ; "
               "do for k in `seq 1 10` ; "
               "do rm -f l1_dir.$i/l2_dir.$j/test.$k ; "
               "mkdir l1_dir.$i/l2_dir.$j/test.$k ; "
               "done ; "
               "done ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_file_type_differs_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret,
        'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online", self.volname))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online: heal must not change mount data
    self.assertItemsEqual(result_before_online, result_after_online,
                          'Checksums before and '
                          'after bringing bricks online are not equal')
    g.log.info('Checksums before and after bringing bricks online '
               'are equal')
def test_volume_create(self):
    """
    Exercise volume-create negative and positive scenarios:
    - create/start a volume, verify 'volume start force' revives
      offline bricks
    - creating a volume with previously used bricks must fail
    - creating a volume with an already existing volume name
    - creating a volume with a non-existing brick path must fail
    - creating a volume with bricks spanning two separate clusters
      must fail
    - creating a volume while one brick node's glusterd is down must
      fail
    """
    # create and start a volume
    self.volume['name'] = "first_volume"
    self.volname = "first_volume"
    ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
    self.assertTrue(ret, "Failed to create and start volume")

    # bring a brick down and volume start force should bring it to online
    g.log.info("Get all the bricks of the volume")
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    g.log.info("Successfully got the list of bricks of volume")

    ret = bring_bricks_offline(self.volname, bricks_list[0:2])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Failed to start the volume")
    g.log.info("Volume start with force is success")

    ret = wait_for_bricks_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to bring the bricks online")
    g.log.info("Volume start with force successfully brought all the "
               "bricks online")

    # create volume with previously used bricks and different volume name
    self.volname = "second_volume"
    ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
    self.assertNotEqual(
        ret, 0, "Expected: It should fail to create a "
        "volume with previously used bricks. Actual:"
        "Successfully created the volume with previously"
        " used bricks")
    g.log.info("Failed to create the volume with previously used bricks")

    # create a volume with already existing volume name
    # NOTE(review): assertTrue here relies on setup_volume() being
    # idempotent (returning True when the volume already exists); the
    # message text suggests a failure was expected -- confirm intent.
    self.volume['name'] = "first_volume"
    ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
    self.assertTrue(
        ret, "Expected: It should fail to create a volume"
        " with already existing volume name. Actual: "
        "Successfully created the volume with "
        "already existing volname")
    g.log.info("Failed to create the volume with already existing volname")

    # creating a volume with non existing brick path should fail
    self.volname = "second_volume"
    bricks_list = form_bricks_list(self.mnode, self.volname,
                                   len(self.servers), self.servers,
                                   self.all_servers_info)
    # corrupt one randomly chosen brick entry with a bogus path
    nonexisting_brick_index = random.randint(0, len(bricks_list) - 1)
    non_existing_brick = bricks_list[nonexisting_brick_index].split(":")[0]
    non_existing_path = ":/brick/non_existing_path"
    non_existing_brick = non_existing_brick + non_existing_path
    bricks_list[nonexisting_brick_index] = non_existing_brick

    ret, _, _ = volume_create(self.mnode, self.volname, bricks_list)
    self.assertNotEqual(
        ret, 0, "Expected: Creating a volume with non "
        "existing brick path should fail. Actual: "
        "Successfully created the volume with "
        "non existing brick path")
    g.log.info("Failed to create the volume with non existing brick path")

    # cleanup the volume and peer detach all servers. form two clusters,
    # try to create a volume with bricks whose nodes are in different
    # clusters

    # cleanup volumes
    vol_list = get_volume_list(self.mnode)
    self.assertIsNotNone(vol_list, "Failed to get the volume list")

    for volume in vol_list:
        ret = cleanup_volume(self.mnode, volume)
        # BUGFIX: message previously read "volume % s" (broken format)
        self.assertTrue(ret, "Unable to delete volume %s" % volume)

    # peer detach all servers
    ret = peer_detach_servers(self.mnode, self.servers)
    self.assertTrue(ret, "Peer detach to all servers is failed")
    g.log.info("Peer detach to all the servers is success")

    # form cluster 1
    ret, _, _ = peer_probe(self.servers[0], self.servers[1])
    self.assertEqual(
        ret, 0, "Peer probe from %s to %s is failed"
        % (self.servers[0], self.servers[1]))
    g.log.info("Peer probe is success from %s to %s"
               % (self.servers[0], self.servers[1]))

    # form cluster 2
    ret, _, _ = peer_probe(self.servers[2], self.servers[3])
    self.assertEqual(
        ret, 0, "Peer probe from %s to %s is failed"
        % (self.servers[2], self.servers[3]))
    g.log.info("Peer probe is success from %s to %s"
               % (self.servers[2], self.servers[3]))

    # Creating a volume with bricks which are part of another
    # cluster should fail
    ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
    self.assertFalse(
        ret, "Expected: Creating a volume with bricks"
        " which are part of another cluster should fail."
        " Actual: Successfully created the volume with "
        "bricks which are part of another cluster")
    g.log.info("Failed to create the volume with bricks which are "
               "part of another cluster")

    # form a cluster, bring a node down. try to create a volume when one
    # of the brick node is down
    ret, _, _ = peer_detach(self.servers[2], self.servers[3])
    self.assertEqual(ret, 0, "Peer detach is failed")
    g.log.info("Peer detach is success")

    ret = peer_probe_servers(self.mnode, self.servers)
    self.assertTrue(ret, "Peer probe is failed")
    g.log.info("Peer probe to all the servers is success")

    # stop glusterd on a random non-mnode server (index starts at 1 so
    # self.mnode itself stays up to run the CLI)
    random_server = self.servers[random.randint(1, len(self.servers) - 1)]
    ret = stop_glusterd(random_server)
    # BUGFIX: failure message previously said "Glusterd is stopped
    # successfully", which described success instead of the failure
    self.assertTrue(ret, "Failed to stop glusterd on %s" % random_server)

    self.volume['name'] = "third_volume"
    ret = setup_volume(self.mnode, self.all_servers_info, self.volume)
    # BUGFIX: "bbrick" typo in the failure message
    self.assertFalse(
        ret, "Expected: It should fail to create a volume "
        "when one of the node is down. Actual: Successfully "
        "created the volume with brick whose node is down")
    g.log.info("Failed to create the volume with brick whose node is down")
def test_metadata_self_heal(self):
    """
    Test MetaData Self-Heal (heal command)

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
    - create IO
    - set the volume option
      "self-heal-daemon": "off"
    - bring down all bricks processes from selected set
    - Change the permissions, ownership and the group
      of the files under "test_meta_data_self_heal" folder
    - get arequal before getting bricks online
    - bring bricks online
    - set the volume option
      "self-heal-daemon": "on"
    - check daemons and start healing
    - check is heal is completed
    - check for split-brain
    - get arequal after getting bricks online and compare with
      arequal before getting bricks online
    - check group and user are 'qa' on bricks and on the mount
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Setting options: disable client-side self-heal so only the heal
    # command/daemon performs the metadata heal under test
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "are set to 'off' successfully")

    # Creating files on client side
    all_mounts_procs = []
    test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Create files: 50 x 10k files in the test folder
    g.log.info('Creating files...')
    command = ("cd %s/ ; "
               "mkdir %s ;"
               "cd %s/ ;"
               "for i in `seq 1 50` ; "
               "do dd if=/dev/urandom of=test.$i bs=10k count=1 ; "
               "done ;"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder,
                  test_meta_data_self_heal_folder))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # wait for io to complete
    self.assertTrue(
        wait_for_io_to_complete(all_mounts_procs, self.mounts),
        "Io failed to complete on some of the clients")

    # Setting options: stop the self-heal daemon so nothing heals while
    # bricks are down
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline (drop empty tier entries)
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Changing the permissions, ownership and the group
    # of the files under "test_meta_data_self_heal" folder while the
    # selected bricks are down, creating pending metadata heals
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)

    # Change permissions to 444
    g.log.info('Changing permissions...')
    command = ("cd %s/%s/ ; "
               "chmod -R 444 *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, out, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Permissions are changed successfully')

    # Change the ownership to qa
    g.log.info('Changing the ownership...')
    command = ("cd %s/%s/ ; "
               "chown -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, out, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Ownership is changed successfully')

    # Change the group to qa
    g.log.info('Changing the group...')
    command = ("cd %s/%s/ ; "
               "chgrp -R qa *"
               % (self.mounts[0].mountpoint,
                  test_meta_data_self_heal_folder))
    ret, out, err = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, err)
    g.log.info('Group is changed successfully')

    # Get arequal before getting bricks online
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Setting options: re-enable the self-heal daemon so healing can run
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume process %s not online "
                          "despite waiting for 5 minutes", self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode,
                                                  self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks online
    # and after bringing bricks online
    self.assertItemsEqual(result_before_online, result_after_online,
                          'Checksums are not equal')
    g.log.info('Checksums before bringing bricks online '
               'and after bringing bricks online are equal')

    # Adding servers and client in single dict to check permissions:
    # maps node -> path of the brick (or mountpoint, for the client)
    nodes_to_check = {}
    all_bricks = get_all_bricks(self.mnode, self.volname)
    for brick in all_bricks:
        node, brick_path = brick.split(':')
        nodes_to_check[node] = brick_path
    nodes_to_check[self.mounts[0].client_system] = \
        self.mounts[0].mountpoint

    # Checking for user and group on every brick and on the mount
    for node in nodes_to_check:
        # Get file list
        command = ("cd %s/%s/ ; "
                   "ls"
                   % (nodes_to_check[node],
                      test_meta_data_self_heal_folder))
        ret, out, err = g.run(node, command)
        file_list = out.split()

        for file_name in file_list:
            file_to_check = '%s/%s/%s' % (nodes_to_check[node],
                                          test_meta_data_self_heal_folder,
                                          file_name)

            g.log.info('Checking for permissions, user and group for %s',
                       file_name)

            # Check for permissions (octal mode must be healed to 444)
            cmd = ("stat -c '%a %n' {} | awk '{{print $1}}'"
                   .format(file_to_check))
            ret, permissions, _ = g.run(node, cmd)
            self.assertEqual(permissions.split('\n')[0], '444',
                             'Permissions %s is not equal to 444'
                             % permissions)
            g.log.info("Permissions are '444' for %s", file_name)

            # Check for user (owner must be healed to 'qa')
            cmd = ("ls -ld {} | awk '{{print $3}}'"
                   .format(file_to_check))
            ret, username, _ = g.run(node, cmd)
            self.assertEqual(username.split('\n')[0], 'qa',
                             'User %s is not equal qa' % username)
            g.log.info("User is 'qa' for %s", file_name)

            # Check for group (group must be healed to 'qa')
            cmd = ("ls -ld {} | awk '{{print $4}}'"
                   .format(file_to_check))
            ret, groupname, _ = g.run(node, cmd)
            self.assertEqual(groupname.split('\n')[0], 'qa',
                             'Group %s is not equal qa' % groupname)
            g.log.info("Group is 'qa' for %s", file_name)
def test_heal_gfid_1x3(self):
    """
    Verify gfid self-heal on a 1x3 replicate volume.

    1. file created at mount point
    2. 2 bricks brought down
    3. file deleted
    4. created a new file from the mount point
    5. all bricks brought online
    6. check if gfid worked correctly
    """
    # quorum-type fixed keeps the volume writable with 2 of 3 bricks down
    g.log.info("setting the quorum type to fixed")
    options = {"cluster.quorum-type": "fixed"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, "unable to set the quorum type to fixed")
    g.log.info("Successfully set the quorum type to fixed")

    g.log.info("creating a file from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 1 --base-file-name test_file --fixed-file-size 10k %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)
    # Validate I/O
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info("Successfully created a file from mount point")

    # getting list of all bricks
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, "unable to get list of bricks")

    g.log.info("bringing down brick1 and brick2")
    ret = bring_bricks_offline(self.volname, all_bricks[:2])
    self.assertTrue(ret, "unable to bring bricks offline")
    g.log.info("Successfully brought the following bricks offline "
               ": %s", str(all_bricks[:2]))

    g.log.info("deleting the file from mount point")
    # NOTE(review): the file stat'ed later is 'test_file0.txt', but this
    # removes '/test_file1'; 'rm -f' masks a miss -- confirm the intended
    # target name.
    command = "rm -f " + self.mounts[0].mountpoint + "/test_file1"
    ret, _, _ = g.run(self.mounts[0].client_system, command)
    self.assertEqual(ret, 0, "unable to remove file from mount point")
    g.log.info("Successfully deleted file from mountpoint")

    g.log.info("creating a new file of same name and different size "
               "from mount point")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s create_files "
               "-f 1 --base-file-name test_file --fixed-file-size 1M %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)
    # Validate I/O
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info("Successfully created a new file of same name "
               "from mount point")

    g.log.info("bringing bricks 1 and 2 back online")
    ret = bring_bricks_online(self.mnode, self.volname, all_bricks[:2])
    # BUGFIX: assertIsNotNone passed even when bring_bricks_online
    # returned False; assert truthiness of the boolean result instead.
    self.assertTrue(ret, "unable to bring bricks online")
    g.log.info("Successfully brought the following bricks online "
               ": %s", str(all_bricks[:2]))

    g.log.info("checking if stat structure of the file is returned")
    ret = get_file_stat(self.mounts[0].client_system,
                        self.mounts[0].mountpoint + '/test_file0.txt')
    self.assertTrue(ret, "unable to get file stats")
    g.log.info("file stat structure returned successfully")

    g.log.info("checking if the heal has completed")
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, "heal not completed")
    g.log.info("Self heal was completed successfully")

    g.log.info("checking if the areequal checksum of all the bricks in "
               "the subvol match")
    checksum_list = []
    for brick in all_bricks:
        node, brick_path = brick.split(':')
        command = "arequal-checksum -p " + brick_path + \
                  " -i .glusterfs -i .landfill"
        ret, out, _ = g.run(node, command)
        self.assertEqual(ret, 0, "unable to get the arequal checksum "
                                 "of the brick")
        checksum_list.append(out)

        # checking file size of healed file on each brick to verify
        # correctness of choice for sink and source
        stat_dict = get_file_stat(node, brick_path + '/test_file0.txt')
        self.assertEqual(stat_dict['size'], '1048576',
                         "file size of healed file is different "
                         "than expected")

    flag = all(val == checksum_list[0] for val in checksum_list)
    self.assertTrue(flag, "the arequal checksum of all bricks is"
                          "not same")
    g.log.info("the arequal checksum of all the bricks in the subvol "
               "is same")
def test_server_side_healing_happens_only_when_glustershd_running(self):
    """
    Test Script which verifies that the server side healing must happen
    only if the heal daemon is running on the node where source brick
    resides.

     * Create and start the Replicate volume
     * Check the glustershd processes - Only 1 glustershd should be
       listed
     * Bring down the bricks without affecting the cluster
     * Create files on volume
     * kill the glustershd on node where bricks is running
     * bring the bricks up which was killed in previous steps
     * check the heal info - heal info must show pending heal info, heal
       shouldn't happen since glustershd is down on source node
     * issue heal
     * trigger client side heal
     * heal should complete successfully
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-lines
    # Setting Volume options: enable client-side heal so the later
    # read-triggered (client side) heal can complete the pending entries
    options = {
        "metadata-self-heal": "on",
        "entry-self-heal": "on",
        "data-self-heal": "on"
    }
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Check the self-heal daemon process
    ret, pids = get_self_heal_daemon_pid(self.servers)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process "
                          "found : %s" % pids))
    g.log.info(
        "Successful in verifying self heal daemon process"
        " on all nodes %s", self.servers)

    # Select the bricks to bring offline
    bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
        self.mnode, self.volname))
    g.log.info("Brick List to bring offline : %s",
               bricks_to_bring_offline)

    # Bring down the selected bricks
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Brought down the brick process "
               "for %s", bricks_to_bring_offline)

    # Write files on all mounts while the bricks are down, creating
    # entries that will need healing
    all_mounts_procs, num_files_to_write = [], 100
    for mount_obj in self.mounts:
        cmd = ("/usr/bin/env python %s create_files "
               "-f %s --base-file-name file %s" % (self.script_upload_path,
                                                   num_files_to_write,
                                                   mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # Get online bricks list (these are the source bricks for heal)
    online_bricks = get_online_bricks_list(self.mnode, self.volname)
    g.log.info("Online Bricks for volume %s : %s",
               self.volname, online_bricks)

    # Get the nodes where bricks are running
    bring_offline_glustershd_nodes = []
    for brick in online_bricks:
        bring_offline_glustershd_nodes.append(brick.split(":")[0])
    g.log.info("self heal deamon on nodes %s to be killed",
               bring_offline_glustershd_nodes)

    # Kill the self heal daemon process on the source-brick nodes so no
    # server-side heal can run from them
    ret = bring_self_heal_daemon_process_offline(
        bring_offline_glustershd_nodes)
    self.assertTrue(
        ret, ("Unable to bring self heal daemon process"
              " offline for nodes %s" % bring_offline_glustershd_nodes))
    g.log.info(
        "Sucessfully brought down self heal process for "
        "nodes %s", bring_offline_glustershd_nodes)

    # Check the heal info
    heal_info = get_heal_info_summary(self.mnode, self.volname)
    g.log.info("Successfully got heal info %s for the volume %s",
               heal_info, self.volname)

    # Bring bricks online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline,
                              'glusterd_restart')
    self.assertTrue(
        ret,
        ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

    # Issue heal: expected to fail because glustershd is down on the
    # source nodes
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertFalse(ret, ("Able to trigger heal on volume %s where "
                           "self heal daemon is not running"
                           % self.volname))
    g.log.info(
        "Expected : Unable to trigger heal on volume %s where "
        "self heal daemon is not running", self.volname)

    # Wait for 130 sec to heal: heal must NOT complete in this window
    ret = monitor_heal_completion(self.mnode, self.volname, 130)
    self.assertFalse(ret, ("Heal Completed on volume %s" % self.volname))
    g.log.info("Expected : Heal pending on volume %s", self.volname)

    # Check the heal info
    heal_info_after_triggering_heal = get_heal_info_summary(
        self.mnode, self.volname)
    g.log.info("Successfully got heal info for the volume %s",
               self.volname)

    # Compare with heal pending with the files wrote: every source brick
    # must still report at least the written files as pending entries
    for node in online_bricks:
        self.assertGreaterEqual(
            int(heal_info_after_triggering_heal[node]['numberOfEntries']),
            num_files_to_write,
            ("Some of the files are healed from source bricks %s where "
             "self heal daemon is not running" % node))
    g.log.info("EXPECTED: No files are healed from source bricks where "
               "self heal daemon is not running")

    # Unmount and Mount volume again as volume options were set
    # after mounting the volume
    for mount_obj in self.mounts:
        ret, _, _ = umount_volume(mount_obj.client_system,
                                  mount_obj.mountpoint)
        self.assertEqual(ret, 0,
                         "Failed to unmount %s" % mount_obj.client_system)
        ret, _, _ = mount_volume(self.volname,
                                 mtype='glusterfs',
                                 mpoint=mount_obj.mountpoint,
                                 mserver=self.mnode,
                                 mclient=mount_obj.client_system)
        self.assertEqual(ret, 0,
                         "Failed to mount %s" % mount_obj.client_system)

    # Reading the files triggers client-side heal (the client-side heal
    # options were enabled at the start of the test)
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("/usr/bin/env python %s read %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "Reads failed on some of the clients")
    g.log.info("Reads successful on all mounts")

    # Wait for heal to complete (now driven by the client-side heal)
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Unable to heal the pending entries")
    g.log.info("Successfully healed the pending entries for volume %s",
               self.volname)
    def test_ec_version(self):
        """
        Verify that EC (disperse) metadata heals after staggered brick
        outages:

        - Create a directory on the mountpoint
        - Create files on the mountpoint
        - Bring down a brick say b1
        - Create more files on the mountpoint
        - Bring down another brick b2
        - Bring up brick b1
        - Wait for healing to complete
        - Check if EC version is updated
        - Check if EC size is updated
        """
        # pylint: disable=too-many-statements,too-many-branches,too-many-locals
        # Creating dir1 on the mountpoint
        ret = mkdir(self.mounts[0].client_system, "%s/dir1"
                    % self.mounts[0].mountpoint)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("Directory dir1 on %s created successfully",
                   self.mounts[0])

        # Creating files on client side for dir1
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        # Create dirs with file (10 files of ~10MB each: bs=1024 count=10000)
        command = ("cd %s/dir1; for i in {1..10};do"
                   " dd if=/dev/urandom of=file.$i "
                   "bs=1024 count=10000; done" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validating IO's and waiting to complete
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Bringing brick b1 offline; kept on self so teardown / later steps
        # can still see the subvolume's brick list
        sub_vols = get_subvols(self.mnode, self.volname)
        self.bricks_list1 = list(choice(sub_vols['volume_subvols']))
        brick_b1_down = choice(self.bricks_list1)
        ret = bring_bricks_offline(self.volname, brick_b1_down)
        self.assertTrue(ret, 'Brick %s is not offline' % brick_b1_down)
        g.log.info('Brick %s is offline successfully', brick_b1_down)

        # Reset the proc list in place so the next validate pass only
        # waits on the new IO
        del self.all_mounts_procs[:]

        # Creating files on client side for dir1 while b1 is down
        g.log.info("Generating data for %s:%s",
                   self.mounts[0].client_system, self.mounts[0].mountpoint)
        # Create dirs with file
        command = ("cd %s/dir1; for i in {11..20};do"
                   " dd if=/dev/urandom of=file.$i "
                   "bs=1024 count=10000; done" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system, command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validating IO's and waiting to complete
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Changing mode owner and group of files (metadata ops that must be
        # healed onto b1 later); g.run returns (exit-code, out, err), so
        # assertFalse(ret) asserts a zero exit status
        dir_file_range = '2..5'
        cmd = ('chmod 777 %s/dir1/file.{%s}'
               % (self.mounts[0].mountpoint, dir_file_range))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertFalse(ret, "Changing mode of files has failed")
        g.log.info("Mode of files have been changed successfully")

        cmd = ('chown root %s/dir1/file.{%s}'
               % (self.mounts[0].mountpoint, dir_file_range))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertFalse(ret, "Changing owner of files has failed")
        g.log.info("Owner of files have been changed successfully")

        cmd = ('chgrp root %s/dir1/file.{%s}'
               % (self.mounts[0].mountpoint, dir_file_range))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertFalse(ret, "Changing group of files has failed")
        g.log.info("Group of files have been changed successfully")

        # Create softlink and hardlink of files in mountpoint.
        cmd = ('cd %s/dir1/; '
               'for FILENAME in *; '
               'do ln -s $FILENAME softlink_$FILENAME; '
               'done;'
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertFalse(ret, "Creating Softlinks have failed")
        g.log.info("Softlink of files have been changed successfully")

        cmd = ('cd %s/dir1/; '
               'for FILENAME in *; '
               'do ln $FILENAME hardlink_$FILENAME; '
               'done;'
               % (self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertFalse(ret, "Creating Hardlinks have failed")
        g.log.info("Hardlink of files have been changed successfully")

        # Bringing brick b2 offline (a different brick of the same subvol)
        bricks_list2 = deepcopy(self.bricks_list1)
        bricks_list2.remove(brick_b1_down)
        brick_b2_down = choice(bricks_list2)
        ret = bring_bricks_offline(self.volname, brick_b2_down)
        self.assertTrue(ret, 'Brick %s is not offline' % brick_b2_down)
        g.log.info('Brick %s is offline successfully', brick_b2_down)

        # Bring brick b1 online via a glusterd restart
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [brick_b1_down],
                                  'glusterd_restart')
        self.assertTrue(ret, 'Brick %s is not brought'
                        'online' % brick_b1_down)
        g.log.info('Brick %s is online successfully', brick_b1_down)

        # Delete brick2 from brick list as we are not checking for heal
        # completion in brick 2 as it is offline
        self.bricks_list1.remove(brick_b2_down)

        # Check if EC version is same on all bricks which are up
        # (self.get_xattr presumably polls the xattr across
        # self.bricks_list1 until it matches — defined elsewhere in
        # this class; confirm before relying on its timeout behavior)
        ret = self.get_xattr("ec.version")
        self.assertTrue(ret, "Healing not completed and EC version is"
                        "not updated")
        g.log.info("Healing is completed and EC version is updated")

        # Check if EC size is same on all bricks which are up
        ret = self.get_xattr("ec.size")
        self.assertTrue(ret, "Healing not completed and EC size is"
                        "not updated")
        g.log.info("Healing is completed and EC size is updated")
def test_heal_when_quota_object_limit_exceeded(self): # Create a directory to set the quota_limit_objects path = "/dir" g.log.info("Creating a directory") self.all_mounts_procs = [] for mount_object in self.mounts: cmd = "/usr/bin/env python %s create_deep_dir -d 0 -l 0 %s%s" % ( self.script_upload_path, mount_object.mountpoint, path) ret = g.run(mount_object.client_system, cmd) self.assertTrue(ret, "Failed to create directory on mountpoint") g.log.info("Directory created successfully on mountpoint") # Enable Quota g.log.info("Enabling quota on the volume %s", self.volname) ret, _, _ = quota_enable(self.mnode, self.volname) self.assertEqual(ret, 0, ("Failed to enable quota on the volume " "%s", self.volname)) g.log.info("Successfully enabled quota on the volume %s", self.volname) # Set quota-soft-timeout to 0 g.log.info("Setting up soft timeout to 0") ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, "0") self.assertEqual(ret, 0, ("Failed to set quota-soft-timeout")) g.log.info("Successfully set the quota-soft-timeout") # Set quota-hard-timeout to 0 g.log.info("Setting up hard timeout with 0") ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, "0") self.assertEqual(ret, 0, ("Failed to set quota-hard-timeout")) g.log.info("successfully set the quota-hard-timeout") # Set Quota limit on the newly created directory g.log.info("Set Quota Limit on the path %s of the volume %s", path, self.volname) ret, _, _ = quota_limit_objects(self.mnode, self.volname, path=path, limit="5") self.assertEqual(ret, 0, ("Failed to set quota limit on path %s of " " the volume %s", path, self.volname)) g.log.info( "Successfully set the quota limit on %s of the volume " "%s", path, self.volname) # Create 3 files inside the directory for mount_object in self.mounts: g.log.info("Creating Files on %s:%s", mount_object.client_system, path) cmd = ("/usr/bin/env python %s create_files -f 3 " "--base-file-name file-0 %s%s" % (self.script_upload_path, mount_object.mountpoint, 
path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertEqual(ret, 0, ("Failed to create files on %s", path)) g.log.info("Files created successfully on mountpoint") bricks_list = get_all_bricks(self.mnode, self.volname) # Bring brick3 offline g.log.info('Bringing brick %s offline', bricks_list[2]) ret = bring_bricks_offline(self.volname, bricks_list[2]) self.assertTrue(ret, 'Failed to bring bricks %s offline' % bricks_list[2]) ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[2]) g.log.info('Bringing brick %s offline is successful', bricks_list[2]) # Try creating 5 more files, which should fail as the quota limit # exceeds cmd = ("/usr/bin/env python %s create_files -f 5 --base-file-name " "file-1 %s%s" % (self.script_upload_path, mount_object.mountpoint, path)) ret, _, _ = g.run(mount_object.client_system, cmd) self.assertNotEqual(ret, 0, ("Creating 5 files succeeded while it was" "not supposed to.")) g.log.info("Creating 5 files failed as expected due to quota object" "limit on the directory.") # Bring brick3 online and check status g.log.info('Bringing brick %s online', bricks_list[2]) ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[2]]) self.assertTrue(ret, 'Failed to bring brick %s online' % bricks_list[2]) g.log.info('Bringing brick %s online is successful', bricks_list[2]) g.log.info("Verifying if brick %s is online", bricks_list[2]) ret = are_bricks_online(self.mnode, self.volname, bricks_list) self.assertTrue(ret, ("Brick %s did not come up", bricks_list[2])) g.log.info("Brick %s has come online.", bricks_list[2]) # Trigger heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, 'Starting heal failed') g.log.info('Index heal launched') # Monitor heal completion ret = monitor_heal_completion(self.mnode, self.volname) self.assertTrue(ret, 'Heal has not yet completed') # Check if heal is completed ret = is_heal_complete(self.mnode, 
self.volname) self.assertTrue(ret, 'Heal is not complete') g.log.info('Heal is completed successfully')
    def test_metadata_split_brain_resolution(self):
        """
        Create a metadata split-brain by changing file permissions while
        alternate replica bricks are down, then resolve it with the CLI
        `split-brain source-brick` command and verify heal and arequals.
        """
        # Setting options: disable all client-side self-heal so the
        # split-brain is not healed accidentally
        g.log.info('Setting options...')
        options = {"metadata-self-heal": "off",
                   "entry-self-heal": "off",
                   "data-self-heal": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s",
                   options, self.volname)

        # Creating files and directories on client side
        g.log.info('Creating files and directories...')
        cmd = ("mkdir %s/test_metadata_sb && cd %s/test_metadata_sb &&"
               "for i in `seq 1 3`; do mkdir dir.$i; for j in `seq 1 5`;"
               "do dd if=/dev/urandom of=dir.$i/file.$j bs=1K count=1;"
               "done; dd if=/dev/urandom of=file.$i bs=1K count=1; done"
               % (self.mounts[0].mountpoint, self.mounts[0].mountpoint))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Creating files and directories failed")
        g.log.info("Files & directories created successfully")

        # Check arequals for all the bricks (baseline before outages)
        g.log.info('Getting arequal before getting bricks offline...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        # Set option self-heal-daemon to OFF so the daemon cannot heal
        # while we stage the split-brain
        g.log.info('Setting option self-heal-daemon to off...')
        options = {"self-heal-daemon": "off"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

        bricks_list = get_all_bricks(self.mnode, self.volname)

        # Bring brick1 offline
        g.log.info('Bringing brick %s offline', bricks_list[0])
        ret = bring_bricks_offline(self.volname, bricks_list[0])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[0])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[0]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[0])
        g.log.info('Bringing brick %s offline is successful',
                   bricks_list[0])

        # Change metadata of some files & directories while brick1 is down
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2`; do chmod -R 0555 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Bring brick1 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[0])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[0]])
        self.assertTrue(ret, 'Failed to bring brick %s online'
                        % bricks_list[0])
        g.log.info('Bringing brick %s online is successful', bricks_list[0])

        g.log.info("Verifying if brick %s is online", bricks_list[0])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Brick %s did not come up", bricks_list[0]))
        g.log.info("Brick %s has come online.", bricks_list[0])

        # Bring brick2 offline
        g.log.info('Bringing brick %s offline', bricks_list[1])
        ret = bring_bricks_offline(self.volname, bricks_list[1])
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_list[1])

        ret = are_bricks_offline(self.mnode, self.volname,
                                 [bricks_list[1]])
        self.assertTrue(ret, 'Brick %s is not offline' % bricks_list[1])
        g.log.info('Bringing brick %s offline is successful',
                   bricks_list[1])

        # Change metadata of same files & directories as before — the two
        # bricks now hold conflicting metadata, i.e. split-brain
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2` ; do chmod -R 0777 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Bring brick2 online and check the status
        g.log.info('Bringing brick %s online', bricks_list[1])
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [bricks_list[1]])
        self.assertTrue(ret, 'Failed to bring brick %s online'
                        % bricks_list[1])
        g.log.info('Bringing brick %s online is successful', bricks_list[1])

        g.log.info("Verifying if brick %s is online", bricks_list[1])
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Brick %s did not come up", bricks_list[1]))
        g.log.info("Brick %s has come online.", bricks_list[1])

        # Set option self-heal-daemon to ON so resolution can be healed
        g.log.info('Setting option self-heal-daemon to on...')
        options = {"self-heal-daemon": "on"}
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

        g.log.info("Checking if files are in split-brain")
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to create split-brain scenario")
        g.log.info("Successfully created split brain scenario")

        g.log.info("Resolving split-brain by using the source-brick option "
                   "by choosing second brick as source for all the files")
        node, _ = bricks_list[1].split(':')
        command = ("gluster v heal " + self.volname + " split-brain "
                   "source-brick " + bricks_list[1])
        ret, _, _ = g.run(node, command)
        self.assertEqual(ret, 0, "Command execution not successful")

        # waiting for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal not completed")

        # Do lookup on the files from mount
        cmd = ("ls -lR %s/test_metadata_sb" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Failed to lookup")
        g.log.info("Lookup successful")

        # Checking if files are still in split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, "File still in split-brain")
        g.log.info("Successfully resolved split brain situation using "
                   "CLI based resolution")

        # Check arequals for all the bricks after heal
        g.log.info('Getting arequal for all the bricks after heal...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal after heal is successful')

        # Change metadata of same files & directories as before to confirm
        # the volume is writable again after resolution
        cmd = ("cd %s/test_metadata_sb &&"
               "for i in `seq 1 2` ; do chmod -R 0555 dir.$i file.$i ; done"
               % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Updating file permissions failed")
        g.log.info("File permissions updated successfully")

        # Do lookup on the mount
        cmd = ("find %s | xargs stat" % self.mounts[0].mountpoint)
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Lookup on the mount failed")
        g.log.info("Lookup on the mount is successful")

        # Check arequals for all the bricks
        g.log.info('Getting arequal for all the bricks...')
        self.verify_brick_arequals()
        g.log.info('Getting arequal is successful')
    def test_resolving_meta_data(self):
        """
        - Create a file test_file.txt
        - Find out which brick the file resides on and kill arbiter brick
          in the replica pair
        - Modify the permissions of the file
        - Bring back the killed brick
        - Kill the other brick in the replica pair
        - Modify the permissions of the file
        - Bring back the killed brick
        - Trigger heal
        - Check if heal is completed
        - Check for split-brain
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        file_to_create = 'test_file.txt'
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Create file
            g.log.info('Creating file...')
            command = ("cd %s ; "
                       "touch %s"
                       % (mount_obj.mountpoint, file_to_create))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True

        # get bricks with file; the arbiter brick is the last brick of a
        # replica set, hence [-1] below selects it
        g.log.info('Getting bricks with file...')
        subvols_dict = get_subvols(self.mnode, self.volname)
        brick_list_with_file = []
        for subvol in subvols_dict['volume_subvols']:
            for brick in subvol:
                node, brick_path = brick.split(':')
                # substring match on the raw `ls` output is sufficient here
                ret, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
                if 'test_file.txt' in brick_file_list:
                    brick_list_with_file.append(brick)
        g.log.info('Bricks with file: %s', brick_list_with_file)

        # Bring arbiter brick offline
        bricks_to_bring_offline = [brick_list_with_file[-1]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data while the arbiter is down
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify the permissions
            g.log.info('Modifying the permissions of the file...')
            command = ("cd %s ; "
                       "chmod 600 %s"
                       % (mount_obj.mountpoint, file_to_create))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring arbiter brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Bring 1-st data brick offline
        bricks_to_bring_offline = [brick_list_with_file[0]]
        g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data again so the downed data brick misses this change
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify the permissions
            g.log.info('Modifying the permissions of the file...')
            command = ("cd %s ; "
                       "chmod 644 %s"
                       % (mount_obj.mountpoint, file_to_create))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring 1-st data brick online
        g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain: arbiter must have prevented it
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
def test_afr_gfid_heal(self): """ Description: This test case runs split-brain resolution on a 5 files in split-brain on a 1x2 volume. After resolving split-brain, it makes sure that split brain resolution doesn't work on files already in split brain. """ g.log.info("disabling the self heal daemon") ret = disable_self_heal_daemon(self.mnode, self.volname) self.assertTrue(ret, "unable to disable self heal daemon") g.log.info("Successfully disabled the self heal daemon") # getting list of all bricks all_bricks = get_all_bricks(self.mnode, self.volname) self.assertIsNotNone(all_bricks, "failed to get list of bricks") g.log.info("bringing down brick1") ret = bring_bricks_offline(self.volname, all_bricks[0:1]) self.assertTrue(ret, "unable to bring brick1 offline") g.log.info("Successfully brought the following brick offline " ": %s", str(all_bricks[0])) g.log.info("verifying if brick1 is offline") ret = are_bricks_offline(self.mnode, self.volname, all_bricks[0:1]) self.assertTrue(ret, "brick1 is still online") g.log.info("verified: brick1 is offline") g.log.info("creating 5 files from mount point") all_mounts_procs = [] for mount_obj in self.mounts: cmd = ("python %s create_files " "-f 5 --base-file-name test_file --fixed-file-size 1k %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) # Validate I/O g.log.info("Wait for IO to complete and validate IO.....") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("IO is successful on all mounts") g.log.info("Successfully created a file from mount point") g.log.info("bringing brick 1 back online") ret = bring_bricks_online(self.mnode, self.volname, all_bricks[0:1]) self.assertIsNotNone(ret, "unable to bring brick 1 online") g.log.info("Successfully brought the following brick online " ": %s", str(all_bricks[0])) g.log.info("verifying if brick1 is 
online") ret = are_bricks_online(self.mnode, self.volname, all_bricks[0:1]) self.assertTrue(ret, "brick1 is not online") g.log.info("verified: brick1 is online") g.log.info("bringing down brick2") ret = bring_bricks_offline(self.volname, all_bricks[1:2]) self.assertTrue(ret, "unable to bring brick2 offline") g.log.info("Successfully brought the following brick offline " ": %s", str(all_bricks[1])) g.log.info("verifying if brick2 is offline") ret = are_bricks_offline(self.mnode, self.volname, all_bricks[1:2]) self.assertTrue(ret, "brick2 is still online") g.log.info("verified: brick2 is offline") g.log.info("creating 5 new files of same name from mount point") all_mounts_procs = [] for mount_obj in self.mounts: cmd = ("python %s create_files " "-f 5 --base-file-name test_file --fixed-file-size 10k %s" % (self.script_upload_path, mount_obj.mountpoint)) proc = g.run_async(mount_obj.client_system, cmd, user=mount_obj.user) all_mounts_procs.append(proc) # Validate I/O g.log.info("Wait for IO to complete and validate IO.....") ret = validate_io_procs(all_mounts_procs, self.mounts) self.assertTrue(ret, "IO failed on some of the clients") g.log.info("IO is successful on all mounts") g.log.info("Successfully created a new file of same name " "from mount point") g.log.info("bringing brick2 back online") ret = bring_bricks_online(self.mnode, self.volname, all_bricks[1:2]) self.assertIsNotNone(ret, "unable to bring brick2 online") g.log.info("Successfully brought the following brick online " ": %s", str(all_bricks[1])) g.log.info("verifying if brick2 is online") ret = are_bricks_online(self.mnode, self.volname, all_bricks[1:2]) self.assertTrue(ret, "brick2 is not online") g.log.info("verified: brick2 is online") g.log.info("enabling the self heal daemon") ret = enable_self_heal_daemon(self.mnode, self.volname) self.assertTrue(ret, "failed to enable self heal daemon") g.log.info("Successfully enabled the self heal daemon") g.log.info("checking if volume is in split-brain") ret 
= is_volume_in_split_brain(self.mnode, self.volname) self.assertTrue(ret, "unable to create split-brain scenario") g.log.info("Successfully created split brain scenario") g.log.info("resolving split-brain by choosing first brick as " "the source brick") node, brick_path = all_bricks[0].split(':') for fcount in range(5): command = ("gluster v heal " + self.volname + " split-brain " "source-brick " + all_bricks[0] + ' /test_file' + str(fcount) + '.txt') ret, _, _ = g.run(node, command) self.assertEqual(ret, 0, "command execution not successful") # triggering heal ret = trigger_heal(self.mnode, self.volname) self.assertTrue(ret, "heal not triggered") g.log.info("Successfully triggered heal") # waiting for heal to complete ret = monitor_heal_completion(self.mnode, self.volname, timeout_period=240) self.assertTrue(ret, "heal not completed") g.log.info("Heal completed successfully") # checking if any file is in split-brain ret = is_volume_in_split_brain(self.mnode, self.volname) self.assertFalse(ret, "file still in split-brain") g.log.info("Successfully resolved split brain situation using " "CLI based resolution") g.log.info("resolving split-brain on a file not in split-brain") node, brick_path = all_bricks[0].split(':') command = ("gluster v heal " + self.volname + " split-brain " "source-brick " + all_bricks[1] + " /test_file0.txt") ret, _, _ = g.run(node, command) self.assertNotEqual( ret, 0, "Unexpected: split-brain resolution " "command is successful on a file which" " is not in split-brain") g.log.info("Expected: split-brian resolution command failed on " "a file which is not in split-brain") g.log.info("checking the split-brain status of each file") for fcount in range(5): fpath = (self.mounts[0].mountpoint + '/test_file' + str(fcount) + '.txt') status = get_fattr(self.mounts[0].client_system, fpath, 'replica.split-brain-status') compare_string = ("The file is not under data or metadata " "split-brain") self.assertEqual( status.rstrip('\x00'), compare_string, 
"file test_file%s is under" " split-brain" % str(fcount)) g.log.info("none of the files are under split-brain")
    def test_gluster_clone_heal(self):
        """
        Test gluster compilation on mount point(Heal command)
        - Creating directory test_compilation
        - Compile gluster on mountpoint
        - Select bricks to bring offline
        - Bring brick offline
        - Validate IO
        - Bring bricks online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        - Compile gluster on mountpoint again
        - Select bricks to bring offline
        - Bring brick offline
        - Validate IO
        - Bring bricks online
        - Wait for volume processes to be online
        - Verify volume's all process are online
        - Monitor heal completion
        - Check for split-brain
        - Get arequal after getting bricks online
        """
        # pylint: disable=too-many-branches,too-many-statements,too-many-locals
        # Creating directory test_compilation
        ret = mkdir(self.mounts[0].client_system,
                    "{}/test_compilation".format(self.mounts[0].mountpoint))
        self.assertTrue(ret, "Failed to create directory")
        g.log.info("Directory 'test_compilation' on %s created "
                   "successfully", self.mounts[0])

        # Compile gluster on mountpoint — long-running IO workload used as
        # realistic traffic while bricks go down (needs network access to
        # github; the command is launched asynchronously and validated later)
        cmd = ("cd %s/test_compilation ; rm -rf glusterfs; git clone"
               " git://github.com/gluster/glusterfs.git ; cd glusterfs ;"
               " ./autogen.sh ;./configure CFLAGS='-g3 -O0 -DDEBUG'; make ;"
               " cd ../..;" % self.mounts[0].mountpoint)
        proc = g.run_async(self.mounts[0].client_system, cmd)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks {} are not offline'.format(
            bricks_to_bring_offline))
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Validate IO (waits for the async compilation to finish)
        self.assertTrue(validate_io_procs([proc], self.mounts[0]),
                        "IO failed on some of the clients")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks {} online'.format(
            bricks_to_bring_offline))

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume {} : All process are not online".format(
            self.volname)))

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info("Arequal of mountpoint %s", result_after_online)

        # Compile gluster on mountpoint again and repeat the whole
        # offline/online/heal cycle a second time
        proc1 = g.run_async(self.mounts[0].client_system, cmd)

        # Select bricks to bring offline
        bricks_to_bring_offline = select_volume_bricks_to_bring_offline(
            self.mnode, self.volname)
        self.assertIsNotNone(bricks_to_bring_offline, "List is empty")

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks {} offline'.format(
            bricks_to_bring_offline))

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks {} are not offline'.format(
            bricks_to_bring_offline))

        # Validate IO
        self.assertTrue(validate_io_procs([proc1], self.mounts[0]),
                        "IO failed on some of the clients")

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks {} online'.format(
            bricks_to_bring_offline))

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume {} processes to "
                              "be online".format(self.volname)))

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode,
                                                      self.volname)
        self.assertTrue(ret, ("Volume {} : All process are not online".format(
            self.volname)))

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')

        # Get arequal after getting bricks online
        ret, result_after_online = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info("Arequal of mountpoint %s", result_after_online)
def test_self_heal_algorithm_full_daemon_off(self):
    """
    Description:- Checking healing when algorithm is set to "full"
    and self heal daemon is "off".
    """
    # pylint: disable=too-many-statements
    # Disable client-side healing and the self-heal daemon, and force the
    # "full" data-self-heal algorithm so the later heal copies whole files.
    options = {"metadata-self-heal": "disable",
               "entry-self-heal": "disable",
               "data-self-heal": "disable",
               "data-self-heal-algorithm": "full",
               "self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, "Failed to set the volume options %s" % options)
    g.log.info(" Volume set options success")

    # Select bricks to bring down
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)

    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring bricks: %s offline"
                    % bricks_to_bring_offline)
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Not all the bricks in list:%s are offline"
                    % bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks %s are all offline",
               bricks_to_bring_offline)

    # IO on the mount point while the selected bricks are down, so that
    # these files need healing once the bricks come back.
    all_mounts_procs = []
    g.log.info("Creating Files on %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    cmd = ("cd %s ;for i in `seq 1 100` ;"
           "do dd if=/dev/urandom of=file$i bs=1M "
           "count=1;done" % self.mounts[0].mountpoint)
    proc = g.run_async(self.mounts[0].client_system, cmd,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )

    # Collecting Arequal before bringing the bricks up.
    # BUG FIX: collect_mounts_arequal() returns a (ret, arequals) tuple
    # (see its other call sites in this module); unpack it and verify the
    # status instead of storing and comparing the raw tuple.
    g.log.info("Collecting Arequal before the bring of bricks down")
    ret, result_before = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal before healing')

    # Turning self heal daemon ON so that healing can happen
    optionstwo = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, optionstwo)
    self.assertTrue(ret, "Failed to turn self-heal ON")
    g.log.info("Volume set options %s: success", optionstwo)

    # Bring bricks online
    g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring bricks: %s online"
                    % bricks_to_bring_offline)
    g.log.info("Successfully brought all bricks:%s online",
               bricks_to_bring_offline)

    # Waiting for bricks to come online
    g.log.info("Waiting for brick process to come online")
    ret = wait_for_bricks_to_be_online(self.mnode, self.volname,
                                       timeout=30)
    self.assertTrue(ret, "bricks didn't come online after adding bricks")
    g.log.info("Bricks are online")

    # Verifying all bricks online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, "Volume %s : All process are not online"
                    % self.volname)
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self heal processes to come online
    g.log.info("Wait for selfheal process to come online")
    ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname,
                                                  timeout=300)
    self.assertTrue(ret, "Self-heal process are not online")
    g.log.info("All self heal process are online")

    # Wait for self-heal to complete
    g.log.info("Wait for self-heal to complete")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                    "for 20 minutes. 20 minutes is too much a time for "
                    "current test workload")
    g.log.info("self-heal is successful after replace-brick operation")

    # arequal after healing (fixed copy-pasted log message)
    g.log.info("Collecting Arequal after healing")
    ret, result_after = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal after healing')

    # Comparing the results: checksums before and after heal must match,
    # proving the full-algorithm heal lost no data.
    g.log.info("comparing both the results")
    self.assertEqual(result_before, result_after,
                     "Arequals are not equal")
def test_glustershd_with_restarting_glusterd(self):
    """
    Test Script to verify the self heal daemon process with restarting
    glusterd and rebooting the server

    * stop all volumes
    * restart glusterd - should not run self heal daemon process
    * start replicated involved volumes
    * single self heal daemon process running
    * restart glusterd
    * self heal daemon pid will change
    * bring down brick and restart glusterd
    * self heal daemon pid will change and its different from previous
    * brought up the brick
    """
    # pylint: disable=too-many-statements
    nodes = self.volume['servers']

    # stop the volume
    g.log.info("Stopping the volume %s", self.volname)
    # BUG FIX: volume_stop() returns a (ret, out, err) tuple (the
    # volume_start() call sites elsewhere in this module unpack the same
    # shape); asserting on the raw tuple is always truthy, so a failed
    # stop was silently ignored.
    ret, _, _ = volume_stop(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to stop volume %s" % self.volname))
    g.log.info("Successfully stopped volume %s", self.volname)

    # check the self heal daemon process after stopping the volume:
    # shd must NOT be running for a stopped volume
    g.log.info("Verifying the self heal daemon process for "
               "volume %s", self.volname)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self Heal Daemon process is still running "
                           "even after stopping volume %s" % self.volname))
    g.log.info("Self Heal Daemon is not running after stopping "
               "volume %s", self.volname)

    # restart glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                          nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    self.assertTrue(
        wait_for_glusterd_to_start(self.servers),
        "Failed to start glusterd on %s" % self.servers)

    # glusterd restart alone must not spawn shd while the volume is
    # still stopped
    g.log.info("Starting to get self-heal daemon process on"
               " nodes %s", nodes)
    ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
    self.assertFalse(ret, ("Self Heal Daemon process is running after "
                           "glusterd restart with volume %s in "
                           "stop state" % self.volname))
    g.log.info("Self Heal Daemon is not running after stopping "
               "volume and restarting glusterd %s", self.volname)

    # start the volume
    g.log.info("Starting the volume %s", self.volname)
    # BUG FIX: volume_start() also returns (ret, out, err); unpack it so
    # the assertion can actually fail (as done elsewhere in this module).
    ret, _, _ = volume_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start volume %s" % self.volname))
    g.log.info("Volume %s started successfully", self.volname)

    # Verify glustershd process releases its parent process
    g.log.info("Checking whether glustershd process is daemonized or not")
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("Single self heal daemon process on all nodes %s", nodes)

    # get the self heal daemon pids after starting volume
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    g.log.info("Successful in getting self heal daemon pids")
    glustershd_pids = pids

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # validate the bricks present in volume info
    # with glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different from "
                          "glustershd server volume file. "
                          "Please check log file for details."))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # restart glusterd service on all the servers
    g.log.info("Restarting glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                          nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online", self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self heal daemon process after starting volume and
    # restarting glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    glustershd_pids_after_glusterd_restart = pids

    # shd must have been respawned by the glusterd restart -> new pids
    self.assertNotEqual(glustershd_pids,
                        glustershd_pids_after_glusterd_restart,
                        ("Self Heal Daemon pids are same after "
                         "restarting glusterd process"))
    g.log.info("Self Heal Daemon process are different before and "
               "after restarting glusterd process")

    # select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # bring bricks offline
    g.log.info("Going to bring down the brick process "
               "for %s", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                          "check the log file for more details."))
    g.log.info("Brought down the brick process "
               "for %s successfully", bricks_to_bring_offline)

    # restart glusterd after brought down the brick
    g.log.info("Restart glusterd on all servers %s", nodes)
    ret = restart_glusterd(nodes)
    self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                          nodes))
    g.log.info("Successfully restarted glusterd on all nodes %s", nodes)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               60)
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online", self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self heal daemon process after killing brick and
    # restarting glusterd process
    g.log.info("Starting to get self-heal daemon process "
               "on nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))
    glustershd_pids_after_killing_brick = pids

    self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                        glustershd_pids_after_killing_brick,
                        ("Self Heal Daemon process are same from before "
                         "killing the brick,restarting glusterd process"))
    g.log.info("Self Heal Daemon process are different after killing the "
               "brick, restarting the glusterd process")

    # brought the brick online
    g.log.info("bringing up the bricks : %s online",
               bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to brought the bricks online"))
    g.log.info("Successfully brought the bricks online")

    # check all bricks are online (fixed "bricka" typo in log message)
    g.log.info("Verifying all bricks are online or not.....")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, ("Not all bricks are online"))
    g.log.info("All bricks are online.")
def test_self_heal(self):
    """
    Description:-
    - Create files on mount point
    - Kill one brick from volume
    - rm -rfv on mount point
    - bring bricks online
    - wait for heals
    - list
    """
    # pylint: disable=too-many-statements
    # IO on the mount point
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dirname-start-num %d "
               "--dir-depth 2 "
               "--dir-length 35 "
               "--max-num-of-dirs 5 "
               "--num-of-files 5 %s" % (
                   self.script_upload_path,
                   self.counter, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        # bump the start number so each mount writes distinct dir names
        self.counter = self.counter + 10

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Killing one brick from the volume set
    g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, ("Failed to bring bricks: %s offline",
                          bricks_to_bring_offline))
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Validate if bricks are offline
    g.log.info("Validating if bricks: %s are offline",
               bricks_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Not all the bricks in list: %s are offline" %
                    bricks_to_bring_offline)
    g.log.info("Successfully validated that bricks: %s are all offline",
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.io_validation_complete = True

    # Checking volume status
    g.log.info("Logging volume info and Status after bringing bricks "
               "offline from the volume %s", self.volname)
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed on "
                          "volume %s", self.volname))
    g.log.info("Successful in logging volume info and status of volume %s",
               self.volname)

    # Removing files from the mount point when one brick is down
    g.log.info("Removing files from the mount point")
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system
    cmd = "rm -rfv %s/*" % mountpoint
    ret, _, _ = g.run(client, cmd)
    # CONSISTENCY FIX: report the failure through an assertion like the
    # rest of this test instead of raising ExecutionError by hand.
    self.assertEqual(ret, 0, "failed to delete the files")

    # Bringing bricks online
    g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Check if bricks are online
    g.log.info("Checking bricks are online or not")
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not online'
                    % bricks_to_bring_offline)
    g.log.info('Bricks %s are online', bricks_to_bring_offline)

    # Monitoring heals on the volume
    g.log.info("Wait for heal completion...")
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "Self heal didn't complete even after waiting "
                    "for 20 minutes.")
    # Fixed copy-pasted log message: this test never changes volume type.
    g.log.info("self-heal is successful after bringing the bricks online")

    # List all files and dirs created
    g.log.info("List all files and directories:")
    ret = list_all_files_and_dirs_mounts(self.mounts)
    self.assertTrue(ret, "Failed to list all files and dirs")
    g.log.info("Listing all files and directories is successful")
def test_dist_to_repl_automatic_heal_should_be_triggered(self):
    """
    Verify that converting a distribute (single-brick) volume to a
    1x2 replica via add-brick automatically triggers a full heal of
    the new brick.

    - create a single brick volume
    - add some files and directories
    - get arequal from mountpoint
    - add-brick such that this brick makes the volume a replica vol 1x2
    - make sure heal is completed
    - get arequals from all bricks and compare with arequal from
      mountpoint
    - bring down brick 0
    - create new files and validate IO
    - bring brick 0 up
    - make sure heal is completed
    """
    # pylint: disable=too-many-statements,too-many-locals
    # Start IO on mounts: small deep-dir dataset to be replicated later
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "--dir-length 1 "
               "--dir-depth 1 "
               "--max-num-of-dirs 1 "
               "--num-of-files 10 %s" % (self.script_upload_path,
                                         mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   mount_obj.client_system, mount_obj.mountpoint)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get arequal for mount before adding bricks; this is the reference
    # checksum the healed brick must match later.
    g.log.info('Getting arequal before adding bricks...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    # NOTE(review): log message says "after healing" but this runs before
    # adding bricks -- message looks copy-pasted.
    g.log.info('Getting arequal after healing is successful')
    # last line of arequal output carries the total checksum after ':'
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    # Form brick list to add (one brick -> makes it a 1x2 replica)
    g.log.info('Forming brick list to add...')
    bricks_to_add = form_bricks_list(self.mnode, self.volname, 1,
                                     self.servers, self.all_servers_info)
    g.log.info('Brick list to add: %s', bricks_to_add)

    # Add bricks; ret is the command exit status, 0 on success
    g.log.info("Start adding bricks to volume...")
    ret, _, _ = add_brick(self.mnode, self.volname, bricks_to_add,
                          force=True, replica_count=2)
    self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
    g.log.info("Adding bricks is successful on volume %s", self.volname)

    # Make sure the newly added bricks are available in the volume
    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)
    for brick in bricks_to_add:
        self.assertIn(brick, bricks_list,
                      'Brick %s is not in brick list' % brick)
    g.log.info('New bricks are present in the volume')

    # Make sure volume change from distribute to replicate volume
    vol_info_dict = get_volume_type_info(self.mnode, self.volname)
    vol_type = vol_info_dict['volume_type_info']['typeStr']
    self.assertEqual(
        'Replicate', vol_type,
        'Volume type is not converted to Replicate '
        'after adding bricks')
    g.log.info('Volume type is successfully converted to Replicate '
               'after adding bricks')

    # Monitor heal completion: the add-brick should have triggered an
    # automatic heal of the new replica
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal on bricks and compare with mount_point_total
    # It should be the same
    g.log.info('Getting arequal on bricks...')
    arequals_after_heal = {}
    for brick in bricks_list:
        g.log.info('Getting arequal on bricks %s...', brick)
        # brick is "host:/path"; run arequal-checksum directly on the
        # brick host, skipping gluster-internal directories
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        self.assertFalse(ret, 'Failed to get arequal on brick %s'
                         % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        arequals_after_heal[brick] = brick_total
        self.assertEqual(mount_point_total, brick_total,
                         'Arequals for mountpoint and %s are not equal'
                         % brick)
        g.log.info('Arequals for mountpoint and %s are equal', brick)
    g.log.info('All arequals are equal for replicated')

    # Bring brick 0 offline to create a heal backlog from the new IO
    g.log.info('Bringing bricks %s offline...', bricks_list[0])
    ret = bring_bricks_offline(self.volname, [bricks_list[0]])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[0])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[0]])
    self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0])
    g.log.info('Bringing bricks %s offline is successful', bricks_list[0])

    # Start IO on mounts while brick 0 is down
    g.log.info("Starting IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1k %s" % (self.script_upload_path,
                                            mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is started successfully",
                   mount_obj.client_system, mount_obj.mountpoint)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick 0 online; heal should be triggered automatically
    g.log.info('Bringing bricks %s online...', bricks_list[0])
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[0]])
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_list[0])
    g.log.info('Bringing bricks %s online is successful', bricks_list[0])

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')
def test_brick_process_not_started_on_read_only_node_disks(self):
    """
    Verify that a brick process refuses to start when its backing disk
    is mounted read-only, and logs the expected posix translator error.

    * create volume and start
    * kill one brick
    * start IO
    * unmount the brick directory from node
    * remount the brick directory with read-only option
    * start the volume with "force" option
    * check for error 'posix: initializing translator failed' in log file
    * remount the brick directory with read-write option
    * start the volume with "force" option
    * validate IO
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Creating files for all volumes (validated at the end of the test,
    # after the brick disk has been restored read-write)
    for mount_obj in self.mounts:
        g.log.info("Starting IO on %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        cmd = ("/usr/bin/env python %s create_files -f 100 "
               "%s/%s/test_dir" % (self.script_upload_path,
                                   mount_obj.mountpoint,
                                   mount_obj.client_system))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)

    # umount brick
    brick_node, volume_brick = bricks_to_bring_offline[0].split(':')
    # assumes the mount point is the first two path components of the
    # brick path (e.g. /bricks/brick1/<subdir>) -- TODO confirm layout
    node_brick = '/'.join(volume_brick.split('/')[0:3])
    g.log.info('Start umount brick %s...', node_brick)
    ret, _, _ = g.run(brick_node, 'umount -l %s' % node_brick)
    self.assertFalse(ret, 'Failed to umount brick %s' % node_brick)
    g.log.info('Successfully umounted %s', node_brick)

    # get time before remount the directory and checking logs for error;
    # 'date -u +%s' gives epoch seconds, used later to bound the log
    # message's timestamp
    g.log.info('Getting time before remount the directory and '
               'checking logs for error...')
    _, time_before_checking_logs, _ = g.run(brick_node, 'date -u +%s')
    g.log.info('Time before remount the directory and checking logs - %s',
               time_before_checking_logs)

    # remount the directory with read-only option
    g.log.info('Start remount brick %s with read-only option...',
               node_brick)
    ret, _, _ = g.run(brick_node, 'mount -o ro %s' % node_brick)
    self.assertFalse(ret, 'Failed to remount brick %s' % node_brick)
    g.log.info('Successfully remounted %s with read-only option',
               node_brick)

    # start volume with "force" option; volume start itself succeeds
    # even though the brick on the read-only disk cannot come up
    g.log.info('starting volume with "force" option...')
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertFalse(
        ret,
        'Failed to start volume %s with "force" option' % self.volname)
    g.log.info('Successfully started volume %s with "force" option',
               self.volname)

    # check logs for an 'initializing translator failed' error
    g.log.info(
        "Checking logs for an 'initializing translator failed' "
        "error for %s brick...", node_brick)
    error_msg = 'posix: initializing translator failed'
    # brick log file name is derived from the last three components of
    # the brick path joined by '-'
    cmd = ("cat /var/log/glusterfs/bricks/%s-%s-%s.log | "
           "grep '%s'" % (volume_brick.split('/')[-3],
                          volume_brick.split('/')[-2],
                          volume_brick.split('/')[-1], error_msg))
    ret, log_msgs, _ = g.run(brick_node, cmd)
    log_msg = log_msgs.rstrip().split('\n')[-1]
    self.assertTrue(error_msg in log_msg, 'No errors in logs')
    g.log.info('EXPECTED: %s', error_msg)

    # get time from log message
    # assumes the log line starts with "[YYYY-MM-DD HH:MM:SS.ffffff] E"
    # so splitting at 'E', trimming brackets and dropping the fractional
    # part yields the timestamp -- TODO confirm against log format
    log_time_msg = log_msg.split('E')[0][1:-2].split('.')[0]
    log_time_msg_converted = calendar.timegm(
        time.strptime(log_time_msg, '%Y-%m-%d %H:%M:%S'))
    g.log.info('Time_msg from logs - %s ', log_time_msg)
    g.log.info('Time from logs - %s ', log_time_msg_converted)

    # get time after remount the directory checking logs for error
    g.log.info('Getting time after remount the directory and '
               'checking logs for error...')
    _, time_after_checking_logs, _ = g.run(brick_node, 'date -u +%s')
    g.log.info('Time after remount the directory and checking logs - %s',
               time_after_checking_logs)

    # check time periods: the error must have been logged between the
    # two timestamps captured above, i.e. by THIS remount, not an old run
    g.log.info('Checking if an error is in right time period...')
    self.assertTrue(
        int(time_before_checking_logs) <= int(log_time_msg_converted) <=
        int(time_after_checking_logs),
        'Expected error is not in right time period')
    g.log.info('Expected error is in right time period')

    # umount brick
    g.log.info('Start umount brick %s...', node_brick)
    ret, _, _ = g.run(brick_node, 'umount -l %s' % node_brick)
    self.assertFalse(ret, 'Failed to umount brick %s' % node_brick)
    g.log.info('Successfully umounted %s', node_brick)

    # remount the directory with read-write option (default options,
    # presumably from fstab -- TODO confirm)
    g.log.info('Start remount brick %s with read-write option...',
               node_brick)
    ret, _, _ = g.run(brick_node, 'mount %s' % node_brick)
    self.assertFalse(ret, 'Failed to remount brick %s' % node_brick)
    g.log.info('Successfully remounted %s with read-write option',
               node_brick)

    # start volume with "force" option; this time the brick should start
    g.log.info('starting volume with "force" option...')
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertFalse(
        ret,
        'Failed to start volume %s with "force" option' % self.volname)
    g.log.info('Successfully started volume %s with "force" option',
               self.volname)

    # Validate IO started at the top of the test
    g.log.info('Validating IO on all mounts')
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    g.log.info('Successfully Validated IO on all mounts')
    self.io_validation_complete = True
def test_heal_info_no_hang(self):
    """
    Verify 'heal info' does not hang while a large heal backlog exists.

    Testcase steps:
    1. Start kernel untar on the mount
    2. While untar is going on, kill a brick of the replica.
    3. Wait for the untar to be over, resulting in pending heals.
    4. Get the approx. number of pending heals and save it
    5. Bring the brick back online.
    6. Trigger heal
    7. Run more I/Os with dd command
    8. Run heal info command and check that it completes successfully
       under a timeout that is based on the no. of heals in step 4.
    """
    self.list_of_io_processes = []
    self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                          "linuxuntar")
    ret = mkdir(self.clients[0], self.linux_untar_dir)
    self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

    # Start linux untar on dir linuxuntar; returns the async proc(s)
    ret = run_linux_untar(self.clients[0], self.mounts[0].mountpoint,
                          dirs=tuple(['linuxuntar']))
    self.list_of_io_processes += ret
    self.is_io_running = True

    # Kill brick resulting in heal backlog.
    brick_to_bring_offline = random.choice(self.bricks_list)
    ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % brick_to_bring_offline)
    ret = are_bricks_offline(self.mnode, self.volname,
                             [brick_to_bring_offline])
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % brick_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               brick_to_bring_offline)

    # Let the untar finish so the pending-heal count is stable;
    # helper returns falsy on success
    ret = self._wait_for_untar_completion()
    self.assertFalse(ret, "IO didn't complete or failed on client")
    self.is_io_running = False

    # Get approx. no. of entries to be healed.
    # Sums the per-brick heal counts and halves them (each entry is
    # reported by both replicas); stored for the timeout helper below.
    cmd = ("gluster volume heal %s statistics heal-count | grep Number "
           "| awk '{sum+=$4} END {print sum/2}'" % self.volname)
    ret, self.num_entries, _ = g.run(self.mnode, cmd)
    self.assertEqual(ret, 0, "Failed to get heal-count statistics")

    # Restart the down bricks
    ret = bring_bricks_online(self.mnode, self.volname,
                              [brick_to_bring_offline])
    self.assertTrue(ret, 'Failed to bring brick %s online'
                    % brick_to_bring_offline)
    g.log.info('Bringing brick %s online is successful',
               brick_to_bring_offline)

    # Trigger heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    # Run more I/O concurrently with the heal; deliberately not waited
    # on -- the point is to run heal info while I/O is in flight
    cmd = ("for i in `seq 1 10`; do dd if=/dev/urandom of=%s/file_$i "
           "bs=1M count=100; done" % self.mounts[0].mountpoint)
    ret = g.run_async(self.mounts[0].client_system, cmd,
                      user=self.mounts[0].user)

    # Get heal info; helper times out based on self.num_entries
    ret = self._does_heal_info_complete_within_timeout()
    self.assertTrue(ret, 'Heal info timed out')
    g.log.info('Heal info completed succesfully')
def test_create_snap_bricks(self):
    """
    1. get brick list
    2. check all bricks are online
    3. Selecting one brick randomly to bring it offline
    4. get brick list
    5. check all bricks are online
    6. Offline Bricks list
    7. Online Bricks list
    8. Create snapshot of volume
    9. snapshot create should fail
    """
    # get the bricks from the volume
    g.log.info("Fetching bricks for the volume : %s" % self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s" % bricks_list)

    # check all bricks are online
    g.log.info("Verifying all bricks are online or not.....")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "Not all bricks are online")
    g.log.info("All bricks are online.")

    # Selecting one brick randomly to bring it offline
    g.log.info("Selecting one brick randomly to bring it offline")
    brick_to_bring_offline = random.choice(bricks_list)
    g.log.info("Brick to bring offline:%s " % brick_to_bring_offline)
    ret = bring_bricks_offline(self.volname, brick_to_bring_offline,
                               None)
    self.assertTrue(ret, "Failed to bring the bricks offline")
    g.log.info("Randomly Selected brick: %s" % brick_to_bring_offline)

    # get brick list again, after one brick went down
    g.log.info("Fetching bricks for the volume : %s" % self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s" % bricks_list)

    # With one brick down the full brick list must NOT be all-online.
    # FIX: the original failure message ("Not all bricks are online") and
    # success log ("All bricks are online.") were copy-pasted from the
    # all-online check above and read backwards for this assertFalse;
    # both are corrected here.
    g.log.info("Verifying that not all bricks are online.....")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertFalse(ret, "All bricks are still online after bringing "
                     "one brick offline")
    g.log.info("Not all bricks are online, as expected.")

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s" % self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s" % bricks_list)

    # Offline Bricks list
    offbricks = get_offline_bricks_list(self.mnode, self.volname)
    g.log.info("Bricks Offline: %s" % offbricks)

    # Online Bricks list
    onbricks = get_online_bricks_list(self.mnode, self.volname)
    g.log.info("Bricks Online: %s" % onbricks)

    # Create snapshot of volume
    # NOTE(review): the docstring (step 9) says snapshot creation should
    # fail while a brick is down, but the assertion below expects it to
    # succeed; snap_create()'s return-value semantics should also be
    # confirmed -- TODO verify the intended behaviour before tightening.
    ret = snap_create(self.mnode, self.volname, "snap1", False,
                      "Description with $p3c1al characters!")
    self.assertTrue(ret, "Failed to create snapshot snap1")
    g.log.info("Snapshot snap1 of volume %s created Successfully"
               % self.volname)

    # Volume status
    ret = get_volume_info(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to perform gluster volume"
                    "info on volume %s" % self.volname)
    g.log.info("Gluster volume info on volume %s is successful"
               % self.volname)

    # snapshot list
    ret = snap_list(self.mnode)
    self.assertTrue(
        ret, "Failed to list snapshot of volume %s" % self.volname)
    g.log.info("Snapshot list command for volume %s was successful"
               % self.volname)
def test_entry_heal_with_quota(self):
    """
    - Create a 1x3 volume
    - Set quota object limit
    - Create files less than the limit
    - Bring down a brick and create more files until limit is hit
    - Delete one file so that we are below the limit, and create one
      more file
    - Bring the brick back up and launch heal
    - Verify that after heal is complete, the deleted file does not
      re-appear in any of the bricks.
    """
    # pylint: disable=too-many-statements
    m_point = self.mounts[0].mountpoint

    # Turn quota on for the volume.
    g.log.info("Enabling quota on the volume %s", self.volname)
    ret, _, _ = quota_enable(self.mnode, self.volname)
    self.assertEqual(
        ret, 0, ("Failed to enable quota on the volume %s", self.volname))
    g.log.info("Successfully enabled quota on the volume %s",
               self.volname)

    # Confirm quota actually took effect.
    g.log.info("Validate Quota is enabled on the volume %s", self.volname)
    ret = is_quota_enabled(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Quota is not enabled on the volume %s", self.volname))
    g.log.info("Successfully Validated quota is enabled on volume %s",
               self.volname)

    # Make quota enforcement immediate (no soft/hard timeouts).
    options = {"quota-deem-statfs": "on",
               "soft-timeout": "0",
               "hard-timeout": "0"}
    g.log.info("setting quota volume options %s", options)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Directory that will carry the object limit.
    ret = mkdir(self.mounts[0].client_system, "%s/dir" % m_point)
    self.assertTrue(ret, "mkdir failed")

    # Limit /dir to 10 objects.
    path = "/dir"
    g.log.info("Setting Quota Limit object on the path %s of the "
               "volume %s", path, self.volname)
    ret, _, _ = quota_limit_objects(self.mnode, self.volname, path=path,
                                    limit="10")
    self.assertEqual(ret, 0, ("Failed to set quota limit object "
                              "on path %s of the volume %s", path,
                              self.volname))
    g.log.info("Successfully set the Quota limit object on %s of the "
               "volume %s", path, self.volname)

    # Five files while every brick is up.
    ret, _, _ = g.run(self.clients[0],
                      "touch %s/dir/file{1..5}" % m_point)
    self.assertEqual(ret, 0, "file creation failed")

    # Take the third brick down.
    bricks_list = get_all_bricks(self.mnode, self.volname)
    third_brick = bricks_list[2]
    g.log.info('Bringing brick %s offline', third_brick)
    ret = bring_bricks_offline(self.volname, third_brick)
    self.assertTrue(ret, 'Failed to bring brick %s offline' % third_brick)
    ret = are_bricks_offline(self.mnode, self.volname, [third_brick])
    self.assertTrue(ret, 'Brick %s is not offline' % third_brick)
    g.log.info('Bringing brick %s offline was successful', third_brick)

    # Fill up to the object limit while the brick is down.
    ret, _, _ = g.run(self.clients[0],
                      "touch %s/dir/file{6..9}" % m_point)
    self.assertEqual(ret, 0, "file creation failed")

    # One more create must be rejected by the quota object limit.
    ret, _, _ = g.run(self.clients[0], "touch %s/dir/file10" % m_point)
    self.assertEqual(
        ret, 1, ("Creation of %s/dir/file10 succeeded while "
                 "it was not supposed to." % m_point))
    g.log.info("Creation of %s/dir/file10 failed as expected due to "
               "quota object limit.", m_point)

    # Drop below the limit by deleting file1; file10 must now succeed.
    ret, _, _ = g.run(self.clients[0], "rm %s/dir/file1" % m_point)
    self.assertEqual(ret, 0, "File deletion failed")
    ret, _, _ = g.run(self.clients[0], "touch %s/dir/file10" % m_point)
    self.assertEqual(ret, 0, "File creation failed")

    # Bring the brick back and verify the whole volume is up.
    g.log.info('Bringing brick %s online...', third_brick)
    ret = bring_bricks_online(self.mnode, self.volname, [third_brick])
    self.assertTrue(ret, 'Failed to bring brick %s online' % third_brick)
    g.log.info('Bringing brick %s online is successful', third_brick)

    g.log.info("Verifying if brick3 is online....")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "brick3 did not come up")
    g.log.info("brick3 has come online.")

    # Launch the heal and wait for it to finish.
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # stat must return 0 for file10 on every brick, i.e. the
    # successfully-created file10 was healed everywhere.
    # NOTE(review): the docstring talks about the *deleted* file (file1)
    # not re-appearing, while this loop stats file10 and its failure
    # message reads 'File present!' -- confirm the intended check.
    for brick in bricks_list:
        node, brick_path = brick.split(':')
        ret, _, _ = g.run(node, 'stat %s/dir/file10' % brick_path)
        self.assertFalse(ret, 'File present!')
def test_add_identical_brick(self):
    """
    In this test case:
    1. Create Dist Volume on Node 1
    2. Down brick on Node 1
    3. Peer Probe N2 from N1
    4. Add identical brick on newly added node
    5. Check volume status
    """
    # pylint: disable=too-many-statements
    # Create a single-brick distributed volume on Node1 only.
    number_of_brick = 1
    servers_info_from_single_node = {
        self.servers[0]: self.all_servers_info[self.servers[0]]
    }
    self.volname = "testvol"
    bricks_list = form_bricks_list(self.servers[0], self.volname,
                                   number_of_brick, self.servers[0],
                                   servers_info_from_single_node)
    ret, _, _ = volume_create(self.servers[0], self.volname,
                              bricks_list, force=False)
    self.assertEqual(ret, 0, "Volume create failed")
    g.log.info("Volume %s created successfully", self.volname)

    ret, _, _ = volume_start(self.servers[0], self.volname, True)
    self.assertEqual(ret, 0, ("Failed to start the "
                              "volume %s", self.volname))

    g.log.info("Get all the bricks of the volume")
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the brick list")
    g.log.info("Successfully got the list of bricks of volume")

    # Bring the only brick down before probing the second node.
    ret = bring_bricks_offline(self.volname, bricks_list[0])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    ret, _, _ = peer_probe(self.servers[0], self.servers[1])
    self.assertEqual(ret, 0, ("peer probe from %s to %s is failed",
                              self.servers[0], self.servers[1]))
    g.log.info("peer probe is success from %s to "
               "%s", self.servers[0], self.servers[1])

    # wait for some time before add-brick
    time.sleep(2)

    # Replace just the host part to create an identical brick path on
    # the newly probed node.
    # FIX: the original called string.replace(s, old, new) -- the
    # string-module function was removed in Python 3; use the str method.
    add_bricks = [bricks_list[0].replace(self.servers[0],
                                         self.servers[1])]
    ret, _, _ = add_brick(self.mnode, self.volname, add_bricks)
    self.assertEqual(ret, 0, "Failed to add the bricks to the volume")
    g.log.info("Successfully added bricks to volume %s", add_bricks[0])

    # Restart everything so both bricks are up, then check status.
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Volume start with force failed")

    vol_status = get_volume_status(self.mnode, self.volname)
    self.assertIsNotNone(
        vol_status, "Failed to get volume "
        "status for %s" % self.volname)
def test_heal_client_io_hang(self):
    """Verify client-side heal via append/read IO while the server-side
    heal daemon is disabled, after a brick goes down and comes back."""
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system

    # disable server side heal
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to disable server side heal")
    g.log.info("Successfully disabled server side heal")

    # Log Volume Info and Status after disabling client side heal
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

    # Create files while every brick is up.
    createcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                 "do touch file$i; done" % mountpoint)
    ret, _, err = g.run(client, createcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished creating files while all the bricks are UP')

    # Take the first brick down.
    ret = bring_bricks_offline(self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring down the bricks")
    g.log.info("Successfully brought the bricks down")

    # Write to the files while the brick is offline.
    writecmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                "do dd if=/dev/urandom of=file$i bs=1M "
                "count=5;done" % mountpoint)
    ret, _, err = g.run(client, writecmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished writing on files while a brick is DOWN')

    # Restore the brick and verify the whole volume is up.
    ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
    self.assertTrue(ret, "Failed to bring up the bricks")
    g.log.info("Successfully brought the bricks up")
    ret = are_bricks_online(self.mnode, self.volname, bricks_list)
    self.assertTrue(ret, "All bricks are not online")

    # Start client side heal by reading/writing files: append to every
    # file, then read every file back.
    appendcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                 "do dd if=/dev/urandom of=file$i bs=1M "
                 "count=1 oflag=append conv=notrunc;done" % mountpoint)
    readcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do dd if=file$i of=/dev/zero bs=1M "
               "count=5;done" % mountpoint)
    ret, _, err = g.run(client, appendcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished append on files after bringing bricks online')
    ret, _, err = g.run(client, readcmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished read on files after bringing bricks online')

    # check the heal info and completion
    ec_check_heal_comp(self)

    # Log Volume Info and Status after bringing the brick up
    g.log.info("Logging volume info and status")
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s", self.volname))
    g.log.info(
        "Successful in logging volume info and status "
        "of volume %s", self.volname)
def test_heal_info_should_have_fixed_fields(self):
    """
    - Create IO
    - While IO is creating - bring down a couple of bricks
    - Wait for IO to complete
    - Bring up the down bricks
    - Wait for heal to complete
    - Check for fields 'Brick', 'Status', 'Number of entries' in heal info
    """
    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -f 50 %s" % (self.script_upload_path,
                                           mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get heal info
    g.log.info('Getting heal info...')
    heal_info_dicts = get_heal_info_summary(self.mnode, self.volname)
    # FIX: the original asserted on the stale `ret` left over from the
    # split-brain check above instead of on the heal-info result itself.
    self.assertIsNotNone(heal_info_dicts, 'Failed to get heal info')
    g.log.info(heal_info_dicts)

    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Brick list is None')

    # Check all fields in heal info dict
    g.log.info('Checking for all the fields in heal info...')
    for brick in bricks_list:
        g.log.info('Checking fields for %s', brick)
        self.assertEqual(heal_info_dicts[brick]['status'], 'Connected',
                         'Status is not Connected for brick %s' % brick)
        self.assertEqual(heal_info_dicts[brick]['numberOfEntries'], '0',
                         'numberOfEntries is not 0 for brick %s' % brick)
    g.log.info('Successfully checked for all the fields in heal info')
def test_multiple_clients_dd_on_same_file_default(self):
    """
    - Create 2GB file
    - While creating file, start reading file
    - Bring down brick1
    - Bring back the brick brick1
    - Start healing
    - Bring down brick1
    - Wait for IO to complete
    - Wait for reading to complete
    - Bring back the brick brick1
    - Start healing
    - Wait for heal to complete
    - Check for split-brain
    - Calculate arequals on all the bricks and compare with mountpoint
    """
    # pylint: disable=too-many-statements,too-many-locals
    all_bricks = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(all_bricks, 'Brick list is None')

    # Kick off a ~2GB dd write on every mount.
    for mount in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount.client_system, mount.mountpoint)
        g.log.info('Creating files...')
        cmd = ("cd %s ; "
               "dd if=/dev/urandom of=test_file bs=1M count=2020"
               % mount.mountpoint)
        writer = g.run_async(mount.client_system, cmd, user=mount.user)
        self.all_mounts_procs.append(writer)
    self.io_validation_complete = False

    # Start readers on the same file in parallel with the writers.
    readers = []
    for mount in self.mounts:
        g.log.info("Reading data for %s:%s",
                   mount.client_system, mount.mountpoint)
        g.log.info('Reading files...')
        cmd = ("python %s read %s" % (self.script_upload_path,
                                      mount.mountpoint))
        readers.append(g.run_async(mount.client_system, cmd,
                                   user=mount.user))

    target = all_bricks[1]

    # First outage of brick1 while IO is running.
    g.log.info('Bringing bricks %s offline...', target)
    ret = bring_bricks_offline(self.volname, [target])
    self.assertTrue(ret, 'Failed to bring bricks %s offline' % target)
    ret = are_bricks_offline(self.mnode, self.volname, [target])
    self.assertTrue(ret, 'Bricks %s are not offline' % target)
    g.log.info('Bringing bricks %s offline is successful', target)

    # Restore brick1 and launch a heal.
    g.log.info('Bringing bricks %s online...', target)
    ret = bring_bricks_online(self.mnode, self.volname, [target])
    self.assertTrue(ret, 'Failed to bring bricks %s online' % target)
    g.log.info('Bringing bricks %s online is successful', target)

    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Second outage of brick1 while heal and IO are in flight.
    g.log.info('Bringing bricks %s offline...', target)
    ret = bring_bricks_offline(self.volname, [target])
    self.assertTrue(ret, 'Failed to bring bricks %s offline' % target)
    ret = are_bricks_offline(self.mnode, self.volname, [target])
    self.assertTrue(ret, 'Bricks %s are not offline' % target)
    g.log.info('Bringing bricks %s offline is successful', target)

    # Wait for the writers, then the readers.
    self.assertTrue(
        validate_io_procs(self.all_mounts_procs, self.mounts),
        "IO failed on some of the clients"
    )
    self.assertTrue(
        validate_io_procs(readers, self.mounts),
        "Reading failed on some of the clients"
    )
    self.io_validation_complete = True

    # Restore brick1 again and heal to completion this time.
    g.log.info('Bringing bricks %s online...', target)
    ret = bring_bricks_online(self.mnode, self.volname, [target])
    self.assertTrue(ret, 'Failed to bring bricks %s online' % target)
    g.log.info('Bringing bricks %s online is successful', target)

    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Compare the mountpoint arequal against every brick's arequal.
    g.log.info('Getting arequal...')
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after healing is successful')
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    g.log.info('Getting arequal on bricks...')
    arequals_after_heal = {}
    for brick in all_bricks:
        g.log.info('Getting arequal on bricks %s...', brick)
        node, brick_path = brick.split(':')
        cmd = ('arequal-checksum -p %s '
               '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
        ret, arequal, _ = g.run(node, cmd)
        self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        brick_total = arequal.splitlines()[-1].split(':')[-1]
        arequals_after_heal[brick] = brick_total
        self.assertEqual(
            mount_point_total, brick_total,
            'Arequals for mountpoint and %s are not equal' % brick)
        g.log.info('Arequals for mountpoint and %s are equal', brick)
    g.log.info('All arequals are equal')
def test_rebalance_with_brick_down(self):
    """
    Rebalance with brick down in replica
    - Create a Replica volume.
    - Bring down one of the brick down in the replica pair
    - Do some IO and create files on the mount point
    - Add a pair of bricks to the volume
    - Initiate rebalance
    - Bring back the brick which was down.
    - After self heal happens, all the files should be present.
    """
    # Log the volume info and status before brick is down.
    log_volume_info_and_status(self.mnode, self.volname)

    # Bring one of the bricks offline
    brick_list = get_all_bricks(self.mnode, self.volname)
    ret = bring_bricks_offline(self.volname, choice(brick_list))
    # FIX: the original dropped this return value; a failure to take the
    # brick down would silently invalidate the remainder of the test.
    self.assertTrue(ret, "Failed to bring brick offline")

    # Log the volume info and status after brick is down.
    log_volume_info_and_status(self.mnode, self.volname)

    # Create 2000 fixed-size files at the mountpoint.
    cmd = (
        "/usr/bin/env python %s create_files "
        "-f 2000 --fixed-file-size 1k --base-file-name file %s"
        % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(
        self.mounts[0].client_system, cmd, user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)

    # Wait for IO to complete.
    # NOTE(review): a single mount object is passed here where sibling
    # tests pass a list -- confirm wait_for_io_to_complete accepts both.
    self.assertTrue(wait_for_io_to_complete(self.all_mounts_procs,
                                            self.mounts[0]),
                    "IO failed on some of the clients")
    g.log.info("IO completed on the clients")

    # Compute the arequal checksum before bringing all bricks online
    arequal_before_all_bricks_online = collect_mounts_arequal(self.mounts)

    # Log the volume info and status before expanding volume.
    log_volume_info_and_status(self.mnode, self.volname)

    # Expand the volume.
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to expand the volume %s", self.volname))
    g.log.info("Expanding volume is successful on "
               "volume %s", self.volname)

    # Log the volume info after expanding volume.
    log_volume_info_and_status(self.mnode, self.volname)

    # Start Rebalance.
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                              "%s", self.volname))
    g.log.info("Successfully started rebalance on the volume %s",
               self.volname)

    # Wait for rebalance to complete
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                          "%s", self.volname))
    g.log.info("Rebalance is successfully complete on the volume %s",
               self.volname)

    # Log the volume info and status before bringing all bricks online
    log_volume_info_and_status(self.mnode, self.volname)

    # Bring all bricks online with `volume start force`.
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Not able to start volume with force option")
    g.log.info("Volume start with force option successful.")

    # Log the volume info and status after bringing all bricks online
    log_volume_info_and_status(self.mnode, self.volname)

    # Monitor heal completion.
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, "heal has not yet completed")
    g.log.info("Self heal completed")

    # Compute the arequal checksum after all bricks online.
    arequal_after_all_bricks_online = collect_mounts_arequal(self.mounts)

    # Both snapshots must be identical: no data was lost.
    self.assertEqual(arequal_before_all_bricks_online,
                     arequal_after_all_bricks_online,
                     "arequal checksum is NOT MATCHING")
    g.log.info("arequal checksum is SAME")
def test_client_side_quorum_with_fixed_for_cross3(self): """ Test Script to verify the Client Side Quorum with fixed for cross 3 volume * Disable self heal daemom * set cluster.quorum-type to fixed. * start I/O( write and read )from the mount point - must succeed * Bring down brick1 * start I/0 ( write and read ) - must succeed * Bring down brick2 * start I/0 ( write and read ) - must succeed * set the cluster.quorum-count to 1 * start I/0 ( write and read ) - must succeed * set the cluster.quorum-count to 2 * start I/0 ( write and read ) - read and write will fail * bring back the brick1 online * start I/0 ( write and read ) - must succeed * Bring back brick2 online * start I/0 ( write and read ) - must succeed * set cluster.quorum-type to auto * start I/0 ( write and read ) - must succeed * Bring down brick1 and brick2 * start I/0 ( write and read ) - read and write will fail * set the cluster.quorum-count to 1 * start I/0 ( write and read ) - read and write will fail * set the cluster.quorum-count to 3 * start I/0 ( write and read ) - read and write will fail * set the quorum-type to none * start I/0 ( write and read ) - must succeed """ # pylint: disable=too-many-locals,too-many-statements,too-many-branches # Disable self heal daemon options = {"cluster.self-heal-daemon": "off"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set %s for volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # set cluster.quorum-type to fixed options = {"cluster.quorum-type": "fixed"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set %s for volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/O( write ) - must succeed all_mounts_procs = [] 
g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name file %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on some of the clients") # get the subvolumes g.log.info("Starting to get sub-volumes for volume %s", self.volname) subvols_dict = get_subvols(self.mnode, self.volname) num_subvols = len(subvols_dict['volume_subvols']) g.log.info("Number of subvolumes in volume %s:", num_subvols) # bring down brick1 for all the subvolumes offline_brick1_from_replicasets = [] for i in range(0, num_subvols): subvol_brick_list = subvols_dict['volume_subvols'][i] g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list) brick_to_bring_offline1 = subvol_brick_list[0] g.log.info("Going to bring down the brick process " "for %s", brick_to_bring_offline1) ret = bring_bricks_offline(self.volname, brick_to_bring_offline1) self.assertTrue(ret, ("Failed to bring down the bricks. 
Please " "check the log file for more details.")) g.log.info("Brought down the brick process " "for %s successfully", brick_to_bring_offline1) offline_brick1_from_replicasets.append(brick_to_bring_offline1) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name testfile %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint) # bring down brick2 for all the subvolumes offline_brick2_from_replicasets = [] for i in range(0, num_subvols): subvol_brick_list = subvols_dict['volume_subvols'][i] g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list) brick_to_bring_offline2 = subvol_brick_list[1] g.log.info("Going to bring down the brick process " "for %s", brick_to_bring_offline2) ret = bring_bricks_offline(self.volname, brick_to_bring_offline2) self.assertTrue(ret, ("Failed to bring down the bricks. 
Please " "check the log file for more details.")) g.log.info("Brought down the brick process " "for %s successfully", brick_to_bring_offline2) offline_brick2_from_replicasets.append(brick_to_bring_offline2) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name newfile %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint) # set the cluster.quorum-count to 1 options = {"cluster.quorum-count": "1"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue( ret, "Unable to set %s for volume %s" % (options, self.volname)) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name filename %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( 
validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint) # set the cluster.quorum-count to 2 options = {"cluster.quorum-count": "2"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set %s for volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/0 ( write and read ) - read and write will fail g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("dd if=/dev/urandom of=%s/test_file bs=1M count=1" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected Error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while creating file") # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " 
"Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while reading file") # bring back the brick1 online for all subvolumes g.log.info("bringing up the brick : %s online", offline_brick1_from_replicasets) ret = bring_bricks_online( self.mnode, self.volname, offline_brick1_from_replicasets, bring_bricks_online_methods='glusterd_restart') self.assertTrue(ret, ("Failed to brought the brick %s online" % offline_brick1_from_replicasets)) g.log.info("Successfully brought the brick %s online", offline_brick1_from_replicasets) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name newfilename %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint) # Bring back brick2 online g.log.info("bringing up the brick : %s online", offline_brick2_from_replicasets) ret = bring_bricks_online( self.mnode, self.volname, offline_brick2_from_replicasets, bring_bricks_online_methods='glusterd_restart') 
self.assertTrue(ret, ("Failed to brought the brick %s online" % offline_brick2_from_replicasets)) g.log.info("Successfully brought the brick %s online", offline_brick2_from_replicasets) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name textfile %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint) # set cluster.quorum-type to auto options = {"cluster.quorum-type": "auto"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue(ret, ("Unable to set %s for volume %s" % (options, self.volname))) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name newtextfile %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, 
self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint) # bring down brick1 and brick2 for all the subvolumes for i in range(0, num_subvols): subvol_brick_list = subvols_dict['volume_subvols'][i] g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list) bricks_to_bring_offline = subvol_brick_list[0:2] g.log.info("Going to bring down the brick process for %s", bricks_to_bring_offline) ret = bring_bricks_offline(self.volname, bricks_to_bring_offline) self.assertTrue( ret, "Failed to bring down the bricks. Please " "check the log file for more details.") g.log.info("Brought down the brick process " "for %s successfully", bricks_to_bring_offline) # start I/0 ( write and read ) - read and write will fail all_mounts_procs = [] g.log.info("Start creating file on mountpoint %s", self.mounts[0].mountpoint) cmd = ("dd if=/dev/urandom of=%s/new_test_file bs=1M count=1" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while creating files") # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] 
g.log.info("Starting reading file") cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while reading file") # set the cluster.quorum-count to 1 options = {"cluster.quorum-count": "1"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue( ret, "Unable to set %s for volume %s" % (options, self.volname)) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/0 ( write and read ) - read and write will fail g.log.info("Start creating files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("dd if=/dev/urandom of=%s/new_test_file bs=1M count=1" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while creating files") # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO 
failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while reading file") # set the cluster.quorum-count to 3 options = {"cluster.quorum-count": "3"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue( ret, "Unable to set %s for volume %s" % (options, self.volname)) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/0 ( write and read ) - read and write will fail g.log.info("Start creating files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("dd if=/dev/urandom of=%s/new_test_file bs=1M count=1" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) g.log.info("EXPECTED: Transport endpoint is not connected" " while creating files") # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("cat %s/file1.txt" % self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO g.log.info("Validating whether IO failed with " "Transport endpoint is not connected") ret, _ = is_io_procs_fail_with_error(self, all_mounts_procs, self.mounts, self.mount_type) self.assertTrue(ret, ("Unexpected error and IO successful" " on not connected transport endpoint")) 
g.log.info("EXPECTED: Transport endpoint is not connected" " while reading file") # set the quorum-type to none options = {"cluster.quorum-type": "none"} g.log.info("setting %s for the volume %s", options, self.volname) ret = set_volume_options(self.mnode, self.volname, options) self.assertTrue( ret, "Unable to set %s for volume %s" % (options, self.volname)) g.log.info("Successfully set %s for volume %s", options, self.volname) # start I/0 ( write and read ) - must succeed g.log.info("Starting IO on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = ("/usr/bin/env python %s create_files " "-f 10 --base-file-name lastfile %s" % (self.script_upload_path, self.mounts[0].mountpoint)) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "IO failed on mountpoint %s" % self.mounts[0].mountpoint) # read the file g.log.info("Start reading files on mountpoint %s", self.mounts[0].mountpoint) all_mounts_procs = [] cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path, self.mounts[0].mountpoint) proc = g.run_async(self.mounts[0].client_system, cmd, user=self.mounts[0].user) all_mounts_procs.append(proc) # Validate IO self.assertTrue( validate_io_procs(all_mounts_procs, self.mounts), "Reads failed on mountpoint %s" % self.mounts[0].mountpoint)
def test_shd_should_not_crash_executed_heal_info(self):
    """Check the self-heal daemon behaviour around `heal info` queries.

    - set "entry-self-heal", "metadata-self-heal", "data-self-heal" to off
    - write a few files
    - bring down brick0
    - add IO
    - do a heal info and check for files pending heal on last 2 bricks
    - set "performance.enable-least-priority" to "enable"
    - bring down brick1
    - set the "quorum-type" to "fixed"
    - add IO
    - do a heal info and check for files pending heal on the last brick
    """
    # pylint: disable=too-many-statements
    bricks_list = get_all_bricks(self.mnode, self.volname)

    # Disable all client-side self-heal so pending-heal entries
    # accumulate and can be counted deterministically via `heal info`.
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files on client side
    # NOTE(review): appends to self.all_mounts_procs without resetting it
    # first (later phases do `self.all_mounts_procs = []`) — assumes it
    # was initialised empty in setUp; verify.
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files -f 10 "
                   "--fixed-file-size 1M %s" % (self.script_upload_path,
                                                mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Bring brick0 offline
    g.log.info('Bringing bricks %s offline', bricks_list[0])
    ret = bring_bricks_offline(self.volname, bricks_list[0])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[0])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[0]])
    self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[0])
    g.log.info('Bringing bricks %s offline is successful', bricks_list[0])

    # Create files while brick0 is down so they become pending-heal
    # entries on the remaining bricks.
    number_of_files_one_brick_off = '1000'
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files "
                   "-f %s "
                   "--fixed-file-size 1k "
                   "--base-file-name new_file "
                   "%s" % (self.script_upload_path,
                           number_of_files_one_brick_off,
                           mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get heal info
    g.log.info("Getting heal info...")
    heal_info_data = get_heal_info_summary(self.mnode, self.volname)
    self.assertIsNotNone(heal_info_data, 'Failed to get heal info.')
    g.log.info('Success in getting heal info')

    # Check quantity of file pending heal on each surviving brick.
    # Expected count is files created + 1 — presumably the extra entry is
    # the parent directory itself; TODO confirm.
    for brick in bricks_list[1:]:
        self.assertEqual(heal_info_data[brick]['numberOfEntries'],
                         str(int(number_of_files_one_brick_off) + 1),
                         'Number of files pending heal is not correct')

    # Setting options
    g.log.info('Setting options...')
    options = {"performance.enable-least-priority": "enable"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Bring brick1 offline
    g.log.info('Bringing bricks %s offline', bricks_list[1])
    ret = bring_bricks_offline(self.volname, bricks_list[1])
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % bricks_list[1])

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[1]])
    self.assertTrue(ret, 'Bricks %s are not offline' % bricks_list[1])
    g.log.info('Bringing bricks %s offline is successful', bricks_list[1])

    # quorum-type "fixed" keeps the volume writable with a second brick
    # down so more IO can be generated below.
    g.log.info('Setting options...')
    options = {"quorum-type": "fixed"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options %s' % options)
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Creating files on client side
    number_of_files_two_brick_off = '100'
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create files
        g.log.info('Creating files...')
        command = ("/usr/bin/env python %s create_files "
                   "-f %s "
                   "--fixed-file-size 1k "
                   "--base-file-name new_new_file "
                   "%s" % (self.script_upload_path,
                           number_of_files_two_brick_off,
                           mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # Get heal info
    g.log.info("Getting heal info...")
    heal_info_data = get_heal_info_summary(self.mnode, self.volname)
    self.assertIsNotNone(heal_info_data, 'Failed to get heal info.')
    g.log.info('Success in getting heal info')

    # Check quantity of file pending heal on the last (only online)
    # brick: everything written while brick0 was down plus everything
    # written while both bricks were down (+1, see note above).
    number_of_files_to_check = str(
        int(number_of_files_one_brick_off) +
        int(number_of_files_two_brick_off) + 1)
    self.assertEqual(heal_info_data[bricks_list[-1]]['numberOfEntries'],
                     number_of_files_to_check,
                     'Number of files pending heal is not correct')
def test_ec_lookup_and_move_operations_few_bricks_are_offline(self):
    """
    Test Steps:
    1. Mount this volume on 3 mount point, c1, c2, and c3
    2. Bring down two bricks offline in each subvol.
    3. On client1: under dir1 create files f{1..10000} run in background
    4. On client2: under root dir of mountpoint touch x{1..1000}
    5. On client3: after step 4 action completed, start creating
       x{1001..10000}
    6. Bring bricks online which were offline(brought up all the bricks
       which were down (2 in each of the two subvols)
    7. While IO on Client1 and Client3 were happening, On client2 move
       all the x* files into dir1
    8. Perform lookup from client 3
    """
    # List two bricks in each subvol
    all_subvols_dict = get_subvols(self.mnode, self.volname)
    subvols = all_subvols_dict['volume_subvols']
    bricks_to_bring_offline = []
    for subvol in subvols:
        self.assertTrue(subvol, "List is empty")
        # Pick 2 random bricks per subvol to take down (assumes the EC
        # volume tolerates the loss of 2 bricks per subvol — TODO
        # confirm against the volume's redundancy count).
        bricks_to_bring_offline.extend(sample(subvol, 2))

    # Bring two bricks of each subvol offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, "Bricks are still online")
    g.log.info("Bricks are offline %s", bricks_to_bring_offline)

    # Validating the bricks are offline or not
    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, "Few of the bricks are still online in"
                         " {} in".format(bricks_to_bring_offline))
    g.log.info("%s bricks are offline as expected",
               bricks_to_bring_offline)

    # Create directory on client1
    dir_on_mount = self.mounts[0].mountpoint + '/dir1'
    ret = mkdir(self.mounts[0].client_system, dir_on_mount)
    self.assertTrue(ret, "unable to create directory on client"
                         " 1 {}".format(self.mounts[0].client_system))
    g.log.info("Dir1 created on %s successfully",
               self.mounts[0].client_system)

    # Next IO to be ran in the background so using mount_procs
    # and run_async.
    self.mount_procs = []

    # On client1: under dir1 create files f{1..10000} run in background
    self._run_create_files(file_count=10000, base_name="f_",
                           mpoint=dir_on_mount,
                           client=self.mounts[0].client_system)

    # On client2: under root dir of the mountpoint touch x{1..1000}
    cmd = ("/usr/bin/env python {} create_files -f 1000 --fixed-file-size"
           " 10k --base-file-name x {}".format(self.script_upload_path,
                                               self.mounts[1].mountpoint))
    ret, _, err = g.run(self.mounts[1].client_system, cmd)
    self.assertEqual(ret, 0, "File creation failed on {} with {}".
                     format(self.mounts[1].client_system, err))
    g.log.info("File creation successful on %s",
               self.mounts[1].client_system)

    # On client3: start creating x{1001..10000}
    # NOTE(review): seq actually starts at 1000, not 1001 as the
    # docstring says. These names do not clash with client2's files
    # (those carry a .txt suffix, per the mv loop below), but confirm
    # the intended start index.
    cmd = ("cd {}; for i in `seq 1000 10000`; do touch x$i; done; "
           "cd -".format(self.mounts[2].mountpoint))
    proc = g.run_async(self.mounts[2].client_system, cmd)
    self.mount_procs.append(proc)

    # Bring bricks online with volume start force
    ret, _, err = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, err)
    g.log.info("Volume: %s started successfully", self.volname)

    # Check whether bricks are online or not
    ret = are_bricks_online(self.mnode, self.volname,
                            bricks_to_bring_offline)
    self.assertTrue(ret, "Bricks {} are still offline".
                    format(bricks_to_bring_offline))
    g.log.info("Bricks %s are online now", bricks_to_bring_offline)

    # From client2 move all the files with name starting with x into dir1
    cmd = ("for i in `seq 0 999`; do mv {}/x$i.txt {}; "
           "done".format(self.mounts[1].mountpoint, dir_on_mount))
    proc = g.run_async(self.mounts[1].client_system, cmd)
    self.mount_procs.append(proc)

    # Perform a lookup in loop from client3 for 20 iterations while the
    # mv and create operations are still running in the background.
    cmd = ("ls -R {}".format(self.mounts[2].mountpoint))
    counter = 20
    while counter:
        ret, _, err = g.run(self.mounts[2].client_system, cmd)
        self.assertEqual(ret, 0, "ls while mv operation being carried"
                                 " failed with {}".format(err))
        g.log.debug("ls successful for the %s time", 21-counter)
        counter -= 1

    self.assertTrue(validate_io_procs(self.mount_procs, self.mounts),
                    "IO failed on the clients")
    # Emptying mount_procs for not validating IO in tearDown
    self.mount_procs *= 0

    # Wait for heal to complete
    ret = monitor_heal_completion(self.mnode, self.volname,)
    self.assertTrue(ret, "Heal didn't completed in the expected time")
    g.log.info("Heal completed successfully on %s volume", self.volname)
def test_data_self_heal_algorithm_diff_heal_command(self):
    """
    Test Volume Option - 'cluster.data-self-heal-algorithm' : 'diff'

    Description:
    - set the volume option
      "metadata-self-heal": "off"
      "entry-self-heal": "off"
      "data-self-heal": "off"
      "data-self-heal-algorithm": "diff"
      "self-heal-daemon": "off"
    - create IO
    - calculate arequal
    - bring down all bricks processes from selected set
    - modify the data
    - get arequal before getting bricks online
    - bring bricks online
    - expand volume by adding bricks to the volume
    - do rebalance
    - set the volume option "self-heal-daemon": "on" and check for daemons
    - start healing
    - check if heal is completed
    - check for split-brain
    - calculate arequal and compare with arequal before bringing bricks
      offline and after bringing bricks online
    """
    # pylint: disable=too-many-branches,too-many-statements
    # Disable client-side heals and force the 'diff' heal algorithm so
    # only the explicitly triggered daemon heal repairs the data.
    g.log.info('Setting options...')
    options = {"metadata-self-heal": "off",
               "entry-self-heal": "off",
               "data-self-heal": "off",
               "data-self-heal-algorithm": "diff"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Options "
               "'metadata-self-heal', "
               "'entry-self-heal', "
               "'data-self-heal', "
               "'self-heal-daemon' "
               "are set to 'off',"
               "'data-self-heal-algorithm' "
               "is set to 'diff' successfully")

    # Creating files on client side
    all_mounts_procs = []
    g.log.info("Generating data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    # Creating files
    command = "/usr/bin/env python %s create_files -f 100 %s" % (
        self.script_upload_path,
        self.mounts[0].mountpoint)

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients")

    # Turn the self-heal daemon off before killing bricks so nothing
    # heals in the background before we trigger it deliberately.
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "off"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'off' successfully")

    # Select bricks to bring offline (filter drops empty tier lists)
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = list(filter(None, (
        bricks_to_bring_offline_dict['hot_tier_bricks'] +
        bricks_to_bring_offline_dict['cold_tier_bricks'] +
        bricks_to_bring_offline_dict['volume_bricks'])))

    # Bring brick offline
    g.log.info('Bringing bricks %s offline...', bricks_to_bring_offline)
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline' %
                    bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Overwrite the files while bricks are down so the offline bricks
    # fall behind and require healing.
    all_mounts_procs = []
    g.log.info("Modifying data for %s:%s",
               self.mounts[0].client_system, self.mounts[0].mountpoint)
    command = ("/usr/bin/env python %s create_files -f 100 "
               "--fixed-file-size 1M %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))

    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(
        validate_io_procs(all_mounts_procs, self.mounts),
        "IO failed on some of the clients")

    # Reference checksum taken while bricks are still down; the
    # post-heal checksum must match this exactly.
    g.log.info('Getting arequal before getting bricks online...')
    ret, result_before_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal before getting bricks online '
               'is successful')

    # Bring brick online
    g.log.info('Bringing bricks %s online...', bricks_to_bring_offline)
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s online' %
                    bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Expand volume by adding bricks to the volume
    g.log.info("Start adding bricks to volume...")
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, ("Failed to expand the volume when IO in "
                          "progress on volume %s", self.volname))
    g.log.info("Expanding volume is successful on volume %s",
               self.volname)

    # Do rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, 'Failed to start rebalance')
    g.log.info('Rebalance is started')

    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Rebalance is not completed')
    g.log.info('Rebalance is completed successfully')

    # Re-enable the self-heal daemon so healing can be triggered
    g.log.info('Setting options...')
    options = {"self-heal-daemon": "on"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, 'Failed to set options')
    g.log.info("Option 'self-heal-daemon' is set to 'on' successfully")

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal after getting bricks online
    g.log.info('Getting arequal after getting bricks online...')
    ret, result_after_online = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting arequal after getting bricks online '
               'is successful')

    # Checking arequals before bringing bricks offline
    # and after bringing bricks online.
    # assertEqual replaces the Python-2-only assertItemsEqual (removed
    # in Python 3); collect_mounts_arequal returns a per-mount ordered
    # list, so exact equality is the right comparison.
    self.assertEqual(result_before_online, result_after_online,
                     'Checksums are not equal')
    g.log.info('Checksums are equal')
def test_file_access(self):
    """Test file access via the DHT linkto file.

    Renames a file so its new name hashes to a different subvol (leaving
    a linkto file on the new hashed subvol and the data file on the old
    one), then verifies the file is reachable through a fresh mount with
    the hashed subvol down, and that access fails once the cached
    (data-holding) subvol is brought down.
    """
    # pylint: disable=protected-access
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements
    mount_obj = self.mounts[0]
    mountpoint = mount_obj.mountpoint

    # get subvol list
    subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
    self.assertIsNotNone(subvols, "failed to get subvols")

    # create a file
    srcfile = mountpoint + '/testfile'
    ret, _, err = g.run(self.clients[0], ("touch %s" % srcfile))
    self.assertEqual(ret, 0, ("File creation failed for %s err %s",
                              srcfile, err))
    g.log.info("testfile creation successful")

    # find hashed subvol
    srchashed, scount = find_hashed_subvol(subvols, "/", "testfile")
    self.assertIsNotNone(srchashed, "could not find srchashed")
    g.log.info("hashed subvol for srcfile %s subvol count %s",
               srchashed._host, str(scount))

    # rename the file such that the new name hashes to a new subvol
    tmp = find_new_hashed(subvols, "/", "testfile")
    self.assertIsNotNone(tmp, "could not find new hashed for dstfile")
    g.log.info("dst file name : %s dst hashed_subvol : %s "
               "subvol count : %s", tmp.newname,
               tmp.hashedbrickobject._host, str(tmp.subvol_count))

    dstname = str(tmp.newname)
    dstfile = mountpoint + "/" + dstname
    dsthashed = tmp.hashedbrickobject
    dcount = tmp.subvol_count
    ret, _, err = g.run(self.clients[0], ("mv %s %s" %
                                          (srcfile, dstfile)))
    self.assertEqual(ret, 0, ("rename failed for %s err %s",
                              srcfile, err))
    g.log.info("cmd: mv srcfile dstfile successful")

    # check that on dsthash_subvol the file is a linkto file
    # (mode "1000" — presumably the sticky-bit-only permission DHT uses
    # to mark linkto files; TODO confirm)
    filepath = dsthashed._fqpath + "/" + dstname
    file_stat = get_file_stat(dsthashed._host, filepath)
    self.assertEqual(file_stat['access'], "1000", ("Expected file "
                                                   "permission to be 1000"
                                                   " on subvol %s",
                                                   dsthashed._host))
    g.log.info("dsthash_subvol has the expected linkto file")

    # check on srchashed the file is a data file
    filepath = srchashed._fqpath + "/" + dstname
    file_stat = get_file_stat(srchashed._host, filepath)
    self.assertNotEqual(file_stat['access'], "1000", ("Expected file "
                                                      "permission not to"
                                                      "be 1000 on subvol"
                                                      "%s",
                                                      srchashed._host))

    # Bring down the hashed subvol of dstfile(linkto file)
    ret = bring_bricks_offline(self.volname, subvols[dcount])
    self.assertTrue(ret, ('Error in bringing down subvolume %s',
                          subvols[dcount]))
    g.log.info('dst subvol %s is offline', subvols[dcount])

    # Need to access the file through a fresh lookup through a new mount
    # create a new dir(choosing server to do a mount)
    ret, _, _ = g.run(self.mnode, ("mkdir -p /mnt"))
    self.assertEqual(ret, 0, ('mkdir of mount dir failed'))
    g.log.info("mkdir of mount dir succeeded")

    # do a temp mount
    ret = mount_volume(self.volname, self.mount_type, "/mnt",
                       self.mnode, self.mnode)
    self.assertTrue(ret, ('temporary mount failed'))
    g.log.info("temporary mount succeeded")

    # check that file is accessible (stat) even though the subvol
    # holding its linkto file is down — the data file is elsewhere
    ret, _, _ = g.run(self.mnode, ("stat /mnt/%s" % dstname))
    self.assertEqual(ret, 0, ('stat error on for dst file %s', dstname))
    g.log.info("stat on /mnt/%s successful", dstname)

    # cleanup temporary mount
    ret = umount_volume(self.mnode, "/mnt")
    self.assertTrue(ret, ('temporary mount failed'))
    g.log.info("umount successful")

    # Bring up the hashed subvol
    ret = bring_bricks_online(self.mnode, self.volname, subvols[dcount],
                              bring_bricks_online_methods=None)
    self.assertTrue(ret, "Error in bringing back subvol online")
    g.log.info('Subvol is back online')

    # now bring down the cached subvol (where the data file lives)
    ret = bring_bricks_offline(self.volname, subvols[scount])
    self.assertTrue(ret, ('Error in bringing down subvolume %s',
                          subvols[scount]))
    g.log.info('target subvol %s is offline', subvols[scount])

    # file access should fail since the data file is unreachable
    ret, _, _ = g.run(self.clients[0], ("stat %s" % dstfile))
    self.assertEqual(ret, 1, ('stat error on for file %s', dstfile))
    g.log.info("dstfile access failed as expected")