def test_mkdir_with_subvol_down(self):
    """Test mkdir of a directory whose hashed subvol is down.

    Steps:
    1. Find the hashed subvol for the name "parent" under "/" and bring
       its bricks offline.
    2. mkdir of "parent" must fail and the directory must not show up on
       any brick or on any client.
    3. Bring the subvol back online; mkdir of "parent" must now succeed.
    4. Find the hashed subvol for "child" under "parent", bring it
       offline, and check that mkdir of "parent/child" fails and leaves
       nothing behind on bricks or clients.
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements
    # pylint: disable=W0212
    mount_obj = self.mounts[0]
    mountpoint = mount_obj.mountpoint

    # directories that need to be created
    parent_dir = mountpoint + '/parent'
    child_dir = mountpoint + '/parent/child'

    def _bring_subvol_offline(bricks):
        """Bring the given bricks offline and assert that they are down."""
        bring_bricks_offline(self.volname, bricks)
        ret = are_bricks_offline(self.mnode, self.volname, bricks)
        self.assertTrue(
            ret, 'Error in bringing down subvolume %s' % (bricks,))
        g.log.info('target subvol is offline')

    def _assert_dir_absent(brick_dirs, relative_path, mount_path):
        """Assert the directory exists neither on bricks nor on clients."""
        for brickdir in brick_dirs:
            # The text after the first ":" of the brick path is used as
            # the path to look up on the brick's host.
            brick_path = (
                "%s/%s" % (brickdir.path, relative_path)).split(":")[1]
            self.assertTrue(
                file_exists(brickdir._host, brick_path) == 0,
                'Expected dir %s not to exist on servers' % mount_path)
        for client in self.clients:
            self.assertTrue(
                file_exists(client, mount_path) == 0,
                'Expected dir %s not to exist on clients' % mount_path)
        g.log.info('dir %s does not exist on mount as expected',
                   mount_path)

    # get hashed subvol for name "parent"
    subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
    hashed, count = find_hashed_subvol(subvols, "/", "parent")
    self.assertIsNotNone(hashed, "Could not find hashed subvol")

    # bring target_brick offline
    _bring_subvol_offline(subvols[count])

    # create parent dir: must fail because its hashed subvol is down
    ret, _, err = g.run(self.clients[0], ("mkdir %s" % parent_dir))
    self.assertNotEqual(
        ret, 0,
        'Expected mkdir of %s to fail with %s' % (parent_dir, err))
    g.log.info('mkdir of dir %s failed as expected', parent_dir)

    # check that parent_dir does not exist on any bricks and client
    brickobject = create_brickobjectlist(subvols, "/")
    _assert_dir_absent(brickobject, "parent", parent_dir)

    # Bring up the subvols and create parent directory
    bring_bricks_online(self.mnode, self.volname, subvols[count],
                        bring_bricks_online_methods=None)
    ret = are_bricks_online(self.mnode, self.volname, subvols[count])
    self.assertTrue(
        ret,
        "Error in bringing back subvol %s online" % (subvols[count],))
    g.log.info('Subvol is back online')
    ret, _, _ = g.run(self.clients[0], ("mkdir %s" % parent_dir))
    self.assertEqual(
        ret, 0, 'Expected mkdir of %s to succeed' % parent_dir)
    g.log.info('mkdir of dir %s successful', parent_dir)

    # get hash subvol for name "child"
    hashed, count = find_hashed_subvol(subvols, "parent", "child")
    self.assertIsNotNone(hashed, "Could not find hashed subvol")

    # bring target_brick offline
    _bring_subvol_offline(subvols[count])

    # create child dir: must fail because its hashed subvol is down
    ret, _, err = g.run(self.clients[0], ("mkdir %s" % child_dir))
    self.assertNotEqual(
        ret, 0,
        'Expected mkdir of %s to fail with %s' % (child_dir, err))
    g.log.info('mkdir of dir %s failed', child_dir)

    # check that child_dir does not exist on any bricks and client
    _assert_dir_absent(brickobject, "parent/child", child_dir)
def select_cold_tier_bricks_to_bring_offline(mnode, volname):
    """Select cold-tier bricks that can be brought offline.

    The selection is delegated to the replicated / dispersed helper that
    matches the cold tier's brick type.

    Args:
        mnode (str): Node on which commands will be executed.
        volname (str): Name of the volume.

    Returns:
        list: Bricks of the cold tier that can be brought offline. An
            empty list is returned when the volume is not tiered, when
            the cold tier is a plain 'Distribute' type, or when the cold
            tier type is not one of the handled types.
    """
    # Nothing to select on a non-tiered volume
    if not is_tiered_volume(mnode, volname):
        return []

    # Cold tier type and subvols
    type_info = get_volume_type_info(mnode, volname)
    brick_type = type_info['cold_tier_type_info']['coldBrickType']
    cold_subvols = get_subvols(mnode, volname)['cold_tier_subvols']

    # Replicated and distributed-replicated cold tier
    if brick_type in ('Replicate', 'Distributed-Replicate'):
        replica_count = type_info['cold_tier_type_info']['coldreplicaCount']
        quorum = (
            get_client_quorum_info(mnode, volname)['cold_tier_quorum_info'])
        return get_bricks_to_bring_offline_from_replicated_volume(
            cold_subvols, replica_count, quorum)

    # Dispersed and distributed-dispersed cold tier
    if brick_type in ('Disperse', 'Distributed-Disperse'):
        redundancy_count = (
            type_info['cold_tier_type_info']['coldredundancyCount'])
        return get_bricks_to_bring_offline_from_disperse_volume(
            cold_subvols, redundancy_count)

    # 'Distribute' (and any unhandled type): no brick is selected
    return []
def test_access_file_with_stale_linkto_xattr(self):
    """
    Description: Checks if the files are accessible as non-root user if
                 the files have stale linkto xattr.
    Steps:
    1) Create a volume and start it.
    2) Mount the volume on client node using FUSE.
    3) Create a file.
    4) Enable performance.parallel-readdir and
       performance.readdir-ahead on the volume.
    5) Rename the file in order to create a linkto file.
    6) Force the linkto xattr values to become stale by changing the dht
       subvols in the graph
    7) Login as an non-root user and access the file.
    """
    # pylint: disable=protected-access

    # Set permissions on the mount-point
    m_point = self.mounts[0].mountpoint
    ret = set_file_permissions(self.clients[0], m_point, "-R 777")
    self.assertTrue(ret, "Failed to set file permissions")
    g.log.info("Successfully set file permissions on mount-point")

    # Creating a file on the mount-point
    src_file = "{}/FILE-1".format(m_point)
    cmd = 'dd if=/dev/urandom of={} count=1 bs=16k'.format(src_file)
    ret, _, _ = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "Failed to create file")

    # Enable performance.parallel-readdir and
    # performance.readdir-ahead on the volume
    options = {"performance.parallel-readdir": "enable",
               "performance.readdir-ahead": "enable"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, "Failed to set volume options")
    g.log.info("Successfully set volume options")

    # Finding a file name such that renaming source file to it will form a
    # linkto file
    subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
    newhash = find_new_hashed(subvols, "/", "FILE-1")
    # Fail with a clear message instead of an AttributeError below
    self.assertIsNotNone(
        newhash, "Could not find a new name hashing to another subvol")
    new_name = str(newhash.newname)
    new_host = str(newhash.hashedbrickobject._host)
    # The last character of the brick's fqpath is dropped here.
    new_name_path = str(newhash.hashedbrickobject._fqpath)[:-1]

    # Move file such that it hashes to some other subvol and forms linkto
    # file
    dst_file = "{}/{}".format(m_point, new_name)
    ret = move_file(self.clients[0], src_file, dst_file)
    self.assertTrue(ret, "Rename failed")
    g.log.info('Renamed file %s to %s', src_file, dst_file)

    # Check if "dst_file" is linkto file
    ret = is_linkto_file(new_host,
                         '{}{}'.format(new_name_path, new_name))
    self.assertTrue(ret, "File is not a linkto file")
    g.log.info("File is linkto file")

    # Force the linkto xattr values to become stale by changing the dht
    # subvols in the graph; for that:
    # disable performance.parallel-readdir and
    # performance.readdir-ahead on the volume
    options = {"performance.parallel-readdir": "disable",
               "performance.readdir-ahead": "disable"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, "Failed to disable volume options")
    g.log.info("Successfully disabled volume options")

    # Access the file as non-root user
    cmd = "ls -lR {}".format(m_point)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd,
                      user="******")
    self.assertEqual(ret, 0, "Lookup failed ")
    g.log.info("Lookup successful")
def test_self_heal_with_diff_algorithm(self):
    """
    Test Steps:
    1. Create a replicated/distributed-replicate volume and mount it
    2. Set data/metadata/entry-self-heal to off and
       data-self-heal-algorithm to diff
    3. Create few files inside a directory with some data
    4. Check arequal of the subvol and all the bricks in the subvol should
       have same checksum
    5. Bring down a brick from the subvol and validate it is offline
    6. Modify the data of existing files under the directory
    7. Bring back the brick online and wait for heal to complete
    8. Check arequal of the subvol and all the brick in the same subvol
       should have same checksum
    """
    def _assert_arequal_same_within_subvols(subvol_list):
        """Assert every brick of each subvol reports the same arequal."""
        for subvol in subvol_list:
            ret, arequals = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal across the bricks in "
                "the subvol {}".format(subvol))
            # For arbiter volumes only the first two entries are compared
            if self.volume_type in ("arbiter", "distributed-arbiter"):
                arequals = arequals[:2]
            self.assertEqual(
                len(set(arequals)), 1,
                "Arequal is not same on all the bricks in the "
                "subvol {}".format(subvol))

    # Setting options
    for key, value in (("data-self-heal", "off"),
                       ("metadata-self-heal", "off"),
                       ("entry-self-heal", "off"),
                       ("data-self-heal-algorithm", "diff")):
        ret = set_volume_options(self.mnode, self.volname, {key: value})
        self.assertTrue(ret, 'Failed to set %s to %s.' % (key, value))
        g.log.info("%s set to %s successfully", key, value)

    # Create few files under a directory with data
    mountpoint = self.mounts[0].mountpoint
    client = self.mounts[0].client_system

    cmd = ("mkdir %s/test_diff_self_heal ; cd %s/test_diff_self_heal ;"
           "for i in `seq 1 100` ; do dd if=/dev/urandom of=file.$i "
           " bs=1M count=1; done;" % (mountpoint, mountpoint))
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to create file on mountpoint")
    g.log.info("Successfully created files on mountpoint")

    # Check arequal checksum of all the bricks is same
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    _assert_arequal_same_within_subvols(subvols)

    # List a brick in each subvol and bring them offline
    brick_to_bring_offline = []
    for subvol in subvols:
        self.assertTrue(subvol, "List is empty")
        brick_to_bring_offline.extend(sample(subvol, 1))

    ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
    self.assertTrue(
        ret,
        "Unable to bring brick: {} offline".format(brick_to_bring_offline))

    # Validate the brick is offline
    ret = are_bricks_offline(self.mnode, self.volname,
                             brick_to_bring_offline)
    self.assertTrue(
        ret, "Brick:{} is still online".format(brick_to_bring_offline))

    # Modify files under test_diff_self_heal directory.
    # The command must first cd into the directory on the mount:
    # otherwise truncate would create fresh files in the client's
    # working directory and the files on the volume would stay untouched.
    cmd = ("cd %s/test_diff_self_heal && "
           "for i in `seq 1 100` ; do truncate -s 0 file.$i ; "
           "truncate -s 2M file.$i ; done;" % mountpoint)
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to modify the files")
    g.log.info("Successfully modified files")

    # Start volume with force to bring all bricks online
    ret, _, _ = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Volume start with force failed")
    g.log.info("Volume: %s started successfully", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, "Volume %s : All process are not online" % self.volname)

    # Monitor heal completion
    self.assertTrue(
        monitor_heal_completion(self.mnode, self.volname,
                                interval_check=10),
        "Heal failed after 20 mins")

    # Check are there any files in split-brain
    self.assertFalse(
        is_volume_in_split_brain(self.mnode, self.volname),
        "Some files are in split brain for "
        "volume: {}".format(self.volname))

    # Check arequal checksum of all the bricks is same
    _assert_arequal_same_within_subvols(subvols)
def test_mem_leak_on_gluster_procs_after_ssl_enabled(self):
    """
    Steps:
    Scenario 1:
    1) Enable management encryption on the cluster.
    2) Create a 2X3 volume.
    3) Mount the volume using FUSE on a client node.
    4) Start doing IO on the mount (ran IO till the volume is ~88% full)
    5) Simultaneously start collecting the memory usage for
       'glusterfsd' process.
    6) Issue the command "# gluster v heal <volname> info" continuously
       in a loop.
    """
    # Size used for each file: 88% of the usable size of the first brick
    all_bricks = get_all_bricks(self.mnode, self.volname)
    usable_size = int(get_usable_size_per_disk(all_bricks[0]) * 0.88)

    # Kick off one asynchronous fallocate per subvol
    subvol_list = get_subvols(self.mnode, self.volname)['volume_subvols']
    procs = []
    for index, _subvol in enumerate(subvol_list, 1):
        target = "{}/test_file_{}".format(self.mounts[0].mountpoint,
                                          str(index))
        fallocate_cmd = "fallocate -l {}G {}".format(usable_size, target)
        procs.append(
            g.run_async(self.mounts[0].client_system, fallocate_cmd))

    # Start resource-usage logging on servers and clients with 780
    # samples (NOTE(review): presumably at the helper's default
    # interval - confirm against the helper's definition)
    monitor_proc_dict = self.start_memory_and_cpu_usage_logging(
        self.test_id, count=780)
    self.assertIsNotNone(
        monitor_proc_dict,
        "Failed to start monitoring on servers and " "clients")

    io_ok = validate_io_procs(procs, self.mounts)
    self.assertTrue(io_ok, "IO Failed")

    # Keep issuing heal info for 12 hours. The timestamp is taken before
    # each command, so the command runs one final time after the
    # deadline has passed.
    deadline = datetime.now() + timedelta(hours=12)
    heal_info_cmd = "gluster volume heal %s info" % self.volname
    deadline_passed = False
    while not deadline_passed:
        iteration_start = datetime.now()
        ret, _, _ = g.run(self.mnode, heal_info_cmd)
        self.assertEqual(ret, 0, "Failed to execute heal info cmd")
        deadline_passed = iteration_start > deadline
    g.log.info("Successfully ran for 12 hours. Checking for "
               "memory leaks")

    # Wait for monitoring processes to complete
    stopped = wait_for_logging_processes_to_stop(monitor_proc_dict,
                                                 cluster=True)
    self.assertTrue(stopped, "ERROR: Failed to stop monitoring processes")

    # Check for memory leaks and OOM kills, servers first then clients
    leaks_on_servers = (
        self.check_for_memory_leaks_and_oom_kills_on_servers(self.test_id))
    self.assertFalse(leaks_on_servers,
                     "Memory leak and OOM kills check failed on servers")

    leaks_on_clients = (
        self.check_for_memory_leaks_and_oom_kills_on_clients(self.test_id))
    self.assertFalse(leaks_on_clients,
                     "Memory leak and OOM kills check failed on clients")
    g.log.info("No memory leaks/OOM kills found on serves and clients")
def test_remove_brick_operations(self):
    """
    Steps:
    1. Remove data brick count number of bricks from the volume
       should fail
    2. step 1 with force option should fail
    3. Remove redundant brick count number of bricks from the volume
       should fail
    4. step 3 with force option should fail
    5. Remove data brick count+1 number of bricks from the volume
       should fail
    6. step 5 with force option should fail
    7. Remove disperse count number of bricks from the volume with
       one wrong brick path should fail
    8. step 7 with force option should fail
    9. Start remove brick on first subvol bricks
    10. Remove all the subvols to make a pure EC vol
        by start remove brick on second subvol bricks
    11. Start remove brick on third subvol bricks
    12. Write files and perform read on mountpoints
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements

    subvols_list = get_subvols(self.mnode, self.volname)
    volinfo = get_volume_info(self.mnode, self.volname)
    initial_brickcount = volinfo[self.volname]['brickCount']
    disperse_count = self.volume['voltype']['disperse_count']
    redundancy_count = self.volume['voltype']['redundancy_count']
    data_brick_count = disperse_count - redundancy_count
    subvols = subvols_list['volume_subvols']

    def _assert_remove_brick_rejected(bricks):
        """Assert remove-brick fails with both 'start' and 'force'."""
        for option in ("start", "force"):
            ret, _, _ = remove_brick(self.mnode, self.volname, bricks,
                                     option=option)
            self.assertEqual(
                ret, 1, ("ERROR: Removed bricks %s from the volume "
                         "%s" % (bricks, self.volname)))

    def _assert_brick_count_unchanged():
        """Assert the volume still has its initial number of bricks."""
        info = get_volume_info(self.mnode, self.volname)
        self.assertEqual(
            initial_brickcount, info[self.volname]['brickCount'],
            "Brick count is not expected to change, but changed")

    def _remove_subvol(bricks, check_count_after_start=False):
        """Start remove-brick on a subvol, wait for it, then commit."""
        ret, _, _ = remove_brick(self.mnode, self.volname, bricks,
                                 option="start")
        self.assertEqual(
            ret, 0, ("Failed to remove bricks %s from the volume "
                     "%s" % (bricks, self.volname)))

        # The brick count must only change once the removal is committed
        if check_count_after_start:
            _assert_brick_count_unchanged()

        ret = wait_for_remove_brick_to_complete(self.mnode, self.volname,
                                                bricks)
        self.assertTrue(
            ret, ("Remove brick is not yet complete on the volume "
                  "%s" % self.volname))
        g.log.info("Remove brick is successfully complete on the "
                   "volume %s", self.volname)

        ret, _, _ = remove_brick(self.mnode, self.volname, bricks,
                                 option="commit")
        self.assertEqual(
            ret, 0, ("Failed to commit remove bricks %s from the volume "
                     "%s" % (bricks, self.volname)))
        g.log.info("Remove brick is successfully committed on the "
                   "volume %s", self.volname)

    # Try to remove data brick count number of bricks from the volume
    _assert_remove_brick_rejected(subvols[0][0:data_brick_count])

    # Try to remove redundant brick count number of bricks from the volume
    _assert_remove_brick_rejected(subvols[0][0:redundancy_count])

    # Try to remove data brick count+1 number of bricks from the volume
    _assert_remove_brick_rejected(subvols[0][0:data_brick_count + 1])

    # Try to remove disperse count number of bricks from the volume with
    # one wrong brick path. The slice is a copy, so the subvol list
    # itself keeps the correct brick path.
    bricks_with_wrong_path = subvols[0][0:disperse_count]
    bricks_with_wrong_path[0] = bricks_with_wrong_path[0] + "wrong_path"
    _assert_remove_brick_rejected(bricks_with_wrong_path)

    # Verify that the brick count is intact
    _assert_brick_count_unchanged()

    # Start remove brick on first subvol bricks
    _remove_subvol(subvols[0], check_count_after_start=True)

    # Remove all the subvols to make a pure EC vol
    # Start remove brick on second subvol bricks
    _remove_subvol(subvols[1])

    # Start remove brick on third subvol bricks
    _remove_subvol(subvols[2])

    # Log volume info and status
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s" % self.volname))
    g.log.info("Successful in logging volume info and status "
               "of volume %s", self.volname)

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # Write some files on the mount point
    cmd1 = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
            "do touch file$i; done" % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd1)
    self.assertEqual(ret, 0, ("Write operation failed on client "
                              "%s " % self.mounts[0].client_system))
    g.log.info("Writes on mount point successful")

    # Perform read operation on mountpoint
    cmd2 = ("cd %s; ls -lRt;" % self.mounts[0].mountpoint)
    ret, _, _ = g.run(self.mounts[0].client_system, cmd2)
    self.assertEqual(ret, 0, ("Read operation failed on client "
                              "%s " % self.mounts[0].client_system))
    g.log.info("Read on mount point successful")
def test_ec_open_fd(self):
    """
    Test Steps:
    - disable server side heal
    - Create a file
    - Set volume option to implement open FD on file
    - Bring a brick down,say b1
    - Open FD on file
    - Bring brick b1 up
    - write to open FD file
    - Monitor heal
    - Check xattr , ec.version and ec.size of file
    - Check stat of file
    """

    # pylint: disable=too-many-branches,too-many-statements,too-many-locals

    mountpoint = self.mounts[0].mountpoint

    # Disable server side heal
    ret = disable_heal(self.mnode, self.volname)
    self.assertTrue(ret, "Failed to disable server side heal")
    g.log.info("Successfully disabled server side heal")

    # Log Volume Info and Status after disabling server side heal
    ret = log_volume_info_and_status(self.mnode, self.volname)
    self.assertTrue(ret, ("Logging volume info and status failed "
                          "on volume %s" % self.volname))

    # Create a file
    cmd = ("cd %s; touch 'file_openfd';" % mountpoint)
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('Finished creating a file while all the bricks are UP')

    # Set volume options
    ret = set_volume_options(self.mnode, self.volname,
                             {"performance.read-after-open": "yes"})
    self.assertTrue(ret, 'Failed to set volume {}'
                         ' options'.format(self.volname))
    g.log.info('Successfully set %s volume options', self.volname)

    # Bringing brick b1 offline
    sub_vols = get_subvols(self.mnode, self.volname)
    subvols_list = sub_vols['volume_subvols']
    bricks_list1 = subvols_list[0]
    brick_b1_down = choice(bricks_list1)
    # Passed as a one-element list, matching the bring_bricks_online
    # call below.
    ret = bring_bricks_offline(self.volname, [brick_b1_down])
    self.assertTrue(ret, 'Brick %s is not offline' % brick_b1_down)
    g.log.info('Brick %s is offline successfully', brick_b1_down)

    node = self.mounts[0].client_system
    # Open FD; the asynchronous process is awaited after the brick is
    # back online.
    proc = open_file_fd(mountpoint, time=100, client=node)

    # Bring brick b1 online
    ret = bring_bricks_online(self.mnode, self.volname,
                              [brick_b1_down],
                              'glusterd_restart')
    self.assertTrue(ret, 'Brick {} is not brought '
                         'online'.format(brick_b1_down))
    g.log.info('Brick %s is online successfully', brick_b1_down)

    # Validate peers are connected
    ret = self.validate_peers_are_connected()
    self.assertTrue(ret, "Peers are not in connected state after bringing"
                         " an offline brick to online via `glusterd "
                         "restart`")
    g.log.info("Successfully validated peers are in connected state")

    # Check if write to FD is successful
    ret, _, _ = proc.async_communicate()
    self.assertEqual(ret, 0, "Write to FD failed")
    g.log.info('Open FD on file successful')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')
    g.log.info('Heal has completed successfully')

    file_openfd = os.path.join(mountpoint, 'file_openfd')

    # Check if data exists on file
    ret = check_if_pattern_in_file(node, 'xyz', file_openfd)
    self.assertEqual(ret, 0, 'xyz does not exists in file')
    g.log.info('xyz exists in file')

    file_fd = 'file_openfd'

    # Check if EC version is same on all bricks which are up
    ret = validate_xattr_on_all_bricks(bricks_list1, file_fd,
                                       'trusted.ec.version')
    self.assertTrue(ret, "Healing not completed and EC version is "
                         "not updated")
    g.log.info("Healing is completed and EC version is updated")

    # Check if EC size is same on all bricks which are up
    ret = validate_xattr_on_all_bricks(bricks_list1, file_fd,
                                       'trusted.ec.size')
    self.assertTrue(ret, "Healing not completed and EC size is "
                         "not updated")
    g.log.info("Healing is completed and EC size is updated")

    # Check stat of file
    cmd = "cd %s; du -kh file_openfd" % mountpoint
    ret, _, err = g.run(self.mounts[0].client_system, cmd)
    self.assertEqual(ret, 0, err)
    g.log.info('File %s is accessible', file_fd)
def test_metadata_self_heal_on_open_fd(self):
    """
    Description: Pro-active metadata self heal on open fd

    Steps :
    1) Create a volume.
    2) Mount the volume using FUSE.
    3) Create test executable on volume mount.
    4) While test execution is in progress, bring down brick1.
    5) From mount point, change ownership, permission, group id of
    the test file.
    6) While test execution is in progress, bring back brick1 online.
    7) Do stat on the test file to check ownership, permission,
    group id on mount point and on bricks
    8) Stop test execution.
    9) Do stat on the test file to check ownership, permission,
    group id on mount point and on bricks.
    10) There should be no pending heals in the heal info command.
    11) There should be no split-brain.
    12) Calculate arequal of the bricks and mount point and it
    should be same.
    """
    # pylint: disable=too-many-statements,too-many-locals
    # pylint: disable=too-many-branches
    bricks_list = get_all_bricks(self.mnode, self.volname)
    self.assertIsNotNone(bricks_list, 'Brick list is None')
    client = self.clients[0]

    # Create test executable file on mount point
    m_point = self.mounts[0].mountpoint
    test_file = "testfile.sh"
    file_on_mount = "{}/{}".format(m_point, test_file)
    cmd = ("echo 'while true; do echo 'Press CTRL+C to stop execution';"
           " done' >> {}".format(file_on_mount))
    ret, _, _ = g.run(client, cmd)
    self.assertEqual(ret, 0, "Failed to create test file")

    # Execute the test file
    cmd = "cd {}; sh {}".format(m_point, test_file)
    g.run_async(client, cmd)

    # Get pid of the test file
    _cmd = "ps -aux | grep -v grep | grep testfile.sh | awk '{print $2}'"
    ret, out, _ = g.run(client, _cmd)
    self.assertEqual(ret, 0, "Failed to get pid of test file execution")

    # Bring brick1 offline
    ret = bring_bricks_offline(self.volname, [bricks_list[1]])
    self.assertTrue(
        ret, 'Failed to bring bricks {} '
        'offline'.format(bricks_list[1]))

    ret = are_bricks_offline(self.mnode, self.volname,
                             [bricks_list[1]])
    self.assertTrue(ret, 'Bricks {} are not '
                    'offline'.format(bricks_list[1]))

    # change uid, gid and permission from client; each failure message
    # is derived from the command name so it cannot name the wrong step
    for action, cmd in (
            ("chown", "chown {} {}".format(self.user, file_on_mount)),
            ("chgrp", "chgrp {} {}".format(self.user, file_on_mount)),
            ("chmod", "chmod 777 {}".format(file_on_mount))):
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "{} failed".format(action))

    # Bring brick1 online
    ret = bring_bricks_online(self.mnode, self.volname,
                              [bricks_list[1]])
    self.assertTrue(
        ret, 'Failed to bring bricks {} online'.format(bricks_list[1]))

    ret = get_pathinfo(client, file_on_mount)
    self.assertIsNotNone(
        ret, "Unable to get "
        "trusted.glusterfs.pathinfo of file")

    # Map every node holding the file to the directory that contains it.
    # From here on bricks_list holds only the bricks that store the file.
    nodes_to_check = {}
    bricks_list = []
    for brick in ret['brickdir_paths']:
        node, brick_path = brick.split(':')
        # A node that does not start with two digits is passed through
        # gethostbyname before being used as a key.
        if not node[0:2].isdigit():
            node = gethostbyname(node)
        brick_dir = os.path.dirname(brick_path)
        nodes_to_check[node] = brick_dir
        bricks_list.append(node + ":" + brick_dir)
    nodes_to_check[client] = m_point

    # Verify that the changes are successful on bricks and client
    self._verify_stat_info(nodes_to_check, test_file)

    # Kill the test executable file; the element after the last newline
    # of the ps output is skipped
    for pid in out.split('\n')[:-1]:
        cmd = "kill -s 9 {}".format(pid)
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Failed to kill test file execution")

    # Verify that the changes are successful on bricks and client
    self._verify_stat_info(nodes_to_check, test_file)

    # Verify there are no pending heals
    heal_info = get_heal_info_summary(self.mnode, self.volname)
    self.assertIsNotNone(heal_info, 'Unable to get heal info')
    for brick in bricks_list:
        self.assertEqual(int(heal_info[brick]['numberOfEntries']),
                         0, ("Pending heal on brick {} ".format(brick)))

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get arequal for mount
    ret, arequals = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

    # Collecting data bricks (entries whose isArbiter field is "0")
    vol_info = get_volume_info(self.mnode, self.volname)
    self.assertIsNotNone(vol_info, 'Unable to get volume info')
    data_brick_list = []
    for brick in bricks_list:
        for brick_info in vol_info[self.volname]["bricks"]["brick"]:
            if (brick_info["name"] == brick
                    and brick_info["isArbiter"] == "0"):
                data_brick_list.append(brick)
    bricks_list = data_brick_list

    # Get arequal on bricks and compare with mount_point_total
    # It should be the same
    arbiter = self.volume_type.find('arbiter') >= 0
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    # On arbiter volumes the last brick of a subvol is left out
    stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
    for subvol in subvols:
        subvol = [i for i in subvol if i in bricks_list]
        if subvol:
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get arequal checksum '
                'on {}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of arequal '
                'checksum among {} is '
                'identified'.format(subvol[0:stop]))
            brick_total = arequal[-1].splitlines()[-1].split(':')[-1]
            self.assertEqual(
                brick_total, mount_point_total,
                "Arequals for mountpoint and {} "
                "are not equal".format(subvol[0:stop]))
def is_layout_complete(mnode, volname, dirpath):
    """Check whether the layout of a directory is complete.

    This function reads the subvols in the given volume, collects the hash
    range of ``dirpath`` on every brick and checks whether the layout is
    complete or not. Layout starts at zero, ends at 32-bits high, and has
    no holes or overlaps.

    Args:
        volname (str): volume name
        mnode (str): Node on which cmd has to be executed.
        dirpath (str): directory path; starting from root of mount point.

    Returns (bool): True if layout is complete
                    False if layout has any holes or overlaps, or if no
                    usable hash ranges could be collected

    Example:
        is_layout_complete("abc.xyz.com", "testvol", "/")
        is_layout_complete("abc.xyz.com", "testvol", "/dir1/dir2/dir3")
    """
    subvols_list = get_subvols(mnode, volname)['volume_subvols']

    # flatten the per-subvol brick lists and append the dirpath to each brick
    brick_dirs = [brick + dirpath
                  for subvol in subvols_list
                  for brick in subvol]

    joined_hashranges = []
    for fqpath in brick_dirs:
        joined_hashranges.extend(BrickDir(fqpath).hashrange)
    g.log.debug("joined range list: %s" % joined_hashranges)

    # remove duplicate hashes and sort the remaining range boundaries
    collapsed_ranges = sorted(set(joined_hashranges))

    # nothing collected: there is no layout to validate
    if not collapsed_ranges:
        g.log.error('No hash ranges found for %s' % dirpath)
        return False

    # first hash in the list is 0?
    if collapsed_ranges[0] != 0:
        g.log.error('First hash in range (%d) is not zero' %
                    collapsed_ranges[0])
        return False

    # last hash in the list is 32-bits high?
    if collapsed_ranges[-1] != 0xffffffff:
        g.log.error('Last hash in ranges (%s) is not 0xffffffff' %
                    hex(collapsed_ranges[-1]))
        return False

    # remove the first and last hashes
    clipped_ranges = collapsed_ranges[1:-1]
    g.log.debug('clipped: %s' % clipped_ranges)

    # The inner boundaries are consumed as (end, next start) pairs. An odd
    # count means one boundary has no partner, so the ranges cannot tile the
    # hash space cleanly (previously this surfaced as a StopIteration).
    if len(clipped_ranges) % 2:
        g.log.error("Layout has an unpaired range boundary")
        return False

    # walk through the list in pairs and look for diff == 1
    for first, second in zip(clipped_ranges[0::2], clipped_ranges[1::2]):
        hash_difference = second - first
        g.log.debug('%d - %d = %d' % (second, first, hash_difference))
        if hash_difference > 1:
            g.log.error("Layout has holes")
            return False
        if hash_difference < 1:
            # defensive: after de-duplication and sorting the difference
            # between neighbours is never below 1
            g.log.error("Layout has overlaps")
            return False

    return True
def test_split_brain(self):
    """
    Description: Create split-brain on files and check if IO's fail
    - Disable self-heal and cluster-quorum-type
    - Get the bricks from the volume
    - Write IO and validate IO
    - Bring 1st set of brick offline(1 Data brick and arbiter brick)
    - Write IO and validate IO
    - Bring 2nd set of bricks offline(1 Data brick and arbiter brick)
    - Write IO and validate IO
    - Check volume is in split-brain
    - Write IO and validate IO - should fail
    - Enable self-heal and cluster-quorum-type
    - Write IO and validate IO - should fail
    """
    client = self.mounts[0].client_system

    # Disable self-heal and cluster-quorum-type
    options = {"self-heal-daemon": "off",
               "cluster.quorum-type": "none"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))

    # Get the bricks from the volume
    sub_vols = get_subvols(self.mnode, self.volname)
    self.bricks_to_bring_offline = list(sub_vols['volume_subvols'][0])

    # Write IO's
    write_cmd = ("/usr/bin/env python %s create_files -f 1 "
                 "--base-file-name test_file --fixed-file-size 1k %s" %
                 (self.script_upload_path, self.mounts[0].mountpoint))
    ret, _, err = g.run(client, write_cmd)
    # Validate the initial write; its exit status used to be ignored
    self.assertEqual(ret, 0, "Initial IO failed on %s: %s" % (client, err))

    # Bring each set of bricks offline in turn: index pairs select one
    # brick plus the last brick of the first subvol (per the description
    # above: one data brick and the arbiter brick)
    for bricks in ((0, -1), (1, -1)):
        down_bricks = [self.bricks_to_bring_offline[brick]
                       for brick in bricks]
        ret = bring_bricks_offline(self.volname, down_bricks)
        self.assertTrue(
            ret, 'Failed to bring bricks {} offline'.format(down_bricks))

        proc = g.run_async(client, write_cmd)

        # Validate I/O
        self.assertTrue(validate_io_procs([proc], self.mounts),
                        "IO failed on some of the clients")

        # Bring bricks online
        self._bring_bricks_online()

    # Check volume is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "unable to create split-brain scenario")
    g.log.info("Successfully created split brain scenario")

    # Write IO's
    proc2 = g.run_async(client, write_cmd)

    # Validate I/O
    self.assertFalse(validate_io_procs([proc2], self.mounts),
                     "IO passed on split-brain")
    g.log.info("Expected - IO's failed due to split-brain")

    # Enable self-heal and cluster-quorum-type
    options = {"self-heal-daemon": "on",
               "cluster.quorum-type": "auto"}
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))

    # Write IO's
    proc3 = g.run_async(client, write_cmd)

    # Validate I/O
    self.assertFalse(validate_io_procs([proc3], self.mounts),
                     "IO passed on split-brain")
    g.log.info("Expected - IO's failed due to split-brain")
def test_brick_full_add_brick_rebalance(self):
    """
    Test case:
    1. Create a volume, start it and mount it.
    2. Fill the volume from the client until "No space left on device"
       is reported.
    3. Set cluster.min-free-disk to 30%.
    4. Add bricks to the volume, trigger rebalance and wait for
       rebalance to complete.
    """
    client = self.mounts[0].client_system
    mountpoint = self.mounts[0].mountpoint
    enospc = 'No space left on device'

    # Size of each fallocate request is derived from the first brick
    all_bricks = get_all_bricks(self.mnode, self.volname)
    size_per_disk = get_usable_size_per_disk(all_bricks[0])
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

    fname = "abc"
    for subvol in subvols:
        # Keep drawing random names while the name hashes to the subvol
        # currently being iterated.
        # NOTE(review): this selects a name that does NOT hash to
        # `subvol` -- confirm that is the intended selection.
        while True:
            hashed_index = find_hashed_subvol(subvols, "/", fname)[1]
            if subvols[hashed_index] != subvol:
                break
            fname = self._get_random_string()

        fill_cmd = "fallocate -l {}G {}/{}".format(
            size_per_disk, mountpoint, fname)
        ret, _, err = g.run(client, fill_cmd)
        # Running out of space is the expected way for the fill to end
        if ret and enospc in err:
            ret = 0
        self.assertFalse(ret, "Failed to fill disk to min free limit")
    g.log.info("Disk filled up to min free limit")

    # Further I/O from the mount point must now fail
    ret, _, _ = g.run(client,
                      "fallocate -l 5G {}/mfile".format(mountpoint))
    self.assertTrue(
        ret,
        "Unexpected: Able to do I/O even when disks are "
        "filled to min free limit")
    g.log.info("Expected: Unable to perfrom I/O as min free disk is hit")

    # Set cluster.min-free-disk to 30%
    min_free_option = {'cluster.min-free-disk': '30%'}
    ret = set_volume_options(self.mnode, self.volname, min_free_option)
    self.assertTrue(ret, "Failed to set cluster.min-free-disk to 30%")

    # Add brick to volume
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to add brick on volume %s"
                    % self.volname)

    # Start a forced rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname, force=True)
    self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s"
                     % self.volname)

    # Block until rebalance finishes (or the timeout expires)
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                         timeout=1200)
    self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                         "%s" % self.volname)
    g.log.info("Rebalance successfully completed")
def test_gfid_split_brain_resolution(self):
    """
    - create gfid split-brain of files and resolves them using source-brick
      option of the CLI.
    - trigger heal, wait for it to complete and compare the arequal
      checksum of every brick with the first brick of its subvol.
    """
    # pylint: disable=too-many-statements
    # pylint: disable=too-many-locals

    def brick_arequal_total(brick):
        """Return the total line value of arequal-checksum for a brick."""
        node, brick_path = brick.split(':')
        command = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan'
                   % brick_path)
        ret, arequal, _ = g.run(node, command)
        # Checked for every brick, including the first one of each
        # subvol, before the output is parsed
        self.assertFalse(ret, 'Failed to get arequal on brick %s' % brick)
        g.log.info('Getting arequal for %s is successful', brick)
        return arequal.splitlines()[-1].split(':')[-1]

    # Disable all self-heals and client-quorum
    options = {
        "self-heal-daemon": "off",
        "data-self-heal": "off",
        "metadata-self-heal": "off",
        "entry-self-heal": "off",
        "cluster.quorum-type": "none"
    }
    g.log.info("setting volume options %s", options)
    ret = set_volume_options(self.mnode, self.volname, options)
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Create dir inside which I/O will be performed.
    ret = mkdir(self.mounts[0].client_system,
                "%s/test_gfid_split_brain" % self.mounts[0].mountpoint)
    self.assertTrue(ret, "mkdir failed")

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s:", num_subvols)

    # Toggle bricks and perform I/O
    file_list = [
        "file1.txt", "file2.txt", "file3.txt", "file4.txt", "file5.txt",
        "file6.txt", "file7.txt", "file8.txt", "file9.txt", "file10.txt"
    ]
    brick_index = 0
    offline_bricks = []
    for _ in range(0, 3):
        # In every round pick two of the three bricks of each subvol,
        # rotating the pair by one position per round
        for subvol_brick_list in subvols_dict['volume_subvols']:
            offline_bricks.append(subvol_brick_list[brick_index % 3])
            offline_bricks.append(subvol_brick_list[(brick_index + 1) % 3])
        self.toggle_bricks_and_perform_io(file_list, offline_bricks)
        brick_index += 1
        offline_bricks[:] = []

    # Enable shd
    g.log.info("enabling the self heal daemon")
    ret = enable_self_heal_daemon(self.mnode, self.volname)
    self.assertTrue(ret, "failed to enable self heal daemon")
    g.log.info("Successfully enabled the self heal daemon")

    # Wait for self heal processes to come online
    g.log.info("Wait for selfheal process to come online")
    timeout = 300
    ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname,
                                                  timeout)
    self.assertTrue(ret, "Self-heal process are not online")
    g.log.info("All self heal process are online")

    # Trigger heal
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    # checking if file is in split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertTrue(ret, "Files are not in split-brain as expected.")
    g.log.info("Files are still in split-brain")

    # First brick of each replica will be used as source-brick
    first_brick_list = [subvol_brick_list[0]
                        for subvol_brick_list
                        in subvols_dict['volume_subvols']]

    # Find which dht subvols the 10 files are present in and trigger heal
    for filename in file_list:
        fpath = (self.mounts[0].mountpoint + "/test_gfid_split_brain/" +
                 filename)
        gfile = GlusterFile(self.clients[0], fpath)
        for brick in first_brick_list:
            _, brick_path = brick.split(':')
            # resolve only on the source brick whose path appears among
            # the bricks the file hashes to
            if any(brick_path in item for item in gfile.hashed_bricks):
                self.resolve_gfid_split_brain(
                    "/test_gfid_split_brain/" + filename, brick)

    # Trigger heal to complete pending data/metadata heals
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Starting heal failed')
    g.log.info('Index heal launched')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Get arequals and compare
    for subvol_brick_list in subvols_dict['volume_subvols']:
        # Get arequal for first brick
        first_brick_total = brick_arequal_total(subvol_brick_list[0])

        # Get arequal for every brick and compare with first brick
        for brick in subvol_brick_list[1:]:
            brick_total = brick_arequal_total(brick)
            self.assertEqual(
                first_brick_total, brick_total,
                'Arequals for subvol and %s are not equal' % brick)
            g.log.info('Arequals for subvol and %s are equal', brick)
def test_heal_info_shouldnot_list_files_being_accessed(self):
    """
    - bring brick 1 offline
    - create files and validate IO
    - get entries before accessing file
    - get first filename from active subvol without offline bricks
    - access and modify the file
    - while accessing - get entries
    - Compare entries before accessing and while accessing
    - validate IO
    """
    # Bring 1-st brick offline
    brick_to_bring_offline = [self.bricks_list[0]]
    g.log.info('Bringing bricks %s offline...' % brick_to_bring_offline)
    ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
    self.assertTrue(ret, 'Failed to bring bricks %s offline'
                    % brick_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             brick_to_bring_offline)
    self.assertTrue(ret, 'Bricks %s are not offline'
                    % brick_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful'
               % brick_to_bring_offline)

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s"
                   % (mount_obj.client_system, mount_obj.mountpoint))

        # Creating files
        cmd = ("python %s create_files -f 100 %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.io_validation_complete = True
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # Get entries before accessing file
    g.log.info("Getting entries_before_accessing file...")
    entries_before_accessing = get_heal_info_summary(
        self.mnode, self.volname)
    self.assertNotEqual(entries_before_accessing, None,
                        'Can`t get heal info summary')
    g.log.info(
        "Getting entries_before_accessing file finished successfully")

    # Get filename to access from active subvol without offline bricks
    # Get last subvol
    subvols = get_subvols(self.mnode, self.volname)
    subvol_without_offline_brick = subvols['volume_subvols'][-1]

    # Get first brick server and brick path
    # and get first file from filelist
    subvol_mnode, mnode_brick = subvol_without_offline_brick[0].split(':')
    ret, file_list, err = g.run(subvol_mnode, 'ls %s' % mnode_brick)
    # Fail with a clear message instead of an IndexError below when the
    # listing failed or the brick is empty
    self.assertEqual(ret, 0, 'Failed to list files on %s:%s: %s'
                     % (subvol_mnode, mnode_brick, err))
    self.assertTrue(file_list.strip(), 'No files found on %s:%s'
                    % (subvol_mnode, mnode_brick))
    file_to_edit = file_list.splitlines()[0]

    # Access and modify the file
    g.log.info("Start modifying IO on all mounts...")
    self.all_mounts_procs = []
    for mount_obj in self.mounts:
        g.log.info("Modifying IO on %s:%s", mount_obj.client_system,
                   mount_obj.mountpoint)

        cmd = ("cd %s/ ; "
               "dd if=/dev/zero of=%s bs=1G count=1"
               % (mount_obj.mountpoint, file_to_edit))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        g.log.info("IO on %s:%s is modified successfully"
                   % (mount_obj.client_system, mount_obj.mountpoint))
    self.io_validation_complete = False

    # Get entries while accessing file
    g.log.info("Getting entries while accessing file...")
    entries_while_accessing = get_heal_info_summary(
        self.mnode, self.volname)
    # Check the summary that was just fetched (the earlier code
    # re-checked entries_before_accessing here)
    self.assertNotEqual(entries_while_accessing, None,
                        'Can`t get heal info summary')
    g.log.info("Getting entries while accessing file "
               "finished successfully")

    # Compare dicts before accessing and while accessing.
    # assertEqual is used instead of the Python 2-only cmp() builtin.
    g.log.info('Comparing entries before modifying and while modifying...')
    self.assertEqual(entries_before_accessing, entries_while_accessing,
                     'Entries before modifying and while modifying '
                     'are not equal')
    g.log.info('Comparison entries before modifying and while modifying '
               'finished successfully.')

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")
def test_one_brick_full_add_brick_rebalance(self):
    """
    Test case:
    1. Create a pure distribute volume with 3 bricks.
    2. Start it and mount it on client.
    3. Fill one disk of the volume till it's full
    4. Add brick to volume, start rebalance and wait for it to complete.
    5. Check arequal checksum before and after add brick should be same.
    6. Check if link files are present on bricks or not.
    """
    client = self.mounts[0].client_system

    # Fill few bricks till it is full
    bricks = get_all_bricks(self.mnode, self.volname)

    # Calculate the usable size and fill till it reaches
    # min free limit
    usable_size = get_usable_size_per_disk(bricks[0])
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    fname = "abc"

    # Create directories in hierarchy
    dirp = "/dir1/dir2/"
    path = "{}{}".format(self.mounts[0].mountpoint, dirp)
    ret = mkdir(client, path, parents=True)
    self.assertTrue(ret, "Failed to create dir hierarchy")

    # One 1G file per iteration, usable_size iterations in total
    for _ in range(0, usable_size):

        # Keep drawing names until one hashes to the first subvol, so
        # that every file is created on the same disk
        while (subvols[find_hashed_subvol(subvols, dirp, fname)[1]][0] !=
               subvols[0][0]):
            fname = self._get_random_string()
        ret, _, _ = g.run(client,
                          "fallocate -l 1G {}{}".format(path, fname))
        self.assertFalse(ret, "Failed to fill disk to min free limit")
        # start the next iteration with a fresh name
        fname = self._get_random_string()
    g.log.info("Disk filled up to min free limit")

    # Collect arequal checksum before ops
    arequal_checksum_before = collect_mounts_arequal(self.mounts[0])

    # Add brick to volume
    ret = expand_volume(self.mnode, self.volname, self.servers,
                        self.all_servers_info)
    self.assertTrue(ret, "Failed to add brick on volume %s"
                    % self.volname)

    # Trigger rebalance and wait for it to complete
    ret, _, _ = rebalance_start(self.mnode, self.volname,
                                force=True)
    self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s"
                     % self.volname)

    # Wait for rebalance to complete
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                         timeout=1800)
    self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                         "%s" % self.volname)
    g.log.info("Rebalance successfully completed")

    # Check for data loss by comparing arequal before and after ops
    arequal_checksum_after = collect_mounts_arequal(self.mounts[0])
    self.assertEqual(arequal_checksum_before, arequal_checksum_after,
                     "arequal checksum is NOT MATCHNG")
    g.log.info("arequal checksum is SAME")

    # Check if linkto files exist or not as rebalance is already
    # completed we shouldn't be seeing any linkto files.
    # Only the bricks listed before the volume was expanded are checked.
    for brick in bricks:
        node, brick_root = brick.split(":")
        brick_dir = brick_root + dirp
        list_of_files = get_dir_contents(node, brick_dir)
        self.assertIsNotNone(list_of_files, "Unable to get files")
        for filename in list_of_files:
            fqpath = "{}{}".format(brick_dir, filename)
            ret = get_dht_linkto_xattr(node, fqpath)
            # The assertion fails when a linkto xattr IS returned, so
            # the message reports that instead of a fetch failure
            self.assertIsNone(
                ret, "Unexpected: dht linkto xattr found on {}:{}".format(
                    node, fqpath))
def test_impact_of_replace_brick_for_glustershd(self):
    """
    Verify the impact of replace-brick on the self-heal daemon.
    - Record the glustershd pids on the volume servers
    - Validate the volume bricks against the glustershd server volfile
    - Replace the last brick of every sub-volume
    - Wait for the volume processes to be online
    - Verify glustershd is daemonized and fetch its pids again
    - Check the pids differ from the ones recorded before replacing
    - Validate the new brick list against the glustershd server volfile
    - Remove the directories of the replaced bricks
    """
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    nodes = self.volume['servers']
    replaced_bricks = []

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on "
               "nodes %s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process "
                          "found : %s" % pids))
    g.log.info(
        "Successful in getting Single self heal daemon process"
        " on all nodes %s", nodes)
    glustershd_pids = pids

    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume : %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List : %s", bricks_list)

    # validate the bricks present in volume info with
    # glustershd server volume file
    g.log.info("Starting parsing file %s on "
               "node %s", self.glustershd, self.mnode)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file. "
                          "Please check log file for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s:", num_subvols)

    # replace brick from each sub-vol
    for i, subvol_brick_list in enumerate(subvols_dict['volume_subvols']):
        g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
        brick_to_replace = subvol_brick_list[-1]
        # the replacement brick is the old brick path with a 'new' suffix
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume : %s",
                   brick_to_replace, self.volname)
        ret, _, err = replace_brick(self.mnode, self.volname,
                                    brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s to %s successfully',
                   brick_to_replace, new_brick)
        replaced_bricks.append(brick_to_replace)

    # Verify volume's all process are online for 60 sec
    g.log.info("Verifying volume's all process are online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                               timeout=60)
    # the message is %-formatted; it used to be passed as a
    # (format, argument) tuple and was never interpolated
    self.assertTrue(ret, ("Volume %s : All process are not "
                          "online" % self.volname))
    g.log.info("Successfully Verified volume %s processes are online",
               self.volname)

    # Verify glustershd process releases its parent process
    ret = is_shd_daemonized(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or "
                          "more than One self heal daemon process found"))

    # check the self-heal daemon process
    g.log.info("Starting to get self-heal daemon process on nodes "
               "%s", nodes)
    ret, pids = get_self_heal_daemon_pid(nodes)
    self.assertTrue(ret, ("Either No self heal daemon process found or"
                          " more than One self heal daemon process"
                          " found : %s" % pids))
    g.log.info(
        "Successful in getting Single self heal daemon process"
        " on all nodes %s", nodes)
    glustershd_pids_after_replacement = pids

    # Compare pids before and after replacing
    self.assertNotEqual(
        glustershd_pids, glustershd_pids_after_replacement,
        "Self Daemon process is same before and"
        " after replacing bricks")
    g.log.info("Self Heal Daemon Process is different before and "
               "after replacing bricks")

    # get the bricks for the volume after replacing
    bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick List after expanding "
               "volume: %s", bricks_list_after_replacing)

    # validate the bricks present in volume info
    # with glustershd server volume file after replacing bricks
    g.log.info("Starting parsing file %s", self.glustershd)
    ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                         bricks_list_after_replacing)
    self.assertTrue(ret, ("Brick List from volume info is different "
                          "from glustershd server volume file after "
                          "replacing bricks. Please check log file "
                          "for details"))
    g.log.info("Successfully parsed %s file", self.glustershd)

    g.log.info("Starting to delete replaced brick dir's")

    # Remove brick directories of the replaced bricks as this is not
    # handled by tearDown class
    for bricks in replaced_bricks:
        node, brick_path = bricks.split(r':')
        cmd = "rm -rf " + brick_path
        ret, _, _ = g.run(node, cmd)
        if ret:
            raise ExecutionError("Failed to delete the brick dir's for"
                                 " %s and brick %s" % (node, brick_path))
        g.log.info("Successfully deleted brick dir's for replaced bricks")
def test_client_side_quorum_with_auto_option_cross2(self):
    """
    Test Script to verify the Client Side Quorum with auto option

    * set cluster.quorum-type to auto.
    * start I/O from the mount point.
    * kill 2-nd brick process from the each and every replica set
    * perform ops
    """
    # pylint: disable=too-many-branches,too-many-statements
    first_mount = self.mounts[0]

    def run_script_on_first_mount(script_args):
        """Run the IO script on the first mount and validate the IO."""
        cmd = ("python %s %s %s"
               % (self.script_upload_path, script_args,
                  first_mount.mountpoint))
        proc = g.run_async(first_mount.client_system, cmd,
                           user=first_mount.user)
        self.assertTrue(validate_io_procs([proc], self.mounts),
                        "IO failed on some of the clients")

    def run_on_all_mounts(build_cmd, fail_msg, ok_msg):
        """Run build_cmd(mountpoint) on every mount; assert exit code 0."""
        for mount_obj in self.mounts:
            ret, _, _ = g.run(mount_obj.client_system,
                              build_cmd(mount_obj.mountpoint))
            self.assertFalse(ret, fail_msg % mount_obj.mountpoint)
            g.log.info(ok_msg, mount_obj.mountpoint)

    # set cluster.quorum-type to auto
    options = {"cluster.quorum-type": "auto"}
    g.log.info("setting cluster.quorum-type to auto on "
               "volume %s", self.volname)
    ret = set_volume_options(self.mnode, self.volname, options)
    # a space was missing between "for" and "volume" in this message
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Successfully set %s for volume %s", options, self.volname)

    # Start IO on mounts and validate it
    g.log.info("Starting IO .....")
    run_script_on_first_mount("create_files -f 10 --base-file-name file")

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    num_subvols = len(subvols_dict['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s:", num_subvols)

    # bring 2-nd bricks offline for all the subvolumes
    offline_bricks = []
    for i, subvol_brick_list in enumerate(subvols_dict['volume_subvols']):
        g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
        bricks_to_bring_offline = subvol_brick_list[1]
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        # pass a list, like the other bring_bricks_offline callers do
        ret = bring_bricks_offline(self.volname,
                                   [bricks_to_bring_offline])
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)
        offline_bricks.append(bricks_to_bring_offline)

    # create new file named newfile0.txt
    g.log.info("Start creating new file on all mounts...")
    run_script_on_first_mount("create_files -f 1 --base-file-name newfile")

    # create directory user1
    g.log.info("Start creating directory on all mounts...")
    run_script_on_first_mount("create_deep_dir")

    # create h/w link to file
    g.log.info("Start creating hard link for file0.txt on mount")
    cmd = ("ln %s/file0.txt %s/file0.txt_hwlink"
           % (first_mount.mountpoint, first_mount.mountpoint))
    ret, _, _ = g.run(first_mount.client_system, cmd)
    self.assertFalse(
        ret, 'Failed to create hard link '
        'for file0.txt on %s' % first_mount.mountpoint)
    g.log.info("Hard link for file0.txt on %s is created successfully",
               first_mount.mountpoint)

    # create s/w link
    g.log.info("Start creating soft link for file1.txt on mount")
    cmd = ("ln -s %s/file1.txt %s/file1.txt_swlink"
           % (first_mount.mountpoint, first_mount.mountpoint))
    ret, _, _ = g.run(first_mount.client_system, cmd)
    self.assertFalse(
        ret, 'Failed to create soft link '
        'for file1.txt on %s' % first_mount.mountpoint)
    g.log.info("Soft link for file1.txt on %s is created successfully",
               first_mount.mountpoint)

    # append to file
    g.log.info("Appending to file1.txt on all mounts")
    run_on_all_mounts(
        lambda mnt: "cat %s/file0.txt >> %s/file1.txt" % (mnt, mnt),
        'Failed to append file1.txt on %s',
        "Appending for file1.txt on %s is successful")

    # modify the file
    g.log.info("Modifying file1.txt on all mounts")
    run_on_all_mounts(
        lambda mnt: "echo 'Modify Contents' > %s/file1.txt" % mnt,
        'Failed to modify file1.txt on %s',
        "Modifying for file1.txt on %s is successful")

    # truncate the file
    g.log.info("Truncating file1.txt on all mounts")
    run_on_all_mounts(
        lambda mnt: "truncate -s 0 %s/file1.txt" % mnt,
        'Failed to truncate file1.txt on %s',
        "Truncating for file1.txt on %s is successful")

    # read the file
    g.log.info("Starting reading files on all mounts")
    all_mounts_procs = []
    for mount_obj in self.mounts:
        cmd = ("python %s read %s"
               % (self.script_upload_path, mount_obj.mountpoint))
        proc = g.run_async(mount_obj.client_system, cmd,
                           user=mount_obj.user)
        all_mounts_procs.append(proc)

    # Validate IO
    self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                    "Reads failed on some of the clients")

    # stat on file
    g.log.info("stat on file1.txt on all mounts")
    run_on_all_mounts(
        lambda mnt: "stat %s/file1.txt" % mnt,
        'Failed to stat file1.txt on %s',
        "Stat for file1.txt on %s is successful")

    # stat on dir
    g.log.info("stat on directory on all mounts")
    run_on_all_mounts(
        lambda mnt: "python %s stat %s" % (self.script_upload_path, mnt),
        'Failed to stat directory on %s',
        "Stat for directory on %s is successful")

    # ls on mount point
    g.log.info("ls on mount point on all mounts")
    run_on_all_mounts(
        lambda mnt: "python %s ls %s" % (self.script_upload_path, mnt),
        'Failed to ls on %s',
        "ls for %s is successful")

    # bring back the bricks online for all subvolumes
    g.log.info("bringing up the brick : %s online", offline_bricks)
    ret = bring_bricks_online(self.mnode, self.volname, offline_bricks)
    self.assertTrue(
        ret, ("Failed to brought the brick %s online" % offline_bricks))
    g.log.info("Successfully brought the bricks")
def test_replace_brick_self_heal_io_in_progress(self):
    """
    Verify that bricks whose contents were wiped get fully self-healed.

    - Create directory on mount point and write files/dirs
    - Create another set of files (1500 files of 10k each)
    - Bring the selected bricks offline
    - Remove the contents of the offline bricks (simulating disk
      replacement)
    - Bring the bricks back online and wait for the volume processes
    - Start full volume heal and monitor it until completion
    - Verify that heal is complete and there is no split-brain
    - Compare arequals of the mount point and the bricks ("replicated")
      or of the bricks within each subvolume ("distributed-replicated")

    NOTE(review): the test name says IO is in progress while the bricks
    are replaced, but validate_io_procs() is called on the file creation
    before any brick is brought offline -- presumably that waits for the
    IO to finish; confirm whether this ordering is intended.
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    # Create dirs with files
    g.log.info('Creating dirs with file...')
    command = ("/usr/bin/env python %s create_deep_dirs_with_files "
               "-d 2 -l 2 -n 2 -f 10 %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))
    ret, _, err = g.run(self.mounts[0].client_system, command,
                        user=self.mounts[0].user)
    self.assertFalse(ret, err)
    g.log.info("IO is successful")

    # Create another set of files (1500 files of 10k each)
    self.all_mounts_procs = []
    g.log.info('Creating 1K files...')
    command = ("/usr/bin/env python %s create_files "
               "-f 1500 --fixed-file-size 10k %s"
               % (self.script_upload_path, self.mounts[0].mountpoint))
    proc = g.run_async(self.mounts[0].client_system, command,
                       user=self.mounts[0].user)
    self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Select bricks to bring offline
    bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
        self.mnode, self.volname))
    bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

    # Bring brick offline
    ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

    ret = are_bricks_offline(self.mnode, self.volname,
                             bricks_to_bring_offline)
    self.assertTrue(ret,
                    'Bricks %s are not offline' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s offline is successful',
               bricks_to_bring_offline)

    # Remove the content of the killed bricks
    for brick in bricks_to_bring_offline:
        brick_node, brick_path = brick.split(':')

        # Use '&&' so that 'rm -rf *' never runs in the login directory
        # when the 'cd' into the brick path fails.
        command = ('cd %s && rm -rf *' % brick_path)
        ret, _, err = g.run(brick_node, command)
        self.assertFalse(ret, err)
        g.log.info('Files are deleted on brick %s', brick)

    # Bring brick online
    ret = bring_bricks_online(self.mnode, self.volname,
                              bricks_to_bring_offline)
    self.assertTrue(
        ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
    g.log.info('Bringing bricks %s online is successful',
               bricks_to_bring_offline)

    # Wait for volume processes to be online
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.volname))
    g.log.info("Volume %s : All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal daemons are online")

    # Start healing
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Check arequals for "replicated"
    all_bricks = get_all_bricks(self.mnode, self.volname)
    if self.volume_type == "replicated":
        # Get arequal after bricks are online
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after successfully bringing '
                   'bricks online.')
        # The total is the text after the last ':' on the last output line
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Get arequal on bricks and compare with mount_point_total
        ret, arequals = collect_bricks_arequal(all_bricks)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for arequal in arequals:
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(
                mount_point_total, brick_total,
                'Arequals for mountpoint and brick '
                'are not equal')
            g.log.info('Arequals for mountpoint and brick are equal')

    # Check arequals for "distributed-replicated"
    if self.volume_type == "distributed-replicated":
        # Get the subvolumes
        subvols_dict = get_subvols(self.mnode, self.volname)
        subvols = subvols_dict['volume_subvols']
        g.log.info("Number of subvolumes in volume %s is %s",
                   self.volname, len(subvols))

        # Within each subvolume every brick must match the first brick
        for subvol_brick_list in subvols:
            # Get arequal for first brick
            ret, arequal = collect_bricks_arequal(subvol_brick_list[0])
            self.assertTrue(ret, 'Failed to get arequal on first brick')
            first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

            # Get arequal for every brick and compare with first brick
            ret, arequals = collect_bricks_arequal(subvol_brick_list)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(
                    first_brick_total, brick_total,
                    'Arequals for subvol and brick are '
                    'not equal')
                g.log.info('Arequals for subvol and brick are equal')
def test_client_side_quorum_with_fixed_for_cross2(self):
    """
    Test Script to verify the Client Side Quorum with fixed
    for cross 2 volume

    * Disable self heal daemon
    * set cluster.quorum-type to fixed.
    * start I/O ( write and read ) from the mount point - must succeed
    * Bring down brick1
    * start I/O ( write and read ) - must succeed
    * set the cluster.quorum-count to 1
    * start I/O ( write and read ) - must succeed
    * set the cluster.quorum-count to 2
    * start I/O ( write and read ) - read must pass, write will fail
    * bring back the brick1 online
    * start I/O ( write and read ) - must succeed
    * Bring down brick2
    * start I/O ( write and read ) - read must pass, write will fail
    * set the cluster.quorum-count to 1
    * start I/O ( write and read ) - must succeed
    * cluster.quorum-count back to 2 and cluster.quorum-type to auto
    * start I/O ( write and read ) - must succeed
    * Bring back brick2 online
    * Bring down brick1
    * start I/O ( write and read ) - read must pass, write will fail
    * set the quorum-type to none
    * start I/O ( write and read ) - must succeed
    """
    # pylint: disable=too-many-branches,too-many-statements

    def _set_options(options):
        """Set `options` on the volume; fail the test if that fails."""
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, ("Unable to set %s for volume %s"
                              % (options, self.volname)))
        g.log.info("Successfully set %s for volume %s",
                   options, self.volname)

    def _start_create_files(base_file_name):
        """Start creating 10 files on the first mount; return the procs."""
        cmd = ("python %s create_files "
               "-f 10 --base-file-name %s %s"
               % (self.script_upload_path, base_file_name,
                  self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system, cmd,
                           user=self.mounts[0].user)
        return [proc]

    def _write_must_succeed(base_file_name):
        """Create files on the first mount and assert the IO passed."""
        procs = _start_create_files(base_file_name)
        self.assertTrue(validate_io_procs(procs, self.mounts),
                        "IO failed on some of the clients")

    def _write_must_fail_with_rofs(base_file_name):
        """Create files on the first mount and assert it failed with EROFS."""
        procs = _start_create_files(base_file_name)
        g.log.info("Validating whether IO failed with Read Only File System")
        ret, _ = is_io_procs_fail_with_rofs(self, procs, self.mounts)
        self.assertTrue(ret, ("Unexpected Error and IO successful"
                              " on Read-Only File System"))
        g.log.info("EXPECTED Read-only file system in IO while creating file")

    def _read_must_succeed():
        """Run the read script on every mount and assert all reads passed."""
        g.log.info("Start reading files on all mounts")
        procs = []
        for mount_obj in self.mounts:
            cmd = ("python %s read "
                   "%s" % (self.script_upload_path, mount_obj.mountpoint))
            procs.append(g.run_async(mount_obj.client_system, cmd,
                                     user=mount_obj.user))
        self.assertTrue(validate_io_procs(procs, self.mounts),
                        "Reads failed on some of the clients")

    def _bring_offline(bricks):
        """Bring `bricks` offline; fail the test if that fails."""
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks)
        ret = bring_bricks_offline(self.volname, bricks)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks)

    # Disable self heal daemon
    _set_options({"cluster.self-heal-daemon": "off"})

    # set cluster.quorum-type to fixed
    _set_options({"cluster.quorum-type": "fixed"})

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mounts)
    _write_must_succeed("file")
    _read_must_succeed()

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols_dict = get_subvols(self.mnode, self.volname)
    subvols = subvols_dict['volume_subvols']
    g.log.info("Number of subvolumes in volume %s is %s",
               self.volname, len(subvols))

    # Collect the first and the second brick of every subvolume
    subvolumes_first_brick_list = []
    subvolumes_second_brick_list = []
    for i, subvol_brick_list in enumerate(subvols):
        g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
        subvolumes_first_brick_list.append(subvol_brick_list[0])
        subvolumes_second_brick_list.append(subvol_brick_list[1])

    # Bring down brick1 for all the subvolumes
    _bring_offline(subvolumes_first_brick_list)

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mounts)
    _write_must_succeed("second_file")
    _read_must_succeed()

    # set the cluster.quorum-count to 1
    _set_options({"cluster.quorum-count": "1"})

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on mount.....")
    _write_must_succeed("third_file")
    _read_must_succeed()

    # set the cluster.quorum-count to 2
    _set_options({"cluster.quorum-count": "2"})

    # start I/O ( write and read ) - read must pass, write will fail
    g.log.info("Starting IO on mount......")
    _write_must_fail_with_rofs("fourth_file")
    _read_must_succeed()

    # bring back the brick1 online for all subvolumes
    g.log.info("bringing up the bricks : %s online",
               subvolumes_first_brick_list)
    ret = bring_bricks_online(self.mnode, self.volname,
                              subvolumes_first_brick_list)
    self.assertTrue(ret, ("Failed to brought the bricks %s online"
                          % subvolumes_first_brick_list))
    g.log.info("Successfully brought the bricks %s online",
               subvolumes_first_brick_list)

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on mount.....")
    _write_must_succeed("fifth_file")
    _read_must_succeed()

    # Bring down brick2 for all the subvolumes
    _bring_offline(subvolumes_second_brick_list)

    # start I/O ( write and read ) - read must pass, write will fail
    g.log.info("Start creating files on mounts.....")
    _write_must_fail_with_rofs("sixth_file")
    _read_must_succeed()

    # set the cluster.quorum-count to 1
    _set_options({"cluster.quorum-count": "1"})

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on mount.....")
    _write_must_succeed("seventh_file")
    _read_must_succeed()

    # set cluster.quorum-type to auto and cluster.quorum-count back to 2
    _set_options({"cluster.quorum-type": "auto",
                  "cluster.quorum-count": "2"})

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on mount.....")
    _write_must_succeed("eigth_file")
    _read_must_succeed()

    # Bring back brick2 online for all the subvolumes
    g.log.info("bringing up the bricks : %s online",
               subvolumes_second_brick_list)
    ret = bring_bricks_online(self.mnode, self.volname,
                              subvolumes_second_brick_list)
    self.assertTrue(ret, ("Failed to brought the brick %s online"
                          % subvolumes_second_brick_list))
    g.log.info("Successfully brought the brick %s online",
               subvolumes_second_brick_list)

    # Bring down brick1 again for all the subvolumes
    _bring_offline(subvolumes_first_brick_list)

    # start I/O ( write and read ) - read must pass, write will fail
    g.log.info("Start creating files on mounts.....")
    _write_must_fail_with_rofs("ninth_file")
    _read_must_succeed()

    # set the quorum-type to none
    _set_options({"cluster.quorum-type": "none"})

    # start I/O ( write and read ) - must succeed
    g.log.info("Starting IO on mount.....")
    _write_must_succeed("tenth_file")
    _read_must_succeed()

    # bring back the bricks online for all subvolumes
    g.log.info("bringing up the brick : %s online",
               subvolumes_first_brick_list)
    ret = bring_bricks_online(self.mnode, self.volname,
                              subvolumes_first_brick_list)
    self.assertTrue(ret, ("Failed to brought the brick %s online"
                          % subvolumes_first_brick_list))
    g.log.info("Successfully brought the bricks")
def test_heal_full_after_deleting_files(self):
    """
    Verify that a full heal restores data deleted from the bricks.

    - Create IO
    - Calculate arequal from mount
    - Delete data from backend from the EC volume
      (the last brick of every subvolume is cleared)
    - Trigger heal full
    - Check if heal is completed
    - Check for split-brain
    - Calculate arequal checksum and compare it
    """
    # pylint: disable=too-many-locals,too-many-statements
    # Creating files on client side
    # NOTE(review): procs are appended to self.all_mounts_procs without
    # resetting it here -- presumably it is initialised in setUp; confirm.
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create dirs with file
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 20 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    g.log.info("Wait for IO to complete and validate IO ...")
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True
    g.log.info("IO is successful on all mounts")

    # Get areequal before deleting the files from brick
    g.log.info('Getting areequal before getting bricks offline...')
    ret, result_before_killing_procs = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting areequal before getting bricks offline '
               'is successful')

    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

    # Delete data from backend from the erasure node
    for subvol in subvols:
        erasure = subvol[-1]
        g.log.info('Clearing ec brick %s', erasure)
        node, brick_path = erasure.split(':')
        # Use '&&' so that 'rm -rf *' never runs in the login directory
        # when the 'cd' into the brick path fails.
        ret, _, err = g.run(node, 'cd %s/ && rm -rf *' % brick_path)
        if ret:
            # Only report an error when the command really failed
            g.log.error('Clearing ec brick %s is unsuccessful', erasure)
        self.assertFalse(ret, err)
        g.log.info('Clearing data from brick is successful')

    # Trigger heal full
    ret = trigger_heal_full(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Get areequal after healing
    g.log.info('Getting areequal after getting bricks online...')
    ret, result_after_healing = collect_mounts_arequal(self.mounts)
    self.assertTrue(ret, 'Failed to get arequal')
    g.log.info('Getting areequal after getting bricks online '
               'is successful')

    # Comparing areequals
    self.assertEqual(
        result_before_killing_procs, result_after_healing,
        'Arequals before clearing the bricks '
        'and after healing are not equal')
    g.log.info('Arequals before clearing the bricks '
               'and after healing are equal')
def test_client_side_quorum_with_auto_option_overwrite_fixed(self):
    """
    Test Script to verify the Client Side Quorum with auto option

    * check the default value of cluster.quorum-type
    * try to set any junk value to cluster.quorum-type
      other than {none,auto,fixed}
    * check the default value of cluster.quorum-count
    * set cluster.quorum-type to fixed and cluster.quorum-count to 1
    * start I/O from the mount point
    * kill 2 of the brick process from the each replica set.
    * set cluster.quorum-type to auto
    * create files again and expect the IO to fail
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Default of cluster.quorum-type must be "auto"
    quorum_type = "cluster.quorum-type"
    g.log.info("Getting %s for the volume %s", quorum_type, self.volname)
    current = get_volume_options(self.mnode, self.volname, quorum_type)
    self.assertIsNotNone(
        current,
        "Failed to get %s volume option for volume %s"
        % (quorum_type, self.volname))
    self.assertEqual(
        current[quorum_type], 'auto',
        "Default value for %s is not auto for volume %s"
        % (quorum_type, self.volname))
    g.log.info("Succesfully verified default value of %s for volume %s",
               quorum_type, self.volname)

    # Every junk value for cluster.quorum-type must be rejected
    for junk in ("123", "abcd", "fixxed", "Aauto"):
        junk_option = {"cluster.quorum-type": junk}
        g.log.info("setting %s for the volume %s",
                   junk_option, self.volname)
        accepted = set_volume_options(self.mnode, self.volname,
                                      junk_option)
        self.assertFalse(
            accepted,
            "Able to set junk value %s for volume %s"
            % (junk_option, self.volname))
        g.log.info("Expected: Unable to set junk value %s for volume %s",
                   junk_option, self.volname)

    # Default of cluster.quorum-count must be "(null)"
    quorum_count = "cluster.quorum-count"
    g.log.info("Getting %s for the volume %s", quorum_count, self.volname)
    current = get_volume_options(self.mnode, self.volname, quorum_count)
    self.assertIsNotNone(
        current,
        "Failed to get %s volume option for volume %s"
        % (quorum_count, self.volname))
    self.assertEqual(
        current[quorum_count], '(null)',
        "Default value for %s is not null for volume %s"
        % (quorum_count, self.volname))
    g.log.info("Successful in getting %s for the volume %s",
               quorum_count, self.volname)

    # Switch to a fixed quorum with a count of 1
    fixed_quorum = {"cluster.quorum-type": "fixed",
                    "cluster.quorum-count": "1"}
    g.log.info("setting %s for the volume %s", fixed_quorum, self.volname)
    applied = set_volume_options(self.mnode, self.volname, fixed_quorum)
    self.assertTrue(
        applied,
        "Unable to set %s for volume %s" % (fixed_quorum, self.volname))
    g.log.info("Successfully set %s for volume %s",
               fixed_quorum, self.volname)

    # First batch of files: must be created successfully
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mounts)
    first_mount = self.mounts[0]
    create_cmd = ("/usr/bin/env python %s create_files "
                  "-f 10 --base-file-name file %s"
                  % (self.script_upload_path, first_mount.mountpoint))
    rcode, _, stderr = g.run(first_mount.client_system, create_cmd)
    self.assertFalse(
        rcode,
        "IO failed on %s with '%s'" % (first_mount.client_system, stderr))

    # Fetch the subvolumes of the volume
    g.log.info("starting to get subvolumes for volume %s", self.volname)
    subvol_info = get_subvols(self.mnode, self.volname)
    subvol_count = len(subvol_info['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s is %s",
               self.volname, subvol_count)

    # Take the first two bricks of every subvolume offline
    for index, brick_list in enumerate(
            subvol_info['volume_subvols'][:subvol_count]):
        g.log.info("sub-volume %s brick list : %s", index, brick_list)
        victims = brick_list[0:2]
        g.log.info("Going to bring down the brick process for %s", victims)
        went_down = bring_bricks_offline(self.volname, victims)
        self.assertTrue(
            went_down,
            "Failed to bring down the bricks. Please "
            "check the log file for more details.")
        g.log.info("Brought down the brick process for %s successfully",
                   victims)

    # Second batch of files: must still be created successfully
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mounts)
    first_mount = self.mounts[0]
    create_cmd = ("/usr/bin/env python %s create_files "
                  "-f 10 --base-file-name second_file %s"
                  % (self.script_upload_path, first_mount.mountpoint))
    rcode, _, stderr = g.run(first_mount.client_system, create_cmd)
    self.assertFalse(
        rcode,
        "IO failed on %s with '%s'" % (first_mount.client_system, stderr))

    # Switch the quorum type back to auto
    auto_quorum = {"cluster.quorum-type": "auto"}
    g.log.info("setting %s for volume %s", auto_quorum, self.volname)
    applied = set_volume_options(self.mnode, self.volname, auto_quorum)
    self.assertTrue(
        applied,
        "Unable to set volume option %s for volume %s"
        % (auto_quorum, self.volname))
    g.log.info("Successfully set %s for volume %s",
               auto_quorum, self.volname)

    # Third batch: creation is expected to fail
    g.log.info("Starting IO on mountpount...")
    g.log.info("mounts: %s", self.mounts)
    first_mount = self.mounts[0]
    mkdir_cmd = ("mkdir %s/newdir && touch %s/newdir/myfile{1..3}.txt"
                 % (first_mount.mountpoint, first_mount.mountpoint))
    io_procs = [g.run_async(first_mount.client_system, mkdir_cmd,
                            user=first_mount.user)]

    # Validate that the IO failed the expected way
    g.log.info("Validating whether IO failed with "
               "Transport endpoint is not connected")
    failed_as_expected, _ = is_io_procs_fail_with_error(
        self, io_procs, self.mounts, self.mount_type)
    self.assertTrue(
        failed_as_expected,
        "Unexpected error and IO successful"
        " on not connected transport endpoint")
    g.log.info("EXPECTED: Transport endpoint is not connected"
               " while creating files")
def test_metadata_self_heal_client_side_heal(self):
    """
    Check client side healing of metadata changed while a brick is down.

    Flow of the test:
    1. Switch the self heal daemon off.
    2. Create 100 directories holding 5 files each on every mount.
    3. Compare arequal of the mount point and the bricks.
    4. Bring the first brick of every subvol offline.
    5. Change permissions, ownership and group of the directories and
       files from the mount point.
    6. Bring the offline bricks back and trigger heal from the mount.
    7. Verify the bricks that were offline still report the original
       mode/owner/group (755/root/root for directories, 644/root/root
       for files) and run the permission check helper on the bricks
       that stayed online.
    8. Turn metadata-self-heal on, trigger heal from the mount again
       and wait for the heal to complete.
    9. Verify there is no split-brain, run the permission check helper
       on the bricks that were offline and compare arequal again.
    """
    # Switch the self heal daemon off
    options_set = set_volume_options(self.mnode, self.volname,
                                     {"self-heal-daemon": "off"})
    self.assertTrue(options_set,
                    'Failed to set options self-heal-daemon '
                    'and metadata-self-heal to OFF')
    g.log.info("Options are set successfully")

    # Populate every mount with the directory tree used by the test
    self.test_meta_data_self_heal_folder = 'test_meta_data_self_heal'
    create_template = (
        "cd {0}/ ; mkdir {1} ; cd {1}/ ;for i in `seq 1 100` ; "
        "do mkdir dir.$i ; for j in `seq 1 5` ; "
        "do dd if=/dev/urandom of=dir.$i/file.$j bs=1K count=$j ; "
        "done ; done ;")
    for mount in self.mounts:
        create_cmd = create_template.format(
            mount.mountpoint, self.test_meta_data_self_heal_folder)
        self.all_mounts_procs.append(
            g.run_async(mount.client_system, create_cmd, user=mount.user))

    # Wait for the IO and make sure it succeeded
    self.validate_io_on_clients()

    # Mount point and bricks must agree before anything is changed
    self.check_arequal_from_mount_point_and_bricks()

    # First brick of each replica set goes down, the rest stay up
    replica_sets = get_subvols(self.mnode, self.volname)['volume_subvols']
    down_bricks = [replica_set[0] for replica_set in replica_sets]
    up_bricks = [brick
                 for replica_set in replica_sets
                 for brick in replica_set[1:]]

    # Take the selected bricks down and confirm they are down
    went_down = bring_bricks_offline(self.volname, down_bricks)
    self.assertTrue(went_down,
                    'Failed to bring bricks %s offline' % down_bricks)
    are_down = are_bricks_offline(self.mnode, self.volname, down_bricks)
    self.assertTrue(are_down, 'Bricks %s are not offline' % down_bricks)
    g.log.info('Bringing bricks %s offline is successful', down_bricks)

    # Metadata changes, applied one kind at a time:
    # permissions, then ownership, then group
    chmod_template = (
        'cd {}/{}; '
        'for i in `seq 1 100` ; do chmod 555 dir.$i ; done ; '
        'for i in `seq 1 50` ; do for j in `seq 1 5` ; '
        'do chmod 666 dir.$i/file.$j ; done ; done ; '
        'for i in `seq 51 100` ; do for j in `seq 1 5` ; '
        'do chmod 444 dir.$i/file.$j ; done ; done ;')
    chown_template = (
        'cd {}/{} ; '
        'for i in `seq 1 35` ; do chown -R qa_func dir.$i ; done ; '
        'for i in `seq 36 70` ; do chown -R qa_system dir.$i ; done ; '
        'for i in `seq 71 100` ; do chown -R qa_perf dir.$i ; done ;')
    chgrp_template = (
        'cd {}/{}; for i in `seq 1 100` ; '
        'do chgrp -R qa_all dir.$i ; done ;')
    for template in (chmod_template, chown_template, chgrp_template):
        self.all_mounts_procs = []
        for mount in self.mounts:
            change_cmd = template.format(
                mount.mountpoint, self.test_meta_data_self_heal_folder)
            self.all_mounts_procs.append(
                g.run_async(mount.client_system, change_cmd,
                            user=mount.user))
        self.io_validation_complete = False

        # Wait for this round of changes before starting the next one
        self.validate_io_on_clients()

    # Bring the bricks back
    came_up = bring_bricks_online(self.mnode, self.volname, down_bricks)
    self.assertTrue(came_up,
                    'Failed to bring bricks %s online' % down_bricks)
    g.log.info('Bringing bricks %s online is successful', down_bricks)

    # Trigger heal from mount point
    self.trigger_heal_from_mount_point()

    # Expected (value, stat key, failure message) for entries on the
    # bricks that were down: nothing must have been healed yet
    dir_expectations = (
        ('755', 'access', "Permissions mismatch on node {}"),
        ('root', 'username', "User id mismatch on node {}"),
        ('root', 'groupname', "Group id mismatch on node {}"))
    file_expectations = (
        ('644', 'access', "Permissions mismatch on node {} for file {}"),
        ('root', 'username', "User id mismatch on node {} for file {}"),
        ('root', 'groupname', "Group id mismatch on node {} for file {}"))

    for down_brick in down_bricks:
        host, brick_root = down_brick.split(':')
        tree_root = "{}/{}".format(
            brick_root, self.test_meta_data_self_heal_folder)
        dir_names = get_dir_contents(host, tree_root)
        self.assertIsNotNone(dir_names, "Dir list from "
                             "brick is empty")
        g.log.info("Successfully got dir list from bick")

        for dir_name in dir_names:
            # Directory metadata
            dir_path = "{}/{}".format(tree_root, dir_name)
            dir_stat = get_file_stat(host, dir_path)
            for expected, key, message in dir_expectations:
                self.assertEqual(expected, dir_stat[key],
                                 message.format(host))

            # Metadata of the files inside the directory
            file_names = get_dir_contents(host, dir_path)
            self.assertIsNotNone(file_names, "File list from "
                                 "brick is empty.")
            g.log.info("Successfully got file list from bick.")

            for file_name in file_names or ():
                file_stat = get_file_stat(
                    host, "{}/{}".format(dir_path, file_name))
                for expected, key, message in file_expectations:
                    self.assertEqual(expected, file_stat[key],
                                     message.format(host, file_name))

    # Bricks that stayed online are checked by the shared helper
    self.check_permssions_on_bricks(up_bricks)

    # Enable client side metadata heal
    options_set = set_volume_options(self.mnode, self.volname,
                                     {"metadata-self-heal": "on"})
    self.assertTrue(options_set, 'Failed to set options to ON.')
    g.log.info("Options are set successfully")

    # Trigger heal from mount point
    self.trigger_heal_from_mount_point()

    # Wait for the heal, then double check it really finished
    heal_done = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(heal_done, 'Heal has not yet completed')

    heal_done = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(heal_done, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # No split-brain is acceptable after the heal
    split_brain = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(split_brain, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Now the bricks that were down are checked by the shared helper
    self.check_permssions_on_bricks(down_bricks)

    # Mount point and bricks must agree again
    self.check_arequal_from_mount_point_and_bricks()
def test_afr_dir_entry_creation_with_subvol_down(self):
    """
    Directory entry creation and heal while one whole subvol is down.

    1. Use a distributed-replicated(3X3)/distributed-arbiter(3X(2+1))
       volume mounted on one client
    2. Kill the bricks of the 1st subvol
    3. Unmount and remount the volume on the same client
    4. Create the deep dir dir1/subdir1/deepdir1 from the mount point
    5. Create files under dir1/subdir1/deepdir1 for the subvols that
       are still online
    6. Bring all subvols up with volume start force
    7. Validate on the backend that only the subvols which were online
       have the directory from step 4
    8. Trigger heal from the client with "find . | xargs stat"
    9. Verify the directory now exists on all backend bricks
    10. Create a new dir (dir2) under dir1/subdir1/deepdir1
    11. Trigger rebalance and wait for its completion
    12. Check the backend bricks for all the directories
    13. Check that files can be created for the subvol that was offline
    """
    mount = self.mounts[0]
    client = mount.client_system
    mountpoint = mount.mountpoint

    # Take every brick of the first subvol down
    self.subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    down_subvol = self.subvols[0]
    went_down = bring_bricks_offline(self.volname, down_subvol)
    self.assertTrue(
        went_down, "Unable to bring {} bricks offline".format(down_subvol))

    # Confirm those bricks really are offline
    are_down = are_bricks_offline(self.mnode, self.volname, down_subvol)
    self.assertTrue(are_down,
                    "Bricks {} are still online".format(down_subvol))

    # Remount the volume on the same client
    rcode, _, _ = umount_volume(client, mountpoint)
    self.assertFalse(rcode, "Failed to unmount volume.")
    rcode, _, _ = mount_volume(self.volname, self.mount_type, mountpoint,
                               self.mnode, client)
    self.assertFalse(rcode, "Failed to remount volume.")
    g.log.info('Successfully umounted and remounted volume.')

    # Without this pause the file creation below fails
    sleep(2)

    # Create the deep directory on the mount point
    directory1 = "dir1/subdir1/deepdir1"
    path = mountpoint + "/" + directory1
    created = mkdir(client, path, parents=True)
    self.assertTrue(created, "Directory {} creation failed".format(path))

    # Create files for the 2nd and 3rd subvols, which are online
    brickobject = create_brickobjectlist(self.subvols, directory1)
    self.assertIsNotNone(brickobject, "Failed to get brick object list")
    for online_index in (1, 2):
        self._create_number_of_files_on_the_subvol(
            brickobject[online_index], directory1, 5, mountpath=path)

    # volume start force brings the killed bricks back
    rcode, _, err = volume_start(self.mnode, self.volname, force=True)
    self.assertEqual(rcode, 0, err)
    g.log.info("Volume: %s started successfully", self.volname)

    # Every volume process has to be online now
    all_online = verify_all_process_of_volume_are_online(self.mnode,
                                                         self.volname)
    self.assertTrue(
        all_online, "Few process after volume start are offline for "
        "volume: {}".format(self.volname))

    # Before heal the directory exists only on subvols that were online
    backend_dir1 = "/" + directory1
    for subvol in self.subvols:
        was_online = subvol != down_subvol
        self._check_file_exists(subvol, backend_dir1, exists=was_online)

    # Trigger heal from the client
    heal_cmd = "cd {}; find . | xargs stat".format(mountpoint)
    rcode, _, err = g.run(client, heal_cmd)
    self.assertEqual(rcode, 0, err)

    # After heal the directory exists on every subvol
    for subvol in self.subvols:
        self._check_file_exists(subvol, backend_dir1, exists=True)

    # Create dir2 under dir1/subdir1/deepdir1
    directory2 = backend_dir1 + '/dir2'
    path = mountpoint + directory2
    created = mkdir(client, path, parents=True)
    self.assertTrue(created, "Directory {} creation failed".format(path))

    # Run rebalance to completion
    rcode, _, err = rebalance_start(self.mnode, self.volname)
    self.assertEqual(rcode, 0, err)
    g.log.info("Rebalance on volume %s started successfully",
               self.volname)
    rebalanced = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(
        rebalanced, "Rebalance didn't complete on the volume: "
        "{}".format(self.volname))

    # Both directories must exist on all bricks of every subvol
    for subvol in self.subvols:
        for backend_dir in (backend_dir1, directory2):
            self._check_file_exists(subvol, backend_dir, exists=True)

    # Files must now be creatable for the subvol that was offline
    self._create_number_of_files_on_the_subvol(
        brickobject[0], directory1, 5, mountpath=path)
def test_file_access(self):
    """
    Test access to a renamed file while one of its subvols is down.

    Steps:
    - Create a file on the mount and find the subvol its name hashes to.
    - Rename the file to a name that hashes to a different subvol; the
      test expects mode 1000 (linkto file) on the new hashed subvol and
      a different mode (data file) on the original hashed subvol.
    - Bring the new hashed subvol down and stat the file through a
      fresh temporary mount; the stat must succeed.
    - Bring that subvol back, bring down the subvol the source name
      hashed to, and stat the file from the client; the stat must fail.
    """
    # pylint: disable=protected-access
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-statements
    mount_obj = self.mounts[0]
    mountpoint = mount_obj.mountpoint

    # get subvol list
    subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
    self.assertIsNotNone(subvols, "failed to get subvols")

    # create a file
    srcfile = mountpoint + '/testfile'
    ret, _, err = g.run(self.clients[0], "touch %s" % srcfile)
    self.assertEqual(ret, 0, "File creation failed for %s err %s"
                     % (srcfile, err))
    g.log.info("testfile creation successful")

    # find hashed subvol
    srchashed, scount = find_hashed_subvol(subvols, "/", "testfile")
    self.assertIsNotNone(srchashed, "could not find srchashed")
    g.log.info("hashed subvol for srcfile %s subvol count %s",
               srchashed._host, str(scount))

    # rename the file such that the new name hashes to a new subvol
    tmp = find_new_hashed(subvols, "/", "testfile")
    self.assertIsNotNone(tmp, "could not find new hashed for dstfile")
    g.log.info("dst file name : %s dst hashed_subvol : %s "
               "subvol count : %s", tmp.newname,
               tmp.hashedbrickobject._host, str(tmp.subvol_count))

    dstname = str(tmp.newname)
    dstfile = mountpoint + "/" + dstname
    dsthashed = tmp.hashedbrickobject
    dcount = tmp.subvol_count
    ret, _, err = g.run(self.clients[0], "mv %s %s" % (srcfile, dstfile))
    self.assertEqual(ret, 0, "rename failed for %s err %s"
                     % (srcfile, err))
    g.log.info("cmd: mv srcfile dstfile successful")

    # check that on dsthash_subvol the file is a linkto file
    filepath = dsthashed._fqpath + "/" + dstname
    file_stat = get_file_stat(dsthashed._host, filepath)
    self.assertEqual(file_stat['access'], "1000",
                     "Expected file permission to be 1000 on subvol %s"
                     % dsthashed._host)
    g.log.info("dsthash_subvol has the expected linkto file")

    # check on srchashed the file is a data file
    filepath = srchashed._fqpath + "/" + dstname
    file_stat = get_file_stat(srchashed._host, filepath)
    self.assertNotEqual(file_stat['access'], "1000",
                        "Expected file permission not to be 1000 on "
                        "subvol %s" % srchashed._host)

    # Bring down the hashed subvol of dstfile(linkto file)
    ret = bring_bricks_offline(self.volname, subvols[dcount])
    self.assertTrue(ret, 'Error in bringing down subvolume %s'
                    % subvols[dcount])
    g.log.info('dst subvol %s is offline', subvols[dcount])

    # Need to access the file through a fresh lookup through a new mount
    # create a new dir(choosing server to do a mount)
    tmp_mount = "/mnt"
    ret, _, _ = g.run(self.mnode, "mkdir -p %s" % tmp_mount)
    self.assertEqual(ret, 0, 'mkdir of mount dir failed')
    g.log.info("mkdir of mount dir succeeded")

    # do a temp mount; mount_volume returns (ret, out, err), so the exit
    # status has to be unpacked - a non-empty tuple is always truthy
    ret, _, err = mount_volume(self.volname, self.mount_type, tmp_mount,
                               self.mnode, self.mnode)
    self.assertEqual(ret, 0, 'temporary mount failed: %s' % err)
    g.log.info("temporary mount succeeded")

    try:
        # check that file is accessible (stat)
        ret, _, _ = g.run(self.mnode,
                          "stat %s/%s" % (tmp_mount, dstname))
        self.assertEqual(ret, 0, 'stat error on for dst file %s'
                         % dstname)
        g.log.info("stat on %s/%s successful", tmp_mount, dstname)
    finally:
        # cleanup temporary mount even when the stat check failed
        umount_ret, _, umount_err = umount_volume(self.mnode, tmp_mount)
    self.assertEqual(umount_ret, 0,
                     'unmount of temporary mount failed: %s' % umount_err)
    g.log.info("umount successful")

    # Bring up the hashed subvol
    ret = bring_bricks_online(self.mnode, self.volname, subvols[dcount],
                              bring_bricks_online_methods=None)
    self.assertTrue(ret, "Error in bringing back subvol online")
    g.log.info('Subvol is back online')

    # now bring down the cached subvol
    ret = bring_bricks_offline(self.volname, subvols[scount])
    self.assertTrue(ret, 'Error in bringing down subvolume %s'
                    % subvols[scount])
    g.log.info('target subvol %s is offline', subvols[scount])

    # file access should fail
    ret, _, _ = g.run(self.clients[0], "stat %s" % dstfile)
    self.assertEqual(ret, 1, 'Expected stat of %s to fail with exit '
                     'status 1, got %s' % (dstfile, ret))
    g.log.info("dstfile access failed as expected")
def test_snap_self_heal(self):
    """
    Verify self heal on a volume cloned from a snapshot.

    Steps:
    1. Create a snapshot of the volume and activate it
    2. Clone the snapshot, start the clone and mount it
    3. Perform I/O on the clone mount
    4. Bring down few bricks of the clone without affecting the
       volume or cluster
    5. Perform I/O again
    6. Bring the offline bricks back online and wait for all volume
       processes of the clone to be online
    7. Wait for heal to complete on the clone
    8. Validate with arequal that all bricks of each subvol match
    """
    # pylint: disable=too-many-statements, too-many-locals

    # Creating snapshot:
    g.log.info("Starting to Create snapshot")
    ret, _, _ = snap_create(self.mnode, self.volname, self.snap)
    self.assertEqual(
        ret, 0, ("Failed to create snapshot for volume %s" % self.volname))
    g.log.info("Snapshot %s created successfully for volume %s",
               self.snap, self.volname)

    # Activating snapshot
    g.log.info("Starting to Activate Snapshot")
    ret, _, _ = snap_activate(self.mnode, self.snap)
    self.assertEqual(ret, 0,
                     ("Failed to Activate snapshot %s" % self.snap))
    g.log.info("Snapshot %s activated successfully", self.snap)

    # snapshot list
    ret, _, _ = snap_list(self.mnode)
    self.assertEqual(ret, 0, ("Failed to list all the snapshot"))
    g.log.info("Snapshot list command was successful")

    # Creating a Clone volume from snapshot:
    g.log.info("Starting to Clone volume from Snapshot")
    ret, _, _ = snap_clone(self.mnode, self.snap, self.clone)
    self.assertEqual(ret, 0, ("Failed to clone %s from snapshot %s"
                              % (self.clone, self.snap)))
    g.log.info("%s created successfully", self.clone)

    # start clone volumes
    g.log.info("start to created clone volumes")
    ret, _, _ = volume_start(self.mnode, self.clone)
    self.assertEqual(ret, 0, "Failed to start clone %s" % self.clone)
    g.log.info("clone volume %s started successfully", self.clone)

    # Mounting a clone volume
    g.log.info("Mounting a clone volume")
    ret, _, _ = mount_volume(self.clone, self.mount_type, self.mount1,
                             self.mnode, self.clients[0])
    self.assertEqual(ret, 0,
                     "Failed to mount clone Volume %s" % self.clone)
    g.log.info("Clone volume %s mounted Successfully", self.clone)

    # Checking cloned volume mounted or not
    ret = is_mounted(self.clone, self.mount1, self.mnode,
                     self.clients[0], self.mount_type)
    self.assertTrue(
        ret, "Failed to mount clone volume on mount point: %s"
        % self.mount1)
    g.log.info("clone Volume %s mounted on %s", self.clone, self.mount1)

    # write files on the clone mount; g.run is synchronous, so the exit
    # status is checked before success is logged
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mount1)
    cmd = ("python %s create_files "
           "-f 10 --base-file-name file %s" % (self.script_upload_path,
                                               self.mount1))
    ret, _, err = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "IO failed on %s with '%s'"
                     % (self.clients[0], err))
    g.log.info("Successful in creating I/O on mounts")

    # get the bricks from the volume
    g.log.info("Fetching bricks for the volume : %s", self.clone)
    bricks_list = get_all_bricks(self.mnode, self.clone)
    g.log.info("Brick List : %s", bricks_list)

    # Select bricks to bring offline. They are selected from the clone
    # because every later brick operation targets the clone. The result
    # is a real list (not a one-shot filter iterator) since it is
    # logged, passed to bring_bricks_offline, searched with assertIn
    # and passed to bring_bricks_online.
    g.log.info("Starting to bring bricks to offline")
    bricks_to_bring_offline_dict = select_bricks_to_bring_offline(
        self.mnode, self.clone)
    self.assertIsNotNone(
        bricks_to_bring_offline_dict,
        "Failed to select bricks to bring offline for volume %s"
        % self.clone)
    bricks_to_bring_offline = [
        brick
        for key in ('hot_tier_bricks', 'cold_tier_bricks', 'volume_bricks')
        for brick in (bricks_to_bring_offline_dict.get(key) or [])
        if brick]
    self.assertTrue(
        bricks_to_bring_offline,
        "No bricks selected to bring offline for volume %s" % self.clone)
    g.log.info("Brick to bring offline: %s ", bricks_to_bring_offline)
    ret = bring_bricks_offline(self.clone, bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring the bricks offline")
    g.log.info("Successful in bringing bricks: %s offline",
               bricks_to_bring_offline)

    # Offline Bricks list
    offline_bricks = get_offline_bricks_list(self.mnode, self.clone)
    self.assertIsNotNone(
        offline_bricks, "Failed to get offline bricklist"
        "for volume %s" % self.clone)
    for bricks in offline_bricks:
        self.assertIn(bricks, bricks_to_bring_offline,
                      "Failed to validate "
                      "Bricks offline")
    g.log.info("Bricks Offline: %s", offline_bricks)

    # Online Bricks list
    online_bricks = get_online_bricks_list(self.mnode, self.clone)
    self.assertIsNotNone(
        online_bricks, "Failed to get online bricks"
        " for volume %s" % self.clone)
    g.log.info("Bricks Online: %s", online_bricks)

    # write files mountpoint while some bricks are down
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mount1)
    cmd = ("python %s create_files "
           "-f 10 --base-file-name file %s" % (self.script_upload_path,
                                               self.mount1))
    ret, _, err = g.run(self.clients[0], cmd)
    self.assertEqual(ret, 0, "IO failed on %s with '%s'"
                     % (self.clients[0], err))
    g.log.info("Successful in creating I/O on mounts")

    # Bring all bricks online
    g.log.info("bring all bricks online")
    ret = bring_bricks_online(self.mnode, self.clone,
                              bricks_to_bring_offline)
    self.assertTrue(ret, "Failed to bring bricks online")
    g.log.info("Successful in bringing all bricks online")

    # Validate Bricks are online
    g.log.info("Validating all bricks are online")
    ret = are_bricks_online(self.mnode, self.clone, bricks_list)
    self.assertTrue(ret, "Failed to bring all the bricks online")
    g.log.info("bricks online: %s", bricks_list)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.clone)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.clone))
    g.log.info(
        "Successful in waiting for volume %s processes to be "
        "online", self.clone)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.clone)
    self.assertTrue(
        ret, ("Volume %s : All process are not online" % self.clone))
    g.log.info("Volume %s : All process are online", self.clone)

    # wait for the heal process to complete on the clone, which is the
    # volume whose bricks were brought down
    g.log.info("waiting for heal process to complete")
    ret = monitor_heal_completion(self.mnode, self.clone)
    self.assertTrue(ret, "Failed to complete the heal process")
    g.log.info("Successfully completed heal process")

    # Check areequal
    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.clone)
    subvols = get_subvols(self.mnode, self.clone)
    num_subvols = len(subvols['volume_subvols'])
    g.log.info("Number of subvolumes in volume %s: %s",
               self.clone, num_subvols)

    # Get arequals and compare: within a subvol every brick must report
    # the same total as the first brick of that subvol
    g.log.info("Starting to Compare areequals")
    arequal_cmd = ('arequal-checksum -p %s '
                   '-i .glusterfs -i .landfill -i .trashcan')
    for subvol_brick_list in subvols['volume_subvols']:
        first_brick_total = None
        for brick in subvol_brick_list:
            node, brick_path = brick.split(':')
            ret, brick_arequal, _ = g.run(node, arequal_cmd % brick_path)
            self.assertFalse(ret,
                             'Failed to get arequal on brick %s' % brick)
            g.log.info('Getting arequal for %s is successful', brick)
            # The total is the value after the last ':' on the last line
            brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
            if first_brick_total is None:
                first_brick_total = brick_total
            self.assertEqual(
                first_brick_total, brick_total,
                'Arequals for subvol and %s are not equal' % brick)
            g.log.info('Arequals for subvol and %s are equal', brick)
    g.log.info('All arequals are equal for distributed-replicated')
def test_resolving_meta_data(self):
    """
    Metadata changes made with alternating bricks down must heal
    without leaving the volume in split-brain.

    - Create a file test_file.txt
    - Find out which bricks the file resides on and kill the arbiter
      brick (taken to be the last brick holding the file)
    - Modify the permissions of the file
    - Bring back the killed brick
    - Kill the first brick holding the file
    - Modify the permissions of the file
    - Bring back the killed brick
    - Trigger heal
    - Check if heal is completed
    - Check for split-brain
    """
    # pylint: disable=too-many-locals,too-many-statements

    # Creating files on client side
    file_to_create = 'test_file.txt'
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create file
        g.log.info('Creating file...')
        command = ("cd %s ; "
                   "touch %s" % (mount_obj.mountpoint, file_to_create))
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
    self.io_validation_complete = False

    # Validate IO
    self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                    "IO failed on some of the clients")
    self.io_validation_complete = True

    # get bricks with file; compare whole names from the ls output so a
    # longer name merely containing the file name does not match
    g.log.info('Getting bricks with file...')
    subvols_dict = get_subvols(self.mnode, self.volname)
    brick_list_with_file = []
    for subvol in subvols_dict['volume_subvols']:
        for brick in subvol:
            node, brick_path = brick.split(':')
            _, brick_file_list, _ = g.run(node, 'ls %s' % brick_path)
            if file_to_create in brick_file_list.split():
                brick_list_with_file.append(brick)
    g.log.info('Bricks with file: %s', brick_list_with_file)
    # Without this check the indexing below fails with a bare IndexError
    self.assertTrue(brick_list_with_file,
                    'File %s was not found on any brick' % file_to_create)

    # Round 1: arbiter brick (last brick holding the file) offline,
    #          permissions changed to 600, brick back online.
    # Round 2: 1-st data brick offline, permissions changed to 644,
    #          brick back online.
    rounds = ((brick_list_with_file[-1], '600'),
              (brick_list_with_file[0], '644'))
    for brick_to_kill, file_mode in rounds:
        # Bring the brick offline
        bricks_to_bring_offline = [brick_to_kill]
        g.log.info('Bringing bricks %s offline...',
                   bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Modify the data
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Modifying data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)
            # Modify the permissions
            g.log.info('Modifying the permissions of the file...')
            command = ("cd %s ; "
                       "chmod %s %s" % (mount_obj.mountpoint, file_mode,
                                        file_to_create))
            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients")
        self.io_validation_complete = True

        # Bring the brick back online
        g.log.info('Bringing bricks %s online...',
                   bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

    # Start healing
    ret = trigger_heal(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not started')
    g.log.info('Healing is started')

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')
def test_custom_xattr_with_subvol_down_dir_exists(self):
    """
    Custom xattr handling on existing directories with a subvol down.

    Steps (the part after step 8 is driven by the local helper
    _create_xattr_check_self_heal called at the end of this method):
    1) Create directories from mount point.
    2) Bring one or more(not all) dht sub-volume(s) down by killing
       processes on that server
    3) Create a custom xattr for dir hashed to down sub-volume and also
       for another dir not hashing to down sub-volumes
       # setfattr -n user.foo -v bar2 <dir>
    4) Verify that custom xattr for directory is displayed on mount
       point and bricks for both directories
       # getfattr -n user.foo <dir>
       # getfattr -n user.foo <brick_path>/<dir>
    5) Modify custom xattr value and verify that custom xattr for
       directory is displayed on mount point and all up bricks
       # setfattr -n user.foo -v ABC <dir>
    6) Verify that custom xattr is not displayed once you remove it on
       mount point and all up bricks
    7) Verify that mount point shows pathinfo xattr for dir hashed to
       down sub-volume and also for dir not hashed to down sub-volumes
       # getfattr -n trusted.glusterfs.pathinfo <dir>
    8) Again create a custom xattr for dir not hashing to down
       sub-volumes
       # setfattr -n user.foo -v star1 <dir>
    9) Bring up the sub-volumes
    10) Execute lookup on parent directory of both <dir> from mount
        point
    11) Verify Custom extended attributes for dir1 on all bricks
    """
    # pylint: disable=protected-access
    # Create dir1 on client0
    self._create_dir(dir_name="dir1")

    # Get subvol list
    subvols = (get_subvols(self.mnode, self.volname))['volume_subvols']
    self.assertIsNotNone(subvols, "Failed to get subvols")

    # Finding a dir name such that it hashes to a different subvol
    newhash = find_new_hashed(subvols, "/", "dir1")
    # Fail with a clear message instead of an AttributeError below
    self.assertIsNotNone(newhash, "Could not find a new name hashing to "
                         "a different subvol than dir1")
    new_name = str(newhash.newname)
    new_subvol_count = newhash.subvol_count

    # Create a dir with the new name
    self._create_dir(dir_name=new_name)

    # Kill the brick/subvol to which the new dir hashes
    ret = bring_bricks_offline(
        self.volname, subvols[new_subvol_count])
    self.assertTrue(ret, 'Error in bringing down subvolume %s'
                    % subvols[new_subvol_count])
    g.log.info('DHT subvol %s is offline', subvols[new_subvol_count])

    # Set the xattr on dir hashing to down subvol; this must fail
    ret = set_fattr(self.client, '{}/{}'.format(self.m_point, new_name),
                    'user.foo', 'bar2')
    self.assertFalse(ret, "Unexpected: custom xattr set successfully"
                     " for dir hashing to down subvol")
    g.log.info("Expected: Failed to set xattr on dir:%s"
               " which hashes to down subvol due to error: Transport"
               " endpoint not connected", new_name)

    # Check if the trusted.glusterfs.pathinfo is displayed
    # for dir hashing to down subvol on mountpoint
    ret = get_fattr(self.client, '{}/{}'.format(
        self.m_point, new_name), 'trusted.glusterfs.pathinfo')
    self.assertIsNotNone(ret, "Failed to get the xattr"
                         " on:{}".format(self.client))
    g.log.info("The xattr trusted.glusterfs.pathinfo"
               " is displayed on mointpoint for %s", new_name)

    # Set the xattr again on the dir hashing to down subvol; this must
    # fail as well.
    # NOTE(review): step 8 of the docstring speaks about the dir NOT
    # hashing to the down subvol, while new_name hashes to it - confirm
    # which one is intended.
    ret = set_fattr(self.client, '{}/{}'.format(self.m_point, new_name),
                    'user.foo', 'star1')
    self.assertFalse(ret, "Unexpected: custom xattr set successfully"
                     " for dir hashing to down subvol")
    g.log.info("Expected: Tansport endpoint not connected")

    # Calling the local function
    self._create_xattr_check_self_heal()
def test_client_side_quorum_with_fixed_for_cross3(self):
    """
    Test Script to verify the Client Side Quorum with fixed
    for cross 3 volume

    * Disable self heal daemon
    * set cluster.quorum-type to fixed.
    * start I/O (write and read) from the mount point - must succeed
    * Bring down brick1
    * start I/O (write and read) - must succeed
    * Bring down brick2
    * start I/O (write and read) - must succeed
    * set the cluster.quorum-count to 1
    * start I/O (write and read) - must succeed
    * set the cluster.quorum-count to 2
    * start I/O (write and read) - read and write will fail
    * bring back the brick1 online
    * start I/O (write and read) - must succeed
    * Bring back brick2 online
    * start I/O (write and read) - must succeed
    * set cluster.quorum-type to auto
    * start I/O (write and read) - must succeed
    * Bring down brick1 and brick2
    * start I/O (write and read) - read and write will fail
    * set the cluster.quorum-count to 1
    * start I/O (write and read) - read and write will fail
    * set the cluster.quorum-count to 3
    * start I/O (write and read) - read and write will fail
    * set the quorum-type to none
    * start I/O (write and read) - must succeed

    All I/O is driven from the first mount (self.mounts[0]) only.
    """
    # pylint: disable=too-many-locals,too-many-statements,too-many-branches
    mount = self.mounts[0]
    mountpoint = mount.mountpoint

    cat_cmd = "cat %s/file1.txt" % mountpoint
    dd_cmd = "dd if=/dev/urandom of=%s/%s bs=1M count=1"
    io_succeeded_msg = ("Unexpected error and IO successful"
                        " on not connected transport endpoint")
    write_denied_msg = ("EXPECTED: Transport endpoint is not connected"
                        " while creating files")
    read_denied_msg = ("EXPECTED: Transport endpoint is not connected"
                       " while reading file")

    def set_option(options):
        # Apply a volume option dict and assert that it was accepted.
        g.log.info("setting %s for the volume %s", options, self.volname)
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, "Unable to set %s for volume %s"
                        % (options, self.volname))
        g.log.info("Successfully set %s for volume %s",
                   options, self.volname)

    def start_io(cmd):
        # Launch cmd asynchronously on the first mount; return proc list.
        return [g.run_async(mount.client_system, cmd, user=mount.user)]

    def write_files(base_name):
        # Create 10 files named <base_name>N and assert the IO succeeds.
        g.log.info("Starting IO on mountpoint %s", mountpoint)
        cmd = ("/usr/bin/env python %s create_files "
               "-f 10 --base-file-name %s %s"
               % (self.script_upload_path, base_name, mountpoint))
        self.assertTrue(validate_io_procs(start_io(cmd), self.mounts),
                        "IO failed on mountpoint %s" % mountpoint)

    def read_files(start_msg="Start reading files on mountpoint %s",
                   fail_msg=None):
        # Read everything under the mountpoint and assert it succeeds.
        g.log.info(start_msg, mountpoint)
        cmd = "/usr/bin/env python %s read %s" % (self.script_upload_path,
                                                  mountpoint)
        if fail_msg is None:
            fail_msg = "Reads failed on mountpoint %s" % mountpoint
        self.assertTrue(validate_io_procs(start_io(cmd), self.mounts),
                        fail_msg)

    def expect_io_failure(start_msg, cmd, unexpected_msg, outcome_msg):
        # Run cmd and assert that it fails with the quorum-loss error.
        g.log.info(start_msg, mountpoint)
        procs = start_io(cmd)
        g.log.info("Validating whether IO failed with "
                   "Transport endpoint is not connected")
        ret, _ = is_io_procs_fail_with_error(self, procs, self.mounts,
                                             self.mount_type)
        self.assertTrue(ret, unexpected_msg)
        g.log.info(outcome_msg)

    def expect_read_failure():
        # Reading an existing file must fail while quorum is not met.
        expect_io_failure("Start reading files on mountpoint %s",
                          cat_cmd, io_succeeded_msg, read_denied_msg)

    def bring_offline(bricks):
        # Kill the given brick(s) and assert that the call succeeded.
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks)
        ret = bring_bricks_offline(self.volname, bricks)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks)

    def bring_online(bricks):
        # Restart the given bricks and assert that the call succeeded.
        g.log.info("bringing up the brick : %s online", bricks)
        ret = bring_bricks_online(
            self.mnode, self.volname, bricks,
            bring_bricks_online_methods='glusterd_restart')
        self.assertTrue(ret, "Failed to brought the brick %s online"
                        % bricks)
        g.log.info("Successfully brought the brick %s online", bricks)

    def bring_offline_in_each_subvol(pick):
        # For every subvol, bring down pick(brick_list); return the picks.
        taken_down = []
        for index, brick_list in enumerate(subvols):
            g.log.info("sub-volume %s brick list : %s", index, brick_list)
            target = pick(brick_list)
            bring_offline(target)
            taken_down.append(target)
        return taken_down

    # Disable self heal daemon
    set_option({"cluster.self-heal-daemon": "off"})

    # set cluster.quorum-type to fixed
    set_option({"cluster.quorum-type": "fixed"})

    # start I/O (write and read) - must succeed
    write_files("file")
    read_files(start_msg="Start reading files on %s",
               fail_msg="Reads failed on some of the clients")

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    # Previously the count was printed where the volume name belongs.
    g.log.info("Number of subvolumes in volume %s: %s",
               self.volname, len(subvols))

    # bring down brick1 for all the subvolumes
    offline_brick1_from_replicasets = bring_offline_in_each_subvol(
        lambda bricks: bricks[0])

    # start I/O (write and read) - must succeed
    write_files("testfile")
    read_files()

    # bring down brick2 for all the subvolumes
    offline_brick2_from_replicasets = bring_offline_in_each_subvol(
        lambda bricks: bricks[1])

    # start I/O (write and read) - must succeed
    write_files("newfile")
    read_files()

    # set the cluster.quorum-count to 1
    set_option({"cluster.quorum-count": "1"})

    # start I/O (write and read) - must succeed
    write_files("filename")
    read_files()

    # set the cluster.quorum-count to 2
    set_option({"cluster.quorum-count": "2"})

    # start I/O (write and read) - read and write will fail
    expect_io_failure(
        "Starting IO on mountpoint %s",
        dd_cmd % (mountpoint, "test_file"),
        ("Unexpected Error and IO successful"
         " on not connected transport endpoint"),
        ("EXPECTED: Transport endpoint is not connected"
         " while creating file"))
    expect_read_failure()

    # bring back the brick1 online for all subvolumes
    bring_online(offline_brick1_from_replicasets)

    # start I/O (write and read) - must succeed
    write_files("newfilename")
    read_files()

    # Bring back brick2 online
    bring_online(offline_brick2_from_replicasets)

    # start I/O (write and read) - must succeed
    write_files("textfile")
    read_files()

    # set cluster.quorum-type to auto
    set_option({"cluster.quorum-type": "auto"})

    # start I/O (write and read) - must succeed
    write_files("newtextfile")
    read_files()

    # bring down brick1 and brick2 for all the subvolumes
    bring_offline_in_each_subvol(lambda bricks: bricks[0:2])

    # start I/O (write and read) - read and write will fail
    expect_io_failure("Start creating file on mountpoint %s",
                      dd_cmd % (mountpoint, "new_test_file"),
                      io_succeeded_msg, write_denied_msg)
    g.log.info("Starting reading file")
    expect_read_failure()

    # set the cluster.quorum-count to 1
    set_option({"cluster.quorum-count": "1"})

    # start I/O (write and read) - read and write will fail
    expect_io_failure("Start creating files on mountpoint %s",
                      dd_cmd % (mountpoint, "new_test_file"),
                      io_succeeded_msg, write_denied_msg)
    expect_read_failure()

    # set the cluster.quorum-count to 3
    set_option({"cluster.quorum-count": "3"})

    # start I/O (write and read) - read and write will fail
    expect_io_failure("Start creating files on mountpoint %s",
                      dd_cmd % (mountpoint, "new_test_file"),
                      io_succeeded_msg, write_denied_msg)
    expect_read_failure()

    # set the quorum-type to none
    set_option({"cluster.quorum-type": "none"})

    # start I/O (write and read) - must succeed
    write_files("lastfile")
    read_files()
def test_client_side_quorum_with_auto_option(self):
    """
    Test Script to verify the Client Side Quorum with auto option

    * set cluster.quorum-type to auto.
    * start I/O from the mount point.
    * kill 2 of the brick processes from each and every replica set
      (only 1 when the replica set has 2 bricks)
    * perform ops: every modifying operation must fail with
      "Read-only file system", every read-only operation must succeed
    """
    rofs_error = "Read-only file system"

    def start_on_all_mounts(script_args):
        # Start "python <script> <script_args> <mountpoint>" on each mount.
        return [
            g.run_async(mount_obj.client_system,
                        "python %s %s %s" % (self.script_upload_path,
                                             script_args,
                                             mount_obj.mountpoint),
                        user=mount_obj.user)
            for mount_obj in self.mounts
        ]

    def expect_rofs(procs, outcome_msg):
        # Assert that the async IO failed with read-only filesystem.
        g.log.info("Validating whether IO failed with read-only filesystem")
        # NOTE(review): if is_io_procs_fail_with_rofs returns a
        # (ret, results) tuple the way is_io_procs_fail_with_error is
        # used elsewhere in this file, this assertion is always true --
        # confirm the helper's return type.
        ret = is_io_procs_fail_with_rofs(self, procs, self.mounts)
        self.assertTrue(ret, ("Unexpected error and IO successfull"
                              " on read-only filesystem"))
        g.log.info(outcome_msg)

    # set cluster.quorum-type to auto
    options = {"cluster.quorum-type": "auto"}
    g.log.info("setting cluster.quorum-type to auto on "
               "volume %s", self.volname)
    ret = set_volume_options(self.mnode, self.volname, options)
    # The original literals concatenated to "...forvolume..."
    self.assertTrue(ret, ("Unable to set volume option %s for "
                          "volume %s" % (options, self.volname)))
    g.log.info("Sucessfully set %s for volume %s", options, self.volname)

    # write files on all mounts
    g.log.info("Starting IO on all mounts...")
    g.log.info("mounts: %s", self.mounts)
    all_mounts_procs = start_on_all_mounts(
        "create_files -f 10 --base-file-name file")

    # Validate IO
    g.log.info("Validating IO on mounts")
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    g.log.info("IO is successful on all mounts")

    # get the subvolumes
    g.log.info("Starting to get sub-volumes for volume %s", self.volname)
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    # Previously the count was printed where the volume name belongs.
    g.log.info("Number of subvolumes in volume %s: %s",
               self.volname, len(subvols))

    # bring bricks offline (2 bricks) for all the subvolumes
    for index, subvol_brick_list in enumerate(subvols):
        g.log.info("sub-volume %s brick list : %s",
                   index, subvol_brick_list)
        # For volume type: 1 * 2, bring 1 brick offline
        offline_count = 1 if len(subvol_brick_list) == 2 else 2
        bricks_to_bring_offline = subvol_brick_list[0:offline_count]
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s succesfully", bricks_to_bring_offline)

    # create 2 files named newfile0.txt and newfile1.txt
    g.log.info("Start creating 2 files on all mounts...")
    expect_rofs(
        start_on_all_mounts("create_files -f 2 --base-file-name newfile"),
        "EXPECTED: Read-only file system in IO while creating file")

    # create directory user1
    g.log.info("Start creating directory on all mounts...")
    expect_rofs(
        start_on_all_mounts("create_deep_dir"),
        "EXPECTED: Read-only file system in IO while"
        " creating directory")

    # Modifying operations: each must fail with read-only filesystem.
    # Fields: start log, command template, message if the command
    # unexpectedly succeeds, operation name for the stderr check.
    denied_ops = (
        ("Start creating hard link for file0.txt on all mounts",
         "ln {mp}/file0.txt {mp}/file0.txt_hwlink",
         ("Unexpected error and creating hard link"
          " successful on read-only filesystem"),
         "creating hard link"),
        ("Start creating soft link for file1.txt on all mounts",
         "ln -s {mp}/file1.txt {mp}/file1.txt_swlink",
         ("Unexpected error and creating soft link"
          " successful on read-only filesystem"),
         "creating soft link"),
        ("Appending to file1.txt on all mounts",
         "cat {mp}/file0.txt >> {mp}/file1.txt",
         ("Unexpected error and append successful"
          " on read-only filesystem"),
         "appending to file"),
        ("Modifying file1.txt on all mounts",
         "echo 'Modify Contents' > {mp}/file1.txt",
         ("Unexpected error and modifying successful"
          " on read-only filesystem"),
         "modifying file"),
        ("Truncating file1.txt on all mounts",
         "truncate -s 0 {mp}/file1.txt",
         ("Unexpected error and truncating file"
          " successful on read-only filesystem"),
         "truncating file"),
    )
    for start_msg, cmd_template, success_msg, op_name in denied_ops:
        g.log.info(start_msg)
        for mount_obj in self.mounts:
            cmd = cmd_template.format(mp=mount_obj.mountpoint)
            ret, _, err = g.run(mount_obj.client_system, cmd)
            # A non-zero exit status is the expected outcome here
            self.assertTrue(ret, success_msg)
            # Name the actual operation; every check used to say
            # "while truncating file".
            self.assertIn(
                rofs_error, err,
                "Read-only filesystem not found in "
                "IO while %s" % op_name)
            g.log.info("EXPECTED: Read-only file system in IO")

    # read the file
    g.log.info("Starting reading files on all mounts")
    all_mounts_procs = start_on_all_mounts("read")

    # Validate IO
    g.log.info("validating IO on all mounts")
    ret = validate_io_procs(all_mounts_procs, self.mounts)
    self.assertTrue(ret, "Reads failed on some of the clients")
    g.log.info("Reads successful on all mounts")

    # Read-only operations: each must still succeed.
    # Fields: start log, command template, failure message, success log.
    allowed_ops = (
        ("stat on file1.txt on all mounts",
         "stat {mp}/file1.txt",
         ("Unexpected error and stat on file fails"
          " on read-only filesystem"),
         "stat on file is successfull on read-only filesystem"),
        ("stat on directory on all mounts",
         "python {script} stat {mp}",
         ("Unexpected error and stat on directory"
          " fails on read-only filesystem"),
         "stat on dir is successfull on read-only filesystem"),
        ("ls on mount point on all mounts",
         "python {script} ls {mp}",
         ("Unexpected error and listing file fails"
          " on read-only filesystem"),
         "listing files is successfull on read-only filesystem"),
    )
    for start_msg, cmd_template, failure_msg, success_log in allowed_ops:
        g.log.info(start_msg)
        for mount_obj in self.mounts:
            cmd = cmd_template.format(script=self.script_upload_path,
                                      mp=mount_obj.mountpoint)
            ret, _, _ = g.run(mount_obj.client_system, cmd)
            self.assertFalse(ret, failure_msg)
            g.log.info(success_log)
def test_status_string(self):
    """
    Check the task type and status string reported by volume status.

    -> Create Volume
    -> Start rebalance
    -> Check task type in volume status
    -> Check task status string in volume status
    -> Check task type in volume status xml
    -> Check task status string in volume status xml
    -> Start Remove brick operation
    -> Check task type in volume status
    -> Check task status string in volume status
    -> Check task type in volume status xml
    -> Check task status string in volume status xml

    NOTE(review): the CLI checks index the output lines from the end
    (-4/-2 for rebalance, -8/-2 for remove-brick); this presumably relies
    on the task section being the last thing printed -- confirm against
    the gluster CLI version under test.
    """
    # Start rebalance
    ret, _, _ = rebalance_start(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to start rebalance for volume %s"
                     % self.volname)
    g.log.info("Rebalance started successfully on volume %s",
               self.volname)

    # Wait for rebalance to complete
    ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
    self.assertTrue(ret, "Rebalance failed for volume %s" % self.volname)
    g.log.info("Rebalance completed successfully on volume %s",
               self.volname)

    # Getting volume status after rebalance start
    ret, out, _ = volume_status(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to get volume status for volume %s"
                     % self.volname)
    g.log.info("Volume status successful on volume %s", self.volname)
    status_list = out.splitlines()

    # Verifying task type from volume status for rebalance.
    # Negative indexing raises IndexError on unexpectedly short output
    # instead of silently wrapping around to another line.
    self.assertIn('Rebalance', status_list[-4],
                  "Incorrect task type found in volume status for %s"
                  % self.volname)
    g.log.info("Correct task type found in volume status for %s",
               self.volname)

    # Verifying task status string in volume status for rebalance
    self.assertIn('completed', status_list[-2],
                  "Incorrect task status found in volume status for %s"
                  % self.volname)
    g.log.info("Correct task status found in volume status for %s",
               self.volname)

    # Getting volume status --xml after rebalance start
    vol_status = get_volume_status(self.mnode, self.volname,
                                   options='tasks')
    first_task = vol_status[self.volname]['task_status'][0]

    # Verifying task type from volume status --xml for rebalance
    self.assertEqual('Rebalance', first_task['type'],
                     "Incorrect task type found in volume status xml "
                     "for %s" % self.volname)
    g.log.info("Correct task type found in volume status xml for %s",
               self.volname)

    # Verifying task status string from volume status --xml for rebalance
    self.assertEqual('completed', first_task['statusStr'],
                     "Incorrect task status found in volume status "
                     "xml for %s" % self.volname)
    g.log.info("Correct task status found in volume status xml %s",
               self.volname)

    # Getting sub vols
    subvol_dict = get_subvols(self.mnode, self.volname)
    subvol = subvol_dict['volume_subvols'][1]

    # Perform remove brick start
    ret, _, _ = remove_brick(self.mnode, self.volname, subvol,
                             'start', replica_count=3)
    self.assertEqual(ret, 0, "Failed to start remove brick operation "
                             "for %s" % self.volname)
    g.log.info("Remove brick operation started successfully on volume %s",
               self.volname)

    # Getting volume status after remove brick start
    ret, out, _ = volume_status(self.mnode, self.volname)
    self.assertEqual(ret, 0, "Failed to get volume status for volume %s"
                     % self.volname)
    g.log.info("Volume status successful on volume %s", self.volname)
    status_list = out.splitlines()

    # Verifying task type from volume status after remove brick start
    self.assertIn('Remove brick', status_list[-8],
                  "Incorrect task type found in volume status for "
                  "%s" % self.volname)
    g.log.info("Correct task type found in volume status task for %s",
               self.volname)

    # Verifying task status string in volume status after remove
    # brick start: the operation may already have finished by now.
    remove_status = ['completed', 'in progress']
    cli_status = status_list[-2].split(':')[1].strip()
    self.assertIn(cli_status, remove_status,
                  "Incorrect task status found in volume status "
                  "task for %s" % self.volname)
    g.log.info("Correct task status found in volume status task for %s",
               self.volname)

    # Getting volume status --xml after remove brick start
    vol_status = get_volume_status(self.mnode, self.volname,
                                   options='tasks')
    first_task = vol_status[self.volname]['task_status'][0]

    # Verifying task type from volume status --xml after
    # remove brick start
    self.assertEqual('Remove brick', first_task['type'],
                     "Incorrect task type found in volume status xml for "
                     "%s" % self.volname)
    g.log.info("Correct task type found in volume status xml for %s",
               self.volname)

    # Verifying task status string from volume status --xml
    # after remove brick start
    self.assertIn(first_task['statusStr'], remove_status,
                  "Incorrect task status found in volume status "
                  "xml for %s" % self.volname)
    g.log.info("Correct task status found in volume status xml %s",
               self.volname)
def test_replacing_all_arbiters(self):
    """
    Replace every arbiter brick of the volume while client I/O is running.

    - Create an arbiter volume 4(2+1) distributed replicate
    - Start writing IO
    - While the I/O's are going on replace all the arbiter bricks
    - check for the new bricks attached successfully
    - Check for heals
    - Validate IO
    """
    # pylint: disable=too-many-locals,too-many-statements
    # get the bricks for the volume
    g.log.info("Fetching bricks for the volume: %s", self.volname)
    bricks_list = get_all_bricks(self.mnode, self.volname)
    g.log.info("Brick list: %s", bricks_list)

    # Clear all brick folders. This is needed to prevent healing with
    # old files.
    for brick in bricks_list:
        g.log.info('Clearing brick %s', brick)
        node, brick_path = brick.split(':')
        # '&&' (not ';') so that 'rm -rf *' only runs when the 'cd'
        # succeeded; otherwise it would delete in whatever directory the
        # remote shell started in and the failed 'cd' would go unnoticed.
        ret, _, err = g.run(node, 'cd %s/ && rm -rf *' % brick_path)
        self.assertFalse(ret, err)
        g.log.info('Clearing brick %s is successful', brick)
    g.log.info('Clearing for all brick is successful')

    # Creating files on client side
    for mount_obj in self.mounts:
        g.log.info("Generating data for %s:%s",
                   mount_obj.client_system, mount_obj.mountpoint)
        # Create dirs with file
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 3 -l 3 -n 3 -f 20 %s"
                   % (self.script_upload_path, mount_obj.mountpoint))

        # Started asynchronously so the bricks are replaced while the
        # I/O is still in progress; validated at the end of the test.
        proc = g.run_async(mount_obj.client_system, command,
                           user=mount_obj.user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

    # replace bricks: the last brick of every subvol is the one replaced
    expected_new_bricks = []
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    for subvol in subvols:
        g.log.info('Replacing arbiter brick for %s', subvol)
        brick_to_replace = subvol[-1]
        self.bricks_to_clean.append(brick_to_replace)
        new_brick = brick_to_replace + 'new'
        # Remember the expected replacement locally so the check below
        # does not depend on what self.bricks_to_clean held on entry.
        expected_new_bricks.append(new_brick)
        g.log.info("Replacing the brick %s for the volume: %s",
                   brick_to_replace, self.volname)
        ret, _, err = replace_brick(self.mnode, self.volname,
                                    brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s to %s successfully',
                   brick_to_replace, new_brick)

    # check replaced bricks
    subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
    # Guard against zip() silently truncating if the layout changed.
    self.assertEqual(len(expected_new_bricks), len(subvols),
                     'Number of subvols changed after replacing bricks')
    for expected_brick_path, subvol in zip(expected_new_bricks, subvols):
        brick_to_check = subvol[-1]
        self.assertEqual(expected_brick_path, brick_to_check,
                         'Brick %s is not replaced brick' % brick_to_check)

    # Wait for volume processes to be online
    g.log.info("Wait for volume processes to be online")
    ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                          "be online" % self.volname))
    g.log.info("Successful in waiting for volume %s processes to be "
               "online", self.volname)

    # Verify volume's all process are online
    g.log.info("Verifying volume's all process are online")
    ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
    self.assertTrue(ret, ("Volume %s : All process are not online"
                          % self.volname))
    g.log.info("Volume %s: All process are online", self.volname)

    # Wait for self-heal-daemons to be online
    g.log.info("Waiting for self-heal-daemons to be online")
    ret = is_shd_daemonized(self.all_servers)
    self.assertTrue(ret, "Either No self heal daemon process found")
    g.log.info("All self-heal-daemons are online")

    # Monitor heal completion
    ret = monitor_heal_completion(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal has not yet completed')

    # Check if heal is completed
    ret = is_heal_complete(self.mnode, self.volname)
    self.assertTrue(ret, 'Heal is not complete')
    g.log.info('Heal is completed successfully')

    # Check for split-brain
    ret = is_volume_in_split_brain(self.mnode, self.volname)
    self.assertFalse(ret, 'Volume is in split-brain state')
    g.log.info('Volume is not in split-brain state')

    # Validate IO
    ret = validate_io_procs(self.all_mounts_procs, self.mounts)
    self.assertTrue(ret, "IO failed on some of the clients")
    self.io_validation_complete = True