    def test_glusterd_replace_brick(self):
        """
        Create a volume and start it.
        - Get list of all the bricks which are online
        - Select a brick randomly from the bricks which are online
        - Form a non-existing brick path on node where the brick has to replace
        - Perform replace brick and it should fail
        - Form a new brick which valid brick path replace brick should succeed
        """
        # pylint: disable=too-many-function-args
        # Getting all the bricks which are online
        bricks_online = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks_online, "Unable to get the online bricks")
        g.log.info("got the brick list from the volume")

        # Getting one random brick from the online bricks to be replaced
        brick_to_replace = random.choice(bricks_online)
        g.log.info("Brick to replace %s", brick_to_replace)
        node_for_brick_replace = brick_to_replace.split(':')[0]
        new_brick_to_replace = form_bricks_list(self.mnode, self.volname, 1,
                                                node_for_brick_replace,
                                                self.all_servers_info)
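        # Bricks are formatted as '<node>:<brick_path>', so split(':')[0]
        # above gives the host node; form_bricks_list() returns a list of
        # brick paths, hence the [0] index when the new brick is used below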

        # Forming a non-existing brick path on the node selected for
        # the replace
        path = ":/brick/non_existing_path"
        non_existing_path = node_for_brick_replace + path

        # Replace brick with the non-existing path; this should fail
        ret, _, _ = replace_brick(self.mnode, self.volname, brick_to_replace,
                                  non_existing_path)
        self.assertNotEqual(ret, 0, ("Replace brick with commit force"
                                     " on a non-existing brick passed"))
        g.log.info("Replace brick with non-existing brick with commit"
                   "force failed as expected")

        # Calling replace brick with brick_to_replace and
        # new_brick_to_replace, which has a valid brick path
        ret = replace_brick_from_volume(self.mnode,
                                        self.volname,
                                        self.servers,
                                        self.all_servers_info,
                                        brick_to_replace,
                                        new_brick_to_replace[0],
                                        delete_brick=True)
        self.assertTrue(ret, ("Replace brick with commit force failed"))

        # Validating whether the brick replaced is online
        halt = 20
        counter = 0
        _rc = False
        g.log.info("Wait for some seconds for the replaced brick "
                   "to get online")
        while counter < halt:
            ret = are_bricks_online(self.mnode, self.volname,
                                    new_brick_to_replace)
            if not ret:
                g.log.info("The replaced brick isn't online yet, "
                           "retrying after 2 seconds...")
                time.sleep(2)
                counter += 2
            else:
                _rc = True
                g.log.info("The replaced brick is online after being replaced")
                break
        if not _rc:
            raise ExecutionError("The replaced brick isn't online")
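
        # A shorter alternative to the manual polling loop above would be the
        # glustolibs helper wait_for_bricks_to_be_online() (a sketch, assuming
        # this helper is available in the installed glustolibs version with
        # this signature):
        #
        #   from glustolibs.gluster.brick_libs import (
        #       wait_for_bricks_to_be_online)
        #
        #   ret = wait_for_bricks_to_be_online(self.mnode, self.volname,
        #                                      timeout=20)
        #   if not ret:
        #       raise ExecutionError("The replaced brick isn't online")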
    def test_replace_brick_quorum(self):
        '''
        -> Create volume
        -> Set quorum type
        -> Set quorum ratio to 95%
        -> Start the volume
        -> Stop glusterd on one node
        -> Quorum is now in a not-met condition
        -> Check whether all bricks went offline
        -> Perform a replace brick operation (it should fail)
        -> Start glusterd on the same node where it was stopped
        -> Check whether all bricks are online
        -> Verify in volume info that the old brick was not replaced
           with the new brick
        '''

        # Forming brick list, 6 bricks for creating volume, 7th brick for
        # performing replace brick operation
        brick_list = form_bricks_list(self.mnode, self.volname, 7,
                                      self.servers, self.all_servers_info)

        # Create Volume
        ret, _, _ = volume_create(self.mnode,
                                  self.volname,
                                  brick_list[0:6],
                                  replica_count=3)
        self.assertEqual(ret, 0, "Failed to create volume %s" % self.volname)
        g.log.info("Volume created successfully %s", self.volname)

        # Enabling server quorum
        ret = set_volume_options(self.mnode, self.volname,
                                 {'cluster.server-quorum-type': 'server'})
        self.assertTrue(
            ret, "Failed to set server quorum on volume %s" % self.volname)
        g.log.info("Able to set server quorum successfully on volume %s",
                   self.volname)

        # Setting Quorum ratio in percentage
        ret = set_volume_options(self.mnode, 'all',
                                 {'cluster.server-quorum-ratio': '95%'})
        self.assertTrue(
            ret, "Failed to set server quorum ratio on %s" % self.servers)
        g.log.info("Able to set server quorum ratio successfully on %s",
                   self.servers)
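
        # Why 95%: assuming the six-server setup this test expects (the node
        # counts checked below are 5 and 6), stopping one node leaves
        # 5/6 = ~83% of the cluster alive, which is below the 95% quorum
        # ratio, so server quorum is lost and all bricks should go offline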

        # Start the volume
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
        g.log.info("Volume started successfully %s", self.volname)

        # Stop glusterd on one of the nodes
        random_server = random.choice(self.servers[1:])
        ret = stop_glusterd(random_server)
        self.assertTrue(ret, "Failed to stop glusterd for %s" % random_server)
        g.log.info("Glusterd stopped successfully on server %s", random_server)

        # Checking whether glusterd is running or not
        ret = is_glusterd_running(random_server)
        self.assertEqual(
            ret, 1, "Glusterd is still running on node %s "
            "where it was stopped" % random_server)
        g.log.info("Glusterd is not running on the server %s", random_server)

        # Verifying the node count in volume status after glusterd is stopped
        # on one of the servers; it's not possible to check the brick status
        # in volume status immediately after glusterd stops
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 5:
                break
            sleep(2)
            count += 1

        # Creating the offline brick list from volume status
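        # get_volume_status() returns a dict keyed by node; each node maps
        # brick paths (plus a 'Self-heal Daemon' service entry) to status
        # details, so the daemon entry is skipped to keep only real bricks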
        offline_bricks = []
        vol_status = get_volume_status(self.mnode, self.volname)
        for node in vol_status[self.volname]:
            for brick_path in vol_status[self.volname][node]:
                if brick_path != 'Self-heal Daemon':
                    offline_bricks.append(':'.join([node, brick_path]))

        # Checking whether bricks are offline when quorum (95% ratio) is
        # not met
        ret = are_bricks_offline(self.mnode, self.volname, offline_bricks)
        self.assertTrue(
            ret, "Bricks are online even though quorum is not met "
            "for %s" % self.volname)
        g.log.info(
            "Bricks are offline while quorum is not met "
            "for %s", self.volname)

        # Getting random brick from offline brick list
        self.random_brick = random.choice(offline_bricks)

        # Performing replace brick with commit force when quorum is not met
        self.replace_brick_failed = False
        ret, _, _ = replace_brick(self.mnode, self.volname, self.random_brick,
                                  brick_list[6])
        self.assertNotEqual(
            ret, 0, "Replace brick should fail when quorum is not met, "
            "but it succeeded on %s" % self.volname)
        g.log.info(
            "Replace brick failed as expected while quorum is not met "
            "on %s", self.volname)
        self.replace_brick_failed = True

        # Start glusterd on the node where it was stopped
        ret = start_glusterd(random_server)
        self.assertTrue(
            ret, "Failed to start glusterd on server %s" % random_server)
        g.log.info("Glusterd started successfully on server %s", random_server)

        # Verifying the node count in volume status after glusterd is started
        # on one of the servers; it's not possible to check the brick status
        # in volume status immediately after glusterd starts
        count = 0
        while count < 100:
            vol_status = get_volume_status(self.mnode, self.volname)
            servers_count = len(vol_status[self.volname].keys())
            if servers_count == 6:
                break
            sleep(2)
            count += 1

        # Checking bricks are online or not
        count = 0
        while count < 100:
            ret = are_bricks_online(self.mnode, self.volname, brick_list[0:6])
            if ret:
                break
            sleep(2)
            count += 1
        self.assertTrue(ret, "All bricks are not online for %s" % self.volname)
        g.log.info("All bricks are online for volume %s", self.volname)

        # Comparing the brick lists from before and after the replace brick
        # operation
        after_brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertListEqual(
            after_brick_list, brick_list[0:6],
            "Brick lists differ before and after the replace "
            "brick operation for volume %s" % self.volname)
        g.log.info(
            "Brick lists are the same before and after the replace "
            "brick operation for volume %s", self.volname)
    def test_impact_of_replace_brick_for_glustershd(self):
        # pylint: disable=too-many-statements,too-many-branches,too-many-locals
        nodes = self.volume['servers']
        replaced_bricks = []

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file. "
                              "Please check log file for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # get the subvolumes
        g.log.info("Starting to get sub-volumes for volume %s", self.volname)
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s:", num_subvols)

        # replace brick from each sub-vol
        for i in range(num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
            brick_to_replace = subvol_brick_list[-1]
            new_brick = brick_to_replace + 'new'
            g.log.info("Replacing the brick %s for the volume : %s",
                       brick_to_replace, self.volname)
            ret, _, err = replace_brick(self.mnode, self.volname,
                                        brick_to_replace, new_brick)
            self.assertFalse(ret, err)
            g.log.info('Replaced brick %s to %s successfully',
                       brick_to_replace, new_brick)
            replaced_bricks.append(brick_to_replace)

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname,
                                                   timeout=60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on nodes "
                   "%s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or"
                              " more than One self heal daemon process"
                              " found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids_after_replacement = pids

        # Compare pids before and after replacing
        self.assertNotEqual(
            glustershd_pids, glustershd_pids_after_replacement,
            "Self heal daemon process is the same before and"
            " after replacing bricks")
        g.log.info("Self heal daemon process is different before and "
                   "after replacing bricks")

        # get the bricks for the volume after replacing
        bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after expanding "
                   "volume: %s", bricks_list_after_replacing)

        # validate the bricks present in volume info
        # with glustershd server volume file after replacing bricks
        g.log.info("Starting parsing file %s", self.glustershd)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_replacing)

        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "replacing bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)
        g.log.info("Starting to delete replaced brick dir's")

        # Remove brick directories of the replaced bricks as this is not
        # handled by tearDown class
        for bricks in replaced_bricks:
            node, brick_path = bricks.split(r':')
            cmd = "rm -rf " + brick_path
            ret, _, _ = g.run(node, cmd)
            if ret:
                raise ExecutionError("Failed to delete the brick dir's for"
                                     " %s and brick %s" % (node, brick_path))
            g.log.info("Successfully deleted brick dir's for replaced bricks")
    def test_impact_of_replace_brick_on_glustershd(self):
        """
        Test Script to verify the glustershd server vol file
        has only entries for replicate volumes
        1.Create multiple volumes and start all volumes
        2.Check the glustershd processes - Only 1 glustershd should be listed
        3.Do replace brick on the replicate volume
        4.Confirm that the brick is replaced
        5.Check the glustershd processes - Only 1 glustershd should be listed
                                           and pid should be different
        6.glustershd server vol should be updated with new bricks
        """
        # Check the self-heal daemon process
        ret, glustershd_pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process "
                              "found : %s" % glustershd_pids))
        g.log.info(
            "Successful in getting single self heal daemon process"
            " on all nodes %s", self.servers)

        volume_list = get_volume_list(self.mnode)
        for volume in volume_list:

            # Log Volume Info and Status before replacing brick
            ret = log_volume_info_and_status(self.mnode, volume)
            self.assertTrue(ret, ("Logging volume info and status "
                                  "failed on volume %s", volume))
            g.log.info(
                "Successful in logging volume info and status "
                "of volume %s", volume)

            # Selecting a random source brick to replace
            src_brick = choice(get_all_bricks(self.mnode, volume))
            src_node, original_brick = src_brick.split(":")

            # Creating a random destination brick in such a way that it is
            # selected from the same node but always differs from the
            # original brick
            list_of_bricks = [
                brick for brick in get_servers_bricks_dict(
                    src_node, self.all_servers_info)[src_node]
                if brick not in original_brick
            ]
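            # 'brick not in original_brick' is a substring check that drops
            # the brick root already used by the original brick; the
            # destination below reuses the original brick's basename with a
            # '_replaced' suffix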
            dst_brick = ('{}:{}/{}_replaced'.format(
                src_node, choice(list_of_bricks),
                original_brick.split('/')[-1]))

            # Replace brick for the volume
            ret, _, _ = replace_brick(self.mnode, volume, src_brick, dst_brick)
            self.assertFalse(
                ret, "Failed to replace brick "
                "from the volume %s" % volume)
            g.log.info(
                "Successfully replaced faulty brick from "
                "the volume %s", volume)

            # Verify all volume process are online
            ret = wait_for_volume_process_to_be_online(self.mnode, volume)
            self.assertTrue(ret,
                            "Volume %s : All processes are not online"
                            % volume)
            g.log.info("Volume %s : All process are online", volume)

            # Check the self-heal daemon process after replacing brick
            ret, pid_after_replace = get_self_heal_daemon_pid(self.servers)
            self.assertTrue(
                ret, "Either no self heal daemon process was "
                "found or more than one self heal "
                "daemon process was found: %s" % pid_after_replace)
            g.log.info(
                "Successful in getting a single self heal "
                "daemon process on all nodes %s", self.servers)

            # Compare the glustershd pids
            self.assertNotEqual(
                glustershd_pids, pid_after_replace,
                "Self heal daemon process should be different "
                "after replacing bricks in %s volume" % volume)
            g.log.info("EXPECTED: Self heal daemon process should be different"
                       " after replacing bricks in replicate volume")

            # Get the bricks for the volume
            bricks_list = get_all_bricks(self.mnode, volume)
            g.log.info("Brick List : %s", bricks_list)

            # Validate the bricks present in volume info with
            # glustershd server volume file
            ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                                 bricks_list)
            self.assertTrue(ret, ("Brick List from volume info is "
                                  "different from glustershd server "
                                  "volume file. Please check log file "
                                  "for details"))
            g.log.info(
                "Bricks in volume %s exists in glustershd server "
                "volume file", volume)
    def test_replacing_all_arbiters(self):
        """
        - Create an arbiter volume 4(2+1) distributed replicate
        - Start writing IO
        - While the I/O's are going on replace all the arbiter bricks
        - check for the new bricks attached successfully
        - Check for heals
        - Validate IO
        """
        # pylint: disable=too-many-locals,too-many-statements
        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume: %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick list: %s", bricks_list)

        # Clear all brick directories. This is needed to prevent healing
        # of old files
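        # Note: 'rm -rf *' does not expand to dot-entries, so hidden paths
        # such as .glusterfs are left untouched; only visible data is removed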
        for brick in bricks_list:
            g.log.info('Clearing brick %s', brick)
            node, brick_path = brick.split(':')
            ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
            self.assertFalse(ret, err)
            g.log.info('Clearing brick %s is successful', brick)
        g.log.info('Cleared all bricks successfully')

        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 3 -l 3 -n 3 -f 20 %s" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # replace bricks
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for subvol in subvols:
            g.log.info('Replacing arbiter brick for %s', subvol)
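            # In a (2+1) arbiter subvolume the arbiter is the last brick
            # listed, hence subvol[-1]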
            brick_to_replace = subvol[-1]
            self.bricks_to_clean.append(brick_to_replace)
            new_brick = brick_to_replace + 'new'
            g.log.info("Replacing the brick %s for the volume: %s",
                       brick_to_replace, self.volname)
            ret, _, err = replace_brick(self.mnode, self.volname,
                                        brick_to_replace, new_brick)
            self.assertFalse(ret, err)
            g.log.info('Replaced brick %s to %s successfully',
                       brick_to_replace, new_brick)

        # check replaced bricks
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for index, subvol in enumerate(subvols):
            expected_brick_path = self.bricks_to_clean[index] + 'new'
            brick_to_check = subvol[-1]
            self.assertEqual(expected_brick_path, brick_to_check,
                             'Brick %s is not the replaced brick'
                             % brick_to_check)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All processes are not online" % self.volname)
        g.log.info("Volume %s: All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        g.log.info("Waiting for self-heal-daemons to be online")
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal-daemons are online")

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
    def test_impact_of_replace_brick_for_glustershd(self):
        nodes = self.volume['servers']

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s" % nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s" % self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s" % bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s" % (self.GLUSTERSHD, self.mnode))
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file. "
                              "Please check log file for details"))
        g.log.info("Successfully parsed %s file" % self.GLUSTERSHD)

        # replace brick
        brick_to_replace = bricks_list[-1]
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume : %s"
                   % (brick_to_replace, self.volname))
        ret, out, err = replace_brick(self.mnode, self.volname,
                                      brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s to %s successfully'
                   % (brick_to_replace, new_brick))

        # check bricks
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertEqual(bricks_list[-1], new_brick, 'Replaced brick and '
                                                     'new brick are not equal')

        # Verify volume's all process are online for 60 sec
        g.log.info("Verifying volume's all process are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   timeout=60)
        self.assertTrue(ret, ("Volume %s : All process are not "
                              "online", self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s" % nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids_after_replacement = pids

        # Compare pids before and after replacing
        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_replacement,
                            "Self heal daemon process is the same before and"
                            " after replacing bricks")
        g.log.info("Self heal daemon process is different before and "
                   "after replacing bricks")

        # get the bricks for the volume after replacing
        bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after expanding "
                   "volume: %s" % bricks_list_after_replacing)

        # validate the bricks present in volume info
        # with glustershd server volume file after replacing bricks
        g.log.info("Starting parsing file %s" % self.GLUSTERSHD)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_replacing)

        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "replacing bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file" % self.GLUSTERSHD)