Example #1
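
# These excerpts assume the glusto runner and the glustolibs helpers that
# the original test modules import, e.g. (a representative subset, not the
# full import list):
#   from glusto.core import Glusto as g
#   from glustolibs.gluster.volume_ops import set_volume_options
#   from glustolibs.gluster.heal_libs import get_self_heal_daemon_pid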
    def test_server_side_healing_happens_only_when_glustershd_running(self):
        """
        Test script which verifies that server-side healing happens
        only if the heal daemon is running on the node where the source
        brick resides.

         * Create and start the Replicate volume
         * Check the glustershd processes - only 1 glustershd should be listed
         * Bring down the bricks without affecting the cluster
         * Create files on the volume
         * Kill the glustershd on the nodes where the bricks are running
         * Bring up the bricks which were brought down in the previous steps
         * Check the heal info - it must show pending heals; heal
           shouldn't happen since glustershd is down on the source node
         * Issue a heal
         * Trigger a client-side heal
         * Heal should complete successfully
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-lines
        # Set volume options
        options = {
            "metadata-self-heal": "on",
            "entry-self-heal": "on",
            "data-self-heal": "on"
        }
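        # (CLI equivalent, for reference:
        #  "gluster volume set <VOLNAME> metadata-self-heal on", etc.)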
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Check the self-heal daemon process
        ret, pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in verifying self heal daemon process"
            " on all nodes %s", self.servers)

        # Select the bricks to bring offline
        bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
            self.mnode, self.volname))
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Brought down the brick process "
                   "for %s", bricks_to_bring_offline)

        # Write files on all mounts
        all_mounts_procs, num_files_to_write = [], 100
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_files "
                   "-f %s --base-file-name file %s" %
                   (self.script_upload_path, num_files_to_write,
                    mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Get online bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Get the nodes where bricks are running
        bring_offline_glustershd_nodes = []
        for brick in online_bricks:
            bring_offline_glustershd_nodes.append(brick.split(":")[0])
        g.log.info("self heal deamon on nodes %s to be killed",
                   bring_offline_glustershd_nodes)

        # Kill the self heal daemon process on nodes
        ret = bring_self_heal_daemon_process_offline(
            bring_offline_glustershd_nodes)
        self.assertTrue(
            ret, ("Unable to bring self heal daemon process"
                  " offline for nodes %s" % bring_offline_glustershd_nodes))
        g.log.info(
            "Sucessfully brought down self heal process for "
            "nodes %s", bring_offline_glustershd_nodes)

        # Check the heal info
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info %s for the volume %s",
                   heal_info, self.volname)

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

        # Issue heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertFalse(ret,
                         ("Able to trigger heal on volume %s where "
                          "self heal daemon is not running" % self.volname))
        g.log.info(
            "Expected : Unable to trigger heal on volume %s where "
            "self heal daemon is not running", self.volname)

        # Heal must not complete within 130 sec since glustershd is down
        # on the source node
        ret = monitor_heal_completion(self.mnode, self.volname, 130)
        self.assertFalse(ret, ("Heal completed on volume %s even though "
                               "glustershd is down on the source node"
                               % self.volname))
        g.log.info("Expected : Heal pending on volume %s", self.volname)

        # Check the heal info
        heal_info_after_triggering_heal = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)

        # Compare the pending heal entries with the number of files written
        for brick in online_bricks:
            self.assertGreaterEqual(
                int(heal_info_after_triggering_heal[brick]['numberOfEntries']),
                num_files_to_write,
                ("Some of the files are healed from source brick %s where "
                 "self heal daemon is not running" % brick))
        g.log.info("EXPECTED: No files are healed from source bricks where "
                   "self heal daemon is not running")

        # Unmount and Mount volume again as volume options were set
        # after mounting the volume
        for mount_obj in self.mounts:
            ret, _, _ = umount_volume(mount_obj.client_system,
                                      mount_obj.mountpoint)
            self.assertEqual(ret, 0,
                             "Failed to unmount %s" % mount_obj.client_system)
            ret, _, _ = mount_volume(self.volname,
                                     mtype='glusterfs',
                                     mpoint=mount_obj.mountpoint,
                                     mserver=self.mnode,
                                     mclient=mount_obj.client_system)
            self.assertEqual(ret, 0,
                             "Failed to mount %s" % mount_obj.client_system)

        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s read %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "Reads failed on some of the clients")
        g.log.info("Reads successful on all mounts")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to heal the pending entries")
        g.log.info("Successfully healed the pending entries for volume %s",
                   self.volname)
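
# A minimal, self-contained sketch of the single-glustershd check these
# tests lean on (a hypothetical local helper; the real glustolibs
# get_self_heal_daemon_pid runs the equivalent on every server over SSH).
# Matching on 'glustershd' in the command line is an assumption about how
# the daemon shows up in the process table.
import subprocess

def local_glustershd_pids():
    """Return glustershd PIDs found on this host (empty list if none)."""
    out = subprocess.run(["pgrep", "-f", "glustershd"],
                         capture_output=True, text=True, check=False)
    return [int(pid) for pid in out.stdout.split()]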
Example #2
    def test_glustershd_with_restarting_glusterd(self):
        """
        Test Script to verify the self heal daemon process with restarting
        glusterd and rebooting the server

        * stop all volumes
        * restart glusterd - the self heal daemon process should not run
        * start the replicate volumes involved
        * a single self heal daemon process should be running
        * restart glusterd
        * the self heal daemon pid should change
        * bring down a brick and restart glusterd
        * the self heal daemon pid should change again and differ from the
          previous one
        * bring the brick back up

        """
        # pylint: disable=too-many-statements
        nodes = self.volume['servers']

        # stop the volume
        g.log.info("Stopping the volume %s", self.volname)
        ret = volume_stop(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to stop volume %s" % self.volname))
        g.log.info("Successfully stopped volume %s", self.volname)

        # check the self heal daemon process after stopping the volume
        g.log.info("Verifying the self heal daemon process for "
                   "volume %s", self.volname)
        ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
        self.assertFalse(ret, ("Self Heal Daemon process is still running "
                               "even after stopping volume %s" % self.volname))
        g.log.info("Self Heal Daemon is not running after stopping  "
                   "volume %s", self.volname)

        # restart glusterd service on all the servers
        g.log.info("Restarting glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        self.assertTrue(
            wait_for_glusterd_to_start(self.servers),
            "Failed to start glusterd on %s" % self.servers)

        # check the self heal daemon process after restarting glusterd process
        g.log.info("Starting to get self-heal daemon process on"
                   " nodes %s", nodes)
        ret = are_all_self_heal_daemons_are_online(self.mnode, self.volname)
        self.assertFalse(ret, ("Self Heal Daemon process is running after "
                               "glusterd restart with volume %s in "
                               "stop state" % self.volname))
        g.log.info("Self Heal Daemon is not running after stopping  "
                   "volume and restarting glusterd %s", self.volname)

        # start the volume
        g.log.info("Starting the volume %s", self.volname)
        ret = volume_start(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to start volume %s" % self.volname))
        g.log.info("Volume %s started successfully", self.volname)

        # Verify glustershd process releases its parent process
        g.log.info("Checking whether glustershd process is daemonized or not")
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))
        g.log.info("Single self heal daemon process on all nodes %s", nodes)

        # get the self heal daemon pids after starting volume
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        g.log.info("Successful in getting self heal daemon pids")
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info
        # with glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different from "
                              "glustershd server volume file. "
                              "Please check log file for details."))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # restart glusterd service on all the servers
        g.log.info("Restarting glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s",
                              nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        # Verify all volume processes are online for 60 sec
        g.log.info("Verifying all volume processes are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : all processes are not "
                              "online" % self.volname))
        g.log.info("Successfully verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))

        # check the self heal daemon process after starting volume and
        # restarting glusterd process
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        glustershd_pids_after_glusterd_restart = pids

        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_glusterd_restart,
                            ("Self Heal Daemon pids are same after "
                             "restarting glusterd process"))
        g.log.info("Self Heal Daemon process are different before and "
                   "after restarting glusterd process")

        # select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # bring bricks offline
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # restart glusterd after bringing down the brick
        g.log.info("Restart glusterd on all servers %s", nodes)
        ret = restart_glusterd(nodes)
        self.assertTrue(ret, ("Failed to restart glusterd on all nodes %s"
                              % nodes))
        g.log.info("Successfully restarted glusterd on all nodes %s",
                   nodes)

        # Verify all volume processes are online for 60 sec
        g.log.info("Verifying all volume processes are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : all processes are not "
                              "online" % self.volname))
        g.log.info("Successfully verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))

        # check the self heal daemon process after killing brick and
        # restarting glusterd process
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))
        glustershd_pids_after_killing_brick = pids

        self.assertNotEqual(glustershd_pids_after_glusterd_restart,
                            glustershd_pids_after_killing_brick,
                            ("Self Heal Daemon process are same from before "
                             "killing the brick,restarting glusterd process"))
        g.log.info("Self Heal Daemon process are different after killing the "
                   "brick, restarting the glusterd process")

        # bring the bricks online
        g.log.info("Bringing up the bricks : %s online",
                   bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring the bricks online")
        g.log.info("Successfully brought the bricks online")

        # check all bricks are online
        g.log.info("Verifying all bricka are online or not.....")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, ("Not all bricks are online"))
        g.log.info("All bricks are online.")
    def test_glustershd_on_all_volume_types(self):
        """
        Test Script to verify the glustershd server vol file
        has only entries for replicate volumes

        * Create multiple volumes and start all volumes
        * Check the glustershd processes - only one glustershd should be
          listed
        * Check the glustershd server vol file - it should contain entries
          only for the replicate volumes involved
        * Add bricks to the replicate volume - it should convert to
          distributed-replicate
        * Check the glustershd server vol file - the newly added bricks
          should be present
        * Check the glustershd processes - only 1 glustershd should be listed

        """
        # pylint: disable=too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, glustershd_pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % glustershd_pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)

        # For all the volumes, check whether bricks present in
        # glustershd server vol file
        volume_list = get_volume_list(self.mnode)
        for volume in volume_list:
            g.log.info("Volume Name: %s", volume)
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])

            # get the bricks for the volume
            g.log.info("Fetching bricks for the volume : %s", volume)
            bricks_list = get_all_bricks(self.mnode, volume)
            g.log.info("Brick List : %s", bricks_list)

            # validate the bricks present in volume info with
            # glustershd server volume file
            g.log.info("Start parsing file %s on "
                       "node %s", self.GLUSTERSHD, self.mnode)
            ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                                 bricks_list)
            if volume_type == 'Distribute':
                self.assertFalse(ret,
                                 ("Bricks exist in glustershd server "
                                  "volume file for %s Volume" % volume_type))
                g.log.info(
                    "EXPECTED : Bricks don't exist in glustershd "
                    "server volume file for %s Volume", volume_type)
            else:
                self.assertTrue(ret, ("Brick List from volume info is "
                                      "different from glustershd server "
                                      "volume file. Please check log "
                                      "file for details"))
                g.log.info(
                    "Bricks exist in glustershd server volume file "
                    "for %s Volume", volume_type)

        # expanding volume for Replicate
        for volume in volume_list:
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])
            if volume_type == 'Replicate':
                g.log.info("Start adding bricks to volume %s", volume)
                ret = expand_volume(self.mnode, volume, self.servers,
                                    self.all_servers_info)
                self.assertTrue(ret, ("Failed to add bricks to "
                                      "volume %s " % volume))
                g.log.info("Add brick successful")

                # Log Volume Info and Status after expanding the volume
                g.log.info("Logging volume info and Status after "
                           "expanding volume")
                ret = log_volume_info_and_status(self.mnode, volume)
                self.assertTrue(ret, ("Logging volume info and status failed "
                                      "on volume %s", volume))
                g.log.info(
                    "Successful in logging volume info and status "
                    "of volume %s", volume)

                # Verify all volume processes are online for 60 sec
                g.log.info("Verifying all volume processes are online")
                ret = wait_for_volume_process_to_be_online(
                    self.mnode, volume, 60)
                self.assertTrue(ret, ("Volume %s : all processes are not "
                                      "online" % volume))
                g.log.info(
                    "Successfully verified volume %s processes "
                    "are online", volume)

                # check the type for the replicate volume
                volume_type_info_for_replicate_after_adding_bricks = \
                    get_volume_type_info(self.mnode, volume)
                volume_type_for_replicate_after_adding_bricks = \
                    (volume_type_info_for_replicate_after_adding_bricks
                     ['volume_type_info']['typeStr'])

                self.assertEqual(volume_type_for_replicate_after_adding_bricks,
                                 'Distributed-Replicate',
                                 ("Replicate volume type is not converted to "
                                  "Distributed-Replicate after adding bricks"))
                g.log.info("Replicate Volume is successfully converted to"
                           " Distributed-Replicate after adding bricks")

                # get the bricks for the volume after expanding
                bricks_list_after_expanding = get_all_bricks(
                    self.mnode, volume)
                g.log.info("Brick List after expanding "
                           "volume: %s", bricks_list_after_expanding)

                # validate the bricks present in volume info
                # with glustershd server volume file after adding bricks
                g.log.info("Starting parsing file %s", self.GLUSTERSHD)
                ret = do_bricks_exist_in_shd_volfile(
                    self.mnode, volume, bricks_list_after_expanding)

                self.assertTrue(ret, ("Brick List from volume info is "
                                      "different from glustershd server "
                                      "volume file after expanding bricks. "
                                      "Please check log file for details"))
                g.log.info("Brick List from volume info is same as from "
                           "glustershd server volume file after "
                           "expanding bricks.")

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, glustershd_pids_after_adding_bricks = \
            get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret,
                        ("Either no self heal daemon process found or "
                         "more than one self heal daemon process "
                         "found : %s" % glustershd_pids_after_adding_bricks))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)

        self.assertNotEqual(
            glustershd_pids, glustershd_pids_after_adding_bricks,
            "Self Daemon process is same before and"
            " after adding bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after adding bricks")
    def test_impact_of_replace_brick_for_glustershd(self):
        # pylint: disable=too-many-statements,too-many-branches,too-many-locals
        nodes = self.volume['servers']
        replaced_bricks = []

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file. "
                              "Please check log file for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # get the subvolumes
        g.log.info("Starting to get sub-volumes for volume %s", self.volname)
        subvols_dict = get_subvols(self.mnode, self.volname)
        num_subvols = len(subvols_dict['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s:", num_subvols)

        # replace brick from each sub-vol
        for i in range(0, num_subvols):
            subvol_brick_list = subvols_dict['volume_subvols'][i]
            g.log.info("sub-volume %s brick list : %s", i, subvol_brick_list)
            brick_to_replace = subvol_brick_list[-1]
            new_brick = brick_to_replace + 'new'
            g.log.info("Replacing the brick %s for the volume : %s",
                       brick_to_replace, self.volname)
            ret, _, err = replace_brick(self.mnode, self.volname,
                                        brick_to_replace, new_brick)
            self.assertFalse(ret, err)
            g.log.info('Replaced brick %s to %s successfully',
                       brick_to_replace, new_brick)
            replaced_bricks.append(brick_to_replace)
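        # (For reference, the CLI wrapped by replace_brick is assumed to be
        # "gluster volume replace-brick <vol> <src> <dst> commit force".)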

        # Verify all volume processes are online for 60 sec
        g.log.info("Verifying all volume processes are online")
        ret = wait_for_volume_process_to_be_online(self.mnode,
                                                   self.volname,
                                                   timeout=60)
        self.assertTrue(ret, ("Volume %s : all processes are not "
                              "online" % self.volname))
        g.log.info("Successfully verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on nodes "
                   "%s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or"
                              " more than One self heal daemon process"
                              " found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids_after_replacement = pids

        # Compare pids before and after replacing
        self.assertNotEqual(
            glustershd_pids, glustershd_pids_after_replacement,
            "Self Daemon process is same before and"
            " after replacing bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after replacing bricks")

        # get the bricks for the volume after replacing
        bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after expanding "
                   "volume: %s", bricks_list_after_replacing)

        # validate the bricks present in volume info
        # with glustershd server volume file after replacing bricks
        g.log.info("Starting parsing file %s", self.glustershd)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_replacing)

        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "replacing bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)
        g.log.info("Starting to delete replaced brick dir's")

        # Remove brick directories of the replaced bricks as this is not
        # handled by tearDown class
        for bricks in replaced_bricks:
            node, brick_path = bricks.split(r':')
            cmd = "rm -rf " + brick_path
            ret, _, _ = g.run(node, cmd)
            if ret:
                raise ExecutionError("Failed to delete the brick dir's for"
                                     " %s and brick %s" % (node, brick_path))
            g.log.info("Successfully deleted brick dir's for replaced bricks")
Example #5
    def test_glustershd_with_add_remove_brick(self):
        """
        Test script to verify glustershd process with adding and
        removing bricks

        * check glustershd process - only 1 glustershd process should
          be running
        * bricks must be present in the glustershd-server.vol file for
          the replicate volumes involved
        * Add bricks
        * check glustershd process - only 1 glustershd process should
          be running and it should be different from the previous one
        * bricks which are added must be present in the
          glustershd-server.vol file
        * remove bricks
        * check glustershd process - only 1 glustershd process should
          be running and it should be different from the previous one
        * bricks which are removed should not be present
          in the glustershd-server.vol file

        """
        # pylint: disable=too-many-statements
        nodes = self.volume['servers']
        bricks_list = []
        glustershd_pids = {}

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s", pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s", self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s", bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s", self.glustershd, self.mnode)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file. "
                              "Please check log file for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # expanding volume
        g.log.info("Start adding bricks to volume %s", self.volname)
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to add bricks to "
                              "volume %s " % self.volname))
        g.log.info("Add brick successful")

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed "
                              "on volume %s", self.volname))
        g.log.info("Successful in logging volume info and status "
                   "of volume %s", self.volname)

        # Verify all volume processes are online for 60 sec
        g.log.info("Verifying all volume processes are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : all processes are not "
                              "online" % self.volname))
        g.log.info("Successfully verified volume %s processes are online",
                   self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, err = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on "
                                  "the volume %s with error %s" %
                                  (self.volname, err)))
        g.log.info("Successfully started rebalance on the "
                   "volume %s", self.volname)

        # Log Rebalance status
        g.log.info("Log Rebalance status")
        _, _, _ = rebalance_status(self.mnode, self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, ("Rebalance is not yet complete "
                              "on the volume %s", self.volname))
        g.log.info("Rebalance is successfully complete on "
                   "the volume %s", self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to get rebalance status for "
                                  "the volume %s", self.volname))
        g.log.info("Successfully got rebalance status of the "
                   "volume %s", self.volname)

        # Check the self-heal daemon process after adding bricks
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))
        g.log.info("Successful in getting self-heal daemon process "
                   "on nodes %s", nodes)

        glustershd_pids_after_expanding = pids
        g.log.info("Self Heal Daemon Process ID's after expanding "
                   "volume: %s", glustershd_pids_after_expanding)

        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_expanding,
                            "Self Daemon process is same before and"
                            " after adding bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after adding bricks")

        # get the bricks for the volume after expanding
        bricks_list_after_expanding = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after expanding "
                   "volume: %s", bricks_list_after_expanding)

        # validate the bricks present in volume info
        # with glustershd server volume file after adding bricks
        g.log.info("Starting parsing file %s", self.glustershd)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_expanding)

        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "expanding bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)

        # shrink the volume
        g.log.info("Starting volume shrink")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to shrink the volume on "
                              "volume %s", self.volname))
        g.log.info("Shrinking volume is successful on "
                   "volume %s", self.volname)

        # Log Volume Info and Status after shrinking the volume
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, ("Logging volume info and status failed on "
                              "volume %s", self.volname))
        g.log.info("Successful in logging volume info and status "
                   "of volume %s", self.volname)

        # get the bricks after shrinking the volume
        bricks_list_after_shrinking = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after shrinking "
                   "volume: %s", bricks_list_after_shrinking)

        self.assertEqual(len(bricks_list_after_shrinking), len(bricks_list),
                         "Brick Count is mismatched after "
                         "shrinking the volume %s" % self.volname)
        g.log.info("Brick Count matched before before expanding "
                   "and after shrinking volume")

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))

        # check the self-heal daemon process after removing bricks
        g.log.info("Starting to get self-heal daemon process "
                   "on nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))
        glustershd_pids_after_shrinking = pids
        self.assertNotEqual(glustershd_pids_after_expanding,
                            glustershd_pids_after_shrinking,
                            "Self Heal Daemon process is same "
                            "after adding bricks and shrinking volume")
        g.log.info("Self Heal Daemon Process is different after adding bricks "
                   "and shrinking volume")

        # validate bricks present in volume info
        # with glustershd server volume file after removing bricks
        g.log.info("Starting parsing file %s", self.glustershd)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_shrinking)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "removing bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file", self.glustershd)
    def test_impact_of_replace_brick_on_glustershd(self):
        """
        Test Script to verify the glustershd server vol file
        has only entries for replicate volumes
        1.Create multiple volumes and start all volumes
        2.Check the glustershd processes - Only 1 glustershd should be listed
        3.Do replace brick on the replicate volume
        4.Confirm that the brick is replaced
        5.Check the glustershd processes - Only 1 glustershd should be listed
                                           and pid should be different
        6.glustershd server vol should be updated with new bricks
        """
        # Check the self-heal daemon process
        ret, glustershd_pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process "
                              "found : %s" % glustershd_pids))
        g.log.info(
            "Successful in getting single self heal daemon process"
            " on all nodes %s", self.servers)

        volume_list = get_volume_list(self.mnode)
        for volume in volume_list:

            # Log Volume Info and Status before replacing brick
            ret = log_volume_info_and_status(self.mnode, volume)
            self.assertTrue(ret, ("Logging volume info and status "
                                  "failed on volume %s", volume))
            g.log.info(
                "Successful in logging volume info and status "
                "of volume %s", volume)

            # Selecting a random source brick to replace
            src_brick = choice(get_all_bricks(self.mnode, volume))
            src_node, original_brick = src_brick.split(":")

            # Create a random destination brick on the same node, always
            # picking a base brick directory different from the one hosting
            # the original brick
            list_of_bricks = [
                brick for brick in get_servers_bricks_dict(
                    src_node, self.all_servers_info)[src_node]
                if brick not in original_brick
            ]
            dst_brick = ('{}:{}/{}_replaced'.format(
                src_node, choice(list_of_bricks),
                original_brick.split('/')[::-1][0]))
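            # e.g. src "server1:/bricks/brick0/testvol" might become
            # dst "server1:/bricks/brick3/testvol_replaced"
            # (illustrative, hypothetical values)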

            # Replace brick for the volume
            ret, _, _ = replace_brick(self.mnode, volume, src_brick, dst_brick)
            self.assertFalse(
                ret, "Failed to replace brick "
                "from the volume %s" % volume)
            g.log.info(
                "Successfully replaced faulty brick from "
                "the volume %s", volume)

            # Verify all volume processes are online
            ret = wait_for_volume_process_to_be_online(self.mnode, volume)
            self.assertTrue(ret,
                            "Volume %s : All process are not online" % volume)
            g.log.info("Volume %s : All process are online", volume)

            # Check the self-heal daemon process after replacing brick
            ret, pid_after_replace = get_self_heal_daemon_pid(self.servers)
            self.assertTrue(
                ret, "Either no self heal daemon process "
                "found or more than one self heal "
                "daemon process found : %s" % pid_after_replace)
            g.log.info(
                "Successful in getting Single self heal "
                " daemon process on all nodes %s", self.servers)

            # Compare the glustershd pids
            self.assertNotEqual(
                glustershd_pids, pid_after_replace,
                "Self heal daemon process should be different "
                "after replacing bricks in %s volume" % volume)
            g.log.info("EXPECTED: Self heal daemon process should be different"
                       " after replacing bricks in replicate volume")

            # Get the bricks for the volume
            bricks_list = get_all_bricks(self.mnode, volume)
            g.log.info("Brick List : %s", bricks_list)

            # Validate the bricks present in volume info with
            # glustershd server volume file
            ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                                 bricks_list)
            self.assertTrue(ret, ("Brick List from volume info is "
                                  "different from glustershd server "
                                  "volume file. Please check log file "
                                  "for details"))
            g.log.info(
                "Bricks in volume %s exists in glustershd server "
                "volume file", volume)
Example #7
    def test_existing_glustershd_should_take_care_of_self_healing(self):
        """
        Test script which verifies that the existing glustershd takes
        care of self-healing

        * Create and start the Replicate volume
        * Check the glustershd processes - note the pids
        * Bring down one brick (say brick1) without affecting
          the cluster
        * Create 1000 files on the volume
        * Bring up brick1, which was killed in the previous steps
        * Check the heal info - proactive self-healing should start
        * Bring down brick1 again
        * Wait for 60 sec and bring brick1 back up
        * Check the glustershd processes - pids should be different
        * Monitor the heal till it completes

        """
        # pylint: disable=too-many-locals,too-many-lines,too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids = pids

        # select the bricks to bring offline
        g.log.info("Selecting bricks to brought offline for volume %s",
                   self.volname)
        bricks_to_bring_offline = \
            select_volume_bricks_to_bring_offline(self.mnode,
                                                  self.volname)
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # get the bricks which are running
        g.log.info("getting the brick list which are online")
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # write 1000 files of 1 MB each on the first mount
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mounts)
        all_mounts_procs = []
        cmd = ("for i in `seq 1 1000`; "
               "do dd if=/dev/urandom of=%s/file_$i "
               "bs=1M count=1; "
               "done" % self.mounts[0].mountpoint)
        g.log.info(cmd)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s", self.volname)
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname, heal_info)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Wait for 90 sec to start self healing
        g.log.info('Waiting for 90 sec to start self healing')
        time.sleep(90)

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s", self.volname)
        heal_info_after_brick_online = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname,
                   heal_info_after_brick_online)

        # check heal pending is decreased
        flag = False
        for brick in online_bricks:
            if int(heal_info_after_brick_online[brick]['numberOfEntries'])\
                    < int(heal_info[brick]['numberOfEntries']):
                flag = True
                break

        self.assertTrue(flag, "Pro-active self heal has not started")
        g.log.info("Pro-active self heal has started")

        # bring down bricks again
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # wait for 60 sec and bring up the brick again
        g.log.info('Waiting for 60 sec before bringing up the brick again')
        time.sleep(60)
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        shd_pids_after_bricks_online = pids

        # compare the glustershd pids
        self.assertNotEqual(glustershd_pids, shd_pids_after_bricks_online,
                            ("self heal daemon pids are the same before and "
                             "after bringing the bricks online"))
        g.log.info("EXPECTED : self heal daemon pids are different before "
                   "and after bringing the bricks online")

        # wait for heal to complete
        g.log.info("Monitoring the heal.....")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret,
                        ("Heal is not completed on volume %s" % self.volname))
        g.log.info("Heal Completed on volume %s", self.volname)

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')
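
# Hedged helper sketching the "pending heal count dropped" check used in
# the test above; it assumes get_heal_info_summary returns a
# {brick: {'numberOfEntries': str}} mapping, as the loop over online_bricks
# suggests.
def heal_started(info_before, info_after, bricks):
    """True if the pending-entry count decreased on any brick."""
    return any(int(info_after[b]['numberOfEntries']) <
               int(info_before[b]['numberOfEntries']) for b in bricks)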
Example #8
    def test_glustershd_on_newly_probed_server(self):
        """
        Test script to verify glustershd process on newly probed server

        * check glustershd process - only 1 glustershd process should
          be running
        * Add new node to cluster
        * check glustershd process - only 1 glustershd process should
          be running on all servers including the newly probed server
        * stop the volume
        * add another node to cluster
        * check glustershd process - glustershd process shouldn't be running
          on servers including newly probed server
        * start the volume
        * check glustershd process - only 1 glustershd process should
          be running on all servers including the newly probed server

        """
        # pylint: disable=too-many-statements

        nodes = self.volume['servers'][:-2]

        # check the self-heal daemon process
        g.log.info("Starting to get self heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting single self heal daemon process"
            " on all nodes %s", nodes)

        # Add new node to the cluster
        g.log.info("Peer probe for %s", self.extra_servers[0])
        ret = peer_probe_servers(self.mnode, self.extra_servers[0])
        self.assertTrue(
            ret, "Failed to peer probe server : %s" % self.extra_servers[0])
        g.log.info(
            "Peer probe success for %s and all peers are in "
            "connected state", self.extra_servers[0])
        nodes.append(self.extra_servers[0])

        # check the self-heal daemon process and it should be running on
        # newly probed servers
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting single self heal daemon process"
            " on all nodes %s", nodes)

        # stop the volume
        g.log.info("Stopping the volume %s", self.volname)
        ret = volume_stop(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to stop volume %s" % self.volname))
        g.log.info("Successfully stopped volume %s", self.volname)

        # Add another new node to the cluster
        g.log.info("peer probe for %s", self.extra_servers[1])
        ret = peer_probe_servers(self.mnode, self.extra_servers[1])
        self.assertTrue(
            ret, "Failed to peer probe server : %s" % self.extra_servers[1])
        g.log.info(
            "Peer probe success for %s and all peers are in "
            "connected state", self.extra_servers[1])
        nodes.append(self.extra_servers[1])

        # check the self-heal daemon process after stopping volume and
        # no self heal daemon should be running including newly probed node
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertFalse(ret, ("Self Heal Daemon process is running even "
                               "after stopping volume %s" % self.volname))
        for node in pids:
            self.assertEqual(pids[node][0], -1, ("Self Heal Daemon is still "
                                                 "running on node %s even "
                                                 "after stopping all "
                                                 "volumes" % node))
        g.log.info("Expected : No self heal daemon process is running "
                   "after stopping all volumes")

        # start the volume
        g.log.info("Starting volume %s", self.volname)
        ret = volume_start(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to start volume  %s" % self.volname))
        g.log.info("Volume %s started successfully", self.volname)

        # Verify all volume processes are online (wait up to 60 sec)
        g.log.info("Verifying all volume processes are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   60)
        self.assertTrue(ret, ("Volume %s : All processes are not "
                              "online" % self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        g.log.info("Verifying self heal daemon process is daemonized")
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, "Self heal daemon process has not released "
                             "its parent process on one or more nodes")

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than one self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting single self heal daemon process"
            " on all nodes %s", nodes)

        # detach extra servers from the cluster
        g.log.info("peer detaching extra servers %s from cluster",
                   self.extra_servers)
        ret = peer_detach_servers(self.mnode, self.extra_servers)
        self.assertTrue(
            ret,
            "Failed to peer detach extra servers : %s" % self.extra_servers)
        g.log.info("Peer detach success for %s ", self.extra_servers)
    def test_no_glustershd_with_distribute(self):
        """
        Test Script to verify the glustershd server vol file
        has only entries for replicate volumes

        * Create multiple volumes and start all volumes
        * Check the glustershd processes - Only 1 glustershd should be listed
        * Stop all volumes
        * Check the glustershd processes - No glustershd should be running
        * Start the distribute volume only
        * Check the glustershd processes - No glustershd should be running

        """

        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either no self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting single self heal daemon process"
            " on all nodes %s", nodes)

        # stop all the volumes
        g.log.info("Going to stop all the volumes")
        volume_list = get_volume_list(self.mnode)
        for volume in volume_list:
            g.log.info("Stopping Volume : %s", volume)
            ret = volume_stop(self.mnode, volume)
            self.assertTrue(ret, ("Failed to stop volume %s" % volume))
            g.log.info("Successfully stopped volume %s", volume)
        g.log.info("Successfully stopped all the volumes")

        # check the self-heal daemon process after stopping all volumes
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertFalse(ret, ("Self heal daemon process is still running "
                               "after stopping all volumes "))
        for node in pids:
            self.assertEqual(pids[node][0], -1, ("Self heal daemon is still "
                                                 "running on node %s even "
                                                 "after stoppong all "
                                                 "volumes" % node))
        g.log.info("EXPECTED: No self heal daemon process is "
                   "running after stopping all volumes")

        # start the distribute volume only
        for volume in volume_list:
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])
            if volume_type == 'Distribute':
                g.log.info("starting to start distribute volume: %s", volume)
                ret = volume_start(self.mnode, volume)
                self.assertTrue(ret, ("Failed to start volume %s" % volume))
                g.log.info("Successfully started volume %s", volume)
                break

        # check the self-heal daemon process after starting distribute volume
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertFalse(ret, ("Self heal daemon process is still running "
                               "after stopping all volumes "))
        for node in pids:
            self.assertEqual(pids[node][0], -1, ("Self heal daemon is still "
                                                 "running on node %s even "
                                                 "after stopping all "
                                                 "volumes" % node))
        g.log.info("EXPECTED: No self heal daemon process is running "
                   "after stopping all volumes")
    def test_impact_of_replace_brick_for_glustershd(self):
        """
        Test Script to verify the glustershd process after replacing a brick

        * Check the glustershd processes - Only 1 glustershd should be listed
        * Replace a brick on the volume
        * Check the glustershd processes - the PIDs must differ from before
        * Validate the new brick against the glustershd server volume file
        """
        nodes = self.volume['servers']

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s" % nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids = pids

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s" % self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s" % bricks_list)

        # validate the bricks present in volume info with
        # glustershd server volume file
        g.log.info("Starting parsing file %s on "
                   "node %s" % (self.GLUSTERSHD, self.mnode))
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list)
        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file. "
                              "Please check log file for details"))
        g.log.info("Successfully parsed %s file" % self.GLUSTERSHD)

        # replace brick
        brick_to_replace = bricks_list[-1]
        new_brick = brick_to_replace + 'new'
        g.log.info("Replacing the brick %s for the volume : %s"
                   % (brick_to_replace, self.volname))
        ret, out, err = replace_brick(self.mnode, self.volname,
                                      brick_to_replace, new_brick)
        self.assertFalse(ret, err)
        g.log.info('Replaced brick %s with %s successfully',
                   brick_to_replace, new_brick)

        # check bricks
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertEqual(bricks_list[-1], new_brick, 'Replaced brick and '
                                                     'new brick are not equal')

        # Verify all volume processes are online (wait up to 60 sec)
        g.log.info("Verifying all volume processes are online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname,
                                                   timeout=60)
        self.assertTrue(ret, ("Volume %s : All processes are not "
                              "online" % self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Verify glustershd process releases its parent process
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s" % nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info("Successful in getting Single self heal daemon process"
                   " on all nodes %s", nodes)
        glustershd_pids_after_replacement = pids

        # Compare pids before and after replacing
        self.assertNotEqual(glustershd_pids,
                            glustershd_pids_after_replacement,
                            "Self Daemon process is same before and"
                            " after replacing bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after replacing bricks")

        # get the bricks for the volume after replacing
        bricks_list_after_replacing = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List after expanding "
                   "volume: %s" % bricks_list_after_replacing)

        # validate the bricks present in volume info
        # with glustershd server volume file after replacing bricks
        g.log.info("Starting parsing file %s" % self.GLUSTERSHD)
        ret = do_bricks_exist_in_shd_volfile(self.mnode, self.volname,
                                             bricks_list_after_replacing)

        self.assertTrue(ret, ("Brick List from volume info is different "
                              "from glustershd server volume file after "
                              "replacing bricks. Please check log file "
                              "for details"))
        g.log.info("Successfully parsed %s file" % self.GLUSTERSHD)