def test_rebalance_while_remove_brick_in_progress(self):
        """Verify rebalance is rejected while remove-brick is in progress.

        Steps:
        - Validate that the directory layout is complete on every mount
          (test data is assumed to exist already; it is not created here).
        - Start removing the selected bricks from the volume:
            gluster volume remove-brick <vol> <brick> start
        - Immediately try to start rebalance on the same volume:
            gluster volume rebalance <vol> start
        - Expect the rebalance command to report that the remove-brick
          task must be committed or stopped first.
        """
        # pylint: disable=too-many-statements
        # DHT Layout validation
        for mount in self.mounts:
            g.log.debug('Check DHT values %s:%s', mount.client_system,
                        mount.mountpoint)
            ret = validate_files_in_dir(self.clients[0], mount.mountpoint,
                                        test_type=LAYOUT_IS_COMPLETE,
                                        file_type=FILETYPE_DIRS)
            self.assertTrue(ret, "TEST_LAYOUT_IS_COMPLETE: FAILED")
            g.log.info("TEST_LAYOUT_IS_COMPLETE: PASS")

        # Log Volume Info and Status before shrinking the volume.
        g.log.info("Logging volume info and Status before shrinking volume")
        log_volume_info_and_status(self.mnode, self.volname)
        g.log.info("Successful in logging volume info and status of volume "
                   "%s", self.volname)

        # Form bricks list for Shrinking volume
        self.remove_brick_list = form_bricks_list_to_remove_brick(
            self.mnode, self.volname, subvol_name=1)
        self.assertIsNotNone(
            self.remove_brick_list,
            "Volume %s: Failed to form bricks list for shrink"
            % self.volname)
        g.log.info("Volume %s: Formed bricks list for shrink", self.volname)

        # Shrink volume by removing bricks with option start
        g.log.info("Start removing bricks for %s", self.volname)
        ret, _, _ = remove_brick(self.mnode, self.volname,
                                 self.remove_brick_list, "start")
        self.assertEqual(
            ret, 0, "Volume %s: Remove-brick start failed" % self.volname)
        g.log.info("Volume %s: Remove-brick start success ", self.volname)

        # Log remove-brick status
        g.log.info("Logging Remove-brick status")
        ret, out, _ = remove_brick(self.mnode, self.volname,
                                   self.remove_brick_list, "status")
        self.assertEqual(
            ret, 0, "Volume %s: Remove-brick status failed" % self.volname)
        g.log.info("Volume %s: Remove-brick status", self.volname)
        g.log.info(out)

        # Start rebalance while volume shrink in-progress. Only stderr is
        # inspected: the command is expected to be refused.
        g.log.info("Volume %s: Start rebalance while volume shrink is "
                   "in-progress", self.volname)
        _, _, err = rebalance_start(self.mnode, self.volname)
        self.assertIn("Either commit or stop the remove-brick task.", err,
                      "Rebalance started successfully while volume shrink"
                      " is in-progress")
        g.log.info("Failed to start rebalance while volume shrink is "
                   "in progress <EXPECTED>")
示例#2
0
    def test_shrinking_volume_when_io_in_progress(self):
        """Test shrinking volume (Decrease distribute count) using existing
        servers bricks when IO is in progress.

        Description:
            - log volume info and status
            - shrink the volume via shrink_volume()
            - wait for and verify that all volume processes are online
            - validate the IO processes stored in self.all_mounts_procs
            - list all files and directories on the mounts
        """
        # Log Volume Info and Status before shrinking the volume.
        g.log.info("Logging volume info and Status before shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Shrinking volume by removing bricks from volume when IO in progress
        g.log.info("Start removing bricks from volume when IO in progress")
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to shrink the volume when IO in "
                             "progress on volume %s" % self.volname)
        g.log.info("Shrinking volume when IO in progress is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to wait for volume %s processes to "
                             "be online" % self.volname)
        g.log.info("Successful in waiting for volume %s processes to be "
                   "online", self.volname)

        # Log Volume Info and Status after shrinking the volume
        g.log.info("Logging volume info and Status after shrinking volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online after "
                   "shrinking volume")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online after shrinking volume",
                   self.volname)

        # Validate IO. The flag is set before asserting so that it is
        # recorded even when the assertion below fails.
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
示例#3
0
    def test_induce_holes_thenfixlayout(self):
        """Induce layout holes with a forced remove-brick, then fix them.

        Steps:
            - create 'testdir' on the first mount and validate that the
              directory layout is complete
            - remove the bricks in self.remove_brick_list with 'force'
              (the list is expected to be prepared outside this method)
            - check that the layout of '/testdir' is no longer complete
            - run rebalance fix-layout and wait for it to complete
            - validate that the directory layout is complete again
        """
        # pylint: disable=too-many-statements
        m_point = self.mounts[0].mountpoint
        command = 'mkdir -p ' + m_point + '/testdir'
        ret, _, _ = g.run(self.clients[0], command)
        self.assertEqual(ret, 0, "mkdir failed")
        g.log.info("mkdir is successful")

        # DHT Layout validation
        g.log.debug("Verifying hash layout values %s:%s", self.clients[0],
                    self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0],
                                    self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")

        # Log Volume Info and Status before shrinking the volume.
        g.log.info("Logging volume info and Status before shrinking volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Shrinking volume by removing bricks
        g.log.info("Start removing bricks from volume")
        ret, _, _ = remove_brick(self.mnode, self.volname,
                                 self.remove_brick_list, "force")
        self.assertEqual(ret, 0, "Remove-brick with force: FAIL")
        g.log.info("Remove-brick with force: PASS")

        # Check the layout: after the forced removal it must have holes
        ret = is_layout_complete(self.mnode, self.volname, dirpath='/testdir')
        self.assertFalse(ret, "Volume %s: Layout is complete" % self.volname)
        g.log.info("Volume %s: Layout has some holes", self.volname)

        # Start Rebalance fix-layout
        g.log.info("Volume %s: Start fix-layout", self.volname)
        ret, _, _ = rebalance_start(self.mnode, self.volname, fix_layout=True)
        self.assertEqual(
            ret, 0, "Volume %s: fix-layout start failed" % self.volname)
        g.log.info("Volume %s: fix-layout start success", self.volname)

        # Wait for fix-layout to complete
        g.log.info("Waiting for fix-layout to complete")
        ret = wait_for_fix_layout_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Volume %s: Fix-layout is either failed or "
                             "in-progress" % self.volname)
        g.log.info("Volume %s: Fix-layout completed successfully",
                   self.volname)

        # DHT Layout validation
        g.log.debug("Verifying hash layout values %s:%s", self.clients[0],
                    self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0],
                                    self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")
示例#4
0
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - replace_brick
            - wait for volume processes to be online and verify them
            - wait for heal to complete (up to 1800 seconds)
            - validate IO
            - list all files and directories on the mounts
        """
        # Log Volume Info and Status before replacing brick from the volume.
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to wait for volume %s processes to "
                             "be online" % self.volname)

        # Log Volume Info and Status after replacing the brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)

        # Wait for self-heal to complete
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=1800)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 30 minutes. 30 minutes is too much a time for "
            "current test workload")

        # Validate IO. The flag is set before asserting so that it is
        # recorded even when the assertion below fails.
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")

        # List all files and dirs created
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
示例#5
0
    def test_add_brick_while_remove_brick_is_in_progress(self):
        """Verify add-brick is rejected while remove-brick is in progress.

        Steps:
            - validate that the directory layout is complete
            - start remove-brick on the selected bricks and log its status
            - try to add the bricks in self.add_brick_list (expected to be
              prepared outside this method) and expect the command to fail
              with a "rebalance is in progress" error
            - remove the backend directories of the bricks that were
              offered to add-brick
        """
        # DHT Layout and hash validation
        g.log.debug("Verifying hash layout values %s:%s", self.clients[0],
                    self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0],
                                    self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")

        # Log Volume Info and Status before shrinking the volume.
        g.log.info("Logging volume info and Status before shrinking volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Form bricks list for volume shrink
        self.remove_brick_list = form_bricks_list_to_remove_brick(
            self.mnode, self.volname, subvol_name=1)
        self.assertIsNotNone(
            self.remove_brick_list,
            "Volume %s: Failed to form bricks list for shrink"
            % self.volname)
        g.log.info("Volume %s: Formed bricks list for shrink", self.volname)

        # Shrink volume by removing bricks
        g.log.info("Start removing bricks from volume")
        ret, _, _ = remove_brick(self.mnode, self.volname,
                                 self.remove_brick_list, "start")
        self.assertEqual(ret, 0, "Volume %s shrink failed " % self.volname)
        g.log.info("Volume %s shrink started ", self.volname)

        # Log remove-brick status
        g.log.info("Logging Remove-brick status")
        ret, out, _ = remove_brick(self.mnode, self.volname,
                                   self.remove_brick_list, "status")
        self.assertEqual(
            ret, 0, "Remove-brick status failed on %s " % self.volname)
        g.log.info("Remove-brick status %s", self.volname)
        g.log.info(out)

        # Expanding volume while volume shrink is in-progress. Only stderr
        # is inspected: the command is expected to be refused.
        g.log.info("Volume %s: Expand volume while volume shrink in-progress",
                   self.volname)
        _, _, err = add_brick(self.mnode, self.volname, self.add_brick_list)
        self.assertIn(
            "rebalance is in progress", err,
            "Successfully added bricks to the volume <NOT EXPECTED>")
        g.log.info(
            "Volume %s: Failed to add-bricks while volume shrink "
            "in-progress <EXPECTED>", self.volname)

        # cleanup add-bricks list
        for brick in self.add_brick_list:
            brick_node, brick_path = brick.split(":")
            # The command must be a single formatted string; passing a
            # (format, arg) tuple would not run the intended command.
            ret, _, _ = g.run(brick_node, "rm -rf %s" % brick_path)
            if ret != 0:
                g.log.error("Failed to clean %s:%s", brick_node, brick_path)
        g.log.info("Successfully cleaned backend add-brick bricks list")
示例#6
0
    def test_volume_create_start_stop_start(self):
        """Tests volume create, start, status, stop, start.
        Also Validates whether all the brick process are running after the
        start of the volume.

        The volume itself is expected to exist already; this method stops
        it with force, starts it again, and then checks the volume
        processes and glusterd.
        """
        # Verify volume processes are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online" %
                              self.volname))
        g.log.info("Successfully Verified volume %s processes are online",
                   self.volname)

        # Stop Volume
        ret, _, _ = volume_stop(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Failed to stop volume %s" % self.volname)
        g.log.info("Successfully stopped volume %s", self.volname)

        # Start Volume
        ret, _, _ = volume_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start volume %s" % self.volname)
        g.log.info("Successfully started volume %s", self.volname)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to wait for volume %s processes to "
                             "be online" % self.volname)

        # Log Volume Info and Status
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Failed to Log volume %s info and status" % self.volname)
        g.log.info("Successfully logged Volume %s Info and Status",
                   self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Volume %s : All process are not online" %
                              self.volname))
        g.log.info("Successfully verified volume %s processes are online",
                   self.volname)

        # Log Volume Info and Status
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(
            ret, "Failed to Log volume %s info and status" % self.volname)
        g.log.info("Successfully logged Volume %s Info and Status",
                   self.volname)

        # Check if glusterd is running on all servers(expected: active)
        ret = is_glusterd_running(self.servers)
        self.assertEqual(ret, 0, "Glusterd is not running on all servers")
        g.log.info("Glusterd is running on all the servers")
示例#7
0
    def test_nfs_ganesha_remove_brick(self):
        """
        Verify remove brick operation while IO is running
        Steps:
        1. Start IO on mount points
        2. Get stat of the files/dirs on the mounts
        3. Perform remove brick operation
        4. Wait for volume processes to be online, log volume info/status
        5. Validate IOs and get stat of the files/dirs again
        """
        # pylint: disable=too-many-statements
        # Start IO on all mount points. The directory start number is
        # advanced by 10 for every mount so each run uses its own range.
        all_mounts_procs, count = [], 1
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (self.script_upload_path, count,
                                            mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count += 10

        # Get stat of all the files/dirs created.
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Perform remove brick operation
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(
            ret, "Remove brick operation failed on %s" % self.volname)
        g.log.info("Remove brick operation is successful on "
                   "volume %s", self.volname)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "All volume %s processes failed to come up "
                             "online" % self.volname)
        g.log.info("All volume %s processes came up "
                   "online successfully after remove brick operation",
                   self.volname)

        # Log volume info and status after performing remove brick
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        ret = get_mounts_stat(self.mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")
示例#8
0
    def cleanup_volume(cls):
        """Clean up the volume named by ``cls.volname``.

        Delegates to the module-level ``cleanup_volume`` helper, logs the
        outcome, and then logs volume info and status regardless of that
        outcome.

        Returns (bool): the result reported by the cleanup helper, i.e.
            True if cleanup volume is successful. False otherwise.
        """
        volname = cls.volname
        g.log.info("Cleanup Volume %s", volname)

        cleaned = cleanup_volume(mnode=cls.mnode, volname=volname)
        if cleaned:
            g.log.info("Successfully cleaned-up volume %s", volname)
        else:
            g.log.error("cleanup of volume %s failed", volname)

        # Volume info/status is logged whether or not the cleanup worked.
        g.log.info("Log Volume %s Info and Status", volname)
        log_volume_info_and_status(cls.mnode, volname)

        return cleaned
示例#9
0
    def test_non_existent_dir(self):
        """Verify a quota limit cannot be set on a non-existent path.

        Steps:
            - log volume info and status
            - enable quota on the volume
            - try to set a 1GB usage limit on '/foo' and expect the error
              output to contain "No such file or directory"
        """
        # Displaying volume status and info
        g.log.info("Logging volume information and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status "
                             "failed on volume %s" % self.volname)

        # Enable Quota
        g.log.info("Enabling quota on the volume %s", self.volname)
        ret, _, _ = enable_quota(self.mnode, self.volname)
        self.assertEqual(
            ret, 0, "Failed to enable quota on the volume %s" % self.volname)
        g.log.info("Successfully enabled quota on the volume %s", self.volname)

        # Non existent path to set quota limit
        path = "/foo"

        # Set Quota limit on the non-existent path; only stderr is
        # inspected because the command is expected to fail.
        g.log.info("Set Quota Limit on the path %s of the volume %s", path,
                   self.volname)
        _, _, err = set_quota_limit_usage(self.mnode,
                                          self.volname,
                                          path=path,
                                          limit="1GB")
        self.assertIn("No such file or directory", err, "Quota limit set "
                      "on path /foo which does not exist")
    def test_volume_create_start_stop_start(self):
        """Stop and restart the volume, then check that everything is up.

        Sequence: verify all volume processes are online, force-stop the
        volume, start it, pause for 15 seconds, log volume info/status,
        verify all volume processes again, log info/status once more and
        finally check that glusterd is running on all servers.
        """
        mnode = self.mnode
        volname = self.volname

        def assert_all_processes_online():
            # Verify volume's all process are online
            online = verify_all_process_of_volume_are_online(mnode, volname)
            self.assertTrue(
                online, "Volume %s : All process are not online" % volname)

        def assert_info_and_status_logged():
            # Log Volume Info and Status
            logged = log_volume_info_and_status(mnode, volname)
            self.assertTrue(
                logged, "Logging volume %s info and status failed" % volname)

        assert_all_processes_online()

        # Stop Volume
        stop_rc, _, _ = volume_stop(mnode, volname, force=True)
        self.assertEqual(stop_rc, 0, "Failed to stop volume %s" % volname)

        # Start Volume
        start_rc, _, _ = volume_start(mnode, volname)
        self.assertEqual(start_rc, 0, "Unable to start volume %s" % volname)

        # Fixed pause after starting the volume, before the checks below.
        time.sleep(15)

        assert_info_and_status_logged()
        assert_all_processes_online()
        assert_info_and_status_logged()

        # Verify all glusterd's are running
        glusterd_rc = is_glusterd_running(self.servers)
        self.assertEqual(
            glusterd_rc, 0,
            ("glusterd not running on all servers: %s" % self.servers))
    def cleanup_volume(cls):
        """Cleanup the volume and any additional LVs left behind.

        Steps:
            - call cls.bricks_online_and_volume_reset()
            - clean up the volume and log volume info and status
            - if the set of LVs on the servers differs from cls.lv_list,
              unmount the snapshot brick mounts under 'run/gluster/snaps'
              and remove the LVs that are not in cls.lv_list

        Returns (bool): True if cleanup volume is successful. False
            otherwise, including when unmounting the snapshot bricks
            fails on any server.
        """
        cls.bricks_online_and_volume_reset()
        g.log.info("Cleanup Volume %s", cls.volname)
        # Keep the volume cleanup result in its own name: 'ret' is reused
        # for command exit codes below and must not mask a failed cleanup.
        cleanup_ok = cleanup_volume(mnode=cls.mnode, volname=cls.volname)
        if not cleanup_ok:
            g.log.error("cleanup of volume %s failed", cls.volname)
        else:
            g.log.info("Successfully cleaned-up volume %s", cls.volname)

        # Log Volume Info and Status
        g.log.info("Log Volume %s Info and Status", cls.volname)
        log_volume_info_and_status(cls.mnode, cls.volname)

        # compare and remove additional lv created, skip otherwise
        new_lv_list = cls.get_unique_lv_list_from_all_servers()
        if cls.lv_list != new_lv_list:
            cmd = ("for mnt in `mount | grep 'run/gluster/snaps' |"
                   "awk '{print $3}'`; do umount $mnt; done")
            for server in cls.servers:
                ret, _, err = g.run(server, cmd, "root")
                if ret:
                    g.log.error("Failed to remove snap "
                                "bricks from mountpoint %s", err)
                    return False
            new_lv_list = cls.get_unique_lv_list_from_all_servers()
            lv_remove_list = list(set(new_lv_list) - set(cls.lv_list))
            # Every extra LV is removed on every server. A failure here is
            # only logged and does not abort the cleanup.
            # NOTE(review): failures are presumably expected on servers
            # that do not host the LV - confirm.
            for server in cls.servers:
                for lv in lv_remove_list:
                    cmd = ("lvremove %s --force" % lv)
                    ret, _, err = g.run(server, cmd, "root")
                    if ret:
                        g.log.error("failed to remove lv: %s", err)
                    g.log.info("Expected error msg '%s'", err)

        if not cleanup_ok:
            return False
        g.log.info("Successfully cleaned-up volumes")
        return True
示例#12
0
    def setup_volume(cls, volume_create_force=False):
        """Setup the volume:
            - Validate that the peers are connected
            - Set up the volume described by cls.volume through the
              module-level setup_volume helper
            - Log volume info and status
        Args:
            volume_create_force(bool): True if create_volume should be
                executed with 'force' option. The 'force' option is also
                used when cls.volume_create_force is set.
        Returns (bool): True if all the steps mentioned in the descriptions
            passes. False otherwise.
        """
        # Honour both the explicit argument and the class-level setting.
        force_volume_create = bool(volume_create_force or
                                   cls.volume_create_force)

        # Validate peers before setting up volume
        g.log.info("Validate peers before setting up volume ")
        ret = cls.validate_peers_are_connected()
        if not ret:
            g.log.error("Failed to validate peers are in connected state "
                        "before setting up volume")
            return False
        g.log.info("Successfully validated peers are in connected state "
                   "before setting up volume")

        # Setup Volume
        g.log.info("Setting up volume %s", cls.volname)
        ret = setup_volume(mnode=cls.mnode,
                           all_servers_info=cls.all_servers_info,
                           volume_config=cls.volume, force=force_volume_create)
        if not ret:
            g.log.error("Failed to Setup volume %s", cls.volname)
            return False
        g.log.info("Successful in setting up volume %s", cls.volname)

        # ToDo : Wait for volume processes to be online

        # Log Volume Info and Status
        g.log.info("Log Volume %s Info and Status", cls.volname)
        ret = log_volume_info_and_status(cls.mnode, cls.volname)
        if not ret:
            g.log.error("Logging volume %s info and status failed",
                        cls.volname)
            return False
        g.log.info("Successful in logging volume %s info and status",
                   cls.volname)

        return True
示例#13
0
        def create_snap(value, volname, snap, clone, counter):
            """Create snapshots, validate the snapshot count, clone one.

            Args:
                value: iterable of suffixes; a snapshot named
                    "snap<suffix>" is created for each item.
                volname: volume to snapshot.
                snap: name of the snapshot to clone.
                clone: name of the clone volume to create and start.
                counter (int): number of lines expected in the snapshot
                    list output after the snapshots are created.

            Returns (int): 0 when counter is 40 (no clone is created in
                that case), otherwise counter + 10.
            """
            # Creating snapshots
            g.log.info("Starting to Create snapshot")
            for snap_count in value:
                ret, _, _ = snap_create(self.mnode, volname,
                                        "snap%s" % snap_count)
                self.assertEqual(ret, 0, ("Failed to create "
                                          "snapshot for volume %s" % volname))
                g.log.info(
                    "Snapshot snap%s created successfully"
                    " for volume %s", snap_count, volname)

            # Validate snapshot list
            g.log.info("Starting to list all snapshots")
            ret, out, _ = snap_list(self.mnode)
            self.assertEqual(
                ret, 0, ("Failed to list snapshot of volume %s" % volname))
            v_list = out.strip().split('\n')
            self.assertEqual(len(v_list), counter, "Failed to validate "
                             "all snapshots")
            g.log.info(
                "Snapshot listed and  Validated for volume %s"
                " successfully", volname)
            if counter == 40:
                return 0

            # Creating a Clone of snapshot:
            g.log.info("Starting to Clone Snapshot")
            ret, _, _ = snap_clone(self.mnode, snap, clone)
            self.assertEqual(ret, 0, "Failed to clone %s" % clone)
            g.log.info("Clone volume %s created successfully", clone)

            # Start cloned volumes
            g.log.info("starting to Validate clone volumes are started")
            ret, _, _ = volume_start(self.mnode, clone)
            self.assertEqual(ret, 0, "Failed to start %s" % clone)
            g.log.info("%s started successfully", clone)

            # log Cloned Volume information. The result itself must be
            # asserted; asserting only the message string always passes.
            g.log.info("Logging Volume info and Volume status")
            ret = log_volume_info_and_status(self.mnode, clone)
            self.assertTrue(
                ret, "Failed to Log Info and Status of Volume %s" % clone)
            g.log.info("Successfully Logged Info and Status")
            return counter + 10
示例#14
0
    def test_disperse_vol(self):
        """Bring two bricks offline and online again, then verify them.

        Steps:
            - bring the first two bricks of the volume offline
            - bring the same bricks back online
            - verify that all bricks of the volume are online
            - log volume info and status
        """
        bricks_list = get_all_bricks(self.mnode, self.volname)

        ret = bring_bricks_offline(self.volname, bricks_list[0:2])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:2])
        self.assertTrue(ret, "Failed to bring up the bricks")
        g.log.info("Successfully brought the bricks up")

        # Verifying all bricks online
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "All bricks are not online")

        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed "
                             "on volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and status "
            "of volume %s", self.volname)
    def test_subdir_with_addbrick(self):
        """
        Mount the volume
        Create 2 subdir on mount point, subdir1 and subdir2
        Auth allow - Client1(subdir1,subdir2),Client2(subdir1,subdir2)
        Mount the subdir1 on client 1 and subdir2 on client2
        Start IO's on both subdirs
        Perform add-brick and rebalance
        Validate that both subdirs are still mounted
        """
        # pylint: disable=too-many-statements

        # Create directories subdir1 and subdir2 on mount point
        for subdir in ("subdir1", "subdir2"):
            ret = mkdir(self.mounts[0].client_system,
                        "%s/%s" % (self.mounts[0].mountpoint, subdir))
            self.assertTrue(
                ret, "Failed to create directory '%s' on "
                "volume %s from client %s" %
                (subdir, self.mounts[0].volname,
                 self.mounts[0].client_system))

        # Unmount volume
        ret = self.unmount_volume(self.mounts)
        self.assertTrue(ret, "Volumes Unmount failed")
        g.log.info("Volumes Unmounted successfully")

        # Set authentication on the subdirectory subdir1
        # and subdir2 to access by 2 clients
        g.log.info(
            'Setting authentication on subdir1 and subdir2 '
            'for client %s and %s', self.clients[0], self.clients[1])
        ret = set_auth_allow(
            self.volname, self.mnode, {
                '/subdir1': [self.clients[0], self.clients[1]],
                '/subdir2': [self.clients[0], self.clients[1]]
            })
        self.assertTrue(
            ret, 'Failed to set Authentication on volume %s' % self.volname)

        # Creating mount list for subdirectories. The mount objects are
        # deep-copied so the original self.mounts entries keep pointing at
        # the volume root.
        self.subdir_mounts = [
            copy.deepcopy(self.mounts[0]),
            copy.deepcopy(self.mounts[1])
        ]
        self.subdir_mounts[0].volname = "%s/subdir1" % self.volname
        self.subdir_mounts[1].volname = "%s/subdir2" % self.volname

        # Mount Subdirectory "subdir1" on client 1 and "subdir2" on client 2
        for mount_obj in self.subdir_mounts:
            ret = mount_obj.mount()
            self.assertTrue(
                ret, "Failed to mount %s on client %s" %
                (mount_obj.volname, mount_obj.client_system))
            g.log.info("Successfully mounted %s on client %s",
                       mount_obj.volname, mount_obj.client_system)
        g.log.info("Successfully mounted subdirectories on client1 "
                   "and clients 2")

        # Start IO on all mounts. Each mount gets its own
        # --dirname-start-num (1, 11, ...) passed to the IO script.
        all_mounts_procs = []
        count = 1
        for mount_obj in self.subdir_mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Validate IO
        g.log.info("Validating IO's")
        ret = validate_io_procs(all_mounts_procs, self.subdir_mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all io's")

        # Get stat of all the files/dirs created.
        g.log.info("Get stat of all the files/dirs created.")
        ret = get_mounts_stat(self.subdir_mounts)
        self.assertTrue(ret, "Stat failed on some of the clients")
        g.log.info("Successfully got stat of all files/dirs created")

        # Start add-brick (subvolume-increase)
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to expand the volume when IO in "
                             "progress on volume %s" % self.volname)
        g.log.info(
            "Expanding volume when IO in progress is successful on "
            "volume %s", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "All processes for volume %s are not "
                             "online" % self.volname)
        g.log.info("All volume %s processes are now online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname, 600)
        self.assertTrue(
            ret, "Rebalance did not complete "
            "despite waiting for 10 minutes")
        g.log.info("Rebalance successfully completed on the volume %s",
                   self.volname)

        # Again validate if subdirectories are still mounted post add-brick
        for mount_obj in self.subdir_mounts:
            ret = mount_obj.is_mounted()
            self.assertTrue(
                ret, "Subdirectory %s is not mounted on client %s" %
                (mount_obj.volname, mount_obj.client_system))
            g.log.info("Subdirectory %s is mounted on client %s",
                       mount_obj.volname, mount_obj.client_system)
        g.log.info("Successfully validated that subdirectories are mounted "
                   "on client1 and clients 2 post add-brick operation")
    def setup_volume(cls, volume_create_force=False):
        """Create the volume and get it ready for mounting.

        Sequence performed:
            1. Check that all peers are connected.
            2. Run the module-level ``setup_volume`` helper with
               ``cls.volume`` as the volume config.
            3. Wait for the volume processes to be online.
            4. For non-glusterfs mount types, export the volume over NFS
               and/or share it over SMB, applying the protocol specific
               volume options when they are configured.
            5. Log volume info and status.

        Args:
            volume_create_force(bool): True if the volume should be created
                with the 'force' option. A truthy
                ``cls.volume_create_force`` has the same effect.

        Returns (bool): True when every step succeeds, False as soon as one
            of them fails.
        """
        use_force = bool(volume_create_force or cls.volume_create_force)

        # Peers have to be connected before the volume is set up
        g.log.info("Validate peers before setting up volume ")
        if not cls.validate_peers_are_connected():
            g.log.error("Failed to validate peers are in connected state "
                        "before setting up volume")
            return False
        g.log.info("Successfully validated peers are in connected state "
                   "before setting up volume")

        # Setup the volume through the module-level helper of the same name
        g.log.info("Setting up volume %s", cls.volname)
        if not setup_volume(mnode=cls.mnode,
                            all_servers_info=cls.all_servers_info,
                            volume_config=cls.volume,
                            force=use_force):
            g.log.error("Failed to Setup volume %s", cls.volname)
            return False
        g.log.info("Successful in setting up volume %s", cls.volname)

        # Volume processes have to be online before going further
        g.log.info("Wait for volume %s processes to be online", cls.volname)
        if not wait_for_volume_process_to_be_online(cls.mnode, cls.volname):
            g.log.error("Failed to wait for volume %s processes to be online",
                        cls.volname)
            return False
        g.log.info("Successful in waiting for volume %s processes to be online",
                   cls.volname)

        # Export/Share the volume; skipped entirely for native glusterfs
        needs_export = cls.mount_type != "glusterfs"
        if needs_export:
            g.log.info("Export/Sharing the volume %s", cls.volname)

        if needs_export and "nfs" in cls.mount_type:
            exported = export_volume_through_nfs(
                mnode=cls.mnode,
                volname=cls.volname,
                enable_ganesha=cls.enable_nfs_ganesha)
            if not exported:
                g.log.error("Failed to export volume %s as NFS export",
                            cls.volname)
                return False
            g.log.info("Successful in exporting the volume %s as NFS export",
                       cls.volname)

            # NFS-Ganesha specific volume options, only when configured
            if cls.enable_nfs_ganesha and cls.nfs_ganesha_export_options:
                g.log.info("Setting NFS-Ganesha export specific volume "
                           "options on volume %s", cls.volname)
                if not set_volume_options(
                        mnode=cls.mnode,
                        volname=cls.volname,
                        options=cls.nfs_ganesha_export_options):
                    g.log.error("Failed to set NFS-Ganesha export specific "
                                "options on volume %s", cls.volname)
                    return False
                g.log.info("Successful in setting NFS-Ganesha export "
                           "specific volume options on volume %s",
                           cls.volname)

        if needs_export and any(proto in cls.mount_type
                                for proto in ("smb", "cifs")):
            shared = share_volume_over_smb(mnode=cls.mnode,
                                           volname=cls.volname,
                                           smb_users_info=cls.smb_users_info)
            if not shared:
                g.log.error("Failed to export volume %s as SMB Share",
                            cls.volname)
                return False
            g.log.info("Successful in exporting volume %s as SMB Share",
                       cls.volname)

            # SMB share specific volume options, only when configured
            if cls.smb_share_options:
                g.log.info("Setting SMB share specific volume options "
                           "on volume %s", cls.volname)
                if not set_volume_options(mnode=cls.mnode,
                                          volname=cls.volname,
                                          options=cls.smb_share_options):
                    g.log.error("Failed to set SMB share specific options "
                                "on volume %s", cls.volname)
                    return False
                g.log.info("Successful in setting SMB share specific "
                           "volume options on volume %s", cls.volname)

        # Log Volume Info and Status
        g.log.info("Log Volume %s Info and Status", cls.volname)
        if not log_volume_info_and_status(cls.mnode, cls.volname):
            g.log.error("Logging volume %s info and status failed",
                        cls.volname)
            return False
        g.log.info("Successful in logging volume %s info and status",
                   cls.volname)

        return True
    def test_rebalance_with_quota_enabled(self):
        """
        Test rebalance with quota enabled on root.
        1. Create Volume of type distribute
        2. Set Quota limit on the root directory
        3. Do some IO to reach the Hard limit
        4. After IO ends, compute arequal checksum
        5. Add bricks to the volume.
        6. Start rebalance
        7. After rebalance is completed, check arequal checksum
        """
        # Enable Quota
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0,
            "Failed to enable quota on the volume %s" % self.volname)
        g.log.info("Successfully enabled quota on volume %s", self.volname)

        # Set the Quota timeouts to 0 for strict accounting
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, "Failed to set hard-timeout to 0 for %s" % self.volname)
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, "Failed to set soft-timeout to 0 for %s" % self.volname)
        g.log.info("Quota soft and hard timeout has been set to 0 for %s",
                   self.volname)

        # Set the quota limit of 1 GB on root dir of the volume
        ret, _, _ = quota_limit_usage(self.mnode, self.volname, "/", "1GB")
        self.assertEqual(ret, 0, "Failed to set Quota for dir root")
        g.log.info("Successfully set quota limit for dir root")

        # Do some IO until hard limit is reached: 1024 files of 1M each
        # against the 1GB limit set above.
        cmd = ("/usr/bin/env python %s create_files "
               "-f 1024 --fixed-file-size 1M --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete and validate IO
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Validate quota (hard_limit is 1GB expressed in bytes)
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=1073741824,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")

        # Compute arequal checksum.
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume info and status before expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expand the volume.
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(
            ret, "Failed to expand the volume %s" % self.volname)
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log volume info and status after expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Perform rebalance start operation.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Rebalance started.")

        # Check rebalance is in progress
        # NOTE(review): get_rebalance_status presumably returns None when the
        # status cannot be fetched; fail with a clear message instead of a
        # TypeError on the subscript below - confirm against the helper.
        status_info = get_rebalance_status(self.mnode, self.volname)
        self.assertIsNotNone(
            status_info,
            "Failed to get rebalance status of volume %s" % self.volname)
        ret = status_info['aggregate']['statusStr']
        self.assertEqual(ret, "in progress", "Rebalance is not in "
                                             "'in progress' state, either "
                                             "rebalance is in completed state "
                                             "or failed to get rebalance "
                                             "status")
        g.log.info("Rebalance is 'in progress' state")

        # Wait till rebalance ends.
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Validate quota
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=1073741824,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")

        # Compute arequal checksum.
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance.
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")
    # Example #18
    # 0
    def test_rebalance_with_hidden_files(self):
        """
        Test rebalance on a volume holding hidden (dot-prefixed) files.
        1. Create files whose base name is '.' on all mounts
        2. Validate IO and list all files and dirs
        3. Verify files are placed on their hashed sub-volume
        4. Compute arequal checksum
        5. Add bricks to the volume and wait for processes to be online
        6. Start rebalance and wait for it to complete
        7. Check no node reports migration failures
        8. Compare arequal checksum with the one taken before rebalance
        """
        # pylint: disable=too-many-statements
        # Start IO on mounts
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("python %s create_files "
                   "--base-file-name . "
                   "-f 99 %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")

        # Verify DHT values across mount points
        for mount_obj in self.mounts:
            g.log.debug("Verifying hash layout values %s:%s",
                        mount_obj.client_system, mount_obj.mountpoint)
            ret = validate_files_in_dir(mount_obj.client_system,
                                        mount_obj.mountpoint,
                                        test_type=FILE_ON_HASHED_BRICKS,
                                        file_type=FILETYPE_FILES)
            self.assertTrue(
                ret, "Expected - Files are created on only "
                "sub-volume according to its hashed value")
            g.log.info("Hash layout values are verified %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)

        # Getting arequal checksum before rebalance
        g.log.info("Getting areequal checksum before rebalance")
        arequal_checksum_before_rebalance = collect_mounts_arequal(self.mounts)

        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume
        g.log.info("Start adding bricks to volume")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(
            ret, "Failed to expand the volume %s" % self.volname)
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to wait for volume %s processes to "
                             "be online" % self.volname)
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Checking if there are any migration failures
        # NOTE(review): get_rebalance_status presumably returns None when the
        # status cannot be fetched; guard before subscripting - confirm.
        status = get_rebalance_status(self.mnode, self.volname)
        self.assertIsNotNone(
            status,
            "Failed to get rebalance status of volume %s" % self.volname)
        for each_node in status['node']:
            failed_files_count = int(each_node['failures'])
            self.assertEqual(
                failed_files_count, 0,
                "Rebalance failed to migrate few files on %s" %
                each_node['nodeName'])
        # Logged once, after every node has been checked
        g.log.info("There are no migration failures")

        # Getting arequal checksum after rebalance
        g.log.info("Getting areequal checksum after rebalance")
        arequal_checksum_after_rebalance = collect_mounts_arequal(self.mounts)

        # Comparing arequals checksum before and after rebalance
        g.log.info("Comparing arequals checksum before and after rebalance")
        self.assertEqual(arequal_checksum_before_rebalance,
                         arequal_checksum_after_rebalance,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")
    def test_nfs_ganesha_export_with_multiple_volumes(self):
        """
        Test case to verify multiple volumes gets exported when IO is in
        progress.

        IO is started asynchronously on all mounts, then up to five new
        distributed volumes (nfsvol0..nfsvol4) are created and exported via
        nfs-ganesha, and finally the IO started at the beginning is
        validated.
        """
        # Starting IO on the mounts. Each mount gets its own
        # --dirname-start-num (1, 11, ...) passed to the IO script.
        all_mounts_procs = []
        count = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 10 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)
            count = count + 10

        # Create and export five new volumes
        for i in range(5):
            # Check availability of bricks to create new volume
            servers_unused_bricks_dict = get_servers_unused_bricks_dict(
                self.mnode, self.all_servers, self.all_servers_info)
            num_of_unused_bricks = sum(
                len(unused_bricks)
                for unused_bricks in servers_unused_bricks_dict.values())

            if num_of_unused_bricks < 2:
                # Running out of bricks is only a failure if not even one
                # new volume could be created.
                self.assertNotEqual(
                    i, 0, "New volume cannot be created due "
                    "to unavailability of bricks.")
                g.log.warning(
                    "Tried to create five new volumes. But could "
                    "create only %s volume due to unavailability "
                    "of bricks.", i)
                break

            # The shared volume config is rewritten in place for each new
            # volume.
            self.volume['name'] = "nfsvol" + str(i)
            self.volume['voltype']['type'] = 'distributed'
            self.volume['voltype']['replica_count'] = 1
            self.volume['voltype']['dist_count'] = 2

            new_vol = self.volume['name']

            # Create volume
            ret = setup_volume(mnode=self.mnode,
                               all_servers_info=self.all_servers_info,
                               volume_config=self.volume,
                               force=True)
            self.assertTrue(ret, "Setup volume [%s] failed" % new_vol)

            g.log.info("Wait for volume processes to be online")
            ret = wait_for_volume_process_to_be_online(self.mnode, new_vol)
            self.assertTrue(
                ret, "Volume %s process not online despite "
                "waiting for 300 seconds" % new_vol)

            # Export volume with nfs ganesha
            ret, _, _ = export_nfs_ganesha_volume(mnode=self.mnode,
                                                  volname=new_vol)
            self.assertEqual(ret, 0, ("Failed to export volume %s "
                                      "using nfs-ganesha" % new_vol))

            # Wait for volume to get exported
            ret = wait_for_nfs_ganesha_volume_to_get_exported(
                self.mnode, new_vol)
            self.assertTrue(
                ret, "Volume %s is not exported after setting "
                "ganesha.enable 'on'" % new_vol)
            g.log.info("Exported nfs-ganesha volume %s", new_vol)

            # Log Volume Info and Status
            ret = log_volume_info_and_status(self.mnode, new_vol)
            self.assertTrue(
                ret, "Logging volume %s info and status failed" % new_vol)

        # Validate IO
        g.log.info("Validating IO")
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("Successfully validated all IO")
    def test_brick_removal_with_quota(self):
        """
        Test Brick removal with quota in place
        1. Create Volume of type distribute
        2. Set Quota limit on the directory
        3. Do some IO to reach the Hard limit
        4. After IO ends, remove bricks
        5. Quota validation should succeed.
        """
        # Enable Quota
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(
            ret, 0,
            "Failed to enable quota on the volume %s" % self.volname)
        g.log.info("Successfully enabled quota on volume %s", self.volname)

        # Set the Quota timeouts to 0 for strict accounting
        ret, _, _ = quota_set_hard_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, "Failed to set hard-timeout to 0 for %s" % self.volname)
        ret, _, _ = quota_set_soft_timeout(self.mnode, self.volname, 0)
        self.assertEqual(
            ret, 0, "Failed to set soft-timeout to 0 for %s" % self.volname)
        g.log.info("Quota soft and hard timeout has been set to 0 for %s",
                   self.volname)

        # Set the quota limit of 100 MB on root dir of the volume
        ret, _, _ = quota_limit_usage(self.mnode, self.volname, "/", "100MB")
        self.assertEqual(ret, 0, "Failed to set Quota for dir root")
        g.log.info("Successfully set quota limit for dir root")

        # Do some IO until hard limit is reached: 100 files of 1M each
        # against the 100MB limit set above.
        cmd = ("/usr/bin/env python %s create_files "
               "-f 100 --fixed-file-size 1M --base-file-name file %s" %
               (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete and validate IO
        self.assertTrue(
            wait_for_io_to_complete(self.all_mounts_procs, self.mounts[0]),
            "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Validate quota (hard_limit is 100MB expressed in bytes)
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=104857600,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")

        # Log Volume info and status before shrinking volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Shrink the volume.
        ret = shrink_volume(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to shrink volume on "
                             "volume %s" % self.volname)
        g.log.info("Shrinking volume is successful on "
                   "volume %s", self.volname)

        # Log volume info and status after shrinking volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Perform rebalance start operation.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Rebalance started.")

        # Wait till rebalance ends.
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Validate quota
        ret = quota_validate(self.mnode,
                             self.volname,
                             path='/',
                             hard_limit=104857600,
                             sl_exceeded=True,
                             hl_exceeded=True)
        self.assertTrue(ret, "Quota validate Failed for '/'")
        g.log.info("Quota Validated for path '/'")
    # Example #21
    # 0
    def test_expanding_volume_when_io_in_progress(self):
        """Expand the volume, rebalance it and validate IO and layout.

        Description:
            - log volume info and status
            - add bricks to the volume
            - wait for the volume processes to come online and verify them
            - start rebalance and wait (timeout=1800) for it to complete
            - validate the IO procs held in self.all_mounts_procs
            - list all files and directories on the mounts
            - validate that the DHT layout of the directories is complete
            - verify that no node reports rebalance migration failures

        NOTE(review): self.all_mounts_procs is only consumed here, so the
        IO is presumably started in setUp -- confirm against the class.
        """
        # pylint: disable=too-many-statements
        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Expanding volume by adding bricks to the volume when IO in progress
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        # Assertion messages are %-formatted strings: a (fmt, arg) tuple
        # would be reported verbatim without interpolating the volume name.
        self.assertTrue(ret, "Failed to expand the volume while IO in "
                             "progress on volume %s" % self.volname)
        g.log.info(
            "Expanding volume while IO in progress on "
            "volume %s : Success", self.volname)

        # Wait for gluster processes to come online
        g.log.info("Wait for gluster processes to come online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Failed to wait for volume %s processes to "
                             "be online" % self.volname)
        g.log.info("Waiting for volume %s process to be online", self.volname)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        log_volume_info_and_status(self.mnode, self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Started rebalance on the volume %s: Success", self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=1800)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance status on volume %s: Complete", self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to get rebalance status for the "
                                 "volume %s" % self.volname)
        g.log.info("Rebalance status on volume %s: Complete", self.volname)

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        # Flag is set before asserting so it is recorded even when the IO
        # validation fails (presumably read by tearDown -- TODO confirm).
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO on all mounts: Complete")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("List all files and directories: Success")

        # DHT Layout validation
        g.log.debug("Verifying hash layout values %s:%s", self.clients[0],
                    self.mounts[0].mountpoint)
        ret = validate_files_in_dir(self.clients[0],
                                    self.mounts[0].mountpoint,
                                    test_type=LAYOUT_IS_COMPLETE,
                                    file_type=FILETYPE_DIRS)
        self.assertTrue(ret, "LAYOUT_IS_COMPLETE: FAILED")
        g.log.info("LAYOUT_IS_COMPLETE: PASS")

        # Checking if there are any migration failures
        status = get_rebalance_status(self.mnode, self.volname)
        # Fail with a clear message instead of a TypeError on status['node']
        # (presumably None is returned when the status cannot be fetched).
        self.assertIsNotNone(status, "Failed to get rebalance status for the "
                                     "volume %s" % self.volname)
        for each_node in status['node']:
            self.assertEqual(
                0, int(each_node['failures']),
                "Rebalance failed to migrate few files on %s" %
                each_node['nodeName'])
            g.log.info("No migration failures on %s", each_node['nodeName'])
示例#22
0
    def test_rebalance_with_brick_down(self):
        """
        Rebalance with brick down in replica
        - Create a Replica volume.
        - Bring down one of the brick down in the replica pair
        - Do some IO and create files on the mount point
        - Add a pair of bricks to the volume
        - Initiate rebalance
        - Bring back the brick which was down.
        - After self heal happens, all the files should be present.

        The final check compares the arequal checksum of the mounts taken
        before the expansion with the one taken after heal completes.
        """
        # Log the volume info and status before brick is down.
        log_volume_info_and_status(self.mnode, self.volname)

        # Bring one of the bricks offline
        brick_list = get_all_bricks(self.mnode, self.volname)
        # choice() would raise on a missing or empty brick list; fail with
        # a readable message instead.
        self.assertTrue(brick_list, "Failed to get the bricks list of "
                                    "volume %s" % self.volname)
        ret = bring_bricks_offline(self.volname, choice(brick_list))
        # Without this check the test could silently run with all bricks up
        # and never exercise the brick-down scenario.
        self.assertTrue(ret, "Failed to bring one of the bricks offline on "
                             "volume %s" % self.volname)

        # Log the volume info and status after brick is down.
        log_volume_info_and_status(self.mnode, self.volname)

        # Create files at mountpoint.
        cmd = (
            "/usr/bin/env python %s create_files "
            "-f 2000 --fixed-file-size 1k --base-file-name file %s"
            % (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(
            self.mounts[0].client_system, cmd, user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)

        # Wait for IO to complete.
        self.assertTrue(wait_for_io_to_complete(self.all_mounts_procs,
                                                self.mounts[0]),
                        "IO failed on some of the clients")
        g.log.info("IO completed on the clients")

        # Compute the arequal checksum before bringing all bricks online
        arequal_before_all_bricks_online = collect_mounts_arequal(self.mounts)

        # Log the volume info and status before expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Expand the volume.
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        # Messages are %-formatted strings; a (fmt, arg) tuple would be
        # reported verbatim without the volume name filled in.
        self.assertTrue(ret, "Failed to expand the volume %s" % self.volname)
        g.log.info("Expanding volume is successful on "
                   "volume %s", self.volname)

        # Log the volume info after expanding volume.
        log_volume_info_and_status(self.mnode, self.volname)

        # Start Rebalance.
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Log the volume info and status before bringing all bricks online
        log_volume_info_and_status(self.mnode, self.volname)

        # Bring all bricks online.
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Not able to start volume with force option")
        g.log.info("Volume start with force option successful.")

        # Log the volume info and status after bringing all bricks online
        log_volume_info_and_status(self.mnode, self.volname)

        # Monitor heal completion.
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "heal has not yet completed")
        g.log.info("Self heal completed")

        # Compute the arequal checksum after all bricks online.
        arequal_after_all_bricks_online = collect_mounts_arequal(self.mounts)

        # Comparing arequal checksum before and after the operations.
        self.assertEqual(arequal_before_all_bricks_online,
                         arequal_after_all_bricks_online,
                         "arequal checksum is NOT MATCHING")
        g.log.info("arequal checksum is SAME")
示例#23
0
 def _logged_vol_info(self):
     """Log volume info and status; fail the test if logging fails."""
     ret = log_volume_info_and_status(self.mnode, self.volname)
     # %-format the message: a (fmt, arg) tuple would be reported verbatim
     # without the volume name filled in.
     self.assertTrue(ret, "Logging volume info and status failed on "
                          "volume %s" % self.volname)
    def test_replace_brick_when_io_in_progress(self):
        """Test replacing brick using existing servers bricks when IO is
            in progress.

        Description:
            - log volume info and status
            - replace_brick
            - verify all volume processes are online
            - wait for heal to complete
            - validate IO
            - list all files and directories on the mounts
        """
        # Log Volume Info and Status before replacing brick from the volume.
        g.log.info(
            "Logging volume info and Status before replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        # Assertion messages are %-formatted strings: a (fmt, arg) tuple
        # would be reported verbatim without interpolating the volume name.
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Replace brick from a sub-volume
        g.log.info("Replace a faulty brick from the volume")
        ret = replace_brick_from_volume(self.mnode, self.volname, self.servers,
                                        self.all_servers_info)
        self.assertTrue(ret, "Failed to replace faulty brick from the volume")
        g.log.info("Successfully replaced faulty brick from the volume")

        # Wait for gluster processes to come online
        time.sleep(30)

        # Log Volume Info and Status after replacing the brick
        g.log.info(
            "Logging volume info and Status after replacing brick "
            "from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 20 minutes. 20 minutes is too much a time for "
            "current test workload")
        g.log.info("self-heal is successful after replace-brick operation")

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        # Flag is set before asserting so it is recorded even when the IO
        # validation fails (presumably read by tearDown -- TODO confirm).
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_glustershd_on_all_volume_types(self):
        """
        Test Script to verify the glustershd server vol file
        has only entries for replicate volumes

        * Create multiple volumes and start all volumes
        * Check the glustershd processes - Only One glustershd should be listed
        * Check the glustershd server vol file - should contain entries only
                                             for replicated involved volumes
        * Add bricks to the replicate volume - it should convert to
                                               distributed-replicate
        * Check the glustershd server vol file - newly added bricks
                                                 should present
        * Check the glustershd processes - Only 1 glustershd should be listed
        * The glustershd pids must differ before and after adding bricks

        """
        # pylint: disable=too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, glustershd_pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % glustershd_pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)

        # For all the volumes, check whether bricks present in
        # glustershd server vol file
        volume_list = get_volume_list(self.mnode)
        for volume in volume_list:
            g.log.info("Volume Name: %s", volume)
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])

            # get the bricks for the volume
            g.log.info("Fetching bricks for the volume : %s", volume)
            bricks_list = get_all_bricks(self.mnode, volume)
            g.log.info("Brick List : %s", bricks_list)

            # validate the bricks present in volume info with
            # glustershd server volume file
            g.log.info("Start parsing file %s on "
                       "node %s", self.GLUSTERSHD, self.mnode)
            ret = do_bricks_exist_in_shd_volfile(self.mnode, volume,
                                                 bricks_list)
            # Pure Distribute volumes must NOT appear in the shd volfile;
            # every other volume type must.
            if volume_type == 'Distribute':
                self.assertFalse(ret,
                                 ("Bricks exist in glustershd server "
                                  "volume file for %s Volume" % volume_type))
                g.log.info(
                    "EXPECTED : Bricks doesn't exist in glustershd "
                    "server volume file for %s Volume", volume_type)
            else:
                self.assertTrue(ret, ("Brick List from volume info is "
                                      "different from glustershd server "
                                      "volume file. Please check log "
                                      "file for details"))
                g.log.info(
                    "Bricks exist in glustershd server volume file "
                    "for %s Volume", volume_type)

        # expanding volume for Replicate
        for volume in volume_list:
            volume_type_info = get_volume_type_info(self.mnode, volume)
            volume_type = (volume_type_info['volume_type_info']['typeStr'])
            if volume_type == 'Replicate':
                g.log.info("Start adding bricks to volume %s", volume)
                ret = expand_volume(self.mnode, volume, self.servers,
                                    self.all_servers_info)
                self.assertTrue(ret, ("Failed to add bricks to "
                                      "volume %s " % volume))
                g.log.info("Add brick successful")

                # Log Volume Info and Status after expanding the volume
                g.log.info("Logging volume info and Status after "
                           "expanding volume")
                ret = log_volume_info_and_status(self.mnode, volume)
                # %-format the message like the other assertions in this
                # test: a (fmt, arg) tuple is reported verbatim.
                self.assertTrue(ret, ("Logging volume info and status failed "
                                      "on volume %s" % volume))
                g.log.info(
                    "Successful in logging volume info and status "
                    "of volume %s", volume)

                # Verify volume's all process are online for 60 sec
                g.log.info("Verifying volume's all process are online")
                ret = wait_for_volume_process_to_be_online(
                    self.mnode, volume, 60)
                self.assertTrue(ret, ("Volume %s : All process are not "
                                      "online" % volume))
                g.log.info(
                    "Successfully verified volume %s processes "
                    "are online", volume)

                # check the type for the replicate volume
                volume_type_info_for_replicate_after_adding_bricks = \
                    get_volume_type_info(self.mnode, volume)
                volume_type_for_replicate_after_adding_bricks = \
                    (volume_type_info_for_replicate_after_adding_bricks
                     ['volume_type_info']['typeStr'])

                self.assertEqual(volume_type_for_replicate_after_adding_bricks,
                                 'Distributed-Replicate',
                                 ("Replicate volume type is not converted to "
                                  "Distributed-Replicate after adding bricks"))
                g.log.info("Replicate Volume is successfully converted to"
                           " Distributed-Replicate after adding bricks")

                # get the bricks for the volume after expanding
                bricks_list_after_expanding = get_all_bricks(
                    self.mnode, volume)
                g.log.info("Brick List after expanding "
                           "volume: %s", bricks_list_after_expanding)

                # validate the bricks present in volume info
                # with glustershd server volume file after adding bricks
                g.log.info("Starting parsing file %s", self.GLUSTERSHD)
                ret = do_bricks_exist_in_shd_volfile(
                    self.mnode, volume, bricks_list_after_expanding)

                self.assertTrue(ret, ("Brick List from volume info is "
                                      "different from glustershd server "
                                      "volume file after expanding bricks. "
                                      "Please check log file for details"))
                g.log.info("Brick List from volume info is same as from "
                           "glustershd server volume file after "
                           "expanding bricks.")

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, glustershd_pids_after_adding_bricks = \
            get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret,
                        ("Either No self heal daemon process found or "
                         "more than One self heal daemon process "
                         "found : %s" % glustershd_pids_after_adding_bricks))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)

        # The pids collected before and after add-brick are expected to
        # differ.
        self.assertNotEqual(
            glustershd_pids, glustershd_pids_after_adding_bricks,
            "Self Daemon process is same before and"
            " after adding bricks")
        g.log.info("Self Heal Daemon Process is different before and "
                   "after adding bricks")
示例#26
0
    def test_heal_client_io_hang(self):
        """Check client IO and heal after a brick outage with heal disabled.

        Description:
            - disable server side heal on the volume
            - create 100 empty files under <mountpoint>/test
            - bring the first brick of the volume offline
            - write 5M of random data to every file while the brick is down
            - bring the brick back online and verify all bricks are online
            - append to and read every file from the client to trigger
              client side heal
            - check heal info and completion via ec_check_heal_comp
            - log volume info and status
        """
        mountpoint = self.mounts[0].mountpoint

        # disable server side heal
        ret = disable_heal(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to disable server side heal"))
        g.log.info("Successfully disabled server side heal")

        # Log Volume Info and Status after disabling server side heal
        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        # Assertion messages are %-formatted strings: a (fmt, arg) tuple
        # would be reported verbatim without interpolating the volume name.
        self.assertTrue(ret, "Logging volume info and status failed "
                             "on volume %s" % self.volname)

        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, "Failed to get the bricks list")

        # Create files
        cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do touch file$i; done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished creating files while all the bricks are UP')

        # Bring bricks offline
        ret = bring_bricks_offline(self.volname, bricks_list[0:1])
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Successfully brought the bricks down")

        # Start pumping IO from client
        # NOTE: "mkdir test" fails here because the directory already
        # exists; the commands are chained with ';', so the loop still runs.
        cmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
               "do dd if=/dev/urandom of=file$i bs=1M "
               "count=5;done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished writing on files while a brick is DOWN')

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname, bricks_list[0:1])
        self.assertTrue(ret, "Failed to bring up the bricks")
        g.log.info("Successfully brought the bricks up")

        # Verifying all bricks online
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, "All bricks are not online")

        # Start client side heal by reading/writing files.
        appendcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                     "do dd if=/dev/urandom of=file$i bs=1M "
                     "count=1 oflag=append conv=notrunc;done" % mountpoint)

        readcmd = ("cd %s; mkdir test; cd test; for i in `seq 1 100` ;"
                   "do dd if=file$i of=/dev/zero bs=1M "
                   "count=5;done" % mountpoint)

        ret, _, err = g.run(self.mounts[0].client_system, appendcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished append on files after bringing bricks online')

        ret, _, err = g.run(self.mounts[0].client_system, readcmd)
        self.assertEqual(ret, 0, err)
        g.log.info('Finished read on files after bringing bricks online')

        # check the heal info and completion
        ec_check_heal_comp(self)

        # Log Volume Info and Status after bringing the brick up
        g.log.info("Logging volume info and status")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed "
                             "on volume %s" % self.volname)
        g.log.info(
            "Successful in logging volume info and status "
            "of volume %s", self.volname)
    def test_expanding_volume_when_io_in_progress(self):
        """Test expanding volume (Increase distribution) using existing
        servers bricks when IO is in progress.

        Description:
            - log volume info and status
            - add bricks
            - verify all volume processes are online
            - starts rebalance
            - wait for rebalance to complete
            - validate IO
            - list all files and directories on the mounts
        """
        # Log Volume Info and Status before expanding the volume.
        g.log.info("Logging volume info and Status before expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        # Assertion messages are %-formatted strings: a (fmt, arg) tuple
        # would be reported verbatim without interpolating the volume name.
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Expanding volume by adding bricks to the volume when IO in progress
        g.log.info("Start adding bricks to volume when IO in progress")
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to expand the volume when IO in "
                             "progress on volume %s" % self.volname)
        g.log.info(
            "Expanding volume when IO in progress is successful on "
            "volume %s", self.volname)

        # Wait for gluster processes to come online
        time.sleep(30)

        # Log Volume Info and Status after expanding the volume
        g.log.info("Logging volume info and Status after expanding volume")
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Start Rebalance
        g.log.info("Starting Rebalance on the volume")
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume "
                                 "%s" % self.volname)
        g.log.info("Successfully started rebalance on the volume %s",
                   self.volname)

        # Log Rebalance status
        # The result is intentionally ignored: this call is made only for
        # its logging, the status is asserted after the rebalance completes.
        g.log.info("Log Rebalance status")
        _, _, _ = rebalance_status(self.mnode, self.volname)

        # Wait for rebalance to complete
        g.log.info("Waiting for rebalance to complete")
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance is successfully complete on the volume %s",
                   self.volname)

        # Check Rebalance status after rebalance is complete
        g.log.info("Checking Rebalance status")
        ret, _, _ = rebalance_status(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to get rebalance status for the "
                                 "volume %s" % self.volname)
        g.log.info("Successfully got rebalance status of the volume %s",
                   self.volname)

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        # Flag is set before asserting so it is recorded even when the IO
        # validation fails (presumably read by tearDown -- TODO confirm).
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_self_heal(self):
        """
        Description:-
        - Create files on mount point
        - Kill one brick from volume
        - rm -rfv on mount point
        - bring bricks online
        - wait for heals
        - list

        Raises:
            ExecutionError: if removing the files from the mount point fails.
        """
        # pylint: disable=too-many-statements

        # IO on the mount point
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d "
                   "--dir-depth 2 "
                   "--dir-length 35 "
                   "--max-num-of-dirs 5 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path,
                       self.counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            # Each mount gets its own --dirname-start-num by advancing the
            # counter by 10 per mount.
            self.counter = self.counter + 10

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        # Merge the per-category brick lists and drop empty entries.
        bricks_to_bring_offline = list(filter(None, (
            bricks_to_bring_offline_dict['hot_tier_bricks'] +
            bricks_to_bring_offline_dict['cold_tier_bricks'] +
            bricks_to_bring_offline_dict['volume_bricks'])))

        # Killing one brick from the volume set
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        # Assertion messages are %-formatted strings: a (fmt, arg) tuple
        # would be reported verbatim without interpolating the brick list.
        self.assertTrue(ret, "Failed to bring bricks: %s offline" %
                        bricks_to_bring_offline)
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret, "Not all the bricks in list: %s are offline" %
                        bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        self.io_validation_complete = True

        # Checking volume status
        g.log.info("Logging volume info and Status after bringing bricks "
                   "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Removing files from the mount point when one brick is down
        g.log.info("Removing files from the mount point")
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system
        cmd = "rm -rfv %s/*" % mountpoint
        ret, _, _ = g.run(client, cmd)
        if ret != 0:
            raise ExecutionError("failed to delete the files")

        # Bringing bricks online
        g.log.info('Bringing bricks %s online', bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s online' %
                        bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Check if bricks are online
        g.log.info("Checking bricks are online or not")
        ret = are_bricks_online(self.mnode, self.volname,
                                bricks_to_bring_offline)
        self.assertTrue(ret, 'Bricks %s are not online' %
                        bricks_to_bring_offline)
        g.log.info('Bricks %s are online', bricks_to_bring_offline)

        # Monitoring heals on the volume
        g.log.info("Wait for heal completion...")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Self heal didn't complete even after waiting "
                             "for 20 minutes.")
        # This test never converts the volume type; the log states what
        # actually happened before the heal.
        g.log.info("self-heal is successful after bringing the bricks "
                   "back online")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def test_self_heal_when_io_in_progress(self):
        """Test self-heal is successful when IO is in progress.

        Description:
            - log volume info and status
            - select bricks and bring them offline
            - validate that the selected bricks are offline
            - bring the same bricks back online
            - verify all processes of the volume are online
            - wait for heal to complete
            - validate IO
            - list all files and dirs on the mounts

        Reads ``self.mnode``, ``self.volname``, ``self.mounts`` and
        ``self.all_mounts_procs`` (presumably the IO processes started by
        the test setup -- confirm against ``setUp``) and sets
        ``self.io_validation_complete`` once the IO has been validated.
        """
        # Log Volume Info and Status before simulating brick failure
        g.log.info(
            "Logging volume info and Status before bringing bricks "
            "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Select bricks to bring offline.
        # Materialise the result as a list: the selection is consumed several
        # times below, and on Python 3 a bare filter() object would be empty
        # after its first use.
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = list(filter(
            None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                   bricks_to_bring_offline_dict['cold_tier_bricks'] +
                   bricks_to_bring_offline_dict['volume_bricks'])))

        # Bring bricks offline
        g.log.info("Bringing bricks: %s offline", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret,
            "Failed to bring bricks: %s offline" % bricks_to_bring_offline)
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Wait for gluster processes to be offline
        time.sleep(10)

        # Log Volume Info and Status
        g.log.info(
            "Logging volume info and Status after bringing bricks "
            "offline from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Validate if bricks are offline
        g.log.info("Validating if bricks: %s are offline",
                   bricks_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret,
            "Not all the bricks in list:%s are offline" %
            bricks_to_bring_offline)
        g.log.info("Successfully validated that bricks: %s are all offline",
                   bricks_to_bring_offline)

        # Add delay before bringing bricks online
        time.sleep(40)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret,
            "Failed to bring bricks: %s online" % bricks_to_bring_offline)
        g.log.info("Successfully brought all bricks:%s online",
                   bricks_to_bring_offline)

        # Wait for gluster processes to be online
        time.sleep(10)

        # Log Volume Info and Status
        g.log.info(
            "Logging volume info and Status after bringing bricks "
            "online from the volume %s", self.volname)
        ret = log_volume_info_and_status(self.mnode, self.volname)
        self.assertTrue(ret, "Logging volume info and status failed on "
                             "volume %s" % self.volname)
        g.log.info("Successful in logging volume info and status of volume %s",
                   self.volname)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Volume %s : All process are not online" % self.volname)
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal to complete
        g.log.info("Wait for self-heal to complete")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, "Self heal didn't complete even after waiting "
            "for 20 minutes. 20 minutes is too much a time for "
            "current test workload")
        g.log.info("self-heal is successful after bringing bricks online")

        # Validate IO.
        # The flag is set before asserting so that it is recorded even when
        # the IO validation fails (presumably read by tearDown -- confirm).
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # List all files and dirs created
        g.log.info("List all files and directories:")
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")
        g.log.info("Listing all files and directories is successful")
    def setUpClass(cls):
        """Set up the volume, export it with nfs-ganesha and mount it.

        Steps:
            - run the class-level setup of NfsGaneshaClusterSetupClass
            - peer probe all servers and log the peer status
            - mount 'gluster_shared_storage' on every server at
              /run/gluster/shared_storage if it is not mounted already
            - set up the volume
            - export the volume through nfs-ganesha unless 'ganesha.enable'
              is already 'on', then wait for the export to show up
            - log volume info and status
            - mount the volume on every mount object in ``cls.mounts``

        Raises:
            ExecutionError: if any of the steps above fails.
        """
        # ``__func__`` exists on bound methods in both Python 2.6+ and
        # Python 3, whereas the ``im_func`` alias is Python 2 only.
        NfsGaneshaClusterSetupClass.setUpClass.__func__(cls)

        # Peer probe servers
        if not peer_probe_servers(cls.mnode, cls.servers):
            raise ExecutionError("Failed to peer probe servers")

        g.log.info("All peers are in connected state")

        # Peer Status from mnode
        peer_status(cls.mnode)

        # Mount the shared storage volume on every server, each server
        # acting as both the server and the client of its own mount.
        for server in cls.servers:
            mount_info = [{
                'protocol': 'glusterfs',
                'mountpoint': '/run/gluster/shared_storage',
                'server': server,
                'client': {
                    'host': server
                },
                'volname': 'gluster_shared_storage',
                'options': ''
            }]

            # create_mount_objs returns a list; only one entry was requested.
            shared_mount = create_mount_objs(mount_info)[0]
            if not shared_mount.is_mounted():
                if not shared_mount.mount():
                    raise ExecutionError(
                        "Unable to mount volume '%s:%s' "
                        "on '%s:%s'" %
                        (shared_mount.server_system, shared_mount.volname,
                         shared_mount.client_system, shared_mount.mountpoint))

        # Setup Volume
        ret = setup_volume(mnode=cls.mnode,
                           all_servers_info=cls.all_servers_info,
                           volume_config=cls.volume,
                           force=True)
        if not ret:
            raise ExecutionError("Setup volume %s failed" % cls.volname)
        time.sleep(10)

        # Export volume with nfs ganesha, if it is not exported already
        vol_option = get_volume_options(cls.mnode,
                                        cls.volname,
                                        option='ganesha.enable')
        if vol_option is None:
            raise ExecutionError("Failed to get ganesha.enable volume option "
                                 "for %s " % cls.volname)
        if vol_option['ganesha.enable'] != 'on':
            ret, _, _ = export_nfs_ganesha_volume(mnode=cls.mnode,
                                                  volname=cls.volname)
            if ret != 0:
                raise ExecutionError(
                    "Failed to export volume %s "
                    "as NFS export" % cls.volname)
            time.sleep(5)

        ret = wait_for_nfs_ganesha_volume_to_get_exported(
            cls.mnode, cls.volname)
        if not ret:
            raise ExecutionError("Failed to export volume %s. volume is "
                                 "not listed in showmount" % cls.volname)
        g.log.info("Volume %s is exported successfully", cls.volname)

        # Log Volume Info and Status
        if not log_volume_info_and_status(cls.mnode, cls.volname):
            raise ExecutionError("Logging volume %s info and status failed"
                                 % cls.volname)

        # Create Mounts: try every client before failing so that each
        # unsuccessful mount gets logged.
        all_mounted = True
        for mount_obj in cls.mounts:
            if not mount_obj.mount():
                g.log.error("Unable to mount volume '%s:%s' on '%s:%s'",
                            mount_obj.server_system, mount_obj.volname,
                            mount_obj.client_system, mount_obj.mountpoint)
                all_mounted = False
        if not all_mounted:
            raise ExecutionError("Mounting volume %s on few clients failed"
                                 % cls.volname)

        # Get info of mount before the IO
        log_mounts_info(cls.mounts)