Example #1
    def test_replica_to_arbiter_volume_with_io(self):
        """
        Description: Replica 3 to arbiter conversion with ongoing IO

        Steps:
        1) Create a replica 3 volume and start volume.
        2) Set client side self heal off.
        3) Fuse mount the volume.
        4) Create directory dir1 and write data.
           Example: untar linux tar from the client into the dir1
        5) While IO is running, execute the remove-brick command
           and convert the replica 3 volume to a replica 2 volume.
        6) Execute add-brick command and convert to arbiter volume,
           provide the path of new arbiter brick.
        7) Issue gluster volume heal.
        8) Heal should be completed with no files in split-brain.
        """

        # pylint: disable=too-many-statements
        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        self.io_process = run_linux_untar(self.clients[0],
                                          self.mounts[0].mountpoint,
                                          dirs=tuple(['linuxuntar']))
        self.is_io_running = True

        # Convert replicated to arbiter volume
        self._convert_replicated_to_arbiter_volume()

        # Wait for IO to complete.
        ret = self._wait_for_untar_completion()
        self.assertFalse(ret, "IO didn't complete or failed on client")
        self.is_io_running = False

        # Start healing
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode,
                                      self.volname,
                                      timeout_period=3600)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')
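
The helpers _convert_replicated_to_arbiter_volume() and _wait_for_untar_completion() are defined elsewhere in this test class and are not shown here. Purely as a sketch (not the actual glusto-tests code), a waiter consistent with the assertFalse() call above could reuse the async_communicate() pattern from Example #3:

    def _wait_for_untar_completion(self):
        """Sketch only: wait on the untar processes started by
        run_linux_untar(). Returns a truthy value only when something
        went wrong, so the caller's assertFalse() passes on success."""
        untar_failed = False
        for proc in self.io_process:
            try:
                ret, _, _ = proc.async_communicate()
                if ret:
                    # Non-zero return code: untar did not finish cleanly
                    untar_failed = True
            except ValueError:
                # Output already collected; treat the process as done
                pass
        return untar_failed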
Example #2
    def test_basic_memory_leak(self):
        """
        Test case:
        1. Create a volume, start it and mount it.
        2. Start I/O from mount point.
        3. Check if there are any memory leaks and OOM killers.
        """
        # Start monitoring resource usage on servers and clients
        monitor_proc_dict = self.start_memory_and_cpu_usage_logging(
            self.test_id, count=30)
        self.assertIsNotNone(
            monitor_proc_dict, "Failed to start monitoring on servers and "
            "clients")

        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[1].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.mounts[1].client_system, self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start multiple I/O from mount points
        self.list_of_io_processes = []
        cmd = ("cd {};for i in `seq 1 100`; do mkdir dir.$i ;"
               "for j in `seq 1 1000`; do dd if=/dev/random "
               "of=dir.$i/testfile.$j bs=1k count=10;done;done".format(
                   self.mounts[0].mountpoint))
        ret = g.run_async(self.mounts[0].client_system, cmd)
        self.list_of_io_processes = [ret]

        # Start linux untar on dir linuxuntar
        ret = run_linux_untar(self.mounts[1].client_system,
                              self.mounts[1].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Wait for I/O to complete and validate I/O on mount points
        ret = validate_io_procs(self.list_of_io_processes, self.mounts)
        self.assertTrue(ret, "I/O failed on mount point")
        self.is_io_running = False

        # Wait for monitoring processes to complete
        ret = wait_for_logging_processes_to_stop(monitor_proc_dict,
                                                 cluster=True)
        self.assertTrue(ret, "ERROR: Failed to stop monitoring processes")

        # Check if there are any memory leaks and OOM killers
        ret = self.check_for_memory_leaks_and_oom_kills_on_servers(
            self.test_id)
        self.assertFalse(ret,
                         "Memory leak and OOM kills check failed on servers")

        ret = self.check_for_memory_leaks_and_oom_kills_on_clients(
            self.test_id)
        self.assertFalse(ret,
                         "Memory leak and OOM kills check failed on clients")
        g.log.info("No memory leaks or OOM kills found on serves and clients")
Example #3
    def _untar_linux_kernel_in_a_specific_dir(self):
        """Untar the linux kernel tarball into a specific dir on mount point"""
        # Create a parent directory test_self_heal on mount point
        ret = mkdir(self.first_client, '{}/{}'.format(self.mountpoint,
                                                      'test_self_heal'))
        self.assertTrue(ret, "Failed to create dir test_self_heal")

        # Start linux untar on dir test_self_heal
        proc = run_linux_untar(self.clients[0],
                               self.mounts[0].mountpoint,
                               dirs=tuple(['test_self_heal']))[0]
        try:
            ret, _, _ = proc.async_communicate()
            # A return code of 0 means the untar completed successfully
            untar_done = (ret == 0)
        except ValueError:
            untar_done = True
        self.assertTrue(untar_done,
                        "Kernel untar not done on client mount point")
Example #4
    def test_heal_info_no_hang(self):
        """
        Testcase steps:
        1. Start kernel untar on the mount
        2. While untar is going on, kill a brick of the replica.
        3. Wait for the untar to be over, resulting in pending heals.
        4. Get the approx. number of pending heals and save it
        5. Bring the brick back online.
        6. Trigger heal
        7. Run more I/Os with dd command
        8. Run heal info command and check that it completes successfully under
           a timeout that is based on the no. of heals in step 4.
        """
        self.list_of_io_processes = []
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        ret = run_linux_untar(self.clients[0], self.mounts[0].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Kill brick resulting in heal backlog.
        brick_to_bring_offline = random.choice(self.bricks_list)
        ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
        self.assertTrue(ret, 'Failed to bring bricks %s offline'
                        % brick_to_bring_offline)
        ret = are_bricks_offline(self.mnode, self.volname,
                                 [brick_to_bring_offline])
        self.assertTrue(ret, 'Bricks %s are not offline'
                        % brick_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   brick_to_bring_offline)

        ret = self._wait_for_untar_completion()
        self.assertFalse(ret, "IO didn't complete or failed on client")
        self.is_io_running = False

        # Get approx. no. of entries to be healed.
        cmd = ("gluster volume heal %s statistics heal-count | grep Number "
               "| awk '{sum+=$4} END {print sum/2}'" % self.volname)
        ret, self.num_entries, _ = g.run(self.mnode, cmd)
        self.assertEqual(ret, 0, "Failed to get heal-count statistics")

        # Restart the down bricks
        ret = bring_bricks_online(self.mnode, self.volname,
                                  [brick_to_bring_offline])
        self.assertTrue(ret, 'Failed to bring brick %s online' %
                        brick_to_bring_offline)
        g.log.info('Bringing brick %s online is successful',
                   brick_to_bring_offline)
        # Trigger heal
        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Starting heal failed')
        g.log.info('Index heal launched')

        # Run more I/O
        cmd = ("for i in `seq 1 10`; do dd if=/dev/urandom of=%s/file_$i "
               "bs=1M count=100; done" % self.mounts[0].mountpoint)
        ret = g.run_async(self.mounts[0].client_system, cmd,
                          user=self.mounts[0].user)

        # Get heal info
        ret = self._does_heal_info_complete_within_timeout()
        self.assertTrue(ret, 'Heal info timed out')
        g.log.info('Heal info completed successfully')
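
The _does_heal_info_complete_within_timeout() helper is not part of this example. A minimal sketch of the idea described in the docstring (a timeout derived from the heal backlog measured above; the one-second-per-entry budget and the 60s floor are assumptions) could wrap heal info in the coreutils timeout command:

    def _does_heal_info_complete_within_timeout(self):
        """Sketch: return True only if 'heal info' finishes in time."""
        # Assume roughly one second per pending heal entry, minimum 60s
        timeout = max(60, int(float(self.num_entries)))
        cmd = ("timeout {} gluster volume heal {} info"
               .format(timeout, self.volname))
        ret, _, _ = g.run(self.mnode, cmd)
        # 'timeout' exits non-zero if it had to kill the command
        return ret == 0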
Example #5
    def test_add_brick_rebalance_with_rsync_in_progress(self):
        """
        Test case:
        1. Create, start and mount a volume.
        2. Create a directory on the mount point and start linux untar.
        3. Create another directory on the mount point and start rsync of
           linux untar directory.
        4. Add bricks to the volume
        5. Trigger rebalance on the volume.
        6. Wait for rebalance to complete on volume.
        7. Wait for I/O to complete.
        8. Validate that the checksums of the untar and rsync directories match.
        """
        # List of I/O processes
        self.list_of_io_processes = []

        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        ret = run_linux_untar(self.clients[0],
                              self.mounts[0].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Create a new directory and start rsync
        self.rsync_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                        'rsyncuntarlinux')
        ret = mkdir(self.clients[0], self.rsync_dir)
        self.assertTrue(ret, "Failed to create dir rsyncuntarlinux for rsync")

        # Start rsync for linux untar on mount point
        cmd = ("for i in `seq 1 3`; do rsync -azr {} {};sleep 120;done".format(
            self.linux_untar_dir, self.rsync_dir))
        ret = g.run_async(self.clients[0], cmd)
        self.list_of_io_processes.append(ret)

        # Add bricks to the volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(
            ret, "Failed to add brick with rsync on volume %s" % self.volname)

        # Trigger rebalance on the volume
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(
            ret, 0,
            "Failed to start rebalance on the volume %s" % self.volname)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=6000)
        self.assertTrue(
            ret, "Rebalance is not yet complete on the volume "
            "%s" % self.volname)

        # Wait for IO to complete.
        ret = self._wait_for_untar_and_rsync_completion()
        self.assertFalse(ret, "IO didn't complete or failed on client")
        self.is_io_running = False

        # As we are running rsync and untar together, some files newly
        # created by the untar may not yet have been synced by rsync,
        # which would make the checksums differ. To handle this corner
        # case, rerun rsync once before comparing.
        cmd = "rsync -azr {} {}".format(self.linux_untar_dir, self.rsync_dir)
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0, "Failed sync left behind files")

        # Check data consistency on both the directories
        rsync_checksum = collect_mounts_arequal(
            self.mounts[0], path='rsyncuntarlinux/linuxuntar/')
        untar_checksum = collect_mounts_arequal(self.mounts[0],
                                                path='linuxuntar')
        self.assertEqual(
            rsync_checksum, untar_checksum,
            "Checksum on untar dir and checksum on rsync dir didn't match")
Example #6
    def test_reserve_limt_change_while_rebalance(self):
        """
        1) Create a distributed-replicated volume and start it.
        2) Enable the storage.reserve option on the volume using the command:
           gluster volume set <volname> storage.reserve 50
        3) Mount the volume on a client.
        4) Add some data on the mount point (should be within reserve limits).
        5) Add-brick and trigger rebalance. While rebalance is in progress,
           change the reserve limit to a lower value, say 30.
        6) Stop the rebalance.
        7) Reset the storage.reserve value to 50 as in step 2.
        8) Trigger rebalance again.
        9) While rebalance is in progress, change the reserve limit to a
           higher value, say 70.
        """

        # Setting storage.reserve 50
        self._set_vol_option({"storage.reserve": "50"})

        self.list_of_io_processes = []
        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        ret = run_linux_untar(self.clients[0], self.mounts[0].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Add bricks to the volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to add brick with rsync on volume %s"
                        % self.volname)

        # Trigger rebalance on the volume
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s"
                         % self.volname)

        # Setting storage.reserve 30
        self._set_vol_option({"storage.reserve": "30"})

        # Stopping Rebalance
        ret, _, _ = rebalance_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to stop rebalance on the volume %s"
                         % self.volname)

        # Setting storage.reserve 500
        self._set_vol_option({"storage.reserve": "500"})

        # Trigger rebalance on the volume
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s"
                         % self.volname)

        # Setting storage.reserve 70
        self._set_vol_option({"storage.reserve": "70"})
Example #7
    def test_add_brick_remove_brick_with_lookups_and_kernal_untar(self):
        """
        Test case:
        1. Enable brickmux on cluster, create a volume, start it and mount it.
        2. Start the below I/O from 4 clients:
           From client-1 : run script to create folders and files continuously
           From client-2 : start linux kernel untar
           From client-3 : while true;do find;done
           From client-4 : while true;do ls -lRt;done
        3. Kill brick process on one of the nodes.
        4. Add brick to the volume.
        5. Remove bricks from the volume.
        6. Validate if I/O was successful or not.
        """
        # Get the list of all bricks; one of them will be killed later
        bricks = get_all_bricks(self.mnode, self.volname)

        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar
        ret = run_linux_untar(self.clients[0],
                              self.mounts[0].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Run script to create folders and files continuously
        cmd = ("/usr/bin/env python {} create_deep_dirs_with_files "
               "--dirname-start-num 758 --dir-depth 2 "
               "--dir-length 100 --max-num-of-dirs 10 --num-of-files 105 {}".
               format(self.script_upload_path, self.mounts[1].mountpoint))
        ret = g.run_async(self.mounts[1].client_system, cmd)
        self.list_of_io_processes += [ret]

        # Run lookup operations from 2 clients
        cmd = ("cd {}; for i in `seq 1 1000000`;do find .; done".format(
            self.mounts[2].mountpoint))
        ret = g.run_async(self.mounts[2].client_system, cmd)
        self.list_of_io_processes += [ret]

        cmd = ("cd {}; for i in `seq 1 1000000`;do ls -lRt; done".format(
            self.mounts[3].mountpoint))
        ret = g.run_async(self.mounts[3].client_system, cmd)
        self.list_of_io_processes += [ret]

        # Kill brick process of one of the nodes.
        brick = choice(bricks)
        node, _ = brick.split(":")
        ret = kill_process(node, process_names="glusterfsd")
        self.assertTrue(ret,
                        "Failed to kill brick process of brick %s" % brick)

        # Add brick to volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to add brick on volume %s" % self.volname)
        g.log.info("Add brick to volume successful")

        # Remove bricks from the volume
        ret = shrink_volume(self.mnode, self.volname, rebalance_timeout=2400)
        self.assertTrue(ret, "Failed to remove-brick from volume")
        g.log.info("Remove-brick rebalance successful")

        # Validate if I/O was successful or not.
        ret = validate_io_procs(self.list_of_io_processes, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.is_io_running = False
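
Example #7 kills the glusterfsd process but never verifies that the chosen brick actually went offline. If such a check is wanted, the are_bricks_offline() helper already used in Example #4 could be reused; the snippet below is an optional addition, not part of the original test:

        # Optional check (not in the original test): confirm the brick
        # whose glusterfsd was killed is now reported offline
        ret = are_bricks_offline(self.mnode, self.volname, [brick])
        self.assertTrue(ret, "Brick %s is still online after killing "
                        "glusterfsd" % brick)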
Example #8
    def test_afr_node_reboot_self_heal(self):
        """
        Steps:
        1. Create a replica 3 volume
        2. Mount the volume on 3 clients
        3. Run the following workload from the clients:
           Client 1: Linux untar
           Client 2: Lookups (du)
           Client 3: Lookups (ls)
        4. Create a directory on mount point
        5. Create deep dirs and file in the directory created at step 4
        6. Perform node reboot
        7. Check for heal status
        8. Reboot another node
        9. Check for heal status
        """

        # Create a dir to start untar
        self.linux_untar_dir = "{}/{}".format(self.mounts[0].mountpoint,
                                              "linuxuntar")
        ret = mkdir(self.clients[0], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

        # Start linux untar on dir linuxuntar from client 1
        ret = run_linux_untar(self.clients[0], self.mounts[0].mountpoint,
                              dirs=tuple(['linuxuntar']))
        self.list_of_io_processes += ret
        self.is_io_running = True

        # Run lookup operation du from client 2
        cmd = ("cd {}; for i in `seq 1 1000000`;do du -sh; done"
               .format(self.mounts[1].mountpoint))
        ret = g.run_async(self.mounts[1].client_system, cmd)
        self.list_of_io_processes += [ret]

        # Run lookup operation ls from client 3
        cmd = ("cd {}; for i in `seq 1 1000000`;do ls -laRt; done"
               .format(self.mounts[2].mountpoint))
        ret = g.run_async(self.mounts[2].client_system, cmd)
        self.list_of_io_processes += [ret]

        # Create a dir to start crefi tool
        self.linux_untar_dir = "{}/{}".format(self.mounts[3].mountpoint,
                                              "crefi")
        ret = mkdir(self.clients[3], self.linux_untar_dir)
        self.assertTrue(ret, "Failed to create dir for crefi")

        # Create deep dirs and files on mount point from client 4
        list_of_fops = ("create", "rename", "chmod", "chown", "chgrp",
                        "hardlink", "truncate", "setxattr")
        for fops in list_of_fops:
            ret = run_crefi(self.clients[3],
                            self.linux_untar_dir, 10, 3, 3, thread=4,
                            random_size=True, fop=fops, minfs=0,
                            maxfs=102400, multi=True, random_filename=True)
            self.assertTrue(ret, "crefi failed during {}".format(fops))
            g.log.info("crefi PASSED FOR fop %s", fops)
        g.log.info("IOs were successful using crefi")

        for server_num in (1, 2):
            # Perform node reboot for servers
            g.log.info("Rebooting %s", self.servers[server_num])
            ret = g.run_async(self.servers[server_num], "reboot")
            self.assertTrue(ret, 'Failed to reboot node')

            # Monitor heal completion
            ret = monitor_heal_completion(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal has not yet completed')

            # Check if heal is completed
            ret = is_heal_complete(self.mnode, self.volname)
            self.assertTrue(ret, 'Heal is not complete')
            g.log.info('Heal is completed successfully')
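
Note that g.run_async(..., "reboot") only fires the reboot; asserting on its return value does not wait for the node to come back up. A hypothetical poll loop that could be called before monitoring heal might look like this (helper name and timings are assumptions):

    def _wait_for_node_to_come_up(self, node, timeout=300, interval=10):
        """Sketch: poll a rebooted node until SSH commands succeed again."""
        import time
        end_time = time.time() + timeout
        while time.time() < end_time:
            try:
                ret, _, _ = g.run(node, "uptime")
            except Exception:
                ret = 1
            if ret == 0:
                return True
            time.sleep(interval)
        return False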
Example #9
    def test_rebalance_start_not_fail(self):
        """
        1. On node N1, add "option transport.socket.bind-address N1" to
            /etc/glusterfs/glusterd.vol
        2. Create a replicate (1x3) and a disperse (4+2) volume with
            names longer than 108 chars
        3. Mount both volumes using node 1, where you added the
            "transport.socket.bind-address" option, and start IO (like untar)
        4. Perform add-brick of 3 bricks on the replicate volume
        5. Start rebalance on the replicate volume
        6. Perform add-brick of 6 bricks on the disperse volume
        7. Start rebalance on the disperse volume
        """
        cmd = ("sed -i 's/end-volume/option "
               "transport.socket.bind-address {}\\n&/g' "
               "/etc/glusterfs/glusterd.vol".format(self.mnode))
        disperse = ("disperse_e4upxjmtre7dl4797wedbp7r3jr8equzvmcae9f55t6z1"
                    "ffhrlk40jtnrzgo4n48fjf6b138cttozw3c6of3ze71n9urnjkshoi")
        replicate = ("replicate_e4upxjmtre7dl4797wedbp7r3jr8equzvmcae9f55t6z1"
                     "ffhrlk40tnrzgo4n48fjf6b138cttozw3c6of3ze71n9urnjskahn")

        volnames = (replicate, disperse)
        for volume, vol_name in (("disperse", disperse), ("replicate",
                                                          replicate)):

            bricks_list = form_bricks_list(self.mnode, volume,
                                           6 if volume == "disperse" else 3,
                                           self.servers, self.all_servers_info)
            if volume == "replicate":
                ret, _, _ = volume_create(self.mnode,
                                          replicate,
                                          bricks_list,
                                          replica_count=3)

            else:
                ret, _, _ = volume_create(self.mnode,
                                          disperse,
                                          bricks_list,
                                          force=True,
                                          disperse_count=6,
                                          redundancy_count=2)

            self.assertFalse(
                ret, "Unexpected: Volume create '{}' failed ".format(vol_name))
            ret, _, _ = volume_start(self.mnode, vol_name)
            self.assertFalse(ret, "Failed to start volume")

        # Add entry in 'glusterd.vol'
        ret, _, _ = g.run(self.mnode, cmd)
        self.assertFalse(ret, "Failed to add entry in 'glusterd.vol' file")

        self.list_of_io_processes = []

        # mount volume
        self.mount = ("/mnt/replicated_mount", "/mnt/disperse_mount")
        for mount_dir, volname in zip(self.mount, volnames):
            ret, _, _ = mount_volume(volname, "glusterfs", mount_dir,
                                     self.mnode, self.clients[0])
            self.assertFalse(
                ret, "Failed to mount the volume '{}'".format(mount_dir))

            # Run IO
            # Create a dir to start untar
            self.linux_untar_dir = "{}/{}".format(mount_dir, "linuxuntar")
            ret = mkdir(self.clients[0], self.linux_untar_dir)
            self.assertTrue(ret, "Failed to create dir linuxuntar for untar")

            # Start linux untar on dir linuxuntar
            ret = run_linux_untar(self.clients[:1],
                                  mount_dir,
                                  dirs=tuple(['linuxuntar']))
            self.list_of_io_processes += ret
            self.is_io_running = True

        # Add Brick to replicate Volume
        bricks_list = form_bricks_list(self.mnode, replicate, 3, self.servers,
                                       self.all_servers_info, "replicate")
        ret, _, _ = add_brick(self.mnode, replicate, bricks_list, force=True)
        self.assertFalse(ret, "Failed to add-brick '{}'".format(replicate))

        # Trigger Rebalance on the volume
        ret, _, _ = rebalance_start(self.mnode, replicate)
        self.assertFalse(
            ret,
            "Failed to start rebalance on the volume '{}'".format(replicate))

        # Add Brick to disperse Volume
        bricks_list = form_bricks_list(self.mnode, disperse, 6, self.servers,
                                       self.all_servers_info, "disperse")

        ret, _, _ = add_brick(self.mnode, disperse, bricks_list, force=True)
        self.assertFalse(ret, "Failed to add-brick '{}'".format(disperse))

        # Trigger Rebalance on the volume
        ret, _, _ = rebalance_start(self.mnode, disperse)
        self.assertFalse(
            ret, "Failed to start rebalance on the volume {}".format(disperse))

        # Check if rebalance is completed on both the volumes
        for volume in (replicate, disperse):
            ret = wait_for_rebalance_to_complete(self.mnode,
                                                 volume,
                                                 timeout=600)
            self.assertTrue(
                ret,
                "Rebalance is not Compleated on Volume '{}'".format(volume))