Пример #1
0
    def test_afr_reset_brick_heal_full(self):
        """
         1. Create files/dirs from mount point
         2. With IO in progress execute reset-brick start
         3. Now format the disk from back-end, using rm -rf <brick path>
         4. Execute reset brick commit and check for the brick is online.
         5. Issue volume heal using "gluster vol heal <volname> full"
         6. Check arequal for all bricks to verify all backend bricks
            including the resetted brick have same data
        """
        self.all_mounts_procs = []
        for count, mount_obj in enumerate(self.mounts):
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 3 --dir-length 5 "
                   "--max-num-of-dirs 5 --num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
        brick_to_reset = choice(all_bricks)

        # Start reset brick
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="start")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick: %s started", brick_to_reset)

        # Validate the brick is offline
        ret = are_bricks_offline(self.mnode, self.volname, [brick_to_reset])
        self.assertTrue(ret, "Brick:{} is still online".format(brick_to_reset))

        # rm -rf of the brick directory
        node, brick_path = brick_to_reset.split(":")
        ret = rmdir(node, brick_path, force=True)
        self.assertTrue(
            ret, "Unable to delete the brick {} on "
            "node {}".format(brick_path, node))

        # Reset brick commit
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="commit")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick committed successfully")

        # Check the brick is online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Few volume processess are offline for the "
            "volume: {}".format(self.volname))

        # Trigger full heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, "Unable  to trigger the heal full command")

        # Wait for the heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal didn't complete in 20 mins time")

        # Validate io on the clients
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on the mounts")
        self.all_mounts_procs *= 0

        # Check arequal of the back-end bricks after heal completion
        all_subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for subvol in all_subvols:
            ret, arequal_from_subvol = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Arequal is collected successfully across the"
                " bricks in the subvol {}".format(subvol))
            self.assertEqual(
                len(set(arequal_from_subvol)), 1, "Arequal is "
                "same on all the bricks in the subvol")
    def test_gfid_split_brain_resolution(self):
        """
        Description: Simulates gfid split brain on multiple files in a dir and
        resolve them via `bigger-file`, `mtime` and `source-brick` methods

        Steps:
        - Create and mount a replicated volume, create a dir and ~10 data files
        - Simulate gfid splits in 9 of the files
        - Resolve each 3 set of files using `bigger-file`, `mtime` and
          `source-bricks` split-brain resoultion methods
        - Trigger and monitor for heal completion
        - Validate all the files are healed and arequal matches for bricks in
          subvols
        """
        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)
        arbiter = self.volume_type.find('arbiter') >= 0

        # Disable self-heal daemon and set `quorum-type` option to `none`
        ret = set_volume_options(self.mnode, self.volname, {
            'self-heal-daemon': 'off',
            'cluster.quorum-type': 'none'
        })
        self.assertTrue(
            ret, 'Not able to disable `quorum-type` and '
            '`self-heal` daemon volume options')

        # Create required dir and files from the mount
        split_dir = 'gfid_split_dir'
        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
                   ' 1M > %s/file$i; done;')
        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
        self.assertTrue(ret, 'Unable to create a directory from mount point')
        ret, _, _ = g.run(client, file_io % (m_point, split_dir))

        # `file{4,5,6}` are re-created every time to be used in `bigger-file`
        # resolution method
        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
        split_cmds = {
            1:
            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
            2:
            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
            3: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
            4: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
        }

        # Get subvols and simulate entry split brain
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
        msg = ('Unable to bring files under {} dir to entry split brain while '
               '{} are down')
        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                       1):
            # Bring down two bricks from each subvol
            ret = bring_bricks_offline(self.volname, list(bricks))
            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[index]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bricks will be brought down only two times in case of arbiter and
            # bringing remaining files into split brain for `latest-mtime` heal
            if arbiter and index == 2:
                ret, _, _ = g.run(client,
                                  'cd {}; {}'.format(m_point, split_cmds[4]))
                self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bring offline bricks online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                bricks,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

        # Enable self-heal daemon, trigger heal and assert volume is in split
        # brain condition
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self heal daemon')

        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, 'Not all self heal daemons are online')

        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger index heal on the volume')

        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, 'Volume should be in split brain condition')

        # Select source brick and take note of files in source brick
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
        files = [
            self._get_files_in_brick(path, split_dir) for path in source_bricks
        ]

        # Resolve `file1, file2, file3` gfid split files using `source-brick`
        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
               'source-brick {} /' + split_dir + '/{}')
        for index, source_brick in enumerate(source_bricks):
            for each_file in files[index]:
                run_cmd = cmd.format(source_brick, each_file)
                self._run_cmd_and_assert(run_cmd)

        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain bigger-file /' + split_dir + '/{}')
        for each_file in ('file4', 'file5', 'file6'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain latest-mtime /' + split_dir + '/{}')
        for each_file in ('file7', 'file8', 'file9'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Unless `shd` is triggered manually/automatically files will still
        # appear in `heal info`
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full self heal')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, 'All files in volume should be healed after healing files via'
            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

        # Validate normal file `file10` and healed files don't differ in
        # subvols via an `arequal`
        for subvol in subvols:
            # Disregard last brick if volume is of arbiter type
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksum among {} is identified'.format(subvol[0:stop]))

        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
                   '`bigger-file` and `latest-mtime` methods is complete')
Пример #3
0
    def test_server_side_healing_happens_only_when_glustershd_running(self):
        """
        Test Script which verifies that the server side healing must happen
        only if the heal daemon is running on the node where source brick
        resides.

         * Create and start the Replicate volume
         * Check the glustershd processes - Only 1 glustershd should be listed
         * Bring down the bricks without affecting the cluster
         * Create files on volume
         * kill the glustershd on node where bricks is running
         * bring the bricks up which was killed in previous steps
         * check the heal info - heal info must show pending heal info, heal
           shouldn't happen since glustershd is down on source node
         * issue heal
         * trigger client side heal
         * heal should complete successfully
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-lines
        # Setting Volume options
        options = {
            "metadata-self-heal": "on",
            "entry-self-heal": "on",
            "data-self-heal": "on"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Check the self-heal daemon process
        ret, pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in verifying self heal daemon process"
            " on all nodes %s", self.servers)

        # Select the bricks to bring offline
        bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
            self.mnode, self.volname))
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Brought down the brick process "
                   "for %s", bricks_to_bring_offline)

        # Write files on all mounts
        all_mounts_procs, num_files_to_write = [], 100
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_files "
                   "-f %s --base-file-name file %s" %
                   (self.script_upload_path, num_files_to_write,
                    mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Get online bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Get the nodes where bricks are running
        bring_offline_glustershd_nodes = []
        for brick in online_bricks:
            bring_offline_glustershd_nodes.append(brick.split(":")[0])
        g.log.info("self heal deamon on nodes %s to be killed",
                   bring_offline_glustershd_nodes)

        # Kill the self heal daemon process on nodes
        ret = bring_self_heal_daemon_process_offline(
            bring_offline_glustershd_nodes)
        self.assertTrue(
            ret, ("Unable to bring self heal daemon process"
                  " offline for nodes %s" % bring_offline_glustershd_nodes))
        g.log.info(
            "Sucessfully brought down self heal process for "
            "nodes %s", bring_offline_glustershd_nodes)

        # Check the heal info
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info %s for the volume %s",
                   heal_info, self.volname)

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

        # Issue heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertFalse(ret,
                         ("Able to trigger heal on volume %s where "
                          "self heal daemon is not running" % self.volname))
        g.log.info(
            "Expected : Unable to trigger heal on volume %s where "
            "self heal daemon is not running", self.volname)

        # Wait for 130 sec to heal
        ret = monitor_heal_completion(self.mnode, self.volname, 130)
        self.assertFalse(ret, ("Heal Completed on volume %s" % self.volname))
        g.log.info("Expected : Heal pending on volume %s", self.volname)

        # Check the heal info
        heal_info_after_triggering_heal = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)

        # Compare with heal pending with the files wrote
        for node in online_bricks:
            self.assertGreaterEqual(
                int(heal_info_after_triggering_heal[node]['numberOfEntries']),
                num_files_to_write,
                ("Some of the files are healed from source bricks %s where "
                 "self heal daemon is not running" % node))
        g.log.info("EXPECTED: No files are healed from source bricks where "
                   "self heal daemon is not running")

        # Unmount and Mount volume again as volume options were set
        # after mounting the volume
        for mount_obj in self.mounts:
            ret, _, _ = umount_volume(mount_obj.client_system,
                                      mount_obj.mountpoint)
            self.assertEqual(ret, 0,
                             "Failed to unmount %s" % mount_obj.client_system)
            ret, _, _ = mount_volume(self.volname,
                                     mtype='glusterfs',
                                     mpoint=mount_obj.mountpoint,
                                     mserver=self.mnode,
                                     mclient=mount_obj.client_system)
            self.assertEqual(ret, 0,
                             "Failed to mount %s" % mount_obj.client_system)

        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s read %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "Reads failed on some of the clients")
        g.log.info("Reads successful on all mounts")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to heal the pending entries")
        g.log.info("Successfully healed the pending entries for volume %s",
                   self.volname)
    def test_manual_heal_full_should_trigger_heal(self):
        """
        - Create a single brick volume
        - Add some files and directories
        - Get arequal from mountpoint
        - Add-brick such that this brick makes the volume a replica vol 1x3
        - Start heal full
        - Make sure heal is completed
        - Get arequals from all bricks and compare with arequal from mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        # Start IO on mounts
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-length 1 --dir-depth 1 --max-num-of-dirs 1 "
                   "--num-of-files 10 %s" % (self.script_upload_path,
                                             mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is started successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal for mount before adding bricks
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after healing is successful')
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Form brick list to add
        bricks_to_add = form_bricks_list(self.mnode, self.volname, 2,
                                         self.servers, self.all_servers_info)
        g.log.info('Brick list to add: %s', bricks_to_add)

        # Add bricks
        ret, _, _ = add_brick(self.mnode, self.volname, bricks_to_add,
                              replica_count=3)
        self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
        g.log.info("Adding bricks is successful on volume %s", self.volname)

        # Make sure the newly added bricks are available in the volume
        # get the bricks for the volume
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for brick in bricks_to_add:
            self.assertIn(brick, bricks_list,
                          'Brick %s is not in brick list' % brick)
        g.log.info('New bricks are present in the volume')

        # Make sure volume change from distribute to replicate volume
        vol_info_dict = get_volume_type_info(self.mnode, self.volname)
        vol_type = vol_info_dict['volume_type_info']['typeStr']
        self.assertEqual('Replicate', vol_type,
                         'Volume type is not converted to Replicate '
                         'after adding bricks')
        g.log.info('Volume type is successfully converted to Replicate '
                   'after adding bricks')

        # Start healing
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        ret, arequals = collect_bricks_arequal(bricks_list)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for arequal in arequals:
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and %s are not equal')
        g.log.info('All arequals are equal for replicated')
    def test_heal_full_after_deleting_files(self):
        """
        - Create IO
        - Calculate arequal from mount
        - Delete data from backend from the EC volume
        - Trigger heal full
        - Check if heal is completed
        - Check for split-brain
        - Calculate arequal checksum and compare it
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -n 2 -f 20 %s" %
                       (self.script_upload_path, mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system,
                               command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get areequal before deleting the files from brick
        g.log.info('Getting areequal before getting bricks offline...')
        ret, result_before_killing_procs = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal before getting bricks offline '
                   'is successful')

        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

        # Delete data from backend from the erasure node
        for subvol in subvols:
            erasure = subvol[-1]
            g.log.info('Clearing ec brick %s', erasure)
            node, brick_path = erasure.split(':')
            ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
            g.log.error('Clearing ec brick %s is unsuccessful', erasure)
            self.assertFalse(ret, err)
        g.log.info('Clearing data from brick is successful')

        # Trigger heal full
        ret = trigger_heal_full(self.mnode, self.volname)

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get areequal after healing
        g.log.info('Getting areequal after getting bricks online...')
        ret, result_after_healing = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting areequal after getting bricks online '
                   'is successful')

        # Comparing areequals
        self.assertEqual(
            result_before_killing_procs, result_after_healing,
            'Areequals areequals before before killing arbiter '
            'processes and after healing are not equal')
        g.log.info('Areequals areequals before before killing arbiter '
                   'processes and after healing equal')
    def test_replace_brick_self_heal_io_in_progress(self):
        """
        - Create directory on mount point and write files/dirs
        - Create another set of files (1K files)
        - While creation of files/dirs are in progress Kill one brick
        - Remove the contents of the killed brick(simulating disk replacement)
        - When the IO's are still in progress, restart glusterd on the nodes
          where we simulated disk replacement to bring back bricks online
        - Start volume heal
        - Wait for IO's to complete
        - Verify whether the files are self-healed
        - Calculate arequals of the mount point and all the bricks
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        # Create dirs with files
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 10 %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))
        ret, _, err = g.run(self.mounts[0].client_system,
                            command,
                            user=self.mounts[0].user)
        self.assertFalse(ret, err)
        g.log.info("IO is successful")

        # Creating another set of files (1K files)
        self.all_mounts_procs = []

        # Create dirs with files
        g.log.info('Creating 1K files...')
        command = ("/usr/bin/env python %s create_files "
                   "-f 1500 --fixed-file-size 10k %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Remove the content of the killed bricks
        for brick in bricks_to_bring_offline:
            brick_node, brick_path = brick.split(':')

            # Removing files
            command = ('cd %s ; rm -rf *' % brick_path)
            ret, _, err = g.run(brick_node, command)
            self.assertFalse(ret, err)
            g.log.info('Files are deleted on brick %s', brick)

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal daemons are online")

        # Start healing
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Check arequals for "replicated"
        all_bricks = get_all_bricks(self.mnode, self.volname)
        if self.volume_type == "replicated":

            # Get arequal after bricks are online
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting arequal after successfully bringing'
                       'bricks online.')
            mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

            # Get arequal on bricks and compare with mount_point_total
            ret, arequals = collect_bricks_arequal(all_bricks)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(
                    mount_point_total, brick_total,
                    'Arequals for mountpoint and brick '
                    'are not equal')
                g.log.info('Arequals for mountpoint and brick are equal')

        # Check arequals for "distributed-replicated"
        if self.volume_type == "distributed-replicated":

            # Get the subvolumes
            subvols_dict = get_subvols(self.mnode, self.volname)
            num_subvols = len(subvols_dict['volume_subvols'])
            g.log.info("Number of subvolumes in volume %s:", num_subvols)

            # Get arequals and compare
            for i in range(0, num_subvols):

                # Get arequal for first brick
                subvol_brick_list = subvols_dict['volume_subvols'][i]
                ret, arequal = collect_bricks_arequal(subvol_brick_list[0])
                self.assertTrue(ret, 'Failed to get arequal on first brick')
                first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

                # Get arequal for every brick and compare with first brick
                ret, arequals = collect_bricks_arequal(subvol_brick_list)
                self.assertTrue(ret, 'Failed to get arequal on bricks')
                for arequal in arequals:
                    brick_total = arequal.splitlines()[-1].split(':')[-1]
                    self.assertEqual(
                        first_brick_total, brick_total,
                        'Arequals for subvol and brick are '
                        'not equal')
                    g.log.info('Arequals for subvol and brick are equal')
Пример #7
0
    def test_heal_full_node_reboot(self):
        """
        - Create IO from mountpoint.
        - Calculate arequal from mount.
        - Delete data from backend from the EC volume.
        - Trigger heal full.
        - Disable Heal.
        - Again Enable and do Heal full.
        - Reboot a Node.
        - Calculate arequal checksum and compare it.
        """
        # pylint: disable=too-many-locals,too-many-statements
        # Creating files on client side
        for mount_obj in self.mounts:
            g.log.info("Generating data for %s:%s",
                       mount_obj.client_system, mount_obj.mountpoint)

            # Create dirs with file
            g.log.info('Creating dirs with file...')
            command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                       "-d 2 -l 2 -n 2 -f 20 %s" % (
                           self.script_upload_path,
                           mount_obj.mountpoint))

            proc = g.run_async(mount_obj.client_system, command,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        g.log.info("Wait for IO to complete and validate IO ...")
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal before deleting the files from brick
        g.log.info('Getting arequal before getting bricks offline...')
        ret, result_before_killing_procs = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before getting bricks offline '
                   'is successful')

        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']

        # Delete data from backend from the erasure node
        for subvol in subvols:
            erasure = subvol[-1]
            g.log.info('Clearing ec brick %s', erasure)
            node, brick_path = erasure.split(':')
            ret, _, err = g.run(node, 'cd %s/ ; rm -rf *' % brick_path)
            self.assertFalse(ret, err)
            g.log.info('Clearing ec brick %s is successful', erasure)
        g.log.info('Clearing data from brick is unsuccessful')

        # Trigger heal full
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full heal.')

        # Disable Heal and Enable Heal Full Again
        g.log.info("Disabling Healon the Servers")
        ret = disable_heal(self.mnode, self.volname)
        self.assertTrue(ret, "Disabling Failed")
        g.log.info("Healing is Now Disabled")

        g.log.info("Enbaling Heal Now")
        ret = enable_heal(self.mnode, self.volname)
        self.assertTrue(ret, "Enabling Heal failed")
        g.log.info("Healing is now enabled")
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full heal.')

        # Reboot A Node
        g.log.info("Rebooting Node from the Cluster")
        subvols_dict = get_subvols(self.mnode, self.volname)
        nodes_to_reboot = []
        for subvol in subvols_dict['volume_subvols']:
            # Define nodes to reboot
            brick_list = subvol[1:2]
            for brick in brick_list:
                node, brick_path = brick.split(':')
                if node not in nodes_to_reboot:
                    nodes_to_reboot.append(node)

        # Reboot nodes on subvol and wait while rebooting
        g.log.info("Rebooting the nodes %s", nodes_to_reboot)
        ret = reboot_nodes(nodes_to_reboot)
        self.assertTrue(ret, 'Failed to reboot nodes %s '
                        % nodes_to_reboot)

        # Check if nodes are online
        counter = 0
        timeout = 700
        _rc = False
        while counter < timeout:
            ret, reboot_results = are_nodes_online(nodes_to_reboot)
            if not ret:
                g.log.info("Nodes are offline, Retry after 5 seconds ... ")
                sleep(5)
                counter = counter + 5
            _rc = True
            break

        if not _rc:
            for node in reboot_results:
                if not reboot_results[node]:
                    g.log.error("Node %s is offline even after "
                                "%d minutes", node, timeout / 60.0)
        g.log.info("All nodes %s are up and running", nodes_to_reboot)

        # Trigger Heal Full
        ret = trigger_heal_full(self.mnode, self.volname)
        if not ret:
            sleep(10)
            ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full heal.')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal after healing
        g.log.info('Getting arequal after getting bricks online...')
        ret, result_after_healing = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal after getting bricks online '
                   'is successful')

        # Comparing arequals
        self.assertEqual(result_before_killing_procs, result_after_healing,
                         'Arequals before killing arbiter '
                         'processes and after healing are not equal')
        g.log.info('Arequals before killing arbiter '
                   'processes and after healing are equal')