Example #1
    def check_arequal_from_mount_point_and_bricks(self):
        """
        Check if arequals of mount point and bricks
        are the same.
        """
        # Check arequals for "replicated"
        all_bricks = get_all_bricks(self.mnode, self.volname)
        if self.volume_type == "replicated":
            # Get arequal before getting bricks offline
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting arequal before getting bricks offline '
                       'is successful')
            mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

            # Get arequal on bricks and compare with mount_point_total
            ret, arequals = collect_bricks_arequal(all_bricks)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(mount_point_total, brick_total,
                                 'Arequals for mountpoint and brick '
                                 'are not equal')
                g.log.info('Arequals for mountpoint and brick are equal')
            g.log.info('All arequals are equal for replicated')

        # Check arequals for "distributed-replicated"
        if self.volume_type == "distributed-replicated":
            # get the subvolumes
            subvols_dict = get_subvols(self.mnode, self.volname)
            num_subvols = len(subvols_dict['volume_subvols'])
            g.log.info("Number of subvolumes in volume %s:", num_subvols)

            # Get arequals and compare
            for i in range(0, num_subvols):
                # Get arequal for first brick
                subvol_brick_list = subvols_dict['volume_subvols'][i]
                ret, arequal = collect_bricks_arequal([subvol_brick_list[0]])
                self.assertTrue(ret, 'Failed to get arequal on first brick')

                # Get arequal for every brick and compare with first brick
                first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]
                ret, arequals = collect_bricks_arequal(subvol_brick_list)
                self.assertTrue(ret, 'Failed to get arequal on bricks')
                for arequal in arequals:
                    brick_total = arequal.splitlines()[-1].split(':')[-1]
                    self.assertEqual(first_brick_total, brick_total,
                                     'Arequals for subvol and brick are '
                                     'not equal')
                    g.log.info('Arequals for subvol and brick are equal')
            g.log.info('All arequals are equal for distributed-replicated')
    def _validate_arequal_and_perform_lookup(self, subvols, stop):
        '''Refactor of steps common to all tests: Validate arequal from bricks
        backend and perform a lookup of all files from mount'''
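        # `stop` trims each subvol's brick list (e.g. to drop an arbiter
        # brick, which holds no data) before the per-brick arequal
        # checksums are collected and compared.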
        arequal = None
        for subvol in subvols:
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksum among {} is identified'.format(subvol[0:stop]))

        # Validate arequal of mount point matching against backend bricks
        ret, mp_arequal = collect_mounts_arequal(self.mounts)
        self.assertTrue(
            ret, 'Unable to get `arequal` checksum on '
            '{}'.format(str(self.mounts)))
        self.assertEqual(
            len(set(arequal + mp_arequal)), 1, 'Mismatch of `arequal` '
            'checksum among bricks and mount is identified')

        # Perform a lookup of all files and directories on mounts
        self.assertTrue(list_all_files_and_dirs_mounts(self.mounts),
                        'Failed to list all files and dirs from mount')
    def _check_arequal_checksum_for_the_volume(self):
        """
        Check if arequals of mount point and bricks
        are the same.
        """
        if self.volume_type == "replicated":
            # Check arequals for "replicated"
            brick_list = get_all_bricks(self.mnode, self.volname)

            # Get arequal before getting bricks offline
            ret, arequals = collect_mounts_arequal([self.mounts[0]])
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting arequal before getting bricks offline '
                       'is successful')

            # Get arequal on bricks and compare with mount_point_total
            self._check_arequal_on_bricks_with_a_specific_arequal(
                arequals, brick_list)

        # Check arequals for "distributed-replicated"
        if self.volume_type == "distributed-replicated":
            # Get the subvolumes
            subvols_dict = get_subvols(self.mnode, self.volname)
            num_subvols = len(subvols_dict['volume_subvols'])

            # Get arequals and compare
            for i in range(0, num_subvols):
                # Get arequal for first brick
                brick_list = subvols_dict['volume_subvols'][i]
                ret, arequals = collect_bricks_arequal([brick_list[0]])
                self.assertTrue(ret, 'Failed to get arequal on first brick')

                # Get arequal for every brick and compare with first brick
                self._check_arequal_on_bricks_with_a_specific_arequal(
                    arequals, brick_list)
Example #4
    def _check_arequal_on_bricks_with_a_specific_arequal(
            self, arequal, brick_list):
        """
        Compare an initial arequal checksum with bricks from a given brick
        list
        """
        init_val = arequal[0].splitlines()[-1].split(':')[-1]
        ret, arequals = collect_bricks_arequal(brick_list)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for brick_arequal in arequals:
            brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(init_val, brick_total, 'Arequals not matching')
    def test_gfid_split_brain_resolution(self):
        """
        Description: Simulates gfid split brain on multiple files in a dir and
        resolve them via `bigger-file`, `latest-mtime` and `source-brick` methods

        Steps:
        - Create and mount a replicated volume, create a dir and ~10 data files
        - Simulate gfid splits in 9 of the files
        - Resolve each set of 3 files using the `bigger-file`, `latest-mtime`
          and `source-brick` split-brain resolution methods
        - Trigger and monitor for heal completion
        - Validate all the files are healed and arequal matches for bricks in
          subvols
        """
        io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)
        arbiter = self.volume_type.find('arbiter') >= 0

        # Disable self-heal daemon and set `quorum-type` option to `none`
        ret = set_volume_options(self.mnode, self.volname, {
            'self-heal-daemon': 'off',
            'cluster.quorum-type': 'none'
        })
        self.assertTrue(
            ret, 'Not able to disable `quorum-type` and '
            '`self-heal` daemon volume options')

        # Create required dir and files from the mount
        split_dir = 'gfid_split_dir'
        file_io = ('cd %s; for i in {1..10}; do ' + io_cmd +
                   ' 1M > %s/file$i; done;')
        ret = mkdir(client, '{}/{}'.format(m_point, split_dir))
        self.assertTrue(ret, 'Unable to create a directory from mount point')
        ret, _, _ = g.run(client, file_io % (m_point, split_dir))

        # `file{4,5,6}` are re-created every time to be used in `bigger-file`
        # resolution method
        cmd = 'rm -rf {0}/file{1} && {2} {3}M > {0}/file{1}'
        split_cmds = {
            1:
            ';'.join(cmd.format(split_dir, i, io_cmd, 2) for i in range(1, 7)),
            2:
            ';'.join(cmd.format(split_dir, i, io_cmd, 3) for i in range(4, 7)),
            3: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(4, 10)),
            4: ';'.join(
                cmd.format(split_dir, i, io_cmd, 1) for i in range(7, 10)),
        }
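        # Each split_cmds entry recreates a subset of files while a different
        # brick pair is offline (see the loop below), producing gfid
        # mismatches later resolved via `source-brick` (file1-3),
        # `bigger-file` (file4-6) and `latest-mtime` (file7-9).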

        # Get subvols and simulate entry split brain
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        self.assertTrue(subvols, 'Not able to get list of subvols')
        msg = ('Unable to bring files under {} dir to entry split brain while '
               '{} are down')
        for index, bricks in enumerate(self._get_two_bricks(subvols, arbiter),
                                       1):
            # Bring down two bricks from each subvol
            ret = bring_bricks_offline(self.volname, list(bricks))
            self.assertTrue(ret, 'Unable to bring {} offline'.format(bricks))

            ret, _, _ = g.run(client,
                              'cd {}; {}'.format(m_point, split_cmds[index]))
            self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # For arbiter volumes bricks are brought down only twice, so bring
            # the remaining files into split brain here for the `latest-mtime`
            # heal
            if arbiter and index == 2:
                ret, _, _ = g.run(client,
                                  'cd {}; {}'.format(m_point, split_cmds[4]))
                self.assertEqual(ret, 0, msg.format(split_dir, bricks))

            # Bring offline bricks online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                bricks,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(bricks))

        # Enable self-heal daemon, trigger heal and assert volume is in split
        # brain condition
        ret = enable_self_heal_daemon(self.mnode, self.volname)
        self.assertTrue(ret, 'Failed to enable self heal daemon')

        ret = wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, 'Not all self heal daemons are online')

        ret = trigger_heal(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger index heal on the volume')

        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertTrue(ret, 'Volume should be in split brain condition')

        # Select source brick and take note of files in source brick
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
        source_bricks = [choice(subvol[0:stop]) for subvol in subvols]
        files = [
            self._get_files_in_brick(path, split_dir) for path in source_bricks
        ]
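        # One healthy data brick per subvol (arbiter excluded via `stop`) is
        # picked as the heal source, and the split files present on it are
        # noted so `source-brick` resolution can be run per file below.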

        # Resolve `file1, file2, file3` gfid split files using `source-brick`
        cmd = ('gluster volume heal ' + self.volname + ' split-brain '
               'source-brick {} /' + split_dir + '/{}')
        for index, source_brick in enumerate(source_bricks):
            for each_file in files[index]:
                run_cmd = cmd.format(source_brick, each_file)
                self._run_cmd_and_assert(run_cmd)

        # Resolve `file4, file5, file6` gfid split files using `bigger-file`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain bigger-file /' + split_dir + '/{}')
        for each_file in ('file4', 'file5', 'file6'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Resolve `file7, file8, file9` gfid split files using `latest-mtime`
        cmd = ('gluster volume heal ' + self.volname +
               ' split-brain latest-mtime /' + split_dir + '/{}')
        for each_file in ('file7', 'file8', 'file9'):
            run_cmd = cmd.format(each_file)
            self._run_cmd_and_assert(run_cmd)

        # Unless `shd` is triggered manually/automatically files will still
        # appear in `heal info`
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Unable to trigger full self heal')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(
            ret, 'All files in volume should be healed after healing files via'
            ' `source-brick`, `bigger-file`, `latest-mtime` methods manually')

        # Validate normal file `file10` and healed files don't differ in
        # subvols via an `arequal`
        for subvol in subvols:
            # Disregard last brick if volume is of arbiter type
            ret, arequal = collect_bricks_arequal(subvol[0:stop])
            self.assertTrue(
                ret, 'Unable to get `arequal` checksum on '
                '{}'.format(subvol[0:stop]))
            self.assertEqual(
                len(set(arequal)), 1, 'Mismatch of `arequal` '
                'checksum among {} is identified'.format(subvol[0:stop]))

        g.log.info('Pass: Resolution of gfid split-brain via `source-brick`, '
                   '`bigger-file` and `latest-mtime` methods is complete')
Example #6
    def test_afr_reset_brick_heal_full(self):
        """
         1. Create files/dirs from mount point
         2. With IO in progress execute reset-brick start
         3. Now format the disk from back-end, using rm -rf <brick path>
         4. Execute reset brick commit and check that the brick is online.
         5. Issue volume heal using "gluster vol heal <volname> full"
         6. Check arequal for all bricks to verify all backend bricks
            including the reset brick have the same data
        """
        self.all_mounts_procs = []
        for count, mount_obj in enumerate(self.mounts):
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 3 --dir-length 5 "
                   "--max-num-of-dirs 5 --num-of-files 5 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
        brick_to_reset = choice(all_bricks)

        # Start reset brick
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="start")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick: %s started", brick_to_reset)

        # Validate the brick is offline
        ret = are_bricks_offline(self.mnode, self.volname, [brick_to_reset])
        self.assertTrue(ret, "Brick:{} is still online".format(brick_to_reset))

        # rm -rf of the brick directory
        node, brick_path = brick_to_reset.split(":")
        ret = rmdir(node, brick_path, force=True)
        self.assertTrue(
            ret, "Unable to delete the brick {} on "
            "node {}".format(brick_path, node))

        # Reset brick commit
        ret, _, err = reset_brick(self.mnode,
                                  self.volname,
                                  src_brick=brick_to_reset,
                                  option="commit")
        self.assertEqual(ret, 0, err)
        g.log.info("Reset brick committed successfully")

        # Check the brick is online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(
            ret, "Few volume processess are offline for the "
            "volume: {}".format(self.volname))

        # Trigger full heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, "Unable  to trigger the heal full command")

        # Wait for the heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Heal didn't complete in 20 mins time")

        # Validate io on the clients
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on the mounts")
        self.all_mounts_procs = []

        # Check arequal of the back-end bricks after heal completion
        all_subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for subvol in all_subvols:
            ret, arequal_from_subvol = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal across the"
                " bricks in the subvol {}".format(subvol))
            self.assertEqual(
                len(set(arequal_from_subvol)), 1, "Arequal is "
                "not same on all the bricks in the subvol")
    def test_heal_for_conservative_merge_with_two_bricks_blame(self):
        """
        1) Create 1x3 volume and fuse mount the volume
        2) On mount created a dir dir1
        3) Pkill glusterfsd on node n1 (b2 on node2 and b3 on node3 are up)
        4) touch f{1..10} on the mountpoint
        5) b2 and b3 xattrs would be blaming b1 as files are created while
           b1 is down
        6) Reset the b3 xattrs to NOT blame b1 by using setfattr
        7) Now pkill glusterfsd of b2 on node2
        8) Restart glusterd on node1 to bring up b1
        9) Now brick b1 is online, b2 is down, b3 is online
        10) touch x{1..10} under dir1 itself
        11) Again reset xattr on node3 of b3 so that it doesn't blame b2,
        as done for b1 in step 6
        12) Do restart glusterd on node2 hosting b2 to bring all bricks online
        13) Check for heal info, split-brain and arequal for the bricks
        """
        # pylint: disable=too-many-locals
        # Create dir `dir1/` on mountpoint
        path = self.mounts[0].mountpoint + "/dir1"
        ret = mkdir(self.mounts[0].client_system, path, parents=True)
        self.assertTrue(ret, "Directory {} creation failed".format(path))

        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
        brick1, brick2, brick3 = all_bricks

        # Bring first brick offline
        self._bring_brick_offline_and_check(brick1)

        # touch f{1..10} files on the mountpoint
        cmd = ("cd {mpt}; for i in `seq 1 10`; do touch f$i"
               "; done".format(mpt=path))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Unable to create files on mountpoint")

        # Check b2 and b3 xattrs are blaming b1 and are same
        self.assertEqual(self._get_fattr_for_the_brick(brick2),
                         self._get_fattr_for_the_brick(brick3),
                         "Both the bricks xattrs are not blaming "
                         "brick: {}".format(brick1))

        # Reset the xattrs of dir1 on b3 for brick b1
        first_xattr_to_reset = "trusted.afr.{}-client-0".format(self.volname)
        xattr_value = "0x000000000000000000000000"
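        # trusted.afr.<volname>-client-0 is the AFR changelog xattr through
        # which b2/b3 blame brick b1 (client-0); writing an all-zero value on
        # b3's copy of dir1 clears that blame (step 6 of the test plan).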
        host, brick_path = brick3.split(":")
        brick_path = brick_path + "/dir1"
        ret = set_fattr(host, brick_path, first_xattr_to_reset, xattr_value)
        self.assertTrue(ret, "Unable to set xattr for the directory")

        # Kill brick2 on the node2
        self._bring_brick_offline_and_check(brick2)

        # Restart glusterd on node1 to bring the brick1 online
        self.assertTrue(restart_glusterd([brick1.split(":")[0]]), "Unable to "
                        "restart glusterd")
        # checking for peer status post glusterd restart
        self._check_peers_status()

        # Check if the brick b1 on node1 is online or not
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, "Unable to fetch online bricks")
        self.assertIn(brick1, online_bricks, "Brick:{} is still offline after "
                                             "glusterd restart".format(brick1))

        # Create 10 files under dir1 naming x{1..10}
        cmd = ("cd {mpt}; for i in `seq 1 10`; do touch x$i"
               "; done".format(mpt=path))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Unable to create files on mountpoint")

        # Reset the xattrs from brick3 on to brick2
        second_xattr_to_reset = "trusted.afr.{}-client-1".format(self.volname)
        ret = set_fattr(host, brick_path, second_xattr_to_reset, xattr_value)
        self.assertTrue(ret, "Unable to set xattr for the directory")

        # Bring brick2 online
        self.assertTrue(restart_glusterd([brick2.split(":")[0]]), "Unable to "
                        "restart glusterd")
        self._check_peers_status()

        self.assertTrue(are_bricks_online(self.mnode, self.volname, [brick2]),
                        "Brick {} is not online".format(brick2))

        # Check are there any files in split-brain and heal completion
        self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                         "Some files are in split brain for "
                         "volume: {}".format(self.volname))
        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                        "Conservative merge of files failed")

        # Check arequal checksum of all the bricks is same
        ret, arequal_from_the_bricks = collect_bricks_arequal(all_bricks)
        self.assertTrue(ret, "Arequal is collected successfully across the"
                        " bricks in the subvol {}".format(all_bricks))
        self.assertEqual(len(set(arequal_from_the_bricks)), 1, "Arequal is "
                         "same on all the bricks in the subvol")
    def test_manual_heal_full_should_trigger_heal(self):
        """
        - Create a single brick volume
        - Add some files and directories
        - Get arequal from mountpoint
        - Add bricks such that the volume becomes a 1x3 replicate volume
        - Start heal full
        - Make sure heal is completed
        - Get arequals from all bricks and compare with arequal from mountpoint
        """
        # pylint: disable=too-many-statements,too-many-locals
        # Start IO on mounts
        self.all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-length 1 --dir-depth 1 --max-num-of-dirs 1 "
                   "--num-of-files 10 %s" % (self.script_upload_path,
                                             mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            g.log.info("IO on %s:%s is started successfully",
                       mount_obj.client_system, mount_obj.mountpoint)
        self.io_validation_complete = False

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Get arequal for mount before adding bricks
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        g.log.info('Getting arequal before adding bricks is successful')
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Form brick list to add
        bricks_to_add = form_bricks_list(self.mnode, self.volname, 2,
                                         self.servers, self.all_servers_info)
        g.log.info('Brick list to add: %s', bricks_to_add)

        # Add bricks
        ret, _, _ = add_brick(self.mnode, self.volname, bricks_to_add,
                              replica_count=3)
        self.assertFalse(ret, "Failed to add bricks %s" % bricks_to_add)
        g.log.info("Adding bricks is successful on volume %s", self.volname)

        # Make sure the newly added bricks are available in the volume
        # get the bricks for the volume
        bricks_list = get_all_bricks(self.mnode, self.volname)
        for brick in bricks_to_add:
            self.assertIn(brick, bricks_list,
                          'Brick %s is not in brick list' % brick)
        g.log.info('New bricks are present in the volume')

        # Make sure volume change from distribute to replicate volume
        vol_info_dict = get_volume_type_info(self.mnode, self.volname)
        vol_type = vol_info_dict['volume_type_info']['typeStr']
        self.assertEqual('Replicate', vol_type,
                         'Volume type is not converted to Replicate '
                         'after adding bricks')
        g.log.info('Volume type is successfully converted to Replicate '
                   'after adding bricks')

        # Start healing
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        ret, arequals = collect_bricks_arequal(bricks_list)
        self.assertTrue(ret, 'Failed to get arequal on bricks')
        for arequal in arequals:
            brick_total = arequal.splitlines()[-1].split(':')[-1]
            self.assertEqual(mount_point_total, brick_total,
                             'Arequals for mountpoint and brick are not equal')
        g.log.info('All arequals are equal for replicated')
    def test_self_heal_with_diff_algorithm(self):
        """
        Test Steps:
        1. Create a replicated/distributed-replicate volume and mount it
        2. Set data/metadata/entry-self-heal to off and
           data-self-heal-algorithm to diff
        3. Create few files inside a directory with some data
        4. Check arequal of the subvol; all the bricks in the subvol should
           have the same checksum
        5. Bring down a brick from the subvol and validate it is offline
        6. Modify the data of existing files under the directory
        7. Bring back the brick online and wait for heal to complete
        8. Check arequal of the subvol; all the bricks in the same subvol
           should have the same checksum
        """

        # Setting options
        for key, value in (("data-self-heal",
                            "off"), ("metadata-self-heal",
                                     "off"), ("entry-self-heal", "off"),
                           ("data-self-heal-algorithm", "diff")):
            ret = set_volume_options(self.mnode, self.volname, {key: value})
            self.assertTrue(ret, 'Failed to set %s to %s.' % (key, value))
            g.log.info("%s set to %s successfully", key, value)

        # Create few files under a directory with data
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system

        cmd = ("mkdir %s/test_diff_self_heal ; cd %s/test_diff_self_heal ;"
               "for i in `seq 1 100` ; do dd if=/dev/urandom of=file.$i "
               " bs=1M count=1; done;" % (mountpoint, mountpoint))
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        # Check arequal checksum of all the bricks is same
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        for subvol in subvols:
            ret, arequal_from_the_bricks = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal across "
                "the bricks in the subvol {}".format(subvol))
            unique_checksums = len(set(arequal_from_the_bricks))
            if (self.volume_type == "arbiter"
                    or self.volume_type == "distributed-arbiter"):
                unique_checksums = len(set(arequal_from_the_bricks[:2]))
            self.assertEqual(
                unique_checksums, 1,
                "Arequal is not same on all the bricks in the subvol")

        # List a brick in each subvol and bring them offline
        brick_to_bring_offline = []
        for subvol in subvols:
            self.assertTrue(subvol, "List is empty")
            brick_to_bring_offline.extend(sample(subvol, 1))

        ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
        self.assertTrue(
            ret,
            "Unable to bring brick: {} offline".format(brick_to_bring_offline))

        # Validate the brick is offline
        ret = are_bricks_offline(self.mnode, self.volname,
                                 brick_to_bring_offline)
        self.assertTrue(
            ret, "Brick:{} is still online".format(brick_to_bring_offline))

        # Modify files under test_diff_self_heal directory
        cmd = ("for i in `seq 1 100` ; do truncate -s 0 file.$i ; "
               "truncate -s 2M file.$i ; done;")
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Failed to modify the files")
        g.log.info("Successfully modified files")

        # Start volume with force to bring all bricks online
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Volume start with force failed")
        g.log.info("Volume: %s started successfully", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))

        # Monitor heal completion
        self.assertTrue(
            monitor_heal_completion(self.mnode,
                                    self.volname,
                                    interval_check=10),
            "Heal failed after 20 mins")

        # Check are there any files in split-brain
        self.assertFalse(
            is_volume_in_split_brain(self.mnode, self.volname),
            "Some files are in split brain for "
            "volume: {}".format(self.volname))

        # Check arequal checksum of all the bricks is same
        for subvol in subvols:
            ret, arequal_from_the_bricks = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal across "
                "the bricks in the subvol {}".format(subvol))
            unique_checksums = len(set(arequal_from_the_bricks))
            if (self.volume_type == "arbiter"
                    or self.volume_type == "distributed-arbiter"):
                unique_checksums = len(set(arequal_from_the_bricks[:2]))
            self.assertEqual(
                unique_checksums, 1,
                "Arequal is not same on all the bricks in the subvol")
    def test_replace_brick_self_heal_io_in_progress(self):
        """
        - Create directory on mount point and write files/dirs
        - Create another set of files (1K files)
        - While creation of files/dirs is in progress, kill one brick
        - Remove the contents of the killed brick (simulating disk
          replacement)
        - While the IOs are still in progress, restart glusterd on the nodes
          where we simulated disk replacement to bring back bricks online
        - Start volume heal
        - Wait for IO's to complete
        - Verify whether the files are self-healed
        - Calculate arequals of the mount point and all the bricks
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-branches
        # Create dirs with files
        g.log.info('Creating dirs with file...')
        command = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "-d 2 -l 2 -n 2 -f 10 %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))
        ret, _, err = g.run(self.mounts[0].client_system,
                            command,
                            user=self.mounts[0].user)
        self.assertFalse(ret, err)
        g.log.info("IO is successful")

        # Creating another set of files (1K files)
        self.all_mounts_procs = []

        # Create dirs with files
        g.log.info('Creating 1K files...')
        command = ("/usr/bin/env python %s create_files "
                   "-f 1500 --fixed-file-size 10k %s" %
                   (self.script_upload_path, self.mounts[0].mountpoint))
        proc = g.run_async(self.mounts[0].client_system,
                           command,
                           user=self.mounts[0].user)
        self.all_mounts_procs.append(proc)
        self.io_validation_complete = False

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts[0])
        self.assertTrue(ret, "IO failed on some of the clients")
        self.io_validation_complete = True
        g.log.info("IO is successful on all mounts")

        # Select bricks to bring offline
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.volname))
        bricks_to_bring_offline = bricks_to_bring_offline_dict['volume_bricks']

        # Bring brick offline
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s offline' % bricks_to_bring_offline)

        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(ret,
                        'Bricks %s are not offline' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s offline is successful',
                   bricks_to_bring_offline)

        # Remove the content of the killed bricks
        for brick in bricks_to_bring_offline:
            brick_node, brick_path = brick.split(':')

            # Removing files
            command = ('cd %s ; rm -rf *' % brick_path)
            ret, _, err = g.run(brick_node, command)
            self.assertFalse(ret, err)
            g.log.info('Files are deleted on brick %s', brick)

        # Bring brick online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline)
        self.assertTrue(
            ret, 'Failed to bring bricks %s online' % bricks_to_bring_offline)
        g.log.info('Bringing bricks %s online is successful',
                   bricks_to_bring_offline)

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.volname))
        g.log.info("Volume %s : All process are online", self.volname)

        # Wait for self-heal-daemons to be online
        ret = is_shd_daemonized(self.all_servers)
        self.assertTrue(ret, "Either No self heal daemon process found")
        g.log.info("All self-heal daemons are online")

        # Start healing
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not started')
        g.log.info('Healing is started')

        # Monitor heal completion
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal has not yet completed')

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Check arequals for "replicated"
        all_bricks = get_all_bricks(self.mnode, self.volname)
        if self.volume_type == "replicated":

            # Get arequal after bricks are online
            ret, arequals = collect_mounts_arequal(self.mounts)
            self.assertTrue(ret, 'Failed to get arequal')
            g.log.info('Getting arequal after successfully bringing '
                       'bricks online.')
            mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

            # Get arequal on bricks and compare with mount_point_total
            ret, arequals = collect_bricks_arequal(all_bricks)
            self.assertTrue(ret, 'Failed to get arequal on bricks')
            for arequal in arequals:
                brick_total = arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(
                    mount_point_total, brick_total,
                    'Arequals for mountpoint and brick '
                    'are not equal')
                g.log.info('Arequals for mountpoint and brick are equal')

        # Check arequals for "distributed-replicated"
        if self.volume_type == "distributed-replicated":

            # Get the subvolumes
            subvols_dict = get_subvols(self.mnode, self.volname)
            num_subvols = len(subvols_dict['volume_subvols'])
            g.log.info("Number of subvolumes in volume %s:", num_subvols)

            # Get arequals and compare
            for i in range(0, num_subvols):

                # Get arequal for first brick
                subvol_brick_list = subvols_dict['volume_subvols'][i]
                ret, arequal = collect_bricks_arequal([subvol_brick_list[0]])
                self.assertTrue(ret, 'Failed to get arequal on first brick')
                first_brick_total = arequal[0].splitlines()[-1].split(':')[-1]

                # Get arequal for every brick and compare with first brick
                ret, arequals = collect_bricks_arequal(subvol_brick_list)
                self.assertTrue(ret, 'Failed to get arequal on bricks')
                for arequal in arequals:
                    brick_total = arequal.splitlines()[-1].split(':')[-1]
                    self.assertEqual(
                        first_brick_total, brick_total,
                        'Arequals for subvol and brick are '
                        'not equal')
                    g.log.info('Arequals for subvol and brick are equal')
    def test_metadata_self_heal_on_open_fd(self):
        """
        Description: Pro-active metadata self heal on open fd

        Steps :
        1) Create a volume.
        2) Mount the volume using FUSE.
        3) Create test executable on volume mount.
        4) While test execution is in progress, bring down brick1.
        5) From mount point, change ownership, permission, group id of
           the test file.
        6) While test execution is in progress, bring back brick1 online.
        7) Do stat on the test file to check ownership, permission,
           group id on mount point and on bricks
        8) Stop test execution.
        9) Do stat on the test file to check ownership, permission,
           group id on mount point and on bricks.
        10) There should be no pending heals in the heal info command.
        11) There should be no split-brain.
        12) Calculate arequal of the bricks and mount point and it
            should be same.
        """
        # pylint: disable=too-many-statements,too-many-locals
        # pylint: disable=too-many-branches
        bricks_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Brick list is None')
        client = self.clients[0]

        # Create test executable file on mount point
        m_point = self.mounts[0].mountpoint
        test_file = "testfile.sh"
        cmd = ("echo 'while true; do echo 'Press CTRL+C to stop execution';"
               " done' >> {}/{}".format(m_point, test_file))
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "Failed to create test file")

        # Execute the test file
        cmd = "cd {}; sh {}".format(m_point, test_file)
        g.run_async(client, cmd)

        # Get pid of the test file
        _cmd = "ps -aux | grep -v grep | grep testfile.sh | awk '{print $2}'"
        ret, out, _ = g.run(client, _cmd)
        self.assertEqual(ret, 0, "Failed to get pid of test file execution")

        # Bring brick1 offline
        ret = bring_bricks_offline(self.volname, [bricks_list[1]])
        self.assertTrue(
            ret, 'Failed to bring bricks {} '
            'offline'.format(bricks_list[1]))

        ret = are_bricks_offline(self.mnode, self.volname, [bricks_list[1]])
        self.assertTrue(ret, 'Bricks {} are not '
                        'offline'.format(bricks_list[1]))

        # change uid, gid and permission from client
        cmd = "chown {} {}/{}".format(self.user, m_point, test_file)
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "chown failed")

        cmd = "chgrp {} {}/{}".format(self.user, m_point, test_file)
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "chgrp failed")

        cmd = "chmod 777 {}/{}".format(m_point, test_file)
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, "chown failed")

        # Bring brick1 online
        ret = bring_bricks_online(self.mnode, self.volname, [bricks_list[1]])
        self.assertTrue(
            ret, 'Failed to bring bricks {} online'.format(bricks_list[1]))

        ret = get_pathinfo(client, "{}/{}".format(m_point, test_file))
        self.assertIsNotNone(
            ret, "Unable to get "
            "trusted.glusterfs.pathinfo  of file")
        nodes_to_check = {}
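        # Map every brick host from the pathinfo output to its backend parent
        # directory (hostnames resolved to IPs), so stat results can later be
        # verified on the bricks as well as on the client mount.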
        bricks_list = []
        for brick in ret['brickdir_paths']:
            node, brick_path = brick.split(':')
            if node[0:2].isdigit():
                nodes_to_check[node] = os.path.dirname(brick_path)
                path = node + ":" + os.path.dirname(brick_path)
            else:
                nodes_to_check[gethostbyname(node)] = (
                    os.path.dirname(brick_path))
                path = gethostbyname(node) + ":" + os.path.dirname(brick_path)
            bricks_list.append(path)
        nodes_to_check[client] = m_point

        # Verify that the changes are successful on bricks and client
        self._verify_stat_info(nodes_to_check, test_file)

        # Kill the test executable file
        for pid in out.split('\n')[:-1]:
            cmd = "kill -s 9 {}".format(pid)
            ret, _, _ = g.run(client, cmd)
            self.assertEqual(ret, 0, "Failed to kill test file execution")

        # Verify that the changes are successful on bricks and client
        self._verify_stat_info(nodes_to_check, test_file)

        # Verify there are no pending heals
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        self.assertIsNotNone(heal_info, 'Unable to get heal info')
        for brick in bricks_list:
            self.assertEqual(int(heal_info[brick]['numberOfEntries']), 0,
                             ("Pending heal on brick {} ".format(brick)))

        # Check for split-brain
        ret = is_volume_in_split_brain(self.mnode, self.volname)
        self.assertFalse(ret, 'Volume is in split-brain state')
        g.log.info('Volume is not in split-brain state')

        # Get arequal for mount
        ret, arequals = collect_mounts_arequal(self.mounts)
        self.assertTrue(ret, 'Failed to get arequal')
        mount_point_total = arequals[0].splitlines()[-1].split(':')[-1]

        # Collecting data bricks
        vol_info = get_volume_info(self.mnode, self.volname)
        self.assertIsNotNone(vol_info, 'Unable to get volume info')
        data_brick_list = []
        for brick in bricks_list:
            for brick_info in vol_info[self.volname]["bricks"]["brick"]:
                if brick_info["name"] == brick:
                    if brick_info["isArbiter"] == "0":
                        data_brick_list.append(brick)
        bricks_list = data_brick_list

        # Get arequal on bricks and compare with mount_point_total
        # It should be the same
        arbiter = self.volume_type.find('arbiter') >= 0
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
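        # Skip the arbiter brick (last in each subvol) when comparing data
        # checksums, since it stores metadata only and its arequal would
        # never match the mount point.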
        for subvol in subvols:
            subvol = [i for i in subvol if i in bricks_list]
            if subvol:
                ret, arequal = collect_bricks_arequal(subvol[0:stop])
                self.assertTrue(
                    ret, 'Unable to get arequal checksum '
                    'on {}'.format(subvol[0:stop]))
                self.assertEqual(
                    len(set(arequal)), 1, 'Mismatch of arequal '
                    'checksum among {} is '
                    'identified'.format(subvol[0:stop]))
                brick_total = arequal[-1].splitlines()[-1].split(':')[-1]
                self.assertEqual(
                    brick_total, mount_point_total,
                    "Arequals for mountpoint and {} "
                    "are not equal".format(subvol[0:stop]))
Example #12
    def check_full_data_heal(self, brick_set):
        # Bring specified set of bricks in each subvolume offline, pump I/O,
        # bring the bricks online, wait for heal and check arequal.
        mountpoint = self.mounts[0].mountpoint
        client = self.mounts[0].client_system
        bricks_to_bring_offline = []
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        bricks_list = list(zip(*subvols))
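        # zip(*subvols) transposes the subvol lists, so bricks_list[k] holds
        # the k-th brick of every subvolume; `brick_set` then selects one
        # such brick per replica set to take offline together.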

        bricks_to_bring_offline = list(bricks_list[brick_set])

        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(
            ret, "Unable to bring brick: {} offline".format(
                bricks_to_bring_offline))

        # Validate the bricks are offline
        ret = are_bricks_offline(self.mnode, self.volname,
                                 bricks_to_bring_offline)
        self.assertTrue(
            ret, "Brick:{} is still online".format(bricks_to_bring_offline))

        if brick_set == 0:
            # Create few files under the directory with data
            cmd = ("cd %s/test_full_self_heal ; for i in `seq 1 100` ; "
                   "do dd if=/dev/urandom of=file.$i bs=1M count=1; done;" %
                   (mountpoint))
        else:
            # Modify files under test_full_self_heal directory
            cmd = (
                "cd %s/test_full_self_heal ; for i in `seq 1 100` ; "
                "do truncate -s 0 file.$i ; truncate -s 2M file.$i ; done;" %
                (mountpoint))

        ret, _, _ = g.run(client, cmd)
        self.assertFalse(ret, "Failed to create file on mountpoint")
        g.log.info("Successfully created files on mountpoint")

        # Start volume with force to bring all bricks online
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Volume start with force failed")
        g.log.info("Volume: %s started successfully", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))

        # Monitor heal completion
        self.assertTrue(
            monitor_heal_completion(self.mnode,
                                    self.volname,
                                    interval_check=10), "Heal "
            "failed after 20 mins")

        # Check are there any files in split-brain
        self.assertFalse(
            is_volume_in_split_brain(self.mnode, self.volname),
            "Some files are in split brain for volume : {}".format(
                self.volname))

        # Check arequal checksum of all the bricks are same
        for subvol in subvols:
            ret, arequal_from_the_bricks = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal on the bricks in "
                "subvol {}".format(subvol))
            unique_checksums = len(set(arequal_from_the_bricks))
            if (self.volume_type == "arbiter"
                    or self.volume_type == "distributed-arbiter"):
                unique_checksums = len(set(arequal_from_the_bricks[:2]))
            self.assertEqual(
                unique_checksums, 1,
                "Arequal is not same on all the bricks in the subvol")
Example #13
    def test_afr_self_heal_add_brick_rebalance(self):
        """
        Test Steps:
        1. Create a replicated/distributed-replicate volume and mount it
        2. Start IO from the clients
        3. Bring down a brick from the subvol and validate it is offline
        4. Bring back the brick online and wait for heal to complete
        5. Once the heal is completed, expand the volume.
        6. Trigger rebalance and wait for rebalance to complete
        7. Validate IO, no errors during the steps performed from step 2
        8. Check arequal of the subvol; all the bricks in the same subvol
           should have the same checksum
        """
        # Start IO from the clients
        self.all_mounts_procs = []
        for count, mount_obj in enumerate(self.mounts):
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dirname-start-num %d --dir-depth 3 --dir-length 5 "
                   "--max-num-of-dirs 5 --num-of-files 30 %s" %
                   (self.script_upload_path, count, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)

        # List a brick in each subvol and bring them offline
        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
        brick_to_bring_offline = []
        for subvol in subvols:
            self.assertTrue(subvol, "List is empty")
            brick_to_bring_offline.extend(sample(subvol, 1))

        ret = bring_bricks_offline(self.volname, brick_to_bring_offline)
        self.assertTrue(
            ret,
            "Unable to bring brick: {} offline".format(brick_to_bring_offline))

        # Validate the brick is offline
        ret = are_bricks_offline(self.mnode, self.volname,
                                 brick_to_bring_offline)
        self.assertTrue(
            ret, "Brick:{} is still online".format(brick_to_bring_offline))

        # Wait for 10 seconds for IO to be generated
        sleep(10)

        # Start volume with force to bring all bricks online
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Volume start with force failed")
        g.log.info("Volume: %s started successfully", self.volname)

        # Verify volume's all process are online
        ret = verify_all_process_of_volume_are_online(self.mnode, self.volname)
        self.assertTrue(
            ret, ("Volume %s : All process are not online", self.volname))

        # Monitor heal completion
        self.assertTrue(
            monitor_heal_completion(self.mnode,
                                    self.volname,
                                    interval_check=10),
            "Heal failed after 20 mins")

        # Check are there any files in split-brain and heal completion
        self.assertFalse(
            is_volume_in_split_brain(self.mnode, self.volname),
            "Some files are in split brain for "
            "volume: {}".format(self.volname))

        # Expanding volume by adding bricks to the volume when IO in progress
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, ("Failed to expand the volume when IO in "
                              "progress on volume %s", self.volname))

        # Wait for volume processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online", self.volname))

        # Start Rebalance
        ret, _, _ = rebalance_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, ("Failed to start rebalance on the volume "
                                  "%s", self.volname))
        g.log.info("Successfully started rebalance on the "
                   "volume %s", self.volname)

        # Without sleep the next step will fail with Glusterd Syncop locking.
        sleep(2)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode,
                                             self.volname,
                                             timeout=1800)
        self.assertTrue(ret, ("Rebalance is not yet complete on the volume "
                              "%s", self.volname))
        g.log.info("Rebalance is successfully complete on "
                   "the volume %s", self.volname)

        # Validate IO
        ret = validate_io_procs(self.all_mounts_procs, self.mounts)
        self.io_validation_complete = True
        self.assertTrue(ret, "IO failed on some of the clients")
        self.all_mounts_procs = []

        # List all files and dirs created
        ret = list_all_files_and_dirs_mounts(self.mounts)
        self.assertTrue(ret, "Failed to list all files and dirs")

        # Check arequal checksum of all the bricks is same
        for subvol in subvols:
            ret, arequal_from_the_bricks = collect_bricks_arequal(subvol)
            self.assertTrue(
                ret, "Failed to collect arequal across "
                "the bricks in the subvol {}".format(subvol))
            unique_checksums = len(set(arequal_from_the_bricks))
            if (self.volume_type == "arbiter"
                    or self.volume_type == "distributed-arbiter"):
                unique_checksums = len(set(arequal_from_the_bricks[:2]))
            self.assertEqual(
                unique_checksums, 1,
                "Arequal is not same on all the bricks in the subvol")