def test_healed_and_heal_failed_command(self):
        """
        Description: Validate absence of `healed` and `heal-failed` options

        Steps:
        - Create and mount a replicated volume
        - Kill one of the bricks and write IO from mount point
        - Verify `gluster volume heal <volname> info healed` and `gluster
          volume heal <volname> info heal-failed` command results in error
        - Validate `gluster volume help` doesn't list `healed` and
          `heal-failed` commands
        """

        client, m_point = (self.mounts[0].client_system,
                           self.mounts[0].mountpoint)

        # Kill one of the bricks in the volume
        brick_list = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, 'Unable to get online bricks list')
        ret = bring_bricks_offline(self.volname, choice(brick_list))
        self.assertTrue(ret, 'Unable to kill one of the bricks in the volume')

        # Fill IO in the mount point
        cmd = ('/usr/bin/env python {} '
               'create_deep_dirs_with_files --dir-depth 10 '
               '--fixed-file-size 1M --num-of-files 50 '
               '--dirname-start-num 1 {}'.format(self.script_path, m_point))
        ret, _, _ = g.run(client, cmd)
        self.assertEqual(ret, 0, 'Not able to fill directory with IO')

        # Verify `gluster volume heal <volname> info healed` results in error
        cmd = 'gluster volume heal <volname> info'
        ret, _, err = heal_info_healed(self.mnode, self.volname)
        self.assertNotEqual(ret, 0, '`%s healed` should result in error' % cmd)
        self.assertIn('Usage', err, '`%s healed` should list `Usage`' % cmd)

        # Verify `gluster volume heal <volname> info heal-failed` errors out
        ret, _, err = heal_info_heal_failed(self.mnode, self.volname)
        self.assertNotEqual(ret, 0,
                            '`%s heal-failed` should result in error' % cmd)
        self.assertIn('Usage', err,
                      '`%s heal-failed` should list `Usage`' % cmd)

        # Verify absence of `healed` and `heal-failed` commands in `volume help`
        cmd = 'gluster volume help | grep -i heal'
        ret, rout, _ = g.run(self.mnode, cmd)
        self.assertEqual(
            ret, 0, 'Unable to query help content from `gluster volume help`')
        self.assertNotIn(
            'healed', rout, '`healed` string should not exist '
            'in `gluster volume help` command')
        self.assertNotIn(
            'heal-failed', rout, '`heal-failed` string should '
            'not exist in `gluster volume help` command')
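    # Hedged sketch (not part of the original suite): the pattern above can be
    # factored into a tiny helper that asserts an unsupported/removed gluster
    # CLI subcommand is rejected with a usage message. The helper name is
    # hypothetical; it relies only on `g.run`, which is used throughout.
    def _assert_gluster_cli_rejected(self, node, cmd):
        """Assert `cmd` fails on `node` and its stderr mentions 'Usage'."""
        ret, _, err = g.run(node, cmd)
        self.assertNotEqual(ret, 0, '`%s` should result in error' % cmd)
        self.assertIn('Usage', err, '`%s` should print the CLI usage' % cmd)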
    def test_profile_operations_with_one_node_down(self):

        # pylint: disable=too-many-statements
        """
        Test Case:
        1) Create a volume and start it.
        2) Mount volume on client and start IO.
        3) Start profile info on the volume.
        4) Stop glusterd on one node.
        5) Run profile info with different parameters
           and see if all bricks are present or not.
        6) Stop profile on the volume.
        """

        # Start IO on mount points.
        g.log.info("Starting IO on all mounts...")
        self.all_mounts_procs = []
        counter = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-depth 4 "
                   "--dirname-start-num %d "
                   "--dir-length 6 "
                   "--max-num-of-dirs 3 "
                   "--num-of-files 5 %s" % (
                       self.script_upload_path,
                       counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system, cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            counter += 1

        # Start profile on volume.
        ret, _, _ = profile_start(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to start profile on volume: %s"
                         % self.volname)
        g.log.info("Successfully started profile on volume: %s",
                   self.volname)

        # Pick a random server index from the list (excluding mnode at index 0)
        self.random_server = randint(1, len(self.servers) - 1)

        # Stopping glusterd on one node.
        ret = stop_glusterd(self.servers[self.random_server])
        self.assertTrue(ret, "Failed to stop glusterd on one node.")
        g.log.info("Successfully stopped glusterd on one node.")
        ret = wait_for_glusterd_to_start(self.servers[self.random_server])
        self.assertFalse(ret, "glusterd is still running on %s"
                         % self.servers[self.random_server])
        g.log.info("Glusterd stop on the nodes : %s "
                   "succeeded", self.servers[self.random_server])

        # Getting and checking output of profile info.
        ret, out, _ = profile_info(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to run profile info on volume: %s"
                         % self.volname)
        g.log.info("Successfully executed profile info on volume: %s",
                   self.volname)

        # Checking if all bricks are present in profile info.
        brick_list = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get online bricks list")
        for brick in brick_list:
            self.assertTrue(brick in out,
                            "Brick %s not a part of profile info output."
                            % brick)
            g.log.info("Brick %s showing in profile info output.",
                       brick)

        # Running profile info with different profile options.
        profile_options = ['peek', 'incremental', 'clear', 'incremental peek',
                           'cumulative']
        for option in profile_options:

            # Getting and checking output of profile info.
            ret, out, _ = profile_info(self.mnode, self.volname,
                                       options=option)
            self.assertEqual(ret, 0,
                             "Failed to run profile info %s on volume: %s"
                             % (option, self.volname))
            g.log.info("Successfully executed profile info %s on volume: %s",
                       option, self.volname)

            # Checking if all bricks are present in profile info peek.
            for brick in brick_list:
                self.assertTrue(brick in out,
                                "Brick %s not a part of profile"
                                " info %s output."
                                % (brick, option))
                g.log.info("Brick %s showing in profile info %s output.",
                           brick, option)

        # Starting glusterd on node where stopped.
        ret = start_glusterd(self.servers[self.random_server])
        self.assertTrue(ret, "Failed to start glusterd.")
        g.log.info("Successfully started glusterd.")

        # Checking if peer is connected
        ret = wait_for_peers_to_connect(self.mnode, self.servers)
        self.assertTrue(ret, "Peers are not in connected state.")
        g.log.info("Peers are in connected state.")

        # Stop profile on volume.
        ret, _, _ = profile_stop(self.mnode, self.volname)
        self.assertEqual(ret, 0, "Failed to stop profile on volume: %s"
                         % self.volname)
        g.log.info("Successfully stopped profile on volume: %s", self.volname)

        # Validate IO
        self.assertTrue(
            validate_io_procs(self.all_mounts_procs, self.mounts),
            "IO failed on some of the clients"
        )
        g.log.info("IO validation complete.")
    def test_heal_on_file_appends(self):
        """
        Test steps:
        - create and mount EC volume 4+2
        - start append to a file from client
        - bring down one of the bricks (say b1)
        - wait for ~minute and bring down another brick (say b2)
        - after ~minute bring up first brick (b1)
        - check the xattrs 'ec.size', 'ec.version'
        - xattrs of online bricks should be same as an indication to heal
        """

        # Get bricks list
        bricks_list = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks_list, 'Not able to get bricks list')

        # Creating a file, generate and append data to the file
        self.file_name = 'test_file'
        cmd = ("cd %s ;"
               "while true; do "
               "cat /dev/urandom | tr -dc  [:space:][:print:] "
               "| head -c 4K >> %s; sleep 2; "
               "done;" % (self.mount_obj.mountpoint, self.file_name))
        ret = g.run_async(self.client, cmd, user=self.mount_obj.user)
        self.assertIsNotNone(ret, "Not able to start IO on client")
        g.log.info('Started generating and appending data to the file')
        self.is_io_started = True

        # Select 3 bricks, 2 need to be offline and 1 will be healthy
        brick_1, brick_2, brick_3 = sample(bricks_list, 3)

        # Wait for IO to fill the bricks
        sleep(30)

        # Bring first brick offline and validate
        ret = bring_bricks_offline(self.volname, [brick_1])
        self.assertTrue(ret,
                        'Failed to bring brick {} offline'.format(brick_1))
        ret = are_bricks_offline(self.mnode, self.volname, [brick_1])
        self.assertTrue(
            ret, 'Not able to validate brick {} being '
            'offline'.format(brick_1))
        g.log.info("Brick %s is brought offline successfully", brick_1)
        self.offline_bricks.append(brick_1)

        # Wait for IO to fill the bricks
        sleep(30)

        # Bring second brick offline and validate
        ret = bring_bricks_offline(self.volname, [brick_2])
        self.assertTrue(ret,
                        'Failed to bring brick {} offline'.format(brick_2))
        ret = are_bricks_offline(self.mnode, self.volname, [brick_2])
        self.assertTrue(
            ret, 'Not able to validate brick {} being '
            'offline'.format(brick_2))
        g.log.info("Brick %s is brought offline successfully", brick_2)
        self.offline_bricks.append(brick_2)

        # Wait for IO to fill the bricks
        sleep(30)

        # Bring first brick online and validate peer status
        ret = bring_bricks_online(
            self.mnode,
            self.volname, [brick_1],
            bring_bricks_online_methods=['glusterd_restart'])
        self.assertTrue(ret, 'Not able to bring brick {} '
                        'online'.format(brick_1))
        g.log.info("Offlined brick %s is brought online successfully", brick_1)
        ret = self.validate_peers_are_connected()
        self.assertTrue(
            ret, "Peers are not in connected state after bringing "
            "an offline brick to online via `glusterd restart`")
        g.log.info("Successfully validated peers are in connected state")

        # To catchup onlined brick with healthy bricks
        sleep(30)

        # Validate the xattrs are same on the onlined and healthy bricks
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, 'Unable to fetch online bricks')
        g.log.info('All online bricks are fetched successfully')
        for xattr in ('trusted.ec.size', 'trusted.ec.version'):
            ret = validate_xattr_on_all_bricks([brick_1, brick_3],
                                               self.file_name, xattr)
            self.assertTrue(
                ret, "{} is not same on all online "
                "bricks".format(xattr))

        # Get epoch time on the client
        ret, prev_ctime, _ = g.run(self.client, 'date +%s')
        self.assertEqual(ret, 0, 'Not able to get epoch time from client')

        # Headroom for file ctime to get updated
        sleep(5)

        # Validate file was being appended while checking for xattrs
        ret = get_file_stat(
            self.client, '{}/{}'.format(self.mount_obj.mountpoint,
                                        self.file_name))
        self.assertIsNotNone(ret, "Not able to get stats of the file")
        curr_ctime = ret['epoch_ctime']
        self.assertGreater(
            int(curr_ctime), int(prev_ctime), "Not able "
            "to validate data is appended to the file "
            "while checking for xattrs")

        g.log.info("Data on all online bricks is healed and consistent")
    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        '''Refactor of steps common to all tests: Brick down and perform
        metadata/data operations'''
        # First brick in the subvol will always be online and used for self
        # heal, so make keys match brick index
        self.op_cmd = {
            # The operation with key `4` in every op_type will be used for
            # final data consistency check
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
            # Create files and directories when brick is down with no
            # initial IO
            'gfid': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K > file.2.$i; mkdir dir.2.$i;
                    for j in `seq 1 3`;
                    do {1} 2K > dir.2.$i/file.2.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K > file.3.$i; mkdir dir.3.$i;
                    for j in `seq 1 3`;
                    do {1} 2K > dir.3.$i/file.3.$j; done;
                    done;''',
                4:
                '''cd {0}; for i in `seq 4 6`;
                    do {1} 2K > file.$i; mkdir dir.$i;
                    for j in `seq 4 6`;
                    do {1} 2K > dir.$i/file.$j; done;
                    done;''',
            },
            # Create different file type with same name while a brick was down
            # with no initial IO and validate failure
            'file_type': {
                2:
                'cd {0}; for i in `seq 1 6`; do {1} 2K > notype.$i; done;',
                3:
                'cd {0}; for i in `seq 1 6`; do mkdir -p notype.$i; done;',
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 2K > file.$i;
                    for j in `seq 1 6`;
                    do mkdir -p dir.$i; {1} 2K > dir.$i/file.$j; done;
                    done;''',
            },
            # Create symlinks for files and directories while a brick was down
            # Out of 6 files, 6 dirs and 6 files in each dir, symlink
            # outer 2 files, inner 2 files in each dir, 2 dirs and
            # verify it's a symlink(-L) and linking file exists(-e)
            'symlink': {
                2:
                '''cd {0}; for i in `seq 1 2`;
                    do ln -sr file.$i sl_file.2.$i;
                    [ -L sl_file.2.$i ] && [ -e sl_file.2.$i ] || exit -1;
                    for j in `seq 1 2`;
                    do ln -sr dir.$i/file.$j dir.$i/sl_file.2.$j; done;
                    [ -L dir.$i/sl_file.2.$j ] && [ -e dir.$i/sl_file.2.$j ] \
                    || exit -1;
                    done; for k in `seq 3 4`; do ln -sr dir.$k sl_dir.2.$k;
                    [ -L sl_dir.2.$k ] && [ -e sl_dir.2.$k ] || exit -1;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 2`;
                    do ln -sr file.$i sl_file.3.$i;
                    [ -L sl_file.3.$i ] && [ -e sl_file.3.$i ] || exit -1;
                    for j in `seq 1 2`;
                    do ln -sr dir.$i/file.$j dir.$i/sl_file.3.$j; done;
                    [ -L dir.$i/sl_file.3.$j ] && [ -e dir.$i/sl_file.3.$j ] \
                    || exit -1;
                    done; for k in `seq 3 4`; do ln -sr dir.$k sl_dir.3.$k;
                    [ -L sl_dir.3.$k ] && [ -e sl_dir.3.$k ] || exit -1;
                    done;''',
                4:
                '''cd {0}; ln -sr dir.4 sl_dir_new.4; mkdir sl_dir_new.4/dir.1;
                    {1} 4K >> sl_dir_new.4/dir.1/test_file;
                    {1} 4K >> sl_dir_new.4/test_file;
                    ''',
            },
        }
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Make first brick always online and start operations from second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
            self.assertTrue(
                are_bricks_offline(self.mnode, self.volname, [brick]),
                'Brick {} is not offline'.format(brick))

            # Perform file/dir operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            if op_type == 'file_type' and index == 3:
                # Should fail with ENOTCONN as one brick is down, lookup can't
                # happen and quorum is not met
                self.assertNotEqual(
                    ret, 0, '{0} should fail as lookup fails, quorum is not '
                    'met'.format(cmd))
                self.assertIn(
                    'Transport', err, '{0} should fail with ENOTCONN '
                    'error'.format(cmd))
            else:
                self.assertEqual(ret, 0,
                                 '{0} failed with {1}'.format(cmd, err))
                self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(brick))
            self.assertTrue(
                are_bricks_online(self.mnode, self.volname, [brick]),
                'Brick {} is not online'.format(brick))

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(is_heal_complete(self.mnode, self.volname))

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')
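    # Hedged usage sketch: a concrete test built on the helper above would
    # perform the brick ops for one `op_type` and then wait for the pending
    # heals to drain. The test name is hypothetical; `monitor_heal_completion`
    # is used elsewhere in this collection with the same signature.
    def test_self_heal_of_data_operations(self):
        self._perform_brick_ops_and_enable_self_heal(op_type='data')
        self.assertTrue(
            monitor_heal_completion(self.mnode, self.volname),
            'Heal did not complete after enabling the self heal daemon')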
    def test_glusterd_replace_brick(self):
        """
        Create a volume and start it.
        - Get list of all the bricks which are online
        - Select a brick randomly from the bricks which are online
        - Form a non-existing brick path on the node where the brick is to be
          replaced
        - Perform replace brick with the non-existing path; it should fail
        - Form a new brick with a valid brick path; replace brick should
          succeed
        """
        # pylint: disable=too-many-function-args
        # Getting all the bricks which are online
        bricks_online = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks_online, "Unable to get the online bricks")
        g.log.info("got the brick list from the volume")

        # Getting one random brick from the online bricks to be replaced
        brick_to_replace = random.choice(bricks_online)
        g.log.info("Brick to replace %s", brick_to_replace)
        node_for_brick_replace = brick_to_replace.split(':')[0]
        new_brick_to_replace = form_bricks_list(self.mnode, self.volname, 1,
                                                node_for_brick_replace,
                                                self.all_servers_info)

        # performing replace brick with non-existing brick path
        path = ":/brick/non_existing_path"
        non_existing_path = node_for_brick_replace + path

        # Replace brick for non-existing path
        ret, _, _ = replace_brick(self.mnode, self.volname, brick_to_replace,
                                  non_existing_path)
        self.assertNotEqual(ret, 0, ("Replace brick with commit force"
                                     " on a non-existing brick passed"))
        g.log.info("Replace brick with non-existing brick with commit"
                   "force failed as expected")

        # calling replace brick by passing brick_to_replace and
        # new_brick_to_replace with valid brick path
        ret = replace_brick_from_volume(self.mnode,
                                        self.volname,
                                        self.servers,
                                        self.all_servers_info,
                                        brick_to_replace,
                                        new_brick_to_replace[0],
                                        delete_brick=True)
        self.assertTrue(ret, ("Replace brick with commit force failed"))

        # Validating whether the brick replaced is online
        halt = 20
        counter = 0
        _rc = False
        g.log.info("Wait for some seconds for the replaced brick "
                   "to get online")
        while counter < halt:
            ret = are_bricks_online(self.mnode, self.volname,
                                    new_brick_to_replace)
            if not ret:
                g.log.info("The replaced brick isn't online, "
                           "Retry after 2 seconds .......")
                time.sleep(2)
                counter = counter + 2
            else:
                _rc = True
                g.log.info("The replaced brick is online after being replaced")
                break
        if not _rc:
            raise ExecutionError("The replaced brick isn't online")
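    # The manual `halt`/`counter` loop above is a recurring pattern; a small,
    # stdlib-only sketch of the same idea is shown below. The helper name is
    # hypothetical, and any boolean check (e.g. a lambda wrapping
    # `are_bricks_online`) can be retried with it.
    @staticmethod
    def _poll_until(check, timeout=20, interval=2):
        """Call `check()` every `interval` seconds until it returns True or
        `timeout` seconds elapse; return the final outcome."""
        elapsed = 0
        while elapsed < timeout:
            if check():
                return True
            time.sleep(interval)
            elapsed += interval
        return False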
    def test_create_snap_bricks(self):
        """
        1. Get brick list
        2. Check all bricks are online
        3. Select one brick randomly and bring it offline
        4. Get brick list
        5. Verify that not all bricks are online
        6. Get offline bricks list
        7. Get online bricks list
        8. Create snapshot of the volume
        9. Snapshot create should fail as a brick is offline
        """

        bricks_list = []
        # get the bricks from the volume
        g.log.info("Fetching bricks for the volume : %s" % self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s" % bricks_list)

        # check all bricks are online
        g.log.info("Verifying all bricks are online or not.....")
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertTrue(ret, ("Not all bricks are online"))
        g.log.info("All bricks are online.")

        # Selecting one brick randomly to bring it offline
        g.log.info("Selecting one brick randomly to bring it offline")
        brick_to_bring_offline = random.choice(bricks_list)
        g.log.info("Brick to bring offline:%s " % brick_to_bring_offline)
        ret = bring_bricks_offline(self.volname, brick_to_bring_offline, None)
        self.assertTrue(ret, "Failed to bring the bricks offline")
        g.log.info("Randomly Selected brick: %s" % brick_to_bring_offline)

        # get brick list
        g.log.info("Fetching bricks for the volume : %s" % self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s" % bricks_list)

        # Verify that not all bricks are online now
        g.log.info("Verifying that the offlined brick is reported offline")
        ret = are_bricks_online(self.mnode, self.volname, bricks_list)
        self.assertFalse(ret, ("All bricks are reported online even after "
                               "bringing one brick offline"))
        g.log.info("Not all bricks are online, as expected.")

        # get the bricks for the volume
        g.log.info("Fetching bricks for the volume : %s" % self.volname)
        bricks_list = get_all_bricks(self.mnode, self.volname)
        g.log.info("Brick List : %s" % bricks_list)

        # Offline Bricks list
        offbricks = get_offline_bricks_list(self.mnode, self.volname)
        g.log.info("Bricks Offline: %s" % offbricks)

        # Online Bricks list
        onbricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Bricks Online: %s" % onbricks)

        # Create snapshot of volume; it should fail as a brick is offline
        ret, _, _ = snap_create(self.mnode, self.volname, "snap1", False,
                                "Description with $p3c1al characters!")
        self.assertNotEqual(ret, 0, ("Unexpected: snapshot snap1 of volume %s "
                                     "was created while brick %s is offline"
                                     % (self.volname, brick_to_bring_offline)))
        g.log.info("Snapshot creation of volume %s failed as expected",
                   self.volname)

        # Volume info
        ret = get_volume_info(self.mnode, self.volname)
        self.assertTrue(ret, ("Failed to perform gluster volume "
                              "info on volume %s" % self.volname))
        g.log.info("Gluster volume info on volume %s is successful",
                   self.volname)
        # Snapshot list
        ret, _, _ = snap_list(self.mnode)
        self.assertEqual(
            ret, 0, ("Failed to list snapshots of volume %s" % self.volname))
        g.log.info("Snapshot list command for volume %s was successful",
                   self.volname)
    def test_server_side_healing_happens_only_when_glustershd_running(self):
        """
        Test Script which verifies that the server side healing must happen
        only if the heal daemon is running on the node where source brick
        resides.

         * Create and start the Replicate volume
         * Check the glustershd processes - Only 1 glustershd should be listed
         * Bring down the bricks without affecting the cluster
         * Create files on volume
         * Kill the glustershd on the nodes where the bricks are running
         * Bring up the bricks which were killed in the previous step
         * check the heal info - heal info must show pending heal info, heal
           shouldn't happen since glustershd is down on source node
         * issue heal
         * trigger client side heal
         * heal should complete successfully
        """
        # pylint: disable=too-many-locals,too-many-statements,too-many-lines
        # Setting Volume options
        options = {
            "metadata-self-heal": "on",
            "entry-self-heal": "on",
            "data-self-heal": "on"
        }
        ret = set_volume_options(self.mnode, self.volname, options)
        self.assertTrue(ret, 'Failed to set options %s' % options)
        g.log.info("Successfully set %s for volume %s", options, self.volname)

        # Check the self-heal daemon process
        ret, pids = get_self_heal_daemon_pid(self.servers)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in verifying self heal daemon process"
            " on all nodes %s", self.servers)

        # Select the bricks to bring offline
        bricks_to_bring_offline = (select_volume_bricks_to_bring_offline(
            self.mnode, self.volname))
        g.log.info("Brick List to bring offline : %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring down the bricks")
        g.log.info("Brought down the brick process "
                   "for %s", bricks_to_bring_offline)

        # Write files on all mounts
        all_mounts_procs, num_files_to_write = [], 100
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s create_files "
                   "-f %s --base-file-name file %s" %
                   (self.script_upload_path, num_files_to_write,
                    mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "IO failed on some of the clients")
        g.log.info("IO is successful on all mounts")

        # Get online bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Get the nodes where bricks are running
        bring_offline_glustershd_nodes = []
        for brick in online_bricks:
            bring_offline_glustershd_nodes.append(brick.split(":")[0])
        g.log.info("self heal deamon on nodes %s to be killed",
                   bring_offline_glustershd_nodes)

        # Kill the self heal daemon process on nodes
        ret = bring_self_heal_daemon_process_offline(
            bring_offline_glustershd_nodes)
        self.assertTrue(
            ret, ("Unable to bring self heal daemon process"
                  " offline for nodes %s" % bring_offline_glustershd_nodes))
        g.log.info(
            "Successfully brought down self heal process for "
            "nodes %s", bring_offline_glustershd_nodes)

        # Check the heal info
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        self.assertIsNotNone(heal_info, "Failed to get heal info for "
                             "the volume %s" % self.volname)
        g.log.info("Successfully got heal info %s for the volume %s",
                   heal_info, self.volname)

        # Bring bricks online
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))

        # Issue heal
        ret = trigger_heal_full(self.mnode, self.volname)
        self.assertFalse(ret,
                         ("Able to trigger heal on volume %s where "
                          "self heal daemon is not running" % self.volname))
        g.log.info(
            "Expected : Unable to trigger heal on volume %s where "
            "self heal daemon is not running", self.volname)

        # Wait for 130 sec to heal
        ret = monitor_heal_completion(self.mnode, self.volname, 130)
        self.assertFalse(ret, ("Heal Completed on volume %s" % self.volname))
        g.log.info("Expected : Heal pending on volume %s", self.volname)

        # Check the heal info
        heal_info_after_triggering_heal = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)

        # Compare pending heal entries with the number of files written
        for node in online_bricks:
            self.assertGreaterEqual(
                int(heal_info_after_triggering_heal[node]['numberOfEntries']),
                num_files_to_write,
                ("Some of the files are healed from source bricks %s where "
                 "self heal daemon is not running" % node))
        g.log.info("EXPECTED: No files are healed from source bricks where "
                   "self heal daemon is not running")

        # Unmount and Mount volume again as volume options were set
        # after mounting the volume
        for mount_obj in self.mounts:
            ret, _, _ = umount_volume(mount_obj.client_system,
                                      mount_obj.mountpoint)
            self.assertEqual(ret, 0,
                             "Failed to unmount %s" % mount_obj.client_system)
            ret, _, _ = mount_volume(self.volname,
                                     mtype='glusterfs',
                                     mpoint=mount_obj.mountpoint,
                                     mserver=self.mnode,
                                     mclient=mount_obj.client_system)
            self.assertEqual(ret, 0,
                             "Failed to mount %s" % mount_obj.client_system)

        all_mounts_procs = []
        for mount_obj in self.mounts:
            cmd = ("/usr/bin/env python %s read %s" %
                   (self.script_upload_path, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            all_mounts_procs.append(proc)

        # Validate IO
        ret = validate_io_procs(all_mounts_procs, self.mounts)
        self.assertTrue(ret, "Reads failed on some of the clients")
        g.log.info("Reads successful on all mounts")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "Unable to heal the pending entries")
        g.log.info("Successfully healed the pending entries for volume %s",
                   self.volname)
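    # Hedged sketch of the "trigger client side heal" step mentioned in the
    # docstring: forcing lookups on every file from the mount (here via a
    # find/stat sweep) makes the client notice and heal pending entries. The
    # exact command is an assumption; the read job above serves the same role.
    def _trigger_client_side_heal_via_lookups(self, mount_obj):
        cmd = 'cd {}; find . | xargs stat > /dev/null'.format(
            mount_obj.mountpoint)
        ret, _, err = g.run(mount_obj.client_system, cmd)
        self.assertEqual(ret, 0, 'Lookup sweep failed on {}: {}'.format(
            mount_obj.mountpoint, err))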
    def test_heal_for_conservative_merge_with_two_bricks_blame(self):
        """
        1) Create 1x3 volume and fuse mount the volume
        2) On the mount, create a directory dir1
        3) Pkill glusterfsd on node n1 (b2 on node2 and b3 and node3 up)
        4) touch f{1..10} on the mountpoint
        5) b2 and b3 xattrs would be blaming b1 as files are created while
           b1 is down
        6) Reset the b3 xattrs to NOT blame b1 by using setfattr
        7) Now pkill glusterfsd of b2 on node2
        8) Restart glusterd on node1 to bring up b1
        9) Now bricks: b1 online, b2 down, b3 online
        10) touch x{1..10} under dir1 itself
        11) Again reset xattr on node3 of b3 so that it doesn't blame b2,
        as done for b1 in step 6
        12) Do restart glusterd on node2 hosting b2 to bring all bricks online
        13) Check for heal info, split-brain and arequal for the bricks
        """
        # pylint: disable=too-many-locals
        # Create dir `dir1/` on mountpoint
        path = self.mounts[0].mountpoint + "/dir1"
        ret = mkdir(self.mounts[0].client_system, path, parents=True)
        self.assertTrue(ret, "Directory {} creation failed".format(path))

        all_bricks = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(all_bricks, "Unable to fetch bricks of volume")
        brick1, brick2, brick3 = all_bricks

        # Bring first brick offline
        self._bring_brick_offline_and_check(brick1)

        # touch f{1..10} files on the mountpoint
        cmd = ("cd {mpt}; for i in `seq 1 10`; do touch f$i"
               "; done".format(mpt=path))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Unable to create files on mountpoint")

        # Check b2 and b3 xattrs are blaming b1 and are same
        self.assertEqual(self._get_fattr_for_the_brick(brick2),
                         self._get_fattr_for_the_brick(brick3),
                         "Xattrs of bricks {} and {} differ; both should be "
                         "blaming brick: {}".format(brick2, brick3, brick1))

        # Reset the xattrs of dir1 on b3 for brick b1
        first_xattr_to_reset = "trusted.afr.{}-client-0".format(self.volname)
        xattr_value = "0x000000000000000000000000"
        host, brick_path = brick3.split(":")
        brick_path = brick_path + "/dir1"
        ret = set_fattr(host, brick_path, first_xattr_to_reset, xattr_value)
        self.assertTrue(ret, "Unable to set xattr for the directory")

        # Kill brick2 on the node2
        self._bring_brick_offline_and_check(brick2)

        # Restart glusterd on node1 to bring the brick1 online
        self.assertTrue(restart_glusterd([brick1.split(":")[0]]), "Unable to "
                        "restart glusterd")
        # checking for peer status post glusterd restart
        self._check_peers_status()

        # Check if the brick b1 on node1 is online or not
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, "Unable to fetch online bricks")
        self.assertIn(brick1, online_bricks, "Brick:{} is still offline after "
                                             "glusterd restart".format(brick1))

        # Create 10 files under dir1 naming x{1..10}
        cmd = ("cd {mpt}; for i in `seq 1 10`; do touch x$i"
               "; done".format(mpt=path))
        ret, _, _ = g.run(self.mounts[0].client_system, cmd)
        self.assertEqual(ret, 0, "Unable to create files on mountpoint")

        # Reset the xattrs from brick3 on to brick2
        second_xattr_to_reset = "trusted.afr.{}-client-1".format(self.volname)
        ret = set_fattr(host, brick_path, second_xattr_to_reset, xattr_value)
        self.assertTrue(ret, "Unable to set xattr for the directory")

        # Bring brick2 online
        self.assertTrue(restart_glusterd([brick2.split(":")[0]]), "Unable to "
                        "restart glusterd")
        self._check_peers_status()

        self.assertTrue(are_bricks_online(self.mnode, self.volname, [brick2]),
                        "Brick {} is not online".format(brick2))

        # Check are there any files in split-brain and heal completion
        self.assertFalse(is_volume_in_split_brain(self.mnode, self.volname),
                         "Some files are in split brain for "
                         "volume: {}".format(self.volname))
        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
                        "Conservative merge of files failed")

        # Check arequal checksum of all the bricks is same
        ret, arequal_from_the_bricks = collect_bricks_arequal(all_bricks)
        self.assertTrue(ret, "Failed to collect arequal from the bricks "
                        "in the subvol {}".format(all_bricks))
        self.assertEqual(len(set(arequal_from_the_bricks)), 1, "Arequal is "
                         "not same on all the bricks in the subvol")
    def _perform_brick_ops_and_enable_self_heal(self, op_type):
        '''Refactor of steps common to all tests: Brick down and perform
        metadata/data operations'''
        # First brick in the subvol will always be online and used for self
        # heal, so make keys match brick index
        self.op_cmd = {
            # Metadata Operations (owner and permission changes)
            'metadata': {
                2:
                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
                3:
                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
            },
            # Data Operations (append data to the files)
            'data': {
                2:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 2K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 2K >> dir.$i/file.$j; done;
                    done;''',
                3:
                '''cd {0}; for i in `seq 1 3`;
                    do {1} 3K >> file.$i;
                    for j in `seq 1 3`;
                    do {1} 3K >> dir.$i/file.$j; done;
                    done;''',
                # 4 - Will be used for final data consistency check
                4:
                '''cd {0}; for i in `seq 1 6`;
                    do {1} 4K >> file.$i;
                    for j in `seq 1 6`;
                    do {1} 4K >> dir.$i/file.$j; done;
                    done;''',
            },
        }
        bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(bricks,
                             'Not able to get list of bricks in the volume')

        # Make first brick always online and start operations from second brick
        for index, brick in enumerate(bricks[1:], start=2):

            # Bring brick offline
            ret = bring_bricks_offline(self.volname, brick)
            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))

            # Perform metadata/data operation
            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
            ret, _, err = g.run(self.client, cmd)
            self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))

            # Bring brick online
            ret = bring_bricks_online(
                self.mnode,
                self.volname,
                brick,
                bring_bricks_online_methods='volume_start_force')
            self.assertTrue(ret, 'Unable to bring {} online'.format(brick))

        # Assert metadata/data operations resulted in pending heals
        self.assertFalse(is_heal_complete(self.mnode, self.volname))

        # Enable and wait self heal daemon to be online
        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
                        'Enabling self heal daemon failed')
        self.assertTrue(
            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
            'Not all self heal daemons are online')
    def test_volume_set_when_glusterd_stopped_on_one_node(self):
        """
        Test Case:
        1) Setup and mount a volume on client.
        2) Stop glusterd on a random server.
        3) Start IO on mount points
        4) Set an option on the volume
        5) Start glusterd on the stopped node.
        6) Verify all the bricks are online after starting glusterd.
        7) Check if the volume info is synced across the cluster.
        """
        # Fetching the bricks list and storing it for later use
        list1 = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(
            list1, "Failed to get the list of online bricks "
            "for volume: %s" % self.volname)

        # Fetching a random server from list.
        self.random_server = choice(self.servers[1:])

        # Stopping glusterd on one node.
        ret = stop_glusterd(self.random_server)
        self.assertTrue(ret, "Failed to stop glusterd on one node.")
        g.log.info("Successfully stopped glusterd on one node.")

        self.glusterd_is_stopped = True

        # Start IO on mount points.
        self.all_mounts_procs = []
        counter = 1
        for mount_obj in self.mounts:
            g.log.info("Starting IO on %s:%s", mount_obj.client_system,
                       mount_obj.mountpoint)
            cmd = ("/usr/bin/env python %s create_deep_dirs_with_files "
                   "--dir-depth 4 "
                   "--dir-length 6 "
                   "--dirname-start-num %d "
                   "--max-num-of-dirs 3 "
                   "--num-of-files 5 %s" %
                   (self.script_upload_path, counter, mount_obj.mountpoint))
            proc = g.run_async(mount_obj.client_system,
                               cmd,
                               user=mount_obj.user)
            self.all_mounts_procs.append(proc)
            counter += 1

        # Validate IO
        self.assertTrue(validate_io_procs(self.all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")
        g.log.info("IO validation complete.")

        # Set an option on the volume: stat-prefetch on
        self.options = {"stat-prefetch": "on"}
        ret = set_volume_options(self.mnode, self.volname, self.options)
        self.assertTrue(ret, ("Failed to set option stat-prefetch to on "
                              "for the volume %s" % self.volname))
        g.log.info(
            "Succeeded in setting stat-prefetch option to on "
            "for the volume %s", self.volname)

        # start glusterd on the node where glusterd is stopped
        ret = start_glusterd(self.random_server)
        self.assertTrue(ret,
                        "Failed to start glusterd on %s" % self.random_server)
        g.log.info("Successfully started glusterd on node: %s",
                   self.random_server)

        # Waiting for glusterd to start completely
        ret = wait_for_glusterd_to_start(self.random_server)
        self.assertTrue(ret,
                        "glusterd is not running on %s" % self.random_server)
        g.log.info("glusterd is started and running on %s", self.random_server)

        self.glusterd_is_stopped = False

        # Confirm if all the bricks are online or not
        count = 0
        while count < 10:
            list2 = get_online_bricks_list(self.mnode, self.volname)
            if list1 == list2:
                break
            sleep(2)
            count += 1

        self.assertListEqual(
            list1, list2, "Unexpected: All the bricks in the "
            "volume are not online")
        g.log.info("All the bricks in the volume are back online")

        # volume info should be synced across the cluster
        out1 = get_volume_info(self.mnode, self.volname)
        self.assertIsNotNone(
            out1, "Failed to get the volume info from %s" % self.mnode)
        g.log.info("Getting volume info from %s is success", self.mnode)

        count = 0
        while count < 60:
            out2 = get_volume_info(self.random_server, self.volname)
            self.assertIsNotNone(
                out2,
                "Failed to get the volume info from %s" % self.random_server)
            if out1 == out2:
                break
            sleep(2)
            count += 1

        self.assertDictEqual(
            out1, out2, "Volume info is not synced on the "
            "restarted node")
        g.log.info("Volume info is successfully synced across the cluster")
    def _create_xattr_check_self_heal(self):
        """Create custom xattr and check if its healed"""
        # Set the xattr on the dir1
        self._set_xattr_value(fattr_value="bar2")

        # Get online brick list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, "Failed to get online bricks")

        # Check if the custom xattr is being displayed on the
        # mount-point for dir1
        self._check_xattr_value_on_mnt(expected_value="bar2")

        # Check if the xattr is being displayed on the online-bricks
        # for dir1
        self._check_xattr_value_on_bricks(online_bricks, expected_value="bar2")

        # Modify custom xattr value on dir1
        self._set_xattr_value(fattr_value="ABC")

        # Lookup on mount-point to refresh the value of xattr
        self._perform_lookup()

        # Check if the modified custom xattr is being displayed
        # on the mount-point for dir1
        self._check_xattr_value_on_mnt(expected_value="ABC")

        # Check if the modified custom xattr is being
        # displayed on the bricks for dir1
        self._check_xattr_value_on_bricks(online_bricks, expected_value="ABC")

        # Remove the custom xattr from the mount point for dir1
        ret = delete_fattr(self.client,
                           '{}/dir1'.format(self.m_point), 'user.foo')
        self.assertTrue(ret, "Failed to delete the xattr for "
                             "dir1 on mountpoint")
        g.log.info(
            "Successfully deleted the xattr for dir1 from mountpoint")

        # Lookup on mount-point to refresh the value of xattr
        self._perform_lookup()

        # Check that the custom xattr is no longer displayed
        # for dir1 on the mount point
        ret = get_fattr(self.client, '{}/dir1'.format(self.m_point),
                        'user.foo', encode="text")
        self.assertEqual(ret, None, "Xattr for dir1 is not removed"
                         " on:{}".format(self.client))
        g.log.info("Success: xattr is removed for dir1 on mointpoint")

        # Check that the custom xattr is no longer displayed
        # for dir1 on the backend bricks
        for brick in online_bricks:
            host, brick_path = brick.split(':')
            ret = get_fattr(host, '{}/dir1'.format(brick_path),
                            'user.foo', encode="text")
            self.assertEqual(ret, None, "Xattr for dir1 is not removed"
                                        " on:{}".format(brick_path))
            g.log.info("Xattr for dir1 is removed from "
                       "brick:%s", brick_path)

        # Check if the trusted.glusterfs.pathinfo is displayed
        # for dir1 on the mount point
        ret = get_fattr(self.client, '{}/dir1'.format(self.m_point),
                        'trusted.glusterfs.pathinfo')
        self.assertIsNotNone(ret, "Failed to get the xattr"
                             " on:{}".format(self.client))
        g.log.info("The xattr trusted.glusterfs.pathinfo"
                   " is displayed on mointpoint for dir1")

        # Set the xattr on the dir1
        self._set_xattr_value(fattr_value="star1")

        # Bring back the bricks online
        ret, _, _ = volume_start(self.mnode, self.volname, force=True)
        self.assertFalse(ret, 'Failed to start volume %s with "force" option'
                         % self.volname)
        g.log.info('Successfully started volume %s with "force" option',
                   self.volname)

        # Execute lookup on the mount point
        self._perform_lookup()

        # Get online brick list
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, "Failed to get online bricks")

        # Check if the custom xattr is being displayed
        # on the mount-point for dir1
        self._check_xattr_value_on_mnt(expected_value="star1")

        # Check if the custom xattr is displayed on all the bricks
        self._check_xattr_value_on_bricks(online_bricks,
                                          expected_value="star1")
    def test_snap_self_heal(self):
        """
        Steps:

        1. create a volume
        2. mount volume
        3. create snapshot of that volume
        4. Activate snapshot
        5. Clone snapshot and Mount
        6. Perform I/O
        7. Bring down a few bricks from the volume without
           affecting the volume or cluster.
        8. Perform I/O
        9. Bring the offline bricks back online
        10. Validate heal is complete with arequal

        """
        # pylint: disable=too-many-statements, too-many-locals
        # Creating snapshot:
        g.log.info("Starting to Create snapshot")
        ret, _, _ = snap_create(self.mnode, self.volname, self.snap)
        self.assertEqual(
            ret, 0, ("Failed to create snapshot for volume %s" % self.volname))
        g.log.info("Snapshot %s created successfully for volume %s", self.snap,
                   self.volname)

        # Activating snapshot
        g.log.info("Starting to Activate Snapshot")
        ret, _, _ = snap_activate(self.mnode, self.snap)
        self.assertEqual(ret, 0,
                         ("Failed to Activate snapshot %s" % self.snap))
        g.log.info("Snapshot %s activated successfully", self.snap)

        # snapshot list
        ret, _, _ = snap_list(self.mnode)
        self.assertEqual(ret, 0, ("Failed to list all the snapshot"))
        g.log.info("Snapshot list command was successful")

        # Creating a Clone volume from snapshot:
        g.log.info("Starting to Clone volume from Snapshot")
        ret, _, _ = snap_clone(self.mnode, self.snap, self.clone)
        self.assertEqual(ret, 0, ("Failed to clone %s from snapshot %s" %
                                  (self.clone, self.snap)))
        g.log.info("%s created successfully", self.clone)

        # Start the cloned volume
        g.log.info("Starting the cloned volume")
        ret, _, _ = volume_start(self.mnode, self.clone)
        self.assertEqual(ret, 0, "Failed to start clone %s" % self.clone)
        g.log.info("Clone volume %s started successfully", self.clone)

        # Mounting a clone volume
        g.log.info("Mounting a clone volume")
        ret, _, _ = mount_volume(self.clone, self.mount_type, self.mount1,
                                 self.mnode, self.clients[0])
        self.assertEqual(ret, 0,
                         "Failed to mount clone Volume %s" % self.clone)
        g.log.info("Clone volume %s mounted Successfully", self.clone)

        # Checking cloned volume mounted or not
        ret = is_mounted(self.clone, self.mount1, self.mnode, self.clients[0],
                         self.mount_type)
        self.assertTrue(
            ret,
            "Failed to mount clone volume on mount point: %s" % self.mount1)
        g.log.info("clone Volume %s mounted on %s", self.clone, self.mount1)

        # write files on all mounts
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mount1)
        cmd = ("python %s create_files "
               "-f 10 --base-file-name file %s" %
               (self.script_upload_path, self.mount1))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0,
                         "Failed to create files on mount %s" % self.mount1)
        g.log.info("Successful in creating I/O on mount %s", self.mount1)

        # get the bricks from the volume
        g.log.info("Fetching bricks for the volume : %s", self.clone)
        bricks_list = get_all_bricks(self.mnode, self.clone)
        g.log.info("Brick List : %s", bricks_list)

        # Select bricks to bring offline
        g.log.info("Starting to bring bricks to offline")
        bricks_to_bring_offline_dict = (select_bricks_to_bring_offline(
            self.mnode, self.clone))
        bricks_to_bring_offline = list(filter(
            None, (bricks_to_bring_offline_dict['hot_tier_bricks'] +
                   bricks_to_bring_offline_dict['cold_tier_bricks'] +
                   bricks_to_bring_offline_dict['volume_bricks'])))
        g.log.info("Bricks to bring offline: %s ", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.clone, bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring the bricks offline")
        g.log.info("Successful in bringing bricks: %s offline",
                   bricks_to_bring_offline)

        # Offline Bricks list
        offline_bricks = get_offline_bricks_list(self.mnode, self.clone)
        self.assertIsNotNone(
            offline_bricks, "Failed to get offline bricklist"
            "for volume %s" % self.clone)
        for bricks in offline_bricks:
            self.assertIn(bricks, bricks_to_bring_offline,
                          "Failed to validate "
                          "Bricks offline")
        g.log.info("Bricks Offline: %s", offline_bricks)

        # Online Bricks list
        online_bricks = get_online_bricks_list(self.mnode, self.clone)
        self.assertIsNotNone(
            online_bricks, "Failed to get online bricks"
            " for volume %s" % self.clone)
        g.log.info("Bricks Online: %s", online_bricks)

        # write files mountpoint
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mount1)
        cmd = ("python %s create_files "
               "-f 10 --base-file-name file %s" %
               (self.script_upload_path, self.mount1))
        ret, _, _ = g.run(self.clients[0], cmd)
        self.assertEqual(ret, 0,
                         "Failed to create files on mount %s" % self.mount1)
        g.log.info("Successful in creating I/O on mount %s", self.mount1)

        # Bring all bricks online
        g.log.info("bring all bricks online")
        ret = bring_bricks_online(self.mnode, self.clone,
                                  bricks_to_bring_offline)
        self.assertTrue(ret, "Failed to bring bricks online")
        g.log.info("Successful in bringing all bricks online")

        # Validate Bricks are online
        g.log.info("Validating all bricks are online")
        ret = are_bricks_online(self.mnode, self.clone, bricks_list)
        self.assertTrue(ret, "Failed to bring all the bricks online")
        g.log.info("bricks online: %s", bricks_list)

        # Wait for volume processes to be online
        g.log.info("Wait for volume processes to be online")
        ret = wait_for_volume_process_to_be_online(self.mnode, self.clone)
        self.assertTrue(ret, ("Failed to wait for volume %s processes to "
                              "be online" % self.clone))
        g.log.info(
            "Successful in waiting for volume %s processes to be "
            "online", self.clone)

        # Verify volume's all process are online
        g.log.info("Verifying volume's all process are online")
        ret = verify_all_process_of_volume_are_online(self.mnode, self.clone)
        self.assertTrue(
            ret, ("Volume %s : All process are not online" % self.clone))
        g.log.info("Volume %s : All process are online", self.clone)

        # Wait for the heal process to complete
        g.log.info("Waiting for heal process to complete")
        ret = monitor_heal_completion(self.mnode, self.clone)
        self.assertTrue(ret, "Failed to complete the heal process")
        g.log.info("Successfully completed heal process")

        # Check arequal
        # Get the subvolumes
        g.log.info("Starting to get sub-volumes for volume %s", self.clone)
        subvols = get_subvols(self.mnode, self.clone)
        num_subvols = len(subvols['volume_subvols'])
        g.log.info("Number of subvolumes in volume %s: %d", self.clone,
                   num_subvols)

        # Get arequals for each subvol and compare its bricks
        g.log.info("Starting to compare arequals")
        for i in range(0, num_subvols):
            # Get arequal for the first brick of the subvol
            subvol_brick_list = subvols['volume_subvols'][i]
            node, brick_path = subvol_brick_list[0].split(':')
            command = ('arequal-checksum -p %s '
                       '-i .glusterfs -i .landfill -i .trashcan' % brick_path)
            ret, arequal, _ = g.run(node, command)
            self.assertFalse(ret, 'Failed to get arequal on brick %s'
                             % subvol_brick_list[0])
            first_brick_total = arequal.splitlines()[-1].split(':')[-1]

            # Get arequal for every brick and compare with the first brick
            for brick in subvol_brick_list:
                node, brick_path = brick.split(':')
                command = ('arequal-checksum -p %s '
                           '-i .glusterfs -i .landfill -i .trashcan'
                           % brick_path)
                ret, brick_arequal, _ = g.run(node, command)
                self.assertFalse(ret,
                                 'Failed to get arequal on brick %s' % brick)
                g.log.info('Getting arequal for %s is successful', brick)
                brick_total = brick_arequal.splitlines()[-1].split(':')[-1]
                self.assertEqual(
                    first_brick_total, brick_total,
                    'Arequals for subvol and %s are not equal' % brick)
                g.log.info('Arequals for subvol and %s are equal', brick)
        g.log.info('All arequals are equal for distributed-replicated')
    def test_existing_glustershd_should_take_care_of_self_healing(self):
        """
        Test Script which verifies that the existing glustershd should take
        care of self healing

        * Create and start the Replicate volume
        * Check the glustershd processes - Note the pids
        * Bring down the One brick ( lets say brick1)  without affecting
          the cluster
        * Create 1000 files on volume
        * bring the brick1 up which was killed in previous steps
        * check the heal info - proactive self healing should start
        * Bring down brick1 again
        * wait for 60 sec and brought up the brick1
        * Check the glustershd processes - pids should be different
        * Monitor the heal till its complete

        """
        # pylint: disable=too-many-locals,too-many-lines,too-many-statements
        nodes = self.servers

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        glustershd_pids = pids

        # Select the bricks to bring offline
        g.log.info("Selecting bricks to bring offline for volume %s",
                   self.volname)
        bricks_to_bring_offline = \
            select_volume_bricks_to_bring_offline(self.mnode,
                                                  self.volname)
        g.log.info("Brick list to bring offline: %s", bricks_to_bring_offline)

        # Bring down the selected bricks
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # get the bricks which are running
        g.log.info("getting the brick list which are online")
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        g.log.info("Online Bricks for volume %s : %s", self.volname,
                   online_bricks)

        # Write 1000 files of 1MB each from the mount point
        g.log.info("Starting IO on all mounts...")
        g.log.info("mounts: %s", self.mounts)
        all_mounts_procs = []
        cmd = ("for i in `seq 1 1000`; "
               "do dd if=/dev/urandom of=%s/file_$i "
               "bs=1M count=1; "
               "done" % self.mounts[0].mountpoint)
        g.log.info(cmd)
        proc = g.run_async(self.mounts[0].client_system,
                           cmd,
                           user=self.mounts[0].user)
        all_mounts_procs.append(proc)

        # Validate IO
        self.assertTrue(validate_io_procs(all_mounts_procs, self.mounts),
                        "IO failed on some of the clients")

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s", self.volname)
        heal_info = get_heal_info_summary(self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname, heal_info)

        # Bring bricks online
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Wait for 90 sec to start self healing
        g.log.info('Waiting for 90 sec to start self healing')
        time.sleep(90)

        # check the heal info
        g.log.info("Get the pending heal info for the volume %s", self.volname)
        heal_info_after_brick_online = get_heal_info_summary(
            self.mnode, self.volname)
        g.log.info("Successfully got heal info for the volume %s",
                   self.volname)
        g.log.info("Heal Info for volume %s : %s", self.volname,
                   heal_info_after_brick_online)

        # Check that the pending heal count has decreased
        flag = False
        for brick in online_bricks:
            if int(heal_info_after_brick_online[brick]['numberOfEntries'])\
                    < int(heal_info[brick]['numberOfEntries']):
                flag = True
                break

        self.assertTrue(flag, "Pro-active self heal is not started")
        g.log.info("Pro-active self heal is started")

        # bring down bricks again
        g.log.info("Going to bring down the brick process "
                   "for %s", bricks_to_bring_offline)
        ret = bring_bricks_offline(self.volname, bricks_to_bring_offline)
        self.assertTrue(ret, ("Failed to bring down the bricks. Please "
                              "check the log file for more details."))
        g.log.info("Brought down the brick process "
                   "for %s successfully", bricks_to_bring_offline)

        # Wait for 60 sec and bring up the bricks again
        g.log.info('Waiting for 60 sec before bringing up the bricks again')
        time.sleep(60)
        g.log.info("Bring bricks: %s online", bricks_to_bring_offline)
        ret = bring_bricks_online(self.mnode, self.volname,
                                  bricks_to_bring_offline, 'glusterd_restart')
        self.assertTrue(
            ret,
            ("Failed to bring bricks: %s online" % bricks_to_bring_offline))
        g.log.info("Successfully brought all bricks: %s online",
                   bricks_to_bring_offline)

        # Verify that the glustershd process has been daemonized
        ret = is_shd_daemonized(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process found"))

        # check the self-heal daemon process
        g.log.info("Starting to get self-heal daemon process on "
                   "nodes %s", nodes)
        ret, pids = get_self_heal_daemon_pid(nodes)
        self.assertTrue(ret, ("Either No self heal daemon process found or "
                              "more than One self heal daemon process "
                              "found : %s" % pids))
        g.log.info(
            "Successful in getting Single self heal daemon process"
            " on all nodes %s", nodes)
        shd_pids_after_bricks_online = pids

        # Compare the glustershd pids
        self.assertNotEqual(glustershd_pids, shd_pids_after_bricks_online,
                            ("self-heal daemon pids are the same before and "
                             "after bringing the bricks online"))
        g.log.info("EXPECTED: self-heal daemon pids are different before "
                   "and after bringing the bricks online")

        # wait for heal to complete
        g.log.info("Monitoring the heal.....")
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret,
                        ("Heal is not completed on volume %s" % self.volname))
        g.log.info("Heal Completed on volume %s", self.volname)

        # Check if heal is completed
        ret = is_heal_complete(self.mnode, self.volname)
        self.assertTrue(ret, 'Heal is not complete')
        g.log.info('Heal is completed successfully')
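    # Note: get_self_heal_daemon_pid() used above comes from glustolibs.  The
    # helper below (a hypothetical name, not part of the original class) is
    # only a minimal sketch of the idea behind the pid checks, assuming the
    # self-heal daemon's command line contains the string "glustershd" so
    # `pgrep -f` can locate it; the bracket trick keeps pgrep from matching
    # the remote shell that carries the pattern itself.
    def _get_glustershd_pids_sketch(self, nodes):
        """Sketch: return {node: pid} of the self-heal daemon on each node."""
        pids = {}
        for node in nodes:
            ret, out, _ = g.run(node, "pgrep -f '[g]lustershd'")
            self.assertEqual(ret, 0,
                             "No self-heal daemon process found on %s" % node)
            pid_list = out.split()
            self.assertEqual(len(pid_list), 1,
                             "Expected exactly one glustershd process on %s, "
                             "found %s" % (node, pid_list))
            pids[node] = pid_list[0]
        return pids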
    def _perform_quota_ops_before_brick_down(self):
        """
        Refactor of common test steps across three test functions
        """
        self.client, self.m_point = (self.mounts[0].client_system,
                                     self.mounts[0].mountpoint)
        ret = mkdir(self.client, '%s/dir/dir1' % self.m_point, parents=True)
        self.assertTrue(ret, 'Failed to create first dir on mountpoint')
        if self.num_of_dirs == 2:
            ret = mkdir(self.client, '%s/dir/dir2' % self.m_point)
            self.assertTrue(ret, 'Failed to create second dir on mountpoint')

        # Types of errors
        self.space_error = 'Input/output error|No space left on device'
        self.quota_error = 'Disk quota exceeded'

        # Start IO from the clients
        cmd = ('/usr/bin/env python {} -n 10 -t 480 -d 10 -c 256 --dir '
               '{}/dir/dir{}')
        for count, mount in enumerate(self.mounts, start=1):
            proc = g.run_async(
                mount.client_system,
                cmd.format(self.script_path, mount.mountpoint, count))
            self.all_mount_procs.append(proc)

        # fallocate a large file and perform IO on remaining space
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, 'Failed to get list of online '
                             'bricks')
        brick_node, brick_path = online_bricks[0].split(':')
        self.brick_size = self._get_space_in_gb(brick_node,
                                                brick_path,
                                                size='total')
        self.free_disk_size = self._get_space_in_gb(self.client, self.m_point)
        self.fqpath = self.m_point + '/sparsefile'
        self.rem_size = 1  # Only 1G will be available to the mount
        self.alloc_size = self.free_disk_size - self.rem_size
        self._fallocate_file()

        # Insert breakpoint in the log
        self.bp_text = 'breakpoint_' + str(ceil(time())) + '_'
        self.bp_count = 1
        self.logpath = ('/var/log/glusterfs/mnt-' + self.volname +
                        '_glusterfs.log')
        self._insert_bp(self.client, self.logpath)

        # Create file with size greater than available mount space
        self.cmd = ('cd {}; cat /dev/urandom | tr -dc [:space:][:print:] '
                    '| head -c {}G > datafile_{};')
        self.fqpath = self.m_point + '/dir/dir1'
        proc = g.run_async(
            self.client,
            self.cmd.format(self.fqpath, self.rem_size * 2, self.bp_count))
        self.assertFalse(
            validate_io_procs([proc], self.mounts[0]),
            'Fail: Process should not allow data more '
            'than available space to be written')
        sleep(10)
        self._insert_bp(self.client, self.logpath)

        # Validate space error in the mount log
        self._validate_error_in_mount_log(pattern=self.space_error)

        # Enable quota and set all alert timeouts to 0secs
        ret, _, _ = quota_enable(self.mnode, self.volname)
        self.assertEqual(ret, 0, 'Not able to enable quota on the volume')
        for alert_type, msg in ((quota_set_alert_time,
                                 'alert'), (quota_set_soft_timeout, 'soft'),
                                (quota_set_hard_timeout, 'hard')):
            ret, _, _ = alert_type(self.mnode, self.volname, '0sec')
            self.assertEqual(
                ret, 0, 'Failed to set quota {} timeout to 0sec'.format(msg))

        # Expose only 20G and set quota's on the dir
        self.rem_size = 20  # Only 20G will be available to whole mount
        self.alloc_size = self.free_disk_size - self.rem_size
        self.fqpath = self.m_point + '/sparsefile'
        self._fallocate_file()

        self._insert_bp(self.client, self.logpath)
        ret, _, _ = quota_limit_usage(self.mnode,
                                      self.volname,
                                      path='/dir/dir1',
                                      limit='10GB')
        self.assertEqual(ret, 0, 'Not able to set quota limit on /dir/dir1')
        if self.num_of_dirs == 2:
            ret, _, _ = quota_limit_usage(self.mnode,
                                          self.volname,
                                          path='/dir/dir2',
                                          limit='5GB')
            self.assertEqual(ret, 0, 'Not able to set quota limit on '
                             '/dir/dir2')

        # Write data more than available quota and validate error
        sleep(10)
        self.rem_size = 1  # Only 1G will be available to /dir/dir1
        self.alloc_size = 9
        self.fqpath = self.m_point + '/dir/dir1/sparsefile'
        self._fallocate_file()

        self.fqpath = self.m_point + '/dir/dir1'
        proc = g.run_async(
            self.client,
            self.cmd.format(self.fqpath, self.rem_size * 2, self.bp_count))
        self.assertFalse(
            validate_io_procs([proc], self.mounts[0]),
            'Fail: Process should not allow data more '
            'than available space to be written')
        sleep(10)
        self._insert_bp(self.client, self.logpath)
        self._validate_error_in_mount_log(pattern=self.quota_error)
        self._validate_error_in_mount_log(pattern=self.space_error,
                                          exp_pre=False)
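    # The helpers used above (_get_space_in_gb, _fallocate_file, _insert_bp
    # and _validate_error_in_mount_log) are defined elsewhere in this class
    # and are not shown in this section.  The "_sketch" methods below are only
    # hedged illustrations of what they presumably do, under the assumption
    # that `df` and `fallocate` are available on the hosts; the real
    # implementations may differ.
    def _get_space_in_gb_sketch(self, host, path, size='free'):
        """Sketch: return free or total space of `path` on `host` in GB."""
        ret, out, _ = g.run(
            host, "df -BG --output=avail,size %s | tail -1" % path)
        self.assertEqual(ret, 0, 'Failed to run df on %s' % host)
        avail, total = (int(x.strip('G')) for x in out.split())
        return total if size == 'total' else avail

    def _fallocate_file_sketch(self):
        """Sketch: preallocate `self.alloc_size` GB at `self.fqpath`."""
        cmd = 'fallocate -l {}G {}'.format(self.alloc_size, self.fqpath)
        ret, _, _ = g.run(self.client, cmd)
        self.assertEqual(ret, 0, 'Failed to fallocate %s' % self.fqpath)

    def _insert_bp_sketch(self, host, logpath):
        """Sketch: append a numbered breakpoint marker to the mount log."""
        cmd = "echo '{}{}' >> {}".format(self.bp_text, self.bp_count, logpath)
        ret, _, _ = g.run(host, cmd)
        self.assertEqual(ret, 0, 'Failed to insert breakpoint in %s' % logpath)
        self.bp_count += 1

    def _validate_error_in_mount_log_sketch(self, pattern, exp_pre=True):
        """Sketch: check `pattern` in the log after the latest breakpoint."""
        # Print everything from the last breakpoint marker to EOF and count
        # the lines matching the error pattern.
        cmd = ("awk '/{}{}/,0' {} | grep -E -c '{}'".format(
            self.bp_text, self.bp_count - 1, self.logpath, pattern))
        _, out, _ = g.run(self.client, cmd)
        matches = int(out.strip() or 0)
        if exp_pre:
            self.assertGreater(matches, 0,
                               'Expected "%s" in the mount log' % pattern)
        else:
            self.assertEqual(matches, 0,
                             'Did not expect "%s" in the mount log' % pattern)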
    def test_no_fresh_lookup(self):
        """
        The testcase covers negative lookup of a directory in distributed-
        replicated and distributed-dispersed volumes
        1. Mount the volume on one client.
        2. Create a directory
        3. Validate the number of lookups for the directory creation from the
           log file.
        4. Perform a new lookup of the directory
        5. No new lookups should have happened on the directory, validate from
           the log file.
        6. Bring down one subvol of the volume and repeat step 4, 5
        7. Bring down one brick from the online bricks and repeat step 4, 5
        8. Start the volume with force and wait for all process to be online.
        """

        # Mounting the volume on a distinct directory for the validation of
        # testcase
        self.mountpoint = "/mnt/" + self.volname
        ret, _, _ = mount_volume(self.volname, mtype=self.mount_type,
                                 mpoint=self.mountpoint,
                                 mserver=self.mnode,
                                 mclient=self.mounts[0].client_system)
        self.assertEqual(ret, 0, ("Volume %s is not mounted") % self.volname)
        g.log.info("Volume mounted successfully : %s", self.volname)

        # Distinct log file for the validation of this test
        filename = "/var/log/glusterfs/mnt-" + self.volname + ".log"
        # Creating a dir on the mount point.
        dirname = self.mountpoint + "/dir1"
        ret = mkdir(host=self.mounts[0].client_system, fqpath=dirname)
        self.assertTrue(ret, "Failed to create dir1")
        g.log.info("dir1 created successfully for %s",
                   self.mounts[0].client_system)

        search_pattern = "/dir1: Calling fresh lookup"

        # Check log file for the pattern in the log file
        first_count = occurences_of_pattern_in_file(
            self.mounts[0].client_system, search_pattern, filename)
        self.assertGreater(first_count, 0, "Unable to find "
                           "pattern in the given file")
        g.log.info("Searched for the pattern in the log file successfully")

        # Perform a lookup of the directory dir1
        self.do_lookup(dirname)

        # Recheck for the number of lookups from the log file
        self.match_occurences(first_count, search_pattern, filename)

        # Bring down one subvol of the volume
        ret = get_subvols(self.mnode, self.volname)
        brick_list = choice(ret['volume_subvols'])
        ret = bring_bricks_offline(self.volname, brick_list)
        self.assertTrue(ret, "Unable to bring the given bricks offline")
        g.log.info("Able to bring all the bricks in the subvol offline")

        # Do a lookup on the mountpoint for the directory dir1
        self.do_lookup(dirname)

        # Re-check the number of occurrences of the lookup pattern
        self.match_occurences(first_count, search_pattern, filename)

        # From the online bricks, bring down one brick
        online_bricks = get_online_bricks_list(self.mnode, self.volname)
        self.assertIsNotNone(online_bricks, "Unable to fetch online bricks")
        g.log.info("Able to fetch the online bricks")
        offline_brick = choice(online_bricks)
        ret = bring_bricks_offline(self.volname, [offline_brick])
        self.assertTrue(ret, "Unable to bring the brick %s offline " %
                        offline_brick)
        g.log.info("Successfully brought the brick %s offline", offline_brick)

        # Do a lookup on the mountpoint and check for new lookups in the log
        self.do_lookup(dirname)
        self.match_occurences(first_count, search_pattern, filename)

        # Start volume with force
        ret, _, err = volume_start(self.mnode, self.volname, force=True)
        self.assertEqual(ret, 0, "Unable to force start the volume %s " % err)
        g.log.info("Volume started successfully")

        # Wait for all the processes to be online
        ret = wait_for_volume_process_to_be_online(self.mnode, self.volname)
        self.assertTrue(ret, "Some processes are offline")
        g.log.info("All processes of the volume are online")