예제 #1
0
 def test_observe_with_warmup(self):
     self._load_doc_data_all_buckets('create', 0, self.num_items)
     # Persist all the loaded data item
     self.log.info("Nodes in cluster: %s" % self.servers[:self.nodes_init])
     for bucket in self.buckets:
         RebalanceHelper.wait_for_persistence(self.master, bucket)
         self._stats_befor_warmup(bucket.name)
         self._restart_memcache(bucket.name)
         # for bucket in self.buckets:
         ClusterOperationHelper._wait_warmup_completed(self, self.servers[:self.nodes_init], bucket.name)
         self._run_observe(self)
예제 #2
0
 def test_observe_with_warmup(self):
     self._load_doc_data_all_buckets('create', 0, self.num_items)
     # Persist all the loaded data item
     self.log.info("Nodes in cluster: %s" % self.servers[:self.nodes_init])
     for bucket in self.buckets:
         RebalanceHelper.wait_for_persistence(self.master, bucket)
         self._stats_befor_warmup(bucket.name)
         self._restart_memcache(bucket.name)
         # for bucket in self.buckets:
         ClusterOperationHelper._wait_warmup_completed(
             self, self.servers[:self.nodes_init], bucket.name)
         self._run_observe()
예제 #3
0
    def rebalance_out_with_warming_up(self):
        master_restart = self.input.param("master_restart", False)
        if master_restart:
            warmup_node = self.master
        else:
            warmup_node = self.servers[len(self.servers) - self.nodes_out - 1]
        servs_out = self.servers[len(self.servers) - self.nodes_out:]
        shell = RemoteMachineShellConnection(warmup_node)
        shell.stop_couchbase()
        self.sleep(20)
        shell.start_couchbase()
        shell.disconnect()
        try:
            rebalance = self.cluster.async_rebalance(self.servers, [],
                                                     servs_out)
            rebalance.result()
        except RebalanceFailedException:
            self.log.info("rebalance was failed as expected")
            self.assertTrue(ClusterOperationHelper._wait_warmup_completed(self, [warmup_node], \
                            self.default_bucket_name, wait_time=self.wait_timeout * 10))

            self.log.info("second attempt to rebalance")
            rebalance = self.cluster.async_rebalance(self.servers, [],
                                                     servs_out)
            rebalance.result()
        self.verify_cluster_stats(self.servers[:len(self.servers) -
                                               self.nodes_out])
        self.verify_unacked_bytes_all_buckets()
예제 #4
0
    def rebalance_out_with_warming_up(self):
        master_restart = self.input.param("master_restart", False)
        if master_restart:
            warmup_node = self.master
        else:
            warmup_node = self.servers[len(self.servers) - self.nodes_out - 1]
        servs_out = self.servers[len(self.servers) - self.nodes_out :]
        shell = RemoteMachineShellConnection(warmup_node)
        shell.stop_couchbase()
        self.sleep(20)
        shell.start_couchbase()
        shell.disconnect()
        try:
            rebalance = self.cluster.async_rebalance(self.servers, [], servs_out)
            rebalance.result()
        except RebalanceFailedException:
            self.log.info("rebalance was failed as expected")
            self.assertTrue(
                ClusterOperationHelper._wait_warmup_completed(
                    self, [warmup_node], self.default_bucket_name, wait_time=self.wait_timeout * 10
                )
            )

            self.log.info("second attempt to rebalance")
            rebalance = self.cluster.async_rebalance(self.servers, [], servs_out)
            rebalance.result()
        self.verify_cluster_stats(self.servers[: len(self.servers) - self.nodes_out])
        self.verify_unacked_bytes_all_buckets()
예제 #5
0
    def rebalance_in_with_warming_up(self):
        servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
        servs_init = self.servers[:self.nodes_init]
        warmup_node = servs_init[-1]
        shell = RemoteMachineShellConnection(warmup_node)
        shell.stop_couchbase()
        self.sleep(20)
        shell.start_couchbase()
        shell.disconnect()
        try:
            rebalance = self.cluster.async_rebalance(
                servs_init, servs_in, [],
                sleep_before_rebalance=self.sleep_before_rebalance)
            rebalance.result()
        except RebalanceFailedException:
            self.log.info("rebalance was failed as expected")
            self.assertTrue(ClusterOperationHelper._wait_warmup_completed(self, [warmup_node], \
                            self.default_bucket_name, wait_time=self.wait_timeout * 10))

            self.log.info("second attempt to rebalance")
            rebalance = self.cluster.async_rebalance(
                servs_init + servs_in, [], [],
                sleep_before_rebalance=self.sleep_before_rebalance)
            rebalance.result()
        self.verify_cluster_stats(self.servers[:self.nodes_in + self.nodes_init])
        self.verify_unacked_bytes_all_buckets()
예제 #6
0
    def rebalance_in_with_warming_up(self):
        servs_in = self.servers[self.nodes_init:self.nodes_init + self.nodes_in]
        servs_init = self.servers[:self.nodes_init]
        warmup_node = servs_init[-1]
        shell = RemoteMachineShellConnection(warmup_node)
        shell.stop_couchbase()
        self.sleep(20)
        shell.start_couchbase()
        shell.disconnect()
        try:
            rebalance = self.cluster.async_rebalance(servs_init, servs_in, [])
            rebalance.result()
        except RebalanceFailedException:
            self.log.info("rebalance was failed as expected")
            self.assertTrue(ClusterOperationHelper._wait_warmup_completed(self, [warmup_node], \
                            self.default_bucket_name, wait_time=self.wait_timeout * 10))

            self.log.info("second attempt to rebalance")
            rebalance = self.cluster.async_rebalance(servs_init + servs_in, [], [])
            rebalance.result()
        self.verify_cluster_stats(self.servers[:self.nodes_in + self.nodes_init])
예제 #7
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        intial_severs = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            # When swapping all the nodes
            if self.num_swap is len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)
        SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached == 100 and not RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            self.log.info("Latest logs from UI:")
            for i in rest.get_logs(): self.log.error(i)
            self.fail("rebalance failed even before killing memcached")
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            for i in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    SwapRebalanceBase.sleep(self, 1)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        SwapRebalanceBase.sleep(self, 10)
        # we can't get stats for new node when rebalance falls
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
        i = 0
        # we expect that rebalance will be failed
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            SwapRebalanceBase.sleep(self, 30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(), msg="cluster need rebalance")
            knownNodes = rest.node_statuses();
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes], ejectedNodes=ejectedNodes)
            self.assertTrue(rest.monitorRebalance(),
                            msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        SwapRebalanceBase.verification_phase(self, master)
예제 #8
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        intial_severs = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(
            intial_severs,
            len(intial_severs) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            # When swapping all the nodes
            if self.num_swap is len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
        SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
            self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            rest.print_UI_logs()
            self.fail("rebalance failed even before killing memcached")
        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator and not self.cluster_run:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
        else:
            times = 2
            if self.cluster_run:
                times = 20
            for i in xrange(times):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except (EOFError, KeyError) as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    SwapRebalanceBase.sleep(self, 2)
        if pid is None:
            # sometimes pid is not returned by mc.stats()
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
            if pid is None:
                self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port,
                                                      killed))
        self.log.info("sleep for 10 sec after kill memcached")
        SwapRebalanceBase.sleep(self, 10)
        # we can't get stats for new node when rebalance falls
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master],
                                                          bucket,
                                                          wait_time=600)
        i = 0
        # we expect that rebalance will be failed
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            SwapRebalanceBase.sleep(self, 30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                             msg="cluster need rebalance")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([
                (node.ip, node.port) for node in knownNodes
            ]))
            ejectedNodes = list(
                set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                           ejectedNodes=ejectedNodes)
            SwapRebalanceBase.sleep(self, 10, "Wait for rebalance to start")
            self.assertTrue(
                rest.monitorRebalance(),
                msg="rebalance operation failed after adding node {0}".format(
                    toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        SwapRebalanceBase.verification_phase(self, master)
예제 #9
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        intial_severs = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(intial_severs, len(intial_severs) - 1)

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".\
            format(status, content))
            # When swapping all the nodes
            if self.num_swap is len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)

        # Rebalance is failed at 20%, 40% and 60% completion
        for i in [1, 2, 3]:
            expected_progress = 20 * i
            self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
            RestHelper(rest).rebalance_reached(expected_progress)
            bucket = rest.get_buckets()[0].name
            pid = None
            if self.swap_orchestrator:
                # get PID via remote connection if master is a new node
                shell = RemoteMachineShellConnection(master)
                o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
                pid = o[0]
                shell.disconnect()
            else:
                for i in xrange(2):
                    try:
                        _mc = MemcachedClientHelper.direct_client(master, bucket)
                        pid = _mc.stats()["pid"]
                        break
                    except EOFError as e:
                        self.log.error("{0}.Retry in 2 sec".format(e))
                        time.sleep(1)
            if pid is None:
                self.fail("impossible to get a PID")
            command = "os:cmd(\"kill -9 {0} \")".format(pid)
            self.log.info(command)
            killed = rest.diag_eval(command)
            self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
            self.log.info("sleep for 10 sec after kill memcached")
            time.sleep(10)
            # we can't get stats for new node when rebalance falls
            if not self.swap_orchestrator:
                ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
            i = 0
            #we expect that rebalance will be failed
            while rest._rebalance_progress_status() == "running" and i < 60:
                self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
                time.sleep(1)
                i += 1
            self.log.info("rebalance progress status:{0}".format(rest._rebalance_progress_status()))
            knownNodes = rest.node_statuses();
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                ejectedNodes=ejectedNodes)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))

        SwapRebalanceBase.verification_phase(self, master)