def online_upgrade_swap_rebalance(self):
    self._install(self.servers[:self.nodes_init])
    self.operations(self.servers[:self.nodes_init])
    self.sleep(self.sleep_time, "Pre-setup of old version is done. Wait for upgrade")
    self.initial_version = self.upgrade_versions[0]
    self.product = 'couchbase-server'
    self._install(self.servers[self.nodes_init:self.num_servers])
    self.sleep(self.sleep_time, "Installation of new version is done. Wait for rebalance")
    self.swap_num_servers = self.input.param('swap_num_servers', 1)
    old_servers = self.servers[:self.nodes_init]
    new_servers = []
    for i in range(self.nodes_init / self.swap_num_servers):
        servers_in = self.servers[(self.nodes_init + i * self.swap_num_servers):
                                  (self.nodes_init + (i + 1) * self.swap_num_servers)]
        servers_out = self.servers[(i * self.swap_num_servers):((i + 1) * self.swap_num_servers)]
        servers = old_servers + new_servers
        self.log.info("Swap rebalance: rebalance out %s old version nodes, "
                      "rebalance in %s 2.0 Nodes"
                      % (self.swap_num_servers, self.swap_num_servers))
        self.cluster.rebalance(servers, servers_in, servers_out)
        self.sleep(self.sleep_time)
        old_servers = self.servers[((i + 1) * self.swap_num_servers):self.nodes_init]
        new_servers = new_servers + servers_in
        servers = old_servers + new_servers
        status, content = ClusterOperationHelper.find_orchestrator(servers[0])
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        FIND_MASTER = False
        for new_server in new_servers:
            if content.find(new_server.ip) >= 0:
                self._new_master(new_server)
                FIND_MASTER = True
                self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
        if not FIND_MASTER:
            raise Exception("After rebalance in 2.0 nodes, 2.0 doesn't become the master")
    self.verification(self.servers[self.nodes_init:self.num_servers])
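
# Illustrative sketch, not part of the original suite: the loop above swaps
# the cluster's nodes in batches of swap_num_servers, walking servers_in
# forward through the new-version block of self.servers while servers_out
# walks through the old-version block.  A standalone version of just the
# index arithmetic (all names here are hypothetical):
def _swap_batches(servers, nodes_init, swap_num):
    """Yield (servers_in, servers_out) pairs, batch by batch."""
    for i in range(nodes_init // swap_num):
        servers_in = servers[nodes_init + i * swap_num:
                             nodes_init + (i + 1) * swap_num]
        servers_out = servers[i * swap_num:(i + 1) * swap_num]
        yield servers_in, servers_out

# e.g. with 4 old nodes, 4 new nodes and swap_num=2:
#   list(_swap_batches(list(range(8)), 4, 2))
#   -> [([4, 5], [0, 1]), ([6, 7], [2, 3])]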
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        optNodesIds[0] = content
    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
        self.assertTrue(rest.monitorRebalance(),
                        msg="failed after failover of {0}".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def online_upgrade(self):
    servers_in = self.servers[self.nodes_init:self.num_servers]
    self.cluster.rebalance(self.servers[:self.nodes_init], servers_in, [])
    self.log.info("Rebalance in all {0} nodes"
                  .format(self.input.param("upgrade_version", "")))
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(self.master)
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    FIND_MASTER = False
    for new_server in servers_in:
        if content.find(new_server.ip) >= 0:
            self._new_master(new_server)
            FIND_MASTER = True
            self.log.info("%s node %s becomes the master"
                          % (self.input.param("upgrade_version", ""), new_server.ip))
            break
    if self.input.param("initial_version", "")[:5] in COUCHBASE_VERSION_2 \
            and not FIND_MASTER and not self.is_downgrade:
        raise Exception("After rebalance in {0} nodes, {0} node doesn't become master"
                        .format(self.input.param("upgrade_version", "")))
    servers_out = self.servers[:self.nodes_init]
    self.log.info("Rebalanced out all old version nodes")
    self.cluster.rebalance(self.servers[:self.num_servers], [], servers_out)
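
# Illustrative sketch, not part of the original suite: almost every upgrade
# test in this file repeats the same FIND_MASTER loop (find the orchestrator,
# then scan a server list for its IP).  A hypothetical helper like the one
# below could replace that pattern; find_orchestrator and the server objects'
# .ip attribute come from the surrounding code, everything else is assumed
# for illustration.
def _orchestrator_among(content, candidates):
    """Return the first server whose IP appears in the orchestrator id
    `content`, or None if the orchestrator is not in `candidates`."""
    for server in candidates:
        if content.find(server.ip) >= 0:
            return server
    return None

# Possible usage inside the tests above (sketch):
#     status, content = ClusterOperationHelper.find_orchestrator(self.master)
#     self.assertTrue(status, msg="Unable to find orchestrator")
#     new_master = _orchestrator_among(content, servers_in)
#     if new_master is None:
#         raise Exception("no new-version node became the master")
#     self._new_master(new_master)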
def online_upgrade_rebalance_in_out(self):
    self._install(self.servers[:self.initial_num_servers])
    self.operations(multi_nodes=True)
    self.log.info("Installation of old version is done. "
                  "Wait for %s sec for upgrade" % (self.sleep_time))
    time.sleep(self.sleep_time)
    upgrade_version = self.input.param("upgrade_version", "2.0.0-1870-rel")
    self.initial_version = upgrade_version
    self.product = "couchbase-server"
    self._install(self.servers[self.initial_num_servers:self.num_servers])
    self.log.info("Installation of new version is done. "
                  "Wait for %s sec for rebalance" % (self.sleep_time))
    time.sleep(self.sleep_time)
    servers_in = self.servers[self.initial_num_servers:self.num_servers]
    self.cluster.rebalance(self.servers[:self.initial_num_servers], servers_in, [])
    self.log.info("Rebalance in all 2.0 Nodes")
    time.sleep(self.sleep_time)
    status, content = ClusterHelper.find_orchestrator(self.master)
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    FIND_MASTER = False
    for new_server in servers_in:
        if content.find(new_server.ip) >= 0:
            FIND_MASTER = True
            self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
    if not FIND_MASTER:
        raise Exception("After rebalance in 2.0 Nodes, 2.0 doesn't become the master")
    servers_out = self.servers[:self.initial_num_servers]
    self.cluster.rebalance(self.servers[:self.num_servers], [], servers_out)
    self.log.info("Rebalance out all old version nodes")
    time.sleep(self.sleep_time)
    self.verify_upgrade_rebalance_in_out()
def _online_upgrade(self, update_servers, extra_servers, check_newmaster=True):
    RestConnection(update_servers[0]).get_nodes_versions()
    added_versions = RestConnection(extra_servers[0]).get_nodes_versions()
    self.cluster.rebalance(update_servers + extra_servers, extra_servers, [])
    self.log.info("Rebalance in all {0} nodes completed".format(added_versions[0]))
    RestConnection(update_servers[0]).get_nodes_versions()
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(update_servers[0])
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    self.log.info("after rebalance in the master is {0}".format(content))
    if check_newmaster and not self.upgrade_same_version:
        FIND_MASTER = False
        for new_server in extra_servers:
            if content.find(new_server.ip) >= 0:
                FIND_MASTER = True
                self.log.info("{0} Node {1} becomes the master"
                              .format(added_versions[0], new_server.ip))
                break
        if not FIND_MASTER:
            raise Exception("After rebalance in {0} Nodes, "
                            "one of them doesn't become the master"
                            .format(added_versions[0]))
    self.log.info("Rebalanced out all old version nodes")
    self.cluster.rebalance(update_servers + extra_servers, [], update_servers)
    if self.upgrade_versions[0] >= "3.0.0":
        self._enable_xdcr_trace_logging(extra_servers)
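
# Illustrative sketch, not from the original suite.  The check
# `self.upgrade_versions[0] >= "3.0.0"` above compares version strings
# lexicographically, which happens to work for single-digit Couchbase
# versions but would mis-order e.g. "10.0.0" < "3.0.0".  A numeric
# comparison could look like this (assumes versions shaped like
# "2.5.1-1083-rel"; the helper name is hypothetical):
def _version_tuple(version):
    """'2.5.1-1083-rel' -> (2, 5, 1); ignores the build suffix."""
    release = version.split('-')[0]
    return tuple(int(part) for part in release.split('.'))

# _version_tuple("2.5.1-1083-rel") >= _version_tuple("3.0.0")  -> False
# _version_tuple("10.0.0")         >= _version_tuple("3.0.0")  -> True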
def online_upgrade_with_failover(self, services=None):
    servers_in = self.servers[self.nodes_init:self.num_servers]
    self.cluster.rebalance(self.servers[:self.nodes_init], servers_in, [],
                           services=services)
    log.info("Rebalance in all {0} nodes"
             .format(self.input.param("upgrade_version", "")))
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(self.master)
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    FIND_MASTER = False
    for new_server in servers_in:
        if content.find(new_server.ip) >= 0:
            self._new_master(new_server)
            FIND_MASTER = True
            self.log.info("%s node %s becomes the master"
                          % (self.input.param("upgrade_version", ""), new_server.ip))
            break
    if self.input.param("initial_version", "")[:5] in COUCHBASE_VERSION_2 \
            and not FIND_MASTER:
        raise Exception("After rebalance in {0} nodes, {0} node doesn't become master"
                        .format(self.input.param("upgrade_version", "")))
    servers_out = self.servers[:self.nodes_init]
    self._new_master(self.servers[self.nodes_init])
    log.info("failover and rebalance nodes")
    self.cluster.failover(self.servers[:self.num_servers],
                          failover_nodes=servers_out, graceful=False)
    self.cluster.rebalance(self.servers[:self.num_servers], [], servers_out)
    self.sleep(180)
def _failover_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    num_initial_servers = self.num_initial_servers
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        optNodesIds[0] = content
    self.log.info("FAILOVER PHASE")
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.fail_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)
def online_upgrade_swap_rebalance(self, services=None):
    servers_in = self.servers[self.nodes_init:self.num_servers]
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(self.master)
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    i = 0
    for server_in, service_in in zip(servers_in, services):
        log.info("Swap rebalance nodes")
        self.cluster.rebalance(self.servers[:self.nodes_init],
                               [server_in], [self.servers[i]], [service_in])
        self._new_master(self.servers[self.nodes_init])
        i += 1
def online_upgrade_swap_rebalance(self):
    self._install(self.servers[:self.initial_num_servers])
    self.operations(multi_nodes=True)
    self.log.info("Installation of old version is done. "
                  "Wait for %s sec for upgrade" % (self.sleep_time))
    time.sleep(self.sleep_time)
    upgrade_version = self.input.param('upgrade_version', '2.0.0-1870-rel')
    self.initial_version = upgrade_version
    self.product = 'couchbase-server'
    self._install(self.servers[self.initial_num_servers:self.num_servers])
    self.log.info("Installation of new version is done. "
                  "Wait for %s sec for rebalance" % (self.sleep_time))
    time.sleep(self.sleep_time)
    self.swap_num_servers = self.input.param('swap_num_servers', 1)
    old_servers = self.servers[:self.initial_num_servers]
    new_servers = []
    for i in range(self.initial_num_servers / self.swap_num_servers):
        servers_in = self.servers[(self.initial_num_servers + i * self.swap_num_servers):
                                  (self.initial_num_servers + (i + 1) * self.swap_num_servers)]
        servers_out = self.servers[(i * self.swap_num_servers):
                                   ((i + 1) * self.swap_num_servers)]
        servers = old_servers + new_servers
        self.cluster.rebalance(servers, servers_in, servers_out)
        self.log.info("Swap rebalance: rebalance out %s old version nodes, "
                      "rebalance in %s 2.0 Nodes"
                      % (self.swap_num_servers, self.swap_num_servers))
        time.sleep(self.sleep_time)
        old_servers = self.servers[((i + 1) * self.swap_num_servers):self.initial_num_servers]
        new_servers = new_servers + servers_in
        servers = old_servers + new_servers
        status, content = ClusterHelper.find_orchestrator(servers[0])
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        FIND_MASTER = False
        for new_server in new_servers:
            if content.find(new_server.ip) >= 0:
                FIND_MASTER = True
                self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
        if not FIND_MASTER:
            raise Exception("After rebalance in 2.0 nodes, 2.0 doesn't become the master")
    self.verify_upgrade_rebalance_in_out()
def online_upgrade_swap_rebalance(self):
    self._install(self.servers[:self.nodes_init])
    self.operations(self.servers[:self.nodes_init])
    self.initial_version = self.upgrade_versions[0]
    self.product = 'couchbase-server'
    self.sleep(self.sleep_time,
               "Pre-setup of old version is done. Wait for online upgrade to {0} version"
               .format(self.initial_version))
    self._install(self.servers[self.nodes_init:self.num_servers])
    self.sleep(self.sleep_time, "Installation of new version is done. Wait for rebalance")
    self.swap_num_servers = self.input.param('swap_num_servers', 1)
    old_servers = self.servers[:self.nodes_init]
    new_vb_nums = RestHelper(RestConnection(self.master))._get_vbuckets(
        old_servers, bucket_name=self.buckets[0].name)
    new_servers = []
    for i in range(self.nodes_init / self.swap_num_servers):
        old_vb_nums = copy.deepcopy(new_vb_nums)
        servers_in = self.servers[(self.nodes_init + i * self.swap_num_servers):
                                  (self.nodes_init + (i + 1) * self.swap_num_servers)]
        servers_out = self.servers[(i * self.swap_num_servers):
                                   ((i + 1) * self.swap_num_servers)]
        servers = old_servers + new_servers
        self.log.info("Swap rebalance: rebalance out %s old version nodes, "
                      "rebalance in %s 2.0 Nodes"
                      % (self.swap_num_servers, self.swap_num_servers))
        self.cluster.rebalance(servers, servers_in, servers_out)
        self.sleep(self.sleep_time)
        old_servers = self.servers[((i + 1) * self.swap_num_servers):self.nodes_init]
        new_servers = new_servers + servers_in
        servers = old_servers + new_servers
        new_vb_nums = RestHelper(RestConnection(self.master))._get_vbuckets(
            servers, bucket_name=self.buckets[0].name)
        self._verify_vbucket_nums_for_swap(old_vb_nums, new_vb_nums)
        status, content = ClusterOperationHelper.find_orchestrator(servers[0])
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        FIND_MASTER = False
        for new_server in new_servers:
            if content.find(new_server.ip) >= 0:
                self._new_master(new_server)
                FIND_MASTER = True
                self.log.info("3.0 Node %s becomes the master" % (new_server.ip))
        if not FIND_MASTER:
            raise Exception("After rebalance in 3.0 nodes, 3.0 doesn't become the master")
    # verify DCP upgrade in 3.0.0 version
    self.monitor_dcp_rebalance()
    self.verification(new_servers)
def _online_upgrade(self, update_servers, extra_servers, check_newmaster=True):
    self.cluster.rebalance(update_servers + extra_servers, extra_servers, [])
    self.log.info("Rebalance in all 2.0 Nodes")
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(update_servers[0])
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    if check_newmaster:
        FIND_MASTER = False
        for new_server in extra_servers:
            if content.find(new_server.ip) >= 0:
                FIND_MASTER = True
                self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
                break
        if not FIND_MASTER:
            raise Exception("After rebalance in 2.0 Nodes, 2.0 doesn't become the master")
    self.log.info("Rebalanced out all old version nodes")
    self.cluster.rebalance(update_servers + extra_servers, [], update_servers)
def online_upgrade(self):
    servers_in = self.servers[self.nodes_init:self.num_servers]
    self.cluster.rebalance(self.servers[:self.nodes_init], servers_in, [])
    self.log.info("Rebalance in all 2.0 Nodes")
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(self.master)
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    FIND_MASTER = False
    for new_server in servers_in:
        if content.find(new_server.ip) >= 0:
            self._new_master(new_server)
            FIND_MASTER = True
            self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
            break
    if not FIND_MASTER and not self.is_downgrade:
        raise Exception("After rebalance in 3.0 Nodes, 3.0 doesn't become the master")
    servers_out = self.servers[:self.nodes_init]
    self.log.info("Rebalanced out all old version nodes")
    self.cluster.rebalance(self.servers[:self.num_servers], [], servers_out)
def _online_upgrade(self, update_servers, extra_servers, check_newmaster=True):
    self.cluster.rebalance(update_servers + extra_servers, extra_servers, [])
    current_versions = RestConnection(update_servers[0]).get_nodes_versions()
    added_versions = RestConnection(extra_servers[0]).get_nodes_versions()
    self.log.info("Rebalance in all {0} nodes completed".format(added_versions[0]))
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(update_servers[0])
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    self.log.info("after rebalance in the master is {0}".format(content))
    if check_newmaster:
        FIND_MASTER = False
        for new_server in extra_servers:
            if content.find(new_server.ip) >= 0:
                FIND_MASTER = True
                self.log.info("{0} Node {1} becomes the master"
                              .format(added_versions[0], new_server.ip))
                break
        if not FIND_MASTER:
            raise Exception("After rebalance in {0} Nodes, "
                            "one of them doesn't become the master"
                            .format(added_versions[0]))
    self.log.info("Rebalanced out all old version nodes")
    self.cluster.rebalance(update_servers + extra_servers, [], update_servers)
def online_upgrade(self):
    servers_in = self.servers[self.nodes_init:self.num_servers]
    self.cluster.rebalance(self.servers[:self.nodes_init], servers_in, [])
    self.log.info("Rebalance in all 2.0 Nodes")
    self.sleep(self.sleep_time)
    status, content = ClusterOperationHelper.find_orchestrator(self.master)
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    FIND_MASTER = False
    for new_server in servers_in:
        if content.find(new_server.ip) >= 0:
            self._new_master(new_server)
            FIND_MASTER = True
            self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
            break
    if not FIND_MASTER:
        raise Exception("After rebalance in 2.0 Nodes, 2.0 doesn't become the master")
    servers_out = self.servers[:self.nodes_init]
    self.log.info("Rebalanced out all old version nodes")
    self.cluster.rebalance(self.servers[:self.num_servers], [], servers_out)
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    # Rebalance is failed at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)
        bucket = rest.get_buckets()[0].name
        pid = StatsCommon.get_stats([master], bucket, "", "pid")[master]
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}?? {2}".format(master.ip, master.port, killed))
        BaseTestCase._wait_warmup_completed(self, [master], bucket, wait_time=600)
        time.sleep(5)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(toBeEjectedNodes))
    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.log.info("DONE DATA ACCESS PHASE")
    # for bucket in rest.get_buckets():
    #     SwapRebalanceBase.verify_data(new_swap_servers[0],
    #                                   bucket_data[bucket.name].get('inserted_keys'),
    #                                   bucket.name, self)
    #     RebalanceHelper.wait_for_persistence(master, bucket.name)
    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
def test_capi_with_online_upgrade(self):
    self._install(self._input.servers[:self.src_init + self.dest_init])
    upgrade_version = self._input.param("upgrade_version", "5.0.0-1797")
    upgrade_nodes = self.src_cluster.get_nodes()
    extra_nodes = self._input.servers[self.src_init + self.dest_init:]
    repl_id = self._start_es_replication()
    rest_conn = RestConnection(self.src_master)
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')
    gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}', xrange(100),
                            start=0, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    self.perform_update_delete()
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'false')
    self._wait_for_es_replication_to_catchup()
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')
    RestConnection(upgrade_nodes[0]).get_nodes_versions()
    added_versions = RestConnection(extra_nodes[0]).get_nodes_versions()
    self.cluster.rebalance(upgrade_nodes + extra_nodes, extra_nodes, [])
    self.log.info("Rebalance in all {0} nodes completed".format(added_versions[0]))
    RestConnection(upgrade_nodes[0]).get_nodes_versions()
    self.sleep(15)
    status, content = ClusterOperationHelper.find_orchestrator(upgrade_nodes[0])
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    self.log.info("after rebalance in the master is {0}".format(content))
    find_master = False
    for new_server in extra_nodes:
        if content.find(new_server.ip) >= 0:
            find_master = True
            self.log.info("{0} Node {1} becomes the master"
                          .format(added_versions[0], new_server.ip))
            break
    if not find_master:
        raise Exception("After rebalance in {0} Nodes, "
                        "one of them doesn't become the master"
                        .format(added_versions[0]))
    self.log.info("Rebalancing out all old version nodes")
    self.cluster.rebalance(upgrade_nodes + extra_nodes, [], upgrade_nodes)
    self.src_master = self._input.servers[self.src_init + self.dest_init]
    self._install(self.src_cluster.get_nodes(), version=upgrade_version)
    upgrade_nodes = self._input.servers[self.src_init + self.dest_init:]
    extra_nodes = self.src_cluster.get_nodes()
    RestConnection(upgrade_nodes[0]).get_nodes_versions()
    added_versions = RestConnection(extra_nodes[0]).get_nodes_versions()
    self.cluster.rebalance(upgrade_nodes + extra_nodes, extra_nodes, [])
    self.log.info("Rebalance in all {0} nodes completed".format(added_versions[0]))
    RestConnection(upgrade_nodes[0]).get_nodes_versions()
    self.sleep(15)
    status, content = ClusterOperationHelper.find_orchestrator(upgrade_nodes[0])
    self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                    .format(status, content))
    self.log.info("after rebalance in the master is {0}".format(content))
    self.log.info("Rebalancing out all old version nodes")
    self.cluster.rebalance(upgrade_nodes + extra_nodes, [], upgrade_nodes)
    self.src_master = self._input.servers[0]
    self.log.info("######### Upgrade of CB cluster completed ##########")
    rest_conn = RestConnection(self.src_master)
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')
    gen = DocumentGenerator('es', '{{"key":"value"}}', xrange(100),
                            start=0, end=self._num_items)
    self.src_cluster.load_all_buckets_from_generator(gen)
    self.perform_update_delete()
    rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'false')
    self._wait_for_es_replication_to_catchup()
    self._verify_es_results()
def test_failover_continuous_bidirectional_sets_deletes(self):
    cluster_ref_a = "cluster_ref_a"
    master_a = self._input.clusters.get(0)[0]
    rest_conn_a = RestConnection(master_a)
    cluster_ref_b = "cluster_ref_b"
    master_b = self._input.clusters.get(1)[0]
    rest_conn_b = RestConnection(master_b)
    # Rebalance all the nodes together
    servers_a = self._input.clusters.get(0)
    servers_b = self._input.clusters.get(1)
    rebalanced_servers_a = []
    rebalanced_servers_b = []
    RebalanceHelper.rebalance_in(servers_a, len(servers_a) - 1)
    RebalanceHelper.rebalance_in(servers_b, len(servers_b) - 1)
    rebalanced_servers_a.extend(servers_a)
    rebalanced_servers_b.extend(servers_b)
    # Setup bi-directional continuous replication
    replication_type = "continuous"
    rest_conn_a.add_remote_cluster(master_b.ip, master_b.port,
                                   master_b.rest_username, master_b.rest_password,
                                   cluster_ref_b)
    rest_conn_b.add_remote_cluster(master_a.ip, master_a.port,
                                   master_a.rest_username, master_a.rest_password,
                                   cluster_ref_a)
    (rep_database_a, rep_id_a) = rest_conn_a.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_b)
    (rep_database_b, rep_id_b) = rest_conn_b.start_replication(replication_type,
                                                               self._buckets[0],
                                                               cluster_ref_a)
    load_thread_list = []
    # Start load
    kvstore = ClientKeyValueStore()
    self._params["ops"] = "set"
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread.start()
    load_thread.join()
    RebalanceHelper.wait_for_persistence(master_a, self._buckets[0])
    # Do some deletes
    self._params["ops"] = "delete"
    self._params["count"] = self._num_items / 5
    task_def = RebalanceDataGenerator.create_loading_tasks(self._params)
    load_thread = RebalanceDataGenerator.start_load(rest_conn_a, self._buckets[0],
                                                    task_def, kvstore)
    load_thread_list.append(load_thread)
    # Start all loads concurrently
    for lt in load_thread_list:
        lt.start()
    # Do the failover of nodes on both clusters
    self.log.info("Failing over nodes")
    self.log.info("current nodes on cluster 1: {0}".format(
        RebalanceHelper.getOtpNodeIds(master_a)))
    self.log.info("current nodes on cluster 2: {0}".format(
        RebalanceHelper.getOtpNodeIds(master_b)))
    # Find nodes to be failed over
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master_a,
                                                  howmany=self._failover_factor)
    optNodesIds_a = [node.id for node in toBeEjectedNodes]
    if self._fail_orchestrator_a:
        status, content = ClusterOperationHelper.find_orchestrator(master_a)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        optNodesIds_a[0] = content
        master_a = self._input.clusters.get(0)[-1]
        rest_conn_a = RestConnection(master_a)
    # Failover selected nodes
    for node in optNodesIds_a:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest_conn_a.fail_over(node)
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master_b,
                                                  howmany=self._failover_factor)
    optNodesIds_b = [node.id for node in toBeEjectedNodes]
    if self._fail_orchestrator_b:
        status, content = ClusterOperationHelper.find_orchestrator(master_b)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        optNodesIds_b[0] = content
        master_b = self._input.clusters.get(1)[-1]
        rest_conn_b = RestConnection(master_b)
    self._state.append((rest_conn_a, cluster_ref_b, rep_database_a, rep_id_a))
    self._state.append((rest_conn_b, cluster_ref_a, rep_database_b, rep_id_b))
    # Failover selected nodes
    for node in optNodesIds_b:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest_conn_b.fail_over(node)
    rest_conn_a.rebalance(otpNodes=[node.id for node in rest_conn_a.node_statuses()],
                          ejectedNodes=optNodesIds_a)
    rest_conn_b.rebalance(otpNodes=[node.id for node in rest_conn_b.node_statuses()],
                          ejectedNodes=optNodesIds_b)
    self.assertTrue(rest_conn_a.monitorRebalance(),
                    msg="rebalance operation failed after adding node on cluster 1")
    self.assertTrue(rest_conn_b.monitorRebalance(),
                    msg="rebalance operation failed after adding node on cluster 2")
    # Wait for loading threads to finish
    for lt in load_thread_list:
        lt.join()
    self.log.info("All loading threads finished")
    # Verify replication
    self.assertTrue(XDCRBaseTest.verify_replicated_data(rest_conn_b,
                                                        self._buckets[0],
                                                        kvstore,
                                                        self._poll_sleep,
                                                        self._poll_timeout),
                    "Verification of replicated data failed")
    self.assertTrue(XDCRBaseTest.verify_replicated_revs(rest_conn_a,
                                                        rest_conn_b,
                                                        self._buckets[0],
                                                        self._poll_sleep,
                                                        self._poll_timeout),
                    "Verification of replicated revisions failed")
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached == 100 and not RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        self.log.info("Latest logs from UI:")
        for i in rest.get_logs():
            self.log.error(i)
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
        pid = o[0]
        shell.disconnect()
    else:
        for i in xrange(2):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except EOFError as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 1)
    if pid is None:
        self.fail("impossible to get a PID")
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2}".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for the new node when the rebalance fails
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket,
                                                      wait_time=600)
    i = 0
    # we expect that the rebalance will fail
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster needs rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}"
                        .format(toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    SwapRebalanceBase.sleep(self, 10, "Rebalance should start")
    self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(self.percentage_progress))
    reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
    if reached and RestHelper(rest).is_cluster_rebalanced():
        # handle situation when rebalance failed at the beginning
        self.log.error('seems rebalance failed!')
        rest.print_UI_logs()
        self.fail("rebalance failed even before killing memcached")
    bucket = rest.get_buckets()[0].name
    pid = None
    if self.swap_orchestrator and not self.cluster_run:
        # get PID via remote connection if master is a new node
        shell = RemoteMachineShellConnection(master)
        pid = shell.get_memcache_pid()
        shell.disconnect()
    else:
        times = 2
        if self.cluster_run:
            times = 20
        for i in xrange(times):
            try:
                _mc = MemcachedClientHelper.direct_client(master, bucket)
                pid = _mc.stats()["pid"]
                break
            except (EOFError, KeyError) as e:
                self.log.error("{0}.Retry in 2 sec".format(e))
                SwapRebalanceBase.sleep(self, 2)
        if pid is None:
            # sometimes pid is not returned by mc.stats()
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
    if pid is None:
        self.fail("impossible to get a PID")
    command = "os:cmd(\"kill -9 {0} \")".format(pid)
    self.log.info(command)
    killed = rest.diag_eval(command)
    self.log.info("killed {0}:{1}?? {2}".format(master.ip, master.port, killed))
    self.log.info("sleep for 10 sec after kill memcached")
    SwapRebalanceBase.sleep(self, 10)
    # we can't get stats for the new node when the rebalance fails
    if not self.swap_orchestrator:
        ClusterOperationHelper._wait_warmup_completed(self, [master], bucket,
                                                      wait_time=600)
    i = 0
    # we expect that the rebalance will fail
    try:
        rest.monitorRebalance()
    except RebalanceFailedException:
        # retry rebalance if it failed
        self.log.warn("Rebalance failed but it's expected")
        SwapRebalanceBase.sleep(self, 30)
        self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                         msg="cluster needs rebalance")
        knownNodes = rest.node_statuses()
        self.log.info("nodes are still in cluster: {0}".format(
            [(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)
        SwapRebalanceBase.sleep(self, 10, "Wait for rebalance to start")
        self.assertTrue(rest.monitorRebalance(),
                        msg="rebalance operation failed after adding node {0}"
                        .format(toBeEjectedNodes))
    else:
        self.log.info("rebalance completed successfully")
    SwapRebalanceBase.verification_phase(self, master)
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers,
                                                              len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%"
                          .format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    SwapRebalanceBase.sleep(self, 20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    SwapRebalanceBase.sleep(self, 1)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(optNodesIds))
    SwapRebalanceBase.verification_phase(self, master)
def _add_back_failed_node(self, do_node_cleanup=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers,
                                                              len(self.servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    # List of servers that will not be failed over
    not_failed_over = []
    for server in self.servers:
        if self.cluster_run:
            if server.port not in [node.port for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node {0}:{1} not failed over".format(server.ip, server.port))
        else:
            if server.ip not in [node.ip for node in toBeEjectedNodes]:
                not_failed_over.append(server)
                self.log.info("Node {0}:{1} not failed over".format(server.ip, server.port))
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
        master = not_failed_over[-1]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(optNodesIds))
    # Add back the same failed over nodes
    # Cleanup the node, somehow
    # TODO: cluster_run?
    if do_node_cleanup:
        pass
    # Make rest connection with node part of cluster
    rest = RestConnection(master)
    # Given the optNode, find ip
    add_back_servers = []
    nodes = rest.get_nodes()
    for server in nodes:
        if isinstance(server.ip, unicode):
            add_back_servers.append(server)
    final_add_back_servers = []
    for server in self.servers:
        if self.cluster_run:
            if server.port not in [serv.port for serv in add_back_servers]:
                final_add_back_servers.append(server)
        else:
            if server.ip not in [serv.ip for serv in add_back_servers]:
                final_add_back_servers.append(server)
    for server in final_add_back_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(add_back_servers))
    SwapRebalanceBase.verification_phase(self, master)
def _add_back_failed_node(self, do_node_cleanup=False):
    master = self.servers[0]
    rest = RestConnection(master)
    creds = self.input.membase_settings
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    # Cluster all servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(self.servers,
                                                              len(self.servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")
    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    # List of servers that will not be failed over
    not_failed_over = []
    for server in self.servers:
        if server.ip not in [node.ip for node in toBeEjectedNodes]:
            not_failed_over.append(server)
            self.log.info("Node %s not failed over" % server.ip)
    if self.fail_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
        master = not_failed_over[-1]
    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)
    # Failover selected nodes
    for node in optNodesIds:
        self.log.info("failover node {0} and rebalance afterwards".format(node))
        rest.fail_over(node)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(optNodesIds))
    # Add back the same failed over nodes
    # Cleanup the node, somehow
    # TODO: cluster_run?
    if do_node_cleanup:
        pass
    # Make rest connection with node part of cluster
    rest = RestConnection(master)
    # Given the optNode, find ip
    add_back_servers = []
    nodes = rest.get_nodes()
    for server in [node.ip for node in nodes]:
        if isinstance(server, unicode):
            add_back_servers.append(server)
    final_add_back_servers = []
    for server in self.servers:
        if server.ip not in add_back_servers:
            final_add_back_servers.append(server)
    for server in final_add_back_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=[])
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(add_back_servers))
    SwapRebalanceBase.verification_phase(self, master)
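
# Illustrative sketch, not from the original suite: failing over a list of
# otpNode ids and then rebalancing them out is a sequence repeated in the
# failover tests above.  `rest` is a RestConnection; fail_over,
# node_statuses, rebalance and monitorRebalance are the calls those tests
# already use, while the helper itself is hypothetical.
def _failover_and_eject(rest, node_ids, logger):
    """Fail over every node in node_ids, then rebalance them out.
    Returns True if the rebalance completed."""
    for node_id in node_ids:
        logger.info("failover node {0} and rebalance afterwards".format(node_id))
        rest.fail_over(node_id)
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=node_ids)
    return rest.monitorRebalance()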
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]
    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)
    self.log.info("DATA LOAD PHASE")
    loaders = SwapRebalanceBase.start_load_phase(self, master)
    # Wait till load phase is over
    SwapRebalanceBase.stop_load(loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")
    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}"
                        .format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content
    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))
    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))
    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]
    self.log.info("DATA ACCESS PHASE")
    loaders = SwapRebalanceBase.start_access_phase(self, master)
    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)
    # Rebalance is failed at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        command = "[erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)]."
        memcached_restarted = rest.diag_eval(command)
        self.assertTrue(memcached_restarted,
                        "unable to restart memcached/moxi process through diag/eval")
        time.sleep(20)
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
    # Stop loaders
    SwapRebalanceBase.stop_load(loaders)
    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after adding node {0}"
                    .format(toBeEjectedNodes))
    self.log.info("DONE DATA ACCESS PHASE")
    # for bucket in rest.get_buckets():
    #     SwapRebalanceBase.verify_data(new_swap_servers[0],
    #                                   bucket_data[bucket.name].get('inserted_keys'),
    #                                   bucket.name, self)
    #     RebalanceHelper.wait_for_persistence(master, bucket.name)
    self.log.info("VERIFICATION PHASE")
    SwapRebalanceBase.items_verification(master, self)
def _common_test_body_swap_rebalance(self, do_stop_start=False):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
    self.assertTrue(status, msg="Rebalance failed")

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]

    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip, server.port)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    if self.do_access:
        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    if do_stop_start:
        # Rebalance is stopped at 20%, 40% and 60% completion
        retry = 0
        for expected_progress in (20, 40, 60):
            self.log.info("STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".format(expected_progress))
            while True:
                progress = rest._rebalance_progress()
                if progress < 0:
                    self.log.error("rebalance progress code : {0}".format(progress))
                    break
                elif progress == 100:
                    self.log.warn("Rebalance has already reached 100%")
                    break
                elif progress >= expected_progress:
                    self.log.info("Rebalance will be stopped with {0}%".format(progress))
                    stopped = rest.stop_rebalance()
                    self.assertTrue(stopped, msg="unable to stop rebalance")
                    SwapRebalanceBase.sleep(self, 20)
                    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                                   ejectedNodes=optNodesIds)
                    break
                elif retry > 100:
                    break
                else:
                    retry += 1
                    SwapRebalanceBase.sleep(self, 1)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after swapping in nodes {0}".format(optNodesIds))
    SwapRebalanceBase.verification_phase(self, master)
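# A minimal sketch of the polling pattern in the stop/start branch above:
# wait until rebalance progress reaches a threshold, then stop it. It
# assumes `get_progress` behaves like rest._rebalance_progress (negative
# on error, 100 on completion) and `stop_rebalance` like
# rest.stop_rebalance; stop_rebalance_at itself is illustrative, not a
# testrunner helper.
import time

def stop_rebalance_at(get_progress, stop_rebalance, expected_progress,
                      retries=100, interval=1):
    for _ in range(retries):
        progress = get_progress()
        if progress < 0 or progress == 100:
            return False              # error, or nothing left to stop
        if progress >= expected_progress:
            return stop_rebalance()   # threshold reached; stop now
        time.sleep(interval)
    return False                      # never reached the threshold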
def _common_test_body_failed_swap_rebalance(self):
    master = self.servers[0]
    rest = RestConnection(master)
    num_initial_servers = self.num_initial_servers
    creds = self.input.membase_settings
    initial_servers = self.servers[:num_initial_servers]

    self.log.info("CREATE BUCKET PHASE")
    SwapRebalanceBase.create_buckets(self)

    # Cluster all starting set of servers
    self.log.info("INITIAL REBALANCE PHASE")
    RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

    self.log.info("DATA LOAD PHASE")
    self.loaders = SwapRebalanceBase.start_load_phase(self, master)

    # Wait till load phase is over
    SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
    self.log.info("DONE LOAD PHASE")

    # Start the swap rebalance
    current_nodes = RebalanceHelper.getOtpNodeIds(master)
    self.log.info("current nodes : {0}".format(current_nodes))
    toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
    optNodesIds = [node.id for node in toBeEjectedNodes]
    if self.swap_orchestrator:
        status, content = ClusterOperationHelper.find_orchestrator(master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        # When swapping all the nodes
        if self.num_swap == len(current_nodes):
            optNodesIds.append(content)
        else:
            optNodesIds[0] = content

    for node in optNodesIds:
        self.log.info("removing node {0} and rebalance afterwards".format(node))

    new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
    for server in new_swap_servers:
        otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
        msg = "unable to add node {0} to the cluster"
        self.assertTrue(otpNode, msg.format(server.ip))

    if self.swap_orchestrator:
        rest = RestConnection(new_swap_servers[0])
        master = new_swap_servers[0]

    self.log.info("DATA ACCESS PHASE")
    self.loaders = SwapRebalanceBase.start_access_phase(self, master)

    self.log.info("SWAP REBALANCE PHASE")
    rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                   ejectedNodes=optNodesIds)

    # Rebalance is failed at 20%, 40% and 60% completion
    for i in [1, 2, 3]:
        expected_progress = 20 * i
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
        RestHelper(rest).rebalance_reached(expected_progress)

        bucket = rest.get_buckets()[0].name
        pid = None
        if self.swap_orchestrator:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
            pid = o[0]
            shell.disconnect()
        else:
            for _ in xrange(2):
                try:
                    _mc = MemcachedClientHelper.direct_client(master, bucket)
                    pid = _mc.stats()["pid"]
                    break
                except EOFError as e:
                    self.log.error("{0}. Retry in 2 sec".format(e))
                    time.sleep(2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}? {2}".format(master.ip, master.port, killed))
        self.log.info("sleep for 10 sec after kill memcached")
        time.sleep(10)

        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)

        # we expect that the rebalance will fail
        i = 0
        while rest._rebalance_progress_status() == "running" and i < 60:
            self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
            time.sleep(1)
            i += 1
        self.log.info("rebalance progress status: {0}".format(rest._rebalance_progress_status()))

        knownNodes = rest.node_statuses()
        self.log.info("nodes still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
        ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
        rest.rebalance(otpNodes=[node.id for node in knownNodes],
                       ejectedNodes=ejectedNodes)

    self.assertTrue(rest.monitorRebalance(),
                    msg="rebalance operation failed after swapping in nodes {0}".format(new_swap_servers))
    SwapRebalanceBase.verification_phase(self, master)