def test_simple_ui_request(self):
    """GET the query base URL, verify the HTTP status, then re-fetch and
    scan the response for unexpected 'web request failed' entries.

    Failures are accumulated in `passed` and reported once at the end.
    """
    rest = RestConnection(self.master)
    passed = True
    self.log.info("GET " + rest.query_baseUrl)
    status, content, header = rest._http_request(rest.query_baseUrl)
    self.log.info(header)
    if not status:
        self.log.info("wrong status for {0} GET request {1}".format(
            rest.query_baseUrl, header))
        passed = False
    self.assertTrue(passed, msg="some GET requests failed. See logs above")
    _, content, _ = rest._http_request(rest.query_baseUrl)
    occurrences = [
        m.start() for m in re.finditer('web request failed', content)
    ]
    for occurrence in occurrences:
        subcontent = content[occurrence - 1000:occurrence + 1000]
        if 'path,"/diag"' in subcontent:
            # a /diag-related failure is tolerated; one such hit exonerates
            # the log (preserves the original for/else intent)
            break
    else:
        # Fix: the original else-branch also ran when `occurrences` was
        # empty, wrongly failing the test and raising NameError on
        # `subcontent`.  Only flag a failure when matches existed and none
        # was the tolerated /diag case.
        if occurrences:
            passed = False
            self.log.info(subcontent)
    self.assertTrue(
        passed, "some web request failed in the server logs. See logs above")
def delete_task(self, state, repo_id, task_category, task_name):
    """ Delete a task """
    # only these two task categories exist in the backup service API
    assert(task_category in ['one-off', 'scheduled'])
    endpoint = "_p/backup/internal/v1/cluster/self/repository/{}/{}/task/{}/{}".format(
        state, repo_id, task_category, task_name)
    rest = RestConnection(self.server)
    status, content, header = rest._http_request(rest.baseUrl + endpoint,
                                                 'DELETE')
def _retrieve_user_roles(self):
    """Fetch the RBAC user list via GET /settings/rbac/users and return
    the raw (status, content, header) triple."""
    rest = RestConnection(self.master_ip)
    api = rest.baseUrl + "/settings/rbac/users"
    status, content, header = rest._http_request(api, 'GET')
    log.info(" Retrieve User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def get_index_storage_stats(self, index_node=None, timeout=120):
    """Poll <index baseURL>/stats/storage (up to 11 attempts) and return a
    {bucket: {index_name: stats}} mapping.

    Raises Exception with the response content on a non-OK status, or with
    a descriptive message when the endpoint never returns a body.
    """
    if index_node is None:
        index_node = self.index_node
    api = self.get_index_baseURL() + 'stats/storage'
    self.log.info("api is:" + str(api))
    content = None
    counter = 0
    while content is None:
        rest_client = RestConnection(index_node)
        status, content, header = rest_client._http_request(
            api, timeout=timeout)
        if not status:
            raise Exception(content)
        counter += 1
        if counter > 10:
            break
    # Fix: the original fell through to json.loads(None) (TypeError) when
    # every attempt returned an empty body; fail with a clear message.
    if content is None:
        raise Exception(
            "no storage stats returned from {0} after {1} attempts".format(
                api, counter))
    json_parsed = json.loads(content)
    index_storage_stats = {}
    for index_stats in json_parsed:
        # "Index" is formatted as "<bucket>:...:<index_name>"
        bucket = index_stats["Index"].split(":")[0]
        index_name = index_stats["Index"].split(":")[-1]
        if bucket not in list(index_storage_stats.keys()):
            index_storage_stats[bucket] = {}
        index_storage_stats[bucket][index_name] = index_stats["Stats"]
    return index_storage_stats
def test_simple_ui_request(self):
    """Smoke-test a battery of ns_server GET endpoints; collect failures
    into `passed` (asserted by the caller/continuation of this test).

    Fix: `except IncompleteRead, e` is Python-2-only syntax; `as e` is
    valid on both Python 2.6+ and Python 3 and matches the sibling
    Python 3 variant of this test elsewhere in the codebase.
    """
    rest = RestConnection(self.master)
    passed = True
    for api in ["", "versions", "pools", "pools/default", "pools/nodes",
                "pools/default/overviewStats", "pools/default/buckets",
                "pools/default/buckets/@query/stats",
                "pools/default/nodeServices",
                "pools/default/remoteClusters", "pools/default/serverGroups",
                "pools/default/certificate",
                "pools/default/settings/memcached/global", "nodeStatuses",
                "logs", "settings/web", "settings/alerts", "settings/stats",
                "settings/autoFailover", "settings/maxParallelIndexers",
                "settings/viewUpdateDaemon", "settings/autoCompaction",
                "settings/replications", "settings/replications",
                "settings/saslauthdAuth", "settings/audit",
                "internalSettings", "nodes/self/xdcrSSLPorts", "indexStatus",
                #"diag/vbuckets", MB-15080
                "settings/indexes", "diag",
                #"diag/ale", MB-15080
                "pools/default/rebalanceProgress", "pools/default/tasks",
                "index.html", "sasl_logs", "couchBase", "sampleBuckets"]:
        url = rest.baseUrl + api
        self.log.info("GET " + url)
        try:
            status, content, header = rest._http_request(url)
        except IncompleteRead as e:
            self.log.warn("size of partial responce {0} api is {1} bytes".format(api, sys.getsizeof(e.partial)))
            if api != "diag":
                #otherwise for /diag API we should increase request time for dynamic data in _http_request
                passed = False
            continue
        self.log.info(header)
        if not self.is_linux and api == "settings/saslauthdAuth":
            #This http API endpoint is only supported in enterprise edition running on GNU/Linux
            continue
        if not status:
            self.log.info("wrong status for {0} GET request {1}".format(url, header))
            passed = False
def is_backup_service_running(self):
    """ Returns true if the backup service is running. """
    rest = RestConnection(self.server)
    # _http_request returns (status, content, header); [1] is the body
    body = rest._http_request(rest.baseUrl + "pools/default/nodeServices")[1]
    services = json.loads(body)['nodesExt'][0]['services']
    return 'backupAPI' in services.keys()
def _retrieve_user_roles(self):
    """Issue GET /settings/rbac/users against the master node and return
    (status, content, header)."""
    url = "/settings/rbac/users"
    rest = RestConnection(self.master_ip)
    response = rest._http_request(rest.baseUrl + url, 'GET')
    status, content, header = response
    log.info(" Retrieve User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def upload_client_cert_settings(self, server=None):
    """ Upload client cert settings(that was initialized in init function)
    to CB server.

    POSTs the JSON file as an octet-stream to settings/clientCertAuth,
    retrying up to 4 times; returns the response content on success and
    raises Exception with the last response content after 4 failures.
    """
    if server is None:
        server = self.host
    with open(x509main.CACERTFILEPATH + x509main.CLIENT_CERT_AUTH_JSON,
              'rb') as fh:
        data = fh.read()
    self.log.info("Client cert to be Uploaded -- {0}".format(data))
    rest = RestConnection(server)
    # Fix: base64.encodestring() was deprecated and removed in Python 3.9,
    # and it inserts a line break every 76 chars -- for long credentials the
    # embedded newline would corrupt the Authorization header even after
    # rstrip.  b64encode emits no newlines, so no stripping is needed.
    authorization = base64.b64encode(
        ('%s:%s' % (rest.username, rest.password)).encode()).decode()
    headers = {'Content-Type': 'application/octet-stream',
               'Authorization': 'Basic %s' % authorization,
               'Accept': '*/*'}
    api = rest.baseUrl + "settings/clientCertAuth"
    tries = 0
    while tries < 4:
        status, content, response = rest._http_request(
            api, method='POST', params=data, headers=headers, timeout=300)
        if status:
            return content
        tries = tries + 1
    # all 4 attempts failed
    raise Exception(content)
class NsServerIngress(UserResourceTask):
    """ Produces throughput for ingress_mib_per_min """

    def __init__(self, user, node):
        super(NsServerIngress, self).__init__(user, node)
        self.rest = RestConnection(node)
        self.threads = 10
        # one ingress worker per thread slot; start them all immediately
        self.workers = [IngressThroughputWorker(self.node)
                        for _ in range(self.threads)]
        for worker in self.workers:
            worker.start()

    def on_throughput_update(self, throughput):
        """Distribute the target throughput evenly across workers; a
        target of 0 stops every worker."""
        share = throughput / self.threads
        for worker in self.workers:
            worker.throughput.set(share)
        if throughput == 0:
            for worker in self.workers:
                worker.stop()

    on_throughput_increase = on_throughput_update
    on_throughput_decrease = on_throughput_update

    def get_throughput_success(self):
        """Total successful throughput across all workers."""
        total = 0
        for worker in self.workers:
            total += worker.throughput_success.get()
        return total

    def error(self):
        # returns the response body of GET /pools/default
        # NOTE(review): baseUrl likely already ends with '/', producing
        # '//pools/default' -- presumably tolerated by the server; confirm.
        return self.rest._http_request(self.rest.baseUrl + "/pools/default")[1]

    def expected_error(self):
        return 'Limit(s) exceeded [ingress]'
def test_simple_ui_request(self):
    """Smoke-test a battery of ns_server GET endpoints, then scan the
    sasl logs for unexpected 'web request failed' entries."""
    rest = RestConnection(self.master)
    passed = True
    for api in ["", "versions", "pools", "pools/default", "pools/nodes",
                "pools/default/overviewStats", "pools/default/buckets",
                "pools/default/buckets/@query/stats",
                "pools/default/nodeServices",
                "pools/default/remoteClusters", "pools/default/serverGroups",
                "pools/default/certificate",
                "pools/default/settings/memcached/global", "nodeStatuses",
                "logs", "settings/web", "settings/alerts", "settings/stats",
                "settings/autoFailover", "settings/maxParallelIndexers",
                "settings/viewUpdateDaemon", "settings/autoCompaction",
                "settings/replications", "settings/replications",
                "settings/saslauthdAuth", "settings/audit",
                "internalSettings", "nodes/self/xdcrSSLPorts", "indexStatus",
                #"diag/vbuckets", MB-15080
                "settings/indexes", "diag",
                #"diag/ale", MB-15080
                "pools/default/rebalanceProgress", "pools/default/tasks",
                "index.html", "sasl_logs", "couchBase", "sampleBuckets"]:
        url = rest.baseUrl + api
        self.log.info("GET " + url)
        try:
            status, content, header = rest._http_request(url)
        except IncompleteRead as e:
            self.log.warning("size of partial responce {0} api is {1} bytes".format(api, sys.getsizeof(e.partial)))
            if api != "diag":
                #otherwise for /diag API we should increase request time for dynamic data in _http_request
                passed = False
            continue
        self.log.info(header)
        if not self.is_linux and api == "settings/saslauthdAuth":
            #This http API endpoint is only supported in enterprise edition running on GNU/Linux
            continue
        if not status:
            self.log.info("wrong status for {0} GET request {1}".format(url, header))
            passed = False
    self.assertTrue(passed, msg="some GET requests failed. See logs above")
    _, content, _ = rest._http_request(rest.baseUrl + "sasl_logs")
    occurrences = [m.start() for m in re.finditer('web request failed', str(content))]
    for occurrence in occurrences:
        subcontent = content[occurrence - 1000: occurrence + 1000]
        if 'path,"/diag"' in str(subcontent):
            # /diag failures are a known/acceptable case
            break
    else:
        # Fix: the original else-branch also ran when `occurrences` was
        # empty, wrongly failing the test and raising NameError on
        # `subcontent`.  Only flag a failure when matches existed and none
        # was the tolerated /diag case.
        if occurrences:
            passed = False
            self.log.info(subcontent)
    self.assertTrue(passed, "some web request failed in the server logs. See logs above")
def _retrive_all_user_role(self, user_list=None):
    """GET /settings/rbac/roles; `user_list` is accepted but unused."""
    rest = RestConnection(self.master_ip)
    api = rest.baseUrl + "/settings/rbac/roles"
    status, content, header = rest._http_request(api, 'GET')
    log.info(" Retrieve all User roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def _retrive_all_user_role(self, user_list=None):
    """Retrieve every defined RBAC role from the cluster.

    `user_list` is accepted for signature compatibility but unused.
    """
    server = self.master_ip
    rest = RestConnection(server)
    response = rest._http_request(rest.baseUrl + "/settings/rbac/roles",
                                  'GET')
    status, content, header = response
    log.info(" Retrieve all User roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def _delete_user(self, user_name): rest = RestConnection(self.master_ip) url = "/settings/rbac/users/" + user_name api = rest.baseUrl + url status, content, header = rest._http_request(api, 'DELETE') print status print content print header return status, content, header
def _retrieve_user_roles(self): rest = RestConnection(self.master_ip) url = "/settings/rbac/users" api = rest.baseUrl + url status, content, header = rest._http_request(api, 'GET') print status print content print header return status, content, header
def _delete_user(self,user_name): rest = RestConnection(self.master_ip) url = "/settings/rbac/users/" + user_name api = rest.baseUrl + url status, content, header = rest._http_request(api, 'DELETE') print status print content print header return status, content, header
def enable_ldap(self):
    """POST settings/saslauthdAuth to enable saslauthd-based LDAP auth
    with empty admin lists."""
    rest = RestConnection(self.master)
    payload = urllib.urlencode({
        "enabled": 'true',
        "admins": [],
        "roAdmins": []
    })
    api = rest.baseUrl + 'settings/saslauthdAuth'
    status, content, header = rest._http_request(api, 'POST', payload)
def _retrieve_user_roles(self): rest = RestConnection(self.master_ip) url = "/settings/rbac/users" api = rest.baseUrl + url status, content, header = rest._http_request(api, 'GET') print status print content print header return status, content, header
def set_metadata_purge_interval(self, interval=0.04):
    """POST controller/setAutoCompaction with the given purge interval.

    Defaults to 0.04 days (about one hour) when no interval is given.
    Returns the raw (status, content, header) triple.
    """
    rest = RestConnection(self.cluster.master)
    api = rest.baseUrl + "controller/setAutoCompaction"
    body = urllib.urlencode({
        "purgeInterval": interval,
        "parallelDBAndViewCompaction": "false",
    })
    return rest._http_request(api, "POST", body)
def set_index_settings(self, setting_json, index_node):
    """POST the given settings dict to the indexer on index_node; raise
    Exception with the response content on failure."""
    plasma_obj = PlasmaStatsUtil(index_node, server_task=self.task)
    api = plasma_obj.get_index_baseURL() + 'settings'
    rest_client = RestConnection(index_node)
    response = rest_client._http_request(api, 'POST',
                                         json.dumps(setting_json))
    status, content, header = response
    if not status:
        raise Exception(content)
    self.log.info("{0} set".format(setting_json))
def _set_user_roles(self, user_name, payload):
    """PUT the given role payload for `user_name`; the endpoint depends on
    the configured auth_type.

    Raises ValueError for an unrecognised auth_type.  (Fix: the original
    left `url` unbound in that case and died with a NameError.)
    """
    rest = RestConnection(self.master_ip)
    if self.auth_type == "ldap" or self.auth_type == "pam":
        url = "settings/rbac/users/" + user_name
    elif self.auth_type == 'builtin':
        url = "settings/rbac/users/local/" + user_name
    else:
        raise ValueError(
            "unsupported auth_type: {0}".format(self.auth_type))
    api = rest.baseUrl + url
    status, content, header = rest._http_request(api, 'PUT', params=payload)
    log.info(" Set User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def _delete_user(self, user_name):
    """DELETE the RBAC user; ldap/pam users live at the top-level users
    endpoint, everything else under the local users endpoint."""
    rest = RestConnection(self.master_ip)
    if self.auth_type == 'ldap' or self.auth_type == "pam":
        url = "/settings/rbac/users/" + user_name
    else:
        url = "settings/rbac/users/local/" + user_name
    status, content, header = rest._http_request(rest.baseUrl + url,
                                                 'DELETE')
    log.info(" Deleting User - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def _delete_user(self, user_name):
    """DELETE the RBAC user; external path for ldap/pam, local otherwise."""
    rest = RestConnection(self.master_ip)
    if self.auth_type in ('ldap', 'pam'):
        endpoint = "/settings/rbac/users/external/" + user_name
    else:
        endpoint = "settings/rbac/users/local/" + user_name
    status, content, header = rest._http_request(rest.baseUrl + endpoint,
                                                 'DELETE')
    log.info(" Deleting User - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def _retrive_all_user_role(self, user_list=None): server = self.master_ip rest = RestConnection(server) url = "/settings/rbac/roles" api = rest.baseUrl + url status, content, header = rest._http_request(api, 'GET') print status print content print header return status, content, header
def _retrive_all_user_role(self, user_list=None ): server = self.master_ip rest = RestConnection(server) url = "/settings/rbac/roles" api = rest.baseUrl + url status, content, header = rest._http_request(api, 'GET') print status print content print header return status, content, header
def _set_user_roles(self, user_name, payload):
    """PUT the given role payload for `user_name` -- external users for
    ldap/pam, local users for builtin auth.

    Raises ValueError for an unrecognised auth_type.  (Fix: the original
    left `url` unbound in that case and died with a NameError.)
    """
    rest = RestConnection(self.master_ip)
    if self.auth_type == "ldap" or self.auth_type == "pam":
        url = "settings/rbac/users/external/" + user_name
    elif self.auth_type == 'builtin':
        url = "settings/rbac/users/local/" + user_name
    else:
        raise ValueError(
            "unsupported auth_type: {0}".format(self.auth_type))
    api = rest.baseUrl + url
    status, content, header = rest._http_request(api, 'PUT', params=payload)
    log.info(" Set User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
    return status, content, header
def _reload_node_certificate(self, host):
    """POST node/controller/reloadCertificate on `host` as Administrator.

    Returns (status, content); the response header is discarded.
    """
    rest = RestConnection(host)
    api = rest.baseUrl + "node/controller/reloadCertificate"
    # Fix: removed a dead local -- the original constructed an
    # httplib2.Http client here but never used it; the request actually
    # goes through rest._http_request.
    status, content, header = rest._http_request(
        api, 'POST',
        headers=self._create_rest_headers('Administrator', 'password'))
    return status, content
def _check_user_permission(self, user_name, password, permission_set):
    """POST pools/default/checkPermissions authenticated as
    user_name/password; the permission set is sent as the request body."""
    rest = RestConnection(self.master_ip)
    api = rest.baseUrl + "pools/default/checkPermissions/"
    authorization = base64.encodestring('%s:%s' % (user_name, password))
    request_headers = {'Content-Type': 'application/x-www-form-urlencoded',
                       'Authorization': 'Basic %s' % authorization,
                       'Accept': '*/*'}
    status, content, header = rest._http_request(
        api, 'POST', params=permission_set, headers=request_headers)
    return status, content, header
def _set_user_roles(self,user_name,payload): rest = RestConnection(self.master_ip) url = "settings/rbac/users/" + user_name #param = urllib.urlencode(payload) param = payload api = rest.baseUrl + url status, content, header = rest._http_request(api, 'PUT', param) print status print content print header return status, content, header
def _set_user_roles(self, user_name, payload): rest = RestConnection(self.master_ip) url = "settings/rbac/users/" + user_name #param = urllib.urlencode(payload) param = payload api = rest.baseUrl + url status, content, header = rest._http_request(api, 'PUT', param) print status print content print header return status, content, header
def test_simple_ui_request(self):
    """GET the query base URL, verify the HTTP status, then re-fetch and
    scan the response for unexpected 'web request failed' entries."""
    rest = RestConnection(self.master)
    passed = True
    self.log.info("GET " + rest.query_baseUrl)
    status, content, header = rest._http_request(rest.query_baseUrl)
    self.log.info(header)
    if not status:
        self.log.info("wrong status for {0} GET request {1}".format(rest.query_baseUrl, header))
        passed = False
    self.assertTrue(passed, msg="some GET requests failed. See logs above")
    _, content, _ = rest._http_request(rest.query_baseUrl)
    occurrences = [m.start() for m in re.finditer('web request failed', content)]
    for occurrence in occurrences:
        subcontent = content[occurrence - 1000: occurrence + 1000]
        if 'path,"/diag"' in subcontent:
            # /diag failures are a known/acceptable case
            break
    else:
        # Fix: the original else-branch also ran when `occurrences` was
        # empty, wrongly failing the test and raising NameError on
        # `subcontent`.  Only flag a failure when matches existed and none
        # was the tolerated /diag case.
        if occurrences:
            passed = False
            self.log.info(subcontent)
    self.assertTrue(passed, "some web request failed in the server logs. See logs above")
def _check_user_permission(self, user_name, password, permission_set):
    """POST pools/default/checkPermissions as user_name/password after a
    fixed delay."""
    rest = RestConnection(self.master_ip)
    api = rest.baseUrl + "pools/default/checkPermissions/"
    authorization = base64.encodestring('%s:%s' % (user_name, password))
    req_headers = {'Content-Type': 'application/x-www-form-urlencoded',
                   'Authorization': 'Basic %s' % authorization,
                   'Accept': '*/*'}
    # presumably waits for the user/permissions to propagate -- confirm
    time.sleep(10)
    status, content, header = rest._http_request(api, 'POST',
                                                 params=permission_set,
                                                 headers=req_headers)
    return status, content, header
def _rest_client_wrapper(self,username,password, url,method,params,restClient,port=None): if restClient == None: restClient = self.master_ip if port != None: restClient.port=port rest = RestConnection(restClient) rest.username = username rest.password = password api = rest.baseUrl + url status, content, header = rest._http_request(api, method=method, params=params) print content print status return header['status']
def get_all_index_stat_map(self, index_node=None, timeout=120):
    """GET <index baseURL>/stats and return the indexer stat map with a
    'bucket_index_map' entry folded in.

    Returns None when the request fails (preserved original behaviour).
    """
    if index_node is None:
        index_node = self.index_node
    rest_client = RestConnection(index_node)
    api = self.get_index_baseURL() + 'stats'
    status, content, header = rest_client._http_request(api,
                                                        timeout=timeout)
    if not status:
        return None
    json_parsed = json.loads(content)
    bucket_index_map = self.get_bucket_index_stats(json_parsed)
    stat_map = self.get_indexer_stats(json_parsed)
    stat_map['bucket_index_map'] = bucket_index_map
    return stat_map
def test_non_ssl_ports_after_enabling_tls(self):
    """
    1. Enforce TLS on cluster
    2. For each component make a GET request on non-ssl port, and validate that it fails.
    3. Make the same above request on TLS port and validate that it works
    4. Repeat for all components
    5. Disable n2n encryption on all nodes
    6. For each component make a GET request on non-ssl port, and validate that it works
    """
    self.enable_tls_encryption_cli_on_nodes(nodes=[self.cluster.master])
    CbServer.use_https = True
    rest = RestConnection(self.cluster.master)
    for non_ssl_request in self.sample_urls_map.keys():
        api = non_ssl_request % self.cluster.master.ip
        try:
            rest._http_request(api=api, timeout=10)
        except Exception as _:
            # Non-TLS port refused (request raised), as expected; now hit
            # the TLS counterpart of the same endpoint and require success.
            ssl_request = self.sample_urls_map[non_ssl_request]
            api = ssl_request % self.cluster.master.ip
            status, content, response = rest._http_request(api=api,
                                                           timeout=10)
            if not status:
                self.fail("{0} failed".format(api))
            else:
                # NOTE(review): logged at *error* level although a working
                # TLS endpoint is the expected outcome here.  Also, if the
                # non-TLS request above does NOT raise, step 2's "validate
                # that it fails" is silently skipped -- confirm intent.
                self.log.error("{0} worked".format(api))
    # step 5/6: drop node-to-node encryption and require the plain
    # (non-TLS) endpoints to respond successfully again
    self.disable_n2n_encryption_cli_on_nodes(nodes=[self.cluster.master])
    CbServer.use_https = False
    rest = RestConnection(self.cluster.master)
    for non_ssl_request in self.sample_urls_map.keys():
        api = non_ssl_request % self.cluster.master.ip
        status, content, response = rest._http_request(api=api, timeout=10)
        if not status:
            self.fail("{0} api failed with content {1}".format(
                api, content))
class NsServerEgress(UserResourceTask):
    """ Produces throughput for egress_mib_per_min """

    def __init__(self, user, node):
        super(NsServerEgress, self).__init__(user, node)
        self.rest = RestConnection(node)
        self.threads = 10
        # one egress worker per thread slot; start them all immediately
        self.workers = [EgressThroughputWorker(self.node)
                        for _ in range(self.threads)]
        for worker in self.workers:
            worker.start()

    def on_throughput_update(self, throughput):
        """ Updates document size """
        # size the shared document so fetching it yields the requested
        # egress; all workers read the same "doc24601" document
        document_size = throughput / (self.threads * self.workers[0].chunks)
        self.rest.set_document("default", "doc24601",
                               create_document_of_size(document_size - 127))
        share = throughput / self.threads
        for worker in self.workers:
            worker.throughput.set(share)
        if throughput == 0:
            for worker in self.workers:
                worker.stop()

    on_throughput_increase = on_throughput_update
    on_throughput_decrease = on_throughput_update

    def get_throughput_success(self):
        """Total successful throughput across all workers."""
        total = 0
        for worker in self.workers:
            total += worker.throughput_success.get()
        return total

    def error(self):
        # returns the response body of GET /pools/default
        return self.rest._http_request(self.rest.baseUrl + "/pools/default")[1]

    def expected_error(self):
        return 'Limit(s) exceeded [egress]'
def build_info(node):
    """Return the parsed JSON body of GET nodes/self on `node`.

    Fix: _http_request returns a (status, content, header) 3-tuple (as
    every other call site in this codebase unpacks it); the original
    2-name unpacking raised ValueError before the response could be
    parsed.
    """
    rest = RestConnection(node)
    api = rest.baseUrl + 'nodes/self'
    status, content, header = rest._http_request(api)
    json_parsed = json.loads(content)
    return json_parsed
def rotate_data_key(self, host):
    """Trigger a data-key rotation on `host` via POST and return the
    request's boolean status."""
    rest = RestConnection(host)
    endpoint = rest.baseUrl + "/node/controller/rotateDataKey"
    response = rest._http_request(endpoint, 'POST')
    status = response[0]
    log.info("Status of rotate data key command - {0}".format(status))
    return status
class CommunityTests(CommunityBaseTest): def setUp(self): super(CommunityTests, self).setUp() self.command = self.input.param("command", "") self.zone = self.input.param("zone", 1) self.replica = self.input.param("replica", 1) self.command_options = self.input.param("command_options", '') self.set_get_ratio = self.input.param("set_get_ratio", 0.9) self.item_size = self.input.param("item_size", 128) self.shutdown_zone = self.input.param("shutdown_zone", 1) self.do_verify = self.input.param("do-verify", True) self.num_node = self.input.param("num_node", 4) self.services = self.input.param("services", None) self.start_node_services = self.input.param("start_node_services", "kv") self.add_node_services = self.input.param("add_node_services", "kv") self.timeout = 6000 self.user_add = self.input.param("user_add", None) self.user_role = self.input.param("user_role", None) def tearDown(self): super(CommunityTests, self).tearDown() def test_disabled_zone(self): disabled_zone = False zone_name = "group1" serverInfo = self.servers[0] self.rest = RestConnection(serverInfo) try: self.log.info("create zone name 'group1'!") result = self.rest.add_zone(zone_name) print("result ", result) except Exception as e: if e: print(e) disabled_zone = True pass if not disabled_zone: self.fail("CE version should not have zone feature") def check_audit_available(self): audit_available = False try: self.rest.getAuditSettings() audit_available = True except Exception as e: if e: print(e) if audit_available: self.fail("This feature 'audit' only available on " "Enterprise Edition") def check_ldap_available(self): ldap_available = False self.rest = RestConnection(self.master) try: s, c, h = self.rest.clearLDAPSettings() if s: ldap_available = True except Exception as e: if e: print(e) if ldap_available: self.fail("This feature 'ldap' only available on " "Enterprise Edition") def check_set_services(self): self.rest.force_eject_node() self.sleep(7, "wait for node reset done") try: status = 
self.rest.init_node_services(hostname=self.master.ip, services=[self.services]) except Exception as e: if e: print(e) if self.services == "kv": if status: self.log.info("CE could set {0} only service.".format( self.services)) else: self.fail("Failed to set {0} only service.".format( self.services)) elif self.services == "index,kv": if status: self.fail("CE does not support kv and index on same node") else: self.log.info("services enforced in CE") elif self.services == "kv,n1ql": if status: self.fail("CE does not support kv and n1ql on same node") else: self.log.info("services enforced in CE") elif self.services == "kv,eventing": if status: self.fail("CE does not support kv and eventing on same node") else: self.log.info("services enforced in CE") elif self.services == "index,n1ql": if status: self.fail("CE does not support index and n1ql on same node") else: self.log.info("services enforced in CE") elif self.services == "index,kv,n1ql": if status: self.log.info( "CE could set all services {0} on same nodes.".format( self.services)) else: self.fail("Failed to set kv, index and query services on CE") elif self.version[:5] in COUCHBASE_FROM_WATSON: if self.version[: 5] in COUCHBASE_FROM_VULCAN and "eventing" in self.services: if status: self.fail("CE does not support eventing in vulcan") else: self.log.info("services enforced in CE") elif self.services == "fts,index,kv": if status: self.fail( "CE does not support fts, index and kv on same node") else: self.log.info("services enforced in CE") elif self.services == "fts,index,n1ql": if status: self.fail( "CE does not support fts, index and n1ql on same node") else: self.log.info("services enforced in CE") elif self.services == "fts,kv,n1ql": if status: self.fail( "CE does not support fts, kv and n1ql on same node") else: self.log.info("services enforced in CE") elif self.services == "fts,index,kv,n1ql": if status: self.log.info( "CE could set all services {0} on same nodes.".format( self.services)) else: 
self.fail("Failed to set " "fts, index, kv, and query services on CE") else: self.fail("some services don't support") def check_set_services_when_add_node(self): self.rest.force_eject_node() sherlock_services_in_ce = ["kv", "index,kv,n1ql"] watson_services_in_ce = ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"] self.sleep(5, "wait for node reset done") kv_quota = 0 while kv_quota == 0: time.sleep(1) kv_quota = int(self.rest.get_nodes_self().mcdMemoryReserved) info = self.rest.get_nodes_self() kv_quota = int(info.mcdMemoryReserved * (CLUSTER_QUOTA_RATIO)) self.rest.set_service_memoryQuota(service='indexMemoryQuota', memoryQuota=INDEX_QUOTA) self.rest.set_service_memoryQuota(service='ftsMemoryQuota', memoryQuota=FTS_QUOTA) self.rest.init_cluster_memoryQuota( self.input.membase_settings.rest_username, self.input.membase_settings.rest_password, kv_quota - INDEX_QUOTA - FTS_QUOTA - 100) try: self.log.info("Initialize node with services {0}".format( self.start_node_services)) status = self.rest.init_node_services( hostname=self.master.ip, services=[self.start_node_services]) self.rest.init_cluster() except Exception as e: if e: print(e) if not status: if self.version not in COUCHBASE_FROM_WATSON and \ self.start_node_services not in sherlock_services_in_ce: self.log.info( "initial services setting enforced in Sherlock CE") elif self.version in COUCHBASE_FROM_WATSON and \ self.start_node_services not in watson_services_in_ce: self.log.info("initial services setting enforced in Watson CE") elif status: add_node = False try: self.log.info("node with services {0} try to add".format( self.add_node_services)) add_node = self.cluster.rebalance( self.servers[:2], self.servers[1:2], [], services=[self.add_node_services]) except Exception: pass if add_node: self.get_services_map() list_nodes = self.get_nodes_from_services_map( get_all_nodes=True) map = self.get_nodes_services() if map[self.master.ip] == self.start_node_services and \ map[self.servers[1].ip] == self.add_node_services: 
self.log.info( "services set correctly when node added & rebalance") else: self.fail("services set incorrectly when node added & rebalance. " "cluster expected services: {0}; set cluster services {1} ." "add node expected srv: {2}; set add node srv {3}"\ .format(map[self.master.ip], self.start_node_services, \ map[self.servers[1].ip], self.add_node_services)) else: if self.version not in COUCHBASE_FROM_WATSON: if self.start_node_services in ["kv", "index,kv,n1ql"] and \ self.add_node_services not in ["kv", "index,kv,n1ql"]: self.log.info("services are enforced in CE") elif self.start_node_services not in [ "kv", "index,kv,n1ql" ]: self.log.info("services are enforced in CE") else: self.fail("maybe bug in add node") elif self.version in COUCHBASE_FROM_WATSON: if self.start_node_services in ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"] and self.add_node_services not in \ ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]: self.log.info("services are enforced in CE") elif self.start_node_services not in [ "kv", "index,kv,n1ql", "fts,index,kv,n1ql" ]: self.log.info("services are enforced in CE") else: self.fail("maybe bug in add node") else: self.fail("maybe bug in node initialization") def check_full_backup_only(self): """ for windows vm, ask IT to put uniq.exe at /cygdrive/c/Program Files (x86)/ICW/bin directory """ self.remote = RemoteMachineShellConnection(self.master) """ put params items=0 in test param so that init items = 0 """ self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 " \ "-u Administrator -p password" \ .format(self.bin_path, self.master.ip)) """ delete backup location before run backup """ self.remote.execute_command("rm -rf {0}*".format(self.backup_location)) output, error = self.remote.execute_command("ls -lh {0}".format( self.backup_location)) self.remote.log_command_output(output, error) """ first full backup """ self.remote.execute_command("{0}cbbackup http://{1}:8091 {2} -m full " \ "-u Administrator -p password"\ 
.format(self.bin_path, self.master.ip, self.backup_c_location)) output, error = self.remote.execute_command("ls -lh {0}*/".format( self.backup_location)) self.remote.log_command_output(output, error) output, error = self.remote.execute_command("{0}cbtransfer -u Administrator "\ "-p password {1}*/*-full/ " \ "stdout: | grep set | uniq | wc -l"\ .format(self.bin_path, self.backup_c_location)) self.remote.log_command_output(output, error) if int(output[0]) != 1000: self.fail("full backup did not work in CE. " "Expected 1000, actual: {0}".format(output[0])) self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 "\ " -u Administrator -p password --prefix=t_" .format(self.bin_path, self.master.ip)) """ do different backup mode """ self.remote.execute_command("{0}cbbackup -u Administrator -p password "\ "http://{1}:8091 {2} -m {3}"\ .format(self.bin_path, self.master.ip, self.backup_c_location, self.backup_option)) output, error = self.remote.execute_command("ls -lh {0}".format( self.backup_location)) self.remote.log_command_output(output, error) output, error = self.remote.execute_command("{0}cbtransfer -u Administrator "\ "-p password {1}*/*-{2}/ stdout: "\ "| grep set | uniq | wc -l"\ .format(self.bin_path, self.backup_c_location, self.backup_option)) self.remote.log_command_output(output, error) if int(output[0]) == 2000: self.log.info("backup option 'diff' is enforced in CE") elif int(output[0]) == 1000: self.fail("backup option 'diff' is not enforced in CE. 
" "Expected 2000, actual: {0}".format(output[0])) else: self.fail("backup failed to backup correct items") self.remote.disconnect() def check_ent_backup(self): """ for CE version from Watson, cbbackupmgr exe file should not in bin """ command = "cbbackupmgr" self.remote = RemoteMachineShellConnection(self.master) self.log.info("check if {0} in {1} directory".format( command, self.bin_path)) found = self.remote.file_exists(self.bin_path, command) if found: self.log.info("found {0} in {1} directory".format( command, self.bin_path)) self.log.info("Ent. backup in CE is in bin!") elif not found: self.fail( "CE from Cheshire Cat should contain {0}".format(command)) self.remote.disconnect() def check_memory_optimized_storage_mode(self): """ from Watson, CE should not have option 'memory_optimized' to set """ self.rest.force_eject_node() self.sleep(5, "wait for node reset done") try: self.log.info("Initialize node with 'Memory Optimized' option") status = self.rest.set_indexer_storage_mode( username=self.input.membase_settings.rest_username, password=self.input.membase_settings.rest_password, storageMode='memory_optimized') except Exception as ex: if ex: print(ex) if not status: self.log.info("Memory Optimized setting enforced in CE " "Could not set memory_optimized option") else: self.fail("Memory Optimzed setting does not enforced in CE " "We could set this option in") def check_plasma_storage_mode(self): """ from Watson, CE should not have option 'memory_optimized' to set """ self.rest.force_eject_node() self.sleep(5, "wait for node reset done") try: self.log.info("Initialize node with 'Memory Optimized' option") status = self.rest.set_indexer_storage_mode( username=self.input.membase_settings.rest_username, password=self.input.membase_settings.rest_password, storageMode='plasma') except Exception as ex: if ex: print(ex) if not status: self.log.info("Plasma setting enforced in CE " "Could not set Plasma option") else: self.fail("Plasma setting does not enforced in CE " 
"We could set this option in") def check_x509_cert(self): """ from Watson, X509 certificate only support in EE """ api = self.rest.baseUrl + "pools/default/certificate?extended=true" self.log.info("request to get certificate at " "'pools/default/certificate?extended=true' " "should return False") try: status, content, header = self.rest._http_request(api, 'GET') except Exception as ex: if ex: print(ex) if status: self.fail("This X509 certificate feature only available in EE") elif not status: if b'requires enterprise edition' in content: self.log.info("X509 cert is enforced in CE") def check_roles_base_access(self): """ from Watson, roles base access for admin should not in in CE """ if self.user_add is None: self.fail( "We need to pass user name (user_add) to run this test. ") if self.user_role is None: self.fail( "We need to pass user roles (user_role) to run this test. ") api = self.rest.baseUrl + "settings/rbac/users/" + self.user_add self.log.info("url to run this test: %s" % api) """ add admin user """ param = "name=%s&roles=%s" % (self.user_add, self.user_role) try: status, content, header = self.rest._http_request( api, 'PUT', param) except Exception as ex: if ex: print(ex) if status: self.fail("CE should not allow to add admin users") else: self.log.info("roles base is enforced in CE! ") def check_root_certificate(self): """ from watson, ce should not see root certificate manual test: curl -u Administrator:password -X GET http://localhost:8091/pools/default/certificate """ api = self.rest.baseUrl + "pools/default/certificate" try: status, content, header = self.rest._http_request(api, 'GET') except Exception as ex: if ex: print(ex) if status: self.fail("CE should not see root certificate!") elif b'requires enterprise edition' in content: self.log.info("root certificate is enforced in CE! 
") def check_settings_audit(self): """ from watson, ce should not set audit manual test: curl -u Administrator:password -X GET http://localhost:8091/settings/audit """ api = self.rest.baseUrl + "settings/audit" try: status, content, header = self.rest._http_request(api, 'GET') except Exception as ex: if ex: print(ex) if status: self.fail("CE should not allow to set audit !") elif b'requires enterprise edition' in content: self.log.info("settings audit is enforced in CE! ") def check_infer(self): """ from watson, ce should not see infer manual test: curl -H "Content-Type: application/json" -X POST -d '{"statement":"infer `bucket_name`;"}' http://localhost:8093/query/service test params: new_services=kv-index-n1ql,default_bucket=False """ self.rest.force_eject_node() self.sleep(7, "wait for node reset done") self.rest.init_node() bucket = "default" self.rest.create_bucket(bucket, ramQuotaMB=200) api = self.rest.query_baseUrl + "query/service" param = urllib.parse.urlencode({"statement": "infer `%s` ;" % bucket}) try: status, content, header = self.rest._http_request( api, 'POST', param) json_parsed = json.loads(content) except Exception as ex: if ex: print(ex) if json_parsed["status"] == "success": self.fail("CE should not allow to run INFER !") elif json_parsed["status"] == "fatal": self.log.info("INFER is enforced in CE! ") def check_query_monitoring(self): self.rest.force_eject_node() self.sleep(7, "wait for node reset done") self.rest.init_node() bucket = "default" self.rest.create_bucket(bucket, ramQuotaMB=200) api = self.rest.query_baseUrl + "admin/settings" param = {'profile': 'phases'} try: status, content, header = self.rest._http_request( api, 'POST', json.dumps(param)) except Exception as ex: if ex: print(ex) if status: self.fail("CE should not be allowed to do query monitoring !") elif b'Profiling is an EE only feature' in content: self.log.info("Query monitoring is enforced in CE! 
") def check_flex_index(self): """ from watson, ce should not see infer manual test: curl -H "Content-Type: application/json" -X POST -d '{"statement":"infer `bucket_name`;"}' http://localhost:8093/query/service test params: new_services=kv-index-n1ql,default_bucket=False """ self.rest.force_eject_node() self.sleep(7, "wait for node reset done") self.rest.init_node() bucket = "default" self.rest.create_bucket(bucket, ramQuotaMB=200) api = self.rest.query_baseUrl + "query/service" param = urllib.parse.urlencode({ "statement": "SELECT META(d).id FROM `%s` AS d USE INDEX (USING FTS) WHERE d.f2 = 100;" % bucket }) try: status, content, header = self.rest._http_request( api, 'POST', param) json_parsed = json.loads(content) except Exception as ex: if ex: print(ex) if json_parsed["status"] == "success": self.fail("CE should not allow to run flex index !") elif json_parsed["status"] == "fatal": self.log.info("Flex index is enforced in CE! ") def check_index_partitioning(self): self.rest.force_eject_node() self.sleep(7, "wait for node reset done") self.rest.init_node() bucket = "default" self.rest.create_bucket(bucket, ramQuotaMB=200) api = self.rest.query_baseUrl + "query/service" param = urllib.parse.urlencode({ "statement": "CREATE INDEX idx ON `%s`(id) PARTITION BY HASH(META().id)" % bucket }) try: status, content, header = self.rest._http_request( api, 'POST', param) json_parsed = json.loads(content) except Exception as ex: if ex: print(ex) if json_parsed["status"] == "success": self.fail("CE should not be allowed to run index partitioning !") elif json_parsed["status"] == "fatal": self.log.info("Index partitioning is enforced in CE! 
") def check_query_cost_based_optimizer(self): self.rest.force_eject_node() self.sleep(7, "wait for node reset done") self.rest.init_node() bucket = "default" self.rest.create_bucket(bucket, ramQuotaMB=200) api = self.rest.query_baseUrl + "query/service" param = urllib.parse.urlencode({ "statement": "UPDATE STATISTICS for `hotel` (type, address, city, country, free_breakfast, id, phone);" }) try: status, content, header = self.rest._http_request( api, 'POST', param) json_parsed = json.loads(content) except Exception as ex: if ex: print(ex) if json_parsed["status"] == "success": self.fail("CE should not be allowed to run CBO !") elif json_parsed["status"] == "fatal": self.log.info("CBO is enforced in CE! ") def check_query_window_functions(self): self.rest.force_eject_node() self.sleep(7, "wait for node reset done") self.rest.init_node() bucket = "default" self.rest.create_bucket(bucket, ramQuotaMB=200) api = self.rest.query_baseUrl + "query/service" param = urllib.parse.urlencode({ "statement": "SELECT d.id, d.destinationairport, CUME_DIST() OVER (PARTITION BY d.destinationairport \ ORDER BY d.distance NULLS LAST) AS `rank` \ FROM `%s` AS d \ WHERE d.type='route' \ LIMIT 7;" % bucket }) try: status, content, header = self.rest._http_request( api, 'POST', param) json_parsed = json.loads(content) except Exception as ex: if ex: print(ex) if json_parsed["status"] == "success": self.fail("CE should not be allowed to use window functions !") elif json_parsed["status"] == "fatal": self.log.info("Window functions is enforced in CE! 
") def check_auto_complete(self): """ this feature has not complete to block in CE """ """ Check new features from spock start here """ def check_cbbackupmgr(self): """ cbbackupmgr should not available in CE from spock """ if self.cb_version[:5] in COUCHBASE_FROM_SPOCK: file_name = "cbbackupmgr" + self.file_extension self.log.info("check if cbbackupmgr in bin dir in CE") result = self.remote.file_exists(self.bin_path, file_name) if result: self.fail("cbbackupmgr should not in bin dir of CE") else: self.log.info("cbbackupmgr is enforced in CE") self.remote.disconnect() def test_max_ttl_bucket(self): """ From vulcan, EE bucket has has an option to set --max-ttl, not it CE. This test is make sure CE could not create bucket with option --max-ttl This test must pass default_bucket=False """ if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN: self.log.info("This test only for vulcan and later") return cmd = 'curl -X POST -u Administrator:password \ http://{0}:8091/pools/default/buckets \ -d name=bucket0 \ -d maxTTL=100 \ -d ramQuotaMB=100 '.format(self.master.ip) if self.cli_test: cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \ --password password --bucket bucket0 --bucket-type couchbase \ --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \ --bucket-eviction-policy fullEviction --enable-flush 0 \ --enable-index-replica 1 --max-ttl 200".format( self.bin_path, self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "Max TTL is supported in enterprise edition only" if self.cli_test: mesg = "Maximum TTL can only be configured on enterprise edition" if output and mesg not in str(output[0]): self.fail("max ttl feature should not in Community Edition") buckets = RestConnection(self.master).get_buckets() if buckets: for bucket in buckets: self.log.info("bucekt in cluser: {0}".format(bucket.name)) if bucket.name == "bucket0": 
self.fail("Failed to enforce feature max ttl in CE.") conn.disconnect() def test_setting_audit(self): """ CE does not allow to set audit from vulcan 5.5.0 """ if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN: self.log.info("This test only for vulcan and later") return cmd = 'curl -X POST -u Administrator:password \ http://{0}:8091/settings/audit \ -d auditdEnabled=true '.format(self.master.ip) if self.cli_test: cmd = "{0}couchbase-cli setting-audit -c {1}:8091 -u Administrator \ -p password --audit-enabled 1 --audit-log-rotate-interval 604800 \ --audit-log-path /opt/couchbase/var/lib/couchbase/logs --set"\ .format(self.bin_path, self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "This http API endpoint requires enterprise edition" if output and mesg not in str(output[0]): self.fail("setting-audit feature should not in Community Edition") conn.disconnect() def test_setting_autofailover_enterprise_only(self): """ CE does not allow set auto failover if disk has issue and failover group from vulcan 5.5.0 """ if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN: self.log.info("This test only for vulcan and later") return self.failover_disk_period = self.input.param("failover_disk_period", False) self.failover_server_group = self.input.param("failover_server_group", False) failover_disk_period = "" if self.failover_disk_period: if self.cli_test: failover_disk_period = "--failover-data-disk-period 300" else: failover_disk_period = "-d failoverOnDataDiskIssues[timePeriod]=300" failover_server_group = "" if self.failover_server_group and self.cli_test: failover_server_group = "--enable-failover-of-server-group 1" cmd = 'curl -X POST -u Administrator:password \ http://{0}:8091/settings/autoFailover -d enabled=true -d timeout=120 \ -d maxCount=1 \ -d failoverOnDataDiskIssues[enabled]=true {1} \ -d failoverServerGroup={2}'.format(self.master.ip, failover_disk_period, 
self.failover_server_group) if self.cli_test: cmd = "{0}couchbase-cli setting-autofailover -c {1}:8091 \ -u Administrator -p password \ --enable-failover-on-data-disk-issues 1 {2} {3} "\ .format(self.bin_path, self.master.ip, failover_disk_period, failover_server_group) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "Auto failover on Data Service disk issues can only be " + \ "configured on enterprise edition" if not self.cli_test: if self.failover_disk_period or \ self.failover_server_group: if output and not error: self.fail("setting autofailover disk issues feature\ should not in Community Edition") else: if self.failover_server_group: mesg = "--enable-failover-of-server-groups can only be " + \ "configured on enterprise edition" if output and mesg not in str(output[0]): self.fail("Setting EE autofailover features \ should not in Community Edition") else: self.log.info("EE setting autofailover are disable in CE") conn.disconnect() def test_set_bucket_compression(self): """ CE does not allow to set bucket compression to bucket from vulcan 5.5.0. 
Mode compression: off,active,passive Note: must set defaultbucket=False for this test """ if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN: self.log.info("This test only for vulcan and later") return self.compression_mode = self.input.param("compression_mode", "off") cmd = 'curl -X POST -u Administrator:password \ http://{0}:8091/pools/default/buckets \ -d name=bucket0 \ -d compressionMode={1} \ -d ramQuotaMB=100 '.format( self.master.ip, self.compression_mode) if self.cli_test: cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \ --password password --bucket bucket0 --bucket-type couchbase \ --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \ --bucket-eviction-policy fullEviction --enable-flush 0 \ --enable-index-replica 1 --compression-mode {2}".format( self.bin_path, self.master.ip, self.compression_mode) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "Compression mode is supported in enterprise edition only" if self.cli_test: mesg = "Compression mode can only be configured on enterprise edition" if output and mesg not in str(output[0]): self.fail("Setting bucket compression should not in CE") conn.disconnect() def test_ldap_groups(self): """ LDAP Groups feature is not available in CE """ if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER: self.log.info("This test is only for MH and later") return cmd = 'curl -X POST -u Administrator:password \ http://{0}:8091/settings/rbac/groups/admins \ -d roles=admin \ -d description="Couchbase+Server+Administrators" \ --data-urlencode ldap_group_ref="uid=cbadmins,ou=groups,dc=example,dc=com"'\ .format(self.master.ip) if self.cli_test: cmd = '{0}couchbase-cli user-manage -c {1}:8091 --username Administrator \ --password password \ --set-group \ --group-name admins \ --roles admin \ --group-description "Couchbase Server Administrators" \ --ldap-ref 
"uid=cbadmins,ou=groups,dc=example,dc=com"'.format( self.bin_path, self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "Requested resource not found." if self.cli_test: mesg = "ERROR: This http API endpoint requires enterprise edition" if output and mesg not in str(output[0]): self.fail("LDAP Groups should not be in CE") conn.disconnect() def test_ldap_cert(self): """ LDAP Cert feature is not available in CE """ if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER: self.log.info("This test is only for MH and later") return cmd = 'curl -X POST -u Administrator:password http://{0}:8091/settings/ldap \ -d hosts={1} \ -d port=389 \ -d encryption=StartTLSExtension \ -d serverCertValidation=true \ --data-urlencode [email protected] \ -d bindDN="cn=admin,dc=example,dc=com" \ -d bindPass=password \ -d authenticationEnabled=true \ -d authorizationEnabled=true \ --data-urlencode groupsQuery="ou=groups,dc=example,dc=com??one?(member=%D)"'\ .format(self.master.ip, self.master.ip) if self.cli_test: cmd = '{0}couchbase-cli setting-ldap -c {1}:8091 --username Administrator \ --password password \ --authentication-enabled 1 \ --authorization-enabled 1 \ --hosts {2} \ --encryption startTLS \ --client-cert root.crt \ --bind-dn "cn=admin,dc=example,dc=com" \ --bind-password password \ --group-query "ou=groups,dc=example,dc=com??one?(member=%D)"'.format( self.bin_path, self.master.ip, self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "This http API endpoint requires enterprise edition" if self.cli_test: mesg = "ERROR: Command only available in enterprise edition" if output and mesg not in str(output[0]): self.fail("LDAP Cert should not be in CE") conn.disconnect() def test_network_encryption(self): """ Encrypted network access is not available in CE """ if self.cb_version[:5] not 
in COUCHBASE_FROM_MAD_HATTER: self.log.info("This test is only for MH and later") return cmd = 'curl -u Administrator:password -v -X POST \ http://{0}:8091/settings/security \ -d disableUIOverHttp=true \ -d clusterEncryptionLevel=control \ -d tlsMinVersion=tlsv1.1 \ -d "cipherSuites=["TLS_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA"]"'\ .format(self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "not supported in community edition" if output and mesg not in str(output[0]): self.fail("Encrypted network access should not be in CE") conn.disconnect() def test_n2n_encryption(self): """ Encrypted network access is not available in CE """ if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER: self.log.info("This test is only for MH and later") return cmd = '/opt/couchbase/bin/couchbase-cli node-to-node-encryption \ -c http://{0}:8091 \ -u Administrator \ -p password \ --enable'\ .format(self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) conn.log_command_output(output, error) mesg = "not supported in community edition" if output and mesg not in str(output[0]): self.fail("Encrypted network access should not be in CE") conn.disconnect() def test_log_redaction(self): """ Log redaction feature is not available in CE """ if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER: self.log.info("This test is only for MH and later") return cmd = 'curl -X POST -u Administrator:password \ http://{0}:8091/controller/startLogsCollection \ -d nodes="*" \ -d logRedactionLevel=partial'.format( self.master.ip) if self.cli_test: cmd = '{0}couchbase-cli collect-logs-start -c {1}:8091 --username Administrator \ --password password \ --all-nodes \ --redaction-level partial'.format( self.bin_path, self.master.ip) conn = RemoteMachineShellConnection(self.master) output, error = conn.execute_command(cmd) 
conn.log_command_output(output, error) mesg = "log redaction is an enterprise only feature" if output and mesg not in str(output[0]): self.fail("Log redaction should not be in CE") conn.disconnect()
class FailoverTests(FailoverBaseTest):
    """Failover test scenarios: hard/graceful failover of K nodes followed
    by either rebalance-out or add-back (full/delta recovery), with data,
    view and vbucket-sequence verification.

    Fix: `run_mutation_operations` used Python 2 `except Exception, ex:`
    syntax, which is a SyntaxError under Python 3 (the rest of the module
    uses Python 3 constructs such as `print(...)`).
    """

    def setUp(self):
        # NOTE(review): passing `self` to the bound super().setUp() call is
        # unusual but kept as-is — FailoverBaseTest presumably accepts it;
        # confirm against the base class before changing.
        super(FailoverTests, self).setUp(self)

    def tearDown(self):
        super(FailoverTests, self).tearDown(self)

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
        Main test body which contains the flow of the failover basic steps:
        1. Starts operations if programmed into the test case (before/after)
        2. Start view and index building operations
        3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
        4.1 Rebalance the cluster after failover of K nodes, OR
        4.2 Run add-back operation with recoveryType = (full/delta) with rebalance
        5. Verify all expected operations completed by checking stats,
           replication, views, data correctness
        """
        # Pick the reference node for communication.
        # We pick a node in the cluster which will NOT be failed over.
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case".format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0+
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used on < 3.x
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType is not None)):
            self.log.error("Graceful failover can't be applied to nodes with version less then 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1,
                                                     target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(self.master,
                                                     howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True  => run operations in parallel to failover
        # self.withMutationOps = False => run operations before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()

        # Perform view creation tasks; wait for completion before failover if required
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take a snapshot of the data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers, self.buckets,
                                                           path=None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)

        # Perform operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # Perform add-back with rebalance, or rebalance-only, with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen,
                                                             prev_vbucket_stats,
                                                             record_static_data_set,
                                                             prev_failover_stats)
        else:
            return

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                                  master_node=self.master)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats,
                                                record_static_data_set,
                                                prev_failover_stats):
        """Rebalance the failed-over nodes out of the cluster and verify."""
        # Need a delay > min because of MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform view validation if supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()
        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[node.id for node in chosen])
        # Rebalance monitoring
        msg = "rebalance failed while removing failover nodes {0}".format(
            [node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        # Reset password or port; skip the remaining verification in that case
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                                 current_port=self.input.param("new_port", "9090"))
            return
        # Drain queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True,
                                  check_ep_items_remaining=True)
        # Verify all data set with metadata if failover happened after failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, _servers_, self.buckets,
                                   path=None, addedItems=None)
        # Check failover logs — currently only for the graceful-failover case
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats,
                                                             _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats,
                                                            _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify active and replica bucket count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets,
                                          std=20.0, total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats,
                                          record_static_data_set,
                                          prev_failover_stats):
        """Run add-back with recoveryType = (delta/full) and verify the
        operations were correct with data verification steps."""
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        serverMap = self.get_server_map(self.servers)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap)
        index = 0
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recovery type
                self.rest.set_recovery_type(otpNode=node.id,
                                            recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)
        # Perform compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform view validation if supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()
        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)
        # Monitor rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        # Drain ep_queue and make sure intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self.log.info("Begin VERIFICATION for Add-back and rebalance")
        # Verify stats of cluster and data if max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True,
                                  check_ep_items_remaining=True)
        # Verify recovery type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, serverMap, self.buckets,
                                      recoveryTypeMap, fileMapsForVerification,
                                      self.deltaRecoveryBuckets)
        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, self.servers, self.buckets,
                                   path=None, addedItems=None)
        # Verify vbucket sequence numbers and failover logs are as expected;
        # only checked for version > 2.5.* with graceful failover.
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats,
                                                            self.servers, self.buckets,
                                                            perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats,
                                                             self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify active and replica bucket count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets,
                                          std=20.0, total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """Log the test parameters."""
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """Run failover operations used in the test scenario based on the
        failover reason (normal / stop_server / firewall)."""
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", self.wait_timeout * 10),
                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(
                        node.ip, node.port))
                else:
                    # verify iptables on the node if something went wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(
                    node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes \
                        or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start graceful again
                    self.log.info(" Start Graceful Failover Again !")
                    success_failed_over = self.rest.fail_over(
                        node.id, graceful=(self.graceful and graceful_failover))
                msg = "graceful failover failed for nodes {0}".format(node.id)
                self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            else:
                msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over
        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over,
                             "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            # Check if the fail_over fails as expected
            self.assertFalse(failed_over,
                             """ Graceful failover should fail due to not enough replicas """)
            return
        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in 60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached,
                            "rebalance failed for Graceful Failover, stuck or did not completed")
        # Verify active and replica bucket count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets,
                                          std=20.0, total_vbuckets=self.total_vbuckets,
                                          type="failover",
                                          graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """Run failover while mutation/view/compaction operations are in
        flight, based on the failover reason."""
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300),
                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(
                        node.ip, node.port))
                else:
                    # verify iptables on the node if something went wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers, chosen)
        failed_over = self.cluster.async_failover([self.master],
                                                  failover_nodes=chosen,
                                                  graceful=self.graceful)
        # Perform compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
        # Run view operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_initial_data(self):
        """Load the initial data set into all buckets and wait for stats."""
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create,
                                              "create", 0, flag=2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        """Run create/update/delete operations as selected by doc_ops."""
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            # Fixed: was Python 2 `except Exception, ex:` (SyntaxError on Py3)
            self.log.info(ex)
def build_info(node):
    """Fetch and return the parsed JSON body of the node's ``nodes/self``
    REST endpoint (node build/version information)."""
    conn = RestConnection(node)
    endpoint = conn.baseUrl + 'nodes/self'
    _, raw_body, _ = conn._http_request(endpoint)
    return json.loads(raw_body)
def _get_cluster_ca_cert(self):
    """GET the cluster CA certificate (extended form) from ``self.host``.

    Returns the raw ``(status, content, header)`` triple of the request.
    """
    conn = RestConnection(self.host)
    endpoint = conn.baseUrl + "pools/default/certificate?extended=true"
    return conn._http_request(endpoint, 'GET')
class CommunityTests(CommunityBaseTest):
    """Negative tests asserting that Enterprise-only features (zones, audit,
    LDAP, X509, RBAC, max-TTL, compression, ...) are rejected or absent on
    Community Edition (CE) builds.

    BUG FIXES applied throughout: several methods read ``status`` /
    ``content`` / ``json_parsed`` after a ``try`` block that may raise
    *before* the assignment, turning the intended "feature enforced in CE"
    path into a NameError.  Those names are now defaulted before the try.
    Runtime strings (commands, log/fail messages) are unchanged.
    """

    def setUp(self):
        super(CommunityTests, self).setUp()
        # Test-input parameters with their defaults.
        self.command = self.input.param("command", "")
        self.zone = self.input.param("zone", 1)
        self.replica = self.input.param("replica", 1)
        self.command_options = self.input.param("command_options", '')
        self.set_get_ratio = self.input.param("set_get_ratio", 0.9)
        self.item_size = self.input.param("item_size", 128)
        self.shutdown_zone = self.input.param("shutdown_zone", 1)
        self.do_verify = self.input.param("do-verify", True)
        self.num_node = self.input.param("num_node", 4)
        self.services = self.input.param("services", None)
        self.start_node_services = self.input.param("start_node_services",
                                                    "kv")
        self.add_node_services = self.input.param("add_node_services", "kv")
        self.timeout = 6000
        self.user_add = self.input.param("user_add", None)
        self.user_role = self.input.param("user_role", None)

    def tearDown(self):
        super(CommunityTests, self).tearDown()

    def test_disabled_zone(self):
        """CE must reject creation of server groups (zones)."""
        disabled_zone = False
        zone_name = "group1"
        serverInfo = self.servers[0]
        self.rest = RestConnection(serverInfo)
        try:
            self.log.info("create zone name 'group1'!")
            result = self.rest.add_zone(zone_name)
            print("result ", result)
        except Exception as e:
            if e:
                print(e)
            # The REST call raising is the expected CE behaviour.
            disabled_zone = True
        if not disabled_zone:
            self.fail("CE version should not have zone feature")

    def check_audit_available(self):
        """Audit settings endpoint must fail on CE."""
        audit_available = False
        try:
            self.rest.getAuditSettings()
            audit_available = True
        except Exception as e:
            if e:
                print(e)
        if audit_available:
            self.fail("This feature 'audit' only available on "
                      "Enterprise Edition")

    def check_ldap_available(self):
        """LDAP settings endpoint must fail on CE."""
        ldap_available = False
        self.rest = RestConnection(self.master)
        try:
            s, c, h = self.rest.clearLDAPSettings()
            if s:
                ldap_available = True
        except Exception as e:
            if e:
                print(e)
        if ldap_available:
            self.fail("This feature 'ldap' only available on "
                      "Enterprise Edition")

    def check_set_services(self):
        """Verify which single-node service combinations CE allows."""
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        # BUG FIX: default status so a raised init_node_services does not
        # cause a NameError in the branches below.
        status = None
        try:
            status = self.rest.init_node_services(hostname=self.master.ip,
                                                  services=[self.services])
        except Exception as e:
            if e:
                print(e)
        if self.services == "kv":
            if status:
                self.log.info("CE could set {0} only service.".format(
                    self.services))
            else:
                self.fail("Failed to set {0} only service.".format(
                    self.services))
        elif self.services == "index,kv":
            if status:
                self.fail("CE does not support kv and index on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "kv,n1ql":
            if status:
                self.fail("CE does not support kv and n1ql on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "kv,eventing":
            if status:
                self.fail("CE does not support kv and eventing on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "index,n1ql":
            if status:
                self.fail("CE does not support index and n1ql on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "index,kv,n1ql":
            if status:
                self.log.info(
                    "CE could set all services {0} on same nodes.".format(
                        self.services))
            else:
                self.fail("Failed to set kv, index and query services on CE")
        elif self.version[:5] in COUCHBASE_FROM_WATSON:
            # Watson+ adds fts combinations; Vulcan+ rejects eventing.
            if self.version[:5] in COUCHBASE_FROM_VULCAN and \
                    "eventing" in self.services:
                if status:
                    self.fail("CE does not support eventing in vulcan")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,kv":
                if status:
                    self.fail(
                        "CE does not support fts, index and kv on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,n1ql":
                if status:
                    self.fail(
                        "CE does not support fts, index and n1ql on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,kv,n1ql":
                if status:
                    self.fail(
                        "CE does not support fts, kv and n1ql on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,kv,n1ql":
                if status:
                    self.log.info(
                        "CE could set all services {0} on same nodes.".format(
                            self.services))
                else:
                    self.fail("Failed to set "
                              "fts, index, kv, and query services on CE")
        else:
            self.fail("some services don't support")

    def check_set_services_when_add_node(self):
        """Verify CE-allowed service sets both at init and when adding a
        second node with (possibly different) services."""
        self.rest.force_eject_node()
        sherlock_services_in_ce = ["kv", "index,kv,n1ql"]
        watson_services_in_ce = ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]
        self.sleep(5, "wait for node reset done")
        # BUG FIX: default both names so an exception inside the try does
        # not leave them unbound below.
        status = None
        init_node = None
        try:
            self.log.info("Initialize node with services {0}".format(
                self.start_node_services))
            status = self.rest.init_node_services(
                hostname=self.master.ip,
                services=[self.start_node_services])
            init_node = self.cluster.async_init_node(
                self.master, services=[self.start_node_services])
        except Exception as e:
            if e:
                print(e)
        if not status:
            if self.version not in COUCHBASE_FROM_WATSON and \
                    self.start_node_services not in sherlock_services_in_ce:
                self.log.info(
                    "initial services setting enforced in Sherlock CE")
            elif self.version in COUCHBASE_FROM_WATSON and \
                    self.start_node_services not in watson_services_in_ce:
                self.log.info("initial services setting enforced in Watson CE")
        elif status and init_node and init_node.result() != 0:
            add_node = False
            try:
                self.log.info("node with services {0} try to add".format(
                    self.add_node_services))
                add_node = self.cluster.rebalance(
                    self.servers[:2], self.servers[1:2], [],
                    services=[self.add_node_services])
            except Exception:
                pass
            if add_node:
                self.get_services_map()
                # Called for its side effect of refreshing the services map.
                self.get_nodes_from_services_map(get_all_nodes=True)
                # Renamed from `map` to avoid shadowing the builtin.
                services_map = self.get_nodes_services()
                if services_map[self.master.ip] == self.start_node_services \
                        and services_map[self.servers[1].ip] == \
                        self.add_node_services:
                    self.log.info(
                        "services set correctly when node added & rebalance")
                else:
                    self.fail(
                        "services set incorrectly when node added & rebalance. "
                        "cluster expected services: {0}; set cluster services {1} ."
                        "add node expected srv: {2}; set add node srv {3}"
                        .format(services_map[self.master.ip],
                                self.start_node_services,
                                services_map[self.servers[1].ip],
                                self.add_node_services))
            else:
                if self.version not in COUCHBASE_FROM_WATSON:
                    if self.start_node_services in ["kv", "index,kv,n1ql"] \
                            and self.add_node_services not in \
                            ["kv", "index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    elif self.start_node_services not in [
                            "kv", "index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    else:
                        self.fail("maybe bug in add node")
                elif self.version in COUCHBASE_FROM_WATSON:
                    if self.start_node_services in [
                            "kv", "index,kv,n1ql", "fts,index,kv,n1ql"] and \
                            self.add_node_services not in \
                            ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    elif self.start_node_services not in [
                            "kv", "index,kv,n1ql", "fts,index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    else:
                        self.fail("maybe bug in add node")
        else:
            self.fail("maybe bug in node initialization")

    def check_full_backup_only(self):
        """Verify full backup works in CE and 'diff' mode is not honoured.

        For windows vm, ask IT to put uniq.exe at
        /cygdrive/c/Program Files (x86)/ICW/bin directory.
        Put params items=0 in test param so that init items = 0.
        """
        self.remote = RemoteMachineShellConnection(self.master)
        self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 "
                                    "-u Administrator -p password"
                                    .format(self.bin_path, self.master.ip))
        # Delete backup location before running backup.
        self.remote.execute_command("rm -rf {0}*".format(self.backup_location))
        output, error = self.remote.execute_command("ls -lh {0}".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        # First full backup.
        self.remote.execute_command("{0}cbbackup http://{1}:8091 {2} -m full "
                                    "-u Administrator -p password"
                                    .format(self.bin_path, self.master.ip,
                                            self.backup_c_location))
        output, error = self.remote.execute_command("ls -lh {0}*/".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        output, error = self.remote.execute_command(
            "{0}cbtransfer -u Administrator "
            "-p password {1}*/*-full/ "
            "stdout: | grep set | uniq | wc -l"
            .format(self.bin_path, self.backup_c_location))
        self.remote.log_command_output(output, error)
        if int(output[0]) != 1000:
            self.fail("full backup did not work in CE. "
                      "Expected 1000, actual: {0}".format(output[0]))
        self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 "
                                    " -u Administrator -p password --prefix=t_"
                                    .format(self.bin_path, self.master.ip))
        # Do a different backup mode (e.g. diff) which CE must not honour.
        self.remote.execute_command("{0}cbbackup -u Administrator -p password "
                                    "http://{1}:8091 {2} -m {3}"
                                    .format(self.bin_path, self.master.ip,
                                            self.backup_c_location,
                                            self.backup_option))
        output, error = self.remote.execute_command("ls -lh {0}".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        output, error = self.remote.execute_command(
            "{0}cbtransfer -u Administrator "
            "-p password {1}*/*-{2}/ stdout: "
            "| grep set | uniq | wc -l"
            .format(self.bin_path, self.backup_c_location,
                    self.backup_option))
        self.remote.log_command_output(output, error)
        if int(output[0]) == 2000:
            self.log.info("backup option 'diff' is enforced in CE")
        elif int(output[0]) == 1000:
            self.fail("backup option 'diff' is not enforced in CE. "
                      "Expected 2000, actual: {0}".format(output[0]))
        else:
            self.fail("backup failed to backup correct items")
        self.remote.disconnect()

    def check_ent_backup(self):
        """From Watson, the cbbackupmgr binary must not ship with CE."""
        command = "cbbackupmgr"
        self.remote = RemoteMachineShellConnection(self.master)
        self.log.info("check if {0} in {1} directory".format(
            command, self.bin_path))
        found = self.remote.file_exists(self.bin_path, command)
        if found:
            self.log.info("found {0} in {1} directory".format(
                command, self.bin_path))
            self.fail("CE from Watson should not contain {0}".format(command))
        elif not found:
            self.log.info("Ent. backup in CE is enforced, not in bin!")
        self.remote.disconnect()

    def check_memory_optimized_storage_mode(self):
        """From Watson, CE must not accept the 'memory_optimized' indexer
        storage mode."""
        self.rest.force_eject_node()
        self.sleep(5, "wait for node reset done")
        status = None  # BUG FIX: avoid NameError when the REST call raises
        try:
            self.log.info("Initialize node with 'Memory Optimized' option")
            status = self.rest.set_indexer_storage_mode(
                username=self.input.membase_settings.rest_username,
                password=self.input.membase_settings.rest_password,
                storageMode='memory_optimized')
        except Exception as ex:
            if ex:
                print(ex)
        if not status:
            self.log.info("Memory Optimized setting enforced in CE "
                          "Could not set memory_optimized option")
        else:
            self.fail("Memory Optimzed setting does not enforced in CE "
                      "We could set this option in")

    def check_x509_cert(self):
        """From Watson, X509 certificates are EE-only."""
        api = self.rest.baseUrl + "pools/default/certificate?extended=true"
        self.log.info("request to get certificate at "
                      "'pools/default/certificate?extended=true' "
                      "should return False")
        # BUG FIX: default both so an exception does not leave them unbound.
        status, content = None, ""
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("This X509 certificate feature only available in EE")
        elif not status:
            if "requires enterprise edition" in content:
                self.log.info("X509 cert is enforced in CE")

    def check_roles_base_access(self):
        """From Watson, role-based access for admin users is EE-only."""
        if self.user_add is None:
            self.fail(
                "We need to pass user name (user_add) to run this test. ")
        if self.user_role is None:
            self.fail(
                "We need to pass user roles (user_role) to run this test. ")
        api = self.rest.baseUrl + "settings/rbac/users/" + self.user_add
        self.log.info("url to run this test: %s" % api)
        # Try to add an admin user; CE must reject this.
        param = "name=%s&roles=%s" % (self.user_add, self.user_role)
        status = None  # BUG FIX: avoid NameError when the request raises
        try:
            status, content, header = self.rest._http_request(
                api, 'PUT', param)
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not allow to add admin users")
        else:
            self.log.info("roles base is enforced in CE! ")

    def check_root_certificate(self):
        """From Watson, CE must not expose the root certificate.

        Manual test:
        curl -u Administrator:password -X GET
                  http://localhost:8091/pools/default/certificate
        """
        api = self.rest.baseUrl + "pools/default/certificate"
        status, content = None, ""  # BUG FIX: defaults for the raise path
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not see root certificate!")
        elif "requires enterprise edition" in content:
            self.log.info("root certificate is enforced in CE! ")

    def check_settings_audit(self):
        """From Watson, CE must not expose audit settings.

        Manual test:
        curl -u Administrator:password -X GET
                        http://localhost:8091/settings/audit
        """
        api = self.rest.baseUrl + "settings/audit"
        status, content = None, ""  # BUG FIX: defaults for the raise path
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not allow to set audit !")
        elif "requires enterprise edition" in content:
            self.log.info("settings audit is enforced in CE! ")

    def check_infer(self):
        """From Watson, CE must not run the INFER N1QL statement.

        Manual test:
        curl -H "Content-Type: application/json" -X POST
             -d '{"statement":"infer `bucket_name`;"}'
                   http://localhost:8093/query/service
        Test params: new_services=kv-index-n1ql,default_bucket=False
        """
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({"statement": "infer `%s` ;" % bucket})
        json_parsed = {}  # BUG FIX: avoid NameError when request/parse raises
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed.get("status") == "success":
            self.fail("CE should not allow to run INFER !")
        elif json_parsed.get("status") == "fatal":
            self.log.info("INFER is enforced in CE! ")

    def check_auto_complete(self):
        """ this feature has not complete to block in CE """

    # Check new features from spock start here

    def check_cbbackupmgr(self):
        """ cbbackupmgr should not available in CE from spock """
        if self.cb_version[:5] in COUCHBASE_FROM_SPOCK:
            file_name = "cbbackupmgr" + self.file_extension
            self.log.info("check if cbbackupmgr in bin dir in CE")
            result = self.remote.file_exists(self.bin_path, file_name)
            if result:
                self.fail("cbbackupmgr should not in bin dir of CE")
            else:
                self.log.info("cbbackupmgr is enforced in CE")
        self.remote.disconnect()

    def test_max_ttl_bucket(self):
        """From vulcan, EE buckets have a --max-ttl option; CE must not.

        This test must pass default_bucket=False.
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/pools/default/buckets \
                                 -d name=bucket0 \
                                 -d maxTTL=100 \
                                 -d authType=sasl \
                                 -d ramQuotaMB=100 '.format(self.master.ip)
        if self.cli_test:
            cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \
                --password password --bucket bucket0 --bucket-type couchbase \
                --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \
                --bucket-eviction-policy fullEviction --enable-flush 0 \
                --enable-index-replica 1 --max-ttl 200".format(
                self.bin_path, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Max TTL is supported in enterprise edition only"
        if self.cli_test:
            mesg = "Maximum TTL can only be configured on enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("max ttl feature should not in Community Edition")
        buckets = RestConnection(self.master).get_buckets()
        if buckets:
            for bucket in buckets:
                self.log.info("bucekt in cluser: {0}".format(bucket.name))
                if bucket.name == "bucket0":
                    self.fail("Failed to enforce feature max ttl in CE.")
        conn.disconnect()

    def test_setting_audit(self):
        """ CE does not allow to set audit from vulcan 5.5.0 """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
                    http://{0}:8091/settings/audit \
                    -d auditdEnabled=true '.format(self.master.ip)
        if self.cli_test:
            cmd = "{0}couchbase-cli setting-audit -c {1}:8091 -u Administrator \
                -p password --audit-enabled 1 --audit-log-rotate-interval 604800 \
                --audit-log-path /opt/couchbase/var/lib/couchbase/logs "\
                .format(self.bin_path, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "This http API endpoint requires enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("setting-audit feature should not in Community Edition")
        conn.disconnect()

    def test_setting_autofailover_enterprise_only(self):
        """CE must not allow auto-failover on disk issues or server-group
        failover (both EE-only from vulcan 5.5.0)."""
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        self.failover_disk_period = self.input.param("failover_disk_period",
                                                     False)
        self.failover_server_group = self.input.param("failover_server_group",
                                                      False)
        failover_disk_period = ""
        if self.failover_disk_period:
            if self.cli_test:
                failover_disk_period = "--failover-data-disk-period 300"
            else:
                failover_disk_period = \
                    "-d failoverOnDataDiskIssues[timePeriod]=300"
        failover_server_group = ""
        if self.failover_server_group and self.cli_test:
            failover_server_group = "--enable-failover-of-server-group 1"
        # NOTE(review): the REST command interpolates the raw boolean
        # self.failover_server_group (True/False), not the CLI flag string —
        # preserved as-is; confirm this is intentional.
        cmd = 'curl -X POST -u Administrator:password \
            http://{0}:8091/settings/autoFailover -d enabled=true -d timeout=120 \
            -d maxCount=1 \
            -d failoverOnDataDiskIssues[enabled]=true {1} \
            -d failoverServerGroup={2}'.format(self.master.ip,
                                               failover_disk_period,
                                               self.failover_server_group)
        if self.cli_test:
            cmd = "{0}couchbase-cli setting-autofailover -c {1}:8091 \
                    -u Administrator -p password \
                    --enable-failover-on-data-disk-issues 1 {2} {3} "\
                .format(self.bin_path, self.master.ip,
                        failover_disk_period,
                        failover_server_group)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Auto failover on Data Service disk issues can only be " + \
               "configured on enterprise edition"
        if not self.cli_test:
            if self.failover_disk_period or \
                    self.failover_server_group:
                if output and not error:
                    self.fail("setting autofailover disk issues feature\
                               should not in Community Edition")
        else:
            if self.failover_server_group:
                mesg = "--enable-failover-of-server-groups can only be " + \
                       "configured on enterprise edition"
            if output and mesg not in str(output[0]):
                self.fail("Setting EE autofailover features \
                           should not in Community Edition")
            else:
                self.log.info("EE setting autofailover are disable in CE")
        conn.disconnect()

    def test_set_bucket_compression(self):
        """CE must not allow setting bucket compression mode (vulcan 5.5.0+).

        Mode compression: off, active, passive.
        Note: must set defaultbucket=False for this test.
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        self.compression_mode = self.input.param("compression_mode", "off")
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/pools/default/buckets \
                                 -d name=bucket0 \
                                 -d compressionMode={1} \
                                 -d authType=sasl \
                                 -d ramQuotaMB=100 '.format(
            self.master.ip, self.compression_mode)
        if self.cli_test:
            cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \
                --password password --bucket bucket0 --bucket-type couchbase \
                --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \
                --bucket-eviction-policy fullEviction --enable-flush 0 \
                --enable-index-replica 1 --compression-mode {2}".format(
                self.bin_path, self.master.ip, self.compression_mode)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Compression mode is supported in enterprise edition only"
        if self.cli_test:
            mesg = "Compression mode can only be configured on enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("Setting bucket compression should not in CE")
        conn.disconnect()
def _get_cluster_ca_cert(self):
    """Request the extended cluster CA certificate over REST and hand back
    the request's (status, content, header) tuple unmodified."""
    rest = RestConnection(self.host)
    url = "{0}pools/default/certificate?extended=true".format(rest.baseUrl)
    status, content, header = rest._http_request(url, 'GET')
    return status, content, header
class FailoverTests(FailoverBaseTest): def setUp(self): super(FailoverTests, self).setUp() self.server_map = self.get_server_map(self.servers) def tearDown(self): super(FailoverTests, self).tearDown() def test_failover_firewall(self): self.common_test_body('firewall') def test_failover_normal(self): self.common_test_body('normal') def test_failover_stop_server(self): self.common_test_body('stop_server') def test_failover_then_add_back(self): self.add_back_flag = True self.common_test_body('normal') def common_test_body(self, failover_reason): """ Main Test body which contains the flow of the failover basic steps 1. Starts Operations if programmed into the test case(before/after) 2. Start View and Index Building operations 3. Failover K out of N nodes (failover can be HARD/GRACEFUL) 4.1 Rebalance the cluster is failover of K nodeStatuses 4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance 5. Verify all expected operations completed by checking stats, replication, views, data correctness """ # Pick the reference node for communication # We pick a node in the cluster which will NOT be failed over self.filter_list = [] if self.failoverMaster: self.master = self.servers[1] self.log.info( "Picking node {0} as reference node for test case".format( self.master.ip)) self.print_test_params(failover_reason) self.rest = RestConnection(self.master) self.nodes = self.rest.node_statuses() # Set the data path for the cluster self.data_path = self.rest.get_data_path() # Check if the test case has to be run for 3.0.0 versions = self.rest.get_nodes_versions() self.version_greater_than_2_5 = True for version in versions: if "3" > version: self.version_greater_than_2_5 = False # Do not run this this test if graceful category is being used if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)): self.log.error( "Graceful failover can't be applied to nodes with version less then 3.*" ) self.log.error( "Please check configuration 
parameters: SKIPPING TEST.") return # Find nodes that will under go failover if self.failoverMaster: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=1, target_node=self.servers[0]) else: self.chosen = RebalanceHelper.pick_nodes( self.master, howmany=self.num_failed_nodes) # Perform operations - Create/Update/Delete # self.withMutationOps = True => Run Operations in parallel to failover # self.withMutationOps = False => Run Operations Before failover self.load_initial_data() if not self.withMutationOps: self.run_mutation_operations() # Perform view creation tasks and wait for completion before failover if self.withViewsOps: self.run_view_creation_operations(self.servers) if not self.createIndexesDuringFailover: self.query_and_monitor_view_tasks(self.servers) # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Take snap-shot of data set used for validaiton record_static_data_set = dict() prev_vbucket_stats = dict() prev_failover_stats = dict() if not self.withMutationOps: record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path=None) # Capture vbucket and failover stats if test version >= 2.5.* if self.version_greater_than_2_5 and self.upr_check: prev_vbucket_stats = self.get_vbucket_seqnos( self.servers, self.buckets) prev_failover_stats = self.get_failovers_logs( self.servers, self.buckets) # Perform Operations related to failover if self.withMutationOps or self.withViewsOps or self.compact: self.run_failover_operations_with_ops(self.chosen, failover_reason) else: self.run_failover_operations(self.chosen, failover_reason) # TODO: Enable this even when 'flusher_batch_split_trigger' is not set if self.flusher_batch_split_trigger and \ self.num_replicas >= self.num_failed_nodes: tasks = self._async_load_all_buckets(self.master, self.gen_update, "update", 0) for task in tasks: task.result() if self.graceful: # Validate seq_no snap_start/stop values self.check_snap_start_corruption() # Add back + rebalance 
/ only rebalance with verification if not self.gracefulFailoverFail and self.runRebalanceAfterFailover: if self.add_back_flag: self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) else: self.run_rebalance_after_failover_and_verify( self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats) if self.graceful: # Validate seq_no snap_start/stop values self.check_snap_start_corruption() if self.during_ops is None: self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list, master_node=self.master) def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats): """ Method to run rebalance after failover and verify """ # Need a delay > min because MB-7168 _servers_ = self.filter_servers(self.servers, chosen) self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True) self.sleep(5, "after failover before invoking rebalance...") # Rebalance after Failover operation self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) if self.during_ops: self.sleep(5, "Wait for some progress in rebalance") if self.during_ops == "change_password": old_pass = self.master.rest_password self.change_password( new_password=self.input.param("new_password", "new_pass")) self.rest = RestConnection(self.master) elif self.during_ops == "change_port": self.change_port(new_port=self.input.param("new_port", "9090")) self.rest = RestConnection(self.master) # Perform Compaction if self.compact: for bucket in self.buckets: self.cluster.compact_bucket(self.master, bucket) # Peform View Validation if Supported nodes = self.filter_servers(self.servers, chosen) if self.withViewsOps: self.query_and_monitor_view_tasks(nodes) # Run operations if required during rebalance after failover if self.withMutationOps: self.run_mutation_operations_after_failover() # Kill or restart operations if 
self.killNodes or self.stopNodes or self.firewallOnNodes: self.victim_node_operations(node=chosen[0]) self.sleep(60) self.log.info(" Start Rebalance Again !") self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen]) # Rebalance Monitoring msg = "rebalance failed while removing failover nodes {0}".format( [node.id for node in chosen]) self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg) # Reset password or port if self.during_ops: if self.during_ops == "change_password": self.change_password(new_password=old_pass) elif self.during_ops == "change_port": self.change_port(new_port='8091', current_port=self.input.param( "new_port", "9090")) return # Drain Queue and make sure intra-cluster replication is complete self.log.info("Begin VERIFICATION for Rebalance after Failover Only") self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True, check_ep_items_remaining=True) # Verify all data set with meta data if failover happens after failover if not self.withMutationOps: self.sleep(60) self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None, addedItems=None) # Check Cluster Stats and Data as well if max_verify > 0 # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed # Currently, only for checking case where we have graceful failover if self.version_greater_than_2_5 and self.graceful and self.upr_check: new_failover_stats = self.compare_failovers_logs( prev_failover_stats, _servers_, self.buckets) new_vbucket_stats = self.compare_vbucket_seqnos( prev_vbucket_stats, _servers_, self.buckets) self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats) # Verify Active and Replica Bucket Count if self.num_replicas > 0: nodes = self.get_nodes_in_cluster(self.master) self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets) self.log.info("End 
VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats,
                                          record_static_data_set,
                                          prev_failover_stats):
        """ Method to run add-back operation with recovery type = (delta/full)
            It also verifies if the operations are correct with data verificaiton steps

            chosen                 -- list of failed-over nodes to add back
            prev_vbucket_stats     -- vbucket seqno snapshot taken before failover
            record_static_data_set -- data snapshot used for post-rebalance comparison
            prev_failover_stats    -- failover-log snapshot taken before failover
        """
        # Servers that remain healthy (i.e. not in `chosen`).
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_,
                                         check_ep_items_remaining=True)
        # ip -> recovery-type map plus marker files on disk, used later to
        # verify whether delta or full recovery actually happened.
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets,
                                                   self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(
                    otpNode=node.id,
                    recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)
        # Doc_mutation before triggering rebalance
        tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                             "update", 0)
        for task in tasks:
            task.result()
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Peform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()
        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            # Disrupt one victim node mid-rebalance, then retry the rebalance.
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)
        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(
            chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg=msg)
        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers,
                                         check_ep_items_remaining=True)
        self.log.info("Begin VERIFICATION for Add-back and rebalance")
        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers, self.master,
                                  check_bucket_stats=True,
                                  check_ep_items_remaining=True)
        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets,
                                      recoveryTypeMap, fileMapsForVerification,
                                      self.deltaRecoveryBuckets)
        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, self.servers,
                                   self.buckets, path=None, addedItems=None)
        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats,
                                                            self.servers,
                                                            self.buckets,
                                                            perNode=False)
            new_failover_stats = self.compare_failovers_logs(
                prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                 new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based
            on failover reason

            chosen          -- nodes to fail over
            failover_reason -- one of 'stop_server' / 'firewall' / other
                               (other == node stays healthy; hard/graceful
                               failover of a healthy node)
        """
        # Perform Operations relalted to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", self.wait_timeout * 10),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(
                    node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    # Disrupt the victim mid-failover, then retry gracefully.
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(60)
                    success_failed_over = self.rest.fail_over(
                        node.id,
                        graceful=(self.graceful and graceful_failover))
                msg = "graceful failover failed for nodes {0}".format(node.id)
                self.assertTrue(
                    self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            else:
                msg = "rebalance failed while removing failover nodes {0}".format(
                    node.id)
                self.assertTrue(
                    self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over
        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            # Graceful failover of an UNHEALTHY node must not start.
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(
                failed_over,
                "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(
                failed_over,
                """ Graceful failover should fail due to not enough replicas """
            )
            return
        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info(
                "unable to failover the node the first time. try again in 60 seconds.."
            )
            # try again in 75 seconds
            self.sleep(75)
            # NOTE(review): retries only the last `node` of the loop above.
            failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server',
                                                      'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(
                reached,
                "rebalance failed for Graceful Failover, stuck or did not completed"
            )
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(
                servers=nodes, buckets=self.buckets, std=20.0,
                total_vbuckets=self.total_vbuckets, type="failover",
                graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based
            on failover reason, while doc mutations / view queries / compaction
            run concurrently with the (async) failover. """
        # Perform Operations relalted to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info(
                    "10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
        nodes = self.filter_servers(self.servers, chosen)
        # Async failover so the ops below overlap with it.
        failed_over = self.cluster.async_failover([self.master],
                                                  failover_nodes=chosen,
                                                  graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(
                    self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(
            node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg=msg)

    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master,
                                              self.gen_initial_create,
                                              "create", 0, flag=2,
                                              batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers,
                                         check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        """ Kick off the create/update/delete loads selected by self.doc_ops
            and wait for them; load failures are logged, not raised. """
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception, ex:
            # Best-effort load: mutation errors are expected under failover.
            self.log.info(ex)
class FailoverTests(FailoverBaseTest):
    # NOTE(review): this legacy class is shadowed by a later redefinition of
    # FailoverTests in the same module; it appears to be dead code kept around
    # from an older revision -- confirm before relying on it.

    def setUp(self):
        # NOTE(review): passes `self` explicitly in addition to the bound
        # receiver -- the base setUp presumably accepts an extra argument;
        # the newer class calls setUp() without it. Verify against
        # FailoverBaseTest.
        super(FailoverTests, self).setUp(self)

    def tearDown(self):
        # NOTE(review): same extra-`self` pattern as setUp above.
        super(FailoverTests, self).tearDown(self)

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case (before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL)
            4.1 Rebalance the cluster is failover of K nodeStatuses
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats,
               replicaiton, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.referenceNode = self.master
        if self.failoverMaster:
            self.referenceNode = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case"
                      .format(self.referenceNode.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.referenceNode)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()
        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False
        # Do not run this this test if graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or
                                                  (self.recoveryType != None)):
            self.log.error("Graceful failover can't be applied to nodes with version less then 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return
        # Find nodes that will under go failover
        self.chosen = RebalanceHelper.pick_nodes(self.referenceNode,
                                                 howmany=self.num_failed_nodes)
        # Perform operations - Create/Update/Delete
        # self.withOps = True => Run Operations in parallel to failover
        # self.withOps = False => Run Operations Before failover
        self.ops_tasks = self.run_operation_tasks()
        # Perform View Creation Tasks and check for completion if required before failover
        if self.runViews:
            self.run_view_creation_operations(self.servers)
            if not self.runViewsDuringFailover:
                # NOTE(review): creates the views a second time before
                # monitoring -- looks intentional in this legacy revision,
                # but verify.
                self.run_view_creation_operations(self.servers)
                self.monitor_view_tasks(self.servers)
        # Take snap-shot of data set used for validaiton
        record_static_data_set = self.get_data_set_all(self.servers,
                                                       self.buckets,
                                                       path=None)
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers,
                                                         self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers,
                                                          self.buckets)
        # Perform Operations relalted to failover
        self.run_failover_operations(self.chosen, failover_reason)
        # Perform Add Back Operation with Rebalance Or only Rebalance with Verificaitons
        if not self.gracefulFailoverFail:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen,
                                                       prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen,
                                                             prev_vbucket_stats,
                                                             record_static_data_set,
                                                             prev_failover_stats)

    def run_rebalance_after_failover_and_verify(self, chosen,
                                                prev_vbucket_stats,
                                                record_static_data_set,
                                                prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        self.sleep(60, "after failover before invoking rebalance...")
        _servers_ = self.filter_servers(self.servers, chosen)
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.referenceNode.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.referenceNode)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.referenceNode)
        try:
            # Run operations if required during rebalance after failover
            if self.withOps:
                for task in self.ops_tasks:
                    task.result()
            msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg=msg)
            # Drain Queue and make sure intra-cluster replication is complete
            self._verify_stats_all_buckets(_servers_, timeout=120)
            self._wait_for_stats_all_buckets(_servers_)
            self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
            # Verify all data set with meta data if failover happens after failover
            if not self.withOps:
                self.data_analysis_all(record_static_data_set, _servers_,
                                       self.buckets, path=None)
            # Check Cluster Stats and Data as well if max_verify > 0
            self.verify_cluster_stats(_servers_, self.referenceNode)
            # If views were created they can be verified
            if self.runViews:
                if self.runViewsDuringFailover:
                    self.monitor_view_tasks(_servers_)
                self.verify_query_task()
            # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
            # Currently, only for checking case where we have graceful failover
            if self.version_greater_than_2_5 and self.graceful and self.upr_check:
                new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
                new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
                self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
            self.log.info("End VERIFICATION for Rebalance after Failover Only")
        finally:
            # Always undo the credential/port change so later tests can talk
            # to the cluster.
            if self.during_ops:
                if self.during_ops == "change_password":
                    self.change_password(new_password=old_pass)
                elif self.during_ops == "change_port":
                    self.change_port(new_port='8091',
                                     current_port=self.input.param("new_port", "9090"))

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats,
                                          record_static_data_set,
                                          prev_failover_stats):
        """ Method to run add-back operation with recovery type = (delta/full)
            It also verifies if the operations are correct with data verificaiton steps """
        serverMap = self.get_server_map(self.servers)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets,
                                                   serverMap)
        index = 0
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id,
                                            recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        # Run operations if required during rebalance after failover
        if self.withOps:
            for task in self.ops_tasks:
                task.result()
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._verify_stats_all_buckets(self.servers, timeout=120)
        self._wait_for_stats_all_buckets(self.servers)
        self.log.info("Begin VERIFICATION for Add-back and rebalance")
        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, serverMap, self.buckets,
                                      recoveryTypeMap, fileMapsForVerification)
        # Comparison of all data if required
        if not self.withOps:
            self.data_analysis_all(record_static_data_set, self.servers,
                                   self.buckets, path=None)
        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers, self.referenceNode)
        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats,
                                                            self.servers,
                                                            self.buckets,
                                                            perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats,
                                                             self.servers,
                                                             self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                 new_failover_stats)
        # Peform View Validation if Supported
        if self.runViews:
            if self.runViewsDuringFailover:
                self.monitor_view_tasks(self.servers)
            self.verify_query_task()
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.referenceNode))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based
            on failover reason """
        # Perform Operations relalted to failover
        for node in chosen:
            if failover_reason == 'stop_server':
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # define precondition check for failover
            failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
            # Check for negative cases
            if self.graceful and (failover_reason in ['stop_server', 'firewall']):
                # Graceful failover of an UNHEALTHY node must not start.
                if failed_over:
                    # MB-10479
                    self.rest.print_UI_logs()
                self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
                return
            elif self.gracefulFailoverFail and failed_over:
                """ Check if the fail_over fails as expected """
                self.assertTrue(not failed_over, """ Graceful failover should fail due to not enough replicas """)
                return
            # Check if failover happened as expected or re-try one more time
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in 60 seconds..")
                # try again in 75 seconds
                self.sleep(75)
                failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
            if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
                reached = RestHelper(self.rest).rebalance_reached()
                self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")

    def run_operation_tasks(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.referenceNode,
                                              self.gen_initial_create,
                                              "create", 0)
        for task in tasks:
            task.result()
        self._verify_stats_all_buckets(self.servers, timeout=120)
        self._wait_for_stats_all_buckets(self.servers)
        # Update or Delete buckets if items > 0 and options are passed in tests
        # These can run in parallel (withOps = True), or before (withOps = True)
        ops_tasks = []
        if ("create" in self.doc_ops):
            # NOTE(review): reuses gen_update for the "create" load -- verify.
            ops_tasks += self._async_load_all_buckets(self.referenceNode,
                                                      self.gen_update,
                                                      "create", 0)
        if ("update" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode,
                                                      self.gen_update,
                                                      "update", 0)
        if ("delete" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode,
                                                      self.gen_delete,
                                                      "delete", 0)
        if not self.withOps:
            for task in ops_tasks:
                task.result()
            self._wait_for_stats_all_buckets(self.servers)
            self._verify_stats_all_buckets(self.servers, timeout=120)
        return ops_tasks

    def define_maps_during_failover(self, recoveryType=[]):
        """ Method to define nope ip, recovery type map """
        # NOTE(review): mutable default arg; harmless here because it is never
        # mutated, only read.
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
                index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over """
        # Deep copy so the caller's server list is left untouched.
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[],
                                 recoveryTypeMap={}, fileMap={}):
        """ Verify recovery type is delta or full

            A marker file planted before failover survives delta recovery but
            is wiped by full recovery; its presence tells which one happened.
        """
        logic = True
        summary = ""
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if recoveryTypeMap[server.ip] == "delta" and not exists:
                    logic = False
                    summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                elif recoveryTypeMap[server.ip] == "full" and exists:
                    logic = False
                    summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """" Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)  # unused in this revision
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000;
        query["full_set"] = "true"
        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views,
                                           is_dev_ddoc, different_map=False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp,
                                                 bucket)
            views += temp
            tasks += temp_tasks
        timeout = max(self.wait_timeout * 4,
                      len(self.buckets) * self.wait_timeout * self.num_items / 50000)  # unused in this revision
        for task in tasks:
            task.result(self.wait_timeout * 20)
        for bucket in self.buckets:
            for view in views:
                # run queries to create indexes
                self.cluster.query_view(self.master, prefix + ddoc_name,
                                        view.name, query)
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(
            servers, "indexer", "_design/" + prefix + ddoc_name,
            wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        active_tasks = self.cluster.async_monitor_active_task(
            servers, "indexer", "_design/" + prefix + ddoc_name,
            wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000;
        query["full_set"] = "true"
        expected_rows = None
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query,
                                        bucket=bucket, wait_time=2400,
                                        expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """ Created files in data paths for checking if delta/full recovery occured """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            map = {}
            for bucket in buckets:
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def get_server_map(self, node):
        """ Map of ips and server information """
        # NOTE(review): `node` parameter is unused; the map covers all servers.
        map = {}
        for server in self.servers:
            map[server.ip] = server
        return map

    def stop_server(self, node):
        """ Method to stop a server which is subject to failover """
        for server in self.servers:
            if server.ip == node.ip:
                shell = RemoteMachineShellConnection(server)
                if shell.is_couchbase_installed():
                    shell.stop_couchbase()
                    self.log.info("Couchbase stopped")
                else:
                    shell.stop_membase()
                    self.log.info("Membase stopped")
                shell.disconnect()
                break
class FailoverTests(FailoverBaseTest):
    """Failover scenarios: hard/graceful failover, rebalance-out, and add-back
    (delta/full recovery), with optional concurrent mutations, views and
    compaction, followed by data/stat verification."""

    def setUp(self):
        super(FailoverTests, self).setUp()
        # ip -> server lookup used by file-marker / recovery-type checks.
        self.server_map = self.get_server_map(self.servers)

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """Main test body which contains the flow of the failover basic steps.

        1. Starts Operations if programmed into the test case (before/after)
        2. Start View and Index Building operations
        3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
        4.1 Rebalance the cluster after failover of K nodes, OR
        4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
        5. Verify all expected operations completed by checking stats,
           replication, views, data correctness

        :param failover_reason: 'normal', 'stop_server' or 'firewall'
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info("Picking node {0} as reference node for test case".format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()
        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            # lexicographic compare: any node whose version string sorts
            # below "3" marks the cluster as pre-3.0
            if "3" > version:
                self.version_greater_than_2_5 = False
        # Do not run this test if graceful category is being used on pre-3.0
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)):
            self.log.error("Graceful failover can't be applied to nodes with version less then 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return
        # Find nodes that will under go failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_failed_nodes)
        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform view creation tasks and wait for completion before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)
        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()
        # Take snap-shot of data set used for validaiton
        record_static_data_set = dict()
        prev_vbucket_stats = dict()
        prev_failover_stats = dict()
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path=None)
        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)
        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)
        # TODO: Enable this even when 'flusher_total_batch_limit' is not set
        if self.flusher_total_batch_limit and self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()
        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()
        # Add back + rebalance // only rebalance with verification
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()
        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list, master_node=self.master)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """Rebalance the failed-over nodes out of the cluster and verify.

        :param chosen: nodes that were failed over (to be ejected)
        :param prev_vbucket_stats/prev_failover_stats: pre-failover snapshots
            (only compared for graceful failover on >= 2.5 with upr_check)
        :param record_static_data_set: pre-failover data snapshot for
            data_analysis_all (only when no concurrent mutations)
        """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Peform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()
        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen])
        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090"))
            # NOTE(review): verification below is skipped entirely whenever
            # during_ops was set
            return
        # Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True, check_ep_items_remaining=True)
        # Verify all data set with meta data if failover happens after failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None, addedItems=None)
        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently, only for checking case where we have graceful failover
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0 , total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """Add the failed-over nodes back (recoveryType delta/full), rebalance,
        and verify the recovery type via the data-dir marker files plus the
        usual stat/data verifications.
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        # Marker files: survive delta recovery, wiped by full recovery.
        fileMapsForVerification = self.create_file(chosen, self.buckets, self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)
        # Doc_mutation before triggering rebalance
        if self.flusher_total_batch_limit and self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()
        self.sleep(20, "After failover before invoking rebalance...")
        # ejectedNodes=[] — nodes are being added back, not removed.
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[], deltaRecoveryBuckets=self.deltaRecoveryBuckets)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Peform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()
        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[], deltaRecoveryBuckets=self.deltaRecoveryBuckets)
            self.sleep(10, "Wait for rebalance to start")
        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self.log.info("Begin VERIFICATION for Add-back and rebalance")
        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True, check_ep_items_remaining=True)
        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets, recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets)
        # Comparison of all data if required
        if not self.withMutationOps and self.flusher_total_batch_limit is None:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None, addedItems=None)
        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets, perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0 , total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """Log the key test parameters for this run."""
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """Fail over each chosen node (hard or graceful) per ``failover_reason``,
        handling the negative cases (graceful failover of an unhealthy node
        must fail; insufficient replicas must fail when gracefulFailoverFail).
        """
        # Perform Operations relalted to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10), msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append (node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(120)
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    self.sleep(180)
                msg = "graceful failover failed for nodes {0}".format(node.id)
                self.log.info("chosen: {0} get_failover_count: {1}".format(len(chosen), self.get_failover_count()))
                self.assertEqual(len(chosen), self.get_failover_count(), msg=msg)
            else:
                msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over
        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over, """ Graceful failover should fail due to not enough replicas """)
            return
        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in 60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0 , total_vbuckets=self.total_vbuckets, type="failover", graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """Fail over the chosen nodes asynchronously while running compaction,
        view queries and/or mutations in parallel, then wait for completion.
        """
        # Perform Operations relalted to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append (node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers, chosen)
        # Async failover so the ops below run concurrently with it.
        failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        # NOTE(review): `node` here is the last node of the loop above.
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_initial_data(self):
        """Load the initial document set into all buckets and wait until
        persisted/verified."""
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag=2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        """Run the create/update/delete loads selected by ``self.doc_ops``;
        task failures are logged, not raised (best-effort during failover)."""
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def run_mutation_operations_after_failover(self):
        """Same as run_mutation_operations but with the post-failover
        document generators."""
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.afterfailover_gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.afterfailover_gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.afterfailover_gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def define_maps_during_failover(self, recoveryType=[]):
        """Build ``{node_ip: recovery_type}`` for the chosen nodes.
        NOTE(review): mutable default argument — safe only because it is
        never mutated here.
        """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
                index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """Return a deep copy of ``original_servers`` minus the failed-over
        ones; the removed servers are queued for cleanup."""
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[], recoveryTypeMap={}, fileMap={}, deltaRecoveryBuckets=[]):
        """Assert delta/full recovery actually happened per node and bucket.

        A surviving check.txt marker means the data dir was preserved (delta);
        a missing marker means it was wiped (full). Skipped on Windows.
        """
        summary = ""
        logic = True
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info()
            if os_type.type.lower() == 'windows':
                return
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if deltaRecoveryBuckets != None:
                    # Only the buckets listed in deltaRecoveryBuckets should
                    # have been delta-recovered.
                    if recoveryTypeMap[server.ip] == "delta" and (bucket.name in deltaRecoveryBuckets) and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "delta" and (bucket.name not in deltaRecoveryBuckets) and exists:
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
                        logic = False
                else:
                    if recoveryTypeMap[server.ip] == "delta" and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "full" and exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """Create the default views on every bucket and wait for the
        creation tasks to finish."""
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views, is_dev_ddoc, different_map=False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks
        # Scale the (currently unused) timeout with the data volume.
        timeout = max(self.wait_timeout * 4, len(self.buckets) * self.wait_timeout * self.num_items // 50000)
        for task in tasks:
            task.result(self.wait_timeout * 20)

    def query_and_monitor_view_tasks(self, servers):
        """Run the verification queries, then wait for the indexer tasks on
        ``_design/[dev_]ddoc1`` to complete."""
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """Run verification queries (stale=false) against every bucket;
        uses an extended timeout when the full data set is resident."""
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        expected_rows = None
        timeout = None
        if self.active_resident_threshold == 0:
            timeout = 2400
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket, wait_time=timeout, expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """Create a marker file (check.txt) in every bucket's data directory
        so delta vs full recovery can be detected later.

        Returns ``{server_ip: {bucket_name: data_dir_path}}``.
        NOTE(review): iterates ``self.chosen``, not the ``chosen`` parameter.
        """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            type = shell.extract_remote_info().distribution_type  # NOTE(review): shadows builtin `type`
            map = {}
            for bucket in buckets:
                if type.lower() == 'windows':
                    # Windows nodes use the fixed install-path data dir.
                    self.data_path = 'c:/Program\ Files/Couchbase/Server/var/lib/couchbase/data'
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def verify_failover_type(self, chosen=None, graceful_count=0, replica_count=0, unreachable=False):
        """Assert whether graceful failover is possible for ``chosen`` matches
        expectation, based on replica count, nodes already gracefully failed
        over, and reachability.

        Returns ``(updated_graceful_count, graceful_failover_possible)``.
        """
        logic = True
        summary = ""
        nodes = self.rest.node_statuses()
        node_count = len(nodes)
        change_graceful_count = graceful_count
        graceful_failover = True
        if unreachable:
            node_count -= 1
        else:
            change_graceful_count += 1
        if replica_count != 0:
            for node in nodes:
                if unreachable and node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    # An unreachable node must be hard-failed-over.
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for unreachable node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
                elif node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    # Graceful is expected only while enough nodes remain to
                    # host all replicas.
                    if replica_count > graceful_count and (node_count - 1) + graceful_count >= replica_count:
                        if not node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for node {0} Expected :: Graceful, Actual :: Hard".format(node.ip)
                    else:
                        if node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        else:
            # No replicas: graceful failover is never expected.
            for node in nodes:
                if node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        self.assertTrue(logic, summary)
        return change_graceful_count, graceful_failover

    def get_server_map(self, node):
        """Return ``{ip: server}`` for every server in the test config (``node`` is unused)."""
        map = {}
        for server in self.servers:
            map[server.ip] = server
        return map

    def victim_node_operations(self, node=None):
        """Disrupt victim nodes mid-operation: stop a graceful failover, kill
        memcached, stop/start the server, or toggle the firewall; ends with a
        120 s settle sleep."""
        if self.stopGracefulFailover:
            self.log.info(" Stopping Graceful Failover ")
            stopped = self.rest.stop_rebalance(wait_timeout=self.wait_timeout // 3)
            self.assertTrue(stopped, msg="unable to stop rebalance")
        if self.killNodes:
            self.log.info(" Killing Memcached ")
            kill_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for kill_node in kill_nodes:
                self.kill_server_memcached(kill_node)
        if self.stopNodes:
            self.log.info(" Stopping Node")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.stop_server(stop_node)
            self.sleep(10)
            self.log.info(" Starting Node")
            for start_node in stop_nodes:
                self.start_server(start_node)
        if self.firewallOnNodes:
            self.log.info(" Enabling Firewall for Node ")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.start_firewall_on_node(stop_node)
            self.sleep(30)
            self.log.info(" Disable Firewall for Node ")
            for start_node in stop_nodes:
                self.stop_firewall_on_node(start_node)
        self.sleep(120)

    def get_failover_count(self):
        """Count cluster nodes whose clusterMembership is 'inactiveFailed'."""
        rest = RestConnection(self.master)
        cluster_status = rest.cluster_status()
        failover_count = 0
        # check for inactiveFailed
        for node in cluster_status['nodes']:
            if node['clusterMembership'] == "inactiveFailed":
                failover_count += 1
        return failover_count
    def common_test_body(self, keys_count, failover_reason):
        """Legacy failover flow: load data, fail over ``num_replicas`` nodes
        for ``failover_reason`` ('stop_server'/'firewall'/other), then either
        add the nodes back or rebalance them out, and verify cluster stats.

        :param keys_count: number of keys loaded (logged only here)
        :param failover_reason: how the node is taken down before failover
        """
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))
        self._load_all_buckets(self.master, self.gen_create, "create", 0, batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)
        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()
        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        # One node failed over per configured replica.
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            # let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300), msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs():
                        self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in 60 seconds..")
                # try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)
        if self.add_back_flag:
            # Re-add the failed-over nodes and rebalance with no ejections.
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes], ejectedNodes=[node.id for node in chosen])
            if self.during_ops:
                self.sleep(5, "Wait for some progress in rebalance")
                if self.during_ops == "change_password":
                    old_pass = self.master.rest_password
                    self.change_password(new_password=self.input.param("new_password", "new_pass"))
                    rest = RestConnection(self.master)
                elif self.during_ops == "change_port":
                    self.change_port(new_port=self.input.param("new_port", "9090"))
                    rest = RestConnection(self.master)
            try:
                msg = "rebalance failed while removing failover nodes {0}".format(chosen)
                self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
                # Drop the ejected servers from the verification set.
                for failed in chosen:
                    for server in _servers_:
                        if server.ip == failed.ip:
                            _servers_.remove(server)
                            self._cleanup_nodes.append(server)
                log.info("Begin VERIFICATION ...")
                RebalanceHelper.wait_for_replication(_servers_, self.cluster)
                self.verify_cluster_stats(_servers_, self.master)
            finally:
                # Always restore the credentials/port changed mid-rebalance.
                if self.during_ops:
                    if self.during_ops == "change_password":
                        self.change_password(new_password=old_pass)
                    elif self.during_ops == "change_port":
                        self.change_port(new_port='8091', current_port=self.input.param("new_port", "9090"))