def reset_cluster(self, crash_warning=False):
    """Stop each cluster node, wipe its data directory and start it again.

    For every node in self.cluster.servers the data path is read over REST,
    the server is stopped, the data path (and, unless crash_warning is set,
    the crash directory) is cleaned up, and the server is started again.

    :param crash_warning: when True the crash directory is left untouched
    :return: None. Any exception is logged via self.log.error and swallowed.
    """
    try:
        for node in self.cluster.servers:
            shell = RemoteMachineShellConnection(node)
            try:
                # Read the data path before the node is stopped
                data_path = RestConnection(node).get_data_path()
                core_path = str(data_path).split("data")[0] + "crash/"
                # NOTE(review): isdir() inspects the local filesystem, not
                # the remote node - confirm this is intended
                if not os.path.isdir(core_path):
                    core_path = "/opt/couchbase/var/lib/couchbase/crash/"
                # Stop node
                self.stop_server(node)
                # Delete Path
                shell.cleanup_data_config(data_path)
                if not crash_warning:
                    shell.cleanup_data_config(core_path)
                self.start_server(node)
                if '.com' in node.ip or ':' in node.ip:
                    shell.update_dist_type()
            finally:
                # Release the shell even when one of the steps above fails
                shell.disconnect()
        # Wait after reset of cluster nodes
        sleep(10)
    except Exception as ex:
        self.log.error(ex)
def persistence_verification_per_node(rest, bucket, queue=None, timeout=1260):
    """Sample 'ep_flusher_todo' for `timeout` seconds and report the outcome.

    The stat is polled through rest.get_bucket_stats(bucket) roughly every
    0.5 seconds. Once the window elapses the 90th percentile and the average
    of the samples are logged, and a boolean is put on `queue`:
    False when both values are 0 (or when no sample could be collected),
    True otherwise.

    :param rest: object exposing get_bucket_stats(bucket)
    :param bucket: bucket whose stats are sampled
    :param queue: optional queue receiving the boolean result
    :param timeout: sampling window in seconds
    :return: None
    """
    stat_key = 'ep_flusher_todo'
    log = logger.get("infra")
    stats = []
    start = time.time()
    # Collect stats data points
    while time.time() - start <= timeout:
        _new_stats = rest.get_bucket_stats(bucket)
        if _new_stats and stat_key in _new_stats:
            stats.append(_new_stats[stat_key])
        else:
            log.error("Unable to obtain stats for bucket: %s" % bucket)
        # Pause in both cases so a node that returns no stats is not
        # hammered in a tight loop
        sleep(0.5)

    if stats:
        value_90th = ClusterOperationHelper.percentile(stats, 90)
        average = float(sum(stats)) / len(stats)
        log.info("90th percentile value is {0} and average {1}".format(
            value_90th, average))
        result = not (value_90th == 0 and average == 0)
    else:
        # Without samples nothing can be computed; report False instead of
        # dying with ZeroDivisionError and leaving the queue empty
        log.error("No '%s' samples collected for bucket: %s"
                  % (stat_key, bucket))
        result = False

    if queue is not None:
        queue.put(result)
def _http_session_get(self, api, params='', headers=None, session=None, timeout=120, verify=False): end_time = time.time() + timeout while True: try: headers['Connection'] = "keep-alive" response = session.get(api, headers=headers, timeout=timeout) status = response.status_code content = response.content if status in [200, 201, 202]: return True, content, response else: self.log.error(response.reason) return False, content, response except requests.exceptions.HTTPError as errh: self.log.error("HTTP Error {0}".format(errh)) except requests.exceptions.ConnectionError as errc: self.log.error("Error Connecting {0}".format(errc)) if time.time() > end_time: raise ServerUnavailableException(ip=self.ip) except requests.exceptions.Timeout as errt: self.log.error("Timeout Error: {0}".format(errt)) if time.time() > end_time: raise ServerUnavailableException(ip=self.ip) except requests.exceptions.RequestException as err: self.log.error("Something else: {0}".format(err)) sleep(3, log_type="infra")
def monitor_checkpoints(self, master, bucket, state, interval, max_allowed,
                        command_queue):
    """Poll checkpoint counts and print an alarm for every offender.

    Intended to run in its own thread. Each round it first checks
    command_queue for the "stop" command, then prints one line per
    vbucket/node entry whose state equals `state` and whose
    num_checkpoints exceeds max_allowed, and finally sleeps `interval`.

    :param master: node passed to get_checkpoints_from_cluster
    :param bucket: bucket passed to get_checkpoints_from_cluster
    :param state: checkpoint state to filter on
    :param interval: seconds to sleep between rounds
    :param max_allowed: highest num_checkpoints that raises no alarm
    :param command_queue: queue polled (non-blocking) for "stop"
    """
    while True:
        try:
            pending_command = command_queue.get_nowait()
            if pending_command and pending_command == "stop":
                break
        except Queue.Empty:
            # Nothing queued; carry on monitoring
            pass

        merged = self.get_checkpoints_from_cluster(master, bucket)
        alarms = [
            "Active vbucket %s num_checkpoints is %s"
            % (vb, attributes["num_checkpoints"])
            for vb, per_node in merged.items()
            for attributes in per_node.values()
            if attributes["state"] == state
            and int(attributes["num_checkpoints"]) > max_allowed
        ]
        for alarm in alarms:
            print(alarm)
        # Wait before next Queue check
        sleep(interval, log_type="infra")
def wait_until(value_getter, condition, timeout_secs=-1):
    """
    Repeatedly calls value_getter returning the value when it satisfies
    condition. Calls to value getter back off exponentially, but a single
    back-off sleep never extends past the timeout.
    Useful if you simply want to synchronously wait for a condition to be
    satisfied.

    :param value_getter: no-arg function that gets a value
    :param condition: single-arg function that tests the value
    :param timeout_secs: number of seconds after which to timeout; if
                         negative waits forever; default is to wait forever
    :return: the value returned by value_getter
    :raises: TimeoutException if the operation times out before getting a
             value that satisfies condition
    """
    start_time = time.time()
    stop_time = start_time + max(timeout_secs, 0)
    wait_forever = timeout_secs < 0
    interval = 0.01
    attempt = 0
    value = value_getter()
    while not condition(value):
        now = time.time()
        if not wait_forever and now >= stop_time:
            raise TimeoutException(
                'Timed out after {0} seconds and {1} attempts'.format(
                    now - start_time, attempt))
        delay = 2**attempt * interval
        if not wait_forever:
            # Without this clamp the doubling delay can overshoot the
            # deadline by a wide margin
            delay = min(delay, stop_time - now)
        sleep(delay, log_type="infra")
        attempt += 1
        value = value_getter()
    return value
def wait_for_mc_stats_no_timeout(master, bucket, stat_key, stat_value,
                                 timeout_in_seconds=-1, verbose=True):
    """Block until a memcached stat equals the expected value.

    Stats are first fetched with retries (2s apart) until a non-empty
    result arrives; afterwards they are re-read every 5 seconds until
    str(stats[stat_key]) == str(stat_value). There is no timeout.

    :param master: server whose ip is used for the memcached connection
    :param bucket: bucket name used for SASL authentication
    :param stat_key: stat to watch
    :param stat_value: value to wait for (compared as strings)
    :param timeout_in_seconds: unused; kept for signature compatibility
    :param verbose: log the current value on every poll
    :return: True once the stat matches
    :raises KeyError: if stat_key is absent from the fetched stats
    """
    log = logger.get("infra")
    log.info(
        "Waiting for bucket {0} stat : {1} to match {2} on {3}".format(
            bucket, stat_key, stat_value, master.ip))

    def _fetch_stats():
        # One authenticated stats read; the client is always closed
        client = MemcachedClient(master.ip, constants.memcached_port)
        try:
            client.sasl_auth_plain(bucket, '')
            return client.stats()
        finally:
            client.close()

    # keep retrying until reaches the server
    stats = dict()
    while not stats:
        try:
            stats = _fetch_stats()
        except Exception as e:
            stats = dict()
            sleep(2, "Exception: %s. Will retry.." % str(e),
                  log_type="infra")

    while str(stats[stat_key]) != str(stat_value):
        stats = _fetch_stats()
        if verbose:
            log.info("{0} : {1}".format(stat_key, stats[stat_key]))
        sleep(5, log_type="infra")
    return True
def wait_for_mc_stats(master, bucket, stat_key, stat_value,
                      timeout_in_seconds=120, verbose=True):
    """Wait up to timeout_in_seconds for a memcached stat to match.

    A fresh MemcachedClient is opened for every poll and always closed,
    even when reading the stats fails.

    :param master: server whose ip is used for the memcached connection
    :param bucket: bucket name (used for logging only)
    :param stat_key: stat to watch
    :param stat_value: expected value (compared as strings)
    :param timeout_in_seconds: polling window in seconds
    :param verbose: log intermediate values and poll every 2s instead of
                    every 0.1s
    :return: True if the stat matched within the window, else False
    """
    log = logger.get("infra")
    log.info("Waiting for bucket {0} stat: {1} to match {2} on {3}".format(
        bucket, stat_key, stat_value, master.ip))
    sleep_time = 2 if verbose else 0.1
    start = time.time()
    verified = False
    while (time.time() - start) <= timeout_in_seconds:
        c = MemcachedClient(master.ip, constants.memcached_port)
        try:
            stats = c.stats()
        finally:
            # Do not leak the connection when stats() raises
            c.close()
        stat_present = stats and stat_key in stats
        if stat_present and str(stats[stat_key]) == str(stat_value):
            log.info("{0} : {1}".format(stat_key, stats[stat_key]))
            verified = True
            break
        if stat_present and verbose:
            log.info("{0} : {1}".format(stat_key, stats[stat_key]))
        sleep(sleep_time, log_type="infra")
    return verified
def pause_resume_n(self, body, num):
    """Pause and resume the function described by `body`, `num` times.

    Each cycle calls pause_function, waits 30 seconds, then calls
    resume_function.

    :param body: argument forwarded to pause_function / resume_function
    :param num: number of pause/resume cycles
    """
    wait_note = ("Wait between pause_function & "
                 "before resume_function() call")
    for _ in range(num):
        self.pause_function(body)
        self.log.debug(wait_note)
        sleep(30)
        self.resume_function(body)
def wait_for_indexing_to_complete(self, bucket_name, target_index=None,
                                  timeout=60):
    """
    Waits till the indexes 'num_docs_queued' to reach '0',
    meaning all docs are indexed.

    :param bucket_name: Name of the bucket to validate
    :param target_index: Index_name to wait for. None means wait for all
    :param timeout: Approximate wait budget in seconds, counted in
                    2-second polling steps
    :return index_completed: Boolean value to tell the index is done or not
    """
    self.log.info("Wait for indexing queue to reach '0'")
    poll_interval = 2
    timer = 0
    while timer < timeout:
        stats = self.get_index_stats()
        docs_still_queued = any(
            index_stats["num_docs_queued"] != 0
            for index_name, index_stats in stats[bucket_name].items()
            if target_index is None or index_name == target_index)
        if not docs_still_queued:
            return True
        sleep(poll_interval, "Wait before next indexer stats query")
        # Advance the timer, otherwise the timeout can never expire
        timer += poll_interval
    return False
def change_env_variables(self):
    """Push vbucket-count / replication-type settings to every server.

    For each server the current COUCHBASE_NUM_VBUCKETS value is read from
    /opt/couchbase/bin/couchbase-server (1024 is assumed when nothing is
    found). When it differs from self.vbuckets, or self.upr is set, the
    matching environment variables are written through the remote client.
    Finally the method waits 10 seconds for services to come up.
    """
    # Built by concatenation: a backslash continuation inside the literal
    # would embed the source indentation in the command
    read_vbuckets_cmd = ("grep ^COUCHBASE_NUM_VBUCKETS "
                         "/opt/couchbase/bin/couchbase-server "
                         "| cut -d \"=\" -f 2")
    for server in self.cluster.servers:
        remote_client = RemoteMachineShellConnection(server)
        try:
            vb_on_node, _ = remote_client.execute_command(read_vbuckets_cmd)
            self.log.debug("Current vBuckets on node %s: %s"
                           % (server, vb_on_node))
            if vb_on_node:
                vb_on_node = int(vb_on_node[0])
            else:
                vb_on_node = 1024
            if self.vbuckets != vb_on_node or self.upr is not None:
                env_dict = {}
                if self.vbuckets:
                    env_dict["COUCHBASE_NUM_VBUCKETS"] = self.vbuckets
                if self.upr is not None:
                    if self.upr:
                        env_dict["COUCHBASE_REPL_TYPE"] = "upr"
                    else:
                        env_dict["COUCHBASE_REPL_TYPE"] = "tap"
                if env_dict:
                    remote_client.change_env_variables(env_dict)
        finally:
            # Close the shell even if parsing or the update fails
            remote_client.disconnect()
    self.log.debug("========= CHANGED ENVIRONMENT SETTING ===========")
    self.log.debug("Wait for all the services to come up after "
                   "change_env_vars update")
    sleep(10, log_type="infra")
def call(self):
    """Run this loader's create / update / delete workload.

    Iterates self.num_items times according to self.op_type:
      * "create": generates 1-10 message documents per item, keyed by the
        global message id, uploading them in batches of self.batch_size
      * "update": regenerates one document per item and uploads in batches
      * "delete": removes one document per item, ignoring failures
    self.loaded is incremented once per item. Any exception escaping the
    loop is printed and stored on self.exception instead of being raised.

    NOTE(review): documents left in a final, partially filled batch are
    never uploaded - confirm whether that is intended.
    """
    self.thread_used = threading.currentThread().getName()
    self.started = time.time()

    def _upload_batch(batch, use_bulk_set):
        # Upload one batch; on failure wait 20s and retry once via upsert.
        # Bare excepts kept from the original - presumably needed to catch
        # Java SDK exceptions under Jython; verify before narrowing.
        try:
            if use_bulk_set:
                doc_op().bulkSet(self.msg_bucket, batch)
            else:
                doc_op().bulkUpsert(self.msg_bucket, batch)
        except:
            sleep(20, "Exception in Java SDK - create")
            try:
                doc_op().bulkUpsert(self.msg_bucket, batch)
            except:
                print("skipping %s documents upload" % len(batch))

    try:
        temp = 0
        docs = []
        for i in xrange(self.num_items):
            start_message_id = global_vars.message_id
            if self.op_type == "create":
                for j in xrange(random.randint(1, 10)):
                    var = str(json.dumps(self.generate_GleambookMessages(
                        i + self.start_from, global_vars.message_id)))
                    user = JsonTranscoder().stringToJsonObject(var)
                    doc = JsonDocument.create(
                        str(global_vars.message_id), user)
                    docs.append(doc)
                    temp += 1
                    if temp == self.batch_size:
                        _upload_batch(docs, True)
                        temp = 0
                        docs = []
                    global_vars.message_id += 1
            elif self.op_type == "update":
                var = str(json.dumps(self.generate_GleambookMessages(
                    i + self.start_from, i + start_message_id)))
                user = JsonTranscoder().stringToJsonObject(var)
                doc = JsonDocument.create(str(i + start_message_id), user)
                docs.append(doc)
                # Count the document; without this the batch-size check
                # below never fires and updates are never uploaded
                temp += 1
                if temp == self.batch_size:
                    _upload_batch(docs, False)
                    temp = 0
                    docs = []
            elif self.op_type == "delete":
                try:
                    self.msg_bucket.remove(str(i + start_message_id))
                except:
                    # Best-effort delete: failures are deliberately ignored
                    pass
            self.loaded += 1
    except Exception as ex:
        import traceback
        traceback.print_exc()
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        self.exception = ex
def polling_delete_index(self, bucket=None, index=None, timeout=100):
    """Poll index_status() once a second until `index` is gone.

    :param bucket: object whose `name` keys the status dictionary
    :param index: name of the index expected to disappear
    :param timeout: maximum number of polls (about one second each)
    :return: True as soon as the index is absent from the status (a status
             without any entry for the bucket counts as absent), False if
             it is still listed after `timeout` polls
    """
    for x in range(timeout):
        result = self.index_status()
        if result.get(bucket.name, {}).get(index) is None:
            return True
        sleep(1)
        self.log.info("Index {} found with iteration {}".format(
            index, str(x)))
    return False
def wait_for_stats(master, bucket, stat_key, stat_value,
                   timeout_in_seconds=120, verbose=True):
    """Wait until a bucket stat read over REST equals stat_value.

    The timeout is a stall timeout, not a total one: it only starts
    counting while the observed value stops changing, and is cleared
    whenever the value changes again.

    :param master: server used to build the RestConnection
    :param bucket: bucket whose stats are read
    :param stat_key: stat to watch
    :param stat_value: value to wait for
    :param timeout_in_seconds: how long the value may stay unchanged
                               before giving up
    :param verbose: log intermediate values; also selects a 2s instead of
                    a 0.1s pause inside each poll
    :return: True when the stat matched, and also when collecting stats
             raised (see the TODO below); False when the value stalled
    """
    log = logger.get("infra")
    log.info(
        "waiting for bucket {0} stat : {1} to match {2} on {3}".format(
            bucket, stat_key, stat_value, master.ip))
    # 0 means "no stall deadline armed"
    time_to_timeout = 0
    previous_stat_value = -1
    curr_stat_value = -1
    verified = False
    while not verified:
        rest = RestConnection(master)
        try:
            stats = rest.get_bucket_stats(bucket)
            if stats and stat_key in stats and stats[
                    stat_key] == stat_value:
                log.info("{0} : {1}".format(stat_key, stats[stat_key]))
                verified = True
                break
            else:
                if stats and stat_key in stats:
                    if verbose:
                        log.info("{0} : {1}".format(
                            stat_key, stats[stat_key]))
                    curr_stat_value = stats[stat_key]
                # values are changing so clear any timeout
                if curr_stat_value != previous_stat_value:
                    time_to_timeout = 0
                else:
                    # Value unchanged: arm the deadline on the first stalled
                    # poll, give up once it has passed
                    if time_to_timeout == 0:
                        time_to_timeout = time.time() + timeout_in_seconds
                    if time_to_timeout < time.time():
                        log.info(
                            "no change in {0} stat after {1} seconds (value = {2})"
                            .format(stat_key, timeout_in_seconds,
                                    curr_stat_value))
                        break
                previous_stat_value = curr_stat_value
                sleep_time = 2
                if not verbose:
                    sleep_time = 0.1
                sleep(sleep_time)
        except:
            # NOTE(review): any failure here ends the wait and is reported
            # as success (verified = True)
            log.info(
                "unable to collect stats from server {0}".format(master))
            # TODO: throw ex and assume caller catches
            verified = True
            break
        # Additional fixed pause between polls, on top of sleep_time above
        sleep(5, "Wait before next stats check", log_type="infra")
    return verified
def polling_create_index_status(self, bucket=None, index=None, timeout=300):
    """Poll index_status() once a second until `index` reports 'Ready'.

    :param bucket: object whose `name` keys the status dictionary
    :param index: name of the index to wait for
    :param timeout: maximum number of polls (about one second each)
    :return: True once the index status is 'Ready', False if that did not
             happen within `timeout` polls
    """
    for x in range(timeout):
        result = self.index_status()
        # A bucket missing from the status is treated as "not ready yet"
        # rather than raising KeyError
        if bucket.name in result and index in result[bucket.name] \
                and result[bucket.name][index]['status'] == 'Ready':
            return True
        sleep(1)
        self.log.info("Index {} not found with iteration {}".format(
            index, str(x)))
    return False
def call(self):
    """Run this loader's create / update / delete workload.

    Iterates self.num_items times according to self.op_type. "create" and
    "update" build one document per item (key: i + self.start_from) and
    upload in batches of self.batch_size; "delete" removes one document
    per item. self.loaded is incremented once per item. Any exception
    escaping the loop is printed and stored on self.exception.

    NOTE(review): documents left in a final, partially filled batch are
    not uploaded by this method - confirm whether that is intended.
    """
    self.thread_used = threading.currentThread().getName()
    self.started = time.time()
    try:
        docs=[]
        keys=[]
        # Number of documents accumulated in the current batch
        temp=0
        for i in xrange(self.num_items):
            if self.op_type == "create":
                var = str(json.dumps(self.generate_GleambookUser(i+self.start_from)))
                user = JsonTranscoder().stringToJsonObject(var)
                doc = JsonDocument.create(str(i+self.start_from), user)
                docs.append(doc)
                temp += 1
                if temp == self.batch_size:
                    # Batch full: try bulkSet, on failure wait and retry
                    # once with bulkUpsert
                    try:
                        doc_op().bulkSet(self.bucket, docs)
                    except:
                        sleep(20, "Exception from Java SDK-create")
                        try:
                            doc_op().bulkUpsert(self.bucket, docs)
                        except:
                            print "GleambookUser_Docloader: skipping %s documents create"%len(docs)
                            pass
                    temp = 0
                    docs = []
                # response = self.bucket.insert(doc)
            elif self.op_type == "update":
                var = str(json.dumps(self.generate_GleambookUser(i+self.start_from)))
                user = JsonTranscoder().stringToJsonObject(var)
                doc = JsonDocument.create(str(i+self.start_from), user)
                docs.append(doc)
                temp += 1
                if temp == self.batch_size:
                    # Batch full: upsert, retrying once after a 20s wait
                    try:
                        doc_op().bulkUpsert(self.bucket, docs)
                    except:
                        sleep(20, "Exception from Java SDK - create")
                        try:
                            doc_op().bulkUpsert(self.bucket, docs)
                        except:
                            print "GleambookUser_Docloader: skipping %s documents upload"%len(docs)
                            pass
                    temp = 0
                    docs=[]
            elif self.op_type == "delete":
                # Failed removes are reported on stdout and otherwise ignored
                try:
                    response = self.bucket.remove(str(i+self.start_from))
                except:
                    print "Exception from Java SDK - remove"
            self.loaded += 1
    except Exception, ex:
        # Print the traceback and keep the exception on the instance
        import traceback
        traceback.print_exc()
        exc_info = sys.exc_info()
        traceback.print_exception(*exc_info)
        self.exception = ex
def with_sleep(method, *args):
    """Call method(*args) and pad its duration when rate limiting is on.

    The first positional argument is the instance the method is invoked
    on. When instance.rate_limited is truthy and the call finished faster
    than instance.thread_min_time, the remainder is slept.

    :param method: callable invoked as method(instance, *remaining_args)
    :param args: instance followed by the method's arguments
    :return: whatever `method` returned
    """
    instance = args[0]
    call_args = args[1:]
    began = time.time()
    outcome = method(instance, *call_args)
    elapsed = time.time() - began
    if instance.rate_limited:
        minimum = instance.thread_min_time
        if elapsed < minimum:
            sleep(minimum - elapsed)
    return outcome
def get_bucket_from_cluster(self, bucket, num_attempt=1, timeout=1):
    """Refresh `bucket` from pools/default/buckets/<name>?basic_stats=true.

    On a successful response the vbucket server map (servers, replica
    count, forward map, vbucket map), vbActiveNumNonResident and maxTTL
    are copied onto `bucket`. When every attempt fails the bucket is
    returned unchanged.

    :param bucket: bucket object to update; bucket.name selects the bucket
    :param num_attempt: total number of request attempts
    :param timeout: seconds to sleep between attempts
    :return: the same `bucket` object
    """
    def _vbucket_infos(vbucket_map, server_list):
        # One Bucket.vBucket per map entry: entry[0] is the master's index
        # in server_list, the rest are replica indexes (-1 = no replica)
        infos = list()
        for vb_id, vbucket in enumerate(vbucket_map):
            info = Bucket.vBucket()
            info.master = server_list[vbucket[0]]
            for server_index in vbucket[1:]:
                if server_index != -1:
                    info.replica.append(server_list[server_index])
            info.id = vb_id
            infos.append(info)
        return infos

    api = '%s%s%s?basic_stats=true' \
          % (self.baseUrl, 'pools/default/buckets/',
             urllib.quote_plus(bucket.name))
    status, content, _ = self._http_request(api)
    num = 1
    while not status and num_attempt > num:
        sleep(timeout, "Will retry to get %s" % api, log_type="infra")
        status, content, _ = self._http_request(api)
        num += 1
    if not status:
        return bucket

    parsed = json.loads(content)
    if 'vBucketServerMap' in parsed:
        vBucketServerMap = parsed['vBucketServerMap']
        serverList = vBucketServerMap['serverList']
        # NOTE(review): servers still accumulates across calls, as before
        bucket.servers.extend(serverList)
        if "numReplicas" in vBucketServerMap:
            bucket.replicaNumber = vBucketServerMap["numReplicas"]
        if 'vBucketMapForward' in vBucketServerMap:
            # Rebuilt from scratch, like vbuckets below, so repeated calls
            # do not keep appending to the forward map
            bucket.forward_map = _vbucket_infos(
                vBucketServerMap['vBucketMapForward'], serverList)
        # Replace the list to avoid appending through multiple calls
        bucket.vbuckets = _vbucket_infos(
            vBucketServerMap['vBucketMap'], serverList)
    bucket.vbActiveNumNonResident = 100
    if "vbActiveNumNonResident" in parsed["basicStats"]:
        bucket.vbActiveNumNonResident = \
            parsed["basicStats"]["vbActiveNumNonResident"]
    bucket.maxTTL = parsed["maxTTL"]
    return bucket
def _http_request(self, api, method='GET', params='', headers=None,
                  timeout=120):
    """Send one HTTP request via httplib2, retrying on connection errors.

    :param api: URL to call
    :param method: HTTP method
    :param params: request body; redacted from the error log when it
                   mentions access keys, secret keys or passwords
    :param headers: request headers; self._create_headers() when falsy
    :param timeout: per-request timeout and overall retry budget in seconds
    :return: tuple (ok, content, response); ok is True for 200/201/202
    :raises ServerUnavailableException: when socket / server-not-found
            errors persist after the retry budget is exhausted
    """
    headers = headers or self._create_headers()
    deadline = time.time() + timeout
    sensitive_markers = ("accesskey", "secretaccesskey",
                         "password", "secretkey")
    while True:
        try:
            response, content = httplib2.Http(timeout=timeout).request(
                api, method, params, headers)
            if response['status'] in ('200', '201', '202'):
                return True, content, response

            # Failure response: extract a reason and log the details
            try:
                json_parsed = json.loads(content)
            except ValueError:
                json_parsed = {
                    "error": "status: {0}, content: {1}"
                    .format(response['status'], content)}
            reason = json_parsed["error"] if "error" in json_parsed \
                else "unknown"

            lowered_params = params.lower()
            if any(marker in lowered_params for marker in sensitive_markers):
                logged_body = "Body is being redacted because it contains sensitive info"
            else:
                logged_body = params
            message = '{0} {1} body: {2} headers: {3} ' \
                      'error: {4} reason: {5} {6} {7}'.\
                format(method, api, logged_body, headers,
                       response['status'], reason,
                       content.rstrip('\n'),
                       RestConnection.get_auth(headers))
            self.log.error(message)
            self.log.debug(''.join(traceback.format_stack()))
            return False, content, response
        except socket.error as e:
            self.log.error("Socket error while connecting to {0}. "
                           "Error {1}".format(api, e))
            if time.time() > deadline:
                raise ServerUnavailableException(ip=self.ip)
        except httplib2.ServerNotFoundError as e:
            self.log.error("ServerNotFoundError while connecting to {0}. "
                           "Error {1}".format(api, e))
            if time.time() > deadline:
                raise ServerUnavailableException(ip=self.ip)
        # Reached only after a connection-level failure: pause, then retry
        sleep(3, log_type="infra")
def vbucket_map_ready(self, bucket, timeout_in_seconds=360):
    """Wait until get_vbuckets(bucket) returns a non-empty result.

    :param bucket: bucket to query
    :param timeout_in_seconds: how long to keep polling (every 0.5s)
    :return: True once vbuckets are reported, False (after a warning is
             logged) when the timeout elapses first
    """
    deadline = time.time() + timeout_in_seconds
    while time.time() <= deadline:
        if self.get_vbuckets(bucket):
            return True
        sleep(0.5, "Wait before retrying get_vbs call", log_type="infra")
    self.log.warn(
        'Vbucket map not ready for bucket {0} after waiting {1} seconds'
        .format(bucket, timeout_in_seconds))
    return False
def wait_for_undeployment(self, name, iterations=20):
    """Wait until `name` is no longer among the running eventing apps.

    Sleeps 30 seconds before the first check and between checks.

    :param name: eventing function name
    :param iterations: maximum number of re-checks after the first one
    :raises Exception: if the function is still running after the last
                       check
    """
    sleep(30, "Waiting for undeployment of function...")
    result = self.eventing_helper.get_running_eventing_apps()
    count = 0
    while name in result and count < iterations:
        sleep(30, "Waiting for undeployment of function...")
        count += 1
        result = self.eventing_helper.get_running_eventing_apps()
    # Decide on the observed state, not the counter: undeployment that
    # completes on the very last check is still a success
    if name in result:
        raise Exception('Eventing took lot of time to undeploy')
def wait_for_bootstrap_to_complete(self, name, iterations=20):
    """Wait until `name` shows up among the deployed eventing apps.

    :param name: eventing function name
    :param iterations: maximum number of re-checks, 30 seconds apart
    :raises Exception: if the function is still not deployed after the
                       last check
    """
    result = self.eventing_helper.get_deployed_eventing_apps()
    count = 0
    while name not in result and count < iterations:
        sleep(30, "Waiting for eventing node to complete bootstrap")
        count += 1
        result = self.eventing_helper.get_deployed_eventing_apps()
    # Decide on the observed state, not the counter: a bootstrap that
    # completes on the very last check is still a success
    if name not in result:
        raise Exception(
            'Eventing took lot of time to come out of bootstrap state or did not successfully bootstrap'
        )
def setup_curl(self):
    """Start the curl docker container and run its setup step.

    Runs 'python scripts/curl_setup.py start', waits 10 seconds for docker
    to come up, then runs 'python scripts/curl_setup.py setup'.

    :raises Exception: when either script exits with a non-zero status
    """
    o = os.system('python scripts/curl_setup.py start')
    # Check the exit status first so a failed start is neither reported
    # as started nor waited on
    if o != 0:
        self.log.info("script result {}".format(o))
        raise Exception("unable to start docker")
    self.log.info("=== started docker container =======")
    sleep(10, "Wait for docker to start", log_type="infra")

    o = os.system('python scripts/curl_setup.py setup')
    if o != 0:
        self.log.info("script result {}".format(o))
        raise Exception("curl setup fail")
    self.log.info("=== setup done =======")
def wait_for_handler_state(self, name, status, iterations=20):
    """Wait until handler `name` reports composite status `status`.

    Sleeps 20 seconds up front and before every status check.

    :param name: eventing handler name
    :param status: composite_status value to wait for
    :param iterations: maximum number of status checks
    :raises Exception: if the handler has not reached `status` after the
                       last check
    """
    wait_message = "Waiting for %s to %s..." % (name, status)
    sleep(20, wait_message)
    # Result deliberately discarded, as in the original implementation
    _ = self.eventing_helper.get_composite_eventing_status()
    count = 0
    composite_status = None
    while composite_status != status and count < iterations:
        sleep(20, wait_message)
        result = self.eventing_helper.get_composite_eventing_status()
        for app in result['apps']:
            if app['name'] == name:
                composite_status = app['composite_status']
        count += 1
    # Decide on the observed state, not the counter: reaching the state on
    # the very last check is still a success
    if composite_status != status:
        raise Exception('Eventing took lot of time for handler %s to %s'
                        % (name, status))
def start(self):
    """Start a continuous replication from the source cluster's master.

    The replication id returned by the REST call is stored on the
    instance, the start audit event is validated, and the method then
    pauses for 10 seconds.
    """
    source_rest = RestConnection(self.__src_cluster.master)
    self.__rep_id = source_rest.start_replication(
        REPLICATION_TYPE.CONTINUOUS,
        self.__from_bucket,
        self.__remote_cluster_ref.name,
        rep_type=self.__rep_type,
        toBucket=self.__to_bucket,
        xdcr_params=self.__convert_test_to_xdcr_params())
    self.__validate_start_audit_event()
    # Creating another replication while the pipeline updater is still
    # processing this one does not work until the previous pipeline is
    # updated, so keep a 10s gap between replications.
    sleep(10, "Wait between replications")
def polling_create_index_status(self, bucket=None, index=None, timeout=60,
                                sleep_time=10):
    """Poll index_status() until `index` on `bucket` reports 'Ready'.

    :param bucket: object whose `name` keys the status dictionary
    :param index: name of the index to wait for
    :param timeout: maximum number of polls (not seconds)
    :param sleep_time: seconds to sleep between polls
    :return: True once the index status is 'Ready', False if that did not
             happen within `timeout` polls
    """
    self.log.info("Starting polling for index:" + str(index))
    for x in range(timeout):
        result = self.index_status()
        print(result)
        if bucket.name in result and index in result[bucket.name] \
                and result[bucket.name][index]['status'] == 'Ready':
            return True
        sleep(sleep_time)
        self.log.info("Index {} not found with iteration {}".format(
            index, str(x)))
    return False
def execute(self):
    """Run the failover, wait for nodes to go pending and record the result.

    On success the task state becomes FINISHED and the result is True.
    Any exception (FailoverFailedException included) also finishes the
    task and is recorded through set_unexpected_exception.
    """
    try:
        self._failover_nodes(self.task_manager)
        notice = "{0} seconds sleep after failover for nodes to go pending...."
        self.test_log.debug(notice.format(self.wait_for_pending))
        sleep(self.wait_for_pending)
        self.state = FINISHED
        self.set_result(True)
    except (FailoverFailedException, Exception) as failure:
        self.state = FINISHED
        self.set_unexpected_exception(failure)
def _handle_op(self, op, retries=5): """ sends op to mcd. Then it recvs response if the received response is for another op then it will attempt to get the next response and retry 5 times.""" self.ops[op.opaque] = op self.send_op(op) response = None while retries > 0: response = self.recv_op(op) if response: break retries -= 1 sleep(1, "Retrying recv_op..") return response
def wait_for_stats_no_timeout(master, bucket, stat_key, stat_value,
                              timeout_in_seconds=-1, verbose=True):
    """Block until a bucket stat read over REST equals stat_value.

    There is no timeout. A stat missing from the response is treated
    as -1.

    :param master: server used to build the RestConnection
    :param bucket: bucket whose stats are read
    :param stat_key: stat to watch
    :param stat_value: value to wait for
    :param timeout_in_seconds: unused; kept for signature compatibility
    :param verbose: log the current value after every re-read
    :return: True once the stat matches
    """
    infra_log = logger.get("infra")
    announcement = "Waiting for bucket {0} stat: {1} to match {2} on {3}"
    infra_log.info(announcement.format(
        bucket, stat_key, stat_value, master.ip))
    connection = RestConnection(master)
    snapshot = connection.get_bucket_stats(bucket)
    while snapshot.get(stat_key, -1) != stat_value:
        snapshot = connection.get_bucket_stats(bucket)
        if verbose:
            current = snapshot.get(stat_key, -1)
            infra_log.info("{0} : {1}".format(stat_key, current))
        sleep(5, log_type="infra")
    return True
def stop_task(self, task):
    """Give a task up to 30 seconds to finish, then cancel it.

    Does nothing when task.thread_name has no entry in self.futures.

    :param task: task whose thread_name keys self.futures
    """
    if task.thread_name not in self.futures:
        return
    future = self.futures[task.thread_name]
    waited = 0
    while not future.isDone() and waited < 30:
        sleep(1, "Wait for %s to complete. Current status: %s"
              % (task.thread_name, future.isDone()), log_type="infra")
        waited += 1

    # A while/else clause would run on every normal loop exit, including
    # the timed-out case, so branch on the actual state instead
    if future.isDone():
        self.log.debug(
            "Task %s is already finished. No need to stop task"
            % task.thread_name)
    else:
        self.log.debug("Stopping task %s" % task.thread_name)
        future.cancel(True)
def remove_node(self, otpnode=None, wait_for_rebalance=True):
    """Eject the given nodes from the cluster, retrying once on failure.

    :param otpnode: list of node objects (each exposing `id`) to eject;
                    None means there is nothing to remove
    :param wait_for_rebalance: forwarded to RestHelper.remove_nodes
    :return: None
    """
    if otpnode is None:
        # len(None) would raise below; nothing was requested
        return
    nodes = self.rest.node_statuses()
    # This is the case when master node is running cbas service as well
    if len(nodes) <= len(otpnode):
        return

    helper = RestHelper(self.rest)
    known_ids = [node.id for node in nodes]
    ejected_ids = [node.id for node in otpnode]
    try:
        removed = helper.remove_nodes(
            knownNodes=known_ids,
            ejectedNodes=ejected_ids,
            wait_for_rebalance=wait_for_rebalance)
    except Exception as e:
        self.log.error("First time rebalance failed on Removal. "
                       "Wait and try again. THIS IS A BUG.")
        # Keep the cause of the first failure visible in the logs
        self.log.error("Removal failure was: %s" % e)
        sleep(5)
        removed = helper.remove_nodes(
            knownNodes=known_ids,
            ejectedNodes=ejected_ids,
            wait_for_rebalance=wait_for_rebalance)