def queryRunner():

    hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    if clusterStatus:
        hosts = clusterStatus.get_all_hosts()

    # retrieve all active query workloads
    queries = CacheHelper.active_queries()
    for query in queries:

        # async update of the query workload object
        updateQueryWorkload.apply_async(args=[query])

        count = int(query.qps)
        filters = list(set(query.include_filters) -
                       set(query.exclude_filters))
        params = generateQueryParams(query.indexed_key,
                                     query.bucket,
                                     filters,
                                     query.limit,
                                     query.startkey,
                                     query.endkey,
                                     query.startkey_docid,
                                     query.endkey_docid)
        multi_query.delay(count,
                          query.ddoc,
                          query.view,
                          params,
                          query.bucket,
                          query.password,
                          hosts=hosts)
def setitup(self):

    # if the user forgot to assign the number of initial nodes for any
    # cluster, use 1 node as the default
    if len(self._num_initial_nodes) < len(self._clusters_keys_olst):
        diff = len(self._clusters_keys_olst) - len(self._num_initial_nodes)
        for i in range(diff):
            self._num_initial_nodes.append('1')

    for key in self._clusters_keys_olst:
        clusterStatus = None
        if key == 0:
            clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status") or \
                ClusterStatus()
        else:
            clusterStatus = CacheHelper.clusterstatus(cfg.CB_REMOTE_CLUSTER_TAG[key - 1] + "_status") or \
                ClusterStatus(cfg.CB_REMOTE_CLUSTER_TAG[key - 1] + "_status")

        clusterStatus.all_available_hosts = ["%s:%s" % (node.ip, node.port)
                                             for node in self._clusters_dic[key]]

        self.set_the_cluster_up(self._clusters_dic[key][:int(self._num_initial_nodes[key])])

    time.sleep(20)

    if self._xdcr:
        self._link_create_replications(self._s_master, self._d_master, "cluster1")
        if self._rdirection == "bidirection":
            self._link_create_replications(self._d_master, self._s_master, "cluster0")
def report_kv_latency(bucket="default"):

    if cfg.SERIESLY_IP == '':
        # seriesly not configured
        return

    rabbitHelper = report_kv_latency.rabbitHelper
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status") or \
        ClusterStatus()

    host = clusterStatus.get_random_host()
    if host is None:
        return
    ip, port = host.split(':')

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active and workload.bucket == bucket:

            # read workload params
            bucket = str(workload.bucket)
            password = str(workload.password)

            # read template from active workload
            template = Template.from_cache(str(workload.template))
            template = template.__dict__
            client.decodeMajgicStrings(template)

            # setup key/val to use for timing
            key = _random_string(12)
            value = json.dumps(template['kv'])
            get_key = key

            # for the get op, try to pull from the consume_queue
            # so that we can calculate the impact of dgm
            consume_queue = workload.consume_queue
            if consume_queue is not None:
                keys = rabbitHelper.getJsonMsg(str(consume_queue), requeue=True)
                if len(keys) > 0:
                    get_key = str(keys[0])

            # collect op latency
            set_latency = client.mc_op_latency('set', key, value, ip, port, bucket, password)
            get_latency = client.mc_op_latency('get', get_key, value, ip, port, bucket, password)
            delete_latency = client.mc_op_latency('delete', key, value, ip, port, bucket, password)

            # report to seriesly
            seriesly = Seriesly(cfg.SERIESLY_IP, 3133)
            db = 'fast'
            seriesly[db].append({'set_latency': set_latency,
                                 'get_latency': get_latency,
                                 'delete_latency': delete_latency})
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    add_nodes(rest, servers, cluster_id)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover nodes
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover,
                                   adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(rest, auto_failover_servers, only_failover,
                                            adminMsg["involve_orchestrator"], cluster_id)

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes + failoverNodes)

    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # Soft-restart nodes
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # Hard-restart nodes
    servers = adminMsg["hard_restart"]
    restart(servers, type='hard', cluster_id=cluster_id)

    if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
        logger.error("Rebalance")
        logger.error(allNodes)
        logger.error(toBeEjectedNodes)
        rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)
def updateQueryWorkload(query):

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active and workload.bucket == query.bucket:
            key = query.indexed_key
            workload.updateIndexKeys(key)
def pick_nodesToRemove(servers='', involve_orchestrator=False,
                       cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        temp_count = count
        servers = []
        if involve_orchestrator:
            servers.append("%s:%s" % (clusterStatus.orchestrator.ip,
                                      clusterStatus.orchestrator.port))
            temp_count = temp_count - 1

        if len(clusterStatus.nodes) > count:
            non_orchestrator_servers = list(
                set(clusterStatus.get_all_hosts()) -
                set(["%s:%s" % (clusterStatus.orchestrator.ip,
                                clusterStatus.orchestrator.port)]))
            servers.extend(non_orchestrator_servers[:temp_count])
        else:
            logger.error("Remove nodes request invalid. # of nodes in cluster is not enough")
            return []

    return servers
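# Usage sketch for pick_nodesToRemove (the host addresses below are
# hypothetical): the 'servers' argument is overloaded, taking either explicit
# "ip:port" strings or a bare count of nodes to pick from the cached cluster
# status.
#
#   pick_nodesToRemove("10.1.2.3:8091 10.1.2.4:8091")    # explicit hosts
#   pick_nodesToRemove("2", involve_orchestrator=True)   # count form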
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG + "_status",
              zone_name='', services=None):

    # create the zone if it does not exist
    if zone_name != '':
        if not rest.is_zone_exist(zone_name):
            rest.add_zone(zone_name)

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) -
                           set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Add nodes request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        if services:
            rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port, zone_name, services)
        else:
            rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port, zone_name)
def queryConsumer(queryQueue="query_default"):

    rabbitHelper = queryConsumer.rabbitHelper
    queryQueueSize = rabbitHelper.qsize(queryQueue)

    # for the cli, retrieve only the currently active query workload,
    # since multi-query is not supported here
    active_query = None
    all_queries = CacheHelper.active_queries()
    if len(all_queries) > 0:
        active_query = all_queries[0]

    if queryQueueSize > 0:

        # setup a new query workload from the queued message
        queryMsg = rabbitHelper.getJsonMsg(queryQueue)
        logger.error(queryMsg)

        try:
            queryWorkload = QueryWorkload(queryMsg)

            # deactivate the old query workload
            if active_query is not None:
                active_query.active = False

            # activate the new query workload
            # to be detected in the queryRunner task
            queryWorkload.active = True

            if 'rcq' in queryMsg:
                rabbitHelper.putMsg(queryMsg['rcq'], "Started Querying: %s/%s" %
                                    (queryWorkload.ddoc, queryWorkload.view))

        except KeyError:
            logger.info("Invalid query workload message: %s" % queryMsg)
def restart(servers='', type='soft', cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if len(clusterStatus.nodes) >= int(count):
            servers = clusterStatus.get_all_hosts()
        else:
            logger.error("Restart nodes request invalid. # of nodes in cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        ip, port = parse_server_arg(server)
        node_ssh, node = create_ssh_conn(ip)

        if type != 'soft':
            logger.error('Hard Restart')
            cmd = "reboot"
        else:
            logger.error('Soft Restart')
            cmd = "/etc/init.d/couchbase-server restart"

        logger.error(cmd)
        result = node_ssh.execute_command(cmd, node)
        logger.error(result)
def postcondition_handler():

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.postcondition_handler and workload.active:
            bucket = workload.bucket
            bs = BucketStatus.from_cache(bucket)
            bs.block(bucket)

            status = True
            try:
                handler = getattr(phandler, workload.postcondition_handler)
                status = handler(workload)
            except AttributeError:
                logger.error("Postcondition method %s doesn't exist" %
                             workload.postcondition_handler)
                workload.postcondition = None
                workload.postcondition_handler = None

            if status == True:
                # unblock the bucket and deactivate the workload
                bs = BucketStatus.from_cache(bucket)
                bs.unblock(bucket)
                workload.active = False
def restart(servers='', type='soft', cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        if len(clusterStatus.nodes) >= int(count):
            servers = clusterStatus.get_all_hosts()
        else:
            logger.error("Restart nodes request invalid. # of nodes in cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        ip, port = parse_server_arg(server)
        node_ssh, node = create_ssh_conn(ip)

        if type != 'soft':
            logger.error('Hard Restart')
            if cfg.COUCHBASE_OS == "windows":
                cmd = "shutdown -r -t 0"
            else:
                cmd = "reboot"
        else:
            logger.error('Soft Restart')
            if cfg.COUCHBASE_OS == "windows":
                cmd = "net stop couchbaseserver && net start couchbaseserver"
            else:
                cmd = "/etc/init.d/couchbase-server restart"

        logger.error(cmd)
        result = node_ssh.execute_command(cmd, node)
        logger.error(result)
def queryRunner():

    # retrieve all active query workloads
    queries = CacheHelper.active_queries()
    for query in queries:
        count = int(query.qps)
        params = {"stale": "update_after"}
        multi_query.delay(count,
                          query.ddoc,
                          query.view,
                          params,
                          query.bucket,
                          query.password)
def queryRunner(max_msgs=10):

    rabbitHelper = queryRunner.rabbitHelper

    # check the queue of pending http requests
    pending_http_requests = "query_multi_" + cfg.CB_CLUSTER_TAG
    if rabbitHelper.qsize(pending_http_requests) > max_msgs:

        # purge waiting tasks
        rabbitHelper.purge(pending_http_requests)
        query_ops_manager(max_msgs, True)

    else:
        hosts = None
        clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
        if clusterStatus:
            hosts = clusterStatus.get_all_hosts()

        # retrieve all active query workloads
        queries = CacheHelper.active_queries()
        for query in queries:

            # async update of the query workload object
            updateQueryWorkload.apply_async(args=[query])

            count = int(query.qps)
            filters = list(set(query.include_filters) -
                           set(query.exclude_filters))
            params = generateQueryParams(query.indexed_key,
                                         query.bucket,
                                         filters,
                                         query.limit,
                                         query.startkey,
                                         query.endkey,
                                         query.startkey_docid,
                                         query.endkey_docid)
            multi_query.delay(count,
                              query.ddoc,
                              query.view,
                              params,
                              query.bucket,
                              query.password,
                              hosts=hosts)
def getClusterStat(bucket, stat):

    val = 0
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status") or \
        ClusterStatus()
    host = clusterStatus.get_random_host()
    stat_checker = phandler.BucketStatChecker(bucket, addr=host)
    stats = stat_checker.get_stats()
    if len(stats) > 0:
        if stat in stats:
            val = stats[stat]

    return val
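# Usage sketch for getClusterStat (the stat name is illustrative): reads a
# single bucket stat from a random cluster node, returning 0 when the stat
# is unavailable.
#
#   curr_items = getClusterStat("default", "curr_items")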
def get_ep_hostip_from_params(params):

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    random_host = None
    try:
        random_host = clusterStatus.get_random_host().split(":")[0]
    except AttributeError:
        logger.error("Cannot fetch cluster status information")

    host = params.get('ip') or random_host or cfg.COUCHBASE_IP
    port = params.get('port') or 11210

    return host, int(port)
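# Resolution order in get_ep_hostip_from_params (addresses below are
# hypothetical): an explicit 'ip'/'port' in params wins, then a random host
# from the cached cluster status, then the cfg.COUCHBASE_IP fallback, with
# the default memcached port 11210 unless overridden.
#
#   host, port = get_ep_hostip_from_params({'ip': '10.1.2.3'})  # ('10.1.2.3', 11210)
#   host, port = get_ep_hostip_from_params({})                  # cluster/cfg fallback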
def throttle_kv_ops(isovercommited=True):

    rabbitHelper = throttle_kv_ops.rabbitHelper

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active:
            if isovercommited:
                # clear the pending task_queue
                rabbitHelper.purge(workload.task_queue)

                # reduce ops by 10%
                workload.ops_per_sec = workload.ops_per_sec * 0.90
                logger.error("Cluster Overcommitted: reduced ops to (%s)" % workload.ops_per_sec)
def queue_op_cycles(workload):

    # read the doc template
    template = Template.from_cache(str(workload.template))
    if template is None:
        logger.error("no doc template imported")
        return

    rabbitHelper = queue_op_cycles.rabbitHelper
    bucket = str(workload.bucket)
    task_queue = workload.task_queue

    active_hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    if clusterStatus is not None:
        active_hosts = clusterStatus.get_all_hosts()

    # create 20 op cycles
    for i in xrange(20):

        if workload.cc_queues is not None:
            # override the template attribute with the workload's
            template.cc_queues = workload.cc_queues

        if len(workload.indexed_keys) > 0:
            template.indexed_keys = workload.indexed_keys

        # read workload settings
        bucketInfo = {"bucket": workload.bucket,
                      "password": workload.password}

        ops_sec = workload.ops_per_sec

        create_count = int(ops_sec * workload.create_perc / 100)
        update_count = int(ops_sec * workload.update_perc / 100)
        get_count = int(ops_sec * workload.get_perc / 100)
        del_count = int(ops_sec * workload.del_perc / 100)
        exp_count = int(ops_sec * workload.exp_perc / 100)

        consume_queue = workload.consume_queue
        ttl = workload.ttl
        miss_queue = workload.miss_queue
        miss_perc = workload.miss_perc

        generate_pending_tasks(task_queue, template, bucketInfo, create_count,
                               update_count, get_count, del_count, exp_count,
                               consume_queue, ttl, miss_perc, miss_queue, active_hosts)
def taskScheduler():

    workloads = CacheHelper.workloads()
    rabbitHelper = taskScheduler.rabbitHelper

    tasks = []
    for workload in workloads:
        if workload.active:
            task_queue = workload.task_queue

            # dequeue subtasks
            if rabbitHelper.qsize(task_queue) > 0:
                tasks = rabbitHelper.getJsonMsg(task_queue)
                if tasks is not None and len(tasks) > 0:
                    # apply async
                    result = TaskSet(tasks=tasks).apply_async()
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) -
                           set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Add nodes request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port)
def throttle_kv_ops(isovercommited=True):

    rabbitHelper = kv_ops_manager.rabbitHelper

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active:
            if isovercommited:
                # clear the pending task_queue
                rabbitHelper.purge(workload.task_queue)

                # reduce ops by 10%, but never below 5000 ops/sec
                new_ops_per_sec = workload.ops_per_sec * 0.90
                if new_ops_per_sec > 5000:
                    workload.ops_per_sec = new_ops_per_sec
                    logger.error("Cluster Overcommitted: reduced ops to (%s)" % workload.ops_per_sec)
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    if servers.find('.') != -1:
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) -
                           set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Rebalance in request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port)
def postcondition_handler():

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.postconditions and workload.active:
            bucket = workload.bucket
            bs = BucketStatus.from_cache(bucket)
            bs.block(bucket)

            stat_checker = StatChecker(cfg.COUCHBASE_IP + ":" + cfg.COUCHBASE_PORT,
                                       bucket=bucket,
                                       username=cfg.COUCHBASE_USER,
                                       password=cfg.COUCHBASE_PWD)
            status = stat_checker.check(workload.postconditions)
            if status == True:
                # unblock the bucket and deactivate the workload
                bs = BucketStatus.from_cache(bucket)
                bs.unblock(bucket)
                workload.active = False
def updateClusterStatus(ignore_result=True):

    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status") or \
        ClusterStatus()

    # check cluster nodes
    cached_nodes = clusterStatus.nodes
    new_cached_nodes = []

    for node in cached_nodes:

        # get an active node
        if clusterStatus.http_ping_node(node) is not None:

            # get the remaining nodes
            active_nodes = clusterStatus.get_cluster_nodes(node)

            # populate the cache with healthy nodes
            for active_node in active_nodes:
                if active_node.status == 'healthy':
                    new_cached_nodes.append(active_node)
            break

    if len(new_cached_nodes) > 0:

        # check for update
        new_node_list = ["%s:%s" % (n.ip, n.port) for n in new_cached_nodes]
        if len(new_node_list) != len(cached_nodes) or \
                len(set(clusterStatus.get_all_hosts()).intersection(new_node_list)) != \
                len(cached_nodes):
            clusterStatus.nodes = new_cached_nodes
            clusterStatus.update_orchestrator()
    else:
        clusterStatus.orchestrator = None
        ObjCacher().delete(CacheHelper.CLUSTERSTATUSKEY, clusterStatus)
def query_ops_manager(max_msgs=10, isovercommited=False):

    rabbitHelper = query_ops_manager.rabbitHelper

    # retrieve all active query workloads
    queries = CacheHelper.active_queries()
    for query in queries:

        # check if query tasks are overloaded
        if rabbitHelper.qsize(query.task_queue) > max_msgs or isovercommited:

            # purge waiting tasks
            rabbitHelper.purge(query.task_queue)

            # throttle down qps by 10%, but never below 10 qps
            new_queries_per_sec = query.qps * 0.90
            if new_queries_per_sec > 10:
                query.qps = new_queries_per_sec
                logger.error("Cluster Overcommitted: reduced queries/sec to (%s)" % query.qps)
def generate_node_stats_report():

    allnodestats = CacheHelper.allnodestats()

    if len(allnodestats) > 0:

        # print the current time at the top of each report
        # TODO: add active tasks at time of report generation
        ts = time.localtime()
        ts_string = "%s/%s/%s %s:%s:%s" % \
            (ts.tm_year, ts.tm_mon, ts.tm_mday,
             ts.tm_hour, ts.tm_min, ts.tm_sec)

        print_separator()
        logger.error("\tSTAT REPORT: (%s)" % ts_string)

        for node_stats in allnodestats:
            calculate_node_stat_results(node_stats)
            if len(node_stats.results) > 0:
                print_node_results(node_stats)

        logger.error("\tEND OF REPORT: (%s)" % ts_string)
        print_separator()
        new_line()
def perform_xdcr_tasks(xdcrMsg):

    logger.error(xdcrMsg)
    src_master = create_server_obj()

    remote_id = ''
    if len(cfg.CB_REMOTE_CLUSTER_TAG) > 0:
        remote_id = cfg.CB_REMOTE_CLUSTER_TAG[0] + "_status"
    else:
        logger.error("No remote cluster tag. Cannot create xdcr")
        return

    clusterStatus = CacheHelper.clusterstatus(remote_id) or ClusterStatus(remote_id)
    remote_ip = clusterStatus.get_random_host().split(":")[0]

    dest_master = create_server_obj(server_ip=remote_ip,
                                    username=xdcrMsg['dest_cluster_rest_username'],
                                    password=xdcrMsg['dest_cluster_rest_pwd'])
    dest_cluster_name = xdcrMsg['dest_cluster_name']

    xdcr_link_cluster(src_master, dest_master, dest_cluster_name)
    xdcr_start_replication(src_master, dest_cluster_name)

    if xdcrMsg['replication_type'] == "bidirection":
        src_cluster_name = dest_cluster_name + "_temp"
        xdcr_link_cluster(dest_master, src_master, src_cluster_name)
        xdcr_start_replication(dest_master, src_cluster_name)
def epengine_stat_checker(workload):

    postcondition = workload.postconditions

    if isinstance(postcondition, dict):
        params = parse_condition_dict(postcondition)
    else:
        params = parse_condition(postcondition)
    random_host, port = get_ep_hostip_from_params(params)

    status = True
    all_hosts = [random_host]
    if params['cluster_check'] == True:
        clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
        all_hosts = clusterStatus.get_all_hosts()

    for host in all_hosts:
        statChecker = EPStatChecker(host.split(":")[0], port)
        status &= statChecker.check(postcondition)

    return status
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG + "_status", zone_name=''):

    # create the zone if it does not exist
    if zone_name != '':
        if not rest.is_zone_exist(zone_name):
            rest.add_zone(zone_name)

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) -
                           set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Add nodes request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port, zone_name)
def taskScheduler():

    workloads = CacheHelper.workloads()
    rabbitHelper = taskScheduler.rabbitHelper

    tasks = []
    for workload in workloads:
        if workload.active:
            task_queue = workload.task_queue
            num_ready_tasks = rabbitHelper.qsize(task_queue)

            # dequeue subtasks
            if num_ready_tasks > 0:
                tasks = rabbitHelper.getJsonMsg(task_queue)
                if tasks is not None and len(tasks) > 0:
                    # apply async
                    result = TaskSet(tasks=tasks).apply_async()

            # check if more subtasks need to be queued
            if num_ready_tasks < 10:
                queue_op_cycles.delay(workload)
def setPhaseForStats(phase_name):

    allnodestats = CacheHelper.allnodestats()
    if len(allnodestats) > 0:
        for node_stats in allnodestats:
            node_stats.phase = phase_name
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    zone_name = adminMsg["group"]
    if adminMsg["services"]:
        add_nodes(rest, servers, cluster_id, zone_name, adminMsg["services"])
    else:
        add_nodes(rest, servers, cluster_id, zone_name)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover nodes
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover,
                                   adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(rest, auto_failover_servers, only_failover,
                                            adminMsg["involve_orchestrator"], cluster_id)

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes + failoverNodes)

    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # Soft-restart nodes
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # Hard-restart nodes
    servers = adminMsg["hard_restart"]
    restart(servers, type='hard', cluster_id=cluster_id)

    if adminMsg["soft_restart"] == '' and adminMsg["hard_restart"] == '':

        if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
            logger.error("Rebalance")
            logger.error(allNodes)
            logger.error(toBeEjectedNodes)
            rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)

        # do a soft restart on ejected nodes that were failed over
        logger.error(toBeEjectedNodes)
        restartNodes = ""
        for node in toBeEjectedNodes:
            if node in (failoverNodes + autoFailoverNodes):
                if '@' in node:
                    # ns_X@hostname formatted
                    node = node.split('@')[1]
                restartNodes = "%s %s" % (node, restartNodes)

        if len(restartNodes):
            restart(restartNodes)
import os
import sys

from cache import CacheHelper
import testcfg as cfg

# make sure the logdir exists
os.system("mkdir -p " + cfg.LOGDIR)

# make sure the celerybeat-schedule.db file is deleted
os.system("rm -rf celerybeat-schedule.db")

# kill old background processes
kill_procs = ["consumer"]
for proc in kill_procs:
    os.system("ps aux | grep %s | awk '{print $2}' | xargs kill" % proc)

# delete queues (note: using --purge will remove cc_queues)
queues = CacheHelper.task_queues() + CacheHelper.miss_queues()

# when --purge is set, delete cc_queues as well as the seriesly db
if "--purge" in sys.argv:
    queues = set(CacheHelper.queues())

    # clean up the seriesly databases (fast and slow, created by cbtop)
    if cfg.SERIESLY_IP != '':
        from seriesly import Seriesly
        seriesly = Seriesly(cfg.SERIESLY_IP, 3133)
        dbs = seriesly.list_dbs()
        for db in dbs:
            seriesly.drop_db(db)
def cacheVariable(cacheMsg):

    bucket = cacheMsg.get("bucket") or "default"
    ref = str(cacheMsg.get("reference") or "default_key")
    stat = cacheMsg.get("stat") or "curr_items"
    value = getClusterStat(bucket, stat)
    CacheHelper.cachePhaseVar(ref, value)
def replace_magic_vars(str_):

    ref = re.match(r".*\$(?P<var>\w+)", str_).group('var')
    ref = str(ref.strip())
    value = CacheHelper.getPhaseVar(ref) or 0
    str_ = str_.replace("$" + ref, str(value))

    return str_
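# Illustrative flow (the message values below are hypothetical): cacheVariable
# snapshots a cluster stat under a reference name, and replace_magic_vars
# later expands a "$<reference>" token in a condition string against the
# cached value.
#
#   cacheVariable({"bucket": "default",
#                  "reference": "items_before",
#                  "stat": "curr_items"})
#   # if 'items_before' was cached as 1000, this yields "curr_items > 1000"
#   replace_magic_vars("curr_items > $items_before")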
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG + "_status"):

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    zone_name = adminMsg["group"]
    add_nodes(rest, servers, cluster_id, zone_name)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover nodes
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover,
                                   adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(rest, auto_failover_servers, only_failover,
                                            adminMsg["involve_orchestrator"], cluster_id)

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes + failoverNodes)

    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # Soft-restart nodes
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # Hard-restart nodes
    servers = adminMsg["hard_restart"]
    restart(servers, type='hard', cluster_id=cluster_id)

    if adminMsg["soft_restart"] == '' and adminMsg["hard_restart"] == '':

        if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
            logger.error("Rebalance")
            logger.error(allNodes)
            logger.error(toBeEjectedNodes)
            rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)

        # do a soft restart on ejected nodes that were failed over
        logger.error(toBeEjectedNodes)
        restartNodes = ""
        for node in toBeEjectedNodes:
            if node in (failoverNodes + autoFailoverNodes):
                if '@' in node:
                    # ns_X@hostname formatted
                    node = node.split('@')[1]
                restartNodes = "%s %s" % (node, restartNodes)

        if len(restartNodes):
            restart(restartNodes)
def run(workload):

    workload.active = True
    rabbitHelper = RabbitHelper()
    sdk_queue_key = "sdk_consumer.*"

    # read the doc template
    template = Template.from_cache(str(workload.template))
    if template is None:
        logger.error("no doc template imported")
        return

    consumer_template = copy.deepcopy(template)
    bucket = str(workload.bucket)
    password = str(workload.password)

    active_hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    if clusterStatus is not None:
        active_hosts = clusterStatus.get_all_hosts()

    if workload.cc_queues is not None:
        # override the template attribute with the workload's
        consumer_template.cc_queues = workload.cc_queues

    if len(workload.indexed_keys) > 0:
        template.indexed_keys = workload.indexed_keys

    ops_sec = workload.ops_per_sec

    # modify ops by the number of running consumers
    num_consumers = rabbitHelper.numExchangeQueues(cfg.CB_CLUSTER_TAG, EXCHANGE)
    if num_consumers == 0:
        logger.error("No sdkclients running")
        return

    ops_sec = int(ops_sec) / num_consumers
    create_count = int(ops_sec * workload.create_perc / 100)
    update_count = int(ops_sec * workload.update_perc / 100)
    get_count = int(ops_sec * workload.get_perc / 100)
    del_count = int(ops_sec * workload.del_perc / 100)
    exp_count = int(ops_sec * workload.exp_perc / 100)

    consume_queue = workload.consume_queue
    ttl = workload.ttl
    miss_queue = workload.miss_queue
    miss_perc = workload.miss_perc

    # broadcast to sdk_consumers
    msg = {'bucket': bucket,
           'id': workload.id,
           'password': password,
           'template': consumer_template.__dict__,
           'ops_sec': ops_sec,
           'create_count': create_count,
           'update_count': update_count,
           'get_count': get_count,
           'del_count': del_count,
           'exp_count': exp_count,
           'consume_queue': consume_queue,
           'ttl': ttl,
           'miss_perc': miss_perc,
           'active': True,
           'active_hosts': active_hosts}

    rabbitHelper.putMsg('', json.dumps(msg), EXCHANGE)

    logger.error("start task sent to %s consumers" % num_consumers)
def report_kv_latency(bucket="default"):

    if cfg.SERIESLY_IP == '':
        # seriesly not configured
        return

    rabbitHelper = report_kv_latency.rabbitHelper
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status") or \
        ClusterStatus()

    host = clusterStatus.get_random_host()
    if host is None:
        return
    ip, port = host.split(':')

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active and workload.bucket == bucket:

            # read workload params
            bucket = str(workload.bucket)
            password = str(workload.password)

            # read template from active workload
            template = Template.from_cache(str(workload.template))
            template = template.__dict__
            client.decodeMajgicStrings(template)

            # setup key/val to use for timing
            key = _random_string(12)
            value = json.dumps(template['kv'])
            get_key = key

            # for the get op, try to pull from the consume_queue
            # so that we can calculate the impact of dgm
            consume_queue = workload.consume_queue
            if consume_queue is not None:
                keys = rabbitHelper.getJsonMsg(str(consume_queue), requeue=True)
                if len(keys) > 0:
                    get_key = str(keys['start'])

            # collect op latency
            set_latency = client.mc_op_latency('set', key, value, ip, port, bucket, password)
            get_latency = client.mc_op_latency('get', get_key, value, ip, port, bucket, password)
            delete_latency = client.mc_op_latency('delete', key, value, ip, port, bucket, password)

            # report to seriesly
            seriesly = Seriesly(cfg.SERIESLY_IP, 3133)
            db = None
            if 'fast' in seriesly.list_dbs():
                db = 'fast'
            else:
                bucketStatus = BucketStatus.from_cache(bucket) or BucketStatus(bucket)
                db = bucketStatus.latency_db
                if db not in seriesly.list_dbs():
                    seriesly.create_db(db)

            if db is not None:
                seriesly[db].append({'set_latency': set_latency,
                                     'get_latency': get_latency,
                                     'delete_latency': delete_latency})