def update_app_limit(app_namespace, app_type, resource): output = {} file_name = "./resource.yaml" tmp_file_name = "%s.tmp" % file_name try: with open(file_name, "r") as f_r: output = yaml.load(f_r) output["spec"]["template"]["spec"]["containers"][0]["name"] = resource app_image = get_image_name(app_namespace, app_type, resource) if app_image: output["spec"]["template"]["spec"]["containers"][0]["image"] = app_image output["spec"]["template"]["spec"]["containers"][0]["resources"]["limits"]["cpu"] = str(cpu_limit)+"m" output["spec"]["template"]["spec"]["containers"][0]["resources"]["requests"]["cpu"] = str(cpu_limit)+"m" output["spec"]["template"]["spec"]["containers"][0]["resources"]["limits"]["memory"] = str(memory_limit)+"Mi" output["spec"]["template"]["spec"]["containers"][0]["resources"]["requests"]["memory"] = str(memory_limit)+"Mi" with open(tmp_file_name, "w") as f_w: yaml.dump(output, f_w) f_w.close() except Exception as e: print "failed to update %s: %s" % (file_name, str(e)) return -1 os.rename(tmp_file_name, file_name) if app_type == "deploymentconfig": OC().patch_deploymentconfig(app_namespace, resource, file_name) else: OC().patch_deployment(app_namespace, resource, file_name) print "success to update limits(cpu=%dm and memory=%dMi) for %s" % (cpu_limit, memory_limit, resource) return 0
def delete_consumer_yaml():
    """Delete the consumer deployment through its YAML manifest.

    The cpu-only single-HPA scenario uses a dedicated manifest.
    """
    if number_k8shpa == 1 and k8shpa_type == "cpu":
        manifest = "%s/consumer-cpu-deployment.yaml" % config_path
    else:
        manifest = "%s/consumer-deployment.yaml" % config_path
    return OC().delete_file(manifest)
def clean_data(algo, namespace, resource_type, resource):
    """Reset the workload: scale to zero, then back to the initial replicas."""
    result = OC().scale_replica(namespace, resource_type, resource, 0)
    result = OC().scale_replica(namespace, resource_type, resource,
                                initial_replica)
    return result
def get_pod_name(pod_name_prefix):
    """Return the first pod (in the module-global namespace) whose listing
    line contains pod_name_prefix, or "" when none matches."""
    listing = OC().get_pods(namespace)
    for entry in listing.split("\n"):
        if pod_name_prefix in entry:
            return entry.split()[0]
    return ""
def start_k8shpa(namespace, resource_type, resource, num_replica_max, percent): print "=== Start K8sHPA ===", k8shpa_type if k8shpa_type == "cpu": output = OC().autoscale_replica(namespace, resource_type, resource, num_replica_max, percent) elif k8shpa_type == "memory": file_name = "./k8shpa_memory.yaml" output = update_k8shpa_yaml(file_name, namespace, resource, percent) output = OC().apply_file(file_name) return output
def find_alameda_namespace(app_name): namespace = "" output = OC().get_pods_all_namespace() for line in output.split("\n"): if line.find(app_name) != -1: namespace = line.split()[0] break if namespace: print "find %s's namespace: %s" % (app_name, namespace) else: raise Exception("ns: %s is not existed" % namespace) return namespace
def get_image_name(app_namespace, app_type, resource):
    """Return the image of the container named `resource` in the given
    deployment(config), or "" when not found."""
    output = ""
    image_name = ""
    oc = OC()
    if app_type == "deploymentconfig":
        output = oc.get_specific_deploymentconfig(app_namespace, resource)
    elif app_type == "deployment":
        output = oc.get_specific_deployment(app_namespace, resource)
    if output:
        # safe_load is sufficient for plain manifest data (yaml.load without
        # a Loader is unsafe and deprecated)
        output = yaml.safe_load(output)
        # defaults guard against a manifest missing intermediate keys,
        # which previously raised AttributeError on None
        containers = output.get("spec", {}).get("template", {}).get(
            "spec", {}).get("containers", [])
        for container_info in containers:
            if container_info.get("name") == resource:
                image_name = container_info.get("image")
                break
    return image_name
def find_pod_name(app_name, app_namespace):
    """Return the names of all Running pods matching app_name (build pods
    excluded); raises when a matching pod is not Running or none exist."""
    matched = []
    listing = OC().get_pods(app_namespace)
    for entry in listing.split("\n"):
        if app_name not in entry:
            continue
        name = entry.split()[0]
        if "build" in name:
            continue
        state = entry.split()[2]
        if state != "Running":
            raise Exception("%s is %s" % (name, state))
        matched.append(name)
    if not matched:
        raise Exception("%s is not existed in %s" % (app_name, app_namespace))
    return matched
def restart_pod(app_name, app_namespace): output = "" pod_name_list = find_pod_name(app_name, app_namespace) for pod_name in pod_name_list: output = OC().delete_pod(pod_name, app_namespace) print output return output
def do_main(args): app_name = args.app_name[0] ret = OC().check_platform() # if ret == 0: #OpenShift # user = args.user[0] # passwd = args.password[0] # OC().login(user, passwd) # ask users to check execution or not if query_mode: ret = check_execution(app_name) if ret != 0: print "exit" return 0 try: initial_environment() check_environment(app_name) main(app_name) except KeyboardInterrupt: print "pgogram exit with keyboard interrupt" kill_process() except Exception as e: print "failed to test HPA: %s" % str(e) kill_process() kill_process() return 0
def __init__(self):
    """Select the Prometheus endpoint depending on the platform.

    OC().check_platform() returns 0 when the `oc` command exists, so the
    negation marks an OpenShift platform; there the endpoint and token come
    from the static `define` module, otherwise the endpoint is discovered
    from the service listing.
    """
    self.oc_platform = not OC().check_platform()
    if not self.oc_platform:
        self.endpoint = self._get_endpoint_from_service()
    else:
        self.endpoint = define.prometheus_endpoint
        self.token = define.prometheus_token
def update_app_limit(app_namespace, app_type, resource): output = {} result = "" if app_type == "deploymentconfig": result = OC().get_specific_deploymentconfig(app_namespace, resource) elif app_type == "deployment": result = OC().get_specific_deployment(app_namespace, resource) output = yaml.load(result) output["spec"]["template"]["spec"]["containers"][0]["resources"] = {} output["spec"]["template"]["spec"]["containers"][0]["resources"][ "limits"] = {} output["spec"]["template"]["spec"]["containers"][0]["resources"][ "requests"] = {} if cpu_limit != 0: output["spec"]["template"]["spec"]["containers"][0]["resources"][ "limits"]["cpu"] = str(cpu_limit) + "m" output["spec"]["template"]["spec"]["containers"][0]["resources"][ "requests"]["cpu"] = str(cpu_limit) + "m" if memory_limit != 0: output["spec"]["template"]["spec"]["containers"][0]["resources"][ "limits"]["memory"] = str(memory_limit) + "Mi" output["spec"]["template"]["spec"]["containers"][0]["resources"][ "requests"]["memory"] = str(memory_limit) + "Mi" output["metadata"].pop("creationTimestamp") output["metadata"].pop("generation") output["metadata"].pop("resourceVersion") output["metadata"].pop("selfLink") output["metadata"].pop("uid") output.pop("status") file_name = "./resource.yaml" tmp_file_name = "%s.tmp" % file_name try: with open(tmp_file_name, "w") as f_w: yaml.dump(output, f_w) f_w.close() except Exception as e: print "failed to update %s: %s" % (file_name, str(e)) return -1 os.rename(tmp_file_name, file_name) OC().apply_file(file_name) print "success to update limits(cpu=%dm and memory=%dMi) for %s" % ( cpu_limit, memory_limit, resource) return 0
def _get_endpoint_from_service(self): output = OC().get_service(define.prometheus_namespace).split("\n") for line in output: if line.find(define.prometheus_operator_name) != -1: ip = line.split()[2] port = line.split()[-2].split(":")[0] endpoint = "http://%s:%s/api/v1" % (ip, port) break print("Prometheus: find endpoint(%s)", endpoint) sys.exit() return endpoint
class FetchLog:
    """Fetches consumer pod logs and extracts Kafka rebalance timings
    (startup, partitions revoked/assigned) from their JSON log lines."""
    # shared CLI wrappers and fixed target app
    k = Kubectl()
    o = OC()
    namespace = "myproject"
    app_name = "consumer"

    def __init__(self):
        pass

    def get_pod_list(self):
        """Return the names of all pods whose listing line contains app_name."""
        pod_list = []
        output = self.o.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(self.app_name) != -1 and line:
                pod_name = line.split()[0]
                pod_list.append(pod_name)
        return pod_list

    def fetch_log(self, pod_name):
        """Parse one pod's log into event timestamps and rebalance durations.

        Returns a dict with start_complete_time (int) and index-keyed dicts
        partitions_revoked_time / partitions_assigned_time / rebalance_time.
        Assumes each matching log line is a JSON object with a "timestamp"
        field -- TODO confirm against the consumer's log format.
        """
        # print "pod_name=", pod_name
        data = {}
        data["start_complete_time"] = 0
        data["partitions_revoked_time"] = {}
        data["partitions_assigned_time"] = {}
        data["rebalance_time"] = {}
        revoked_index = 0
        assigned_index = 0
        output = self.o.log_pod(self.namespace, pod_name)
        for line in output.split("\n"):
            if line.find("startup_complete") != -1:
                # print line
                startup_complete_time = json.loads(line).get("timestamp")
                data["start_complete_time"] = int(startup_complete_time)
            if line.find("partitions_revoked") != -1:
                # print line
                partitions_revoked_time = json.loads(line).get("timestamp")
                data["partitions_revoked_time"][revoked_index] = int(
                    partitions_revoked_time)
                revoked_index += 1
            if line.find("partitions_assigned") != -1:
                # print line
                partitions_assigned_time = json.loads(line).get("timestamp")
                data["partitions_assigned_time"][assigned_index] = int(
                    partitions_assigned_time)
                assigned_index += 1
        # rebalance 0 is measured from startup; later ones from the
        # preceding revoke event
        rebalance_count = len(data["partitions_assigned_time"].keys())
        for i in range(rebalance_count):
            if i == 0:
                data["rebalance_time"][i] = data["partitions_assigned_time"][
                    i] - data["start_complete_time"]
            elif data["partitions_revoked_time"]:
                data["rebalance_time"][i] = data["partitions_assigned_time"][
                    i] - data["partitions_revoked_time"][i - 1]
        # print data
        return data
def start_k8shpa(namespace, resource_type, resource, num_replica, value): output = "" print "=== Start K8sHPA: %s ===" % k8shpa_type if k8shpa_type == "cpu": print "=== Set up autoscale ===" output = OC().autoscale_replica(namespace, resource_type, resource, num_replica, value) print "%s" % output if k8shpa_type == "lag": file_name = "%s/consumer-hpa.yaml" % config_path print "=== Applying file: %s ===" % file_name output = OC().apply_file(file_name) print "%s" % output # elif k8shpa_type == "memory": # file_name = "./k8shpa_memory.yaml" # output = update_k8shpa_yaml(file_name, namespace, resource, value) # output = OC().apply_file(file_name) # elif k8shpa_type == "consumergroup_lag": # file_name = "./k8shpa_consumergrouplag.yaml" # # output = update_k8shpa_yaml(file_name, namespace, resource, value) # output = OC().apply_file(file_name) # print output return output
def get_recommender_log(pod_name_prefix):
    """Parse the recommender pod's log into per-timestamp replica data.

    Returns {timestamp: {"ma0_data": ..., "predictN_data": ...,
    "best_desired_replicas": ..., "best_execution_time": ...}}.
    Assumes log lines are tab-separated with a JSON payload in the last
    field -- TODO confirm against the recommender's log format.
    """
    log_data = {}
    o = OC()
    pod_name = get_pod_name(pod_name_prefix)
    log_output = o.log_pod(namespace, pod_name).split("\n")
    for line in log_output:
        if line and line.find("Desired") != -1:
            if line.find("Evaluate Cost parameters") != -1:
                # NOTE(review): timestamp/cost_data_list set here are never
                # used -- looks like leftover of an earlier version; confirm
                timestamp = line.split()[0].split(".")[0]
                cost_data_list = line.split("\t")[-1].split()
            # per-model desired-replica list (excludes the "best" summary line)
            if line.find("desired replicas result") != -1 and line.find(
                    "best") == -1:
                timestamp = line.split()[0].split(".")[0]
                log_data[timestamp] = {}
                recommender_data_list = json.loads(
                    line.split("\t")[-1].split()[-1])
                count = 0
                for data in recommender_data_list:
                    if count == 0:
                        log_data[timestamp]["ma%d_data" % count] = data
                    else:
                        log_data[timestamp]["predict%d_data" % count] = data
                    count += 1
            # "best" summary line: only recorded when the per-model entry for
            # the same timestamp was already seen
            if line.find("best desired replicas result") != -1:
                timestamp = line.split()[0].split(".")[0]
                if log_data.get(timestamp):
                    best_desired_replicas = json.loads(
                        line.split("\t")[-1].split()[-1]).get(
                            "DesiredReplicas")
                    log_data[timestamp][
                        "best_desired_replicas"] = best_desired_replicas
                    best_execution_time = json.loads(
                        line.split("\t")[-1].split()[-1]).get(
                            "BestExecutationTime")
                    log_data[timestamp][
                        "best_execution_time"] = best_execution_time
    return log_data
def wait_pod_running(namespace): print "wait pods in %s running" % namespace start_time = time.time() while True: output = OC().get_pods(namespace) print "wait 30 sec" time.sleep(30) end_time = time.time() if end_time - start_time >= 600: raise Exception("timeout for waiting pods running") break count = 0 for line in output.split("\n"): if line and line.find("NAME") == -1: status = line.split()[2] ready = line.split()[1] if ready != "1/1" and status != "Running": print line elif (ready == "1/1" or ready == "2/2" or ready == "3/3") and status == "Running": count += 1 print "%s pods in %s are running" % (count, namespace) if count >= 8 + initial_consumer: print "all pods in %s are running and ready" % namespace break
class OverLimit:
    """Samples per-pod cpu/memory usage against their limits for one app,
    counts overflow/OOM/restart events plus the ingress 2xx request rate,
    and appends the samples to ./metrics/<app>-<algo> files."""
    # class-level defaults; the actual namespace/app_name/app_type come from
    # environment variables in __init__
    k = Kubectl()
    wait_time = 30  # seconds between samples
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}  # {deployment: {pod: {metric: value}}}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""
    prometheus = Prometheus()

    def __init__(self):
        # target selection from the environment, with nginx defaults
        app_namespace = os.environ.get("NAMESPACE") or "nginx"
        app_type = os.environ.get("RESOURCE_TYPE") or "deployment"
        resource = os.environ.get("RESOURCE") or "nginx"
        self.namespace = app_namespace
        self.app_name = resource
        self.app_type = app_type

    def find_deploymentconfig_by_namespace(self, app_name):
        """Return deployment(/config) names in self.namespace matching app_name."""
        deployment_name_list = []
        output = {}
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        if self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                deployment_name = line.split()[0]
                deployment_name_list.append(deployment_name)
        return deployment_name_list

    def find_pod_by_namespace(self, app_name):
        """Return pod names matching app_name, skipping build pods."""
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                pod_name = line.split()[0]
                if pod_name.find("build") != -1:
                    continue
                pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        """Rebuild self.app_list keyed by deployment name."""
        self.app_list = {}
        # print ("---get deployment info---")
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}
        # print self.app_list

    def get_pod_info(self):
        """Attach each pod to its owning deployment entry in self.app_list."""
        # print ("---get pod info---")
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}
        # print self.app_list

    def get_metrics(self):
        """Fill cpu_value (mCore) / memory_value (Mi) per pod via kubectl top."""
        # print ("---get metrics---")
        self.kubectl = Kubectl()
        # zero every metric first so pods missing from `top` output read 0
        for metric_item in self.metric_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment]:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) != -1:
                        # by kubectl top
                        cpu = int(line.split()[-2].strip("m"))  # mCore
                        memory = int(line.split()[-1].strip("Mi"))  # MB
                        self.app_list[deployment][pod_name]["cpu_value"] = cpu
                        self.app_list[deployment][pod_name][
                            "memory_value"] = memory
        # print self.app_list

    def get_pod_limit(self, pod_name):
        """Return (cpu_limit, memory_limit) from the configured settings.

        NOTE(review): returns the d_cpu_limit/d_memory_limit settings
        directly; everything after the first return is unreachable dead
        code left over from a version that queried the pod spec -- confirm
        whether it can be deleted.
        """
        # print ("---get pod limit---")
        cpu_limit = d_cpu_limit
        memory_limit = d_memory_limit
        # data collect interval needs less than 30s
        # return cpu/memory limit from setting directly
        return cpu_limit, memory_limit
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            try:
                output = json.loads(output)
                cpu_limit1 = output.get("spec", {}).get(
                    "containers",
                    [])[0].get("resources").get("limits").get("cpu")
                if cpu_limit1 and cpu_limit1.find("m") != -1:
                    cpu_limit = float(cpu_limit1.split("m")[0])
                else:
                    cpu_limit = float(cpu_limit1) * 1000
                memory_limit1 = output.get("spec", {}).get(
                    "containers",
                    [])[0].get("resources").get("limits").get("memory")
                if memory_limit1 and memory_limit1.find("M") != -1:
                    memory_limit = float(memory_limit1.split("M")[0])
                elif memory_limit1 and memory_limit1.find("G") != -1:
                    memory_limit = float(memory_limit1.split("G")[0]) * 1000
            except Exception as e:
                print "failed to get limits: %s" % str(e)
        return cpu_limit, memory_limit

    def get_limits(self):
        """Store cpu/memory limits per pod into self.app_list."""
        output = {}
        for metric_item in self.limit_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment].keys():
                    cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                    if metric_item == "pod_cpu_limits":
                        self.app_list[deployment][pod_name][
                            metric_item] = cpu_limit
                    else:
                        self.app_list[deployment][pod_name][
                            metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        """Return the lastState.terminated info of the pod's first container."""
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            output = json.loads(output)
            if output.get("status").get("containerStatuses")[0].get(
                    "lastState"):
                terminated = output.get("status").get(
                    "containerStatuses")[0].get("lastState").get("terminated")
                reason_list.append(terminated)
        return reason_list

    def get_status(self):
        """Record status/restart/reason per pod from the pod listing."""
        output = self.oc.get_pods(self.namespace)
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                for line in output.split("\n"):
                    if line.find(self.app_name) != -1:
                        pod = line.split()[0]
                        if pod == pod_name:
                            reason_list = self.get_pod_reason(pod_name)
                            status = line.split()[2]
                            restart = int(line.split()[3])
                            self.app_list[deployment][pod_name][
                                "status"] = status
                            self.app_list[deployment][pod_name][
                                "restart"] = restart
                            self.app_list[deployment][pod_name][
                                "reason"] = reason_list

    def get_node_status(self):
        """Return {node: {status, cpu(mCore), memory(Mi)}} via kubectl top node."""
        # print "get node status"
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                status = line.split()[1]
                node_info[node_name] = {}
                node_info[node_name]["status"] = status
                usage_output = self.k.top_node(node_name)
                for line in usage_output.split("\n"):
                    if line.find(node_name) != -1:
                        cpu = int(line.split()[1].split("m")[0])
                        memory = int(line.split()[3].split("Mi")[0])
                        node_info[node_name]["cpu"] = cpu
                        node_info[node_name]["memory"] = memory
        # print node_info
        return node_info

    def get_http_requests(self):
        """Return the per-second rate of 2xx responses for the nginx route
        (idelta over 2 minutes, divided by the 2-minute window)."""
        #query = "%s{namespace=\"%s\"}" % (ingress_http_requests_name, ingress_namespace)
        query = "sum(idelta(haproxy_server_http_responses_total{exported_namespace=\"nginx\",route=\"nginx-service\",code=\"2xx\"}[2m]))"
        output = self.prometheus.query_value(query)
        return float(output) / 2.0

    def calculate_overlimit(self, algo, time_count):
        """Sample every wait_time seconds for time_count minutes, printing a
        summary and appending one metrics line per deployment per sample."""
        cpu_count = 0
        memory_count = 0
        count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        print "--- %s collect data and write to logs for %d minutes ---" % (
            algo.split("_")[0].upper(), time_count)
        start_time = time.time()
        for i in range(data_count):
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            # self.get_status()
            print "--- %s start to collect data at %d/%d interval(in 30 sec), start: %s, current: %s ---" % (
                algo.split("_")[0], i, data_interval * 2, start_time,
                time.time())
            for deployment in self.app_list.keys():
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # pod
                for pod in self.app_list[deployment].keys():
                    if self.app_list[deployment][pod].get("pod_cpu_limits"):
                        cpu_limit = self.app_list[deployment][pod][
                            "pod_cpu_limits"]
                        memory_limit = self.app_list[deployment][pod][
                            "pod_memory_limits"]
                    cpu = self.app_list[deployment][pod]["cpu_value"]
                    memory = self.app_list[deployment][pod]["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = self.app_list[deployment][pod].get("restart", 0)
                    total_restart += restart
                    reason = self.app_list[deployment][pod].get("reason", [])
                    total_terminated += len(reason)
                num_replica = len(self.app_list[deployment].keys())
                # http requests
                http_requests = self.get_http_requests()
                print self.app_name, "total_cpu=", total_cpu, "m"
                print self.app_name, "total_memory=", total_memory, "Mi"
                print self.app_name, "current replica=%d" % num_replica
                print self.app_name, "overflow=", cpu_count, "times"
                print self.app_name, "oom=", memory_count, "times"
                print self.app_name, "restart=", total_restart, "times"
                print self.app_name, "terminated=", total_terminated, "times"
                print self.app_name, "http_requests=%s" % http_requests
                print "\n"
                total_status = 0
                total_node_cpu = 0
                total_node_memory = 0
                # # skip collect node info (take too long)
                # node
                #node_info = self.get_node_status()
                #for node in node_info.keys():
                #    if node_info[node].get("status").find("NotReady") != -1:
                #        total_status += 1
                #    total_node_cpu += node_info[node]["cpu"]
                #    total_node_memory += node_info[node]["memory"]
                algo_name = "%s-%s" % (self.app_name, algo)
                # NOTE(review): `restart` here is the last pod's restart
                # count, not total_restart -- confirm which was intended
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status, total_node_cpu, total_node_memory,
                    http_requests
                ]
                self.write_metric(data)
            # print "wait %d seconds" % self.wait_time
            # correct time: sleep in 1s steps until 30s since start_time
            interval = 30
            for j in range(interval):
                end_time = time.time()
                if end_time - start_time >= interval:
                    start_time = start_time + interval
                    break
                time.sleep(1)

    def write_metric(self, data):
        """Append one space-separated sample line (plus a unix timestamp)
        to ./metrics/<algo_name>."""
        # print "write metrics"
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print "failed to write metrics:%s" % str(e)
def apply_producer_yaml():
    """Apply the producer deployment manifest and return the OC output."""
    manifest = "%s/producer-deployment.yaml" % config_path
    return OC().apply_file(manifest)
class Producer(Client):
    """Kafka producer test helper: reads the transaction-rate schedule,
    aggregates per-pod metrics collected by WriteLog and appends
    metrics/latency lines to the traffic log files."""
    oc = OC()
    k = Kubectl()
    w = WriteLog()  # shared collector, retargeted to this app in __init__

    def __init__(self):
        super(Producer, self).__init__()
        self.namespace = "myproject"
        self.app_name = "producer"
        self.app_type = "deployment"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        """Sleep for `value` seconds."""
        # print "wait %d seconds" % value
        time.sleep(value)

    def read_transaction_list(self):
        """Read one float per line from ./transaction.txt; returns []
        (possibly partial) on read failure."""
        transaction_list = []
        file_name = "./transaction.txt"
        try:
            with open(file_name, "r") as f:
                output = f.read()
                for line in output.split("\n"):
                    if line:
                        transaction_list.append(float(line))
        except Exception as e:
            print "faild to read %s: %s" % (file_name, str(e))
            return transaction_list
        # print "success to read %s" % (file_name)
        return transaction_list

    def calculate_pod_info(self):
        """Sum usage/limits/restarts and status counters over producer pods;
        returns them as a space-separated string fragment."""
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_cpu_overlimit = 0
        app_memory_overlimit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0
        for pod in self.w.app_list[self.app_name].keys():
            for item in self.w.app_list[self.app_name][pod].keys():
                if item in ["cpu_value"]:
                    app_cpu_value += self.w.app_list[
                        self.app_name][pod]["cpu_value"]
                elif item in ["memory_value"]:
                    app_memory_value += self.w.app_list[
                        self.app_name][pod]["memory_value"]
                elif item in ["pod_cpu_limits"]:
                    app_cpu_limit += self.w.app_list[
                        self.app_name][pod]["pod_cpu_limits"]
                elif item in ["pod_memory_limits"]:
                    app_memory_limit += self.w.app_list[
                        self.app_name][pod]["pod_memory_limits"]
                elif item in ["restart"]:
                    app_restart += self.w.app_list[
                        self.app_name][pod]["restart"]
                elif item == "status":
                    status = self.w.app_list[self.app_name][pod]["status"]
                    if status in ["Running"]:
                        app_status_running += 1
                    if status in ["CrashLoopBackOff"]:
                        app_status_crashloopbackoff += 1
                    if status in ["OOMKilled"]:
                        app_status_oomkilled += 1
        print "- Producers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart)
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        """Count pods at/over their cpu/memory limit; returns
        "<cpu_over> <mem_over> <replicas>"."""
        app_cpu_overlimit = 0
        app_memory_overlimit = 0
        # calculate overlimit
        for pod in self.w.app_list[self.app_name].keys():
            cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"]
            memory_value = self.w.app_list[self.app_name][pod]["memory_value"]
            cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"]
            memory_limit = self.w.app_list[
                self.app_name][pod]["pod_memory_limits"]
            if cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit <= memory_value:
                app_memory_overlimit += 1
        num_replica = len(self.w.app_list[self.app_name].keys())
        #print "- Producers: OverLimit %s; OOM: %s\n" % (app_cpu_overlimit, app_memory_overlimit)
        output = "%s %s %s" % (app_cpu_overlimit, app_memory_overlimit,
                               num_replica)
        return output

    def calculate_performance(self, producer_info):
        """Record-weighted average of latency/throughput stats across pods;
        returns them as a space-separated string fragment."""
        app_record = 0
        app_999th_latency = 0
        app_max_latency = 0
        app_99th_latency = 0
        app_95th_latency = 0
        app_throughput = 0
        app_avg_latency = 0
        app_50th_latency = 0
        # get avg. latency
        for pod in producer_info.keys():
            for item in producer_info[pod].keys():
                if item == "record":
                    app_record += producer_info[pod][item]
                if item == "max_latency":
                    app_max_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "throughput":
                    app_throughput += producer_info[pod][item] * producer_info[
                        pod]["record"]
                elif item == "avg_latency":
                    app_avg_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "50th_latency":
                    app_50th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "95th_latency":
                    app_95th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "99th_latency":
                    app_99th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "99.9th_latency":
                    app_999th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
        if app_record == 0:
            # sentinel avoids ZeroDivisionError; yields negated averages
            app_record = -1
        app_max_latency = app_max_latency / app_record
        app_throughput = app_throughput / app_record
        app_avg_latency = app_avg_latency / app_record
        app_50th_latency = app_50th_latency / app_record
        app_95th_latency = app_95th_latency / app_record
        app_99th_latency = app_99th_latency / app_record
        app_999th_latency = app_999th_latency / app_record
        output = "%s %s %s %s %s %s %s %s " % (
            app_record, app_max_latency, app_throughput, app_avg_latency,
            app_50th_latency, app_95th_latency, app_99th_latency,
            app_999th_latency)
        return output

    def write_logs(self, algo_name):
        """Refresh pod data via WriteLog and append one metrics line to
        <traffic_path>/<algo>_producer_metrics; returns 0 or -1."""
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()
        timestamp = int(time.time())
        line = "%s " % timestamp
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        line += "\n"
        file_name = "%s/%s_producer_metrics" % (traffic_path, algo_name)
        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write producer logs(%s): %s" % (file_name,
                                                             str(e))
            return -1
        # print "success to write producer logs(%s)" % file_name
        return 0

    def write_latency(self, algo_name, producer_info):
        """Append one latency summary line to
        <traffic_path>/<algo>_producer_latency; returns 0 or -1."""
        timestamp = int(time.time())
        line = "%s " % timestamp
        line += self.calculate_performance(producer_info)
        line += "\n"
        file_name = "%s/%s_producer_latency" % (traffic_path, algo_name)
        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write producer latency(%s): %s" % (file_name,
                                                                str(e))
            return -1
        # print "success to write producer logs(%s)" % file_name
        return 0
class WriteLog:
    """Generic per-pod metric collector (usage vs limits, status, restarts)
    that appends samples to ./metrics/<name>; retargeted per app by the
    Producer/Consumer helpers via namespace/app_name/app_type."""
    k = Kubectl()
    wait_time = 30  # seconds between samples
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}  # {deployment: {pod: {metric: value}}}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""

    def __init__(self):
        pass

    def find_deploymentconfig_by_namespace(self, app_name):
        """Return workload names (deployment/deploymentconfig/statefulset,
        per self.app_type) in self.namespace matching app_name."""
        deployment_name_list = []
        output = ""
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        elif self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        elif self.app_type == "statefulset":
            output = self.oc.get_statefulset(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                deployment_name = line.split()[0]
                deployment_name_list.append(deployment_name)
        return deployment_name_list

    def find_pod_by_namespace(self, app_name):
        """Return pod names matching app_name, skipping build pods."""
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                pod_name = line.split()[0]
                if pod_name.find("build") != -1:
                    continue
                pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        """Rebuild self.app_list keyed by workload name."""
        self.app_list = {}
        # print ("---get deployment info---")
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}
        # print self.app_list

    def get_pod_info(self):
        """Attach each pod to its owning workload entry in self.app_list."""
        # print ("---get pod info---")
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}
        # print self.app_list

    def get_metrics(self):
        """Fill cpu_value (mCore) / memory_value (Mi) per pod via kubectl top."""
        # print ("---get metrics---")
        self.kubectl = Kubectl()
        # zero every metric first so pods missing from `top` output read 0
        for metric_item in self.metric_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment]:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) != -1:
                        # by kubectl top
                        cpu = int(line.split()[-2].strip("m"))  # mCore
                        memory = int(line.split()[-1].strip("Mi"))  # MB
                        self.app_list[deployment][pod_name]["cpu_value"] = cpu
                        self.app_list[deployment][pod_name][
                            "memory_value"] = memory
        # print self.app_list

    def get_pod_limit(self, pod_name):
        """Parse (cpu mCore, memory Mi) limits out of the pod's JSON spec;
        returns (0, 0) when unset or unparsable."""
        #print ("---get pod limit---")
        cpu_limit = 0
        memory_limit = 0
        cpu_limit_mcore = "0m"
        memory_limit_mb = "0Mi"
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            try:
                output = json.loads(output)
                if output.get("spec", {}).get("containers",
                                              [])[0].get("resources"):
                    cpu_limit_mcore = output.get("spec", {}).get(
                        "containers",
                        [])[0].get("resources").get("limits").get("cpu", "0m")
                # "250m" style -> mCore directly; bare "1" style -> cores
                if cpu_limit_mcore and cpu_limit_mcore.find("m") != -1:
                    cpu_limit = float(cpu_limit_mcore.split("m")[0])
                else:
                    cpu_limit = float(cpu_limit_mcore) * 1000
                if output.get("spec", {}).get("containers",
                                              [])[0].get("resources"):
                    memory_limit_mb = output.get("spec", {}).get(
                        "containers",
                        [])[0].get("resources").get("limits").get(
                            "memory", "0Mi")
                # "Mi"/"M" kept as-is; "Gi"/"G" converted to ~Mi
                if memory_limit_mb and memory_limit_mb.find("M") != -1:
                    memory_limit = float(memory_limit_mb.split("M")[0])
                elif memory_limit_mb and memory_limit_mb.find("G") != -1:
                    memory_limit = float(memory_limit_mb.split("G")[0]) * 1000
            except Exception as e:
                print "failed to get limits: %s" % str(e)
        return cpu_limit, memory_limit

    def get_limits(self):
        """Store cpu/memory limits per pod into self.app_list."""
        output = {}
        for metric_item in self.limit_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment].keys():
                    cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                    if metric_item == "pod_cpu_limits":
                        self.app_list[deployment][pod_name][
                            metric_item] = cpu_limit
                    else:
                        self.app_list[deployment][pod_name][
                            metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        """Return the lastState.terminated reason of the first container."""
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            output = json.loads(output)
            if output.get("status").get("containerStatuses")[0].get(
                    "lastState"):
                terminated = output.get("status").get("containerStatuses")[
                    0].get("lastState").get("terminated").get("reason")
                reason_list.append(terminated)
        return reason_list

    def get_status(self, is_reason=True):
        """Record status/restart (and optionally the terminated reason)
        per pod from the pod listing."""
        output = self.oc.get_pods(self.namespace)
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                for line in output.split("\n"):
                    if line.find(self.app_name) != -1:
                        pod = line.split()[0]
                        if pod == pod_name:
                            status = line.split()[2]
                            restart = int(line.split()[3])
                            self.app_list[deployment][pod_name][
                                "status"] = status
                            self.app_list[deployment][pod_name][
                                "restart"] = restart
                            if is_reason:
                                reason_list = self.get_pod_reason(pod_name)
                                self.app_list[deployment][pod_name][
                                    "reason"] = reason_list

    def get_node_status(self):
        """Return {node: {status, cpu(mCore), memory(Mi)}} via kubectl top node."""
        # print "get node status"
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                status = line.split()[1]
                node_info[node_name] = {}
                node_info[node_name]["status"] = status
                usage_output = self.k.top_node(node_name)
                for line in usage_output.split("\n"):
                    if line.find(node_name) != -1:
                        cpu = int(line.split()[1].split("m")[0])
                        memory = int(line.split()[3].split("Mi")[0])
                        node_info[node_name]["cpu"] = cpu
                        node_info[node_name]["memory"] = memory
        # print node_info
        return node_info

    def calculate_overlimit(self, algo, time_count):
        """Sample every wait_time seconds for time_count minutes, printing a
        summary and appending one metrics line per workload per sample."""
        cpu_count = 0
        memory_count = 0
        count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        print "--- %s collect data and write to logs for %d minutes ---" % (
            algo.split("_")[0].upper(), time_count)
        for i in range(data_count):
            start_time = time.time()
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            self.get_status()
            print "--- %s start to collect data at %d/%d interval(in 30 sec) ---" % (
                algo.split("_")[0], i, data_interval * 2)
            for deployment in self.app_list.keys():
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # pod
                for pod in self.app_list[deployment].keys():
                    if self.app_list[deployment][pod].get("pod_cpu_limits"):
                        cpu_limit = self.app_list[deployment][pod][
                            "pod_cpu_limits"]
                        memory_limit = self.app_list[deployment][pod][
                            "pod_memory_limits"]
                    cpu = self.app_list[deployment][pod]["cpu_value"]
                    memory = self.app_list[deployment][pod]["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = self.app_list[deployment][pod].get("restart", 0)
                    total_restart += restart
                    reason = self.app_list[deployment][pod].get("reason", [])
                    total_terminated += len(reason)
                num_replica = len(self.app_list[deployment].keys())
                print self.app_name, "total_cpu=", total_cpu, "m"
                print self.app_name, "total_memory=", total_memory, "Mi"
                print self.app_name, "current replica=%d" % num_replica
                print self.app_name, "overflow=", cpu_count, "times"
                print self.app_name, "oom=", memory_count, "times"
                print self.app_name, "restart=", total_restart, "times"
                print self.app_name, "terminated=", total_terminated, "times"
                print "\n"
                total_status = 0
                algo_name = "%s-%s" % (self.app_name, algo)
                # NOTE(review): `restart` here is the last pod's restart
                # count, not total_restart -- confirm which was intended
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status
                ]
                self.write_metric(data)
            # print "wait %d seconds" % self.wait_time
            # correct time: sleep until 30s elapsed since this iteration
            # started. NOTE(review): sleeps in 5s steps (OverLimit uses 1s),
            # so the wakeup can overshoot by up to 5s -- confirm intended.
            interval = 30
            for j in range(interval):
                end_time = time.time()
                if end_time - start_time >= interval:
                    start_time = start_time + interval
                    break
                time.sleep(5)

    def write_metric(self, data):
        """Append one space-separated sample line (plus a unix timestamp)
        to ./metrics/<algo_name>."""
        # print "write metrics"
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print "failed to write metrics:%s" % str(e)
class Consumer(Client): oc = OC() k = Kubectl() w = WriteLog() def __init__(self): super(Consumer, self).__init__() self.namespace = "myproject" self.app_name = "consumer" self.app_type = "deployment" self.w.namespace = self.namespace self.w.app_name = self.app_name self.w.app_type = self.app_type def wait_time(self, value): # print "wait %d seconds" % value time.sleep(value) def calculate_pod_info(self): app_cpu_value = 0 app_memory_value = 0 app_cpu_limit = 0 app_memory_limit = 0 app_restart = 0 app_status_running = 0 app_status_crashloopbackoff = 0 app_status_oomkilled = 0 for pod in self.w.app_list[self.app_name].keys(): for item in self.w.app_list[self.app_name][pod].keys(): if item in ["cpu_value"]: app_cpu_value += self.w.app_list[ self.app_name][pod]["cpu_value"] elif item in ["memory_value"]: app_memory_value += self.w.app_list[ self.app_name][pod]["memory_value"] elif item in ["pod_cpu_limits"]: app_cpu_limit += self.w.app_list[ self.app_name][pod]["pod_cpu_limits"] elif item in ["pod_memory_limits"]: app_memory_limit += self.w.app_list[ self.app_name][pod]["pod_memory_limits"] elif item in ["restart"]: app_restart += self.w.app_list[ self.app_name][pod]["restart"] elif item == "status": status = self.w.app_list[self.app_name][pod]["status"] if status in ["Running"]: app_status_running += 1 if status in ["CrashLoopBackOff"]: app_status_crashloopbackoff += 1 elif item == "reason": reason_list = self.w.app_list[self.app_name][pod]["reason"] for reason in reason_list: if reason == "OOMKilled": app_status_oomkilled += 1 print "- Consumers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s OOMKilled %s" % ( app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit, app_restart, app_status_oomkilled) output = "%s %s %s %s %s %s %s %s " % ( app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit, app_restart, app_status_running, app_status_crashloopbackoff, app_status_oomkilled) return output def calculate_overlimit(self): app_cpu_overlimit = 0 
app_memory_overlimit = 0 # calculate overlimit for pod in self.w.app_list[self.app_name].keys(): cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"] memory_value = self.w.app_list[self.app_name][pod]["memory_value"] cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"] memory_limit = self.w.app_list[ self.app_name][pod]["pod_memory_limits"] if cpu_limit <= cpu_value: app_cpu_overlimit += 1 if memory_limit <= memory_value: app_memory_overlimit += 1 num_replica = len(self.w.app_list[self.app_name].keys()) print "- Consumers: Replica: %s\n" % (num_replica) output = "%s %s %s " % (app_cpu_overlimit, app_memory_overlimit, num_replica) return output def calculate_performance(self, group_name, topic_name): total_lag = 0 total_log_offset = 0 total_current_offset = 0 active_client = 0 inactive_client = 0 partition_list = [] active_client_list = [] start_time = time.time() num_sample = 3 # print "--------", group_name, topic_name for i in range(num_sample): output = self.describe_consumer_group(group_name) print "===" print "%s" % output print "===" for line in output.split("\n"): if line and line.find(topic_name) != -1 and line.find( "Error") == -1: partition = int(line.split()[2]) if partition not in partition_list: partition_list.append(partition) current_offset = int(line.split()[3]) log_offset = int(line.split()[4]) lag = int(line.split()[5]) consumer_id = line.split()[6] total_log_offset += log_offset total_current_offset += current_offset total_lag += lag if consumer_id.find("consumer-1") == -1: inactive_client += 1 if consumer_id not in active_client_list: active_client_list.append(consumer_id) # print i, "total describe lag=", lag, time.time() total_lag = total_lag / (num_sample * 1.0) total_log_offset = total_log_offset / (num_sample * 1.0) total_current_offset = total_current_offset / (num_sample * 1.0) inactive_client = inactive_client / (num_sample * 1.0) active_client = len(active_client_list) print "- Consumers: Log Offset %s;" % 
total_log_offset, "Current Offset %s;" % total_current_offset, "Lag %s;" % total_lag print "- Consumers: Active %s;" % active_client, "Inactive %s" % inactive_client print "\n" output = "%s %s %s %s %s %s %s %s " % ( group_name, topic_name, total_lag, active_client, inactive_client, total_log_offset, total_current_offset, len(partition_list)) end_time = time.time() #print ">> describe time = ", end_time - start_time return output def write_logs(self, algo_name, group_name, topic_name): self.w.get_deploymentconfig() self.w.get_pod_info() self.w.get_limits() self.w.get_metrics() self.w.get_status() file_name = "%s/%s_consumer_metrics" % (traffic_path, algo_name) timestamp = int(time.time()) line = "%s " % (timestamp) line += self.calculate_pod_info() line += self.calculate_overlimit() # hungo test - block calculate (per maygy) #line += self.calculate_performance(group_name, topic_name) line += "\n" try: with open(file_name, "a") as f: f.write(line) except Exception as e: print "failed to write consumer logs(%s): %s" % (file_name, str(e)) return -1 # print "success to write consumer logs(%s)" % file_name return 0 def delete_all_consumer_groups(self): # delete all consumer groups group_list = self.list_consumer_group() for group in group_list: output = self.delete_consumer_group(group)
class Zookeeper(Client): oc = OC() k = Kubectl() w = WriteLog() def __init__(self): super(Zookeeper, self).__init__() self.namespace = "myproject" self.app_name = "my-cluster-zookeeper" self.app_type = "statefulset" self.w.namespace = self.namespace self.w.app_name = self.app_name self.w.app_type = self.app_type def wait_time(self, value): # print "wait %d seconds" % value time.sleep(value) def calculate_pod_info(self): app_cpu_value = 0 app_memory_value = 0 app_cpu_limit = 0 app_memory_limit = 0 app_restart = 0 app_status_running = 0 app_status_crashloopbackoff = 0 app_status_oomkilled = 0 for pod in self.w.app_list[self.app_name].keys(): for item in self.w.app_list[self.app_name][pod].keys(): if item in ["cpu_value"]: app_cpu_value += self.w.app_list[ self.app_name][pod]["cpu_value"] elif item in ["memory_value"]: app_memory_value += self.w.app_list[ self.app_name][pod]["memory_value"] elif item in ["pod_cpu_limits"]: app_cpu_limit += self.w.app_list[ self.app_name][pod]["pod_cpu_limits"] elif item in ["pod_memory_limits"]: app_memory_limit += self.w.app_list[ self.app_name][pod]["pod_memory_limits"] elif item in ["restart"]: app_restart += self.w.app_list[ self.app_name][pod]["restart"] elif item == "status": status = self.w.app_list[self.app_name][pod]["status"] if status in ["Running"]: app_status_running += 1 if status in ["CrashLoopBackOff"]: app_status_crashloopbackoff += 1 if status in ["OOMKilled"]: app_status_oomkilled += 1 print "- Zookeepers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % ( app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit, app_restart) output = "%s %s %s %s %s %s %s %s " % ( app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit, app_restart, app_status_running, app_status_crashloopbackoff, app_status_oomkilled) return output def calculate_overlimit(self): app_cpu_overlimit = 0 app_memory_overlimit = 0 # calculate overlimit for pod in self.w.app_list[self.app_name].keys(): cpu_value = 
self.w.app_list[self.app_name][pod]["cpu_value"] memory_value = self.w.app_list[self.app_name][pod]["memory_value"] cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"] memory_limit = self.w.app_list[ self.app_name][pod]["pod_memory_limits"] if cpu_limit <= cpu_value: app_cpu_overlimit += 1 if memory_limit <= memory_value: app_memory_overlimit += 1 num_replica = len(self.w.app_list[self.app_name].keys()) print "- Zookeepers: OverLimit %s; OOM: %s\n" % (app_cpu_overlimit, app_memory_overlimit) output = "%s %s %s" % (app_cpu_overlimit, app_memory_overlimit, num_replica) return output def write_logs(self, algo_name): self.w.get_deploymentconfig() self.w.get_pod_info() self.w.get_limits() self.w.get_metrics() self.w.get_status() file_name = "%s/%s_zookeeper_metrics" % (traffic_path, algo_name) timestamp = int(time.time()) line = "%s " % (timestamp) line += self.calculate_pod_info() line += self.calculate_overlimit() line += "\n" try: with open(file_name, "a") as f: f.write(line) except Exception as e: print "failed to write zookeeper logs(%s): %s" % (file_name, str(e)) return -1 # print "success to write zookeeper logs(%s)" % file_name return 0
def enable_executor(): print "enable executor" output = OC().apply_file("alameda-executor-true.yaml") alameda_namespace = find_alameda_namespace("alameda-executor") get_executor_status(alameda_namespace, "true") return output
def disable_executor(): print "disable executor" output = OC().apply_file("alameda-executor-false.yaml") alameda_namespace = find_alameda_namespace("alameda-executor") get_executor_status(alameda_namespace, "false") return output
class Training: k = Kubectl() o = OC() n = Nginx() def __init__(self): #self.o.login("admin", "password") test = "" def get_node_list(self): node_list = [] output = self.o.get_nodes() for line in output.split("\n"): if line.find("NAME") == -1 and line: node_name = line.split()[0] node_list.append(node_name) return node_list def get_node_usage(self): # kubectl top node h5-135 # NAME CPU(cores) CPU% MEMORY(bytes) MEMORY% # h5-135 655m 8% 5703Mi 17% node_usage = {} node_usage["cpu"] = {} node_usage["memory"] = {} node_list = self.get_node_list() for node in node_list: output = self.k.top_node(node) for line in output.split("\n"): if line.find("NAME") == -1 and line: cpu_usage = int(line.split()[2].split("%")[0]) memory_usage = int(line.split()[-1].split("%")[0]) node_usage["cpu"][node] = cpu_usage node_usage["memory"][node] = memory_usage avg_node_usage = sum(node_usage["cpu"].values()) / len( node_usage["cpu"].values()) max_node_usage = max(node_usage["cpu"].values()) return max_node_usage, avg_node_usage def get_pod_usage(self, app_name, app_namespace): pod_usage = {} pod_usage["cpu"] = {} pod_usage["memory"] = {} pod_name_list = find_pod_name(app_name, app_namespace) for pod in pod_name_list: output = self.k.top_pod(pod, app_namespace) for line in output.split("\n"): if line.find("NAME") == -1 and line: cpu_usage = int(line.split()[1].split("m")[0]) memory_usage = int(line.split()[-1].split("M")[0]) pod_usage["cpu"][pod] = cpu_usage pod_usage["memory"][pod] = memory_usage avg_pod_usage = sum(pod_usage["cpu"].values()) / len( pod_usage["cpu"].values()) max_pod_usage = max(pod_usage["cpu"].values()) num_pod = len(pod_name_list) return max_pod_usage, avg_pod_usage, num_pod def import_traffic(self, ratio, i): cmd = "python ./run_ab.py %d %d &" % (0, ratio) ret = os.system(cmd) return ret def get_traffic_info(self): dir_name = "./traffic" traffic_file_list = os.listdir(dir_name) latency_list = [] for traffic in traffic_file_list: traffic_file = "./%s/%s" % (dir_name, 
traffic) if os.path.exists(traffic_file): with open(traffic_file, "r") as f: output = f.read() for line in output.split("\n"): if line.find("Connect: ") != -1: avg_connect_latency = int(line.split()[2]) latency_list.append(avg_connect_latency) return latency_list def collect_usage(self, app_namespace, app_name): data = {} max_node_usage_list = [] avg_node_usage_list = [] max_pod_usage_list = [] avg_pod_usage_list = [] start_time = time.time() timeout = 120 print "collect %ds resource usage" % timeout while True: end_time = time.time() if end_time - start_time > timeout: print "time is up to %ds..." % timeout break max_node_usage, avg_node_usage = self.get_node_usage() max_pod_usage, avg_pod_usage, num_pod = self.get_pod_usage( app_name, app_namespace) self.get_traffic_info() max_node_usage_list.append(max_node_usage) avg_node_usage_list.append(avg_node_usage) max_pod_usage_list.append(max_pod_usage) avg_pod_usage_list.append(avg_pod_usage) time.sleep(5) connect_latency_list = self.get_traffic_info() max_node_usage = sum(max_node_usage_list) / len(max_node_usage_list) avg_node_usage = sum(avg_node_usage_list) / len(avg_node_usage_list) max_pod_usage = sum(max_pod_usage_list) / len(max_pod_usage_list) avg_pod_usage = sum(avg_pod_usage_list) / len(avg_pod_usage_list) avg_connect_latency = sum(connect_latency_list) / len( connect_latency_list) print "max. node =", max_node_usage, "%" print "avg. node =", avg_node_usage, "%" print "max. pod = ", max_pod_usage, "m" print "avg. pod = ", avg_pod_usage, "m" print "avg. connect latency = ", avg_connect_latency, "ms" data["max_node"] = max_node_usage data["avg_node"] = avg_node_usage data["max_pod"] = max_pod_usage data["avg_pod"] = avg_pod_usage data["avg_connect_latency"] = avg_connect_latency return data
class Prometheus_Query: p = Prometheus() instance_name = "10.244.0.85:9308" oc = OC() def __init__(self): ns, ip, port = self.get_kafka_exporter_ip() if ip and port: self.instance_name = "%s:%s" % (ip, port) def get_kafka_exporter_ip(self): ns = "" ip = "" port = "" output = self.oc.get_services_all_namespace() try: for line in output.split("\n"): if line.find("my-cluster-kafka-exporter") != -1: ns = line.split()[0] ip = line.split()[3] port = line.split()[5].split("/")[0].split(":")[0] except Exception as e: print "it cannot find kafka exporter ip: %s" % str(e) return ns, ip, port print "find namespace (%s) exporter ip (%s:%s)" % (ns, ip, port) return ns, ip, port def query_lag(self): # cmd = 'sum(kafka_consumergroup_lag{instance="%s",topic=~"%s"}) by (consumergroup, topic)' % (self.instance_name, topic_name) cmd = 'sum(kafka_consumergroup_lag{topic=~"%s"})' % (topic_name) output = self.p.run_cmd(cmd) return output def query_avg_lag(self): cmd = 'avg_over_time(kafka_consumergroup_lag{topic="%s",consumergroup="%s"}[1m])' % ( topic_name, group_name) output = self.p.run_cmd(cmd) return output def query_log_offset(self): cmd = 'sum(kafka_topic_partition_current_offset{topic=~"%s"})' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_log_offset_by_min(self): cmd = 'sum(delta(kafka_topic_partition_current_offset{topic=~"%s"}[3m])/3)' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_log_offset_by_sec(self): cmd = 'sum(rate(kafka_topic_partition_current_offset{topic=~"%s"}[1m]))' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_current_offset(self): cmd = 'sum(kafka_consumergroup_current_offset{topic=~"%s"})' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_current_offset_by_min(self): cmd = 'sum(delta(kafka_consumergroup_current_offset{topic=~"%s"}[3m])/3)' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_current_offset_by_sec(self): cmd = 
'sum(rate(kafka_consumergroup_current_offset{topic=~"%s"}[1m]))' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_lag_by_sec(self): cmd = 'sum(rate(kafka_consumergroup_lag{topic=~"%s"}[1m]))' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_lag_by_min(self): cmd = 'sum(delta(kafka_consumergroup_lag{topic=~"%s"}[3m])/3)' % ( topic_name) output = self.p.run_cmd(cmd) return output def query_pod_start_time(self, pod_name): cmd = 'kube_pod_start_time{pod="%s"}' % pod_name output = self.p.run_cmd(cmd) return output def wait_time(self, value): # print "wait %d seconds" % value time.sleep(value)
def stop_k8shpa(namespace, resource): if k8shpa_type == "lag": file_name = "%s/consumer-hpa.yaml" % config_path output = OC().delete_file(file_name) print "%s" % output
def get_executor_status(namespace, desired_status): output = OC().get_configmap(namespace, "alameda-executor-config") if output.find(desired_status) == -1: raise Exception("executor must be %s" % desired_status)
def find_app_location(app_name, namespace=""): app_namespace = "" app_type = "" resource = "" app_list = [] output = OC().get_deployments_all_namespace() if output.find(app_name) != -1: for line in output.split("\n"): if line.find(app_name) != -1: app_namespace = line.split()[0] app_type = "deployment" resource = line.split()[1] app = {} app["namespace"] = app_namespace app["resource_type"] = app_type app["resource"] = resource app_list.append(app) if not app_list: output = OC().get_deploymentconfigs_all_namespace() if output.find(app_name) != -1: for line in output.split("\n"): if line.find(app_name) != -1: app_namespace = line.split()[0] app_type = "deploymentconfig" resource = line.split()[1] app = {} app["namespace"] = app_namespace app["resource_type"] = app_type app["resource"] = resource app_list.append(app) if not app_list: raise Exception("app: %s is not existed" % app_name) # do not choose if namespace: for app in app_list: if app["namespace"] == namespace and app["resource"] == app_name: break return app_namespace, app_type, resource app_namespace = app["namespace"] app_type = app["resource_type"] resource = app["resource"] if query_mode: # show app i = 0 print "\n" print "*******************************************************************" print " Applications:" for app in app_list: print " %d) namespace: %s %s: %s" % (i, app["namespace"], app["resource_type"], app["resource"]) i = i + 1 print "*******************************************************************\n" sys.stdin = open('/dev/tty') try: x = raw_input("input prefered application (default:0): ") if not x: x = 0 except Exception: x = 0 x = int(x) app_namespace = app_list[x]["namespace"] app_type = app_list[x]["resource_type"] resource = app_list[x]["resource"] print "preferred application is %s/%s" % (app_namespace, resource) os.environ["NAMESPACE"] = app_namespace os.environ["RESOURCE"] = resource os.environ["RESOURCE_TYPE"] = app_type return app_namespace, app_type, resource