def update_app_limit(app_namespace, app_type, resource):
    # rewrite ./resource.yaml with the module-level cpu_limit/memory_limit values
    # and patch the target deployment/deploymentconfig through OC()
    output = {}
    file_name = "./resource.yaml"
    tmp_file_name = "%s.tmp" % file_name
    try:
        with open(file_name, "r") as f_r:
            output = yaml.load(f_r)
            output["spec"]["template"]["spec"]["containers"][0]["name"] = resource
            app_image = get_image_name(app_namespace, app_type, resource)
            if app_image:
                output["spec"]["template"]["spec"]["containers"][0]["image"] = app_image
            output["spec"]["template"]["spec"]["containers"][0]["resources"]["limits"]["cpu"] = str(cpu_limit)+"m"
            output["spec"]["template"]["spec"]["containers"][0]["resources"]["requests"]["cpu"] = str(cpu_limit)+"m"
            output["spec"]["template"]["spec"]["containers"][0]["resources"]["limits"]["memory"] = str(memory_limit)+"Mi"
            output["spec"]["template"]["spec"]["containers"][0]["resources"]["requests"]["memory"] = str(memory_limit)+"Mi"
        with open(tmp_file_name, "w") as f_w:
            yaml.dump(output, f_w)
    except Exception as e:
        print "failed to update %s: %s" % (file_name, str(e))
        return -1
    os.rename(tmp_file_name, file_name)
    if app_type == "deploymentconfig":
        OC().patch_deploymentconfig(app_namespace, resource, file_name)
    else:
        OC().patch_deployment(app_namespace, resource, file_name)
    print "success to update limits(cpu=%dm and memory=%dMi) for %s" % (cpu_limit, memory_limit, resource)
    return 0
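# A minimal usage sketch (assumed, not part of the original module): cpu_limit and
# memory_limit are module-level settings, and OC() wraps the oc/kubectl CLI. The
# namespace and resource names below are hypothetical.
#
#     cpu_limit = 500       # mCore
#     memory_limit = 256    # Mi
#     if update_app_limit("myproject", "deployment", "consumer") != 0:
#         print "update_app_limit failed"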
def delete_consumer_yaml():
    file_name = "%s/consumer-deployment.yaml" % config_path
    if number_k8shpa == 1 and k8shpa_type == "cpu":
        file_name = "%s/consumer-cpu-deployment.yaml" % config_path
    o = OC()
    output = o.delete_file(file_name)
    return output
def clean_data(algo, namespace, resource_type, resource):
    output = OC().scale_replica(namespace, resource_type, resource, 0)
    #if algo in ["k8shpa", "alameda"]:
    output = OC().scale_replica(namespace, resource_type, resource,
                                initial_replica)
    # else:
    #     output = OC().scale_replica(namespace, resource_type, resource, overprovision_replica)
    return output
def get_pod_name(pod_name_prefix):
    o = OC()
    pod_name = ""
    pod_output = o.get_pods(namespace).split("\n")
    for line in pod_output:
        if line.find(pod_name_prefix) != -1:
            pod_name = line.split()[0]
            break
    return pod_name
def start_k8shpa(namespace, resource_type, resource, num_replica_max, percent):
    print "=== Start K8sHPA ===", k8shpa_type
    if k8shpa_type == "cpu":
        output = OC().autoscale_replica(namespace, resource_type, resource,
                                        num_replica_max, percent)
    elif k8shpa_type == "memory":
        file_name = "./k8shpa_memory.yaml"
        output = update_k8shpa_yaml(file_name, namespace, resource, percent)
        output = OC().apply_file(file_name)
    return output
def find_alameda_namespace(app_name):
    namespace = ""
    output = OC().get_pods_all_namespace()
    for line in output.split("\n"):
        if line.find(app_name) != -1:
            namespace = line.split()[0]
            break
    if namespace:
        print "find %s's namespace: %s" % (app_name, namespace)
    else:
        raise Exception("ns: %s is not existed" % namespace)
    return namespace
def get_image_name(app_namespace, app_type, resource):
    output = ""
    image_name = ""
    oc = OC()
    if app_type == "deploymentconfig":
        output = oc.get_specific_deploymentconfig(app_namespace, resource)
    elif app_type == "deployment":
        output = oc.get_specific_deployment(app_namespace, resource)
    if output:
        output = yaml.load(output)
        for container_info in output.get("spec").get("template").get("spec").get("containers"):
            if container_info.get("name") == resource:
                image_name = container_info.get("image")
                break
    return image_name
def find_pod_name(app_name, app_namespace):
    pod_name_list = []
    status = ""
    output = OC().get_pods(app_namespace)
    for line in output.split("\n"):
        if line.find(app_name) != -1:
            pod_name = line.split()[0]
            if pod_name.find("build") != -1:
                continue
            status = line.split()[2]
            if status not in ["Running"]:
                raise Exception("%s is %s" % (pod_name, status))
            pod_name_list.append(pod_name)
    if not pod_name_list:
        raise Exception("%s is not existed in %s" % (app_name, app_namespace))
    return pod_name_list
def restart_pod(app_name, app_namespace):
    output = ""
    pod_name_list = find_pod_name(app_name, app_namespace)
    for pod_name in pod_name_list:
        output = OC().delete_pod(pod_name, app_namespace)
        print output
    return output
def do_main(args):
    app_name = args.app_name[0]
    ret = OC().check_platform()
    # if ret == 0: #OpenShift
    #     user = args.user[0]
    #     passwd = args.password[0]
    #     OC().login(user, passwd)
    # ask users to check execution or not
    if query_mode:
        ret = check_execution(app_name)
        if ret != 0:
            print "exit"
            return 0
    try:
        initial_environment()
        check_environment(app_name)
        main(app_name)
    except KeyboardInterrupt:
        print "pgogram exit with keyboard interrupt"
        kill_process()
    except Exception as e:
        print "failed to test HPA: %s" % str(e)
        kill_process()
    kill_process()
    return 0
# Example #11
    # presumably a method of the Prometheus wrapper class used elsewhere in this file
    def __init__(self):
        # OC().check_platform() returns 0 if the oc command exists
        self.oc_platform = not OC().check_platform()
        if self.oc_platform:
            self.endpoint = define.prometheus_endpoint
            self.token = define.prometheus_token
        else:
            self.endpoint = self._get_endpoint_from_service()
def update_app_limit(app_namespace, app_type, resource):
    output = {}
    result = ""
    if app_type == "deploymentconfig":
        result = OC().get_specific_deploymentconfig(app_namespace, resource)
    elif app_type == "deployment":
        result = OC().get_specific_deployment(app_namespace, resource)
    output = yaml.load(result)
    output["spec"]["template"]["spec"]["containers"][0]["resources"] = {}
    output["spec"]["template"]["spec"]["containers"][0]["resources"][
        "limits"] = {}
    output["spec"]["template"]["spec"]["containers"][0]["resources"][
        "requests"] = {}
    if cpu_limit != 0:
        output["spec"]["template"]["spec"]["containers"][0]["resources"][
            "limits"]["cpu"] = str(cpu_limit) + "m"
        output["spec"]["template"]["spec"]["containers"][0]["resources"][
            "requests"]["cpu"] = str(cpu_limit) + "m"
    if memory_limit != 0:
        output["spec"]["template"]["spec"]["containers"][0]["resources"][
            "limits"]["memory"] = str(memory_limit) + "Mi"
        output["spec"]["template"]["spec"]["containers"][0]["resources"][
            "requests"]["memory"] = str(memory_limit) + "Mi"
    output["metadata"].pop("creationTimestamp")
    output["metadata"].pop("generation")
    output["metadata"].pop("resourceVersion")
    output["metadata"].pop("selfLink")
    output["metadata"].pop("uid")
    output.pop("status")
    file_name = "./resource.yaml"
    tmp_file_name = "%s.tmp" % file_name
    try:
        with open(tmp_file_name, "w") as f_w:
            yaml.dump(output, f_w)
    except Exception as e:
        print "failed to update %s: %s" % (file_name, str(e))
        return -1
    os.rename(tmp_file_name, file_name)
    OC().apply_file(file_name)
    print "success to update limits(cpu=%dm and memory=%dMi) for %s" % (
        cpu_limit, memory_limit, resource)
    return 0
# Example #13
    def _get_endpoint_from_service(self):
        endpoint = ""
        output = OC().get_service(define.prometheus_namespace).split("\n")
        for line in output:
            if line.find(define.prometheus_operator_name) != -1:
                ip = line.split()[2]
                port = line.split()[-2].split(":")[0]
                endpoint = "http://%s:%s/api/v1" % (ip, port)
                break

        print "Prometheus: found endpoint(%s)" % endpoint
        return endpoint
class FetchLog:
    k = Kubectl()
    o = OC()
    namespace = "myproject"
    app_name = "consumer"

    def __init__(self):
        pass

    def get_pod_list(self):
        pod_list = []
        output = self.o.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(self.app_name) != -1 and line:
                pod_name = line.split()[0]
                pod_list.append(pod_name)
        return pod_list

    def fetch_log(self, pod_name):
        # print "pod_name=", pod_name
        data = {}
        data["start_complete_time"] = 0
        data["partitions_revoked_time"] = {}
        data["partitions_assigned_time"] = {}
        data["rebalance_time"] = {}
        revoked_index = 0
        assigned_index = 0
        output = self.o.log_pod(self.namespace, pod_name)
        for line in output.split("\n"):
            if line.find("startup_complete") != -1:
                # print line
                startup_complete_time = json.loads(line).get("timestamp")
                data["start_complete_time"] = int(startup_complete_time)
            if line.find("partitions_revoked") != -1:
                # print line
                partitions_revoked_time = json.loads(line).get("timestamp")
                data["partitions_revoked_time"][revoked_index] = int(partitions_revoked_time)
                revoked_index += 1
            if line.find("partitions_assigned") != -1:
                # print line
                partitions_assigned_time = json.loads(line).get("timestamp")
                data["partitions_assigned_time"][assigned_index] = int(partitions_assigned_time)
                assigned_index += 1
        rebalance_count = len(data["partitions_assigned_time"].keys())
        for i in range(rebalance_count):
            if i == 0:
                data["rebalance_time"][i] = data["partitions_assigned_time"][i] - data["start_complete_time"]
            elif data["partitions_revoked_time"]:
                data["rebalance_time"][i] = data["partitions_assigned_time"][i] - data["partitions_revoked_time"][i-1]
        # print data
        return data
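# A minimal usage sketch (assumed): walk the consumer pods and print each pod's
# first rebalance interval, computed by fetch_log() from the "startup_complete",
# "partitions_revoked" and "partitions_assigned" JSON log events.
#
#     fl = FetchLog()
#     for pod in fl.get_pod_list():
#         data = fl.fetch_log(pod)
#         if data["rebalance_time"]:
#             print pod, "first rebalance interval:", data["rebalance_time"][0]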
def start_k8shpa(namespace, resource_type, resource, num_replica, value):
    output = ""
    print "=== Start K8sHPA: %s ===" % k8shpa_type
    if k8shpa_type == "cpu":
        print "=== Set up autoscale ==="
        output = OC().autoscale_replica(namespace, resource_type, resource, num_replica, value)
        print "%s" % output
    if k8shpa_type == "lag":
        file_name = "%s/consumer-hpa.yaml" % config_path
        print "=== Applying file: %s ===" % file_name
        output = OC().apply_file(file_name)
        print "%s" % output
        
    # elif k8shpa_type == "memory":
    #     file_name = "./k8shpa_memory.yaml"
    #     output = update_k8shpa_yaml(file_name, namespace, resource, value)
    #     output = OC().apply_file(file_name)
    # elif k8shpa_type == "consumergroup_lag":
    #     file_name = "./k8shpa_consumergrouplag.yaml"
    #     # output = update_k8shpa_yaml(file_name, namespace, resource, value)
    #     output = OC().apply_file(file_name)
    #     print output
    return output
def get_recommender_log(pod_name_prefix):
    log_data = {}
    o = OC()
    pod_name = get_pod_name(pod_name_prefix)
    log_output = o.log_pod(namespace, pod_name).split("\n")
    for line in log_output:
        if line and line.find("Desired") != -1:
            if line.find("Evaluate Cost parameters") != -1:
                timestamp = line.split()[0].split(".")[0]
                cost_data_list = line.split("\t")[-1].split()
            if line.find("desired replicas result") != -1 and line.find(
                    "best") == -1:
                timestamp = line.split()[0].split(".")[0]
                log_data[timestamp] = {}
                recommender_data_list = json.loads(
                    line.split("\t")[-1].split()[-1])
                count = 0
                for data in recommender_data_list:
                    if count == 0:
                        log_data[timestamp]["ma%d_data" % count] = data
                    else:
                        log_data[timestamp]["predict%d_data" % count] = data
                    count += 1
            if line.find("best desired replicas result") != -1:
                timestamp = line.split()[0].split(".")[0]
                if log_data.get(timestamp):
                    best_desired_replicas = json.loads(
                        line.split("\t")[-1].split()[-1]).get(
                            "DesiredReplicas")
                    log_data[timestamp][
                        "best_desired_replicas"] = best_desired_replicas
                    best_execution_time = json.loads(
                        line.split("\t")[-1].split()[-1]).get(
                            "BestExecutationTime")
                    log_data[timestamp][
                        "best_execution_time"] = best_execution_time
    return log_data
def wait_pod_running(namespace):
    print "wait pods in %s running" % namespace
    start_time = time.time()
    while True:
        output = OC().get_pods(namespace)
        print "wait 30 sec"
        time.sleep(30)
        end_time = time.time()
        if end_time - start_time >= 600:
            raise Exception("timeout for waiting pods running")
            break
        count = 0
        for line in output.split("\n"):
            if line and line.find("NAME") == -1:
                status = line.split()[2]
                ready = line.split()[1]
                if ready != "1/1" and status != "Running":
                    print line
                elif (ready == "1/1" or ready == "2/2" or ready == "3/3") and status == "Running":
                    count += 1
        print "%s pods in %s are running" % (count, namespace) 
        if count >= 8 + initial_consumer:
            print "all pods in %s are running and ready" % namespace
            break
class OverLimit:
    k = Kubectl()
    wait_time = 30
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""
    prometheus = Prometheus()

    def __init__(self):
        app_namespace = os.environ.get("NAMESPACE") or "nginx"
        app_type = os.environ.get("RESOURCE_TYPE") or "deployment"
        resource = os.environ.get("RESOURCE") or "nginx"
        self.namespace = app_namespace
        self.app_name = resource
        self.app_type = app_type

    def find_deploymentconfig_by_namespace(self, app_name):
        deployment_name_list = []
        output = {}
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        if self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                deployment_name = line.split()[0]
                deployment_name_list.append(deployment_name)
        return deployment_name_list

    def find_pod_by_namespace(self, app_name):
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                pod_name = line.split()[0]
                if pod_name.find("build") != -1:
                    continue
                pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        self.app_list = {}
        # print ("---get deployment info---")
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}
        # print self.app_list

    def get_pod_info(self):
        # print ("---get pod info---")
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}
        # print self.app_list

    def get_metrics(self):
        # print ("---get metrics---")
        self.kubectl = Kubectl()
        for metric_item in self.metric_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment]:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) != -1:
                        # by kubectl top
                        cpu = int(line.split()[-2].strip("m"))  # mCore
                        memory = int(line.split()[-1].strip("Mi"))  # MB
                        self.app_list[deployment][pod_name]["cpu_value"] = cpu
                        self.app_list[deployment][pod_name][
                            "memory_value"] = memory
        # print self.app_list

    def get_pod_limit(self, pod_name):
        # print ("---get pod limit---")
        cpu_limit = d_cpu_limit
        memory_limit = d_memory_limit

        # data collect interval needs less than 30s
        # return cpu/memory limit from setting directly
        return cpu_limit, memory_limit

        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            try:
                output = json.loads(output)
                cpu_limit1 = output.get("spec", {}).get(
                    "containers",
                    [])[0].get("resources").get("limits").get("cpu")
                if cpu_limit1 and cpu_limit1.find("m") != -1:
                    cpu_limit = float(cpu_limit1.split("m")[0])
                else:
                    cpu_limit = float(cpu_limit1) * 1000
                memory_limit1 = output.get("spec", {}).get(
                    "containers",
                    [])[0].get("resources").get("limits").get("memory")
                if memory_limit1 and memory_limit1.find("M") != -1:
                    memory_limit = float(memory_limit1.split("M")[0])
                elif memory_limit1 and memory_limit1.find("G") != -1:
                    memory_limit = float(memory_limit1.split("G")[0]) * 1000
            except Exception as e:
                print "failed to get limits: %s" % str(e)
        return cpu_limit, memory_limit

    def get_limits(self):
        output = {}
        for metric_item in self.limit_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment].keys():
                    cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                    if metric_item == "pod_cpu_limits":
                        self.app_list[deployment][pod_name][
                            metric_item] = cpu_limit
                    else:
                        self.app_list[deployment][pod_name][
                            metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            output = json.loads(output)
            if output.get("status").get("containerStatuses")[0].get(
                    "lastState"):
                terminated = output.get("status").get(
                    "containerStatuses")[0].get("lastState").get("terminated")
                reason_list.append(terminated)
        return reason_list

    def get_status(self):
        output = self.oc.get_pods(self.namespace)
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                for line in output.split("\n"):
                    if line.find(self.app_name) != -1:
                        pod = line.split()[0]
                        if pod == pod_name:
                            reason_list = self.get_pod_reason(pod_name)
                            status = line.split()[2]
                            restart = int(line.split()[3])
                            self.app_list[deployment][pod_name][
                                "status"] = status
                            self.app_list[deployment][pod_name][
                                "restart"] = restart
                            self.app_list[deployment][pod_name][
                                "reason"] = reason_list

    def get_node_status(self):
        # print "get node status"
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                status = line.split()[1]
                node_info[node_name] = {}
                node_info[node_name]["status"] = status
                usage_output = self.k.top_node(node_name)
                for line in usage_output.split("\n"):
                    if line.find(node_name) != -1:
                        cpu = int(line.split()[1].split("m")[0])
                        memory = int(line.split()[3].split("Mi")[0])
                        node_info[node_name]["cpu"] = cpu
                        node_info[node_name]["memory"] = memory
        # print node_info
        return node_info

    def get_http_requests(self):
        #query = "%s{namespace=\"%s\"}" % (ingress_http_requests_name, ingress_namespace)
        query = "sum(idelta(haproxy_server_http_responses_total{exported_namespace=\"nginx\",route=\"nginx-service\",code=\"2xx\"}[2m]))"
        output = self.prometheus.query_value(query)
        return float(output) / 2.0

    def calculate_overlimit(self, algo, time_count):
        cpu_count = 0
        memory_count = 0
        count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        print "--- %s collect data and write to logs for %d minutes ---" % (
            algo.split("_")[0].upper(), time_count)

        start_time = time.time()
        for i in range(data_count):
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            # self.get_status()

            print "--- %s start to collect data at %d/%d interval(in 30 sec), start: %s, current: %s ---" % (
                algo.split("_")[0], i, data_interval * 2, start_time,
                time.time())
            for deployment in self.app_list.keys():
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # pod
                for pod in self.app_list[deployment].keys():
                    if self.app_list[deployment][pod].get("pod_cpu_limits"):
                        cpu_limit = self.app_list[deployment][pod][
                            "pod_cpu_limits"]
                        memory_limit = self.app_list[deployment][pod][
                            "pod_memory_limits"]
                    cpu = self.app_list[deployment][pod]["cpu_value"]
                    memory = self.app_list[deployment][pod]["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = self.app_list[deployment][pod].get("restart", 0)
                    total_restart += restart
                    reason = self.app_list[deployment][pod].get("reason", [])
                    total_terminated += len(reason)
                num_replica = len(self.app_list[deployment].keys())

                # http requests
                http_requests = self.get_http_requests()

                print self.app_name, "total_cpu=", total_cpu, "m"
                print self.app_name, "total_memory=", total_memory, "Mi"
                print self.app_name, "current replica=%d" % num_replica
                print self.app_name, "overflow=", cpu_count, "times"
                print self.app_name, "oom=", memory_count, "times"
                print self.app_name, "restart=", total_restart, "times"
                print self.app_name, "terminated=", total_terminated, "times"
                print self.app_name, "http_requests=%s" % http_requests
                print "\n"
                total_status = 0
                total_node_cpu = 0
                total_node_memory = 0

                # # skip collect node info (take too long)

                # node
                #node_info = self.get_node_status()
                #for node in node_info.keys():
                #    if node_info[node].get("status").find("NotReady") != -1:
                #        total_status += 1
                #    total_node_cpu += node_info[node]["cpu"]
                #    total_node_memory += node_info[node]["memory"]

                algo_name = "%s-%s" % (self.app_name, algo)
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status, total_node_cpu, total_node_memory,
                    http_requests
                ]
                self.write_metric(data)
            # print "wait %d seconds" % self.wait_time
            # correct time
            interval = 30
            for j in range(interval):
                end_time = time.time()
                if end_time - start_time >= interval:
                    start_time = start_time + interval
                    break
                time.sleep(1)

    def write_metric(self, data):
        # print "write metrics"
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print "failed to write metrics:%s" % str(e)
def apply_producer_yaml():
    file_name = "%s/producer-deployment.yaml" % config_path
    o = OC()
    output = o.apply_file(file_name)
    return output
class Producer(Client):
    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Producer, self).__init__()
        self.namespace = "myproject"
        self.app_name = "producer"
        self.app_type = "deployment"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        # print "wait %d seconds" % value
        time.sleep(value)

    def read_transaction_list(self):
        transaction_list = []
        file_name = "./transaction.txt"
        try:
            with open(file_name, "r") as f:
                output = f.read()
                for line in output.split("\n"):
                    if line:
                        transaction_list.append(float(line))
        except Exception as e:
            print "faild to read %s: %s" % (file_name, str(e))
            return transaction_list
        # print "success to read %s" % (file_name)
        return transaction_list

    def calculate_pod_info(self):
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0
        for pod in self.w.app_list[self.app_name].keys():
            for item in self.w.app_list[self.app_name][pod].keys():
                if item in ["cpu_value"]:
                    app_cpu_value += self.w.app_list[
                        self.app_name][pod]["cpu_value"]
                elif item in ["memory_value"]:
                    app_memory_value += self.w.app_list[
                        self.app_name][pod]["memory_value"]
                elif item in ["pod_cpu_limits"]:
                    app_cpu_limit += self.w.app_list[
                        self.app_name][pod]["pod_cpu_limits"]
                elif item in ["pod_memory_limits"]:
                    app_memory_limit += self.w.app_list[
                        self.app_name][pod]["pod_memory_limits"]
                elif item in ["restart"]:
                    app_restart += self.w.app_list[
                        self.app_name][pod]["restart"]
                elif item == "status":
                    status = self.w.app_list[self.app_name][pod]["status"]
                    if status in ["Running"]:
                        app_status_running += 1
                    if status in ["CrashLoopBackOff"]:
                        app_status_crashloopbackoff += 1
                    if status in ["OOMKilled"]:
                        app_status_oomkilled += 1
        print "- Producers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart)
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        app_cpu_overlimit = 0
        app_memory_overlimit = 0

        # calculate overlimit
        for pod in self.w.app_list[self.app_name].keys():
            cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"]
            memory_value = self.w.app_list[self.app_name][pod]["memory_value"]
            cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"]
            memory_limit = self.w.app_list[
                self.app_name][pod]["pod_memory_limits"]
            if cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit <= memory_value:
                app_memory_overlimit += 1
        num_replica = len(self.w.app_list[self.app_name].keys())
        #print "- Producers: OverLimit %s; OOM: %s\n" % (app_cpu_overlimit, app_memory_overlimit)
        output = "%s %s %s" % (app_cpu_overlimit, app_memory_overlimit,
                               num_replica)
        return output

    def calculate_performance(self, producer_info):
        app_record = 0
        app_999th_latency = 0
        app_max_latency = 0
        app_99th_latency = 0
        app_95th_latency = 0
        app_throughput = 0
        app_avg_latency = 0
        app_50th_latency = 0

        # get avg. latency
        for pod in producer_info.keys():
            for item in producer_info[pod].keys():
                if item == "record":
                    app_record += producer_info[pod][item]
                if item == "max_latency":
                    app_max_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "throughput":
                    app_throughput += producer_info[pod][item] * producer_info[
                        pod]["record"]
                elif item == "avg_latency":
                    app_avg_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "50th_latency":
                    app_50th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "95th_latency":
                    app_95th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "99th_latency":
                    app_99th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
                elif item == "99.9th_latency":
                    app_999th_latency += producer_info[pod][
                        item] * producer_info[pod]["record"]
        if app_record == 0:
            app_record = -1
        app_max_latency = app_max_latency / app_record
        app_throughput = app_throughput / app_record
        app_avg_latency = app_avg_latency / app_record
        app_50th_latency = app_50th_latency / app_record
        app_95th_latency = app_95th_latency / app_record
        app_99th_latency = app_99th_latency / app_record
        app_999th_latency = app_999th_latency / app_record
        output = "%s %s %s %s %s %s %s %s " % (
            app_record, app_max_latency, app_throughput, app_avg_latency,
            app_50th_latency, app_95th_latency, app_99th_latency,
            app_999th_latency)
        return output

    def write_logs(self, algo_name):
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()

        timestamp = int(time.time())
        line = "%s " % timestamp
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        line += "\n"

        file_name = "%s/%s_producer_metrics" % (traffic_path, algo_name)
        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write producer logs(%s): %s" % (file_name, str(e))
            return -1

        # print "success to write producer logs(%s)" % file_name
        return 0

    def write_latency(self, algo_name, producer_info):
        timestamp = int(time.time())
        line = "%s " % timestamp
        line += self.calculate_performance(producer_info)
        line += "\n"

        file_name = "%s/%s_producer_latency" % (traffic_path, algo_name)
        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write producer latency(%s): %s" % (file_name,
                                                                str(e))
            return -1

        # print "success to write producer logs(%s)" % file_name
        return 0
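# A minimal usage sketch (assumed): record producer pod metrics under an algorithm
# label; traffic_path is a module-level output directory and producer_info maps
# pod name -> {"record": ..., "avg_latency": ..., "throughput": ..., ...}.
#
#     p = Producer()
#     p.write_logs("k8shpa")
#     # p.write_latency("k8shpa", producer_info)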
# Example #21
class WriteLog:
    k = Kubectl()
    wait_time = 30
    metric_item_list = ["cpu_value", "memory_value"]
    limit_item_list = ["pod_cpu_limits", "pod_memory_limits"]
    request_item_list = ["pod_cpu_requests", "pod_memory_requests"]
    app_list = {}
    app_name = ""
    namespace = ""
    cpu_limit = 0
    mem_limit = 0
    oc = OC()
    app_type = ""

    def __init__(self):
        pass

    def find_deploymentconfig_by_namespace(self, app_name):
        deployment_name_list = []
        output = ""
        if self.app_type == "deployment":
            output = self.oc.get_deployment(self.namespace)
        elif self.app_type == "deploymentconfig":
            output = self.oc.get_deploymentconfig(self.namespace)
        elif self.app_type == "statefulset":
            output = self.oc.get_statefulset(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                deployment_name = line.split()[0]
                deployment_name_list.append(deployment_name)
        return deployment_name_list

    def find_pod_by_namespace(self, app_name):
        pod_name_list = []
        output = self.oc.get_pods(self.namespace)
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                pod_name = line.split()[0]
                if pod_name.find("build") != -1:
                    continue
                pod_name_list.append(pod_name)
        return pod_name_list

    def get_deploymentconfig(self):
        self.app_list = {}
        # print ("---get deployment info---")
        deployment_name_list = self.find_deploymentconfig_by_namespace(
            self.app_name)
        for deployment in deployment_name_list:
            self.app_list[deployment] = {}
        # print self.app_list

    def get_pod_info(self):
        # print ("---get pod info---")
        pod_name_list = self.find_pod_by_namespace(self.app_name)
        for pod_name in pod_name_list:
            for deployment in self.app_list.keys():
                if pod_name.find(deployment) != -1:
                    self.app_list[deployment][pod_name] = {}
        # print self.app_list

    def get_metrics(self):
        # print ("---get metrics---")
        self.kubectl = Kubectl()
        for metric_item in self.metric_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment]:
                    self.app_list[deployment][pod_name][metric_item] = 0
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                output = self.kubectl.top_pod(pod_name, self.namespace)
                for line in output.split("\n"):
                    if line.find(pod_name) != -1:
                        # by kubectl top
                        cpu = int(line.split()[-2].strip("m"))  # mCore
                        memory = int(line.split()[-1].strip("Mi"))  # MB
                        self.app_list[deployment][pod_name]["cpu_value"] = cpu
                        self.app_list[deployment][pod_name][
                            "memory_value"] = memory
        # print self.app_list

    def get_pod_limit(self, pod_name):
        #print ("---get pod limit---")
        cpu_limit = 0
        memory_limit = 0
        cpu_limit_mcore = "0m"
        memory_limit_mb = "0Mi"
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            try:
                output = json.loads(output)
                if output.get("spec", {}).get("containers",
                                              [])[0].get("resources"):
                    cpu_limit_mcore = output.get("spec", {}).get(
                        "containers",
                        [])[0].get("resources").get("limits").get("cpu", "0m")
                if cpu_limit_mcore and cpu_limit_mcore.find("m") != -1:
                    cpu_limit = float(cpu_limit_mcore.split("m")[0])
                else:
                    cpu_limit = float(cpu_limit_mcore) * 1000
                if output.get("spec", {}).get("containers",
                                              [])[0].get("resources"):
                    memory_limit_mb = output.get("spec", {}).get(
                        "containers",
                        [])[0].get("resources").get("limits").get(
                            "memory", "0Mi")
                if memory_limit_mb and memory_limit_mb.find("M") != -1:
                    memory_limit = float(memory_limit_mb.split("M")[0])
                elif memory_limit_mb and memory_limit_mb.find("G") != -1:
                    memory_limit = float(memory_limit_mb.split("G")[0]) * 1000
            except Exception as e:
                print "failed to get limits: %s" % str(e)
        return cpu_limit, memory_limit

    def get_limits(self):
        output = {}
        for metric_item in self.limit_item_list:
            for deployment in self.app_list.keys():
                for pod_name in self.app_list[deployment].keys():
                    cpu_limit, memory_limit = self.get_pod_limit(pod_name)
                    if metric_item == "pod_cpu_limits":
                        self.app_list[deployment][pod_name][
                            metric_item] = cpu_limit
                    else:
                        self.app_list[deployment][pod_name][
                            metric_item] = memory_limit

    def get_pod_reason(self, pod_name):
        reason_list = []
        output = self.oc.get_pod_json(pod_name, self.namespace)
        if output:
            output = json.loads(output)
            if output.get("status").get("containerStatuses")[0].get(
                    "lastState"):
                terminated = output.get("status").get("containerStatuses")[
                    0].get("lastState").get("terminated").get("reason")
                reason_list.append(terminated)
        return reason_list

    def get_status(self, is_reason=True):
        output = self.oc.get_pods(self.namespace)
        for deployment in self.app_list.keys():
            for pod_name in self.app_list[deployment].keys():
                for line in output.split("\n"):
                    if line.find(self.app_name) != -1:
                        pod = line.split()[0]
                        if pod == pod_name:
                            status = line.split()[2]
                            restart = int(line.split()[3])
                            self.app_list[deployment][pod_name][
                                "status"] = status
                            self.app_list[deployment][pod_name][
                                "restart"] = restart
                            if is_reason:
                                reason_list = self.get_pod_reason(pod_name)
                                self.app_list[deployment][pod_name][
                                    "reason"] = reason_list

    def get_node_status(self):
        # print "get node status"
        node_info = {}
        output = self.oc.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                status = line.split()[1]
                node_info[node_name] = {}
                node_info[node_name]["status"] = status
                usage_output = self.k.top_node(node_name)
                for line in usage_output.split("\n"):
                    if line.find(node_name) != -1:
                        cpu = int(line.split()[1].split("m")[0])
                        memory = int(line.split()[3].split("Mi")[0])
                        node_info[node_name]["cpu"] = cpu
                        node_info[node_name]["memory"] = memory
        # print node_info
        return node_info

    def calculate_overlimit(self, algo, time_count):
        cpu_count = 0
        memory_count = 0
        count = 0
        total_restart = 0
        total_terminated = 0
        data_count = int(time_count * 60 / self.wait_time)
        print "--- %s collect data and write to logs for %d minutes ---" % (
            algo.split("_")[0].upper(), time_count)
        for i in range(data_count):
            start_time = time.time()
            self.get_deploymentconfig()
            self.get_pod_info()
            self.get_limits()
            self.get_metrics()
            self.get_status()
            print "--- %s start to collect data at %d/%d interval(in 30 sec) ---" % (
                algo.split("_")[0], i, data_interval * 2)
            for deployment in self.app_list.keys():
                cpu_limit = 0
                memory_limit = 0
                total_cpu = 0
                total_memory = 0
                total_cpu_limit = 0
                total_memory_limit = 0
                # pod
                for pod in self.app_list[deployment].keys():
                    if self.app_list[deployment][pod].get("pod_cpu_limits"):
                        cpu_limit = self.app_list[deployment][pod][
                            "pod_cpu_limits"]
                        memory_limit = self.app_list[deployment][pod][
                            "pod_memory_limits"]
                    cpu = self.app_list[deployment][pod]["cpu_value"]
                    memory = self.app_list[deployment][pod]["memory_value"]
                    total_cpu += cpu
                    total_memory += memory
                    total_cpu_limit += cpu_limit
                    total_memory_limit += memory_limit
                    if cpu >= cpu_limit and cpu_limit != 0:
                        cpu_count += 1
                    if memory >= memory_limit and memory_limit != 0:
                        memory_count += 1
                    restart = self.app_list[deployment][pod].get("restart", 0)
                    total_restart += restart
                    reason = self.app_list[deployment][pod].get("reason", [])
                    total_terminated += len(reason)
                num_replica = len(self.app_list[deployment].keys())
                print self.app_name, "total_cpu=", total_cpu, "m"
                print self.app_name, "total_memory=", total_memory, "Mi"
                print self.app_name, "current replica=%d" % num_replica
                print self.app_name, "overflow=", cpu_count, "times"
                print self.app_name, "oom=", memory_count, "times"
                print self.app_name, "restart=", total_restart, "times"
                print self.app_name, "terminated=", total_terminated, "times"
                print "\n"
                total_status = 0
                algo_name = "%s-%s" % (self.app_name, algo)
                data = [
                    algo_name, total_cpu, total_cpu_limit, total_memory,
                    total_memory_limit, cpu_count, memory_count, num_replica,
                    restart, total_status
                ]
                self.write_metric(data)
            # print "wait %d seconds" % self.wait_time
            # correct time
            interval = 30
            for j in range(interval):
                end_time = time.time()
                if end_time - start_time >= interval:
                    start_time = start_time + interval
                    break
                time.sleep(1)  # check once per second so the 30s interval is not overshot

    def write_metric(self, data):
        # print "write metrics"
        timestamp = str(int(time.time()))
        data.append(timestamp)
        try:
            pod_name = data[0]
            fn = "./metrics/%s" % pod_name
            with open(fn, "a") as f:
                line = " ".join([str(elem) for elem in data])
                f.write("%s\n" % str(line))
        except Exception as e:
            print "failed to write metrics:%s" % str(e)
# Example #22
class Consumer(Client):
    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Consumer, self).__init__()
        self.namespace = "myproject"
        self.app_name = "consumer"
        self.app_type = "deployment"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        # print "wait %d seconds" % value
        time.sleep(value)

    def calculate_pod_info(self):
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0
        for pod in self.w.app_list[self.app_name].keys():
            for item in self.w.app_list[self.app_name][pod].keys():
                if item in ["cpu_value"]:
                    app_cpu_value += self.w.app_list[
                        self.app_name][pod]["cpu_value"]
                elif item in ["memory_value"]:
                    app_memory_value += self.w.app_list[
                        self.app_name][pod]["memory_value"]
                elif item in ["pod_cpu_limits"]:
                    app_cpu_limit += self.w.app_list[
                        self.app_name][pod]["pod_cpu_limits"]
                elif item in ["pod_memory_limits"]:
                    app_memory_limit += self.w.app_list[
                        self.app_name][pod]["pod_memory_limits"]
                elif item in ["restart"]:
                    app_restart += self.w.app_list[
                        self.app_name][pod]["restart"]
                elif item == "status":
                    status = self.w.app_list[self.app_name][pod]["status"]
                    if status in ["Running"]:
                        app_status_running += 1
                    if status in ["CrashLoopBackOff"]:
                        app_status_crashloopbackoff += 1
                elif item == "reason":
                    reason_list = self.w.app_list[self.app_name][pod]["reason"]
                    for reason in reason_list:
                        if reason == "OOMKilled":
                            app_status_oomkilled += 1
        print "- Consumers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s OOMKilled %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_oomkilled)
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        app_cpu_overlimit = 0
        app_memory_overlimit = 0

        # calculate overlimit
        for pod in self.w.app_list[self.app_name].keys():
            cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"]
            memory_value = self.w.app_list[self.app_name][pod]["memory_value"]
            cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"]
            memory_limit = self.w.app_list[
                self.app_name][pod]["pod_memory_limits"]
            if cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit <= memory_value:
                app_memory_overlimit += 1
        num_replica = len(self.w.app_list[self.app_name].keys())
        print "- Consumers: Replica: %s\n" % (num_replica)
        output = "%s %s %s " % (app_cpu_overlimit, app_memory_overlimit,
                                num_replica)
        return output

    def calculate_performance(self, group_name, topic_name):
        total_lag = 0
        total_log_offset = 0
        total_current_offset = 0
        active_client = 0
        inactive_client = 0
        partition_list = []
        active_client_list = []
        start_time = time.time()
        num_sample = 3
        # print "--------", group_name, topic_name
        for i in range(num_sample):
            output = self.describe_consumer_group(group_name)
            print "==="
            print "%s" % output
            print "==="
            for line in output.split("\n"):
                if line and line.find(topic_name) != -1 and line.find(
                        "Error") == -1:
                    partition = int(line.split()[2])
                    if partition not in partition_list:
                        partition_list.append(partition)
                    current_offset = int(line.split()[3])
                    log_offset = int(line.split()[4])
                    lag = int(line.split()[5])
                    consumer_id = line.split()[6]
                    total_log_offset += log_offset
                    total_current_offset += current_offset
                    total_lag += lag
                    if consumer_id.find("consumer-1") == -1:
                        inactive_client += 1
                    if consumer_id not in active_client_list:
                        active_client_list.append(consumer_id)
            # print i, "total describe lag=", lag, time.time()
        total_lag = total_lag / (num_sample * 1.0)
        total_log_offset = total_log_offset / (num_sample * 1.0)
        total_current_offset = total_current_offset / (num_sample * 1.0)
        inactive_client = inactive_client / (num_sample * 1.0)
        active_client = len(active_client_list)
        print "- Consumers: Log Offset %s;" % total_log_offset, "Current Offset %s;" % total_current_offset, "Lag %s;" % total_lag
        print "- Consumers: Active %s;" % active_client, "Inactive %s" % inactive_client
        print "\n"
        output = "%s %s %s %s %s %s %s %s " % (
            group_name, topic_name, total_lag, active_client, inactive_client,
            total_log_offset, total_current_offset, len(partition_list))
        end_time = time.time()
        #print ">> describe time = ", end_time - start_time
        return output

    def write_logs(self, algo_name, group_name, topic_name):
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()

        file_name = "%s/%s_consumer_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        # hungo test - block calculate (per maygy)
        #line += self.calculate_performance(group_name, topic_name)
        line += "\n"

        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write consumer logs(%s): %s" % (file_name, str(e))
            return -1

        # print "success to write consumer logs(%s)" % file_name
        return 0

    def delete_all_consumer_groups(self):
        # delete all consumer groups
        group_list = self.list_consumer_group()
        for group in group_list:
            output = self.delete_consumer_group(group)
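# A minimal usage sketch (assumed): log consumer metrics for a Kafka consumer group
# once per collection round; the group and topic names below are placeholders.
#
#     c = Consumer()
#     c.write_logs("k8shpa", "my-group", "my-topic")
#     # c.delete_all_consumer_groups()  # optional cleanup between test runs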
class Zookeeper(Client):
    oc = OC()
    k = Kubectl()
    w = WriteLog()

    def __init__(self):
        super(Zookeeper, self).__init__()
        self.namespace = "myproject"
        self.app_name = "my-cluster-zookeeper"
        self.app_type = "statefulset"
        self.w.namespace = self.namespace
        self.w.app_name = self.app_name
        self.w.app_type = self.app_type

    def wait_time(self, value):
        # print "wait %d seconds" % value
        time.sleep(value)

    def calculate_pod_info(self):
        app_cpu_value = 0
        app_memory_value = 0
        app_cpu_limit = 0
        app_memory_limit = 0
        app_restart = 0
        app_status_running = 0
        app_status_crashloopbackoff = 0
        app_status_oomkilled = 0

        for pod in self.w.app_list[self.app_name].keys():
            for item in self.w.app_list[self.app_name][pod].keys():
                if item in ["cpu_value"]:
                    app_cpu_value += self.w.app_list[
                        self.app_name][pod]["cpu_value"]
                elif item in ["memory_value"]:
                    app_memory_value += self.w.app_list[
                        self.app_name][pod]["memory_value"]
                elif item in ["pod_cpu_limits"]:
                    app_cpu_limit += self.w.app_list[
                        self.app_name][pod]["pod_cpu_limits"]
                elif item in ["pod_memory_limits"]:
                    app_memory_limit += self.w.app_list[
                        self.app_name][pod]["pod_memory_limits"]
                elif item in ["restart"]:
                    app_restart += self.w.app_list[
                        self.app_name][pod]["restart"]
                elif item == "status":
                    status = self.w.app_list[self.app_name][pod]["status"]
                    if status in ["Running"]:
                        app_status_running += 1
                    if status in ["CrashLoopBackOff"]:
                        app_status_crashloopbackoff += 1
                    if status in ["OOMKilled"]:
                        app_status_oomkilled += 1

        print "- Zookeepers: CPU %s/%s mCore; Memory %s/%s Mi; Restart %s" % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart)
        output = "%s %s %s %s %s %s %s %s " % (
            app_cpu_value, app_cpu_limit, app_memory_value, app_memory_limit,
            app_restart, app_status_running, app_status_crashloopbackoff,
            app_status_oomkilled)
        return output

    def calculate_overlimit(self):
        app_cpu_overlimit = 0
        app_memory_overlimit = 0

        # calculate overlimit
        for pod in self.w.app_list[self.app_name].keys():
            cpu_value = self.w.app_list[self.app_name][pod]["cpu_value"]
            memory_value = self.w.app_list[self.app_name][pod]["memory_value"]
            cpu_limit = self.w.app_list[self.app_name][pod]["pod_cpu_limits"]
            memory_limit = self.w.app_list[
                self.app_name][pod]["pod_memory_limits"]
            if cpu_limit <= cpu_value:
                app_cpu_overlimit += 1
            if memory_limit <= memory_value:
                app_memory_overlimit += 1
        num_replica = len(self.w.app_list[self.app_name].keys())
        print "- Zookeepers: OverLimit %s; OOM: %s\n" % (app_cpu_overlimit,
                                                         app_memory_overlimit)
        output = "%s %s %s" % (app_cpu_overlimit, app_memory_overlimit,
                               num_replica)
        return output

    def write_logs(self, algo_name):
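        # Refresh pod info, limits, metrics, and status, then append one
        # space-separated sample line to <traffic_path>/<algo_name>_zookeeper_metrics.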
        self.w.get_deploymentconfig()
        self.w.get_pod_info()
        self.w.get_limits()
        self.w.get_metrics()
        self.w.get_status()

        file_name = "%s/%s_zookeeper_metrics" % (traffic_path, algo_name)
        timestamp = int(time.time())
        line = "%s " % (timestamp)
        line += self.calculate_pod_info()
        line += self.calculate_overlimit()
        line += "\n"

        try:
            with open(file_name, "a") as f:
                f.write(line)
        except Exception as e:
            print "failed to write zookeeper logs(%s): %s" % (file_name,
                                                              str(e))
            return -1

        # print "success to write zookeeper logs(%s)" % file_name
        return 0
def enable_executor():
    print "enable executor"
    output = OC().apply_file("alameda-executor-true.yaml")
    alameda_namespace = find_alameda_namespace("alameda-executor")
    get_executor_status(alameda_namespace, "true")
    return output
def disable_executor():
    print "disable executor"
    output = OC().apply_file("alameda-executor-false.yaml")
    alameda_namespace = find_alameda_namespace("alameda-executor")
    get_executor_status(alameda_namespace, "false")
    return output
class Training:
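    # Samples node/pod utilization (kubectl top) and ab connect latency while
    # traffic is generated; presumably used to collect training data for the
    # scaling experiments.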
    k = Kubectl()
    o = OC()
    n = Nginx()

    def __init__(self):
        # self.o.login("admin", "password")
        pass

    def get_node_list(self):
        node_list = []
        output = self.o.get_nodes()
        for line in output.split("\n"):
            if line.find("NAME") == -1 and line:
                node_name = line.split()[0]
                node_list.append(node_name)
        return node_list

    def get_node_usage(self):
        # kubectl top node h5-135
        # NAME      CPU(cores)   CPU%      MEMORY(bytes)   MEMORY%
        # h5-135    655m         8%        5703Mi          17%
        node_usage = {}
        node_usage["cpu"] = {}
        node_usage["memory"] = {}
        node_list = self.get_node_list()
        for node in node_list:
            output = self.k.top_node(node)
            for line in output.split("\n"):
                if line.find("NAME") == -1 and line:
                    cpu_usage = int(line.split()[2].split("%")[0])
                    memory_usage = int(line.split()[-1].split("%")[0])
                    node_usage["cpu"][node] = cpu_usage
                    node_usage["memory"][node] = memory_usage
        avg_node_usage = sum(node_usage["cpu"].values()) / len(
            node_usage["cpu"].values())
        max_node_usage = max(node_usage["cpu"].values())
        return max_node_usage, avg_node_usage

    def get_pod_usage(self, app_name, app_namespace):
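        # Parse "kubectl top pod" output for each pod of the app: CPU in
        # millicores, memory in Mi. Return the max and average CPU usage
        # across pods plus the pod count.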
        pod_usage = {}
        pod_usage["cpu"] = {}
        pod_usage["memory"] = {}
        pod_name_list = find_pod_name(app_name, app_namespace)
        for pod in pod_name_list:
            output = self.k.top_pod(pod, app_namespace)
            for line in output.split("\n"):
                if line.find("NAME") == -1 and line:
                    cpu_usage = int(line.split()[1].split("m")[0])
                    memory_usage = int(line.split()[-1].split("M")[0])
                    pod_usage["cpu"][pod] = cpu_usage
                    pod_usage["memory"][pod] = memory_usage
        avg_pod_usage = sum(pod_usage["cpu"].values()) / len(
            pod_usage["cpu"].values())
        max_pod_usage = max(pod_usage["cpu"].values())
        num_pod = len(pod_name_list)
        return max_pod_usage, avg_pod_usage, num_pod

    def import_traffic(self, ratio, i):
        cmd = "python ./run_ab.py %d %d &" % (0, ratio)
        ret = os.system(cmd)
        return ret

    def get_traffic_info(self):
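        # Read each result file under ./traffic (assumed to be ApacheBench
        # output) and collect the mean connect latency from its "Connect:" row.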
        dir_name = "./traffic"
        traffic_file_list = os.listdir(dir_name)
        latency_list = []
        for traffic in traffic_file_list:
            traffic_file = "./%s/%s" % (dir_name, traffic)
            if os.path.exists(traffic_file):
                with open(traffic_file, "r") as f:
                    output = f.read()
                    for line in output.split("\n"):
                        if line.find("Connect:  ") != -1:
                            avg_connect_latency = int(line.split()[2])
                            latency_list.append(avg_connect_latency)
        return latency_list

    def collect_usage(self, app_namespace, app_name):
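        # Sample node and pod usage every 5 seconds until the timeout expires,
        # then return the averaged max/avg node and pod CPU usage together
        # with the average connect latency.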
        data = {}
        max_node_usage_list = []
        avg_node_usage_list = []
        max_pod_usage_list = []
        avg_pod_usage_list = []
        start_time = time.time()
        timeout = 120
        print "collect %ds resource usage" % timeout
        while True:
            end_time = time.time()
            if end_time - start_time > timeout:
                print "time is up to %ds..." % timeout
                break
            max_node_usage, avg_node_usage = self.get_node_usage()
            max_pod_usage, avg_pod_usage, num_pod = self.get_pod_usage(
                app_name, app_namespace)
            self.get_traffic_info()
            max_node_usage_list.append(max_node_usage)
            avg_node_usage_list.append(avg_node_usage)
            max_pod_usage_list.append(max_pod_usage)
            avg_pod_usage_list.append(avg_pod_usage)
            time.sleep(5)
        connect_latency_list = self.get_traffic_info()
        max_node_usage = sum(max_node_usage_list) / len(max_node_usage_list)
        avg_node_usage = sum(avg_node_usage_list) / len(avg_node_usage_list)
        max_pod_usage = sum(max_pod_usage_list) / len(max_pod_usage_list)
        avg_pod_usage = sum(avg_pod_usage_list) / len(avg_pod_usage_list)
        avg_connect_latency = sum(connect_latency_list) / len(
            connect_latency_list)
        print "max. node =", max_node_usage, "%"
        print "avg. node =", avg_node_usage, "%"
        print "max. pod = ", max_pod_usage, "m"
        print "avg. pod = ", avg_pod_usage, "m"
        print "avg. connect latency = ", avg_connect_latency, "ms"
        data["max_node"] = max_node_usage
        data["avg_node"] = avg_node_usage
        data["max_pod"] = max_pod_usage
        data["avg_pod"] = avg_pod_usage
        data["avg_connect_latency"] = avg_connect_latency
        return data
Example #27
class Prometheus_Query:
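    # Thin wrapper around PromQL queries against the kafka-exporter metrics:
    # consumer-group lag, topic/consumer offsets, and their per-minute and
    # per-second rates.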
    p = Prometheus()
    instance_name = "10.244.0.85:9308"
    oc = OC()

    def __init__(self):
        ns, ip, port = self.get_kafka_exporter_ip()
        if ip and port:
            self.instance_name = "%s:%s" % (ip, port)

    def get_kafka_exporter_ip(self):
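        # Scan "oc get services" output across all namespaces for the
        # my-cluster-kafka-exporter service and extract its namespace,
        # cluster IP, and port (the part of PORT(S) before "/").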
        ns = ""
        ip = ""
        port = ""
        output = self.oc.get_services_all_namespace()
        try:
            for line in output.split("\n"):
                if line.find("my-cluster-kafka-exporter") != -1:
                    ns = line.split()[0]
                    ip = line.split()[3]
                    port = line.split()[5].split("/")[0].split(":")[0]
        except Exception as e:
            print "it cannot find kafka exporter ip: %s" % str(e)
            return ns, ip, port
        print "find namespace (%s) exporter ip (%s:%s)" % (ns, ip, port)
        return ns, ip, port

    def query_lag(self):
        # cmd = 'sum(kafka_consumergroup_lag{instance="%s",topic=~"%s"}) by (consumergroup, topic)' % (self.instance_name, topic_name)
        cmd = 'sum(kafka_consumergroup_lag{topic=~"%s"})' % (topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_avg_lag(self):
        cmd = 'avg_over_time(kafka_consumergroup_lag{topic="%s",consumergroup="%s"}[1m])' % (
            topic_name, group_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_log_offset(self):
        cmd = 'sum(kafka_topic_partition_current_offset{topic=~"%s"})' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_log_offset_by_min(self):
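        # delta(...[3m]) / 3 approximates the per-minute production rate; the
        # *_by_sec variants below use rate(...[1m]) for a per-second rate.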
        cmd = 'sum(delta(kafka_topic_partition_current_offset{topic=~"%s"}[3m])/3)' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_log_offset_by_sec(self):
        cmd = 'sum(rate(kafka_topic_partition_current_offset{topic=~"%s"}[1m]))' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_current_offset(self):
        cmd = 'sum(kafka_consumergroup_current_offset{topic=~"%s"})' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_current_offset_by_min(self):
        cmd = 'sum(delta(kafka_consumergroup_current_offset{topic=~"%s"}[3m])/3)' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_current_offset_by_sec(self):
        cmd = 'sum(rate(kafka_consumergroup_current_offset{topic=~"%s"}[1m]))' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_lag_by_sec(self):
        cmd = 'sum(rate(kafka_consumergroup_lag{topic=~"%s"}[1m]))' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_lag_by_min(self):
        cmd = 'sum(delta(kafka_consumergroup_lag{topic=~"%s"}[3m])/3)' % (
            topic_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_pod_start_time(self, pod_name):
        cmd = 'kube_pod_start_time{pod="%s"}' % pod_name
        output = self.p.run_cmd(cmd)
        return output

    def wait_time(self, value):
        # print "wait %d seconds" % value
        time.sleep(value)
def stop_k8shpa(namespace, resource):
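    # Remove the lag-based HPA defined in consumer-hpa.yaml; the cpu/memory
    # HPA variants are not torn down here.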
    if k8shpa_type == "lag":
        file_name = "%s/consumer-hpa.yaml" % config_path
        output = OC().delete_file(file_name)
        print "%s" % output
def get_executor_status(namespace, desired_status):
    output = OC().get_configmap(namespace, "alameda-executor-config")
    if output.find(desired_status) == -1:
        raise Exception("executor must be %s" % desired_status)
def find_app_location(app_name, namespace=""):
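    # Locate app_name among deployments (then deploymentconfigs) in all
    # namespaces. If a namespace is given, return the matching entry directly;
    # otherwise optionally prompt the user (query_mode) and export
    # NAMESPACE / RESOURCE / RESOURCE_TYPE for later steps.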
    app_namespace = ""
    app_type = ""
    resource = ""
    app_list = []
    output = OC().get_deployments_all_namespace()
    if output.find(app_name) != -1:
        for line in output.split("\n"):
            if line.find(app_name) != -1:
                app_namespace = line.split()[0]
                app_type = "deployment"
                resource = line.split()[1]
                app = {}
                app["namespace"] = app_namespace
                app["resource_type"] = app_type
                app["resource"] = resource
                app_list.append(app)
    if not app_list:
        output = OC().get_deploymentconfigs_all_namespace()
        if output.find(app_name) != -1:
            for line in output.split("\n"):
                if line.find(app_name) != -1:
                    app_namespace = line.split()[0]
                    app_type = "deploymentconfig"
                    resource = line.split()[1]
                    app = {}
                    app["namespace"] = app_namespace
                    app["resource_type"] = app_type
                    app["resource"] = resource
                    app_list.append(app)
    if not app_list:
        raise Exception("app: %s is not existed" % app_name)

    # a namespace was given explicitly: return the matching app without prompting
    if namespace:
        for app in app_list:
            if app["namespace"] == namespace and app["resource"] == app_name:
                app_namespace = app["namespace"]
                app_type = app["resource_type"]
                resource = app["resource"]
                break
        return app_namespace, app_type, resource

    # default selection when no namespace is given: the last match found
    app_namespace = app_list[-1]["namespace"]
    app_type = app_list[-1]["resource_type"]
    resource = app_list[-1]["resource"]
    if query_mode:
        # show app
        i = 0
        print "\n"
        print "*******************************************************************"
        print "   Applications:"
        for app in app_list:
            print "    %d) namespace: %s   %s: %s" % (i, app["namespace"], app["resource_type"], app["resource"])
            i = i + 1
        print "*******************************************************************\n"
        sys.stdin = open('/dev/tty')
        try:
            x = raw_input("input prefered application (default:0): ")
            if not x:
                x = 0
        except Exception:
            x = 0
        x = int(x)
        app_namespace = app_list[x]["namespace"]
        app_type = app_list[x]["resource_type"]
        resource = app_list[x]["resource"]

    print "preferred application is %s/%s" % (app_namespace, resource)
    os.environ["NAMESPACE"] = app_namespace
    os.environ["RESOURCE"] = resource
    os.environ["RESOURCE_TYPE"] = app_type
    return app_namespace, app_type, resource