def test_retry_on_error(self):  # noqa D102
    retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
    pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True, retry=retry)

    with self.assertRaises(requests.exceptions.RetryError, msg="too many 400 error responses"):
        pc.custom_query("BOOM.BOOM!#$%")
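# A minimal standalone sketch of the same retry wiring outside the test case.
# The URL here is a placeholder; Retry comes from urllib3 and is forwarded by
# PrometheusConnect to its underlying requests session, so repeated 400s end
# in requests.exceptions.RetryError as asserted above.
from urllib3.util.retry import Retry
from prometheus_api_client import PrometheusConnect

retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True, retry=retry)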
def get_gpu_number(self):
    max_available_gpu = 0
    pod_list = None
    # Verify that dcgm-exporter is deployed.
    try:
        pod_list = self.api_client.list_pod_for_all_namespaces(
            label_selector="app=nvidia-dcgm-exporter")
    except ApiException as e:
        if e.status != 404:
            _LOGGER.error("Exception when calling DCGM exporter pods: %s\n" % e)
    if pod_list is not None and len(pod_list.items) != 0:
        prom = PrometheusConnect(
            url=self.get_prometheus_url(),
            headers={"Authorization": "Bearer " + self.get_openshift_prometheus_token()},
            disable_ssl=True)
        for pod in pod_list.items:
            pod_ip = pod.status.pod_ip
            # Count distinct (UUID, GPU_I_ID) pairs, minus those already
            # attached to a pod (i.e. carrying an exported_pod label).
            gpu_query = (
                'count (count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'
                + pod_ip + ':9400"}) or vector(0)) - '
                'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'
                + pod_ip + ':9400", exported_pod=~".+"}) or vector(0))')
            available_gpu_in_node_data = prom.custom_query(query=gpu_query)
            available_gpu_in_node = int(available_gpu_in_node_data[0]['value'][1])
            if available_gpu_in_node > max_available_gpu:
                max_available_gpu = available_gpu_in_node
    return max_available_gpu
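# A minimal sketch of the same "free GPUs" arithmetic against a single
# dcgm-exporter endpoint. The URL and pod IP are placeholders.
from prometheus_api_client import PrometheusConnect

prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
pod_ip = "10.0.0.1"  # hypothetical dcgm-exporter pod IP
query = (
    f'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{{instance="{pod_ip}:9400"}}) or vector(0)) - '
    f'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{{instance="{pod_ip}:9400", exported_pod=~".+"}}) or vector(0))'
)
free_gpus = int(prom.custom_query(query=query)[0]["value"][1])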
def collect_metrics():
    """Collect metrics from Prometheus/Thanos."""
    pc = PrometheusConnect(
        url=_THANOS_URL,
        headers={"Authorization": f"bearer {_THANOS_TOKEN}"},
        disable_ssl=True)

    collected_info = {}
    for sli_name, sli_methods in SLIReport.REPORT_SLI_CONTEXT.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        collected_info[sli_name] = {}

        for query_name, query in sli_methods["query"].items():
            _LOGGER.info(f"Querying... {query_name}")
            try:
                metric_data = pc.custom_query(query=query)
                _LOGGER.info(f"Metric obtained... {metric_data}")
                collected_info[sli_name][query_name] = float(metric_data[0]["value"][1])
            except Exception as e:
                _LOGGER.exception(f"Could not gather metric for {sli_name}-{query_name}...{e}")
                collected_info[sli_name][query_name] = None

    return collected_info
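# The loop above assumes SLIReport.REPORT_SLI_CONTEXT maps each SLI name to a
# dict of named PromQL queries. A minimal illustrative shape (the SLI name and
# query below are hypothetical stand-ins, not the project's real definitions):
REPORT_SLI_CONTEXT = {
    "pypi_downloads": {
        "query": {
            "total_downloads": 'sum(pypi_downloads_total{project="thoth"})',
        },
    },
}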
def main():
    try:
        # Setting up Mongo DB
        MONGO_HOST = str(os.environ.get('MONGO_HOST', '127.0.0.1'))
        MONGO_PORT = str(os.environ.get('MONGO_PORT', '27017'))
        MONGO_DB = str(os.environ.get('MONGO_DBNAME', 'cpa'))
        MONGO_USER = str(os.environ.get('MONGO_USERNAME', 'root'))
        MONGO_PASS = str(os.environ.get('MONGO_PASSWORD', 'iRhrF6O0vp'))
        mongodb_client = MongoClient(
            'mongodb://{}:{}@{}:{}/?authSource=admin'.format(
                MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_PORT))
        cpa_db = mongodb_client[MONGO_DB]
        deployments_collection = cpa_db.deployments

        list_of_deployments = []
        for deployment in deployments_collection.find():
            list_of_deployments = deployment['list']

        # Setting up Prometheus
        prometheus_base = str(
            os.environ.get('PROMETHEUS_URL', 'http://192.168.23.92:9090'))
        prom = PrometheusConnect(url=prometheus_base, disable_ssl=True)

        # Get workload CPU usage, aggregated per deployment.
        query_workload_cpu = """
            sum(
              irate(container_cpu_usage_seconds_total{cluster="", namespace="default"}[2m])
              * on(namespace,pod) group_left(workload, workload_type)
              namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="default", workload_type="deployment"}
            ) by (workload, workload_type)
        """

        def get_workload_cpu_query():
            return prom.custom_query(query=query_workload_cpu)

        def get_deployments_cpu_usage(list_of_deployments):
            wl_cpu_res = get_workload_cpu_query()
            # Filter results (unit is millicores).
            filtered_cpu_query = {
                q['metric']['workload']: float(q['value'][1]) * 1000
                for q in wl_cpu_res
                if q['metric']['workload'] in list_of_deployments
            }
            # If a metric was skipped, record None instead.
            for d in list_of_deployments:
                if d not in filtered_cpu_query:
                    filtered_cpu_query[d] = None
            return filtered_cpu_query

        deployments_cpu = get_deployments_cpu_usage(list_of_deployments)

        # Parse the resource spec into a dict. The spec is a Deployment object
        # plus a "runType" key; for local testing, a sample Deployment spec
        # JSON can be piped in via stdin.
        spec = json.loads(sys.stdin.read())
        metric(spec, list_of_deployments, deployments_cpu)
    except Exception as err:
        sys.stderr.write(f"Error metric: {err}")
        sys.exit(1)
def get_current(self) -> float:
    prom = PrometheusConnect(
        url=self.config.get("url", "http://localhost:9090"),
        disable_ssl=self.config.get("disable_ssl", True),
    )
    res = prom.custom_query(query=self.query)
    if not res:
        log.error("Prometheus query: no result")
        raise Exception("Prometheus query: no result")
    log.info(f"Prometheus query result: {res}")
    return float(res[0].get("value")[-1])
class my_prometheus:
    def __init__(self, host, port, disablessl):
        self.schema = "http" if disablessl else "https"
        try:
            self.prom = PrometheusConnect(
                url=self.schema + "://" + host + ":" + port,
                disable_ssl=disablessl)
        except Exception:
            print("Error")

    def prom_query(self, query):
        # Expects a range-vector query; keeps the most recent sample.
        self.lasttemps = self.prom.custom_query(query=query)
        self.lasttemp = sorted(self.lasttemps[0]["values"], reverse=True)[0][1]
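# A usage sketch; the host/port and temperature metric name are hypothetical.
# Note that prom_query sorts result["values"], which only exists for
# range-vector queries like metric[5m]; an instant query returns "value".
p = my_prometheus("localhost", "9090", disablessl=True)
p.prom_query("sensor_temperature_celsius[5m]")
print(p.lasttemp)  # latest sample in the range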
class PrometheusClient:
    def __init__(self, promhost, promport):
        self.prom = PrometheusConnect(
            url="http://%s:%s" % (promhost, promport), disable_ssl=True)

    def get_ticktime(self):
        return self.__get_metric_for_last_five_mins("overall_ticktime")[0].get("values")

    def get_dim_ticktime(self):
        result = {}
        dim_ticktimes = self.__get_metric_for_last_five_mins("dim_ticktime")
        for dimension in dim_ticktimes:
            result[dimension.get("metric").get("dimension_name")] = dimension.get("values")
        return result

    def get_players(self):
        players = []
        for p in self.prom.custom_query("player_playtime"):
            players.append(p.get("metric").get("player"))
        return players

    def get_tps(self):
        return self.__get_metric_for_last_five_mins("overall_tps")[0].get("values")

    def get_dim_tps(self):
        result = {}
        dim_tps = self.__get_metric_for_last_five_mins("dim_tps")
        for dimension in dim_tps:
            result[dimension.get("metric").get("dimension_name")] = dimension.get("values")
        return result

    def __get_metric_for_last_five_mins(self, metricname):
        return self.prom.get_metric_range_data(
            metric_name=metricname,
            start_time=datetime.datetime.now() - datetime.timedelta(minutes=5),
            end_time=datetime.datetime.now(),
        )
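# A usage sketch for the client above; host and port are placeholders, the
# metric names come from the class itself.
client = PrometheusClient("localhost", "9090")
print(client.get_tps())      # [[timestamp, tps], ...] over the last 5 minutes
print(client.get_players())  # list of exported player names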
import pandas as pd
from prometheus_api_client import PrometheusConnect, MetricSnapshotDataFrame


def queryMetrics(customquery, trim):
    prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    data = prom.custom_query(query=customquery)

    # Build a table where each row is one metric sample.
    df = MetricSnapshotDataFrame(data)
    df = df[df.value != "NaN"]
    df[['value']] = df[['value']].apply(pd.to_numeric)
    df[['timestamp']] = df[['timestamp']].apply(pd.to_datetime, unit='s')
    sortedDf = df.sort_values('value', ascending=False).head(trim)
    return sortedDf
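# A usage sketch: the top 5 series by current value for a hypothetical query.
top = queryMetrics('node_memory_Active_bytes', 5)
print(top[['timestamp', 'value']])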
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """Network is blocked in this testcase; see the base class."""

    def setUp(self):
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz', disable_ssl=True)

    def test_network_is_blocked(self):
        resp = requests.get('https://google.com')
        self.assertEqual(resp.status_code, 403)
        self.assertEqual(resp.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)
            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')
            self.assertEqual(len(handler.requests), 1)
            request = handler.requests[0]
            self.assertEqual(request.url, 'https://redhat.com/')

    def test_unauthorized(self):
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
            self.assertEqual("HTTP Status Code 403 (b'Unauthorized')", str(exc.exception))

    def test_broken_responses(self):
        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_current_metric_value("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_metric_range_data("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query_range("query", datetime.now(), datetime.now(), "1")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query("query")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

    def test_all_metrics_method(self):
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            request = handler.requests[0]
            self.assertEqual(request.path_url, "/api/v1/label/__name__/values")
class PrometheusMetricProvider(MetricProvider):
    def __init__(self, nodes: NodeDataView):
        super().__init__(nodes)
        self._prom = PrometheusConnect(url=settings.prometheus.url)

    def get_metric(self, metric: PrometheusMetric) -> List:
        try:
            return self.__prom_request(metric.query)
        except PrometheusApiClientException as e:
            logger.error(f"Error pulling {metric}: {e}")
            return []

    def __prom_request(self, query: str) -> List:
        return self._prom.custom_query(query)

    def refresh_data(self):
        logger.debug("Pulling metrics from Prometheus")
        self._data["messages_in"] = self.__get_messages_in()
        self._data["messages_out"] = self.__get_messages_out()
        self._data["consumer_lag"] = self.__get_consumer_lag()
        self._data["consumer_read_rate"] = self.__get_consumer_read_rate()
        self._data["topic_size"] = self.__get_topic_size()
        self._data["replicas"] = self.__get_replicas()
        self._data["connector_tasks"] = self.__get_connector_tasks()

    def __get_messages_in(self) -> Dict[str, float]:
        prom_messages_in = self.get_metric(metric=PrometheusMetric.MESSAGES_IN)
        return {d["metric"]["topic"]: round(float(d["value"][-1]), 2)
                for d in prom_messages_in}

    def __get_messages_out(self) -> Dict[str, float]:
        prom_messages_out = self.get_metric(metric=PrometheusMetric.MESSAGES_OUT)
        return {d["metric"]["topic"]: round(float(d["value"][-1]), 2)
                for d in prom_messages_out}

    def __get_consumer_lag(self) -> Dict[str, int]:
        prom_consumer_lag = self.get_metric(metric=PrometheusMetric.CONSUMER_LAG)
        return {d["metric"]["group"]: int(d["value"][-1])
                for d in prom_consumer_lag}

    def __get_consumer_read_rate(self) -> Dict[str, float]:
        prom_consumer_read_rate = self.get_metric(metric=PrometheusMetric.CONSUMER_READ_RATE)
        return {d["metric"]["group"]: float(d["value"][-1])
                for d in prom_consumer_read_rate}

    def __get_topic_size(self) -> Dict[str, int]:
        prom_topic_size = self.get_metric(metric=PrometheusMetric.TOPIC_SIZE)
        return {d["metric"]["topic"]: int(d["value"][-1])
                for d in prom_topic_size}

    def __get_replicas(self) -> Dict[str, int]:
        prom_replicas = self.get_metric(metric=PrometheusMetric.REPLICAS)
        return {d["metric"]["deployment"]: int(d["value"][-1])
                for d in prom_replicas}

    def __get_connector_tasks(self) -> Dict[str, int]:
        prom_connector_tasks = self.get_metric(metric=PrometheusMetric.CONNECTOR_TASKS)
        return {d["metric"]["connector"]: int(d["value"][-1])
                for d in prom_connector_tasks}
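# PrometheusMetric is not shown above; the provider only assumes each member
# carries a .query string. A minimal illustrative shape (the PromQL bodies
# here are hypothetical stand-ins, not the project's real queries):
from enum import Enum


class PrometheusMetric(Enum):
    MESSAGES_IN = "sum by (topic) (rate(kafka_topic_partition_current_offset[1m]))"
    CONSUMER_LAG = "sum by (group) (kafka_consumergroup_lag)"

    @property
    def query(self) -> str:
        return self.value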
def process_period(config, period):
    period_start = period['instant'] + dateutil.relativedelta.relativedelta(
        seconds=-period['range_sec'])
    print(f"Processing year {period['year']}, month {period['month']}, "
          f"querying from {period['instant'].isoformat()} and going back "
          f"{period['range_sec']} s to {period_start.isoformat()}.")
    queries = QueryLogic(queryRange=(str(period['range_sec']) + 's'))

    # SSL is generally not used for Prometheus access within a cluster.
    # Docs on the instant query API: https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
    prom = PrometheusConnect(url=config.prometheus_server, disable_ssl=True)
    prom_connect_params = {'time': period['instant'].isoformat(), 'timeout': config.query_timeout}

    raw_results, results, result_lengths = {}, {}, []
    # Iterate over each query (cputime, starttime, endtime, cores), producing raw_results['cputime'] etc.
    for query_name, query_string in vars(queries).items():
        # Each of these raw_results is a list of dicts. Each dict in the list represents an
        # individual data point, and contains:
        # 'metric': a dict of one or more key-value pairs of labels, one of which is the pod name ('exported_pod').
        # 'value': a list in which the 0th element is the timestamp of the value, and the 1st element
        # is the actual value we're interested in.
        print(f'Executing {query_name} query: {query_string}')
        t1 = timer()
        raw_results[query_name] = prom.custom_query(query=query_string, params=prom_connect_params)
        t2 = timer()
        results[query_name] = dict(rearrange(raw_results[query_name]))
        result_lengths.append(len(results[query_name]))
        t3 = timer()
        print(f'Query finished in {t2 - t1} s, processed in {t3 - t2} s. '
              f'Got {len(results[query_name])} items from {len(raw_results[query_name])} results. '
              f'Peak RAM usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss}K.')
        del raw_results[query_name]

    cputime = results['cputime']
    endtime = results['endtime']
    starttime = results['starttime']
    cores = results['cores']

    # Confirm the assumption that cputime should have the fewest entries, while starttime and
    # cores may have additional ones corresponding to jobs that have started but not finished yet,
    # and endtime may have additional ones if there are pods without CPU resource requests.
    # We only want the jobs for which all values are available: start time, end time, CPU request.
    # Note that jobs which started last month and finished this month will be properly included
    # and accounted in this month.
    assert len(cputime) == min(result_lengths), "cputime should be the shortest list"

    # However, jobs that finished last month may show up in this month's data if they are still
    # present on the cluster this month (in Completed state). Exclude them by filtering with a
    # lambda (since you can't pass an argument to a function object AFAIK).
    endtime = dict(filter(lambda x: x[1] >= datetime.datetime.timestamp(period_start), endtime.items()))

    # Prepare to iterate over jobs which meet all criteria.
    valid_jobs = cputime.keys() & endtime.keys()

    # Avoid sending empty records.
    if len(valid_jobs) == 0:
        print('No records to process.')
        return

    sum_cputime = 0
    t4 = timer()
    for key in valid_jobs:
        assert endtime[key] > starttime[key], "job end time is before start time"
        # Double-check the cputime calculation of this job.
        delta = abs(cputime[key] - (endtime[key] - starttime[key]) * cores[key])
        assert delta < 0.001, "cputime calculation is inaccurate"
        sum_cputime += cputime[key]

    # CPU time as calculated here means (# cores * job duration), which apparently corresponds to
    # the concept of wall time in APEL accounting. It is not clear what CPU time means in APEL;
    # it could be the actual CPU usage % integrated over the job (# cores * job duration * usage),
    # but this does not seem to be documented clearly. Some batch systems do not actually measure
    # this, so it is not reported consistently or accurately. Some sites have CPU efficiency
    # (presumably defined as CPU time / wall time) that is up to ~500% of the walltime, or
    # always fixed at 100%. In Kubernetes, the actual CPU usage % is tracked by metrics server
    # (not KSM), which is not meant to be used for monitoring or accounting purposes and is not
    # scraped by Prometheus. So just use walltime = cputime.
    sum_cputime = round(sum_cputime)
    sum_walltime = sum_cputime

    print(f'total cputime: {sum_cputime}, total walltime: {sum_walltime}')

    # Write output to the message queue on the local filesystem.
    # https://dirq.readthedocs.io/en/latest/queuesimple.html#directory-structure
    dirq = QueueSimple(str(config.output_path))
    summary_output = summary_message(
        config,
        year=period['year'],
        month=period['month'],
        wall_time=sum_walltime,
        cpu_time=sum_cputime,
        n_jobs=len(endtime),
        # This appears faster than getting min/max during the dict iteration above.
        first_end=round(min(endtime.values())),
        last_end=round(max(endtime.values())))
    sync_output = sync_message(config, year=period['year'], month=period['month'], n_jobs=len(endtime))
    t5 = timer()
    summary_file = dirq.add(summary_output)
    sync_file = dirq.add(sync_output)
    print(f'Analyzed {len(endtime)} records in {t5 - t4} s.')
    print(f'Writing summary record to {config.output_path}/{summary_file}:')
    print('--------------------------------\n' + summary_output + '--------------------------------')
    print(f'Writing sync record to {config.output_path}/{sync_file}:')
    print('--------------------------------\n' + sync_output + '--------------------------------')
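# rearrange() is called above but not shown. A minimal sketch of what the
# surrounding code assumes: turn a Prometheus instant-query result list into
# (pod_name, numeric_value) pairs keyed on the 'exported_pod' label, so that
# dict(rearrange(...)) yields a pod -> value mapping.
def rearrange(query_results):
    for item in query_results:
        yield item['metric']['exported_pod'], float(item['value'][1])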
class CaptureHelper:
    def __init__(self, docker_client_services_path, docker_server_services_path,
                 ingress_distribution_file_path, docker_lb_container_path, service_list):
        self.docker_client_services = get_docker_services(docker_client_services_path)
        self.docker_server_services = get_docker_services(docker_server_services_path)
        self.get_ingress_distribution = get_docker_services(ingress_distribution_file_path)
        self.docker_lb_services = get_docker_services(docker_lb_container_path)
        self.prom = PrometheusConnect(url="http://131.155.35.54:9090", disable_ssl=True)
        self.capture_time = CAPTURE_TIME
        self.service_list = service_list

    def capture_data(self, ingress_nodes, time_up_already):
        """
        Input: array of ingress nodes to measure, e.g. ingress_nodes = ["node1", "node2"].
        Get the latency from the Prometheus server API.

        Returns:
            latency = {"node4": {"search": 24, "shop": 26, "web": 15, "media": 10},
                       "node3": {"search": 24, "shop": 26, "web": 15, "media": 10}}
                -> for each client in an ingress node
            dropped_traffic = {"node4": {...}, "node3": {...}, "node2": {...}}
                -> for each server in an edge node
            ingress_bw = {"node4": {"search": 200, "shop": 560, "web": 100, "media": 1500},
                          "node3": {"search": 250, "shop": 700, "web": 450, "media": 2900}}
                -> for each client in an ingress node
        """
        # Sleep 15 s in total: 5 s to stabilize, 10 s for calculation.
        sleep_time = self.capture_time
        if time_up_already < self.capture_time:
            sleep_time = self.capture_time - time_up_already
        elif time_up_already < self.capture_time * 1.5:
            sleep_time = 1
        logger.info(f"sleep_time: {sleep_time}")
        time.sleep(sleep_time)

        # Ingress request number:
        ingress_request = self.calculate_ingress_request()

        # Successful and dropped connections:
        pre_succ_request = self.get_metric_value("success_conn_total")
        succ_request = self.pre_process(pre_succ_request)
        pre_drop_request = self.get_metric_value("drop_conn_total")
        drop_request = self.pre_process(pre_drop_request)

        # Latency (90th percentile from histograms):
        pre_latency = self.calculate_latency_histogram()
        latency = self.pre_process(pre_latency)

        return latency, drop_request, succ_request, ingress_request

    def check_lb_containers(self):
        """Check whether any LB container (except media for now) is empty or not up yet, and return its name."""
        working = True
        container_list = list()
        for node, containers in self.docker_lb_services.items():
            for cont_name, cont_value in containers.items():
                number = ''.join(x for x in cont_name if x.isdigit())
                url = ("http://" + cont_value['IP_ADDRESS_NODE_' + str(number)]
                       + ":" + str(cont_value['PORT_ADDRESS_SEARCH_LISTEN']))
                # TODO: try and except here for the module!
                response = check_connection(url)
                if not response:
                    logger.error(f"Error in container: {cont_name}")
                    container_list.append(cont_name)
        return working, container_list

    def check_client_containers(self):
        """Check whether any client container (except media for now) is empty or not up yet, and return its name."""
        time_up_already = list()
        working = True
        container_list = list()
        for node, containers in self.docker_client_services.items():
            for cont_name, cont_value in containers.items():
                if (self.is_container_in_service_list(container_name=cont_name)
                        and self.is_container_has_user(node=node, container_name=cont_name)):
                    url = ("http://" + cont_value['IP_ADDRESS'] + ":"
                           + str(int(cont_value['PORT_NUMBER']) + 100) + "/metrics")
                    response = check_connection(url)
                    if not response:
                        logger.error(f"Error in container: {cont_name}")
                        container_list.append(cont_name)
                    else:
                        conn_value = -1
                        metrics_array = self.prom.custom_query(
                            query="summary_request_latency_seconds_count")
                        flag_got_value, conn_value_get = self.get_value(cont_name, metrics_array)
                        logger.error(f"container: {cont_name} conn_value_get: {conn_value_get} "
                                     f"with flag: {flag_got_value}")
                        if flag_got_value:
                            conn_value = int(float(conn_value_get[1]))
                            if conn_value == 0:
                                container_list.append(cont_name)
                            else:
                                time_up_already.append(conn_value)
                        else:
                            container_list.append(cont_name)

        # FIXME: ignore media containers for now to work with the other 3 services first!
        con_list = []
        for con in container_list:
            if "media" in con:
                continue
            con_list.append(con)
        if len(con_list) > 0:
            working = False

        logger.info(f"time for each node: {time_up_already}")
        time_return = 0
        if len(time_up_already) > 0:
            time_return = min(time_up_already)
        return time_return, working, con_list

    def pre_process(self, arr_data):
        """
        Input:  {"node4": {"search_client_4": 24, "shop": 26, ...},
                 "node3": {"search": 24, "shop": 26, ...}}
        Output (averaged per service, based on ingress_distribution):
                {"search_service": 24, "shop_service": 26, ...}
        """
        data = dict()
        service_list = list(self.get_ingress_distribution['node4'].keys())
        for service in service_list:
            data[service] = 0
        for service in service_list:
            total_service_factor = 0
            for node, containers in arr_data.items():
                for client, value in containers.items():
                    if service in client:
                        if value == -1 or self.get_ingress_distribution[node][service] == 0:
                            continue
                        total_service_factor += 1
                        data[service] += value
            if total_service_factor != 0:
                data[service] = data[service] / total_service_factor
        return data

    def calculate_latency_histogram(self):
        latency = dict()
        logger.info(f"{self.docker_client_services.items()}")
        for node, containers in self.docker_client_services.items():
            cont_dict = dict()
            for cont_name in containers.keys():
                if self.is_container_in_service_list(cont_name):
                    query = (f"histogram_quantile(0.9, sum(rate("
                             f"request_latency_seconds_{cont_name}_bucket[{self.capture_time}s])) by (le))")
                    logger.info(query)
                    metric_array = self.prom.custom_query(query=query)
                    logger.info(f"metric_array: {metric_array}")
                    value_latency = metric_array[0]['value'][1]
                    value = 0 if value_latency == 'NaN' else float(value_latency)
                    cont_dict[cont_name] = value
            latency[node] = cont_dict
        print(latency)
        return latency

    def calculate_latency(self, ingress_nodes):
        metrics_array = self.prom.custom_query(
            query=f"rate(summary_request_latency_seconds_sum[{self.capture_time}s])")
        latency = dict()
        for node, containers in self.docker_client_services.items():
            cont_dict = dict()
            for cont_name in containers.keys():
                latency_value = -1
                flag_got_value, latency_value_get = self.get_value(cont_name, metrics_array)
                if flag_got_value:
                    latency_value = int(float(latency_value_get[1]))
                cont_dict[cont_name] = latency_value
            latency[node] = cont_dict
        return latency

    def get_metric_value(self, cmd):
        metrics_array = self.prom.custom_query(query=cmd)
        return_value = dict()
        for node, containers in self.docker_client_services.items():
            cont_dict = dict()
            for cont_name in containers.keys():
                if self.is_container_in_service_list(container_name=cont_name):
                    val = -1
                    flag_got_value, value_get = self.get_value(cont_name, metrics_array)
                    if flag_got_value:
                        val = int(float(value_get[1]))
                    cont_dict[cont_name] = val
            return_value[node] = cont_dict
        return return_value

    def get_array_values(self, container, query_cmd):
        metrics_array_succ = self.prom.custom_query(query=query_cmd)
        flag_got_value, values = self.get_values(container, metrics_array_succ)
        arr = list()
        if flag_got_value:
            arr = [int(float(value[1])) for value in values]
        return arr

    def is_container_has_ingress(self, container_name, node):
        for service, value in self.get_ingress_distribution[node].items():
            if service in container_name and value != 0:
                return True
        return False

    def is_container_in_service_list(self, container_name):
        for service in self.service_list:
            if service in container_name:
                return True
        return False

    def is_container_has_user(self, node, container_name):
        for service, value in self.get_ingress_distribution[node].items():
            if service in container_name and value > 0:
                return True
        return False

    def calculate_ingress_request(self):
        ingress_request = dict()
        for node, containers in self.docker_client_services.items():
            cont_dict = dict()
            for cont_name in containers.keys():
                if self.is_container_in_service_list(container_name=cont_name):
                    average_ingress_requests = 0
                    qualify_array_capture = False
                    while not qualify_array_capture:
                        arr_succ = self.get_array_values(
                            cont_name, f"success_conn_total[{self.capture_time}s]")
                        arr_drop = self.get_array_values(
                            cont_name, f"drop_conn_total[{self.capture_time}s]")
                        logger.info(f"arr_succ: {arr_succ}")
                        logger.info(f"arr_drop: {arr_drop}")
                        # Check whether enough samples were captured.
                        if (len(arr_drop) == self.capture_time
                                and len(arr_succ) == self.capture_time):
                            qualify_array_capture = True
                        else:
                            logger.info("Not enough samples yet")
                            qualify_array_capture = False
                            time.sleep(1)
                    arr_sum = [arr_succ[i] + arr_drop[i] for i in range(len(arr_succ))]
                    logger.info(f"arr_sum: {arr_sum}")
                    diff = [arr_sum[i + 1] - arr_sum[i] for i in range(len(arr_sum) - 1)]
                    logger.info(f"diff: {diff}")
                    logger.info(f"sum(diff): {sum(diff)} len(diff): {len(diff)}")
                    average_ingress_requests = float(sum(diff) / len(diff))
                    cont_dict[cont_name] = average_ingress_requests
            ingress_request[node] = cont_dict
        print(ingress_request)
        return ingress_request

    def calculate_dropped_connection(self):
        metrics_array_accepted = self.prom.custom_query(
            query=f"nginx_connections_accepted[{self.capture_time}s]")
        metrics_array_handled = self.prom.custom_query(
            query=f"nginx_connections_handled[{self.capture_time}s]")
        dropped_connection = dict()
        for node, containers in self.docker_server_services.items():
            cont_dict = dict()
            for cont_name in containers.keys():
                dropped_conn = -1
                flag_got_value_accepted, accepted_conn = self.get_values(cont_name, metrics_array_accepted)
                flag_got_value_handled, handled_conn = self.get_values(cont_name, metrics_array_handled)
                if flag_got_value_handled and flag_got_value_accepted:
                    array_accepted = [int(float(value[1])) for value in accepted_conn]
                    array_handled = [int(float(value[1])) for value in handled_conn]
                    array_dropped = [array_handled[i] - array_accepted[i]
                                     for i in range(len(array_accepted))]
                    dropped_conn = sum(array_dropped)
                cont_dict[cont_name] = dropped_conn
            dropped_connection[node] = cont_dict
        return dropped_connection

    def calculate_capacity(self):
        metrics_array_handled = self.prom.custom_query(
            query=f"nginx_connections_handled[{self.capture_time}s]")
        succ_connection = dict()
        for node, containers in self.docker_server_services.items():
            cont_dict = dict()
            for cont_name in containers.keys():
                succ_conn = -1
                flag_got_value_handled, handled_conn = self.get_values(cont_name, metrics_array_handled)
                if flag_got_value_handled:
                    array_handled = [int(float(value[1])) for value in handled_conn]
                    succ_conn = array_handled[-1] - array_handled[0]
                cont_dict[cont_name] = succ_conn / self.capture_time
            succ_connection[node] = cont_dict
        print(succ_connection)
        return succ_connection

    def get_value(self, container, metrics_array):
        return_value = 1
        flag_got_value = False
        for job in metrics_array:
            if job['metric']['job'] == container:
                return_value = job['value']
                flag_got_value = True
                break
        return flag_got_value, return_value

    def get_values(self, container, metrics_array):
        return_value = 1
        flag_got_value = False
        for job in metrics_array:
            if job['metric']['job'] == container:
                return_value = job['values']
                flag_got_value = True
                break
        return flag_got_value, return_value
def collect_metrics(configuration: Configuration, sli_report: SLIReport):
    """Collect metrics from Prometheus/Thanos."""
    if not _DRY_RUN:
        pc = PrometheusConnect(
            url=configuration.thanos_url,
            headers={"Authorization": f"bearer {configuration.thanos_token}"},
            disable_ssl=True,
        )

    collected_info = {}
    for sli_name, sli_methods in sli_report.report_sli_context.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        collected_info[sli_name] = {}

        for query_name, query_inputs in sli_methods["query"].items():
            requires_range = False
            action_type = None
            if isinstance(query_inputs, dict):
                query = query_inputs["query"]
                requires_range = query_inputs["requires_range"]
                action_type = query_inputs["type"]
            else:
                query = query_inputs

            _LOGGER.info(f"Querying... {query_name}")
            _LOGGER.info(f"Using query... {query}")
            try:
                if not _DRY_RUN:
                    if requires_range:
                        metric_data = pc.custom_query_range(
                            query=query,
                            start_time=configuration.start_time,
                            end_time=configuration.end_time,
                            step=configuration.step,
                        )
                    else:
                        metric_data = pc.custom_query(query=query)
                    _LOGGER.info(f"Metric obtained... {metric_data}")

                    if requires_range:
                        metrics_vector = [
                            float(v[1]) for v in metric_data[0]["values"] if float(v[1]) > 0
                        ]
                        result = manipulate_retrieved_metrics_vector(
                            metrics_vector=metrics_vector, action=action_type)
                        collected_info[sli_name][query_name] = result
                    else:
                        collected_info[sli_name][query_name] = float(metric_data[0]["value"][1])
                else:
                    metric_data = [{"metric": "dry run", "value": [datetime.datetime.utcnow(), 0]}]
                    result = float(metric_data[0]["value"][1])
                    collected_info[sli_name][query_name] = result
            except Exception as e:
                _LOGGER.exception(f"Could not gather metric for {sli_name}-{query_name}...{e}")
                collected_info[sli_name][query_name] = "ErrorMetricRetrieval"

    return collected_info
def query_prometheus(search, mode='pod', nature='cpu', prom_url=False):
    # /api/v1/query?query=avg(rate(container_cpu_usage_seconds_total{namespace="jarvis",container="cast-service"}[5d]))
    # NOTE: hardcoded override left in place; remove it to honor the prom_url argument.
    prom_url = 'http://prometheus.k.nutz.site'
    if isinstance(prom_url, str):
        prom = PrometheusConnect(url=prom_url, disable_ssl=False)
    elif isinstance(prom_url, bool):
        prom = False
    else:
        prom = prom_url

    if not prom:
        log.debug('[QUERY-PROMETHEUS] Prometheus is Disabled or not found by K8s Service')
        return {}
    if nature not in ['cpu', 'mem']:
        log.error(f'[QUERY-PROMETHEUS] Nature {nature} not valid')
        return {}
    if mode not in ['node', 'pod', 'deployment']:
        log.error(f'[QUERY-PROMETHEUS] Mode {mode} not valid')
        return {}

    # NODE METRICS
    if mode == 'node':
        if nature == 'cpu':
            query = {
                'usage': '(1000 * (count(node_cpu_seconds_total{instance=~"%s.+",mode="user"}) '
                         '- avg(sum by (mode)(irate(node_cpu_seconds_total{instance=~"%s.+",mode="idle"}[5m])))))'
                         % (search, search),
                'total': '(1000 * (count(node_cpu_seconds_total{instance=~"%s.+",mode="user"})))' % (search)
            }
        if nature == 'mem':
            query = {
                'usage': 'node_memory_MemFree_bytes{instance=~"%s.+"}' % (search),
                'total': 'node_memory_MemTotal_bytes{instance=~"%s.+"}' % (search)
            }
        results = {'usage': 0, 'total': 0}
        for query_mode in query.keys():
            value = prom.custom_query(query=query[query_mode])
            result = 0
            if len(value) > 0:
                value = value[0].get('value', [0, False])[-1]
            if not value:
                log.debug(f'[QUERY-PROMETHEUS] {search} for {mode} and {nature} not found results ({query})')
                return {}
            if nature == 'mem':
                result = int(value)
            if nature == 'cpu':
                result = int(round(float(value), 0))
            log.debug(f'[QUERY-PROMETHEUS] {search} {mode} {nature} results: {result}')
            results[query_mode] = result
        results['available'] = results.get('total', 0) - results.get('usage', 0)
        return results

    # DEPLOYMENT METRICS
    if mode == 'deployment':
        # e.g. sum by (container_name)(max_over_time(rate(container_cpu_usage_seconds_total{container="ditto-asia"}[3d:3m])[3d:3m]))
        if nature == 'cpu':
            metric = 'container_cpu_usage_seconds_total'
            query = {
                'max': 'sum by (container_name)(max_over_time(rate(%s{container="%s"}[3d:3m])[3d:3m]))' % (metric, search),
                'avg': 'sum by (container_name)(avg_over_time(rate(%s{container="%s"}[3d:3m])[3d:3m]))' % (metric, search),
                'min': 'sum by (container_name)(min_over_time(rate(%s{container="%s"}[3d:3m])[3d:3m]))' % (metric, search)
            }
        elif nature == 'mem':
            metric = 'container_memory_max_usage_bytes'
            query = {
                'max': 'max(max_over_time(%s{container="%s"}[3d]))' % (metric, search),
                'avg': 'max(avg_over_time(%s{container="%s"}[3d]))' % (metric, search),
                'min': 'max(min_over_time(%s{container="%s"}[3d]))' % (metric, search)
            }
        results = {'max': 0, 'avg': 0, 'min': 0}
        for query_style in query.keys():
            value = prom.custom_query(query=query[query_style])
            result = 0
            if len(value) > 0:
                value = value[0].get('value', [0, False])[-1]
            if not value:
                log.debug(f'[QUERY-PROMETHEUS] {search} for {mode} and {nature} ({query_style}) not found results ({query})')
                continue
            if nature == 'cpu':
                result = round(float(value) * 1000, 2)
            if nature == 'mem':
                result = round(float(value) / 1024 / 1024, 2)
            results[query_style] = result
            log.debug(f'[QUERY-PROMETHEUS] {search} {mode} {nature} {query_style} results: {result}')
        return results

    # POD METRICS
    if mode == 'pod':
        if nature == 'cpu':
            metric = 'container_cpu_usage_seconds_total'
            query = {
                'max': 'max_over_time(rate(%s{pod="%s", container!="POD"}[3d:3m])[3d:3m])' % (metric, search),
                'avg': 'avg_over_time(rate(%s{pod="%s", container!="POD"}[3d:3m])[3d:3m])' % (metric, search),
                'min': 'min_over_time(rate(%s{pod="%s", container!="POD"}[3d:3m])[3d:3m])' % (metric, search)
            }
        elif nature == 'mem':
            metric = 'container_memory_max_usage_bytes'
            query = {
                'max': 'max(max_over_time(%s{pod="%s"}[3d]))' % (metric, search),
                'avg': 'max(avg_over_time(%s{pod="%s"}[3d]))' % (metric, search),
                'min': 'min(min_over_time(%s{pod="%s"}[3d]))' % (metric, search)
            }
        results = {'max': 0, 'avg': 0, 'min': 0}
        for query_style in query.keys():
            value = prom.custom_query(query=query[query_style])
            result = 0
            if len(value) > 0:
                value = value[0].get('value', [0, False])[-1]
            if not value:
                log.debug(f'[QUERY-PROMETHEUS] {search} for {mode} and {nature} ({query_style}) not found results ({query})')
                continue
            if nature == 'cpu':
                # Transform to something human-readable:
                # style is '350'm (CPU fractions on a base-100 scale per core).
                result = round(float(value) * 1000, 2)
            if nature == 'mem':
                # Transform to something human-readable: style is '350.44' MB.
                result = round(float(value) / 1024 / 1024, 2)
            results[query_style] = result
            log.debug(f'[QUERY-PROMETHEUS] {search} {mode} {nature} {query_style} results: {result}')
        return results

    return {}
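# A usage sketch (the deployment name is hypothetical): CPU stats in
# millicores over the 3-day window the queries above hardcode.
stats = query_prometheus('cast-service', mode='deployment', nature='cpu')
print(stats)  # e.g. {'max': 412.5, 'avg': 120.3, 'min': 8.1}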
async def run_promql(self, query: str, disable_ssl: bool = True) -> list:
    # Note: custom_query() is a blocking call; it will stall the event loop
    # unless offloaded to a thread (see the sketch below).
    prometheus = PrometheusConnect(url=self.base_url, disable_ssl=disable_ssl)
    return prometheus.custom_query(query=query)
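# A minimal sketch of calling run_promql, offloading the blocking client to a
# worker thread so the coroutine doesn't block the loop. The Client wrapper
# and base_url are hypothetical stand-ins for the surrounding class.
import asyncio
from prometheus_api_client import PrometheusConnect


class Client:
    def __init__(self, base_url: str):
        self.base_url = base_url

    async def run_promql(self, query: str, disable_ssl: bool = True) -> list:
        prometheus = PrometheusConnect(url=self.base_url, disable_ssl=disable_ssl)
        # Run the synchronous query in a thread (Python 3.9+).
        return await asyncio.to_thread(prometheus.custom_query, query=query)


async def main():
    client = Client("http://localhost:9090")
    print(await client.run_promql("up"))

asyncio.run(main())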
from prometheus_api_client import PrometheusConnect
import json

prometheus_host = 'http://localhost:9090'
app = 'bookinfo'

prom = PrometheusConnect(url=prometheus_host, disable_ssl=True)
metrics_data = prom.custom_query(query="istio_requests_total")

topology_json = {"components": [], "links": []}
service_dict = {}
link_counter = 1
# Keep track of which services were already extracted.
extracted_services = []

for item in metrics_data:
    element_inserted = False
    # service_source = item['metric']['source_canonical_service']
    try:
        component = item['metric']['app']
        # Check whether the component is an Istio component.
        namespace = item['metric']['source_workload_namespace']
        # service_destination_version = item['metric']['version']
        # These two will be the destination of the link:
        # service_destination = item['metric']['destination_canonical_service']
        service_destination = item['metric']['destination_service_name']
        # connection_point = item['metric']['destination_canonical_revision']
        if namespace != "istio-system":
            # Hypothetical minimal body: record each destination service once.
            if service_destination not in extracted_services:
                extracted_services.append(service_destination)
                topology_json["components"].append({"name": service_destination})
    except KeyError:
        # Series without the expected labels are skipped.
        continue
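# A hypothetical follow-up step: persist the assembled topology once the loop
# completes. The file name is an assumption.
with open("topology.json", "w") as f:
    json.dump(topology_json, f, indent=2)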