Example #1
    def test_retry_on_error(self):  # noqa D102
        retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
        pc = PrometheusConnect(url=self.prometheus_host,
                               disable_ssl=True,
                               retry=retry)

        with self.assertRaises(requests.exceptions.RetryError,
                               msg="too many 400 error responses"):
            pc.custom_query("BOOM.BOOM!#$%")
Example #2
  def get_gpu_number(self):
    max_available_gpu = 0
    pod_list = []
    ## Verify if dcgm-exporter is deployed
    try:
      pod_list = self.api_client.list_pod_for_all_namespaces(label_selector="app=nvidia-dcgm-exporter")
    except ApiException as e:
      if e.status != 404:
        _LOGGER.error("Exception when calling DCGM exporter pods: %s\n" % e)

    if pod_list and len(pod_list.items) != 0:
      prom = PrometheusConnect(
        url=self.get_prometheus_url(),
        headers={"Authorization": "Bearer " + self.get_openshift_prometheus_token()},
        disable_ssl=True)

      for pod in pod_list.items:
        pod_IP = pod.status.pod_ip
        gpu_query = 'count (count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="' + pod_IP +\
                    ':9400"}) or vector(0)) - count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'\
                    + pod_IP + ':9400", exported_pod=~".+"}) or vector(0))'

        get_available_gpu_in_node_data = prom.custom_query(query=gpu_query)

        get_available_gpu_in_node = int(get_available_gpu_in_node_data[0]['value'][1])

        if get_available_gpu_in_node > max_available_gpu:
            max_available_gpu = get_available_gpu_in_node
    return max_available_gpu
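A sketch of the same free-GPU calculation written with an f-string instead of string concatenation, against a single hypothetical dcgm-exporter endpoint; the PromQL is identical to the query built above (total GPU/MIG slices minus slices that already carry an exported_pod label):

from prometheus_api_client import PrometheusConnect

prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)  # assumed URL
pod_ip = "10.0.0.1"  # hypothetical dcgm-exporter pod IP
gpu_query = (
    f'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{{instance="{pod_ip}:9400"}}) or vector(0))'
    f' - count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{{instance="{pod_ip}:9400", exported_pod=~".+"}}) or vector(0))'
)
free_gpus = int(prom.custom_query(query=gpu_query)[0]["value"][1])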
Example #3
def collect_metrics():
    """Collect metrics from Prometheus/Thanos."""
    pc = PrometheusConnect(
        url=_THANOS_URL,
        headers={"Authorization": f"bearer {_THANOS_TOKEN}"},
        disable_ssl=True)

    collected_info = {}
    for sli_name, sli_methods in SLIReport.REPORT_SLI_CONTEXT.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        collected_info[sli_name] = {}
        for query_name, query in sli_methods["query"].items():
            _LOGGER.info(f"Querying... {query_name}")
            try:
                metric_data = pc.custom_query(query=query)
                _LOGGER.info(f"Metric obtained... {metric_data}")
                collected_info[sli_name][query_name] = float(
                    metric_data[0]["value"][1])
            except Exception as e:
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                collected_info[sli_name][query_name] = None

    return collected_info
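SLIReport.REPORT_SLI_CONTEXT is not shown above; the loop only requires each SLI entry to carry a "query" mapping of query names to PromQL strings. A hypothetical minimal shape (names and queries are illustrative, not the project's actual configuration):

# Hypothetical shape of SLIReport.REPORT_SLI_CONTEXT assumed by collect_metrics().
REPORT_SLI_CONTEXT = {
    "availability": {
        "query": {
            "uptime_ratio": 'avg_over_time(up{job="my-service"}[1d])',
        },
    },
}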
Example #4
def main():
    try:
        # Setting up Mongo DB
        MONGO_HOST = str(os.environ.get('MONGO_HOST', '127.0.0.1'))
        MONGO_PORT = str(os.environ.get('MONGO_PORT', '27017'))
        MONGO_DB = str(os.environ.get('MONGO_DBNAME', 'cpa'))
        MONGO_USER = str(os.environ.get('MONGO_USERNAME', 'root'))
        MONGO_PASS = str(os.environ.get('MONGO_PASSWORD', 'iRhrF6O0vp'))
        mongodb_client = MongoClient(
            'mongodb://{}:{}@{}:{}/?authSource=admin'.format(
                MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_PORT))

        cpa_db = mongodb_client[MONGO_DB]
        deployments_collection = cpa_db.deployments
        list_of_deployments = []

        for deployment in deployments_collection.find():
            list_of_deployments = deployment['list']

        # Setting up Prometheus
        prometheus_base = str(
            os.environ.get('PROMETHEUS_URL', 'http://192.168.23.92:9090'))
        prom = PrometheusConnect(url=prometheus_base, disable_ssl=True)

        # get workload cpu
        query_workload_cpu = """
        sum(
          irate(container_cpu_usage_seconds_total{cluster="", namespace="default"}[2m])
        * on(namespace,pod)
          group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="default", workload_type="deployment"}
        ) by (workload, workload_type)
        """
        get_workload_cpu_query = lambda: prom.custom_query(query=query_workload_cpu)

        def get_deployments_cpu_usage(list_of_deployments):
            wl_cpu_res = get_workload_cpu_query()
            # filter results (unit is millicores)
            filtered_cpu_query = {
                q['metric']['workload']: float(q['value'][1]) * 1000
                for q in wl_cpu_res
                if q['metric']['workload'] in list_of_deployments
            }
            # if metric skipped, put in None instead
            for d in list_of_deployments:
                if d not in filtered_cpu_query:
                    filtered_cpu_query[d] = None
            return filtered_cpu_query

        deployments_cpu = get_deployments_cpu_usage(list_of_deployments)

        # Parse spec into a dict
        # spec = json.loads(r'{"resource": {"kind": "Deployment", "apiVersion": "apps/v1", "metadata": {"name": "redis-cart", "namespace": "default", "uid": "1b25ec34-965e-4f57-9638-b95e78edfe41", "resourceVersion": "2238", "generation": 1, "creationTimestamp": "2021-02-13T06:18:09Z", "annotations": {"deployment.kubernetes.io/revision": "1", "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"apps/v1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"name\":\"redis-cart\",\"namespace\":\"default\"},\"spec\":{\"selector\":{\"matchLabels\":{\"app\":\"redis-cart\"}},\"template\":{\"metadata\":{\"labels\":{\"app\":\"redis-cart\"}},\"spec\":{\"containers\":[{\"image\":\"redis:alpine\",\"livenessProbe\":{\"periodSeconds\":5,\"tcpSocket\":{\"port\":6379}},\"name\":\"redis\",\"ports\":[{\"containerPort\":6379}],\"readinessProbe\":{\"periodSeconds\":5,\"tcpSocket\":{\"port\":6379}},\"resources\":{\"limits\":{\"cpu\":\"125m\",\"memory\":\"256Mi\"},\"requests\":{\"cpu\":\"70m\",\"memory\":\"200Mi\"}},\"volumeMounts\":[{\"mountPath\":\"/data\",\"name\":\"redis-data\"}]}],\"volumes\":[{\"emptyDir\":{},\"name\":\"redis-data\"}]}}}}\n"}, "managedFields": [{"manager": "kubectl", "operation": "Update", "apiVersion": "apps/v1", "time": "2021-02-13T06:18:09Z", "fieldsType": "FieldsV1", "fieldsV1": {"f:metadata": {"f:annotations": {".": {}, "f:kubectl.kubernetes.io/last-applied-configuration": {}}}, "f:spec": {"f:progressDeadlineSeconds": {}, "f:replicas": {}, "f:revisionHistoryLimit": {}, "f:selector": {}, "f:strategy": {"f:rollingUpdate": {".": {}, "f:maxSurge": {}, "f:maxUnavailable": {}}, "f:type": {}}, "f:template": {"f:metadata": {"f:labels": {".": {}, "f:app": {}}}, "f:spec": {"f:containers": {"k:{\"name\":\"redis\"}": {".": {}, "f:image": {}, "f:imagePullPolicy": {}, "f:livenessProbe": {".": {}, "f:failureThreshold": {}, "f:periodSeconds": {}, "f:successThreshold": {}, "f:tcpSocket": {".": {}, "f:port": {}}, "f:timeoutSeconds": {}}, "f:name": {}, "f:ports": {".": {}, "k:{\"containerPort\":6379,\"protocol\":\"TCP\"}": {".": {}, "f:containerPort": {}, "f:protocol": {}}}, "f:readinessProbe": {".": {}, "f:failureThreshold": {}, "f:periodSeconds": {}, "f:successThreshold": {}, "f:tcpSocket": {".": {}, "f:port": {}}, "f:timeoutSeconds": {}}, "f:resources": {".": {}, "f:limits": {".": {}, "f:cpu": {}, "f:memory": {}}, "f:requests": {".": {}, "f:cpu": {}, "f:memory": {}}}, "f:terminationMessagePath": {}, "f:terminationMessagePolicy": {}, "f:volumeMounts": {".": {}, "k:{\"mountPath\":\"/data\"}": {".": {}, "f:mountPath": {}, "f:name": {}}}}}, "f:dnsPolicy": {}, "f:restartPolicy": {}, "f:schedulerName": {}, "f:securityContext": {}, "f:terminationGracePeriodSeconds": {}, "f:volumes": {".": {}, "k:{\"name\":\"redis-data\"}": {".": {}, "f:emptyDir": {}, "f:name": {}}}}}}}}, {"manager": "k3s", "operation": "Update", "apiVersion": "apps/v1", "time": "2021-02-13T06:18:21Z", "fieldsType": "FieldsV1", "fieldsV1": {"f:metadata": {"f:annotations": {"f:deployment.kubernetes.io/revision": {}}}, "f:status": {"f:availableReplicas": {}, "f:conditions": {".": {}, "k:{\"type\":\"Available\"}": {".": {}, "f:lastTransitionTime": {}, "f:lastUpdateTime": {}, "f:message": {}, "f:reason": {}, "f:status": {}, "f:type": {}}, "k:{\"type\":\"Progressing\"}": {".": {}, "f:lastTransitionTime": {}, "f:lastUpdateTime": {}, "f:message": {}, "f:reason": {}, "f:status": {}, "f:type": {}}}, "f:observedGeneration": {}, "f:readyReplicas": {}, "f:replicas": {}, "f:updatedReplicas": {}}}}]}, "spec": {"replicas": 1, 
"selector": {"matchLabels": {"app": "redis-cart"}}, "template": {"metadata": {"creationTimestamp": null, "labels": {"app": "redis-cart"}}, "spec": {"volumes": [{"name": "redis-data", "emptyDir": {}}], "containers": [{"name": "redis", "image": "redis:alpine", "ports": [{"containerPort": 6379, "protocol": "TCP"}], "resources": {"limits": {"cpu": "125m", "memory": "256Mi"}, "requests": {"cpu": "70m", "memory": "200Mi"}}, "volumeMounts": [{"name": "redis-data", "mountPath": "/data"}], "livenessProbe": {"tcpSocket": {"port": 6379}, "timeoutSeconds": 1, "periodSeconds": 5, "successThreshold": 1, "failureThreshold": 3}, "readinessProbe": {"tcpSocket": {"port": 6379}, "timeoutSeconds": 1, "periodSeconds": 5, "successThreshold": 1, "failureThreshold": 3}, "terminationMessagePath": "/dev/termination-log", "terminationMessagePolicy": "File", "imagePullPolicy": "IfNotPresent"}], "restartPolicy": "Always", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", "securityContext": {}, "schedulerName": "default-scheduler"}}, "strategy": {"type": "RollingUpdate", "rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}}, "revisionHistoryLimit": 10, "progressDeadlineSeconds": 600}, "status": {"observedGeneration": 1, "replicas": 1, "updatedReplicas": 1, "readyReplicas": 1, "availableReplicas": 1, "conditions": [{"type": "Available", "status": "True", "lastUpdateTime": "2021-02-13T06:18:21Z", "lastTransitionTime": "2021-02-13T06:18:21Z", "reason": "MinimumReplicasAvailable", "message": "Deployment has minimum availability."}, {"type": "Progressing", "status": "True", "lastUpdateTime": "2021-02-13T06:18:21Z", "lastTransitionTime": "2021-02-13T06:18:09Z", "reason": "NewReplicaSetAvailable", "message": "ReplicaSet \"redis-cart-74594bd569\" has successfully progressed."}]}}, "runType": "scaler"}')
        spec = json.loads(sys.stdin.read())

        metric(spec, list_of_deployments, deployments_cpu)
    except Exception as err:
        sys.stderr.write(f"Error metric: {err}")
        sys.exit(1)
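For reference, the dict comprehension in get_deployments_cpu_usage expects custom_query to return instant-vector results of roughly this shape (values are illustrative):

# Illustrative result of prom.custom_query(query=query_workload_cpu):
# [{'metric': {'workload': 'redis-cart', 'workload_type': 'deployment'},
#   'value': [1613196000.0, '0.0123']}]
# which the comprehension turns into {'redis-cart': 12.3}  (millicores).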
Example #5
    def get_current(self) -> float:
        prom = PrometheusConnect(
            url=self.config.get("url", "http://localhost:9090"),
            disable_ssl=self.config.get("disable_ssl", True),
        )
        res = prom.custom_query(query=self.query)
        if not res:
            log.error("Prometheus query: no result")
            raise Exception("Prometheus query: no result")

        log.info(f"Prometheus query result: {res}")
        return float(res[0].get("value")[-1])
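A usage sketch with hypothetical attribute values; note that res[0]["value"] is a [timestamp, value-string] pair, so the final [-1] index picks the value:

# Hypothetical attributes assumed by get_current():
# self.config = {"url": "http://prometheus.example:9090", "disable_ssl": True}
# self.query  = 'sum(rate(http_requests_total[5m]))'
# res[0]["value"] looks like [1613196000.0, "42.0"], so get_current() returns 42.0.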
Example #6
class my_prometheus():
    def __init__(self, host, port, disablessl):
        if disablessl:
            self.schema = "http"
        else:
            self.schema = "https"
        try:
            self.prom = PrometheusConnect(url=self.schema + "://" + host +
                                          ":" + port,
                                          disable_ssl=disablessl)
        except Exception:
            print("Fehler")

    def prom_query(self, query):
        self.lasttemps = self.prom.custom_query(query=query)
        self.lasttemp = sorted(self.lasttemps[0]["values"], reverse=True)[0][1]
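A usage sketch with a hypothetical host and metric name. prom_query reads the "values" key, so the query must return a range vector (a [..] selector); sorting the [timestamp, value] pairs in reverse then picks the value of the newest sample:

p = my_prometheus("localhost", "9090", True)      # http scheme, certificate checks disabled
p.prom_query('sensor_temperature_celsius[5m]')    # hypothetical range-vector query
print(p.lasttemp)                                 # most recent value in the window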
Example #7
class PrometheusClient:
    def __init__(self, promhost, promport):
        self.prom = PrometheusConnect(url="http://%s:%s" %
                                      (promhost, promport),
                                      disable_ssl=True)

    def get_ticktime(self):
        return self.__get_metric_for_last_five_mins("overall_ticktime")[0].get(
            "values")

    def get_dim_ticktime(self):
        result = {}
        dim_ticktimes = self.__get_metric_for_last_five_mins("dim_ticktime")
        for dimension in dim_ticktimes:
            result[dimension.get("metric").get(
                "dimension_name")] = dimension.get("values")
        return result

    def get_players(self):
        players = []
        for p in self.prom.custom_query("player_playtime"):
            players.append(p.get("metric").get("player"))
        return players

    def get_tps(self):
        return self.__get_metric_for_last_five_mins("overall_tps")[0].get(
            "values")

    def get_dim_tps(self):
        result = {}
        dim_tps = self.__get_metric_for_last_five_mins("dim_tps")
        for dimension in dim_tps:
            result[dimension.get("metric").get(
                "dimension_name")] = dimension.get("values")
        return result

    def __get_metric_for_last_five_mins(self, metricname):
        return self.prom.get_metric_range_data(
            metric_name=metricname,
            start_time=datetime.datetime.now() - datetime.timedelta(minutes=5),
            end_time=datetime.datetime.now(),
        )
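A usage sketch against a hypothetical server; get_metric_range_data returns one dict per series, whose "values" list of [timestamp, value] pairs is what the getters above hand back:

client = PrometheusClient("localhost", 9090)  # hypothetical Prometheus host and port
print(client.get_players())                   # e.g. ['alice', 'bob']
print(client.get_tps())                       # e.g. [[1613196000, '20.0'], [1613196015, '19.8']]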
Example #8
def queryMetrics(customquery, trim):
    # print("\n queryMetrics START\n")

    prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

    data = prom.custom_query(query=customquery)
    # To make it a table where each row is a metric
    df = MetricSnapshotDataFrame(data)
    df = df[df.value != "NaN"]

    df[['value']] = df[['value']].apply(pd.to_numeric)
    df[['timestamp']] = df[['timestamp']].apply(pd.to_datetime, unit='s')

    sortedDf = df.sort_values('value', ascending=False).head(trim)

    # print(nicenumbers)
    # print(df.index)
    # print(df.columns)
    # print("\n queryMetrics END\n")
    return sortedDf
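The helper relies on pandas and MetricSnapshotDataFrame being imported at module level; a usage sketch against the hard-coded local Prometheus (the query is illustrative):

import pandas as pd
from prometheus_api_client import PrometheusConnect, MetricSnapshotDataFrame

# Top five series by current value for an illustrative query.
top = queryMetrics("node_filesystem_avail_bytes", trim=5)
print(top[["timestamp", "value"]])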
Example #9
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """
    Network is blocked in this testcase, see base class
    """
    def setUp(self):
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz',
                                    disable_ssl=True)

    def test_network_is_blocked(self):
        resp = requests.get('https://google.com')
        self.assertEqual(resp.status_code, 403)
        self.assertEqual(resp.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)
            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')

            self.assertEqual(len(handler.requests), 1)
            request = handler.requests[0]
            self.assertEqual(request.url, 'https://redhat.com/')

    def test_unauthorized(self):
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(exc.exception))

    def test_broken_responses(self):
        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_current_metric_value("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_metric_range_data("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query_range("query", datetime.now(), datetime.now(),
                                       "1")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query("query")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

    def test_all_metrics_method(self):
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            request = handler.requests[0]
            self.assertEqual(request.path_url, "/api/v1/label/__name__/values")
Example #10
class PrometheusMetricProvider(MetricProvider):
    def __init__(self, nodes: NodeDataView):
        super().__init__(nodes)
        self._prom = PrometheusConnect(url=settings.prometheus.url)

    def get_metric(self, metric: PrometheusMetric) -> List:
        try:
            return self.__prom_request(metric.query)
        except PrometheusApiClientException as e:
            logger.error(f"Error pulling {metric}: {e}")
        return []

    def __prom_request(self, query: str) -> List:
        return self._prom.custom_query(query)

    def refresh_data(self):
        logger.debug("Pulling metrics from Prometheus")
        self._data["messages_in"] = self.__get_messages_in()
        self._data["messages_out"] = self.__get_messages_out()
        self._data["consumer_lag"] = self.__get_consumer_lag()
        self._data["consumer_read_rate"] = self.__get_consumer_read_rate()
        self._data["topic_size"] = self.__get_topic_size()
        self._data["replicas"] = self.__get_replicas()
        self._data["connector_tasks"] = self.__get_connector_tasks()

    def __get_messages_in(self) -> Dict[str, float]:
        prom_messages_in = self.get_metric(metric=PrometheusMetric.MESSAGES_IN)
        return {
            d["metric"]["topic"]: round(float(d["value"][-1]), 2)
            for d in prom_messages_in
        }

    def __get_messages_out(self) -> Dict[str, float]:
        prom_messages_out = self.get_metric(
            metric=PrometheusMetric.MESSAGES_OUT)
        return {
            d["metric"]["topic"]: round(float(d["value"][-1]), 2)
            for d in prom_messages_out
        }

    def __get_consumer_lag(self) -> Dict[str, int]:
        prom_consumer_lag = self.get_metric(
            metric=PrometheusMetric.CONSUMER_LAG)
        return {
            d["metric"]["group"]: int(d["value"][-1])
            for d in prom_consumer_lag
        }

    def __get_consumer_read_rate(self) -> Dict[str, float]:
        prom_consumer_read_rate = self.get_metric(
            metric=PrometheusMetric.CONSUMER_READ_RATE)
        return {
            d["metric"]["group"]: float(d["value"][-1])
            for d in prom_consumer_read_rate
        }

    def __get_topic_size(self) -> Dict[str, int]:
        prom_topic_size = self.get_metric(metric=PrometheusMetric.TOPIC_SIZE)
        return {
            d["metric"]["topic"]: int(d["value"][-1])
            for d in prom_topic_size
        }

    def __get_replicas(self) -> Dict[str, int]:
        prom_replicas = self.get_metric(metric=PrometheusMetric.REPLICAS)
        return {
            d["metric"]["deployment"]: int(d["value"][-1])
            for d in prom_replicas
        }

    def __get_connector_tasks(self) -> Dict[str, int]:
        prom_connector_tasks = self.get_metric(
            metric=PrometheusMetric.CONNECTOR_TASKS)
        return {
            d["metric"]["connector"]: int(d["value"][-1])
            for d in prom_connector_tasks
        }
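PrometheusMetric is not shown above; the provider only needs each member to expose a .query string. A hypothetical sketch (the PromQL is illustrative, not the project's actual queries):

from enum import Enum

class PrometheusMetric(Enum):
    MESSAGES_IN = "sum by (topic) (rate(kafka_server_brokertopicmetrics_messagesin_total[5m]))"
    CONSUMER_LAG = "sum by (group) (kafka_consumergroup_group_lag)"
    # MESSAGES_OUT, CONSUMER_READ_RATE, TOPIC_SIZE, REPLICAS, CONNECTOR_TASKS defined the same way.

    @property
    def query(self) -> str:
        return self.value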
Example #11
def process_period(config, period):
    period_start = period['instant'] + dateutil.relativedelta.relativedelta(
        seconds=-period['range_sec'])
    print(
        f"Processing year {period['year']}, month {period['month']}, "
        f"querying from {period['instant'].isoformat()} and going back {period['range_sec']} s to {period_start.isoformat()}."
    )
    queries = QueryLogic(queryRange=(str(period['range_sec']) + 's'))

    # SSL generally not used for Prometheus access within a cluster
    # Docs on instant query API: https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
    prom = PrometheusConnect(url=config.prometheus_server, disable_ssl=True)
    prom_connect_params = {
        'time': period['instant'].isoformat(),
        'timeout': config.query_timeout
    }

    raw_results, results, result_lengths = {}, {}, []
    # iterate over each query (cputime, starttime, endtime, cores) producing raw_results['cputime'] etc.
    for query_name, query_string in vars(queries).items():
        # Each of these raw_results is a list of dicts. Each dict in the list represents an individual data point, and contains:
        # 'metric': a dict of one or more key-value pairs of labels, one of which is the pod name ('exported_pod').
        # 'value': a list in which the 0th element is the timestamp of the value, and the 1st element is the actual value we're interested in.
        print(f'Executing {query_name} query: {query_string}')
        t1 = timer()
        raw_results[query_name] = prom.custom_query(query=query_string,
                                                    params=prom_connect_params)
        t2 = timer()
        results[query_name] = dict(rearrange(raw_results[query_name]))
        result_lengths.append(len(results[query_name]))
        t3 = timer()
        print(
            f'Query finished in {t2 - t1} s, processed in {t3 - t2} s. Got {len(results[query_name])} items from {len(raw_results[query_name])} results. Peak RAM usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss}K.'
        )
        del raw_results[query_name]

    cputime = results['cputime']
    endtime = results['endtime']
    starttime = results['starttime']
    cores = results['cores']

    # Confirm the assumption that cputime should have the fewest entries, while starttime and cores may have additional ones
    # corresponding to jobs that have started but not finished yet, and endtime may have additional ones if there are pods without CPU resource requests.
    # We only want the jobs for which all values are available: start time, end time, CPU request.
    # Note that jobs which started last month and finished this month will be properly included and accounted in this month.
    assert len(cputime) == min(
        result_lengths), "cputime should be the shortest list"
    # However, jobs that finished last month may show up in this month's data if they are still present on the cluster this month (in Completed state).
    # Exclude them by filtering with a lambda (since you can't pass an argument to a function object AFAIK).
    endtime = dict(
        filter(lambda x: x[1] >= datetime.datetime.timestamp(period_start),
               endtime.items()))
    # Prepare to iterate over jobs which meet all criteria.
    valid_jobs = cputime.keys() & endtime.keys()
    # avoid sending empty records
    if len(valid_jobs) == 0:
        print('No records to process.')
        return

    sum_cputime = 0
    t4 = timer()
    for key in valid_jobs:
        assert endtime[key] > starttime[
            key], "job end time is before start time"
        # double check cputime calc of this job
        delta = abs(cputime[key] -
                    (endtime[key] - starttime[key]) * cores[key])
        assert delta < 0.001, "cputime calculation is inaccurate"
        sum_cputime += cputime[key]

    # CPU time as calculated here means (# cores * job duration), which apparently corresponds to
    # the concept of wall time in APEL accounting. It is not clear what CPU time means in APEL;
    # could be the actual CPU usage % integrated over the job (# cores * job duration * usage)
    # but this does not seem to be documented clearly. Some batch systems do not actually measure
    # this so it is not reported consistently or accurately. Some sites report a CPU efficiency
    # (presumably defined as CPU time / wall time) of up to ~500% of the walltime, or one that is
    # always fixed at 100%. In Kubernetes, the actual CPU usage % is tracked by metrics server
    # (not KSM), which is not meant to be used for monitoring or accounting purposes and is not
    # scraped by Prometheus. So just use walltime = cputime
    sum_cputime = round(sum_cputime)
    sum_walltime = sum_cputime

    print(f'total cputime: {sum_cputime}, total walltime: {sum_walltime}')
    # Write output to the message queue on local filesystem
    # https://dirq.readthedocs.io/en/latest/queuesimple.html#directory-structure
    dirq = QueueSimple(str(config.output_path))
    summary_output = summary_message(
        config,
        year=period['year'],
        month=period['month'],
        wall_time=sum_walltime,
        cpu_time=sum_cputime,
        n_jobs=len(endtime),
        # this appears faster than getting min/max during the dict iteration above
        first_end=round(min(endtime.values())),
        last_end=round(max(endtime.values())))
    sync_output = sync_message(config,
                               year=period['year'],
                               month=period['month'],
                               n_jobs=len(endtime))
    t5 = timer()
    summary_file = dirq.add(summary_output)
    sync_file = dirq.add(sync_output)
    print(f'Analyzed {len(endtime)} records in {t5 - t4} s.')
    print(f'Writing summary record to {config.output_path}/{summary_file}:')
    print('--------------------------------\n' + summary_output +
          '--------------------------------')
    print(f'Writing sync record to {config.output_path}/{sync_file}:')
    print('--------------------------------\n' + sync_output +
          '--------------------------------')
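rearrange() is not defined in this snippet; based on the comments above (each data point's 'metric' labels include the pod name in 'exported_pod' and 'value'[1] holds the number), a plausible reconstruction yields (pod, float) pairs that dict() can consume:

def rearrange(raw_result):
    """Hypothetical helper: turn instant-query results into (pod, value) pairs."""
    for item in raw_result:
        yield item['metric']['exported_pod'], float(item['value'][1])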
Example #12
class CaptureHelper():
    def __init__(self, docker_client_services_path,
                 docker_server_services_path, ingress_distribution_file_path,
                 docker_lb_container_path, service_list):
        self.docker_client_services = get_docker_services(
            docker_client_services_path)
        self.docker_server_services = get_docker_services(
            docker_server_services_path)
        self.get_ingress_distribution = get_docker_services(
            ingress_distribution_file_path)
        self.docker_lb_services = get_docker_services(docker_lb_container_path)
        self.prom = PrometheusConnect(url="http://131.155.35.54:9090",
                                      disable_ssl=True)
        self.capture_time = CAPTURE_TIME
        self.service_list = service_list

    def capture_data(self, ingress_nodes, time_up_already):
        """
        Input: array of ingress nodes to measure
        ex: ingress_nodes = ["node1", "node2"]
        Get the latency from Prometheous server API
        return:
        [

            latency = {"node4":{"search":24, "shop":26, "web":{self.capture_time}, "media":10}, 
                    "node3":{"search":24, "shop":26, "web":{self.capture_time}, "media":10}} -> for each client in ingress node
            dropped_traffic = {"node4":{"search":24, "shop":26, "web":{self.capture_time}, "media":10}, 
                    "node3":{"search":24, "shop":26, "web":{self.capture_time}, "media":10}
                    "node2: {"search":24, "shop":26, "web":{self.capture_time}, "media":10}} -> for each server in edge node
            ingress_bw = {"node4":{"search":200, "shop":560, "web":100, "media":{self.capture_time}00}, 
                    "node3":{"search":250, "shop":700, "web":450, "media":2900}} -> for each client in ingress node
        ]
        """
        # sleep 15s: 5s to stabilize, 10s for the calculation
        sleep_time = self.capture_time
        if time_up_already < self.capture_time:
            sleep_time = self.capture_time - time_up_already
        elif time_up_already < self.capture_time * 1.5:
            sleep_time = 1
        logger.info(f"sleep_time: {sleep_time}")
        time.sleep(sleep_time)
        # Latency:

        # Ingress request number:
        ingress_request = self.calculate_ingress_request()
        # Dropped connections:
        # pre_dropped_conn = self.calculate_dropped_connection()
        # dropped_conn = self.pre_process(pre_dropped_conn)

        # pre_success_conn = self.calculate_success_connection()
        # success_conn = self.pre_process(pre_success_conn)
        pre_succ_request = self.get_metric_value("success_conn_total")
        succ_request = self.pre_process(pre_succ_request)

        pre_drop_request = self.get_metric_value("drop_conn_total")
        drop_request = self.pre_process(pre_drop_request)

        # pre_latency = self.calculate_latency(ingress_nodes)
        pre_latency = self.calculate_latency_histogram()
        latency = self.pre_process(pre_latency)

        # self.calculate_latency_histogram()
        # latency = {"node4": {"search": 24, "shop": 26, "web": {self.capture_time}, "media": 10},
        #            "node3": {"search": 24, "shop": 26, "web": {self.capture_time}, "media": 10}}

        # ingress_request = {"node4": {"search": 200, "shop": 560, "web": 100, "media": {self.capture_time}00},
        #                    "node3": {"search": 250, "shop": 700, "web": 450, "media": 2900}}

        return latency, drop_request, succ_request, ingress_request

    def check_lb_containers(self):
        """
        Check if any lb container (except media for now) is empty or not up yet and return the name!
        """
        working = True
        container_list = list()
        for node, container in self.docker_lb_services.items():
            for cont_name, cont_value in container.items():
                number = ''.join(x for x in cont_name if x.isdigit())
                url = "http://" + cont_value[
                    'IP_ADDRESS_NODE_' + str(number)] + ":" + str(
                        cont_value['PORT_ADDRESS_SEARCH_LISTEN'])
                # TODO: try and catch here for the module!
                # try:
                response = check_connection(url)
                if not response:
                    logger.error(f"Error in container: {cont_name}")
                    container_list.append(cont_name)
        return working, container_list

    def check_client_containers(self):
        """
        Check if any client container (except media for now) is empty or not up yet and return the name!
        """
        time_up_already = list()
        working = True
        container_list = list()
        # print(metrics_array)
        for node, container in self.docker_client_services.items():
            for cont_name, cont_value in container.items():
                if self.is_container_in_service_list(
                        container_name=cont_name
                ) and self.is_container_has_user(node=node,
                                                 container_name=cont_name):
                    url = "http://" + cont_value['IP_ADDRESS'] + ":" + str(
                        int(cont_value['PORT_NUMBER']) + 100) + "/metrics"
                    # print(url)
                    # print(cont_name)
                    response = check_connection(url)
                    # print(response.status_code)
                    if not response:
                        logger.error(f"Error in container: {cont_name}")
                        container_list.append(cont_name)
                    else:
                        conn_value = -1
                        metrics_array = self.prom.custom_query(
                            query="summary_request_latency_seconds_count")

                        flag_got_value, conn_value_get = self.get_value(
                            cont_name, metrics_array)
                        logger.error(
                            f"container: {cont_name} conn_value_get: {conn_value_get} with flag: {flag_got_value}"
                        )
                        if flag_got_value == True:
                            conn_value = int(float(conn_value_get[1]))
                            if conn_value == 0:
                                container_list.append(cont_name)
                            else:
                                time_up_already.append(conn_value)
                        else:
                            container_list.append(cont_name)
        #FIXME: ignore media containers for now to work with other 3 services first!
        con_list = []
        for con in container_list:
            if "media" in con:
                continue
            con_list.append(con)
        if len(con_list) > 0:
            working = False
        logger.info(f"time for each node: {time_up_already}")
        time_return = 0
        if (len(time_up_already) > 0):
            time_return = min(time_up_already)
        return time_return, working, con_list
        # for container in name_list:
        #     if requests.get(URL)
        # self.prom

    def pre_process(self, arr_data):
        """
        input: latency = {"node4": {"search_client_4": 24, "shop": 26, "web": {self.capture_time}, "media": 10},
                    "node3": {"search": 24, "shop": 26, "web": {self.capture_time}, "media": 10}}
        based on ingress_distribution: 
        output: latency = {"search_service": 24, "shop_service": 26, "web_service": {self.capture_time}, "media_service":10}
        """
        data = dict()
        service_list = list(self.get_ingress_distribution['node4'].keys())
        for service in service_list:
            data[service] = 0
        # for each service:
        for service in service_list:
            total_service_factor = 0
            for node, container in arr_data.items():
                for client, value in container.items():
                    if service in client:
                        if value == -1 or self.get_ingress_distribution[node][
                                service] == 0:
                            continue
                        else:
                            total_service_factor += 1
                            data[service] += value
            if total_service_factor != 0:
                data[service] = data[service] / total_service_factor
        return data

    def calculate_latency_histogram(self):
        latency = dict()
        logger.info(f"{self.docker_client_services.items()}")
        for node, container in self.docker_client_services.items():
            cont_dict = dict()
            for container in container.keys():
                if self.is_container_in_service_list(container):
                    logger.info(
                        f"histogram_quantile(0.9, sum(rate(request_latency_seconds_{container}_bucket[{self.capture_time}s])) by (le))"
                    )
                    metric_array = self.prom.custom_query(
                        query=
                        f"histogram_quantile(0.9, sum(rate(request_latency_seconds_{container}_bucket[{self.capture_time}s])) by (le))"
                    )
                    logger.info(f"metric_array: {metric_array}")
                    value_latency = metric_array[0]['value'][1]
                    # if container in list(metric_dict.keys()):
                    if (value_latency == 'NaN'):
                        value = 0
                    else:
                        value = float(value_latency)
                    cont_dict[container] = value
            latency[node] = cont_dict
        print(latency)
        return latency

    def calculate_latency(self, ingress_nodes):
        metrics_array = self.prom.custom_query(
            query=
            f"rate(summary_request_latency_seconds_sum[{self.capture_time}s])")
        latency = dict()
        for node, container in self.docker_client_services.items():
            cont_dict = dict()
            for container in container.keys():
                latency_value = -1
                flag_got_value, latency_value_get = self.get_value(
                    container, metrics_array)
                if flag_got_value == True:
                    latency_value = int(float(latency_value_get[1]))
                cont_dict[container] = latency_value
            latency[node] = cont_dict
        # print(latency)
        return latency

    def get_metric_value(self, cmd):
        metrics_array = self.prom.custom_query(query=cmd)
        return_value = dict()
        for node, container in self.docker_client_services.items():
            cont_dict = dict()
            for container in container.keys():
                if self.is_container_in_service_list(container_name=container):
                    val = -1
                    flag_got_value, value_get = self.get_value(
                        container, metrics_array)
                    if flag_got_value == True:
                        val = int(float(value_get[1]))
                    cont_dict[container] = val
            return_value[node] = cont_dict
        # print(return_value)
        return return_value

    def get_array_values(self, container, query_cmd):
        metrics_array_succ = self.prom.custom_query(query=query_cmd)
        flag_got_value, values = self.get_values(container, metrics_array_succ)
        arr = list()
        if flag_got_value:
            arr = [int(float(value[1])) for value in values]
        return arr

    def is_container_has_ingress(self, container_name, node):
        has_ingress_traffic = False
        # logger.info(f"container_name: {container_name}")
        for service, value in self.get_ingress_distribution[node].items():
            # logger.info(f"service: {service}")
            # logger.info(f"value: {value}")
            if service in container_name and value != 0:
                has_ingress_traffic = True
                return has_ingress_traffic
        return has_ingress_traffic

    def is_container_in_service_list(self, container_name):
        is_found = False
        for service in self.service_list:
            if service in container_name:
                is_found = True
        return is_found

    def is_container_has_user(self, node, container_name):
        is_found = False
        for service, value in self.get_ingress_distribution[node].items():
            if service in container_name and value > 0:
                is_found = True
        return is_found

    def calculate_ingress_request(self):
        # metrics_array_succ = self.prom.custom_query(query="success_conn_total[10s]")
        # metrics_array_drop = self.prom.custom_query(query="drop_conn_total[10s]")
        # print(metrics_array_succ)
        # print(metrics_array_drop)

        # working = True
        ingress_request = dict()
        for node, container in self.docker_client_services.items():
            cont_dict = dict()
            for container in container.keys():
                # print(container)
                if self.is_container_in_service_list(container_name=container):

                    average_ingress_requests = 0
                    qualify_array_capture = False
                    while (qualify_array_capture == False):
                        arr_succ = self.get_array_values(
                            container,
                            f"success_conn_total[{self.capture_time}s]")
                        arr_drop = self.get_array_values(
                            container,
                            f"drop_conn_total[{self.capture_time}s]")
                        # logger.info(f"container: {container}")
                        # logger.info(self.get_ingress_distribution)

                        logger.info(f"arr_succ: {arr_succ}")
                        logger.info(f"arr_drop: {arr_drop}")
                        # logger.info(f"length arr_succ: {len(arr_succ)}")
                        # logger.info(f"length arr_drop: {len(arr_drop)}")
                        # logger.info(f"self.capture_time: {self.capture_time}")
                        # check if the length is enough?
                        if len(arr_drop) == self.capture_time and len(
                                arr_succ) == self.capture_time:
                            if self.is_container_has_ingress(
                                    container_name=container, node=node):
                                # if 0 in arr_succ:
                                #     logger.info(f"There is a 0")
                                #     qualify_array_capture = False
                                #     time.sleep(1)
                                # else:
                                qualify_array_capture = True
                            else:
                                qualify_array_capture = True
                        else:
                            logger.info(f"In here")
                            qualify_array_capture = False
                            time.sleep(1)

                    arr_sum = [
                        arr_succ[i] + arr_drop[i] for i in range(len(arr_succ))
                    ]
                    logger.info(f"arr_sum: {arr_sum}")
                    diff = [
                        arr_sum[i + 1] - arr_sum[i]
                        for i in range(len(arr_sum) - 1)
                    ]
                    logger.info(f"diff: {diff}")
                    logger.info(
                        f"sum(diff): {sum(diff)} len(diff): {len(diff)}")
                    average_ingress_requests = float(sum(diff) / len(diff))
                    cont_dict[container] = average_ingress_requests
            ingress_request[node] = cont_dict
        print(ingress_request)
        return ingress_request

    def calculate_dropped_connection(self):
        metrics_array_accepted = self.prom.custom_query(
            query=f"nginx_connections_accepted[{self.capture_time}s]")
        metrics_array_handled = self.prom.custom_query(
            query=f"nginx_connections_handled[{self.capture_time}s]")
        dropped_connection = dict()
        for node, container in self.docker_server_services.items():
            cont_dict = dict()
            for container in container.keys():
                dropped_conn = -1
                flag_got_value_accepted, accepted_conn = self.get_values(
                    container, metrics_array_accepted)
                flag_got_value_handled, handled_conn = self.get_values(
                    container, metrics_array_handled)
                if flag_got_value_handled == True and flag_got_value_accepted == True:
                    array_accepted = [
                        int(float(value[1])) for value in accepted_conn
                    ]
                    array_handled = [
                        int(float(value[1])) for value in handled_conn
                    ]
                    array_dropped = [
                        array_handled[i] - array_accepted[i]
                        for i in range(len(array_accepted))
                    ]
                    dropped_conn = sum(array_dropped)
                cont_dict[container] = dropped_conn
            dropped_connection[node] = cont_dict
        return dropped_connection

    def calculate_capacity(self):
        metrics_array_handled = self.prom.custom_query(
            query=f"nginx_connections_handled[{self.capture_time}s]")
        succ_connection = dict()
        for node, container in self.docker_server_services.items():
            cont_dict = dict()
            for container in container.keys():
                succ_conn = -1
                flag_got_value_handled, handled_conn = self.get_values(
                    container, metrics_array_handled)

                print("\n\n")
                print(container)
                print(handled_conn)
                if flag_got_value_handled == True:
                    array_handled = [
                        int(float(value[1])) for value in handled_conn
                    ]
                    print(array_handled)
                    succ_conn = array_handled[-1] - array_handled[0]
                cont_dict[container] = succ_conn / self.capture_time
            succ_connection[node] = cont_dict
        print(succ_connection)
        return succ_connection

    def get_value(self, container, metrics_array):
        return_value = 1
        flag_got_value = False
        for job in metrics_array:
            if job['metric']['job'] == container:
                return_value = job['value']
                flag_got_value = True
                break
        return flag_got_value, return_value

    def get_values(self, container, metrics_array):
        return_value = 1
        flag_got_value = False
        for job in metrics_array:
            if job['metric']['job'] == container:
                return_value = job['values']
                flag_got_value = True
                break
        return flag_got_value, return_value
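check_connection() and get_docker_services() are defined elsewhere; a minimal sketch of the connectivity check this class relies on (it only needs a truthy result when the endpoint answers) could be:

import requests

def check_connection(url, timeout=2.0):
    """Hypothetical helper: True when the endpoint responds, False otherwise."""
    try:
        return requests.get(url, timeout=timeout).ok
    except requests.RequestException:
        return False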
Example #13
def collect_metrics(configuration: Configuration, sli_report: SLIReport):
    """Collect metrics from Prometheus/Thanos."""
    if not _DRY_RUN:
        pc = PrometheusConnect(
            url=configuration.thanos_url,
            headers={"Authorization": f"bearer {configuration.thanos_token}"},
            disable_ssl=True,
        )

    collected_info = {}

    for sli_name, sli_methods in sli_report.report_sli_context.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        collected_info[sli_name] = {}

        for query_name, query_inputs in sli_methods["query"].items():

            requires_range = False

            if isinstance(query_inputs, dict):
                query = query_inputs["query"]
                requires_range = query_inputs["requires_range"]
                action_type = query_inputs["type"]
            else:
                query = query_inputs

            _LOGGER.info(f"Querying... {query_name}")
            _LOGGER.info(f"Using query... {query}")

            try:
                if not _DRY_RUN:

                    if requires_range:
                        metric_data = pc.custom_query_range(
                            query=query,
                            start_time=configuration.start_time,
                            end_time=configuration.end_time,
                            step=configuration.step,
                        )

                    else:
                        metric_data = pc.custom_query(query=query)

                    _LOGGER.info(f"Metric obtained... {metric_data}")

                    if requires_range:
                        metrics_vector = [
                            float(v[1]) for v in metric_data[0]["values"]
                            if float(v[1]) > 0
                        ]
                        result = manipulate_retrieved_metrics_vector(
                            metrics_vector=metrics_vector, action=action_type)

                        collected_info[sli_name][query_name] = result

                    else:
                        collected_info[sli_name][query_name] = float(
                            metric_data[0]["value"][1])

                else:
                    metric_data = [{
                        "metric": "dry run",
                        "value": [datetime.datetime.utcnow(), 0]
                    }]
                    result = float(metric_data[0]["value"][1])
                    collected_info[sli_name][query_name] = result

            except Exception as e:
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                collected_info[sli_name][query_name] = "ErrorMetricRetrieval"

    return collected_info
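manipulate_retrieved_metrics_vector() is not shown here; given that action comes from the query's "type" field, a plausible sketch reduces the positive per-step values to a single number (the action names are assumptions):

import statistics

def manipulate_retrieved_metrics_vector(metrics_vector, action):
    """Hypothetical helper: collapse a range-query vector into one value."""
    if not metrics_vector:
        return 0.0
    if action == "average":
        return statistics.mean(metrics_vector)
    if action == "max":
        return max(metrics_vector)
    return sum(metrics_vector)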
Example #14
def query_prometheus(search, mode='pod', nature='cpu', prom_url=False):
    #/api/v1/query?query=avg(rate(container_cpu_usage_seconds_total%7Bnamespace%3D%22jarvis%22%2Ccontainer%3D%22cast-service%22%7D%5B5d%5D))
    prom_url = 'http://prometheus.k.nutz.site'
    if isinstance(prom_url, str):
        prom = PrometheusConnect(url=prom_url, disable_ssl=False)
    elif isinstance(prom_url, bool):
        prom = False
    else:
        prom = prom_url
    if not prom:
        log.debug(
            f'[QUERY-PROMETHEUS] Prometheus is Disabled or not found by K8s Service'
        )
        return {}
    if nature not in ['cpu', 'mem']:
        log.error(f'[QUERY-PROMETHEUS] Nature {nature} not valid')
        return {}
    if mode not in ['node', 'pod', 'deployment']:
        log.error(f'[QUERY-PROMETHEUS] Mode {mode} not valid')
        return {}
    # NODE METRICS
    if mode == 'node':
        if nature == 'cpu':
            query = {
                'usage':
                '(1000 *  (count(node_cpu_seconds_total{instance=~"%s.+",mode="user"}) \
                - avg(sum by (mode)(irate(node_cpu_seconds_total{instance=~"%s.+",mode="idle"}[5m])))))'
                % (search, search),
                'total':
                '(1000 * (count(node_cpu_seconds_total{instance=~"%s.+",mode="user"})))'
                % (search)
            }
        if nature == 'mem':
            query = {
                'usage':
                'node_memory_MemFree_bytes{instance=~"%s.+"}' % (search),
                'total':
                'node_memory_MemTotal_bytes{instance=~"%s.+"}' % (search)
            }
        results = {'usage': 0, 'total': 0}
        for query_mode in query.keys():
            value = prom.custom_query(query=query[query_mode])
            result = 0
            if len(value) > 0:
                value = value[0].get('value', [0, False])[-1]
                if not value:
                    log.debug(
                        f'[QUERY-PROMETHEUS] {search} for {mode} and {nature} not found results ({query})'
                    )
                    return {}
                if nature == 'mem':
                    result = int(value)
                if nature == 'cpu':
                    result = int(round(float(value), 0))
            log.debug(
                f'[QUERY-PROMETHEUS] {search} {mode} {nature} results: {result}'
            )
            results[query_mode] = result
        results['available'] = results.get('total', 0) - results.get(
            'usage', 0)
        return results
    # DEPLOYMENT METRICS
    if mode == 'deployment':
        # max_over_time(rate(container_cpu_usage_seconds_total{container_name="ditto-asia"}[1d:1h])[1d:1h])
        # sum by (container_name)(max_over_time(rate(container_cpu_usage_seconds_total{container="ditto-asia"}[3d:3m])[3d:3m]))
        if nature == 'cpu':
            metric = 'container_cpu_usage_seconds_total'
            query = {
                'max':
                'sum by (container_name)(max_over_time(rate(%s{container="%s"}[3d:3m])[3d:3m]))'
                % (metric, search),
                'avg':
                'sum by (container_name)(avg_over_time(rate(%s{container="%s"}[3d:3m])[3d:3m]))'
                % (metric, search),
                'min':
                'sum by (container_name)(min_over_time(rate(%s{container="%s"}[3d:3m])[3d:3m]))'
                % (metric, search)
            }
        elif nature == 'mem':
            metric = 'container_memory_max_usage_bytes'
            query = {
                'max':
                'max(max_over_time(%s{container="%s"}[3d]))' %
                (metric, search),
                'avg':
                'max(avg_over_time(%s{container="%s"}[3d]))' %
                (metric, search),
                'min':
                'max(min_over_time(%s{container="%s"}[3d]))' % (metric, search)
            }
        results = {'max': 0, 'avg': 0, 'min': 0}
        for query_style in query.keys():
            value = prom.custom_query(query=query[query_style])
            result = 0
            if len(value) > 0:
                value = value[0].get('value', [0, False])[-1]
            if not value:
                log.debug(
                    f'[QUERY-PROMETHEUS] {search} for {mode} and {nature} ({query_style}) not found results ({query})'
                )
                continue
            if nature == 'cpu':
                result = round(float(value) * 1000, 2)
            if nature == 'mem':
                result = round(float(value) / 1024 / 1024, 2)
            results[query_style] = result
            log.debug(
                f'[QUERY-PROMETHEUS] {search} {mode} {nature} {query_style} results: {result}'
            )
        return results
    # POD METRICS
    if mode == 'pod':
        if nature == 'cpu':
            metric = 'container_cpu_usage_seconds_total'
            query = {
                'max':
                'max_over_time(rate(%s{pod="%s", container!="POD"}[3d:3m])[3d:3m])'
                % (metric, search),
                'avg':
                'avg_over_time(rate(%s{pod="%s", container!="POD"}[3d:3m])[3d:3m])'
                % (metric, search),
                'min':
                'min_over_time(rate(%s{pod="%s", container!="POD"}[3d:3m])[3d:3m])'
                % (metric, search)
            }
        elif nature == 'mem':
            metric = 'container_memory_max_usage_bytes'
            query = {
                'max':
                'max(max_over_time(%s{pod="%s"}[3d]))' % (metric, search),
                'avg':
                'max(avg_over_time(%s{pod="%s"}[3d]))' % (metric, search),
                'min':
                'min(min_over_time(%s{pod="%s"}[3d]))' % (metric, search)
            }
        results = {'max': 0, 'avg': 0, 'min': 0}
        for query_style in query.keys():
            value = prom.custom_query(query=query[query_style])
            result = 0
            if len(value) > 0:
                value = value[0].get('value', [0, False])[-1]
            if not value:
                log.debug(
                    f'[QUERY-PROMETHEUS] {search} for {mode} and {nature} ({query_style}) not found results ({query})'
                )
                continue
            if nature == 'cpu':
                # transform to something human ready
                # style is '350'm (cpu fractions as base 100 scale per core)
                result = round(float(value) * 1000, 2)
            if nature == 'mem':
                # transforms to something human ready
                # style is '350.44'mb
                result = round(float(value) / 1024 / 1024, 2)
            results[query_style] = result
            log.debug(
                f'[QUERY-PROMETHEUS] {search} {mode} {nature} {query_style} results: {result}'
            )
        return results
    return {}
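A usage sketch with hypothetical names; node mode returns usage/total/available (millicores for CPU, bytes for memory), while pod and deployment modes return max/avg/min over the last three days:

node_cpu = query_prometheus("worker-1", mode="node", nature="cpu")
# e.g. {'usage': 750, 'total': 8000, 'available': 7250}    # millicores

pod_mem = query_prometheus("my-app-6f7d9c4b8-xk2lq", mode="pod", nature="mem")
# e.g. {'max': 412.5, 'avg': 398.1, 'min': 350.4}          # MiB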
Example #15
 async def run_promql(self, query: str, disable_ssl: bool = True) -> list:
     prometheus = PrometheusConnect(url=self.base_url,
                                    disable_ssl=disable_ssl)
     return prometheus.custom_query(query=query)
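A standalone sketch of the same coroutine, assuming a local Prometheus; note that PrometheusConnect.custom_query is a synchronous HTTP call, so it blocks the event loop for the duration of the request:

import asyncio
from prometheus_api_client import PrometheusConnect

async def run_promql(base_url: str, query: str, disable_ssl: bool = True) -> list:
    # Module-level variant of the method above.
    return PrometheusConnect(url=base_url, disable_ssl=disable_ssl).custom_query(query=query)

print(asyncio.run(run_promql("http://localhost:9090", "up")))  # assumed local Prometheus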
Example #16
from prometheus_api_client import PrometheusConnect
import json

prometheus_host = 'http://localhost:9090'
app = 'bookinfo'
prom = PrometheusConnect(url=prometheus_host, disable_ssl=True)

metrics_data = prom.custom_query(query="istio_requests_total")

topology_json = {"components": [], "links": []}

service_dict = {}
link_counter = 1
#keep track of which services have already been extracted
extracted_services = []

for item in metrics_data:
    element_inserted = False
    #service_source = item['metric']['source_canonical_service']
    try:
        component = item['metric']['app']

        #check if component is an istio component
        namespace = item['metric']['source_workload_namespace']
        #service_destination_version = item['metric']['version']

        #these two will be the destination of the link
        #service_destination = item['metric']['destination_canonical_service']
        service_destination = item['metric']['destination_service_name']
        #connection_point = item['metric']['destination_canonical_revision']
        if (namespace != "istio-system"):