def _get_metrics_from_prometheus(self, observer=None):
        """Pull the first result of every metric known to Prometheus.

        Connection settings are read from the ``PROM_ACCESS_TOKEN`` and
        ``PROM_URL`` environment variables; the process exits when neither
        of them is set.

        Args:
            observer: Optional object exposing ``on_next``. When truthy, each
                fetched packet is pushed to it as soon as it is retrieved.

        Returns:
            dict mapping each metric name to element ``[0]`` of what
            ``get_metric`` returned for it.
        """
        token = os.getenv("PROM_ACCESS_TOKEN")
        url = os.getenv("PROM_URL")
        # NOTE(review): this only aborts when BOTH variables are missing; a
        # token without a URL still reaches the constructor - confirm intent.
        if not token and not url:
            sys.exit("Error: Prometheus credentials not found")

        client = Prometheus(url=url,
                            token=token,
                            data_chunk='5m',
                            stored_data='5m')

        # Names of every metric the server currently exposes
        available_metrics = client.all_metrics()

        print("Polling Prometheus for new metric data")

        collected = {}
        has_observer = bool(observer)
        for name in available_metrics:
            first_packet = client.get_metric(name=name)[0]
            collected[name] = first_packet
            if has_observer:
                # Stream the packet to the subscriber as well as returning it
                observer.on_next(first_packet)

        return collected
Пример #2
0
    def setUp(self):
        """Open a kubectl port-forward to Prometheus and attach a client.

        The local port comes from ``PROM_PORT`` (default ``9990``) and is
        forwarded to port 9090 of ``deployment/prometheus`` in the
        ``istio-system`` namespace.
        """
        local_port = os.environ.get("PROM_PORT", "9990")
        command = [
            'kubectl', '-n', 'istio-system', 'port-forward',
            'deployment/prometheus',
            '%s:9090' % local_port,
        ]
        self.port_forward = subprocess.Popen(command, stdout=subprocess.PIPE)

        # Block until the child writes its first stdout line; that line is
        # used as the signal that the port forward is ready.
        self.port_forward.stdout.readline()

        self.prom = Prometheus('http://localhost:%s/' % local_port)
Пример #3
0
    def setUpClass(self):
        """Open a kubectl port-forward to the discovered Prometheus deployment.

        The local port comes from ``PROM_PORT`` (default ``9990``); the
        namespace and deployment are whatever ``find_prometheus()`` returns.
        The forward targets remote port 9090.

        NOTE(review): no ``@classmethod`` decorator is visible above this
        definition in this excerpt - confirm it exists in the real file.
        """
        local_port = os.environ.get("PROM_PORT", "9990")
        namespace, deployment = find_prometheus()
        command = [
            'kubectl', '-n', namespace, 'port-forward', deployment,
            '%s:9090' % local_port,
        ]
        self.port_forward = subprocess.Popen(command, stdout=subprocess.PIPE)

        # Block until the child writes its first stdout line; that line is
        # used as the signal that the port forward is ready.
        self.port_forward.stdout.readline()

        self.prom = Prometheus('http://localhost:%s/' % local_port)
Пример #4
0
class TestAlarms(unittest.TestCase):
    """Run Prometheus alarm queries against a port-forwarded Prometheus.

    Every test builds a list of ``Query`` objects and hands it to
    ``run_queries``, which fails the test when any alarm fires. A fresh
    ``kubectl port-forward`` is started in ``setUp`` and shut down in
    ``tearDown``.
    """

    def test_graceful_shutdown(self):
        """Check 5xx rate and request volume for the graceful-shutdown service."""
        queries = [
            Query(
                'Graceful Shutdown: 5xx Requests/s',
                'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client", response_code=~"5.."}[10m]))',
                Alarm(
                    lambda error_rate: error_rate > 0,
                    'There were 5xx errors. Requests may be getting dropped.')
            ),
            Query(
                'Graceful Shutdown: Total Requests/s',
                'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client"}[10m]))',
                Alarm(
                    lambda qps: qps < 18,
                    'Not enough requests sent; expect at least 18. Service may be having issues.'
                )),
        ]
        self.run_queries(queries)

    def test_external_traffic(self):
        """Check request volume reaching the external-traffic fortio server."""
        queries = [
            Query(
                'External Traffic: Total requests',
                'sum(rate(istio_requests_total{destination_service="fortio-server.allow-external-traffic-b.svc.cluster.local"}[10m]))',
                Alarm(
                    lambda qps: qps < 250,
                    'Not enough requests sent; expect at least 250. Service may be having issues.'
                ))
            # Cross namespace metrics are not recorded
        ]
        self.run_queries(queries)

    def setUp(self):
        """Start the port-forward to Prometheus and create the client.

        The local port comes from ``PROM_PORT`` (default ``9990``) and is
        forwarded to port 9090 of ``deployment/prometheus`` in
        ``istio-system``.
        """
        port = os.environ.get("PROM_PORT", "9990")
        self.port_forward = subprocess.Popen([
            'kubectl', '-n', 'istio-system', 'port-forward',
            'deployment/prometheus',
            '%s:9090' % port
        ],
                                             stdout=subprocess.PIPE)

        # The first stdout line is used as the readiness signal
        self.port_forward.stdout.readline()

        self.prom = Prometheus('http://localhost:%s/' % port)

    def tearDown(self):
        """Stop the port-forward and release everything it holds.

        Closing the pipe and waiting after ``terminate()`` keeps the child
        from lingering as a zombie and avoids leaking the stdout file
        descriptor between tests.
        """
        self.port_forward.stdout.close()
        self.port_forward.terminate()
        self.port_forward.wait()

    def run_queries(self, queries):
        """Run each query in its own sub-test and fail on triggered alarms.

        Args:
            queries: Iterable of query objects exposing ``description``;
                each one is passed to ``self.prom.run_query``.
        """
        for query in queries:
            with self.subTest(name=query.description):
                errors = self.prom.run_query(query, debug=True)
                message = 'Alarms Triggered:' + ''.join(
                    '\n- ' + e for e in errors)
                assert_empty(errors, message)
Пример #5
0
def config_push_converge_query(prom: Prometheus, svc: str = "svc-0", namespace: str = 'pilot-load'):
    """Count Envoy upstream-connection series per cluster for a namespace.

    Runs ``count(envoy_cluster_upstream_cx_total{...}) by (cluster_name)``
    restricted to clusters whose name contains ``namespace``.

    Args:
        prom: Client exposing ``fetch_by_query``.
        svc: Kept for backward compatibility; the query does not filter on
            it.
        namespace: Substring that ``cluster_name`` must contain. It is
            inserted into a regex matcher unescaped, so it should not
            contain regex metacharacters. The default yields exactly the
            previously hard-coded ``pilot-load`` query.

    Returns:
        A list of ``(metric_labels, value)`` tuples, one per result point,
        or an empty list when the query returned nothing.
    """
    query = (
        'count(envoy_cluster_upstream_cx_total{cluster_name=~".*%s.*"}) by (cluster_name)'
        % namespace)
    result = prom.fetch_by_query(query)
    if not result:
        return []
    # point['value'] is indexed at [1] to take the sample value
    return [(point['metric'], point['value'][1])
            for point in result['data']['result']]
    def observe_prom_metrics_range(self,
                                   observer,
                                   metrics_list,
                                   start_time,
                                   end_time='now',
                                   chunk_size='1h'):
        """Stream historical metric data to ``observer`` chunk by chunk.

        The ``start_time``..``end_time`` range is walked in steps of
        ``chunk_size``. For each step, every metric in ``metrics_list`` is
        fetched and each returned packet is pushed to ``observer.on_next``.

        Connection settings come from ``FLT_PROM_ACCESS_TOKEN`` and
        ``FLT_PROM_URL``; the process exits when neither is set.

        Args:
            observer: Object exposing ``on_next(packet)``.
            metrics_list: Iterable of metric names to download.
            start_time: Range start, in any form ``dateparser.parse`` accepts.
            end_time: Range end, same format as ``start_time``.
            chunk_size: Size of each download window, parsed by
                ``dateparser.parse`` and measured against ``'now'``.

        Raises:
            ValueError: If ``chunk_size`` does not resolve to a positive
                number of seconds (the loop could never reach ``end``).
            Exception: Whatever ``observer.on_next`` raises, re-raised after
                the offending packet has been printed.
        """
        # Collect credentials to connect to a prometheus instance
        prom_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
        prom_url = os.getenv("FLT_PROM_URL")
        # NOTE(review): this only aborts when BOTH variables are missing -
        # confirm that a token without a URL is really acceptable.
        if not (prom_token or prom_url):
            sys.exit("Error: Prometheus credentials not found")
        prom = Prometheus(url=prom_url, token=prom_token)

        # Window length in seconds: distance between now and the parsed size
        chunk_seconds = int(
            round((dateparser.parse('now') -
                   dateparser.parse(chunk_size)).total_seconds()))
        if chunk_seconds <= 0:
            # A zero or negative step would never advance `start` past `end`
            raise ValueError(
                "chunk_size {0!r} must resolve to a positive duration".format(
                    chunk_size))

        # Parse each bound once so the log line and the loop agree on it
        range_start = dateparser.parse(start_time)
        range_end = dateparser.parse(end_time)
        print(
            "\nCollecting metric data within datetime range:{0} - {1}".format(
                range_start, range_end))
        start = range_start.timestamp()
        end = range_end.timestamp()

        while start < end:  # One iteration per time window
            # Clamp the last window so no data beyond `end` is requested
            chunk_end = min(start + chunk_seconds, end)

            for metric_name in metrics_list:  # One request per metric
                print(
                    "Current Chunk Info: Metric = {0}, Time range = {1} - {2}".
                    format(metric_name, dateparser.parse(str(start)),
                           dateparser.parse(str(chunk_end))))
                pkt_list = prom.get_metric_range_data(metric_name=metric_name,
                                                      start_time=start,
                                                      end_time=chunk_end)

                # Each element of pkt_list is pushed to the observer
                # individually.
                for pkt in pkt_list:
                    try:
                        observer.on_next(pkt)
                    except Exception:
                        print(pkt)  # Show which packet caused the exception
                        raise
            start += chunk_seconds
Пример #7
0
def setup_promethus():
    """Port-forward the discovered Prometheus deployment and return a client.

    The local port comes from ``PROM_PORT`` (default ``9990``) and forwards
    to remote port 9090 of the deployment reported by ``find_prometheus()``.

    Returns:
        A ``Prometheus`` client pointed at the forwarded local port, given
        the PID of the ``kubectl`` child process via ``pid``.
    """
    local_port = os.environ.get("PROM_PORT", "9990")
    namespace, deployment = find_prometheus()
    command = [
        'kubectl', '-n', namespace, 'port-forward', deployment,
        '%s:9090' % local_port,
    ]
    forwarder = subprocess.Popen(command, stdout=subprocess.PIPE)
    # The first stdout line is used as the signal that forwarding is ready
    forwarder.stdout.readline()
    return Prometheus('http://localhost:%s/' % local_port, pid=forwarder.pid)
Пример #8
0
def fetch(
    prometheus_server,
    output,
    start_date,
):
    """Dump the BTC_JPY last-traded-price series to a CSV file.

    Starting at ``start_date`` (UTC midnight), the series is queried one
    day at a time with a one-minute step until the current time is reached.
    Each non-empty day is appended to ``output``; the header row is written
    only once.

    Args:
        prometheus_server: Passed straight to the ``Prometheus`` constructor.
        output: Path of the CSV file; existing content is discarded.
        start_date: First day to fetch, formatted as ``%Y/%m/%d``.
    """
    client = Prometheus(prometheus_server)

    wrote_header = False
    window = timedelta(days=1)
    resolution = timedelta(minutes=1)
    cursor = datetime.strptime(start_date,
                               '%Y/%m/%d').replace(tzinfo=timezone.utc)
    cutoff = datetime.now(timezone.utc)

    with open(output, 'a') as sink:
        # Drop whatever an earlier run left in the file
        sink.truncate(0)

        while cursor < cutoff:
            data = client.range_query(
                'max(bitflyer_last_traded_price{product_code="BTC_JPY"}) by (product_code)',
                cursor,
                window,
                resolution,
            )

            if len(data) > 0:
                # After transposing, row 0 is used as the index (converted
                # from epoch seconds below) and row 1 as the values.
                samples = data[0]['values'].T
                frame = pd.Series(samples[1],
                                  index=samples[0]).to_frame(name='ltp')
                frame.index = pd.to_datetime(frame.index, unit='s')
                frame.index.name = 'timestamp'

                frame.to_csv(sink, header=(not wrote_header))
                wrote_header = True

            cursor += window
Пример #9
0
def job(current_time):
    """Re-fetch the configured metric and rebuild both prediction sets.

    Downloads ``metric_name`` from Prometheus, folds it into the global
    ``data_dict``, optionally stores a pickled snapshot, and then refreshes
    the global Prophet and Fourier prediction dictionaries. When a fixed
    label configuration is set, only the series matching ``config_list``
    are forecast.

    Args:
        current_time: Not used by the function body.
    """
    # TODO: Replace this function with model training function and set up the correct IntervalTrigger time
    global data_dict, predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, data_window, url, token, chunk_size, data_size, TRUE_LIST, store_intermediate_data
    global data, config_list
    started_at = time.time()

    client = Prometheus(url=url,
                        token=token,
                        data_chunk=chunk_size,
                        stored_data=data_size)
    payload = client.get_metric(metric_name)
    print("metric collected.")

    # Decode the JSON text, rebinding the name so the raw string can be freed
    payload = json.loads(payload)

    # data_dict is keyed by label configuration (see literal_eval below)
    data_dict = get_df_from_json(payload, data_dict, data_window)
    del payload, client

    if str(store_intermediate_data) in TRUE_LIST:
        store = cp()
        pickled = pickle.dumps(data_dict)
        storage_key = data_storage_path + datetime.now().strftime(
            '%Y%m%d%H%M')
        print("DataFrame stored at: ",
              store.store_data(metric_name, pickled, storage_key))

    if fixed_label_config == "None":
        # No label configuration pinned: preview the first series only,
        # then forecast everything.
        for label_config in data_dict:
            print(data_dict[label_config].head(5))
            print(data_dict[label_config].tail(5))
            break
        predictions_dict_prophet = predict_metrics(data_dict)
        predictions_dict_fourier = predict_metrics_fourier(data_dict)
    else:
        selected = {}
        known_configs = list(data_dict.keys())

        for config in config_list:
            # Both sides are compared as SortedDicts of the parsed literals
            matching = [
                known for known in known_configs
                if SortedDict(literal_eval(known)) == SortedDict(
                    literal_eval(config))
            ]
            for known in matching:
                selected[known] = data_dict[known]
            if not matching:
                print("Specified Label Configuration {} was not found".format(
                    config))

        # The first matched configuration becomes the "current" one
        current_metric_metadata = list(selected.keys())[0]
        current_metric_metadata_dict = literal_eval(current_metric_metadata)

        print(data_dict[current_metric_metadata].head(5))
        print(data_dict[current_metric_metadata].tail(5))

        print("Using the default label config")
        predictions_dict_prophet = predict_metrics(selected)
        predictions_dict_fourier = predict_metrics_fourier(selected)

    # TODO: Trigger Data Pruning here
    elapsed = time.time() - started_at

    print("Total time taken to train was: {} seconds.".format(elapsed))
Пример #10
0
def metrics():
    """Refresh the exported prediction metrics and return the registry dump.

    For every label configuration in ``predictions_dict_prophet`` this
    looks up the prediction row nearest to the current time in both the
    Prophet and the Fourier frames, fetches the live metric value from
    Prometheus, keeps the five most recent live values, and sets the
    ``yhat`` / ``yhat_upper`` / ``yhat_lower`` values on the corresponding
    labelled metric objects.

    Returns:
        A ``Response`` whose body is ``generate_latest(REGISTRY)`` decoded
        as UTF-8.
    """
    global predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, metric_name, url, token, live_data_dict

    for metadata in predictions_dict_prophet:

        # Find the index matching the current timestamp.
        # NOTE(review): `Index.get_loc(..., method=...)` is deprecated in
        # newer pandas releases - confirm the pinned pandas version.
        index_prophet = predictions_dict_prophet[metadata].index.get_loc(
            datetime.now(), method='nearest')
        index_fourier = predictions_dict_fourier[metadata].index.get_loc(
            datetime.now(), method='nearest')
        current_metric_metadata = metadata

        print("The current time is: ", datetime.now())
        print("The matching index for Prophet model found was: \n",
              predictions_dict_prophet[metadata].iloc[[index_prophet]])
        print("The matching index for Fourier Transform found was: \n",
              predictions_dict_fourier[metadata].iloc[[index_fourier]])

        # The dictionary key is the literal text of the label dict
        current_metric_metadata_dict = literal_eval(metadata)

        # Work on a copy so the global dict keeps its "__name__" entry
        temp_current_metric_metadata_dict = current_metric_metadata_dict.copy()

        # delete the "__name__" key from the dictionary as we don't need it in labels (it is a non-permitted label) when serving the metrics
        del temp_current_metric_metadata_dict["__name__"]

        # TODO: the following function does not have good error handling or retry code in case of get request failure, need to fix that
        # Get the current metric value which will be compared with the predicted value to detect an anomaly
        metric = (Prometheus(url=url, token=token).get_current_metric_value(
            metric_name, temp_current_metric_metadata_dict))

        # print("metric collected.")

        # Convert data to json
        metric = json.loads(metric)

        # Convert the json to a dictionary of pandas dataframes
        live_data_dict = get_df_from_single_value_json(metric, live_data_dict)

        # Trim the live data dataframe to only 5 most recent values
        live_data_dict[metadata] = live_data_dict[metadata][-5:]
        # print(live_data_dict)

        # Update the metric values for prophet model
        PREDICTED_VALUES_PROPHET.labels(
            **temp_current_metric_metadata_dict).set(
                predictions_dict_prophet[metadata]['yhat'][index_prophet])
        PREDICTED_VALUES_PROPHET_UPPER.labels(
            **temp_current_metric_metadata_dict).set(
                predictions_dict_prophet[metadata]['yhat_upper']
                [index_prophet])
        PREDICTED_VALUES_PROPHET_LOWER.labels(
            **temp_current_metric_metadata_dict).set(
                predictions_dict_prophet[metadata]['yhat_lower']
                [index_prophet])

        # Update the metric values for fourier transform model
        PREDICTED_VALUES_FOURIER.labels(
            **temp_current_metric_metadata_dict).set(
                predictions_dict_fourier[metadata]['yhat'][index_fourier])
        PREDICTED_VALUES_FOURIER_UPPER.labels(
            **temp_current_metric_metadata_dict).set(
                predictions_dict_fourier[metadata]['yhat_upper']
                [index_fourier])
        PREDICTED_VALUES_FOURIER_LOWER.labels(
            **temp_current_metric_metadata_dict).set(
                predictions_dict_fourier[metadata]['yhat_lower']
                [index_fourier])

        # Anomaly detection below is disabled; the recorded failure was:
        # TypeError: Invalid comparison between dtype=datetime64[ns] and int
        # if len(live_data_dict[metadata] >= 5):
        #     pass
        #     # Update the metric values for detected anomalies 1 in case of anomaly, 0 if not
        #     if (detect_anomalies(predictions_dict_fourier[metadata][len(predictions_dict_fourier[metadata])-(len(live_data_dict[metadata])):],live_data_dict[metadata])):
        #         PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(1)
        #     else:
        #         PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(0)
        #
        #     if (detect_anomalies(predictions_dict_prophet[metadata][len(predictions_dict_prophet[metadata])-(len(live_data_dict[metadata])):],live_data_dict[metadata])):
        #         PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(1)
        #     else:
        #         PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(0)
        # pass

    # NOTE(review): 'text' alone is not a complete media type - confirm
    # whether the exposition-format content type should be used here.
    return Response(generate_latest(REGISTRY).decode("utf-8"),
                    content_type='text; charset=utf-8')
Пример #11
0
    # Chunk size, download the complete data, but in smaller chunks, should be less than or equal to DATA_SIZE
    chunk_size = str(os.getenv('CHUNK_SIZE', '1d'))

    # Net data size to scrape from prometheus
    data_size = str(os.getenv('DATA_SIZE', '1d'))

    # Number of minutes, the model should predict the values for
    # PREDICT_DURATION=1440 # minutes, 1440 = 24 Hours

    # Limit to first few labels of the metric
    # LABEL_LIMIT = None

    # Preparing a connection to Prometheus host
    prom = Prometheus(url=url,
                      token=token,
                      data_chunk=chunk_size,
                      stored_data=data_size)

    # Get metric data from Prometheus
    metric = prom.get_metric(metric_name)
    print("metric collected.")
    del prom

    # Convert data to json
    metric = json.loads(metric)

    # print(metric)

    # Metric Json is converted to a shaped dataframe
    pd_dict = get_df_from_json(
        metric
Пример #12
0
class TestAlarms(unittest.TestCase):
    """Stability alarms evaluated against a shared Prometheus port-forward.

    One ``kubectl port-forward`` is started for the whole class in
    ``setUpClass`` and shut down in ``tearDownClass``. Each test hands a
    list of queries to ``run_queries``.
    """

    def test_pilot(self):
        """Pilot must not report any rejected XDS pushes."""
        xds_rejections = Query(
            "Pilot: XDS rejections", 'pilot_total_xds_rejects',
            Alarm(lambda errors: errors > 0,
                  'There should not be any rejected XDS pushes'), None)
        self.run_queries([xds_rejections])

    def test_graceful_shutdown(self):
        """Standard and request sanity checks for graceful-shutdown."""
        self.run_queries([
            *standard_queries('istio-stability-graceful-shutdown'),
            istio_requests_sanity('istio-stability-graceful-shutdown'),
        ])

    def test_http_10(self):
        """Standard and request sanity checks for http10."""
        self.run_queries([
            *standard_queries('istio-stability-http10'),
            istio_requests_sanity('istio-stability-http10'),
        ])

    def test_mysql(self):
        """Standard checks for mysql."""
        # TODO get clientside metrics
        self.run_queries([*standard_queries('istio-stability-mysql')])

    def test_load_test(self):
        """Standard checks for the service-graph namespaces with custom limits."""
        self.run_queries(
            [*standard_queries('service-graph..', cpu_lim=250, mem_lim=100)])

    def test_redis(self):
        """Stability query for the redis client."""
        self.run_queries([stability_query(source='redis-client', test='redis')])

    def test_rabbitmq(self):
        """Stability query for the rabbitmq client."""
        self.run_queries(
            [stability_query(source='rabbitmq-client', test='rabbitmq')])

    @classmethod
    def setUpClass(cls):
        """Start the class-wide port-forward and create the shared client."""
        local_port = os.environ.get("PROM_PORT", "9990")
        namespace, deployment = find_prometheus()
        command = [
            'kubectl', '-n', namespace, 'port-forward', deployment,
            '%s:9090' % local_port,
        ]
        cls.port_forward = subprocess.Popen(command, stdout=subprocess.PIPE)

        # The first stdout line is used as the readiness signal
        cls.port_forward.stdout.readline()

        cls.prom = Prometheus('http://localhost:%s/' % local_port)

    @classmethod
    def tearDownClass(cls):
        """Shut the port-forward down and reap the child process."""
        forwarder = cls.port_forward
        forwarder.stdout.close()  # Release the pipe before stopping the child
        forwarder.terminate()
        forwarder.wait()

    def run_queries(self, queries):
        """Run each query as a sub-test and fail on any triggered alarm.

        A query with a truthy ``running_query`` is skipped when that query
        evaluates to 0.

        Args:
            queries: Iterable of query objects exposing ``description`` and
                ``running_query``.
        """
        for query in queries:
            with self.subTest(name=query.description):
                if query.running_query and self.prom.fetch_value(
                        query.running_query) == 0:
                    self.skipTest("Test is not running")
                errors = self.prom.run_query(query)
                message = ''.join(['Alarms Triggered:'] +
                                  ['\n- ' + e for e in errors])
                assert_empty(errors, message)
Пример #13
0
def envoy_cds_version_count(prom: Prometheus):
    """Return how many distinct CDS version values Envoy currently reports.

    Args:
        prom: Client exposing ``fetch_value``.

    Returns:
        Whatever ``prom.fetch_value`` yields for the count of distinct
        ``envoy_cluster_manager_cds_version`` values.
    """
    query = 'count(count_values("value", envoy_cluster_manager_cds_version))'
    return prom.fetch_value(query)
Пример #14
0
class Prometheus_Query:
    """Canned Prometheus queries for Kafka consumer-group and topic metrics.

    The ``Prometheus`` and ``OC`` helpers are created once at class
    definition time and shared by all instances. Query methods read
    ``topic_name`` (and ``group_name``) from the enclosing module scope and
    return whatever ``Prometheus.run_cmd`` returns.
    """

    p = Prometheus()
    # Fallback exporter address, replaced in __init__ when one is discovered
    instance_name = "10.244.0.85:9308"
    oc = OC()

    def __init__(self):
        """Point ``instance_name`` at the discovered Kafka exporter, if any."""
        ns, ip, port = self.get_kafka_exporter_ip()
        if ip and port:
            self.instance_name = "%s:%s" % (ip, port)

    def get_kafka_exporter_ip(self):
        """Locate the ``my-cluster-kafka-exporter`` service.

        Scans the service listing line by line; when several lines match,
        the last one wins.

        Returns:
            ``(namespace, ip, port)`` as strings. Fields that could not be
            extracted stay empty.
        """
        ns = ""
        ip = ""
        port = ""
        output = self.oc.get_services_all_namespace()
        try:
            for line in output.split("\n"):
                if "my-cluster-kafka-exporter" in line:
                    # presumably columns are NAMESPACE NAME TYPE CLUSTER-IP
                    # EXTERNAL-IP PORT(S) - verify against the OC helper
                    fields = line.split()
                    ns = fields[0]
                    ip = fields[3]
                    port = fields[5].split("/")[0].split(":")[0]
        except Exception as e:
            # Best effort: report the problem and return what was parsed.
            # Parenthesised single-argument print works on Python 2 and 3.
            print("it cannot find kafka exporter ip: %s" % str(e))
            return ns, ip, port
        print("find namespace (%s) exporter ip (%s:%s)" % (ns, ip, port))
        return ns, ip, port

    def query_lag(self):
        """Return the summed consumer-group lag for ``topic_name``."""
        # cmd = 'sum(kafka_consumergroup_lag{instance="%s",topic=~"%s"}) by (consumergroup, topic)' % (self.instance_name, topic_name)
        cmd = 'sum(kafka_consumergroup_lag{topic=~"%s"})' % (topic_name)
        return self.p.run_cmd(cmd)

    def query_avg_lag(self):
        """Return the 1-minute average lag of ``group_name`` on ``topic_name``."""
        cmd = 'avg_over_time(kafka_consumergroup_lag{topic="%s",consumergroup="%s"}[1m])' % (
            topic_name, group_name)
        return self.p.run_cmd(cmd)

    def query_log_offset(self):
        """Return the summed current partition offset for ``topic_name``."""
        cmd = 'sum(kafka_topic_partition_current_offset{topic=~"%s"})' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_log_offset_by_min(self):
        """Return the per-minute partition-offset delta over the last 3 minutes."""
        cmd = 'sum(delta(kafka_topic_partition_current_offset{topic=~"%s"}[3m])/3)' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_log_offset_by_sec(self):
        """Return the per-second partition-offset rate over the last minute."""
        cmd = 'sum(rate(kafka_topic_partition_current_offset{topic=~"%s"}[1m]))' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_current_offset(self):
        """Return the summed consumer-group current offset for ``topic_name``."""
        cmd = 'sum(kafka_consumergroup_current_offset{topic=~"%s"})' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_current_offset_by_min(self):
        """Return the per-minute consumer-offset delta over the last 3 minutes."""
        cmd = 'sum(delta(kafka_consumergroup_current_offset{topic=~"%s"}[3m])/3)' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_current_offset_by_sec(self):
        """Return the per-second consumer-offset rate over the last minute."""
        cmd = 'sum(rate(kafka_consumergroup_current_offset{topic=~"%s"}[1m]))' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_lag_by_sec(self):
        """Return the per-second lag rate over the last minute."""
        cmd = 'sum(rate(kafka_consumergroup_lag{topic=~"%s"}[1m]))' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_lag_by_min(self):
        """Return the per-minute lag delta over the last 3 minutes."""
        cmd = 'sum(delta(kafka_consumergroup_lag{topic=~"%s"}[3m])/3)' % (
            topic_name)
        return self.p.run_cmd(cmd)

    def query_pod_start_time(self, pod_name):
        """Return the ``kube_pod_start_time`` sample for ``pod_name``."""
        cmd = 'kube_pod_start_time{pod="%s"}' % pod_name
        return self.p.run_cmd(cmd)

    def wait_time(self, value):
        """Sleep for ``value`` seconds."""
        time.sleep(value)
Пример #15
0
    def observe_prom_metrics_range(self,
                                   observer,
                                   metrics_list,
                                   start_time,
                                   end_time='now',
                                   chunk_size='1h'):
        """Stream historical metric data to ``observer`` chunk by chunk.

        Walks the ``start_time``..``end_time`` range in windows of
        ``chunk_size``. For every window each metric in ``metrics_list`` is
        downloaded and each packet is pushed to ``observer.on_next``. The
        newest timestamp seen so far is recorded per metric in
        ``self.final_packet_timestamp``.

        Connection settings come from ``FLT_PROM_ACCESS_TOKEN`` and
        ``FLT_PROM_URL``; the process exits when neither is set.

        Args:
            observer: Object exposing ``on_next(packet)``.
            metrics_list: Iterable of metric names to download.
            start_time: Range start, in any form ``dateparser.parse`` accepts.
            end_time: Range end, same format as ``start_time``.
            chunk_size: Window size, parsed by ``dateparser.parse`` and
                measured against ``'now'``.

        Raises:
            Exception: Whatever ``observer.on_next`` raises, after it has
                been logged together with the offending packet.
        """
        token = os.getenv("FLT_PROM_ACCESS_TOKEN")
        url = os.getenv("FLT_PROM_URL")
        if not token and not url:
            sys.exit("Error: Prometheus credentials not found")
        client = Prometheus(url=url, token=token)

        # Window length in seconds: distance between now and the parsed size
        reference_now = dateparser.parse('now')
        chunk_origin = dateparser.parse(chunk_size)
        chunk_seconds = int(
            round((reference_now - chunk_origin).total_seconds()))

        # Both bounds are rounded to whole seconds
        start = round(dateparser.parse(start_time).timestamp(), 0)
        end = round(dateparser.parse(end_time).timestamp(), 0)

        _LOGGER.info(
            "Collecting metric data within datetime range:{0} - {1}".format(
                dateparser.parse(str(start)), dateparser.parse(str(end))))

        # NOTE(review): this running maximum is shared by all metrics, so a
        # metric can be stamped with another metric's newer timestamp -
        # confirm that is intended.
        current_latest_timestamp = 0

        while start < end:
            next_start = start + chunk_seconds
            # Windows stop one second short of the next one; the final
            # window is cut to end exactly at `end`.
            chunk_end_time = end if next_start >= end else next_start - 1

            for metric_name in metrics_list:
                _LOGGER.info(
                    "Current Chunk Info: Metric = {0}, Time range = {1} - {2}".
                    format(metric_name, dateparser.parse(str(start)),
                           dateparser.parse(str(chunk_end_time))))
                packets = client.get_metric_range_data(
                    metric_name=metric_name,
                    start_time=start,
                    end_time=chunk_end_time)
                _LOGGER.info("Collected {0} packets.".format(len(packets)))

                for packet in packets:
                    # Track the timestamp of the packet's last value
                    current_latest_timestamp = max(current_latest_timestamp,
                                                   packet['values'][-1][0])
                    try:
                        observer.on_next(packet)
                    except Exception as e:
                        _LOGGER.error(
                            "{0}, while processing the following metric packet: \n{1}"
                            .format(str(e), str(packet)))
                        raise e
                self.final_packet_timestamp[metric_name] = (
                    current_latest_timestamp)

            start = next_start