Example #1
import logging
import os

from prometheus_api_client.utils import parse_datetime, parse_timedelta

# prophet_model, sarima_model and fourier_model are project-local modules that
# each provide a MetricPredictor class; they are assumed to be importable here.
import fourier_model
import prophet_model
import sarima_model

_LOGGER = logging.getLogger(__name__)


class Configuration:
    """docstring for Configuration."""

    prometheus_url = os.getenv(
        "PROMETEUS_URL",
        "http://prometheus-k8s-monitoring.192.168.99.104.nip.io")

    # headers to pass when connecting to the Prometheus host (e.g. an auth token)
    prom_connect_headers = None
    if os.getenv("PROMETEUS_ACCESS_TOKEN"):
        prom_connect_headers = {
            "Authorization": "bearer " + os.getenv("PROMETEUS_ACCESS_TOKEN")
        }

    metrics_list = str(
        os.getenv(
            "METRICS_LIST",
            "go_memstats_heap_objects{endpoint='web',instance='172.17.0.17:9090',job='prometheus-k8s',namespace='monitoring',pod='prometheus-k8s-1',service='prometheus-k8s'}",
        )).split(";")

    rolling_training_window_size = parse_timedelta(
        "now", os.getenv("ROLLING_TRAINING_WINDOW_SIZE", "120m"))

    retraining_interval_minutes = int(
        os.getenv("RETRAINING_INTERVAL_MINUTES", "60"))
    metric_chunk_size = parse_timedelta("now",
                                        str(retraining_interval_minutes) + "m")

    deviations = int(os.getenv("DEVIATIONS", "3"))

    anomaly_border = str(os.getenv("ANOMALY_BORDER", "both"))

    algorithm_name = str(os.getenv("ALGORITHM", "agile"))

    algorithm_resolver = {
        "robust": prophet_model.MetricPredictor,
        "agile": sarima_model.MetricPredictor,
        "basic": fourier_model.MetricPredictor
    }
    algorithm = algorithm_resolver.get(algorithm_name)

    seasonality = str(os.getenv("SEASONALITY", "daily"))

    mlflow_tracking_uri = "http://localhost:5000"

    metric_start_time = parse_datetime(
        os.getenv("DATA_START_TIME", "2020-02-05 13:00:00"))

    metric_end_time = parse_datetime(
        os.getenv("DATA_END_TIME", "2020-02-05 13:36:00"))

    metric_train_data_end_time = metric_start_time + rolling_training_window_size

    _LOGGER.info("Metric train data start time: %s", metric_start_time)
    _LOGGER.info("Metric train data end time/test data start time: %s",
                 metric_train_data_end_time)
    _LOGGER.info("Metric test end time: %s", metric_end_time)
    _LOGGER.info("Metric data rolling training window size: %s",
                 rolling_training_window_size)
    _LOGGER.info("Model retraining interval: %s minutes",
                 retraining_interval_minutes)
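
For reference, a small hedged sketch of what the parse_datetime / parse_timedelta helpers used above return with the default values (the exact printed output is illustrative):

from prometheus_api_client.utils import parse_datetime, parse_timedelta

start = parse_datetime("2020-02-05 13:00:00")   # same default as DATA_START_TIME
window = parse_timedelta("now", "120m")         # same default as ROLLING_TRAINING_WINDOW_SIZE
print(window)                                   # roughly 2:00:00
print(start + window)                           # train-data end time, ~2020-02-05 15:00:00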
Example #2
import prometheus_api_client as pac
import prometheus_api_client.utils as pac_u
from pyspark.sql import SparkSession


def dataframe_creation():
    # Entry point for Spark programming with the Dataset and DataFrame APIs.
    # master - sets the Spark master URL to connect to, e.g. "local" to run locally
    # appName - sets the application name, which is shown in the Spark web UI
    # config - sets a configuration option
    # getOrCreate() - returns an existing SparkSession or, if there is none,
    #                 creates a new one based on the options set in this builder
    spark = SparkSession.builder \
        .master("local") \
        .appName("prometheus_metrics") \
        .getOrCreate()  # the application name here is illustrative

    # connect to Prometheus
    prom = pac.PrometheusConnect(url="http://localhost:9090/",
                                 disable_ssl=True)

    # get the names of all Prometheus metrics
    all_metrics = prom.all_metrics()
    metrics = list()

    # set the time range for the data,
    # in this case from "one day ago" to "now"
    start_time = pac_u.parse_datetime("1d")
    end_time = pac_u.parse_datetime("now")

    dataframe_list = dict()

    # Loop that selects only the metrics we need, namely those whose names
    # contain "application" or "process"; metrics collected from the
    # application are labelled with these words.
    """
    for i in all_metrics:
        if i.split('application')[0] == '' or i.split('process')[0] == '':
            metric_data = prom.get_metric_range_data(
                i,
                start_time=start_time,
                end_time=end_time
            )
            if metric_data:
                # take only the metric values
                values = metric_data[0]['values']
                # create a dataframe with the metric values and store it in a
                # dictionary keyed by the metric name
                print(values)
                dataframe_list[i] = spark.createDataFrame(values, ['time', 'value'])
    """
    dataframe = None
    for i in all_metrics:
        if i.split('application')[0] == '':  #or i.split('process')[0] == '':
            metric_data = prom.get_metric_range_data(i,
                                                     start_time=start_time,
                                                     end_time=end_time)
            if metric_data:
                # take only the metric values
                values = metric_data[0]['values']
                if dataframe is None:
                    dataframe = spark.createDataFrame(values, ['time', i])
                else:
                    df = spark.createDataFrame(values, ['time', i])
                    dataframe = dataframe.join(df, 'time', 'right')

    # dataframe.show()
    dataframe.write.csv('metrics.csv')
    return dataframe
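
For reference, the joins above rely on the raw shape returned by get_metric_range_data: a list with one entry per matching time series, each carrying a 'metric' dict of labels and a 'values' list of [timestamp, value] pairs. A minimal standalone sketch of inspecting that structure (the URL and metric name are placeholders):

from prometheus_api_client import PrometheusConnect
from prometheus_api_client.utils import parse_datetime

prom = PrometheusConnect(url="http://localhost:9090/", disable_ssl=True)  # placeholder URL
data = prom.get_metric_range_data("up",                                   # placeholder metric name
                                  start_time=parse_datetime("1d"),
                                  end_time=parse_datetime("now"))
for series in data:
    print(series['metric'])       # label set of this time series
    print(series['values'][:3])   # first few [unix_timestamp, value] pairs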
Example #3
import logging

from prometheus_api_client import MetricsList, PrometheusConnect
from prometheus_api_client.utils import parse_datetime

# MEM_UTIL, DOMAIN, collect_cur_usage and cyclic_pattern_detection are defined
# elsewhere in the surrounding module.


def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """if key exists, the value will be replaced,
       add dynamic status
       {ai.centaurus.io/gpu0:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:1},
        ai.centaurus.io/gpu1:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:2, processes:[{pid:25678, cur_mem_used:3GB},{pid:67234, cur_mem_used:1GB}]}                                 
       }
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    # bail out early if the Prometheus connection fails
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return ret_dict  # if the connection fails, return an empty dict
    instance = pod_ip + ":9400"  # DCGM exporter port, temporarily hard-coded
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(metric_name=metrics,
                                              label_config=my_label_config,
                                              start_time=start_time,
                                              end_time=end_time)
    # reorganize data into label_config and metric_values
    metric_object_list = MetricsList(metric_data)
    for item in metric_object_list: # iterate through all the gpus on the node
        if 'gpu' not in item.label_config: # handle metric config info exception
            continue
        id = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        # ip = item.label_config['instance']
        key = DOMAIN + "/gpu-" + id
        cur_usage = collect_cur_usage(int(id))
        ts = item.metric_values.iloc[:, 1]  # metric_values is a two-column frame: col 0 is the timestamp, col 1 the value
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)       
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten nested dictionary to string, otherwise error "cannot unmarshal string into Go value of type map[string]interface {}""
        ret_dict[key] = str(cur_usage)
    return ret_dict
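
A hedged usage sketch: the Prometheus URL and pod IP below are placeholders, and the result is the flattened per-GPU status dict described in the docstring.

# placeholders: point these at your Prometheus service and the DCGM exporter pod
annotations = profiling(url="http://prometheus:9090", pod_ip="10.244.1.7")
for key, status in annotations.items():
    print(key, status)   # e.g. ai.centaurus.io/gpu-0  "{'cur_mem_used': ..., 'max_mem_util': ...}"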
Example #4
    def fetch(self, expression, number_of_days):
        start_time = parse_datetime('%dd' % number_of_days)
        end_time = parse_datetime('now')
        chunk_size = parse_timedelta('now', '1d')

        metric_data = self.prom.get_metric_range_data(
            expression,
            start_time=start_time,
            end_time=end_time,
            chunk_size=chunk_size,
        )

        # MetricsList combines the chunks into a single metric
        metric = MetricsList(metric_data)[0]

        # Yield tuples of timestamp, value
        for value in metric.metric_values.values:
            ts, val = value.tolist()

            # The timestamp is delivered in UTC, convert to local
            ts = ts.to_pydatetime().replace(tzinfo=tz.tzutc())
            ts = ts.astimezone(tz.tzlocal())

            yield ts, val
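
The last step above converts the UTC pandas timestamp to local time; a minimal standalone sketch of that conversion (the timestamp value is arbitrary):

import pandas as pd
from dateutil import tz

ts = pd.Timestamp("2023-01-01 12:00:00")   # arbitrary example timestamp
local = ts.to_pydatetime().replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal())
print(local.isoformat())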
Example #5
import logging
import os

from prometheus_api_client.utils import parse_datetime, parse_timedelta

_LOGGER = logging.getLogger(__name__)


class Configuration:
    """docstring for Configuration."""

    # url for the prometheus host
    prometheus_url = os.getenv("FLT_PROM_URL")

    # any headers that need to be passed while connecting to the prometheus host
    prom_connect_headers = None
    # example OAuth token passed as a header
    if os.getenv("FLT_PROM_ACCESS_TOKEN"):
        prom_connect_headers = {
            "Authorization": "bearer " + os.getenv("FLT_PROM_ACCESS_TOKEN")
        }

    # list of metrics that need to be scraped and predicted
    # multiple metrics can be separated with a ";"
    # if a metric configuration matches more than one timeseries,
    # it will scrape all the timeseries that match the config.
    metrics_list = str(
        os.getenv(
            "FLT_METRICS_LIST",
            "up{app='openshift-web-console', instance='172.44.0.18:8443'}")
    ).split(";")

    # uri for the mlflow tracking server
    mlflow_tracking_uri = str(os.getenv("MLFLOW_TRACKING_URI"))

    # threshold value to calculate true anomalies using a linear function
    true_anomaly_threshold = float(
        os.getenv("FLT_TRUE_ANOMALY_THRESHOLD", "0.001"))

    metric_start_time = parse_datetime(
        os.getenv("FLT_DATA_START_TIME", "2019-08-05 18:00:00"))

    metric_end_time = parse_datetime(
        os.getenv("FLT_DATA_END_TIME", "2019-08-08 18:00:00"))

    # this will create a rolling data window on which the model will be trained
    # example: if set to 15d will train the model on past 15 days of data,
    # every time new data is added, it will truncate the data that is out of this range.
    rolling_training_window_size = parse_timedelta(
        "now", os.getenv("FLT_ROLLING_TRAINING_WINDOW_SIZE", "2d"))

    metric_train_data_end_time = metric_start_time + rolling_training_window_size

    # How often should the anomaly detector retrain the model (in minutes)
    retraining_interval_minutes = int(
        os.getenv("FLT_RETRAINING_INTERVAL_MINUTES", "120"))
    metric_chunk_size = parse_timedelta("now",
                                        str(retraining_interval_minutes) + "m")

    _LOGGER.info("Metric train data start time: %s", metric_start_time)
    _LOGGER.info("Metric train data end time/test data start time: %s",
                 metric_train_data_end_time)
    _LOGGER.info("Metric test end time: %s", metric_end_time)
    _LOGGER.info("Metric data rolling training window size: %s",
                 rolling_training_window_size)
    _LOGGER.info("Model retraining interval: %s minutes",
                 retraining_interval_minutes)
    _LOGGER.info("True anomaly threshold: %s", true_anomaly_threshold)
    _LOGGER.info("MLflow server url: %s", mlflow_tracking_uri)
Example #6
from prometheus_api_client import Metric, MetricsList, PrometheusConnect
from prometheus_api_client.utils import parse_datetime, parse_timedelta

import matplotlib.pyplot as plt
import pandas as pd

pc = PrometheusConnect(
    url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
    headers={
        "Authorization": "bearer 7lmyVwWaTrWZYwiM0KRN30fBw5W70OkcqOMnizZ-cr0"
    },
    disable_ssl=True)

start_time = parse_datetime("7d")
end_time = parse_datetime("now")
chunk_size = parse_timedelta("now", "1d")


def get_data(metrics, timestamp_filenames, datafile):
    def _getTimestamps(timestamp_filename):
        print(timestamp_filename)
        # read the timestamps, one per line, closing the file when done
        with open(timestamp_filename, "r") as ts_file:
            ts = ts_file.readlines()
        return ts

    def _getMetricsData(metric):
        metric_data = pc.get_metric_range_data(
            metric,
            start_time=start_time,
            end_time=end_time,
            chunk_size=chunk_size,
        )
        return metric_data
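
The Metric and matplotlib imports at the top hint at plotting; a hedged sketch of one way the fetched data can be visualised (the metric name is a placeholder):

# placeholder metric; any name from pc.all_metrics() would do here
metric_data = pc.get_metric_range_data("up",
                                       start_time=start_time,
                                       end_time=end_time,
                                       chunk_size=chunk_size)
metrics_list = MetricsList(metric_data)
metrics_list[0].plot()   # Metric.plot() draws the time series with matplotlib
plt.show()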
Example #7
import logging

import pandas as pd
from prometheus_api_client import (MetricsList, MetricSnapshotDataFrame,
                                   PrometheusConnect)
from prometheus_api_client.utils import parse_datetime


def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """
    all DCGM metrics, on all instances, and all gpus
    save dumped data to csv file
    """
    # parse the time range once up front, so all later queries use the same window
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)
    # connect to the Prometheus server, exit if the connection fails
    url = "http://prometheus:9090"  # use the service name instead of an IP to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)
    # get all metrics under profiler job, note: some instances/gpus may not have all the metrics due to model variance
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}
    # get a snapshot of all the instances (pod IPs)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # add instance in query
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        gpus = metric_df.gpu.unique()
        # put each instance's gpus into dictionary
        ins_gpu[ins] = gpus

    my_label_config = {"job": "profiler-pods", "gpu": gpu_id}  # select gpu0
    #my_label_config = {"instance": instance}  # select all gpu
    # if one particular instance is given, update instances
    if instance != '':
        instances = [
            instance,
        ]
    for ins in instances:
        if gpu_id != '':
            gpus = [
                gpu_id,
            ]
        else:
            gpus = ins_gpu[ins]
            print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)

                # reorganize data to label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values[
                            'ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']

            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
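
A hedged usage sketch: the instance address and GPU index are placeholders; each call writes one "<instance>_<gpu>.csv" file per GPU it finds.

# dump the last hour of DCGM metrics for every instance and GPU found
get_all_metrics(start_time='1h', end_time='now')

# or restrict the dump to one pod (placeholder address and GPU index)
get_all_metrics(start_time='1h', end_time='now',
                instance='10.244.1.7:9400', gpu_id='0')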