class Configuration:
    """docstring for Configuration."""

    # url for the prometheus host
    prometheus_url = os.getenv(
        "PROMETEUS_URL",
        "http://prometheus-k8s-monitoring.192.168.99.104.nip.io")

    # any headers that need to be passed while connecting to the prometheus host
    prom_connect_headers = None
    if os.getenv("PROMETEUS_ACCESS_TOKEN"):
        prom_connect_headers = {
            "Authorization": "bearer " + os.getenv("PROMETEUS_ACCESS_TOKEN")
        }

    # list of metrics to scrape and predict; multiple metrics are separated with ";"
    metrics_list = str(
        os.getenv(
            "METRICS_LIST",
            "go_memstats_heap_objects{endpoint='web',instance='172.17.0.17:9090',job='prometheus-k8s',namespace='monitoring',pod='prometheus-k8s-1',service='prometheus-k8s'}",
        )).split(";")

    # rolling data window on which the model is trained
    rolling_training_window_size = parse_timedelta(
        "now", os.getenv("ROLLING_TRAINING_WINDOW_SIZE", "120m"))

    # how often the anomaly detector retrains the model (in minutes)
    retraining_interval_minutes = int(
        os.getenv("RETRAINING_INTERVAL_MINUTES", "60"))
    metric_chunk_size = parse_timedelta("now",
                                        str(retraining_interval_minutes) + "m")

    # anomaly detection tuning: width of the deviation band and which side(s)
    # of the prediction interval are treated as anomalous
    deviations = int(os.getenv("DEVIATIONS", "3"))
    anomaly_border = str(os.getenv("ANOMALY_BORDER", "both"))

    # forecasting model selection
    algorithm_name = str(os.getenv("ALGORITHM", "agile"))
    algorithm_resolver = {
        "robust": prophet_model.MetricPredictor,
        "agile": sarima_model.MetricPredictor,
        "basic": fourier_model.MetricPredictor,
    }
    algorithm = algorithm_resolver.get(algorithm_name)

    # seasonality hint passed to the forecasting model
    seasonality = str(os.getenv("SEASONALITY", "daily"))

    # uri for the mlflow tracking server
    mlflow_tracking_uri = "http://localhost:5000"

    metric_start_time = parse_datetime(
        os.getenv("DATA_START_TIME", "2020-02-05 13:00:00"))
    metric_end_time = parse_datetime(
        os.getenv("DATA_END_TIME", "2020-02-05 13:36:00"))
    metric_train_data_end_time = metric_start_time + rolling_training_window_size

    _LOGGER.info("Metric train data start time: %s", metric_start_time)
    _LOGGER.info("Metric train data end time/test data start time: %s",
                 metric_train_data_end_time)
    _LOGGER.info("Metric test end time: %s", metric_end_time)
    _LOGGER.info("Metric data rolling training window size: %s",
                 rolling_training_window_size)
    _LOGGER.info("Model retraining interval: %s minutes",
                 retraining_interval_minutes)
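
# A minimal usage sketch (not part of the snippet above): the class body runs at
# import time, so environment overrides must be set before the module defining
# Configuration is imported. The module name "configuration" is an assumption.
import os

os.environ["ALGORITHM"] = "robust"                  # resolves to prophet_model.MetricPredictor
os.environ["ROLLING_TRAINING_WINDOW_SIZE"] = "240m"

from configuration import Configuration

predictor_class = Configuration.algorithm           # class looked up via algorithm_resolver
print(Configuration.metrics_list)                   # queries parsed from METRICS_LIST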
def dataframe_creation():
    # SparkSession is the entry point to programming Spark with the Dataset and DataFrame API.
    # master - sets the Spark master URL to connect to, e.g. "local" to run locally
    # appName - sets the application name, which will be shown in the Spark web UI
    # config - sets a configuration option
    # getOrCreate() - returns an existing SparkSession or, if there is none,
    #                 creates a new one based on the options set on this builder

    # connect to Prometheus
    prom = pac.PrometheusConnect(url="http://localhost:9090/", disable_ssl=True)
    # get the names of all Prometheus metrics
    all_metrics = prom.all_metrics()
    metrics = list()
    # set the time range for the data,
    # in this case from one day ago ("1d") until "now"
    start_time = pac_u.parse_datetime("1d")
    end_time = pac_u.parse_datetime("now")
    dataframe_list = dict()
    # loop that keeps only the metrics we need, namely those whose names contain
    # "application" or "process"; this is how the metrics collected from the
    # application are labelled.
    """
    for i in all_metrics:
        if i.split('application')[0] == '' or i.split('process')[0] == '':
            metric_data = prom.get_metric_range_data(
                i,
                start_time=start_time,
                end_time=end_time
            )
            if metric_data:
                # take only the metric values
                values = metric_data[0]['values']
                # create a DataFrame with the metric values and store it in the
                # dict keyed by the metric name
                print(values)
                dataframe_list[i] = spark.createDataFrame(values, ['time', 'value'])
    """
    dataframe = None
    for i in all_metrics:
        if i.split('application')[0] == '':  # or i.split('process')[0] == '':
            metric_data = prom.get_metric_range_data(i,
                                                     start_time=start_time,
                                                     end_time=end_time)
            if metric_data:
                # take only the metric values
                values = metric_data[0]['values']
                if dataframe is None:
                    dataframe = spark.createDataFrame(values, ['time', i])
                else:
                    df = spark.createDataFrame(values, ['time', i])
                    dataframe = dataframe.join(df, 'time', 'right')
    # dataframe.show()
    dataframe.write.csv('metrics.csv')
    return dataframe
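
# A minimal driver for dataframe_creation() (a sketch, not from the original file):
# it supplies the module-level objects the function relies on -- the
# prometheus_api_client aliases `pac`/`pac_u` and the SparkSession `spark`
# described in the comments at the top of the function.
import prometheus_api_client as pac
import prometheus_api_client.utils as pac_u
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local[*]")
         .appName("prometheus-metrics")
         .getOrCreate())

metrics_df = dataframe_creation()
metrics_df.printSchema()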
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """if key exists, the value will be replaced, add dynamic status
    {ai.centaurus.io/gpu0:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:1},
     ai.centaurus.io/gpu1:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:2,
                           processes:[{pid:25678, cur_mem_used:3GB},{pid:67234, cur_mem_used:1GB}]}
    }
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    # bail out early on connection errors
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return ret_dict  # if the connection fails, return an empty dict
    instance = pod_ip + ":9400"  # tmp fixed
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(metric_name=metrics,
                                              label_config=my_label_config,
                                              start_time=start_time,
                                              end_time=end_time)
    # reorganize data to label_config and metric_values
    metric_object_list = MetricsList(metric_data)
    ret_dict = dict()
    for item in metric_object_list:  # iterate through all the gpus on the node
        if 'gpu' not in item.label_config:  # handle metric config info exception
            continue
        id = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        # ip = item.label_config['instance']
        key = DOMAIN + "/gpu-" + id
        cur_usage = collect_cur_usage(int(id))
        # metric_values is a two-column DataFrame: the first column is the
        # timestamp, the second is the value
        ts = item.metric_values.iloc[:, 1]
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten the nested dictionary to a string, otherwise the
        # annotation update fails with
        # "cannot unmarshal string into Go value of type map[string]interface {}"
        ret_dict[key] = str(cur_usage)
    return ret_dict
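
# Hypothetical call to profiling() above: assumes a Prometheus reachable at the
# given URL that scrapes a DCGM exporter on the pod at port 9400 (the pod IP is a
# placeholder) and that MEM_UTIL names the DCGM memory-utilization metric.
annotations = profiling(url="http://prometheus:9090",
                        pod_ip="10.244.1.15",
                        ana_window="5m")
for key, value in annotations.items():
    # e.g. "ai.centaurus.io/gpu-0" -> stringified usage dict
    print(key, value)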
def fetch(self, expression, number_of_days):
    start_time = parse_datetime('%dd' % number_of_days)
    end_time = parse_datetime('now')
    chunk_size = parse_timedelta('now', '1d')
    metric_data = self.prom.get_metric_range_data(
        expression,
        start_time=start_time,
        end_time=end_time,
        chunk_size=chunk_size,
    )
    # MetricsList combines the chunks into a single metric
    metric = MetricsList(metric_data)[0]
    # Yield tuples of timestamp, value
    for value in metric.metric_values.values:
        ts, val = value.tolist()
        # The timestamp is delivered in UTC, convert to local
        ts = ts.to_pydatetime().replace(tzinfo=tz.tzutc())
        ts = ts.astimezone(tz.tzlocal())
        yield ts, val
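
# A sketch of how fetch() might be driven, assuming it belongs to a class whose
# instances carry a PrometheusConnect as `self.prom`; here a SimpleNamespace
# stands in for that class, and the URL and query are placeholders.
from types import SimpleNamespace

from dateutil import tz  # used inside fetch for the UTC -> local conversion
from prometheus_api_client import MetricsList, PrometheusConnect
from prometheus_api_client.utils import parse_datetime, parse_timedelta

client = SimpleNamespace(prom=PrometheusConnect(url="http://localhost:9090",
                                                disable_ssl=True))
for ts, val in fetch(client, "up{job='prometheus'}", number_of_days=2):
    print(ts.isoformat(), val)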
class Configuration:
    """docstring for Configuration."""

    # url for the prometheus host
    prometheus_url = os.getenv("FLT_PROM_URL")

    # any headers that need to be passed while connecting to the prometheus host
    prom_connect_headers = None
    # example oauth token passed as a header
    if os.getenv("FLT_PROM_ACCESS_TOKEN"):
        prom_connect_headers = {
            "Authorization": "bearer " + os.getenv("FLT_PROM_ACCESS_TOKEN")
        }

    # list of metrics that need to be scraped and predicted
    # multiple metrics can be separated with a ";"
    # if a metric configuration matches more than one timeseries,
    # it will scrape all the timeseries that match the config.
    metrics_list = str(
        os.getenv(
            "FLT_METRICS_LIST",
            "up{app='openshift-web-console', instance='172.44.0.18:8443'}")
    ).split(";")

    # uri for the mlflow tracking server
    mlflow_tracking_uri = str(os.getenv("MLFLOW_TRACKING_URI"))

    # threshold value to calculate true anomalies using a linear function
    true_anomaly_threshold = float(
        os.getenv("FLT_TRUE_ANOMALY_THRESHOLD", "0.001"))

    metric_start_time = parse_datetime(
        os.getenv("FLT_DATA_START_TIME", "2019-08-05 18:00:00"))
    metric_end_time = parse_datetime(
        os.getenv("FLT_DATA_END_TIME", "2019-08-08 18:00:00"))

    # this will create a rolling data window on which the model will be trained
    # example: if set to 15d it will train the model on the past 15 days of data;
    # every time new data is added, it will truncate the data that is out of this range.
    rolling_training_window_size = parse_timedelta(
        "now", os.getenv("FLT_ROLLING_TRAINING_WINDOW_SIZE", "2d"))
    metric_train_data_end_time = metric_start_time + rolling_training_window_size

    # how often the anomaly detector should retrain the model (in minutes)
    retraining_interval_minutes = int(
        os.getenv("FLT_RETRAINING_INTERVAL_MINUTES", "120"))
    metric_chunk_size = parse_timedelta("now",
                                        str(retraining_interval_minutes) + "m")

    _LOGGER.info("Metric train data start time: %s", metric_start_time)
    _LOGGER.info("Metric train data end time/test data start time: %s",
                 metric_train_data_end_time)
    _LOGGER.info("Metric test end time: %s", metric_end_time)
    _LOGGER.info("Metric data rolling training window size: %s",
                 rolling_training_window_size)
    _LOGGER.info("Model retraining interval: %s minutes",
                 retraining_interval_minutes)
    _LOGGER.info("True anomaly threshold: %s", true_anomaly_threshold)
    _LOGGER.info("MLflow server url: %s", mlflow_tracking_uri)
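
# A minimal sketch (not from the original module): once the class above has been
# evaluated with FLT_PROM_URL pointing at a reachable Prometheus, its settings can
# drive a metric fetch like this.
from prometheus_api_client import PrometheusConnect

pc = PrometheusConnect(url=Configuration.prometheus_url,
                       headers=Configuration.prom_connect_headers,
                       disable_ssl=True)

for metric_query in Configuration.metrics_list:
    data = pc.get_metric_range_data(metric_query,
                                    start_time=Configuration.metric_start_time,
                                    end_time=Configuration.metric_train_data_end_time,
                                    chunk_size=Configuration.metric_chunk_size)
    print(metric_query, "->", len(data), "matching series returned")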
from prometheus_api_client import Metric, MetricsList, PrometheusConnect
from prometheus_api_client.utils import parse_datetime, parse_timedelta
import matplotlib.pyplot as plt
import pandas as pd

pc = PrometheusConnect(
    url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
    headers={
        "Authorization": "bearer 7lmyVwWaTrWZYwiM0KRN30fBw5W70OkcqOMnizZ-cr0"
    },
    disable_ssl=True)

start_time = parse_datetime("7d")
end_time = parse_datetime("now")
chunk_size = parse_timedelta("now", "1d")


def get_data(metrics, timestamp_filenames, datafile):

    def _getTimestamps(timestamp_filename):
        print(timestamp_filename)
        file_name = open(timestamp_filename, "r")
        ts = file_name.readlines()
        return ts

    def _getMetricsData(metric):
        metric_data = pc.get_metric_range_data(
            metric,
            start_time=start_time,
            end_time=end_time,
            chunk_size=chunk_size,
        )
        return metric_data
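
# Illustrative follow-up (not from the original file): one way the fetched chunks
# can be merged and visualized with the pandas/matplotlib imports above; "up" is a
# placeholder query.
metric_data = pc.get_metric_range_data(
    "up",
    start_time=start_time,
    end_time=end_time,
    chunk_size=chunk_size,
)
metric = MetricsList(metric_data)[0]   # merge the chunks into a single Metric
values = metric.metric_values          # DataFrame with 'ds' (timestamp) and 'y' (value)
values.plot(x="ds", y="y", title=metric.metric_name)
plt.show()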
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """
    all DCGM metrics, on all instances, and all gpus
    save dumped data to csv file
    """
    # parse the time range first, in case multiple queries are issued at different times later
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)
    # connect to the prometheus server, exit if the connection fails
    url = "http://prometheus:9090"  # use the service name instead of an ip to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)
    # get all metrics under the profiler job;
    # note: some instances/gpus may not have all the metrics due to model variance
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}
    # take a snapshot of all the instances (pod ip)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # add the instance to the query labels
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        gpus = metric_df.gpu.unique()
        # record each instance's gpus in the dictionary
        ins_gpu[ins] = gpus

    my_label_config = {"job": "profiler-pods", "gpu": gpu_id}  # select gpu0
    # my_label_config = {"instance": instance}  # select all gpus
    # if one particular instance is given, update instances
    if instance != '':
        instances = [instance, ]
    for ins in instances:
        if gpu_id != '':
            gpus = [gpu_id, ]
        else:
            gpus = ins_gpu[ins]
        print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # query each metric_name in turn, reusing the already-parsed time range
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)
                # reorganize data to label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values['ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']
            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
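
# Hypothetical invocation: dump the last hour of DCGM metrics for gpu "0" on every
# instance of the "profiler-pods" job into one csv file per (instance, gpu) pair.
if __name__ == "__main__":
    get_all_metrics(start_time='1h', end_time='now', gpu_id='0')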