def test_init_single_metric(self):  # noqa D102
    """A single raw metric must yield exactly one Metric, list-wrapped or not."""
    raw_metric = self.raw_metrics_list[0][0]
    self.assertEqual(
        1,
        len(MetricsList(raw_metric)),
        "incorrect number of Metric objects initialized for a raw metric not in a list",
    )
    self.assertEqual(
        1,
        len(MetricsList([raw_metric])),
        "incorrect number of Metric objects initialized for a single metric list",
    )
def test_get_metric_range_data_with_chunk_size(self):
    """Fetch chunked range data and check the combined metric's time bounds.

    The combined metric must start within one minute after the requested
    start and end within one minute before the requested end.
    """
    start_time = datetime.now() - timedelta(minutes=65)
    chunk_size = timedelta(minutes=7)
    end_time = datetime.now() - timedelta(minutes=5)
    metric_data = self.pc.get_metric_range_data(
        metric_name="up",
        start_time=start_time,
        end_time=end_time,
        chunk_size=chunk_size,
    )
    metric_objects_list = MetricsList(metric_data)
    self.assertTrue(
        len(metric_objects_list) > 0, "no metrics received from prometheus")
    first = metric_objects_list[0]
    self.assertLess(
        start_time.timestamp(),
        first.start_time.timestamp(),
        "invalid metric start time (with given chunk_size)",
    )
    self.assertGreater(
        (start_time + timedelta(minutes=1)).timestamp(),
        first.start_time.timestamp(),
        "invalid metric start time (with given chunk_size)",
    )
    self.assertGreater(
        end_time.timestamp(),
        first.end_time.timestamp(),
        "invalid metric end time (with given chunk_size)",
    )
    self.assertLess(
        (end_time - timedelta(minutes=1)).timestamp(),
        first.end_time.timestamp(),
        "invalid metric end time (with given chunk_size)",
    )
def test_init(self):
    """Test if metrics initialized in the list are correct."""
    # Manually verified count of unique metric time-series in the fixture.
    expected_series_count = 9
    self.assertEqual(
        expected_series_count,
        len(MetricsList(self.raw_metrics_list)),
        "incorrect number of unique metric timeseries",
    )
def test_get_metric_range_data(self):  # noqa D102
    """Fetch unchunked range data and check the metric's time bounds.

    The returned metric must start within one minute after the requested
    start and end within one minute before the requested end.
    """
    start_time = datetime.now() - timedelta(minutes=10)
    end_time = datetime.now()
    metric_data = self.pc.get_metric_range_data(
        metric_name="up", start_time=start_time, end_time=end_time
    )
    metric_objects_list = MetricsList(metric_data)
    self.assertTrue(
        len(metric_objects_list) > 0, "no metrics received from prometheus")
    first = metric_objects_list[0]
    self.assertLess(
        start_time.timestamp(),
        first.start_time.timestamp(),
        "invalid metric start time",
    )
    self.assertGreater(
        (start_time + timedelta(minutes=1)).timestamp(),
        first.start_time.timestamp(),
        "invalid metric start time",
    )
    self.assertGreater(
        end_time.timestamp(),
        first.end_time.timestamp(),
        "invalid metric end time",
    )
    self.assertLess(
        (end_time - timedelta(minutes=1)).timestamp(),
        first.end_time.timestamp(),
        "invalid metric end time",
    )
def general(metric_name):
    """Query range data for *metric_name* and print the first series' values.

    Relies on module-level ``pc``, ``start_time``, ``end_time`` and
    ``chunk_size`` being defined before the call.
    """
    raw_chunks = pc.get_metric_range_data(
        metric_name,
        start_time=start_time,
        end_time=end_time,
        chunk_size=chunk_size,
    )
    first_series = MetricsList(raw_chunks)[0]
    print(first_series.metric_values)
def _getMetricsData(metric):
    """Return the metric_values of the first series matching *metric*.

    Relies on module-level ``pc``, ``start_time``, ``end_time`` and
    ``chunk_size`` being defined before the call.
    """
    raw_chunks = pc.get_metric_range_data(
        metric,
        start_time=start_time,
        end_time=end_time,
        chunk_size=chunk_size,
    )
    first_series = MetricsList(raw_chunks)[0]
    return first_series.metric_values
def pro():
    """Print the current 'haproxy_backend_up' metric for the prophet namespace."""
    # NOTE(review): bearer token hardcoded in source — rotate it and load the
    # credential from the environment or a secret store instead.
    prometheus = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers={
            "Authorization": "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
        },
        disable_ssl=True,
    )
    current_values = prometheus.get_current_metric_value(
        metric_name="haproxy_backend_up{exported_namespace='prophet'}"
    )
    print(MetricsList(current_values)[0])
def test_unique_metric_combination(self):  # noqa D102
    """The combined metric's bounds must fall within a minute of the fixture window."""
    window_start = datetime.datetime(2019, 7, 28, 10, 0)
    window_start_plus_1m = datetime.datetime(2019, 7, 28, 10, 1)
    window_end = datetime.datetime(2019, 7, 30, 10, 0)
    window_end_minus_1m = datetime.datetime(2019, 7, 30, 9, 59)
    # Build the combined metric once; construction is deterministic.
    combined = MetricsList(self.raw_metrics_list)[0]
    self.assertTrue(
        combined.start_time > window_start,
        "Combined metric start time incorrect",
    )
    self.assertTrue(
        combined.start_time < window_start_plus_1m,
        "Combined metric start time incorrect",
    )
    self.assertTrue(
        combined.end_time < window_end,
        "Combined metric end time incorrect",
    )
    self.assertTrue(
        combined.end_time > window_end_minus_1m,
        "Combined metric end time incorrect",
    )
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """Profile GPU metrics for a pod over the analysis window.

    Returns a dict mapping "<DOMAIN>/gpu-<index>" to a stringified usage dict,
    e.g. {ai.centaurus.io/gpu0: "{cur_mem_used:4GB, max_gpu_util:60,
    max_mem_cpy_util:34, cyclic:True, process_cnt:1}"}. If a key already
    exists its value is replaced. Returns an empty dict when the Prometheus
    connection fails.

    :param url: Prometheus server URL.
    :param pod_ip: pod IP; the exporter instance is assumed at "<pod_ip>:9400".
    :param ana_window: dateparser-style look-back window (e.g. '2m').
    :param metrics: metric name to query (defaults to module-level MEM_UTIL).
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    # If the connection fails, log and return the empty dict instead of raising.
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return ret_dict
    instance = pod_ip + ":9400"  # tmp fixed exporter port
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(
        metric_name=metrics,
        label_config=my_label_config,
        start_time=start_time,
        end_time=end_time)
    # Reorganize raw chunks into per-series Metric objects (label_config + metric_values).
    metric_object_list = MetricsList(metric_data)
    for item in metric_object_list:  # iterate through all the gpus on the node
        if 'gpu' not in item.label_config:  # handle metric config info exception
            continue
        # Renamed from `id` to avoid shadowing the builtin.
        gpu_index = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        key = DOMAIN + "/gpu-" + gpu_index
        cur_usage = collect_cur_usage(int(gpu_index))
        # metric_values is a two-column df: 1st column timestamp, 2nd value.
        ts = item.metric_values.iloc[:, 1]
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten the nested dictionary to a string, otherwise the
        # consumer fails with: cannot unmarshal string into Go value of type
        # map[string]interface {}
        ret_dict[key] = str(cur_usage)
    return ret_dict
def fetch(self, expression, number_of_days):
    """Yield (local timestamp, value) pairs for *expression* over the last days.

    :param expression: Prometheus query expression / metric name.
    :param number_of_days: integer look-back window in days.
    """
    start_time = parse_datetime('%dd' % number_of_days)
    end_time = parse_datetime('now')
    chunk_size = parse_timedelta('now', '1d')
    metric_data = self.prom.get_metric_range_data(
        expression,
        start_time=start_time,
        end_time=end_time,
        chunk_size=chunk_size,
    )
    # MetricsList combines the chunks into a single metric
    combined = MetricsList(metric_data)[0]
    for row in combined.metric_values.values:
        ts, val = row.tolist()
        # The timestamp is delivered in UTC; convert to the local timezone.
        local_ts = ts.to_pydatetime().replace(tzinfo=tz.tzutc()).astimezone(tz.tzlocal())
        yield local_ts, val
def compute_true_positive_rate(forecasted_anomalies, labeled_anomalies):
    # True-positive rate: (# positions where both series equal 1) divided by
    # (# labeled positions equal 1). Inputs are assumed to expose a `.values`
    # array of 0/1 flags (pandas Series-like) — TODO confirm against callers.
    # NOTE(review): "true_postive_rate" is a typo for "true_positive_rate"
    # (local name only, so behavior is unaffected).
    num_true_positive = sum((forecasted_anomalies.values == 1)
                            & (labeled_anomalies.values == 1))
    true_postive_rate = num_true_positive / sum(labeled_anomalies.values)
    return true_postive_rate


# Run for every metric defined in the METRICS_LIST
for metric in METRICS_LIST:
    # Download the train data from Prometheus
    train_data = MetricsList(
        pc.get_metric_range_data(
            metric_name=metric,
            start_time=Configuration.metric_start_time,
            end_time=Configuration.metric_train_data_end_time,
            chunk_size=Configuration.metric_chunk_size,
        ))
    # If the training data list downloaded is empty
    if not train_data:
        _LOGGER.error(
            "No Metric data received, please check the data window size")
        raise ValueError
    # If more than one time-series match the given metric, raise an error
    if len(train_data) > 1:
        # NOTE(review): this log call is missing the argument for %s.
        _LOGGER.error("Multiple timeseries matching %s were found")
        _LOGGER.error("The timeseries matched were: ")
        for timeseries in train_data:
            # (truncated: the loop body continues beyond this chunk)
def get_metric_obj_list(metric_data):
    """Wrap raw Prometheus range data in a MetricsList of per-series Metric objects."""
    metric_obj_list = MetricsList(metric_data)
    return metric_obj_list
def update_values(models_include=None):
    """Update db_values for every TS.

    If a Values record exists then its metric is updated. If a Values record
    does not exist then it is created, and a predictor Model is selected for
    it. Each Values record is associated with its TS.

    index (hash): {
        "metric" (Metric): first item of return value of
            MetricsList(get_metric_range_data())
        "ts" (tsKey): key of db_ts
        "model" (modelKey): key of db_models
    }

    :param models_include: optional collection of model keys; when given,
        only those models are considered for newly created records.

    Raises:
        Exception: [description]
        Exception: [description]
        Exception: [description]
        e: [description]
    """
    logger.info("Updating Values")
    now = datetime.now()
    # Monotonic generation counter shared across the whole update pass.
    generation = next(values_generation)
    for (h, ts) in db_ts.items():
        logger.debug("Updating [TS:{h}], labels:{labels}".format(
            h=h, labels=ts["labels"]))
        if h in db_values.keys():
            # TS is already tracked by a Values record in db_values:
            # re-fetch the current window, merge with the stored metric,
            # and truncate to the window horizon.
            current_start_time = now - Configuration.current_data_window_size
            record = db_values[h]
            metric = record["metric"]
            metric_data = pc.get_metric_range_data(
                metric_name=metric.metric_name,
                label_config=metric.label_config,
                start_time=current_start_time,
                end_time=now)
            metrics = MetricsList(metric_data)
            if len(metrics) != 1:
                # Exactly one time-series must match the stored labels.
                raise Exception("There can be only one")
            # Merge freshly fetched data with the stored metric (Metric supports `+`).
            new_metric = metrics[0] + metric
            trunk_metric = Metric(
                new_metric, current_start_time
            )  # This throws some exception really fast but this would have solved the problem.
            db_values[h]["metric"] = trunk_metric
            db_values[h]["generation"] = generation
            logger.debug(
                "Update and truncate [Metric:{h}] horizon:{current_start_time} metric_name:{metric_name}, label_config:{label_config}"
                .format(h=h,
                        metric_name=metric.metric_name,
                        label_config=metric.label_config,
                        current_start_time=current_start_time))
        else:
            # No Values record yet: build one from the TS labels and pick a model.
            current_start_time = now - Configuration.current_data_window_size
            metric_name = ts["labels"]["__name__"]
            labels = dict()
            labels.update(ts["labels"])
            del labels["__name__"]
            items = db_models.items()
            # NOTE(review): `not models_include is None` is the non-idiomatic
            # spelling of `models_include is not None`.
            if not models_include is None:
                items = filter(lambda item: item[0] in models_include, items)
            # Keep only models whose full label set hashes to this TS's hash.
            models = list(
                filter(
                    lambda model: ts_hash(all_labels=model[1]["labels"]) == h,
                    items))
            if len(models) == 0:
                logger.warning(
                    "No models matching labels for [Metric:{h}] metric_name:{metric_name}, label_config:{label_config}"
                    .format(h=h, metric_name=metric_name, label_config=labels))
                continue
            metric_data = pc.get_metric_range_data(
                metric_name=metric_name,
                label_config=labels,
                start_time=current_start_time,
                end_time=now)
            metrics = MetricsList(metric_data)
            if len(metrics) != 1:
                # Exactly one time-series must match the derived labels.
                raise Exception("There can be only one")
            # pick the most recent model (missing timestamps sort as epoch 0)
            models.sort(key=lambda model: model[1].get(
                "timestamp", datetime.fromtimestamp(0)),
                        reverse=True)
            predictor = models[0][0]
            # predictor.build_prediction_df()
            record = {
                "metric": metrics[0],
                "ts": h,
                "model": predictor,
                "generation": generation
            }
            db_values.update({h: record})
            logger.debug(
                "Add [Metric:{h}] horizon:{current_start_time} metric_name:{metric_name}, label_config:{label_config}"
                .format(h=h,
                        metric_name=metric_name,
                        label_config=labels,
                        current_start_time=current_start_time))
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """Dump all DCGM metrics to CSV files, one per (instance, gpu) pair.

    Each CSV has a 'datetime' column plus one column per DCGM metric.

    :param start_time: dateparser-style start of the window (e.g. '5m').
    :param end_time: dateparser-style end of the window (e.g. 'now').
    :param instance: optional single instance (pod ip:port) to restrict to.
    :param gpu_id: optional single GPU index to restrict to.
    """
    # Resolve the window once up front so every query below uses the same
    # bounds, even if the queries themselves run at different times.
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)
    # Connect to the prometheus server; exit if the connection fails.
    url = "http://prometheus:9090"  # use service name, instead of ip to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)
    # All metrics under the profiler job. Note: some instances/gpus may not
    # have every metric due to model variance.
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}
    # Snapshot of all the instances (pod ip) currently reporting GPU utilization.
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # Query per instance to discover which GPU indices it exposes.
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        ins_gpu[ins] = metric_df.gpu.unique()
    # Narrow to a single instance and/or GPU when requested.
    if instance != '':
        instances = [instance]
    for ins in instances:
        if gpu_id != '':
            gpus = [gpu_id]
        else:
            gpus = ins_gpu[ins]
        print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:  # one range query per metric name
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    # Bug fix: start_time/end_time were already parsed above;
                    # re-wrapping them in parse_datetime re-parsed a datetime.
                    start_time=start_time,
                    end_time=end_time)
                # Reorganize data to label_config and metric_values.
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values['ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']
            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
return true_postive_rate for metric in METRICS_LIST: rolling_data_window = Configuration.rolling_data_window_size metric_start_time = str( dateparser.parse(rolling_data_window) - (dateparser.parse("now") - dateparser.parse(rolling_data_window))) # Download the initial training data from prometheus train_data = MetricsList( pc.get_metric_range_data( metric_name=metric, start_time=metric_start_time, end_time=rolling_data_window, chunk_size=None, )) # If the training data downloaded is empty if not train_data: _LOGGER.error( "No Metric data received, please check the data window size") raise ValueError # If more than one time-series match the given metric, raise an error if len(train_data) > 1: _LOGGER.error("Multiple timeseries matching %s were found") _LOGGER.error("The timeseries matched were: ") for timeseries in train_data: