def train_individual_model(predictor_model, initial_run):
    metric_to_predict = predictor_model.metric
    pc = PrometheusConnect(
        url=Configuration.prometheus_url,
        headers=Configuration.prom_connect_headers,
        disable_ssl=True,
    )

    data_start_time = datetime.now() - Configuration.metric_chunk_size
    if initial_run:
        data_start_time = (
            datetime.now() - Configuration.rolling_training_window_size
        )

    # Download new metric data from prometheus
    new_metric_data = pc.get_metric_range_data(
        metric_name=metric_to_predict.metric_name,
        label_config=metric_to_predict.label_config,
        start_time=data_start_time,
        end_time=datetime.now(),
    )[0]

    # Train the new model
    start_time = datetime.now()
    predictor_model.train(new_metric_data, Configuration.retraining_interval_minutes)

    _LOGGER.info(
        "Total Training time taken = %s, for metric: %s %s",
        str(datetime.now() - start_time),
        metric_to_predict.metric_name,
        metric_to_predict.label_config,
    )
    return predictor_model
class PromSummarizer(object):
    def __init__(self, url, disable_ssl=False):
        self.prom = PrometheusConnect(url=url, disable_ssl=disable_ssl)

    def fetch(self, expression, number_of_days):
        start_time = parse_datetime('%dd' % number_of_days)
        end_time = parse_datetime('now')
        chunk_size = parse_timedelta('now', '1d')
        metric_data = self.prom.get_metric_range_data(
            expression,
            start_time=start_time,
            end_time=end_time,
            chunk_size=chunk_size,
        )
        # MetricsList combines the chunks into a single metric
        metric = MetricsList(metric_data)[0]
        # Yield tuples of timestamp, value
        for value in metric.metric_values.values:
            ts, val = value.tolist()
            # The timestamp is delivered in UTC, convert to local
            ts = ts.to_pydatetime().replace(tzinfo=tz.tzutc())
            ts = ts.astimezone(tz.tzlocal())
            yield ts, val
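# A minimal usage sketch of the PromSummarizer above (hypothetical, not part of
# the original snippet): the Prometheus URL is a placeholder and "up" is just an
# example expression. fetch() yields (local timestamp, value) tuples.
summarizer = PromSummarizer(url="http://localhost:9090")
for ts, val in summarizer.fetch("up", number_of_days=1):
    print(ts.isoformat(), val)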
def timed_job():
    config = configparser.ConfigParser()
    config.read('config/config.cfg')
    account = config.get('DEFAULT', 'ACCOUNT')
    key = config.get('DEFAULT', 'KEY')
    promi = config.get('DEFAULT', 'PROM')
    promup = promi.encode()
    container = config.get('DEFAULT', 'CONTAINER')
    url = config.get('DEFAULT', 'URL')

    blob_service = BlockBlobService(account_name=account, account_key=key)
    userAndPass = b64encode(promup).decode("ascii")
    headers = {'Authorization': 'Basic %s' % userAndPass}
    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)

    metric_data = prom.all_metrics()
    time = datetime.now()
    metrics = []
    values = []
    for i in metric_data:
        metric = prom.get_metric_range_data(metric_name=i,
                                            start_time=time - timedelta(hours=1),
                                            end_time=time,
                                            chunk_size=timedelta(hours=1))
        x = 0
        for d in metric:
            for name, dct in d.items():
                dct = dict(dct)
                if name == 'metric':
                    dct['id'] = x
                    metrics.append(dct)
                else:
                    for ts in dct:
                        va = {}
                        va['time'] = ts
                        va['value'] = dct[ts]
                        va['id'] = x
                        values.append(va)
            x = x + 1

    df = pd.DataFrame(metrics)
    df1 = pd.DataFrame(values)
    df = pd.merge(df, df1, how='inner', left_on=['id'], right_on=['id'])
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df = df.drop(['endpoint', 'service', 'id'], axis=1)
    write_pandas_dataframe_to_blob(
        blob_service, df, container,
        str((datetime.now()).date()) + '/' +
        str(datetime.now().time()).replace(':', '').replace(".", ''))
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """Profile GPU usage for a pod and build dynamic status annotations.

    If a key already exists, its value is replaced, e.g.
    {ai.centaurus.io/gpu0: {cur_mem_used: 4GB, max_gpu_util: 60, max_mem_cpy_util: 34,
                            cyclic: True, process_cnt: 1},
     ai.centaurus.io/gpu1: {cur_mem_used: 4GB, max_gpu_util: 60, max_mem_cpy_util: 34,
                            cyclic: True, process_cnt: 2,
                            processes: [{pid: 25678, cur_mem_used: 3GB},
                                        {pid: 67234, cur_mem_used: 1GB}]}}
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    # catch connection errors
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return ret_dict  # if the connection fails, return an empty dict
    instance = pod_ip + ":9400"  # temporarily fixed port
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(metric_name=metrics,
                                              label_config=my_label_config,
                                              start_time=start_time,
                                              end_time=end_time)
    # reorganize data to label_config and metric_values
    metric_object_list = MetricsList(metric_data)
    for item in metric_object_list:  # iterate through all the gpus on the node
        if 'gpu' not in item.label_config:  # handle metric config info exception
            continue
        id = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        # ip = item.label_config['instance']
        key = DOMAIN + "/gpu-" + id
        cur_usage = collect_cur_usage(int(id))
        # metric_values is a two-column df: 1st column is timestamp, 2nd is value
        ts = item.metric_values.iloc[:, 1]
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten the nested dictionary to a string, otherwise the
        # annotation update fails with error
        # "cannot unmarshal string into Go value of type map[string]interface {}"
        ret_dict[key] = str(cur_usage)
    return ret_dict
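# A hedged usage sketch of profiling() (the URL and pod IP below are
# placeholders, not from the original snippet): the returned dict maps
# annotation keys such as "<DOMAIN>/gpu-0" to a stringified usage dict,
# as described in the docstring above.
annotations = profiling(url="http://prometheus:9090", pod_ip="10.0.0.12", ana_window="2m")
for anno_key, usage in annotations.items():
    print(anno_key, usage)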
class PrometheusClient:
    def __init__(self, promhost, promport):
        self.prom = PrometheusConnect(url="http://%s:%s" % (promhost, promport),
                                      disable_ssl=True)

    def get_ticktime(self):
        return self.__get_metric_for_last_five_mins("overall_ticktime")[0].get("values")

    def get_dim_ticktime(self):
        result = {}
        dim_ticktimes = self.__get_metric_for_last_five_mins("dim_ticktime")
        for dimension in dim_ticktimes:
            result[dimension.get("metric").get("dimension_name")] = dimension.get("values")
        return result

    def get_players(self):
        players = []
        for p in self.prom.custom_query("player_playtime"):
            players.append(p.get("metric").get("player"))
        return players

    def get_tps(self):
        return self.__get_metric_for_last_five_mins("overall_tps")[0].get("values")

    def get_dim_tps(self):
        result = {}
        dim_tps = self.__get_metric_for_last_five_mins("dim_tps")
        for dimension in dim_tps:
            result[dimension.get("metric").get("dimension_name")] = dimension.get("values")
        return result

    def __get_metric_for_last_five_mins(self, metricname):
        return self.prom.get_metric_range_data(
            metric_name=metricname,
            start_time=datetime.datetime.now() - datetime.timedelta(minutes=5),
            end_time=datetime.datetime.now(),
        )
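# A minimal usage sketch of PrometheusClient (host and port are placeholders):
# each getter returns the raw "values" arrays from the underlying range query,
# i.e. lists of [timestamp, value] pairs covering the last five minutes.
client = PrometheusClient("localhost", 9090)
print(client.get_players())
print(client.get_tps())      # [[ts, value], ...] for overall_tps
print(client.get_dim_tps())  # {dimension_name: [[ts, value], ...]}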
class TestPrometheusConnect(unittest.TestCase):
    """
    Test module for class PrometheusConnect
    """

    def setUp(self):
        """
        set up connection settings for prometheus
        """
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def test_metrics_list(self):
        """
        Check if setup was done correctly
        """
        metrics_list = self.pc.all_metrics()
        self.assertTrue(len(metrics_list) > 0, "no metrics received from prometheus")

    def test_get_metric_range_data(self):
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time)
        metric_objects_list = MetricsList(metric_data)
        self.assertTrue(len(metric_objects_list) > 0, "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() < metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp()
            > metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp()
            < metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):
        start_time = datetime.now() - timedelta(minutes=65)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=5)
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time,
                                                    chunk_size=chunk_size)
        metric_objects_list = MetricsList(metric_data)
        self.assertTrue(len(metric_objects_list) > 0, "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() < metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp()
            > metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp()
            < metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(self):
        start_time = datetime.now() - timedelta(minutes=20)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=10)
        with self.assertRaises(TypeError, msg="start_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time="20m",
                                              end_time=end_time,
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError, msg="end_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time="10m",
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError, msg="chunk_size accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time=end_time,
                                              chunk_size="10m")
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """
    Network is blocked in this testcase, see base class
    """

    def setUp(self):
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz', disable_ssl=True)

    def test_network_is_blocked(self):
        resp = requests.get('https://google.com')
        self.assertEqual(resp.status_code, 403)
        self.assertEqual(resp.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)
            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')
            self.assertEqual(len(handler.requests), 1)
            request = handler.requests[0]
            self.assertEqual(request.url, 'https://redhat.com/')

    def test_unauthorized(self):
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')", str(exc.exception))

    def test_broken_responses(self):
        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_current_metric_value("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_metric_range_data("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query_range("query", datetime.now(), datetime.now(), "1")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query("query")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

    def test_all_metrics_method(self):
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}
        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
        request = handler.requests[0]
        self.assertEqual(request.path_url, "/api/v1/label/__name__/values")
class TestPrometheusConnect(unittest.TestCase):
    """Test module for class PrometheusConnect."""

    def setUp(self):
        """Set up connection settings for prometheus."""
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def test_metrics_list(self):
        """Check if setup was done correctly."""
        metrics_list = self.pc.all_metrics()
        self.assertTrue(len(metrics_list) > 0, "no metrics received from prometheus")

    def test_get_metric_range_data(self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time)
        metric_objects_list = MetricsList(metric_data)
        self.assertTrue(len(metric_objects_list) > 0, "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() < metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp()
            > metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp()
            < metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=65)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=5)
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time,
                                                    chunk_size=chunk_size)
        metric_objects_list = MetricsList(metric_data)
        self.assertTrue(len(metric_objects_list) > 0, "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() < metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp()
            > metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp()
            < metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=20)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=10)
        with self.assertRaises(ValueError, msg="specified chunk_size is too big"):
            _ = self.pc.get_metric_range_data(
                metric_name="up",
                start_time=start_time,
                end_time=end_time,
                chunk_size=timedelta(minutes=30),
            )
        with self.assertRaises(TypeError, msg="start_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time="20m",
                                              end_time=end_time,
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError, msg="end_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time="10m",
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError, msg="chunk_size accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time=end_time,
                                              chunk_size="10m")

    def test_get_metric_aggregation(self):  # noqa D102
        operations = [
            "sum", "max", "min", "variance", "percentile_50", "deviation", "average"
        ]
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        step = "15"
        aggregated_values = self.pc.get_metric_aggregation(query="up",
                                                           operations=operations,
                                                           start_time=start_time,
                                                           end_time=end_time,
                                                           step=step)
        self.assertTrue(len(aggregated_values) > 0, "no values received after aggregating")

    def test_get_metric_aggregation_with_incorrect_input_types(self):  # noqa D102
        with self.assertRaises(TypeError, msg="operations accepted invalid value type"):
            _ = self.pc.get_metric_aggregation(query="up", operations="sum")

    def test_retry_on_error(self):  # noqa D102
        retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
        pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True, retry=retry)
        with self.assertRaises(requests.exceptions.RetryError,
                               msg="too many 400 error responses"):
            pc.custom_query("BOOM.BOOM!#$%")
def compute_true_positive_rate(forecasted_anomalies, labeled_anomalies):
    num_true_positive = sum((forecasted_anomalies.values == 1)
                            & (labeled_anomalies.values == 1))
    true_positive_rate = num_true_positive / sum(labeled_anomalies.values)
    return true_positive_rate


# Run for every metric defined in the METRICS_LIST
for metric in METRICS_LIST:
    # Download the train data from Prometheus
    train_data = MetricsList(
        pc.get_metric_range_data(
            metric_name=metric,
            start_time=Configuration.metric_start_time,
            end_time=Configuration.metric_train_data_end_time,
            chunk_size=Configuration.metric_chunk_size,
        ))

    # If the training data list downloaded is empty
    if not train_data:
        _LOGGER.error("No metric data received, please check the data window size")
        raise ValueError

    # If more than one time-series match the given metric, raise an error
    if len(train_data) > 1:
        _LOGGER.error("Multiple timeseries matching %s were found", metric)
        _LOGGER.error("The timeseries matched were: ")
        for timeseries in train_data:
            print(timeseries.metric_name, timeseries.label_config)
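# A small sanity check of the compute_true_positive_rate helper defined above,
# using toy pandas Series (hypothetical data, not from the project): two of the
# three labeled anomalies are also forecasted, so the expected rate is 2/3.
import pandas as pd

forecasted = pd.Series([1, 0, 1, 1, 0])
labeled = pd.Series([1, 1, 1, 0, 0])
print(compute_true_positive_rate(forecasted, labeled))  # 0.666...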
# list of ModelPredictor Objects shared between processes
PREDICTOR_MODEL_LIST = list()

pc = PrometheusConnect(
    url=Configuration.prometheus_url,
    headers=Configuration.prom_connect_headers,
    disable_ssl=True,
)

for metric in METRICS_LIST:
    # Initialize a predictor for all metrics first
    current_start_time = datetime.now() - Configuration.current_data_window_size
    metric_init = pc.get_metric_range_data(metric_name=metric,
                                           start_time=current_start_time,
                                           end_time=datetime.now())

    for unique_metric in metric_init:
        PREDICTOR_MODEL_LIST.append(
            model.MetricPredictor(
                unique_metric,
                rolling_data_window_size=Configuration.rolling_training_window_size,
            ))

# A gauge set for the predicted values
GAUGE_DICT = dict()
for predictor in PREDICTOR_MODEL_LIST:
    unique_metric = predictor.metric
    label_list = list(unique_metric.label_config.keys())
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """Query all DCGM metrics, on all instances and all gpus.

    Save the dumped data to csv files, one per instance/gpu pair.
    """
    # save the time first, in case multiple queries happen at different times later
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)

    # connect to the prometheus server, exit if the connection fails
    url = "http://prometheus:9090"  # use the service name instead of an ip to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)

    # get all metrics under the profiler job
    # note: some instances/gpus may not have all the metrics due to model variance
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}

    # take a snapshot of all the instances (pod ip)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # add instance to the query
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        gpus = metric_df.gpu.unique()
        # put each instance's gpus into the dictionary
        ins_gpu[ins] = gpus

    # if one particular instance is given, update instances
    if instance != '':
        instances = [instance, ]
    for ins in instances:
        if gpu_id != '':
            gpus = [gpu_id, ]
        else:
            gpus = ins_gpu[ins]
        print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)
                # reorganize data to label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values['ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']
            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
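# A hedged usage sketch of get_all_metrics() (assumes the in-cluster
# "http://prometheus:9090" service and the "profiler-pods" job used above):
# dump the last 30 minutes of DCGM metrics for every discovered instance/gpu,
# producing one "<instance>_<gpu>.csv" file per pair.
get_all_metrics(start_time='30m', end_time='now')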
    return true_positive_rate


for metric in METRICS_LIST:
    rolling_data_window = Configuration.rolling_data_window_size
    metric_start_time = str(
        dateparser.parse(rolling_data_window) -
        (dateparser.parse("now") - dateparser.parse(rolling_data_window)))

    # Download the initial training data from prometheus
    train_data = MetricsList(
        pc.get_metric_range_data(
            metric_name=metric,
            start_time=metric_start_time,
            end_time=rolling_data_window,
            chunk_size=None,
        ))

    # If the training data downloaded is empty
    if not train_data:
        _LOGGER.error("No metric data received, please check the data window size")
        raise ValueError

    # If more than one time-series match the given metric, raise an error
    if len(train_data) > 1:
        _LOGGER.error("Multiple timeseries matching %s were found", metric)
        _LOGGER.error("The timeseries matched were: ")
        for timeseries in train_data:
            print(timeseries.metric_name, timeseries.label_config)