def timed_job():
    """Dump the last hour of every Prometheus metric to an Azure blob as CSV.

    Connection settings come from ``config/config.cfg`` (DEFAULT section):
    ``ACCOUNT``/``KEY`` for Azure blob storage, ``PROM`` for the Prometheus
    basic-auth credential, ``CONTAINER`` for the target container and ``URL``
    for the Prometheus server. The blob is named ``<date>/<time>`` with the
    separators stripped from the time component.
    """
    config = configparser.ConfigParser()
    config.read('config/config.cfg')
    account = config.get('DEFAULT', 'ACCOUNT')
    account_key = config.get('DEFAULT', 'KEY')
    prom_credential = config.get('DEFAULT', 'PROM')
    container = config.get('DEFAULT', 'CONTAINER')
    url = config.get('DEFAULT', 'URL')

    blob_service = BlockBlobService(account_name=account, account_key=account_key)
    user_and_pass = b64encode(prom_credential.encode()).decode("ascii")
    headers = {'Authorization': 'Basic %s' % user_and_pass}
    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)

    now = datetime.now()
    metrics = []
    values = []
    # BUG FIX: the id counter must be global across all metric names.
    # Previously it was reset to 0 for every metric, so the inner merge on
    # 'id' below cross-joined labels of one metric with samples of another.
    row_id = 0
    for metric_name in prom.all_metrics():
        metric = prom.get_metric_range_data(
            metric_name=metric_name,
            start_time=now - timedelta(hours=1),
            end_time=now,
            chunk_size=timedelta(hours=1))
        for chunk in metric:
            for name, dct in chunk.items():
                dct = dict(dct)
                if name == 'metric':
                    # label dict for this series
                    dct['id'] = row_id
                    metrics.append(dct)
                else:
                    # timestamp -> sample value mapping
                    for ts, value in dct.items():
                        values.append({'time': ts, 'value': value, 'id': row_id})
            row_id += 1

    labels_df = pd.DataFrame(metrics)
    values_df = pd.DataFrame(values)
    df = pd.merge(labels_df, values_df, how='inner',
                  left_on=['id'], right_on=['id'])
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df = df.drop(['endpoint', 'service', 'id'], axis=1)

    # Take a single timestamp so the date and time parts of the blob name
    # cannot come from different instants (e.g. across midnight).
    stamp = datetime.now()
    blob_name = (str(stamp.date()) + '/'
                 + str(stamp.time()).replace(':', '').replace('.', ''))
    write_pandas_dataframe_to_blob(blob_service, df, container, blob_name)
class TestPrometheusConnect(unittest.TestCase):
    """Integration tests for PrometheusConnect against a live server."""

    def setUp(self):
        """Create a client pointed at the server named by PROM_URL."""
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def test_metrics_list(self):
        """Verify the connection works by fetching the metric catalogue."""
        self.assertTrue(len(self.pc.all_metrics()) > 0,
                        "no metrics received from prometheus")

    def test_get_metric_range_data(self):
        """Fetched range data should fall inside the requested window."""
        window_start = datetime.now() - timedelta(minutes=10)
        window_end = datetime.now()
        fetched = self.pc.get_metric_range_data(metric_name="up",
                                                start_time=window_start,
                                                end_time=window_end)
        series = MetricsList(fetched)
        self.assertTrue(len(series) > 0, "no metrics received from prometheus")

        first = series[0]
        self.assertTrue(
            window_start.timestamp() < first.start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (window_start + timedelta(minutes=1)).timestamp()
            > first.start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            window_end.timestamp() > first.end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (window_end - timedelta(minutes=1)).timestamp()
            < first.end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):
        """Chunked fetches should still respect the overall window."""
        window_start = datetime.now() - timedelta(minutes=65)
        window_end = datetime.now() - timedelta(minutes=5)
        fetched = self.pc.get_metric_range_data(metric_name="up",
                                                start_time=window_start,
                                                end_time=window_end,
                                                chunk_size=timedelta(minutes=7))
        series = MetricsList(fetched)
        self.assertTrue(len(series) > 0, "no metrics received from prometheus")

        first = series[0]
        self.assertTrue(
            window_start.timestamp() < first.start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (window_start + timedelta(minutes=1)).timestamp()
            > first.start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            window_end.timestamp() > first.end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (window_end - timedelta(minutes=1)).timestamp()
            < first.end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(self):
        """Each time-related parameter must reject non-datetime values."""
        window_start = datetime.now() - timedelta(minutes=20)
        window_end = datetime.now() - timedelta(minutes=10)
        chunk = timedelta(minutes=7)

        with self.assertRaises(TypeError,
                               msg="start_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time="20m",
                                              end_time=window_end,
                                              chunk_size=chunk)
        with self.assertRaises(TypeError,
                               msg="end_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=window_start,
                                              end_time="10m",
                                              chunk_size=chunk)
        with self.assertRaises(TypeError,
                               msg="chunk_size accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=window_start,
                                              end_time=window_end,
                                              chunk_size="10m")
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """Tests run with all real network access blocked (see base class)."""

    def setUp(self):
        """The URL is irrelevant because every request is intercepted."""
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz',
                                    disable_ssl=True)

    def test_network_is_blocked(self):
        """Any outbound request should hit the blocking 403 stub."""
        resp = requests.get('https://google.com')
        self.assertEqual(resp.status_code, 403)
        self.assertEqual(resp.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        """Exercise the mock_response helper and its request recorder."""
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)

            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')

            self.assertEqual(len(handler.requests), 1)
            recorded = handler.requests[0]
            self.assertEqual(recorded.url, 'https://redhat.com/')

    def test_unauthorized(self):
        """A 403 from the server surfaces as PrometheusApiClientException."""
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(exc.exception))

    def test_broken_responses(self):
        """Every client entry point should raise on the blocked network."""
        api_calls = (
            lambda: self.pc.all_metrics(),
            lambda: self.pc.get_current_metric_value("metric"),
            lambda: self.pc.get_metric_range_data("metric"),
            lambda: self.pc.custom_query_range(
                "query", datetime.now(), datetime.now(), "1"),
            lambda: self.pc.custom_query("query"),
        )
        for call in api_calls:
            with self.assertRaises(PrometheusApiClientException) as exc:
                call()
            self.assertEqual("HTTP Status Code 403 (b'BOOM!')",
                             str(exc.exception))

    def test_all_metrics_method(self):
        """all_metrics should hit the label-values endpoint exactly once."""
        payload = {"status": "success", "data": ["up", "alerts"]}
        with self.mock_response(payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            recorded = handler.requests[0]
            self.assertEqual(recorded.path_url,
                             "/api/v1/label/__name__/values")
class TestPrometheusConnect(unittest.TestCase):
    """Integration tests for PrometheusConnect against a live server."""

    def setUp(self):
        """Create a client pointed at the server named by PROM_URL."""
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def test_metrics_list(self):
        """Verify the connection works by fetching the metric catalogue."""
        self.assertTrue(len(self.pc.all_metrics()) > 0,
                        "no metrics received from prometheus")

    def test_get_metric_range_data(self):  # noqa D102
        window_start = datetime.now() - timedelta(minutes=10)
        window_end = datetime.now()
        fetched = self.pc.get_metric_range_data(metric_name="up",
                                                start_time=window_start,
                                                end_time=window_end)
        series = MetricsList(fetched)
        self.assertTrue(len(series) > 0, "no metrics received from prometheus")

        first = series[0]
        self.assertTrue(
            window_start.timestamp() < first.start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (window_start + timedelta(minutes=1)).timestamp()
            > first.start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            window_end.timestamp() > first.end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (window_end - timedelta(minutes=1)).timestamp()
            < first.end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):  # noqa D102
        window_start = datetime.now() - timedelta(minutes=65)
        window_end = datetime.now() - timedelta(minutes=5)
        fetched = self.pc.get_metric_range_data(metric_name="up",
                                                start_time=window_start,
                                                end_time=window_end,
                                                chunk_size=timedelta(minutes=7))
        series = MetricsList(fetched)
        self.assertTrue(len(series) > 0, "no metrics received from prometheus")

        first = series[0]
        self.assertTrue(
            window_start.timestamp() < first.start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (window_start + timedelta(minutes=1)).timestamp()
            > first.start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            window_end.timestamp() > first.end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (window_end - timedelta(minutes=1)).timestamp()
            < first.end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(
            self):  # noqa D102
        window_start = datetime.now() - timedelta(minutes=20)
        window_end = datetime.now() - timedelta(minutes=10)
        chunk = timedelta(minutes=7)

        # A chunk larger than the whole window must be rejected outright.
        with self.assertRaises(ValueError,
                               msg="specified chunk_size is too big"):
            _ = self.pc.get_metric_range_data(
                metric_name="up",
                start_time=window_start,
                end_time=window_end,
                chunk_size=timedelta(minutes=30),
            )
        with self.assertRaises(TypeError,
                               msg="start_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time="20m",
                                              end_time=window_end,
                                              chunk_size=chunk)
        with self.assertRaises(TypeError,
                               msg="end_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=window_start,
                                              end_time="10m",
                                              chunk_size=chunk)
        with self.assertRaises(TypeError,
                               msg="chunk_size accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=window_start,
                                              end_time=window_end,
                                              chunk_size="10m")

    def test_get_metric_aggregation(self):  # noqa D102
        operations = [
            "sum", "max", "min", "variance",
            "percentile_50", "deviation", "average",
        ]
        window_start = datetime.now() - timedelta(minutes=10)
        window_end = datetime.now()
        aggregated_values = self.pc.get_metric_aggregation(
            query="up",
            operations=operations,
            start_time=window_start,
            end_time=window_end,
            step="15")
        self.assertTrue(len(aggregated_values) > 0,
                        "no values received after aggregating")

    def test_get_metric_aggregation_with_incorrect_input_types(
            self):  # noqa D102
        with self.assertRaises(TypeError,
                               msg="operations accepted invalid value type"):
            _ = self.pc.get_metric_aggregation(query="up", operations="sum")

    def test_retry_on_error(self):  # noqa D102
        retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
        pc = PrometheusConnect(url=self.prometheus_host,
                               disable_ssl=True,
                               retry=retry)
        with self.assertRaises(requests.exceptions.RetryError,
                               msg="too many 400 error responses"):
            pc.custom_query("BOOM.BOOM!#$%")
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """Dump all DCGM metrics to per-instance/per-gpu CSV files.

    Queries every metric whose name contains ``DCGM`` over the given window,
    for every instance and gpu discovered via ``DCGM_FI_DEV_GPU_UTIL`` (or
    only the ones given in *instance* / *gpu_id*), and writes one
    ``<instance>_<gpu>.csv`` file per pair.

    :param start_time: window start, anything ``parse_datetime`` accepts
    :param end_time: window end, anything ``parse_datetime`` accepts
    :param instance: optional single pod ip to restrict the dump to
    :param gpu_id: optional single gpu id to restrict the dump to
    """
    # Resolve the window once up front so every query below covers the exact
    # same interval. BUG FIX: the values were previously run through
    # parse_datetime a second time inside the query call — redundant, and
    # fragile if the parser does not accept already-parsed datetimes.
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)

    # Connect to the prometheus server, exit if the connection fails.
    # Service name instead of ip to be more robust.
    url = "http://prometheus:9090"
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)

    # All metrics under the profiler job; note some instances/gpus may not
    # have every metric due to model variance.
    metrics = [m for m in prom.all_metrics() if 'DCGM' in m]

    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}

    # Snapshot of all the instances (pod ip) currently reporting gpu util.
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    instances = MetricSnapshotDataFrame(metric_data).instance.unique()

    # Map each instance to the gpus it exposes.
    ins_gpu = dict()
    for ins in instances:
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        ins_gpu[ins] = MetricSnapshotDataFrame(metric_data).gpu.unique()

    # If one particular instance is given, restrict to it.
    if instance != '':
        instances = [instance]

    for ins in instances:
        gpus = [gpu_id] if gpu_id != '' else ins_gpu[ins]
        print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)
                # Reorganize data into label_config and metric_values.
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values['ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']
            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)