    def test_init_single_metric(self):  # noqa D102
        self.assertEqual(
            1,
            len(MetricsList(self.raw_metrics_list[0][0])),
            "incorrect number of Metric objects initialized for a raw metric not in a list",
        )
        self.assertEqual(
            1,
            len(MetricsList([self.raw_metrics_list[0][0]])),
            "incorrect number of Metric objects initialized for a single metric list",
        )
    def test_get_metric_range_data_with_chunk_size(self):
        start_time = datetime.now() - timedelta(minutes=65)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=5)
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time,
                                                    chunk_size=chunk_size)

        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() <
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp() >
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp() <
            metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
    def test_init(self):
        """Test if metrics initialized in the list are correct."""
        self.assertEqual(
            9,  # manually check the number of unique metric time-series
            len(MetricsList(self.raw_metrics_list)),
            "incorrect number of unique metric timeseries",
        )
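
# Hedged sketch (not part of the original tests): the raw-metric shape these
# MetricsList tests assume. A Prometheus range-query result item is a dict of
# "metric" labels plus "values" pairs, and MetricsList accepts either a single
# such dict or a list of them. Labels and timestamps below are made up.
from prometheus_api_client import MetricsList

raw_metric = {
    "metric": {"__name__": "up", "instance": "localhost:9090", "job": "prometheus"},
    "values": [[1564308000, "1"], [1564308030, "1"]],
}
assert len(MetricsList(raw_metric)) == 1
assert len(MetricsList([raw_metric])) == 1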
Example #4
    def test_get_metric_range_data(self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time)

        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() <
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp() >
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp() <
            metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )
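
    # Hedged sketch of the fixture these connectivity tests assume but the
    # snippet does not show: self.pc is a PrometheusConnect client created in
    # setUp(). The URL below is a placeholder, not the original test target.
    def setUp(self):  # noqa D102
        from prometheus_api_client import PrometheusConnect
        self.pc = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)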
Example #5
def general(metric_name):
    metric_data = pc.get_metric_range_data(
        metric_name,
        start_time=start_time,
        end_time=end_time,
        chunk_size=chunk_size,
    )

    metrics_object_list = MetricsList(metric_data)
    metric_object = metrics_object_list[0]
    value_list = metric_object.metric_values
    print(value_list)
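
# Hedged sketch of the module-level objects general() relies on; the URL and
# window sizes are placeholders rather than values from the original example.
from datetime import datetime, timedelta
from prometheus_api_client import PrometheusConnect

pc = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
start_time = datetime.now() - timedelta(hours=1)
end_time = datetime.now()
chunk_size = timedelta(minutes=10)

general("up")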
Example #6
    def _getMetricsData(metric):
        metric_data = pc.get_metric_range_data(
            metric,
            start_time=start_time,
            end_time=end_time,
            chunk_size=chunk_size,
        )

        metrics_object_list = MetricsList(metric_data)
        metric_object = metrics_object_list[0]
        value_list = metric_object.metric_values
        return value_list
Example #7
def pro():
    pc = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers={
            "Authorization":
            "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
        },
        disable_ssl=True)
    up_metric = MetricsList(
        pc.get_current_metric_value(
            metric_name="haproxy_backend_up{exported_namespace='prophet'}"))
    print(up_metric[0])
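
# Illustration added here (not in the original example): the same query
# sketched with label_config instead of an inline PromQL label selector; both
# forms are accepted by get_current_metric_value.
def pro_with_label_config(pc):
    up_metric = MetricsList(
        pc.get_current_metric_value(
            metric_name="haproxy_backend_up",
            label_config={"exported_namespace": "prophet"}))
    print(up_metric[0])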
    def test_unique_metric_combination(self):  # noqa D102
        start_time = datetime.datetime(2019, 7, 28, 10, 0)
        start_time_plus_1m = datetime.datetime(2019, 7, 28, 10, 1)
        end_time = datetime.datetime(2019, 7, 30, 10, 0)
        end_time_minus_1m = datetime.datetime(2019, 7, 30, 9, 59)

        self.assertTrue(
            MetricsList(self.raw_metrics_list)[0].start_time > start_time,
            "Combined metric start time incorrect",
        )
        self.assertTrue(
            MetricsList(self.raw_metrics_list)[0].start_time <
            start_time_plus_1m,
            "Combined metric start time incorrect",
        )
        self.assertTrue(
            MetricsList(self.raw_metrics_list)[0].end_time < end_time,
            "Combined metric end time incorrect",
        )
        self.assertTrue(
            MetricsList(self.raw_metrics_list)[0].end_time > end_time_minus_1m,
            "Combined metric end time incorrect",
        )
Example #9
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """if key exists, the value will be replaced,
       add dynamic status
       {ai.centaurus.io/gpu0:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:1},
        ai.centaurus.io/gpu1:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:2, processes:[{pid:25678, cur_mem_used:3GB},{pid:67234, cur_mem_used:1GB}]}                                 
       }
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    # bail out if the Prometheus connection check fails
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return ret_dict  # if the connection fails, return an empty dict
    instance = pod_ip + ":9400"  # DCGM exporter port, temporarily hard-coded
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(metric_name=metrics,
                                              label_config=my_label_config,
                                              start_time=start_time,
                                              end_time=end_time)
    # reorganize data to label_config and metric_values
    metric_object_list = MetricsList(metric_data)
    for item in metric_object_list: # iterate through all the gpus on the node
        if 'gpu' not in item.label_config:  # skip series that carry no gpu label
            continue
        id = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        # ip = item.label_config['instance']
        key = DOMAIN + "/gpu-" + id
        cur_usage = collect_cur_usage(int(id))
        ts = item.metric_values.iloc[:, 1]  # metric_values is a two-column df: first column is the timestamp, second is the value
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)       
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten nested dictionary to string, otherwise error "cannot unmarshal string into Go value of type map[string]interface {}""
        ret_dict[key] = str(cur_usage)
    return ret_dict
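
# collect_cur_usage, cyclic_pattern_detection, DOMAIN and MEM_UTIL are defined
# elsewhere in the original project. As a hedged illustration only, a simple
# autocorrelation-based cyclic_pattern_detection could look like the sketch
# below; it is not the project's actual implementation.
import numpy as np

def cyclic_pattern_detection(ts, min_lag=2, threshold=0.8):
    """Return (is_cyclic, period_in_samples) for a pandas Series of values."""
    values = ts.to_numpy(dtype=float)
    values = values - values.mean()
    energy = float(np.dot(values, values))
    if len(values) < 2 * min_lag or energy == 0.0:
        return False, 0
    # autocorrelation at non-negative lags, normalized so that lag 0 equals 1
    acf = np.correlate(values, values, mode="full")[len(values) - 1:] / energy
    lag = int(np.argmax(acf[min_lag:])) + min_lag
    return bool(acf[lag] >= threshold), lag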
Example #10
    def fetch(self, expression, number_of_days):
        start_time = parse_datetime('%dd' % number_of_days)
        end_time = parse_datetime('now')
        chunk_size = parse_timedelta('now', '1d')

        metric_data = self.prom.get_metric_range_data(
            expression,
            start_time=start_time,
            end_time=end_time,
            chunk_size=chunk_size,
        )

        # MetricsList combines the chunks into a single metric
        metric = MetricsList(metric_data)[0]

        # Yield tuples of timestamp, value
        for value in metric.metric_values.values:
            ts, val = value.tolist()

            # The timestamp is delivered in UTC, convert to local
            ts = ts.to_pydatetime().replace(tzinfo=tz.tzutc())
            ts = ts.astimezone(tz.tzlocal())

            yield ts, val
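
    # Hedged usage sketch: assuming the enclosing class holds a
    # PrometheusConnect client as self.prom, the generator returned by fetch()
    # can be consumed directly. Expression and window below are placeholders.
    def print_recent(self, expression="up", number_of_days=7):
        for ts, val in self.fetch(expression, number_of_days):
            print(ts.isoformat(), val)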
Example #11
def compute_true_positive_rate(forecasted_anomalies, labeled_anomalies):
    num_true_positive = sum((forecasted_anomalies.values == 1)
                            & (labeled_anomalies.values == 1))
    true_positive_rate = num_true_positive / sum(labeled_anomalies.values)

    return true_positive_rate
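
# Worked example with illustrative values: one of the two labeled anomalies is
# also forecasted, so the expected true positive rate is 1 / 2 = 0.5.
import pandas as pd

forecasted = pd.Series([0, 1, 1, 0])
labeled = pd.Series([0, 1, 0, 1])
print(compute_true_positive_rate(forecasted, labeled))  # 0.5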


# Run for every metric defined in the METRICS_LIST
for metric in METRICS_LIST:
    # Download the train data from Prometheus
    train_data = MetricsList(
        pc.get_metric_range_data(
            metric_name=metric,
            start_time=Configuration.metric_start_time,
            end_time=Configuration.metric_train_data_end_time,
            chunk_size=Configuration.metric_chunk_size,
        ))

    # If the training data list downloaded is empty
    if not train_data:
        _LOGGER.error(
            "No Metric data received, please check the data window size")
        raise ValueError

    # If more than one time-series matches the given metric, raise an error
    if len(train_data) > 1:
        _LOGGER.error("Multiple timeseries matching %s were found", metric)
        _LOGGER.error("The timeseries matched were: ")
        for timeseries in train_data:
Example #12
def get_metric_obj_list(metric_data):
    return MetricsList(metric_data)
Example #13
def update_values(models_include=None):
    """Update db_values for every TS.
    If Values record exists then updates its metric. If Values record does not exist then its created
    When Values record is created its predictor Model selected. Value record is associated with its TS.
    
    index (hash):
    {
        "metric" (Metric): first item of return value of MetricsList(get_metric_range_data())
        "ts" (tsKey): key of db_ts
        "model" (modelKey): key of db_models
    }

    Raises:
        Exception: [description]
        Exception: [description]
        Exception: [description]
        e: [description]
    """
    logger.info("Updating Values")
    now = datetime.now()
    generation = next(values_generation)
    for (h, ts) in db_ts.items():
        logger.debug("Updating [TS:{h}], labels:{labels}".format(
            h=h, labels=ts["labels"]))
        if h in db_values.keys():
            # TS is already tracked by a Values record in db_values
            current_start_time = now - Configuration.current_data_window_size
            record = db_values[h]
            metric = record["metric"]
            metric_data = pc.get_metric_range_data(
                metric_name=metric.metric_name,
                label_config=metric.label_config,
                start_time=current_start_time,
                end_time=now)
            metrics = MetricsList(metric_data)
            if len(metrics) != 1:
                raise Exception("There can be only one")
            new_metric = metrics[0] + metric

            trunk_metric = Metric(
                new_metric, current_start_time
            )  # This throws an exception almost immediately, but it would have solved the problem.
            db_values[h]["metric"] = trunk_metric
            db_values[h]["generation"] = generation
            logger.debug(
                "Update and truncate [Metric:{h}] horizon:{current_start_time} metric_name:{metric_name}, label_config:{label_config}"
                .format(h=h,
                        metric_name=metric.metric_name,
                        label_config=metric.label_config,
                        current_start_time=current_start_time))
        else:
            current_start_time = now - Configuration.current_data_window_size
            metric_name = ts["labels"]["__name__"]
            labels = dict()
            labels.update(ts["labels"])
            del labels["__name__"]

            items = db_models.items()
            if models_include is not None:
                items = filter(lambda item: item[0] in models_include, items)

            models = list(
                filter(
                    lambda model: ts_hash(all_labels=model[1]["labels"]) == h,
                    items))
            if len(models) == 0:
                logger.warning(
                    "No models matching labels for [Metric:{h}] metric_name:{metric_name}, label_config:{label_config}"
                    .format(h=h, metric_name=metric_name, label_config=labels))
                continue

            metric_data = pc.get_metric_range_data(
                metric_name=metric_name,
                label_config=labels,
                start_time=current_start_time,
                end_time=now)
            metrics = MetricsList(metric_data)
            if len(metrics) != 1:
                raise Exception("There can be only one")

            # pick the most recent model
            models.sort(key=lambda model: model[1].get(
                "timestamp", datetime.fromtimestamp(0)),
                        reverse=True)
            predictor = models[0][0]
            # predictor.build_prediction_df()
            record = {
                "metric": metrics[0],
                "ts": h,
                "model": predictor,
                "generation": generation
            }
            db_values.update({h: record})
            logger.debug(
                "Add [Metric:{h}] horizon:{current_start_time} metric_name:{metric_name}, label_config:{label_config}"
                .format(h=h,
                        metric_name=metric_name,
                        label_config=labels,
                        current_start_time=current_start_time))
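
# db_ts, db_values, db_models, values_generation, Configuration and ts_hash are
# defined elsewhere in the original project. As a rough, hypothetical sketch
# only, ts_hash might be a stable digest over the full label set:
import hashlib
import json

def ts_hash(all_labels):
    """Stable hash of a label dict (illustrative sketch, not the project's code)."""
    canonical = json.dumps(all_labels, sort_keys=True)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()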
Example #14
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """
    all DCGM metrics, on all instances, and all gpus
    save dumped data to csv file
    """
    # parse the time window up front so that later queries all use the same range
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)
    # connect to the prometheus server, exit if the connection fails
    url = "http://prometheus:9090"  # use the service name instead of an IP to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)
    # get all metrics under profiler job, note: some instances/gpus may not have all the metrics due to model variance
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}
    # take a snapshot of all the instances (pod IPs)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # add instance in query
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        gpus = metric_df.gpu.unique()
        # put each instance's gpus into dictionary
        ins_gpu[ins] = gpus

    my_label_config = {"job": "profiler-pods", "gpu": gpu_id}  # select gpu0
    #my_label_config = {"instance": instance}  # select all gpu
    # if one particular instance is given, update instances
    if instance != '':
        instances = [
            instance,
        ]
    for ins in instances:
        if gpu_id != '':
            gpus = [
                gpu_id,
            ]
        else:
            gpus = ins_gpu[ins]
            print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)

                # reorganize data to label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values[
                            'ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']

            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
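
# Hedged usage sketch: dump the last 30 minutes of DCGM metrics for every
# instance and GPU into per-GPU CSV files; the window values are placeholders.
get_all_metrics(start_time='30m', end_time='now')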


for metric in METRICS_LIST:

    rolling_data_window = Configuration.rolling_data_window_size
    metric_start_time = str(
        dateparser.parse(rolling_data_window) -
        (dateparser.parse("now") - dateparser.parse(rolling_data_window)))

    # Download the initial training data from prometheus
    train_data = MetricsList(
        pc.get_metric_range_data(
            metric_name=metric,
            start_time=metric_start_time,
            end_time=rolling_data_window,
            chunk_size=None,
        ))

    # If the training data downloaded is empty
    if not train_data:
        _LOGGER.error(
            "No Metric data received, please check the data window size")
        raise ValueError

    # If more than one time-series matches the given metric, raise an error
    if len(train_data) > 1:
        _LOGGER.error("Multiple timeseries matching %s were found", metric)
        _LOGGER.error("The timeseries matched were: ")
        for timeseries in train_data: