Example #1
    def request(self, db):

        (self.metrics, self.nodeset) = \
            db.get_metrics_results(self.cluster,
                                   self.job,
                                   ['cpus',
                                    'cpu-user',
                                    'cpu-system',
                                    'memory-pss'],
                                   self.period)
        self.stack_cpu_idle()
        profiler = Profiler()
        profiler.meta('producers', str(self.nodeset))
        profiler.meta('nodes', str(self.job.nodeset))
        profiler.meta('mutes', str(self.job.nodeset - self.nodeset))
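
For reference, the shape of the pair `get_metrics_results()` returns can be read off Example #3 below: a dict keyed by timestamp whose values are lists aligned with the requested metrics, plus a NodeSet of the nodes that actually reported data. A minimal sketch of these values, assuming NodeSet comes from ClusterShell (the import is not shown in these examples) and using made-up numbers:

    from ClusterShell.NodeSet import NodeSet  # assumed origin of NodeSet

    # Illustrative values only: one row per timestamp, aligned with
    # ['cpus', 'cpu-user', 'cpu-system', 'memory-pss'] as requested above.
    metrics = {
        '1445002640000': [6, 2.5, 0.3, 1048576],
        '1445002650000': [6, 2.7, 0.4, 1052672],
    }
    nodeset = NodeSet('cn[1-3]')            # nodes that reported metrics
    mutes = NodeSet('cn[1-4]') - nodeset    # job nodes that reported nothing
    print(mutes)                            # -> cn4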
Example #2
    def request(self, db):

        (self.metrics, self.nodeset) = \
            db.get_metrics_results(self.cluster,
                                   self.job,
                                   ['cpu-system',
                                    'cpu-iowait',
                                    'cpu-user',
                                    'cpu-softirq',
                                    'cpu-idle',
                                    'memory-pss',
                                    'memory-rss',
                                    'utilization_gpu',
                                    'utilization_memory',
                                    'cpus'],
                                   self.period)
        #self.stack_cpu_idle()
        profiler = Profiler()
        profiler.meta('producers', str(self.nodeset))
        profiler.meta('nodes', str(self.job.nodeset))
        profiler.meta('mutes', str(self.job.nodeset - self.nodeset))
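
Unlike Example #1, this variant requests 'cpu-idle' straight from InfluxDB, which is presumably why the stack_cpu_idle() call is commented out. The implementation of stack_cpu_idle() is not shown anywhere in these examples; the sketch below is purely hypothetical, illustrating the kind of derivation the name and the requested metrics suggest (every name and formula in it is an assumption, not from the source):

    def stack_cpu_idle_sketch(results, metrics):
        """Hypothetical: append a derived idle value to each timestamp row."""
        cpus_idx = metrics.index('cpus')
        user_idx = metrics.index('cpu-user')
        sys_idx = metrics.index('cpu-system')
        for values in results.values():
            # Assumes cpu-user/cpu-system are expressed in the same unit as
            # 'cpus' over the sampling interval (an assumption).
            values.append(values[cpus_idx] - values[user_idx] - values[sys_idx])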
Example #3
    def get_metrics_results(self, cluster, job, metrics, period):
        """Get the metrics of the job on the cluster for the period in parameters.

           It sends an HTTP request to InfluxDB service to download the metric
           values in JSON format and returns a list.
        """

        time_group = periods[period]

        profiler = Profiler()

        metrics_s = "\"" + "\", \"".join(metrics) + "\""
        req = "select mean(value) from {metrics} " \
              "where time > now() - {period} " \
              "and cluster = '{cluster}' " \
              "and job = 'job_{job}' " \
              "group by time({time_group}), node fill(0)" \
              .format(metrics=metrics_s,
                      period=period,
                      cluster=cluster,
                      job=job.jobid,
                      time_group=time_group)

        profiler.meta('metrics_req', req)

        payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

        profiler.start('metrics_req')
        resp = requests.get(url=self.url, params=payload)
        profiler.stop('metrics_req')
        if resp.status_code == 404:
            raise LookupError("metrics not found for job {job} on cluster "
                              "{cluster}"
                              .format(job=job.jobid,
                                      cluster=cluster))

        profiler.start('metrics_proc')
        data = json.loads(resp.text)

        # data is a dict with a 'results' key that is itself a list of dicts
        # with a 'series' key that is also a list of dicts, one dict per
        # metric/node association. Each dict has its own list of values. We
        # have to compute the sum of the values over all nodes at every
        # timestamp, for each metric.
        #
        # Ex:
        #
        # { "results": [
        #   { "series": [
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn2"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #        ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-system for cn3 ...)
        #
        #     { "name": "cpu-user",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-user for cn[2-3] ...)
        #
        #     { "name": "cpus",
        #       "tags": {"node":"admin"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",6],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "memory-pss",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then memory-pss for cn[2-3] ...)
        #
        #   ]}
        # ]}
        series = data['results'][0]['series']

        results = {}
        nodeset = NodeSet()

        for serie in series:
            metric = serie['name']
            node = serie['tags']['node'].encode('utf-8')

            if node not in nodeset:
                nodeset.update(node)

            for pair in serie['values']:
                timestamp = str(pair[0])
                value = pair[1]
                if timestamp not in results:
                    results[timestamp] = list()
                    for xidx in range(len(metrics)):
                        if xidx == metrics.index(metric):
                            results[timestamp].append(value)
                        else:
                            results[timestamp].append(0)
                else:
                    # The cpus/nodes metrics can be produced by several batch
                    # servers and thus returned multiple times by the InfluxDB
                    # server in the result of the request. We must take care
                    # not to add up the multiple results of these metrics here!
                    if metric in ['cpus', 'nodes']:
                        results[timestamp][metrics.index(metric)] = value
                    else:
                        results[timestamp][metrics.index(metric)] += value

        profiler.stop('metrics_proc')
        return (results, nodeset)
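
To make the aggregation rule in the comments above concrete, here is a self-contained sketch that runs the same summation over a hand-built stand-in for `data['results'][0]['series']`; all names and values are made up, and NodeSet is again assumed to come from ClusterShell:

    from ClusterShell.NodeSet import NodeSet  # assumed origin of NodeSet

    metrics = ['cpu-user', 'cpus']
    series = [
        {'name': 'cpu-user', 'tags': {'node': 'cn1'},
         'values': [['2015-10-16T11:37:20Z', 1.5]]},
        {'name': 'cpu-user', 'tags': {'node': 'cn2'},
         'values': [['2015-10-16T11:37:20Z', 2.0]]},
        # 'cpus' arrives twice (two batch servers), same value each time:
        {'name': 'cpus', 'tags': {'node': 'admin1'},
         'values': [['2015-10-16T11:37:20Z', 6]]},
        {'name': 'cpus', 'tags': {'node': 'admin2'},
         'values': [['2015-10-16T11:37:20Z', 6]]},
    ]

    results = {}
    nodeset = NodeSet()
    for serie in series:
        metric = serie['name']
        nodeset.update(serie['tags']['node'])
        for timestamp, value in serie['values']:
            row = results.setdefault(timestamp, [0] * len(metrics))
            if metric in ('cpus', 'nodes'):
                row[metrics.index(metric)] = value   # overwrite, never add
            else:
                row[metrics.index(metric)] += value  # sum across nodes

    print(results)  # {'2015-10-16T11:37:20Z': [3.5, 6]}
    print(nodeset)  # admin[1-2],cn[1-2]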
Example #4
    def get_metrics_results(self, cluster, job, metrics, period):
        """Get the metrics of the job on the cluster for the period in parameters.

           It sends an HTTP request to InfluxDB service to download the metric
           values in JSON format and returns a list.
        """
        timejob = job.end_time - job.start_time
        logger.debug("time job: %d", timejob)
        if timejob < 3600:
            period = "1h"
        elif timejob < 21600:
            period = "6h"

        time_group = periods[period]

        profiler = Profiler()

        metrics_s = "\"" + "\", \"".join(metrics) + "\""
        req = "select mean(value) from {metrics} " \
              "where cluster = '{cluster}' " \
              "and (( job = 'job_{job}' and time > now() - {period} ) or" \
              " ( job = 'none' and plugin = 'cuda' and time >= {start_time}000000000 and time <= {end_time}000000000 and node = '{nodes}' )) " \
              "group by time({time_group}), node fill(0)" \
              .format(metrics=metrics_s,
                      period=period,
                      cluster=cluster,
                      job=job.jobid,
                      nodes=job.nodeset,
                      start_time=job.start_time,
                      end_time=job.end_time,
                      time_group=time_group)

        logger.debug("req influx: %s", req)
        profiler.meta('metrics_req', req)

        payload = {'db': self.db, 'q': req, 'epoch': 'ms'}

        profiler.start('metrics_req')
        resp = requests.get(url=self.url, params=payload)
        profiler.stop('metrics_req')
        if resp.status_code == 404:
            raise LookupError("metrics not found for job {job} on cluster "
                              "{cluster}".format(job=job.jobid,
                                                 cluster=cluster))

        profiler.start('metrics_proc')

        json_data = json.loads(resp.text)

        # data is a dict with a 'results' key that is itself a list of dicts
        # with a 'series' key that is also a list of dicts, one dict per
        # metric/node association. Each dict has its own list of values. We
        # have to compute the sum of the values over all nodes at every
        # timestamp, for each metric.
        #
        # Ex:
        #
        # { "results": [
        #   { "series": [
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "cpu-system",
        #       "tags": {"node":"cn2"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #        ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-system for cn3 ...)
        #
        #     { "name": "cpu-user",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then cpu-user for cn[2-3] ...)
        #
        #     { "name": "cpus",
        #       "tags": {"node":"admin"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",6],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #     { "name": "memory-pss",
        #       "tags": {"node":"cn1"},
        #       "columns": ["time","mean"],
        #       "values": [
        #         ["2015-10-16T11:37:20Z",0],
        #         ["2015-10-16T11:37:30Z",0],
        #         ...
        #         ["2015-10-16T12:37:10Z",0],
        #         ["2015-10-16T12:37:20Z",0]
        #       ]
        #     },
        #
        #     ( ... then memory-pss for cn[2-3] ...)
        #
        #   ]}
        # ]}

        results = {}
        nodeset = NodeSet()
        for result in json_data['results']:
            if 'series' in result:
                series = result['series']
            else:
                logger.warning("No series in one result for query: %s", req)
                series = []

            for serie in series:
                metric = serie['name']
                node = serie['tags']['node'].encode('utf-8')

                if node not in nodeset:
                    nodeset.update(node)

                for pair in serie['values']:
                    timestamp = str(pair[0])
                    value = pair[1]
                    if timestamp not in results:
                        # init all values for timestamp to 0
                        results[timestamp] = [0] * len(metrics)
                    # The cpus/nodes metrics can be produced by several
                    # batch servers and thus returned multiple times by the
                    # InfluxDB server in the result of the request. We must
                    # take care not to add up the multiple results of these
                    # metrics here!
                    if metric in ['cpus', 'nodes']:
                        results[timestamp][metrics.index(metric)] = value
                    else:
                        results[timestamp][metrics.index(metric)] += value

        profiler.stop('metrics_proc')
        return (results, nodeset)
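
For clarity, here is roughly what the query template of this last example expands to, using hypothetical job values and an assumed `periods` mapping from period to group-by interval (neither the mapping nor the job class is shown in these examples):

    periods = {'1h': '10s', '6h': '30s'}  # assumed period -> time_group mapping

    class Job(object):  # hypothetical stand-in for the real job object
        jobid = 1234
        nodeset = 'cn[1-2]'
        start_time = 1445000000
        end_time = 1445002000

    job = Job()
    period = '1h'
    metrics_s = '"cpu-user", "utilization_gpu"'
    req = "select mean(value) from {metrics} " \
          "where cluster = '{cluster}' " \
          "and (( job = 'job_{job}' and time > now() - {period} ) or" \
          " ( job = 'none' and plugin = 'cuda' and time >= {start_time}000000000" \
          " and time <= {end_time}000000000 and node = '{nodes}' )) " \
          "group by time({time_group}), node fill(0)" \
          .format(metrics=metrics_s, period=period, cluster='mycluster',
                  job=job.jobid, nodes=job.nodeset,
                  start_time=job.start_time, end_time=job.end_time,
                  time_group=periods[period])
    print(req)
    # Prints (wrapped here for readability):
    # select mean(value) from "cpu-user", "utilization_gpu"
    # where cluster = 'mycluster'
    # and (( job = 'job_1234' and time > now() - 1h ) or
    #  ( job = 'none' and plugin = 'cuda' and time >= 1445000000000000000
    #  and time <= 1445002000000000000 and node = 'cn[1-2]' ))
    # group by time(10s), node fill(0)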