示例#1
0
    def collect_impl(self):
        """Gather one round of raw inputs and build container metrics.

        Network connections and docker stats are sampled directly. GPU,
        DCGM, InfiniBand and IPoIB data are read from their ``*_info_ref``
        holders using one shared timestamp, and the fresh docker stats are
        stored into ``stats_info_ref`` with that same timestamp.

        Returns:
            Whatever ``collect_container_metrics`` returns for these inputs.
        """
        cls = ContainerCollector

        connections = network.iftop(self.network_interface,
                                    cls.iftop_histogram,
                                    cls.iftop_timeout)
        docker_stats_obj = docker_stats.stats(cls.stats_histogram,
                                              cls.stats_timeout)

        # A single timestamp is used for every ref access in this round.
        timestamp = datetime.datetime.now()
        gpu = self.gpu_info_ref.get(timestamp)
        self.stats_info_ref.set(docker_stats_obj, timestamp)
        dcgm = self.dcgm_info_ref.get(timestamp)
        infiniband = self.infiniband_info_ref.get(timestamp)
        ipoib = self.ipoib_info_ref.get(timestamp)

        # Dump every collected input, one debug record per input.
        debug_dump = (
            ("all_conns is %s", connections),
            ("gpu_info is %s", gpu),
            ("stats_obj is %s", docker_stats_obj),
            ("dcgm_infos is %s", dcgm),
            ("infiniband_infos is %s", infiniband),
            ("ipoib_infos is %s", ipoib),
        )
        for message, value in debug_dump:
            logger.debug(message, value)

        return self.collect_container_metrics(
            docker_stats_obj, gpu, connections, dcgm, infiniband, ipoib)
示例#2
0
def main(argv):
    """Run the job exporter loop forever.

    Args:
        argv: ``argv[0]`` is the directory the ``.prom`` files are written
            to; ``argv[1]`` is the pause between iterations in seconds
            (converted with ``int``).

    Every iteration exports GPU metrics and job metrics to their files.
    An exception raised inside an iteration is logged and the loop goes on.
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_s = int(argv[1])

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    # Created once so the same recorders are handed to every iteration.
    zombies_type1 = ZombieRecorder()
    zombies_type2 = ZombieRecorder()

    round_no = 0

    while True:
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(round_no)))
            round_no += 1

            gpu_snapshot = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(gpu_snapshot))

            connections = network.iftop()
            logger.debug("iftop result is %s", connections)

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(
                job_prom_file,
                collect_job_metrics(gpu_snapshot, connections,
                                    zombies_type1, zombies_type2))
        except Exception:
            logger.exception("exception in job exporter loop")

        time.sleep(pause_s)
示例#3
0
def main(argv):
    """Run the job exporter loop forever, writing metrics as ``.prom`` files.

    Args:
        argv: ``argv[0]`` is the directory the ``.prom`` files are written
            to; ``argv[1]`` is the sleep between iterations in seconds
            (converted with ``int``).

    Each iteration exports the docker daemon status (when one is
    available), GPU metrics and job metrics. An exception raised while
    collecting is logged and the loop continues. The duration of every
    iteration is exported as ``job_exporter_iteration_seconds`` whether or
    not the iteration succeeded.
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    # Named ``iteration`` rather than ``iter`` to avoid shadowing the builtin.
    iteration = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info,
                                    name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status,
                                              name="docker_singleton")

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(iteration)))
            iteration += 1
            gpu_infos = gpu_singleton.try_get()

            docker_status = docker_status_singleton.try_get()
            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path,
                                             [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns,
                                              type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            logger.exception("exception in job exporter loop")
        finally:
            elapsed = datetime.datetime.now() - start

            # total_seconds() keeps the sub-second part and any whole days;
            # timedelta.seconds would silently drop both.
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {},
                       elapsed.total_seconds())
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        time.sleep(time_sleep_s)
示例#4
0
文件: collector.py 项目: zmoon111/pai
    def collect_impl(self):
        """Gather network, GPU and docker-stats inputs and build metrics.

        Returns:
            Whatever ``collect_container_metrics`` returns for these inputs.
        """
        cls = ContainerCollector

        connections = network.iftop(
            self.network_interface, cls.iftop_histogram, cls.iftop_timeout)

        # Replace the stored GPU info with None while reading it, so that if
        # nvidia-smi hangs until the next read, that read yields None.
        gpu = self.gpu_info_ref.get_and_set(None)

        docker_stats_obj = docker_stats.stats(
            cls.stats_histogram, cls.stats_timeout)
        # Store the fresh stats; the previously stored value is not used here.
        self.stats_info_ref.get_and_set(docker_stats_obj)

        logger.debug(
            "all_conns is %s, gpu_info is %s, stats_obj is %s",
            connections, gpu, docker_stats_obj)

        return self.collect_container_metrics(
            docker_stats_obj, gpu, connections)