示例#1
0
def main(argv):
    """Run the job exporter loop forever, writing metrics to ``.prom`` files.

    Args:
        argv: indexable arguments. ``argv[0]`` is the directory that receives
            ``gpu_exporter.prom``, ``job_exporter.prom``, ``docker.prom`` and
            ``time.prom``; ``argv[1]`` is the pause between iterations in
            seconds (converted with ``int``).

    Every iteration exports the docker daemon status (when one is available),
    the GPU metrics and the job metrics. Any ``Exception`` raised while doing
    so is logged and the loop continues. The wall-clock duration of each
    iteration is exported as ``job_exporter_iteration_seconds`` whether or
    not the iteration succeeded.
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    # Named ``iteration`` rather than ``iter`` to avoid shadowing the builtin.
    iteration = 0

    gpu_singleton = utils.Singleton(gpu_exporter.collect_gpu_info,
                                    name="gpu_singleton")
    docker_status_singleton = utils.Singleton(collect_docker_daemon_status,
                                              name="docker_singleton")

    # Created once, outside the loop, so the same recorders are passed to
    # collect_job_metrics on every iteration.
    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(iteration)))
            iteration += 1
            gpu_infos = gpu_singleton.try_get()

            docker_status = docker_status_singleton.try_get()
            if docker_status is not None:
                utils.export_metrics_to_file(docker_metrics_path,
                                             [docker_status])

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos, all_conns,
                                              type1_zombies, type2_zombies)
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            logger.exception("exception in job exporter loop")
        finally:
            elapsed = datetime.datetime.now() - start

            # total_seconds() reports the whole duration including the
            # fractional part; the ``seconds`` attribute is only the
            # seconds component of the timedelta (it ignores ``days`` and
            # ``microseconds``).
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {},
                       elapsed.total_seconds())
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        time.sleep(time_sleep_s)
示例#2
0
def main(argv):
    """Export GPU and job metrics to ``.prom`` files in an endless loop.

    Args:
        argv: indexable arguments. ``argv[0]`` is the directory that receives
            ``gpu_exporter.prom`` and ``job_exporter.prom``; ``argv[1]`` is
            the number of seconds to sleep between rounds (converted with
            ``int``).

    Any ``Exception`` raised during a round is logged, and the loop carries
    on after the sleep.
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_s = int(argv[1])

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    # Built once so the same recorder objects are handed to
    # collect_job_metrics on every round.
    zombies_type1 = ZombieRecorder()
    zombies_type2 = ZombieRecorder()

    round_no = 0
    while True:
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(round_no)))
            round_no += 1

            gpu_infos = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(gpu_infos))

            connections = network.iftop()
            logger.debug("iftop result is %s", connections)

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(
                job_prom_file,
                collect_job_metrics(gpu_infos, connections,
                                    zombies_type1, zombies_type2))
        except Exception:
            logger.exception("exception in job exporter loop")

        time.sleep(pause_s)
示例#3
0
def main(argv):
    """Export GPU and job metrics to ``.prom`` files in an endless loop.

    Args:
        argv: indexable arguments. ``argv[0]`` is the directory that receives
            ``gpu_exporter.prom`` and ``job_exporter.prom``; ``argv[1]`` is
            the number of seconds to sleep between rounds (converted with
            ``int``).

    Any ``Exception`` raised during a round is logged, and the loop carries
    on after the sleep.
    """
    output_dir = argv[0]
    gpu_prom_file = output_dir + "/gpu_exporter.prom"
    job_prom_file = output_dir + "/job_exporter.prom"
    pause_s = int(argv[1])

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    round_no = 0
    while True:
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(round_no)))
            round_no += 1

            gpu_infos = gpu_info_source.try_get()
            utils.export_metrics_to_file(
                gpu_prom_file,
                gpu_exporter.convert_gpu_info_to_metrics(gpu_infos))

            # join with docker stats metrics and docker inspect labels
            utils.export_metrics_to_file(job_prom_file,
                                         collect_job_metrics(gpu_infos))
        except Exception:
            logger.exception("exception in job exporter loop")

        time.sleep(pause_s)
示例#4
0
    def test_singleton_with_blocking_getter_allow_old_data(self):
        """Check try_get() while the getter is blocked, old_data_timeout_s=30.

        Expects ``None`` while the getter has never been able to finish, and
        ``100`` on every call after it has finished once, whether or not the
        test is currently holding the semaphore.
        """
        # The test holds this semaphore to stall blocking_getter and releases
        # it to let the getter finish.
        semaphore = threading.Semaphore(1)

        def blocking_getter():
            # Wait for the semaphore, hand it straight back, return 100.
            semaphore.acquire(blocking=True)
            semaphore.release()
            return 100

        # NOTE(review): presumably try_get() stops waiting for the getter
        # after get_timeout_s and may serve a cached value for up to
        # old_data_timeout_s -- confirm against utils.Singleton.
        singleton = utils.Singleton(blocking_getter,
                                    get_timeout_s=0.2,
                                    old_data_timeout_s=30)

        # From here blocking_getter cannot get past its acquire(); no
        # try_get() call has been made yet.
        semaphore.acquire()

        for _ in xrange(3):
            self.assertIsNone(singleton.try_get())

        semaphore.release()
        # let singleton cache one value
        self.assertEqual(100, singleton.try_get())

        # Repeat the block/unblock cycle three times.
        for _ in xrange(3):
            semaphore.acquire()

            for _ in xrange(3):
                # singleton returns old value
                self.assertEqual(100, singleton.try_get())

            semaphore.release()
            self.assertEqual(100, singleton.try_get())
示例#5
0
    def test_singleton_with_blocking_getter_no_old_data(self):
        """Check the (value, is_old) pair from try_get() with a blocked getter.

        Expects ``is_old`` to be false whenever the getter is free to run and
        true, with the value still ``100``, while the test holds the
        semaphore.

        NOTE(review): despite the ``no_old_data`` name, the inner loop asserts
        that a value is returned with ``is_old`` true -- confirm the intended
        meaning of the name.
        """
        # The test holds this semaphore to stall blocking_getter and releases
        # it to let the getter finish.
        semaphore = threading.Semaphore(1)

        def blocking_getter():
            # Wait for the semaphore, hand it straight back, return 100.
            semaphore.acquire(blocking=True)
            semaphore.release()
            return 100

        # Only get_timeout_s is passed here; any other Singleton options are
        # left at whatever defaults utils.Singleton defines.
        singleton = utils.Singleton(blocking_getter, get_timeout_s=0.2)

        # The semaphore is free, so the getter can complete on this call.
        val, is_old = singleton.try_get()
        self.assertIsNotNone(val)
        self.assertFalse(is_old)

        # Repeat the block/unblock cycle three times.
        for _ in xrange(3):
            semaphore.acquire()

            # Getter is blocked: the value is still 100 but flagged as old.
            for _ in xrange(3):
                val, is_old = singleton.try_get()
                self.assertEqual(100, val)
                self.assertTrue(is_old)

            # Getter is free again: the value is 100 and not flagged as old.
            semaphore.release()
            val, is_old = singleton.try_get()
            self.assertEqual(100, val)
            self.assertFalse(is_old)
示例#6
0
    def test_singleton_normal(self):
        """try_get() returns the getter's value on each of ten calls."""
        expected = 100

        def constant_getter():
            return expected

        cached = utils.Singleton(constant_getter)

        attempt = 0
        while attempt < 10:
            self.assertEqual(expected, cached.try_get())
            attempt += 1
示例#7
0
    def test_singleton_normal(self):
        """try_get() yields (100, not old) on each of ten calls."""
        expected = 100

        def constant_getter():
            return expected

        cached = utils.Singleton(constant_getter)

        attempt = 0
        while attempt < 10:
            result = cached.try_get()
            # Unpack exactly two items: the value and the staleness flag.
            value, stale = result
            self.assertEqual(expected, value)
            self.assertFalse(stale)
            attempt += 1
示例#8
0
def main(argv):
    """Set up file logging, then export GPU and job metrics forever.

    Args:
        argv: indexable arguments. ``argv[0]`` is the directory that receives
            ``gpu_exporter.log``, ``gpu_exporter.prom`` and
            ``job_exporter.prom``; ``argv[1]`` is the number of seconds to
            sleep between rounds (converted with ``int``).

    Any ``Exception`` raised during a round is logged, and the loop carries
    on after the sleep. A metrics file is only written when the
    corresponding metrics object is not ``None``.
    """
    log_dir = argv[0]
    gpu_prom_file = log_dir + "/gpu_exporter.prom"
    job_prom_file = log_dir + "/job_exporter.prom"
    pause_s = int(argv[1])

    # Root logger at INFO, writing to a rotating file (10 MiB per file,
    # 5 backups) in the log directory.
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    file_handler = RotatingFileHandler(log_dir + "/gpu_exporter.log",
                                       maxBytes=1024 * 1024 * 10,
                                       backupCount=5)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s"))
    root.addHandler(file_handler)

    round_no = 0

    gpu_info_source = utils.Singleton(gpu_exporter.collect_gpu_info)

    while True:
        try:
            logger.info(
                "job exporter running {0} iteration".format(str(round_no)))
            round_no += 1
            gpu_infos = gpu_info_source.try_get()

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            if gpu_metrics is not None:
                utils.export_metrics_to_file(gpu_prom_file, gpu_metrics)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(gpu_infos)
            if job_metrics is not None:
                utils.export_metrics_to_file(job_prom_file, job_metrics)
        except Exception:
            logger.exception("exception in job exporter loop")

        time.sleep(pause_s)