def collect_impl(self):
    """Gather the inputs for one collection round and build container metrics.

    Runs iftop on ``self.network_interface`` and docker stats, stores the
    docker stats in ``self.stats_info_ref``, reads the GPU, DCGM, InfiniBand
    and IPoIB refs using one shared timestamp, and returns the result of
    ``self.collect_container_metrics`` called with all of them.
    """
    collector = ContainerCollector

    connections = network.iftop(
        self.network_interface,
        collector.iftop_histogram,
        collector.iftop_timeout,
    )
    container_stats = docker_stats.stats(
        collector.stats_histogram,
        collector.stats_timeout,
    )

    # The timestamp is taken after both commands return and is shared by
    # every ref access below.
    timestamp = datetime.datetime.now()

    gpu = self.gpu_info_ref.get(timestamp)
    self.stats_info_ref.set(container_stats, timestamp)
    dcgm = self.dcgm_info_ref.get(timestamp)
    infiniband = self.infiniband_info_ref.get(timestamp)
    ipoib = self.ipoib_info_ref.get(timestamp)

    logger.debug("all_conns is %s", connections)
    logger.debug("gpu_info is %s", gpu)
    logger.debug("stats_obj is %s", container_stats)
    logger.debug("dcgm_infos is %s", dcgm)
    logger.debug("infiniband_infos is %s", infiniband)
    logger.debug("ipoib_infos is %s", ipoib)

    return self.collect_container_metrics(
        container_stats,
        gpu,
        connections,
        dcgm,
        infiniband,
        ipoib,
    )
def main(argv):
    """Run the job exporter loop; this function never returns normally.

    Args:
        argv: Positional arguments. ``argv[0]`` is the directory the
            ``.prom`` files are written into; ``argv[1]`` is the pause
            between iterations in seconds (parsed with ``int``).

    Every iteration writes GPU metrics to ``gpu_exporter.prom`` and job
    metrics to ``job_exporter.prom`` inside that directory. An exception
    raised during an iteration is logged with its traceback and the loop
    continues with the next iteration.
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    time_sleep_s = int(argv[1])

    # Named `iteration` rather than `iter` so the builtin is not shadowed.
    iteration = 0

    singleton = utils.Singleton(gpu_exporter.collect_gpu_info)

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        try:
            # Lazy %-style arguments: same rendered text as before, but the
            # string is only built when the record is actually emitted.
            logger.info("job exporter running %s iteration", iteration)
            iteration += 1

            gpu_infos = singleton.try_get()
            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(
                gpu_infos, all_conns, type1_zombies, type2_zombies
            )
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            # Top-level boundary: log and keep the exporter alive.
            logger.exception("exception in job exporter loop")

        # Pause between iterations whether or not the iteration succeeded.
        time.sleep(time_sleep_s)
def main(argv):
    """Run the job exporter loop; this function never returns normally.

    Args:
        argv: Positional arguments. ``argv[0]`` is the directory the
            ``.prom`` files are written into; ``argv[1]`` is the pause
            between iterations in seconds (parsed with ``int``).

    Every iteration writes, inside that directory:

    * ``docker.prom`` -- the docker daemon status, only when the status
      singleton returned something other than ``None``;
    * ``gpu_exporter.prom`` -- metrics converted from the GPU info;
    * ``job_exporter.prom`` -- the job metrics;
    * ``time.prom`` -- how long the iteration took, written even when the
      iteration raised.

    An exception raised during an iteration is logged with its traceback
    and the loop continues with the next iteration.
    """
    log_dir = argv[0]
    gpu_metrics_path = log_dir + "/gpu_exporter.prom"
    job_metrics_path = log_dir + "/job_exporter.prom"
    docker_metrics_path = log_dir + "/docker.prom"
    time_metrics_path = log_dir + "/time.prom"
    time_sleep_s = int(argv[1])

    # Named `iteration` rather than `iter` so the builtin is not shadowed.
    iteration = 0

    gpu_singleton = utils.Singleton(
        gpu_exporter.collect_gpu_info, name="gpu_singleton"
    )
    docker_status_singleton = utils.Singleton(
        collect_docker_daemon_status, name="docker_singleton"
    )

    type1_zombies = ZombieRecorder()
    type2_zombies = ZombieRecorder()

    while True:
        start = datetime.datetime.now()
        try:
            # Lazy %-style arguments: same rendered text as before, but the
            # string is only built when the record is actually emitted.
            logger.info("job exporter running %s iteration", iteration)
            iteration += 1

            gpu_infos = gpu_singleton.try_get()

            docker_status = docker_status_singleton.try_get()
            if docker_status is not None:
                utils.export_metrics_to_file(
                    docker_metrics_path, [docker_status]
                )

            gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
            utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)

            all_conns = network.iftop()
            logger.debug("iftop result is %s", all_conns)

            # join with docker stats metrics and docker inspect labels
            job_metrics = collect_job_metrics(
                gpu_infos, all_conns, type1_zombies, type2_zombies
            )
            utils.export_metrics_to_file(job_metrics_path, job_metrics)
        except Exception:
            # Top-level boundary: log and keep the exporter alive.
            logger.exception("exception in job exporter loop")
        finally:
            end = datetime.datetime.now()
            # total_seconds() reports the whole elapsed duration.
            # timedelta.seconds is only the seconds *component*: it drops
            # the microseconds and wraps around once a day has elapsed.
            elapsed_s = (end - start).total_seconds()
            time_metrics = [
                Metric("job_exporter_iteration_seconds", {}, elapsed_s)
            ]
            utils.export_metrics_to_file(time_metrics_path, time_metrics)

        # Pause between iterations whether or not the iteration succeeded.
        time.sleep(time_sleep_s)
def collect_impl(self):
    """Gather the inputs for one collection round and build container metrics.

    Runs iftop on ``self.network_interface``, takes the current value out of
    ``self.gpu_info_ref``, runs docker stats and stores the result in
    ``self.stats_info_ref``, then returns the result of
    ``self.collect_container_metrics`` called with the three values.
    """
    collector = ContainerCollector

    connections = network.iftop(
        self.network_interface,
        collector.iftop_histogram,
        collector.iftop_timeout,
    )

    # Leave None behind while taking the current value: if nvidia-smi hangs
    # until the next time we read the ref, that read yields None.
    gpu = self.gpu_info_ref.get_and_set(None)

    container_stats = docker_stats.stats(
        collector.stats_histogram,
        collector.stats_timeout,
    )
    self.stats_info_ref.get_and_set(container_stats)

    logger.debug(
        "all_conns is %s, gpu_info is %s, stats_obj is %s",
        connections,
        gpu,
        container_stats,
    )

    return self.collect_container_metrics(container_stats, gpu, connections)