def main(args):
    """Wire up all metric collectors and serve Prometheus metrics over HTTP.

    Creates AtomicRef channels so collector threads can exchange data,
    starts one thread per collector, registers them with the Prometheus
    registry, and runs a Twisted web server exposing /metrics and /healthz.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # Values older than two scrape intervals are considered stale.
    decay_time = datetime.timedelta(seconds=args.interval * 2)

    # AtomicRef channels between collector threads:
    # GpuCollector -> ContainerCollector (npu info)
    npu_info_ref = collector.AtomicRef(decay_time)
    # GpuCollector -> ContainerCollector (nvidia info)
    nvidia_info_ref = collector.AtomicRef(decay_time)
    # ContainerCollector -> ZombieCollector (docker stats)
    stats_info_ref = collector.AtomicRef(decay_time)
    # GpuCollector -> ZombieCollector (zombie info)
    zombie_info_ref = collector.AtomicRef(decay_time)
    # DCGMCollector -> ContainerCollector (dcgm info)
    dcgm_info_ref = collector.AtomicRef(decay_time)

    interval = args.interval
    # Every collector except container_collector spends little time calling
    # external commands, so each sleeps the full interval to align with the
    # prometheus scrape interval. container_collector's loop has a 99th
    # percentile latency of roughly 18-20s, hence its reduced (never
    # negative) sleep.
    specs = [
        ("npu_collector", interval, decay_time, collector.NpuCollector,
         npu_info_ref, zombie_info_ref, args.threshold),
        ("docker_daemon_collector", interval, decay_time,
         collector.DockerCollector),
        ("gpu_collector", interval, decay_time, collector.GpuCollector,
         nvidia_info_ref, zombie_info_ref, args.threshold),
        ("container_collector", max(0, interval - 18), decay_time,
         collector.ContainerCollector, nvidia_info_ref, stats_info_ref,
         args.interface, npu_info_ref, dcgm_info_ref),
        ("zombie_collector", interval, decay_time, collector.ZombieCollector,
         stats_info_ref, zombie_info_ref),
        ("process_collector", interval, decay_time,
         collector.ProcessCollector),
        ("dcgm_collector", interval, decay_time, collector.DCGMCollector,
         dcgm_info_ref),
    ]

    refs = [collector.make_collector(*spec) for spec in specs]

    REGISTRY.register(CustomCollector(refs))

    # Serve /metrics and /healthz via Twisted.
    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    site = Site(root)
    reactor.listenTCP(int(args.port), site)
    reactor.run()
def test_base_collector(self):
    """ actually setup DockerCollector thread, and test, since this is
    multi-thread test case, maybe sensitive to the system load """
    ref = collector.make_collector("test_docker_collector2", 0.5,
                                   collector.DockerCollector)

    # Poll the collector's ref for up to ~1s (10 x 0.1s) until the
    # background thread publishes its first metrics.
    metrics = None
    attempts = 0
    while attempts < 10:
        metrics = ref.get()
        if metrics is not None:
            break
        time.sleep(0.1)
        attempts += 1

    self.assert_metrics(metrics)
def main(args):
    """Wire up metric collectors and serve Prometheus metrics via Twisted.

    Removes stale .prom files left by the old textfile-based exporter,
    starts one thread per collector (exchanging data through AtomicRef
    channels), registers them with the Prometheus registry, and serves
    /metrics and /healthz over HTTP.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    # Clean up leftovers from the previous node-exporter textfile setup.
    try_remove_old_prom_file(args.log + "/gpu_exporter.prom")
    try_remove_old_prom_file(args.log + "/job_exporter.prom")
    try_remove_old_prom_file(args.log + "/docker.prom")
    try_remove_old_prom_file(args.log + "/time.prom")
    try_remove_old_prom_file(args.log + "/configured_gpu.prom")

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()

    # used to exchange docker stats info between ContainerCollector and
    # ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collector except container_collector will spent little time
    # in calling external command to get metrics, so they need to sleep 30s to
    # align with prometheus scrape interval. The 99th latency of
    # container_collector loop is around 20s, so it should only sleep 10s to
    # adapt to scrape interval.
    # BUGFIX: clamp the container_collector sleep at 0 — with a small
    # --interval (<= 18) the old "interval - 18" went negative.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval / 2, collector.GpuCollector, gpu_info_ref),
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    refs = list(map(lambda x: collector.make_collector(*x), collector_args))

    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)
    reactor.run()
def main(args):
    """Wire up metric collectors and serve Prometheus metrics via wsgiref.

    Removes stale .prom files left by the old textfile-based exporter,
    starts one thread per collector (exchanging data through AtomicRef
    channels), registers them with the Prometheus registry, and serves the
    metrics endpoint with the stdlib WSGI server.
    """
    config_environ()

    # Clean up leftovers from the previous node-exporter textfile setup.
    try_remove_old_prom_file(args.log + "/gpu_exporter.prom")
    try_remove_old_prom_file(args.log + "/job_exporter.prom")
    try_remove_old_prom_file(args.log + "/docker.prom")
    try_remove_old_prom_file(args.log + "/time.prom")
    try_remove_old_prom_file(args.log + "/configured_gpu.prom")

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()

    # used to exchange docker stats info between ContainerCollector and
    # ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collector except container_collector will spent little time
    # in calling external command to get metrics, so they need to sleep 30s to
    # align with prometheus scrape interval. The 99th latency of
    # container_collector loop is around 20s, so it should only sleep 10s to
    # adapt to scrape interval.
    # BUGFIX: clamp the container_collector sleep at 0 — with a small
    # --interval (<= 18) the old "interval - 18" went negative.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval, collector.GpuCollector, gpu_info_ref),
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]

    refs = list(map(lambda x: collector.make_collector(*x), collector_args))

    REGISTRY.register(CustomCollector(refs))

    # Serve forever with the stdlib WSGI server on all interfaces.
    app = make_wsgi_app(REGISTRY)
    httpd = make_server("", int(args.port), app)
    httpd.serve_forever()