def main(args):
    """Entry point: run the watchdog loop on a daemon thread and serve metrics.

    Exposes /metrics and /healthz over HTTP on ``args.port``; blocks in
    ``reactor.run()``.
    """
    register_stack_trace_dump()
    burninate_gc_collector()

    try_remove_old_prom_file(args.log + "/watchdog.prom")

    # The loop thread publishes results through this ref; the collector reads them.
    shared_ref = AtomicRef()
    api_address = get_apiserver_address()

    worker = threading.Thread(target=loop, name="loop",
                              args=(api_address, args, shared_ref))
    worker.daemon = True
    worker.start()

    REGISTRY.register(CustomCollector(shared_ref))

    site_root = Resource()
    site_root.putChild(b"metrics", MetricsResource())
    site_root.putChild(b"healthz", HealthResource())
    reactor.listenTCP(int(args.port), Site(site_root))
    reactor.run()
def startService(self):
    """Start the buildbot service, then expose /metrics on ``self.port``."""
    log.msg("Starting Prometheus reporter")
    # NOTE(review): the bare `yield` implies an @defer.inlineCallbacks-style
    # decorator on this method; the decorator is outside this view — confirm.
    yield service.BuildbotService.startService(self)
    metrics_root = Resource()
    metrics_root.putChild(b'metrics', MetricsResource(registry=self.registry))
    self.server = reactor.listenTCP(self.port, Site(metrics_root))
    log.msg("Prometheus service starting on {}".format(self.server.port))
def main(args):
    """Entry point: spawn the loop and requestor worker threads, serve metrics.

    Blocks in ``reactor.run()`` serving /metrics and /healthz on ``args.port``.
    """
    register_stack_trace_dump()
    burninate_gc_collector()

    try_remove_old_prom_file(args.log + "/watchdog.prom")

    # Data older than two scrape intervals is treated as stale.
    decay_time = datetime.timedelta(seconds=float(args.interval) * 2)
    services_ref = AtomicRef(decay_time)
    loop_result_ref = AtomicRef(decay_time)

    workers = (
        ("loop", loop, (args, services_ref, loop_result_ref)),
        ("requestor", requestor, (args, services_ref)),
    )
    for thread_name, target_fn, target_args in workers:
        worker = threading.Thread(target=target_fn, name=thread_name,
                                  args=target_args)
        worker.daemon = True
        worker.start()

    REGISTRY.register(CustomCollector([loop_result_ref]))

    site_root = Resource()
    site_root.putChild(b"metrics", MetricsResource())
    site_root.putChild(b"healthz", HealthResource())
    reactor.listenTCP(int(args.port), Site(site_root))
    reactor.run()
def start_prometheus_exporter(ursula: 'Ursula',
                              prometheus_config: PrometheusMetricsConfig,
                              registry: CollectorRegistry = REGISTRY) -> None:
    """Configure, collect, and serve prometheus metrics."""
    from prometheus_client.twisted import MetricsResource
    from twisted.web.resource import Resource
    from twisted.web.server import Site

    metrics_collectors = create_metrics_collectors(
        ursula, prometheus_config.metrics_prefix)
    for metrics_collector in metrics_collectors:
        metrics_collector.initialize(
            metrics_prefix=prometheus_config.metrics_prefix, registry=registry)

    # TODO: was never used
    # "requests_counter": Counter(f'{metrics_prefix}_http_failures', 'HTTP Failures', ['method', 'endpoint']),

    # Periodically refresh all collectors.
    collection_task = task.LoopingCall(collect_prometheus_metrics,
                                       metrics_collectors=metrics_collectors)
    collection_task.start(interval=prometheus_config.collection_interval,
                          now=prometheus_config.start_now)

    # Serve both the plain-text and JSON metric endpoints.
    site_root = Resource()
    site_root.putChild(b'metrics', MetricsResource())
    site_root.putChild(b'json_metrics', JSONMetricsResource())
    reactor.listenTCP(prometheus_config.port, Site(site_root),
                      interface=prometheus_config.listen_address)
def exporter_thread(port):
    """Serve /metrics and /healthz on *port*; safe to run off the main thread."""
    site_root = Resource()
    site_root.putChild(b"metrics", MetricsResource())
    site_root.putChild(b"healthz", HealthResource())
    reactor.listenTCP(port, Site(site_root))
    # Signal handlers may only be installed from the main thread.
    reactor.run(installSignalHandlers=False)
def main():
    """Connect to OpenMLDB, schedule the collectors, and serve metrics."""
    cfg_store = ConfigStore()
    # assuming loglevel is bound to the string value obtained from the
    # command line argument.
    logging.basicConfig(level=cfg_store.get_log_level())

    conn = engine.create_engine(
        f"openmldb:///?zk={cfg_store.zk_root}&zkPath={cfg_store.zk_path}").connect()

    collectors = (
        TableStatusCollector(conn),
        DeployQueryStatCollector(conn),
        ComponentStatusCollector(conn),
    )
    task.LoopingCall(collect_task, collectors).start(cfg_store.pull_interval)

    site_root = Resource()
    # putChild requires a bytes path segment.
    site_root.putChild(cfg_store.telemetry_path.encode(), MetricsResource())
    reactor.listenTCP(cfg_store.listen_port, Site(site_root))
    reactor.run()
def run_metrics_server(port=8080):
    """Serve Prometheus metrics at /metrics over HTTP.

    Args:
        port: TCP port to listen on; defaults to 8080, matching the
            previously hard-coded value, so existing callers are unaffected.

    Blocks in ``reactor.run()`` — intended as the process main loop.
    """
    root = Resource()
    root.putChild(b'metrics', MetricsResource())
    factory = Site(root)
    reactor.listenTCP(port, factory)
    reactor.run()
def create_twisted_server(port, metrics_path):
    """Bind a metrics site on *port* at *metrics_path* and return the reactor.

    The reactor is not started; the caller is expected to run it.
    """
    site_root = Resource()
    site_root.putChild(metrics_path.encode("utf-8"), MetricsResource())
    reactor.listenTCP(port, Site(site_root))
    logging.info(f"listening port: {port}")
    return reactor
def run(self):
    """Serve /metrics and /heartbeat on the configured heartbeat endpoint."""
    site_root = Resource()
    site_root.putChild(b'metrics', MetricsResource())
    site_root.putChild(b'heartbeat', HeartBeatResource())
    reactor.listenTCP(CONF.heartbeat.bind_port, Site(site_root),
                      interface=CONF.heartbeat.bind_host)
    reactor.run()
def main(args):
    """Wire up all job-exporter collectors and serve their metrics over HTTP."""
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # Readings older than two scrape intervals are treated as stale.
    decay_time = datetime.timedelta(seconds=args.interval * 2)

    # AtomicRefs exchanging data between collectors:
    #   npu/nvidia info: GpuCollector -> ContainerCollector
    #   docker stats:    ContainerCollector -> ZombieCollector
    #   zombie info:     GpuCollector -> ZombieCollector
    #   dcgm info:       DCGMCollector -> ContainerCollector
    npu_info_ref = collector.AtomicRef(decay_time)
    nvidia_info_ref = collector.AtomicRef(decay_time)
    stats_info_ref = collector.AtomicRef(decay_time)
    zombie_info_ref = collector.AtomicRef(decay_time)
    dcgm_info_ref = collector.AtomicRef(decay_time)

    interval = args.interval
    # Most collectors only shell out briefly, so they sleep the full interval
    # to align with the prometheus scrape interval; container_collector's
    # loop has a ~20s 99th-percentile latency, so it sleeps less.
    specs = [
        ("npu_collector", interval, decay_time, collector.NpuCollector,
         npu_info_ref, zombie_info_ref, args.threshold),
        ("docker_daemon_collector", interval, decay_time,
         collector.DockerCollector),
        ("gpu_collector", interval, decay_time, collector.GpuCollector,
         nvidia_info_ref, zombie_info_ref, args.threshold),
        ("container_collector", max(0, interval - 18), decay_time,
         collector.ContainerCollector, nvidia_info_ref, stats_info_ref,
         args.interface, npu_info_ref, dcgm_info_ref),
        ("zombie_collector", interval, decay_time, collector.ZombieCollector,
         stats_info_ref, zombie_info_ref),
        ("process_collector", interval, decay_time,
         collector.ProcessCollector),
        ("dcgm_collector", interval, decay_time, collector.DCGMCollector,
         dcgm_info_ref),
    ]
    refs = [collector.make_collector(*spec) for spec in specs]
    REGISTRY.register(CustomCollector(refs))

    site_root = Resource()
    site_root.putChild(b"metrics", MetricsResource())
    site_root.putChild(b"healthz", HealthResource())
    reactor.listenTCP(int(args.port), Site(site_root))
    reactor.run()
def start_http_server(self, reactor):
    """Listen on $PORT (default 8000), all interfaces, serving /metrics."""
    metrics_root = resource.Resource()
    metrics_root.putChild(b'metrics', MetricsResource())
    port = os.getenv('PORT', 8000)
    # Colons inside endpoint-description values must be backslash-escaped,
    # hence \:\:0 for the "::0" any-address interface.
    description = r'tcp:interface=\:\:0:port={}'.format(port)
    endpoints.serverFromString(reactor, description).listen(
        server.Site(metrics_root))
    print('HTTP server listening on port {}'.format(port))
def getChild(self, path, request):
    """Route top-level paths to their resources; unknown paths get a 404."""
    # NOTE(review): path is compared against str literals — under Python 3
    # twisted delivers bytes; kept as-is, confirm the runtime Python version.
    if path == 'pools':
        return PoolsRoot()
    if path == 'alerts':
        return Alerts()
    # Metrics are only mounted when prometheus support is available.
    if prometheus_support and path == 'metrics':
        return MetricsResource()
    return Resp404()
def __init__(self, debug=False):
    """Attach health/metrics children; debug mode adds memory-introspection endpoints."""
    super(RootResource, self).__init__()
    self.putChild(b'health', HealthcheckResource())
    self.putChild(b'metrics', MetricsResource())
    if not debug:
        return
    # Diagnostics endpoints, exposed only when debugging.
    self.putChild(b'meminfo', MemInfoResource())
    self.putChild(b'objgrowth', ObjGrowthResource())
    self.putChild(b'objtypes', ObjTypesResource())
    self.putChild(b'objleak', ObjLeakResource())
def prometheus_exporter(reactor, port_string):
    """
    Create an ``IService`` that exposes Prometheus metrics from this
    process on an HTTP server on the given port.
    """
    metrics_root = Resource()
    metrics_root.putChild(b"metrics", MetricsResource())
    endpoint = serverFromString(reactor, port_string)
    return StreamServerEndpointService(endpoint, Site(metrics_root))
def render_GET(self, request):
    """Collect iLO power/fan/temperature metrics and render the metrics page.

    Query parameters: hostname, port, username, password (all required).
    Returns the Prometheus text exposition, or a 500 when the target server
    is powered off (its readings would be stale).
    """
    logger.info("Request processing started")
    hostname = (request.args[b"hostname"][0]).decode("utf-8")
    port = int(request.args[b"port"][0])
    username = (request.args[b"username"][0]).decode("utf-8")
    password = (request.args[b"password"][0]).decode("utf-8")
    # Security: never write the password to the log.
    logger.info(f"Initializing ILO with : {hostname} {port} {username}")
    ilo_client = Ilo(hostname=hostname, port=port, login=username,
                     password=password)
    # ILO was reporting stale metrics when the server turns off.
    # Doing this to prevent that.
    if ilo_client.get_host_power_status() == "OFF":
        request.setResponseCode(500)
        # twisted render methods must return bytes, not str, under Python 3.
        return b"Internal Server Error"
    power_status = ilo_client.get_host_power_saver_status()["host_power_saver"]
    logger.debug(f"Powered on Status: {power_status}")
    server_health = ilo_client.get_embedded_health()
    present_power_reading_str = server_health["power_supply_summary"][
        "present_power_reading"]
    logger.debug(f"Present power usage : {present_power_reading_str}")
    power_mode = server_health["power_supply_summary"]["high_efficiency_mode"]
    logger.debug(f"Power mode : {power_mode}")
    present_power_reading = present_power_reading_str.replace(
        "Watts", "").strip()
    present_power_reading_gauge.labels(
        power_mode, power_status).set(present_power_reading)
    for fan_name, fan_status in server_health["fans"].items():
        logger.debug(f"Fan name : {fan_name} , data: {fan_status}")
        fan_gauge.labels(fan_name).set(fan_status["speed"][0])
    for label, temperature_data in server_health["temperature"].items():
        logger.debug(f"Temperature Label : {label} , data: {temperature_data}")
        location = temperature_data["location"]
        currentreading = temperature_data["currentreading"]
        caution = temperature_data["caution"]
        critical = temperature_data["critical"]
        # Absent sensors report the string "N/A"; present readings are
        # (value, unit) sequences, so index 0 is the numeric value.
        if currentreading != "N/A":
            current_temperature_gauge.labels(location).set(currentreading[0])
        if caution != "N/A":
            caution_temperature_gauge.labels(location).set(caution[0])
        if critical != "N/A":
            critical_temperature_gauge.labels(location).set(critical[0])
    logger.info("Request processing finished")
    return MetricsResource().render_GET(request)
def main(args):
    """Register the YARN collector and serve /metrics and /healthz."""
    register_stack_trace_dump()
    burninate_gc_collector()

    REGISTRY.register(YarnCollector(args.yarn_url))

    site_root = Resource()
    site_root.putChild(b"metrics", MetricsResource())
    site_root.putChild(b"healthz", HealthResource())
    reactor.listenTCP(int(args.port), Site(site_root))
    reactor.run()
def __init__(self, reactor, port):
    """Run the Flask WSGI app under twisted with a /metrics sibling resource."""
    self.reactor = reactor
    self.reactor.addSystemEventTrigger('before', 'shutdown',
                                       self.before_shutdown)
    self.reactor.suggestThreadPoolSize(config.REACTOR_THREAD_POOL_MAX)
    self.app = create_app()
    # WSGI requests run on a dedicated thread pool, not the reactor pool.
    self.thread_pool = ThreadPool(maxthreads=config.FLASK_THREAD_POOL_MAX)
    self.thread_pool.start()
    wsgi_resource = WSGIResource(self.reactor, self.thread_pool, self.app)
    root_resource = RootResource(wsgi_resource)
    # NOTE(review): str child path implies Python 2-era twisted — confirm.
    root_resource.putChild("metrics", MetricsResource())
    self.bind = self.reactor.listenTCP(port, Site(root_resource))
    log.info('Provider is listening on {} ...'.format(port))
def initialize_prometheus_exporter(ursula, port: int) -> None:
    """Schedule periodic metric collection for *ursula* and serve /metrics."""
    from prometheus_client.twisted import MetricsResource
    from twisted.web.resource import Resource
    from twisted.web.server import Site

    from nucypher.utilities.metrics import collect_prometheus_metrics

    # Collect every 10 seconds.  TODO: make configurable
    metrics_task = task.LoopingCall(collect_prometheus_metrics, ursula=ursula)
    metrics_task.start(interval=10, now=False)

    site_root = Resource()
    site_root.putChild(b'metrics', MetricsResource())
    reactor.listenTCP(port, Site(site_root))
def start_webserver(conf):
    """Register the Tableau collector and serve metrics on the exporter port."""
    token_manager = TokenManager(conf['tableau_user'],
                                 conf['tableau_password'],
                                 conf['site'],
                                 conf['server_host'],
                                 conf['api_version'])
    REGISTRY.register(TableauMetricsCollector(token_manager))

    # Start up the server to expose the metrics.
    metrics_root = Resource()
    metrics_root.putChild(b'metrics', MetricsResource())
    logger.info('Starting webserver on {}'.format(conf['exporter_port']))
    reactor.listenTCP(conf['exporter_port'], Site(metrics_root))
    reactor.run()
def prometheus_exporter(reactor, port_string):
    """
    Create an ``IService`` that exposes Prometheus metrics from this
    process on an HTTP server on the given port.
    """
    parent = MultiService()

    metrics_root = Resource()
    metrics_root.putChild(b"metrics", MetricsResource())
    endpoint = serverFromString(reactor, port_string)
    StreamServerEndpointService(endpoint,
                                Site(metrics_root)).setServiceParent(parent)

    # Extra process-level metrics run as a sibling child service.
    _ExtraMetrics().setServiceParent(parent)
    return parent
def main(args):
    """Wire up the GPU/container/zombie collectors and serve their metrics.

    Blocks in ``reactor.run()`` serving /metrics and /healthz on ``args.port``.
    """
    register_stack_trace_dump()
    burninate_gc_collector()
    config_environ()

    # Remove prom files left behind by older exporter versions.
    for prom_name in ("gpu_exporter.prom", "job_exporter.prom", "docker.prom",
                      "time.prom", "configured_gpu.prom"):
        try_remove_old_prom_file(args.log + "/" + prom_name)

    configured_gpu_counter.set(
        get_gpu_count("/gpu-config/gpu-configuration.json"))

    # used to exchange gpu info between GpuCollector and ContainerCollector
    gpu_info_ref = collector.AtomicRef()
    # used to exchange docker stats info between ContainerCollector and
    # ZombieCollector
    stats_info_ref = collector.AtomicRef()

    interval = args.interval
    # Because all collectors except container_collector spend little time
    # calling external commands, they sleep the full interval to align with
    # the prometheus scrape interval. The 99th latency of the
    # container_collector loop is around 20s, so it sleeps less.
    collector_args = [
        ("docker_daemon_collector", interval, collector.DockerCollector),
        ("gpu_collector", interval / 2, collector.GpuCollector, gpu_info_ref),
        # FIX: clamp to >= 0 so a small --interval cannot produce a negative
        # sleep; matches the max(0, interval - 18) guard used by the newer
        # exporter main() in this file.
        ("container_collector", max(0, interval - 18),
         collector.ContainerCollector, gpu_info_ref, stats_info_ref,
         args.interface),
        ("zombie_collector", interval, collector.ZombieCollector,
         stats_info_ref),
    ]
    refs = list(map(lambda x: collector.make_collector(*x), collector_args))
    REGISTRY.register(CustomCollector(refs))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)
    reactor.run()
def run(self):
    """Serve Prometheus metrics over TCP and accept gateway metrics over UDP."""
    metrics_root = Resource()
    # TODO: add error counter
    metrics_root.putChild(b'metrics', MetricsResource())
    reactor.listenTCP(self._prometheus_port, Site(metrics_root),
                      interface='0.0.0.0')
    self.log.info('Start listen Prometheus (port: {prometheus_port})',
                  prometheus_port=self._prometheus_port)

    reactor.listenUDP(
        self._gateway_port,
        ReceiveMetricProtocol(log=self.log, metrics=self._metrics))
    self.log.info(
        'Start listen udp gateway (port: {gateway_port})',
        gateway_port=self._gateway_port
    )
    reactor.run()
def start_webserver(conf):
    """Register the Tableau collector (token or password auth) and serve metrics."""
    token_manager = TokenManager(conf.get('tableau_user'),
                                 conf.get('tableau_password'),
                                 conf['site'],
                                 conf['server_host'],
                                 conf['api_version'],
                                 token_name=conf.get('tableau_token_name'),
                                 token_secret=conf.get('tableau_token_secret'))
    # SSL verification stays off unless explicitly enabled in the config.
    REGISTRY.register(TableauMetricsCollector(
        token_manager, verify_ssl=conf.get('verify_ssl', False)))

    # Start up the server to expose the metrics.
    metrics_root = Resource()
    metrics_root.putChild(b'metrics', MetricsResource())
    logger.info('Starting webserver on {}'.format(conf['exporter_port']))
    reactor.listenTCP(conf['exporter_port'], Site(metrics_root))
    reactor.run()
def makeService(options):
    """Build a MultiService serving /metrics plus a periodic disk-stats timer."""
    from twisted.internet import reactor
    parent = MultiService()

    metrics_root = Resource()
    metrics_root.putChild(b"metrics", MetricsResource())
    StreamServerEndpointService(
        serverFromString(reactor, options["metrics-port"]),
        Site(metrics_root),
    ).setServiceParent(parent)

    # Re-collect disk stats every 15 seconds.
    TimerService(
        15,
        _get_collect_wrapper(options["host-mount-namespace"], ),
        _DiskStatsCollector(),
    ).setServiceParent(parent)
    return parent
def test_reports_metrics(self):
    """
    ``MetricsResource`` serves the metrics from the provided registry.
    """
    counter = Counter('cc', 'A counter', registry=self.registry)
    counter.inc()

    metrics_root = Resource()
    metrics_root.putChild(b'metrics', MetricsResource(registry=self.registry))
    server = reactor.listenTCP(0, Site(metrics_root))
    self.addCleanup(server.stopListening)

    url = "http://localhost:{port}/metrics".format(port=server.getHost().port)
    d = Agent(reactor).request(b"GET", url.encode("ascii"))
    d.addCallback(readBody)
    # The body must match exactly what the registry would generate.
    d.addCallback(self.assertEqual, generate_latest(self.registry))
    return d
def start(config):
    """Begin stderr logging, serve /metrics, and start one poller per server."""
    globalLogBeginner.beginLoggingTo([textFileLogObserver(sys.stderr)])

    metrics_root = Resource()
    metrics_root.putChild(b'metrics', MetricsResource())
    endpoint = endpoints.serverFromString(
        reactor, 'tcp:{}'.format(config['prometheus']['http_port']))
    endpoint.listen(QuietSite(metrics_root))

    # One shared gauge; each poller labels its own samples.
    perfmon = Gauge('cisco_perfmon_counter', 'Cisco PerfMon API Counter',
                    labelnames=('host', 'object', 'instance', 'counter'))
    for server in config['perfmon']:
        ServerPoller(perfmon, server['server_name'], server['server_address'],
                     server['server_port'], server['username'],
                     server['password'], server['verify'],
                     server['included_objects'])
def __init__(self, reactor, port):
    """Run the Flask app under twisted with /proxy and /metrics siblings."""
    self.reactor = reactor
    self.reactor.addSystemEventTrigger('before', 'shutdown',
                                       self.before_shutdown)
    self.reactor.addSystemEventTrigger('during', 'shutdown',
                                       self.during_shutdown)
    self.reactor.suggestThreadPoolSize(config.REACTOR_THREAD_POOL_MAX)
    self.app = create_app()
    # WSGI requests run on a dedicated thread pool, not the reactor pool.
    self.thread_pool = ThreadPool(maxthreads=config.FLASK_THREAD_POOL_MAX)
    self.thread_pool.start()
    wsgi_resource = WSGIResource(self.reactor, self.thread_pool, self.app)
    root_resource = RootResource(wsgi_resource)
    # NOTE(review): str child paths imply Python 2-era twisted — confirm.
    root_resource.putChild("proxy", ProxyResource(self.app))
    root_resource.putChild("metrics", MetricsResource())
    site = Site(root_resource)
    site.protocol = HTTPChannelWithClient
    self.bind = self.reactor.listenTCP(port, site)
    self._wait_for_end_active_sessions = getattr(config,
                                                 'WAIT_ACTIVE_SESSIONS', False)
    log.info('Server is listening on %s ...' % port)
import logging
from prometheus_client.twisted import MetricsResource
from twisted.web.server import Site
from twisted.web.resource import Resource
from twisted.internet.error import CannotListenError
from twisted.internet import reactor
from prometheus_client import Counter, Summary

# Module-level logger for this service.
log = logging.getLogger("subtocall")

# Expose Prometheus metrics at /metrics on port 8910.  If the port is
# already bound (e.g. by another worker process) log the error and carry
# on without the metrics endpoint instead of crashing at import time.
root = Resource()
root.putChild(b'metrics', MetricsResource())
factory = Site(root)
try:
    reactor.listenTCP(8910, factory)
except CannotListenError as e:
    log.error("ERROR: metrics: %s" % e)

# Prefix applied to every metric name this module exports.
NAMESPACE = 'pip_subtocall'
REDIS_PUSHS = Counter('%s_redis_push_total' % NAMESPACE,
                      'Counter (int) of outgoing redis push (queue entry)')
SUB_RECV_TIME = Summary(
    f"{NAMESPACE}_sub_recv_time",
    "Time it took until an event got received by this service from (subscription)"
)
def main(args):
    """Watchdog main loop: poll the k8s API and publish gauges for scraping.

    Starts the twisted metrics server on a daemon thread, then loops forever
    collecting pod/node/component status every ``args.interval`` seconds and
    swapping the fresh gauges into ``atomic_ref`` for the collector to read.
    """
    register_stack_trace_dump()
    burninate_gc_collector()

    try_remove_old_prom_file(args.log + "/watchdog.prom")

    address = args.k8s_api
    parse_result = urllib.parse.urlparse(address)
    api_server_scheme = parse_result.scheme
    api_server_ip = parse_result.hostname
    api_server_port = parse_result.port or 80

    ca_path = args.ca
    bearer_path = args.bearer
    if (ca_path is None) != (bearer_path is None):
        log.warning("please provide bearer_path and ca_path at the same time or not")

    headers = None
    # FIX: os.path.isfile(None) raises TypeError — guard the None case so a
    # missing --ca/--bearer argument degrades to unauthenticated requests
    # instead of crashing at startup.
    if ca_path is None or not os.path.isfile(ca_path):
        ca_path = None
    if bearer_path is None or not os.path.isfile(bearer_path):
        bearer_path = None
    if bearer_path is not None:
        with open(bearer_path, 'r') as bearer_file:
            bearer = bearer_file.read()
        headers = {'Authorization': "Bearer {}".format(bearer)}

    list_pods_url = "{}/api/v1/namespaces/default/pods/".format(address)
    list_nodes_url = "{}/api/v1/nodes/".format(address)

    atomic_ref = AtomicRef()
    REGISTRY.register(CustomCollector(atomic_ref))

    root = Resource()
    root.putChild(b"metrics", MetricsResource())
    root.putChild(b"healthz", HealthResource())
    factory = Site(root)
    reactor.listenTCP(int(args.port), factory)

    # FIX: the reactor runs off the main thread, so signal handlers cannot be
    # installed (signal.signal only works on the main thread) — same pattern
    # as exporter_thread() in this file.
    t = threading.Thread(
        target=lambda: reactor.run(installSignalHandlers=False),
        name="twisted")
    t.daemon = True
    t.start()

    while True:
        # These gauges are regenerated on each iteration.
        pai_pod_gauge = gen_pai_pod_gauge()
        pai_container_gauge = gen_pai_container_gauge()
        pai_node_gauge = gen_pai_node_gauge()
        k8s_gauge = gen_k8s_component_gauge()

        try:
            # 1. check service level status
            podsStatus = request_with_histogram(list_pods_url,
                                                list_pods_histogram,
                                                ca_path, headers)
            process_pods_status(pai_pod_gauge, pai_container_gauge, podsStatus)

            # 2. check nodes level status
            nodes_status = request_with_histogram(list_nodes_url,
                                                  list_nodes_histogram,
                                                  ca_path, headers)
            process_nodes_status(pai_node_gauge, nodes_status)

            # 3. check k8s level status
            collect_k8s_component(k8s_gauge, api_server_scheme, api_server_ip,
                                  api_server_port, ca_path, headers)
        except Exception:
            error_counter.labels(type="unknown").inc()
            logger.exception("watchdog failed in one iteration")

        # Publish even after a failure so partially-filled gauges decay
        # rather than serving stale data forever.
        atomic_ref.get_and_set([pai_pod_gauge, pai_container_gauge,
                                pai_node_gauge, k8s_gauge])
        time.sleep(float(args.interval))
def __init__(self, crawler):
    """Scrapy extension exposing crawler stats as Prometheus gauges over HTTP.

    Reads the PROMETHEUS_* settings, declares one gauge per tracked stat,
    mounts a MetricsResource under ``self.path``, and connects crawler
    signals so the gauges are updated while the spider runs.
    """
    # Honour the PROMETHEUS_ENABLED kill switch.
    if not crawler.settings.getbool('PROMETHEUS_ENABLED', True):
        raise NotConfigured
    self.tasks = []
    self.stats = crawler.stats
    self.crawler = crawler
    self.name = crawler.settings.get('BOT_NAME')
    # NOTE(review): the default port is a single-element list — presumably
    # the listen code takes ports[0]; confirm against the rest of the class.
    self.port = crawler.settings.get('PROMETHEUS_PORT', [9410])
    self.host = crawler.settings.get('PROMETHEUS_HOST', '0.0.0.0')
    self.path = crawler.settings.get('PROMETHEUS_PATH', 'metrics')
    self.interval = crawler.settings.get('PROMETHEUS_UPDATE_INTERVAL', 30)
    # Per-spider gauges mirroring scrapy's built-in stat names.
    self.spr_item_scraped = Gauge('spr_items_scraped', 'Spider items scraped',
                                  ['spider'])
    self.spr_item_dropped = Gauge('spr_items_dropped', 'Spider items dropped',
                                  ['spider'])
    self.spr_response_received = Gauge('spr_response_received',
                                       'Spider responses received',
                                       ['spider'])
    self.spr_opened = Gauge('spr_opened', 'Spider opened', ['spider'])
    self.spr_closed = Gauge('spr_closed', 'Spider closed',
                            ['spider', 'reason'])
    self.spr_downloader_request_bytes = Gauge(
        'spr_downloader_request_bytes', '...', ['spider'])
    self.spr_downloader_request_total = Gauge(
        'spr_downloader_request_total', '...', ['spider'])
    self.spr_downloader_request_count = Gauge('spr_downloader_request', '...',
                                              ['spider', 'method'])
    self.spr_downloader_response_count = Gauge('spr_downloader_response',
                                               '...', ['spider'])
    self.spr_downloader_response_bytes = Gauge(
        'spr_downloader_response_bytes', '...', ['spider'])
    self.spr_downloader_response_status_count = Gauge(
        'spr_downloader_response_status', '...', ['spider', 'code'])
    self.spr_log_count = Gauge('spr_log', '...', ['spider', 'level'])
    self.spr_duplicate_filtered = Gauge('spr_duplicate_filtered', '...',
                                        ['spider'])
    self.spr_memdebug_gc_garbage_count = Gauge('spr_memdebug_gc_garbage',
                                               '...', ['spider'])
    self.spr_memdebug_live_refs = Gauge('spr_memdebug_live_refs', '...',
                                        ['spider'])
    self.spr_memusage_max = Gauge('spr_memusage_max', '...', ['spider'])
    self.spr_memusage_startup = Gauge('spr_memusage_startup', '...',
                                      ['spider'])
    self.spr_scheduler_dequeued = Gauge('spr_scheduler_dequeued', '...',
                                        ['spider'])
    self.spr_scheduler_enqueued = Gauge('spr_scheduler_enqueued', '...',
                                        ['spider'])
    self.spr_scheduler_enqueued_memory = Gauge(
        'spr_scheduler_enqueued_memory', '...', ['spider'])
    self.spr_offsite_domains_count = Gauge('spr_offsite_domains', '...',
                                           ['spider'])
    self.spr_offsite_filtered_count = Gauge('spr_offsite_filtered', '...',
                                            ['spider'])
    self.spr_request_depth = Gauge('spr_request_depth', '...', ['spider'])
    self.spr_request_depth_max = Gauge('spr_request_depth_max', '...',
                                       ['spider'])
    root = resource.Resource()
    # NOTE(review): attribute name is misspelled ("promtheus") — kept as-is
    # because other methods may reference it by this exact name.
    self.promtheus = None
    root.putChild(self.path.encode('utf-8'), MetricsResource())
    # This object is itself the twisted Site serving the metrics resource.
    server.Site.__init__(self, root)
    # Wire crawler signals to the gauge-update handlers.
    crawler.signals.connect(self.engine_started, signals.engine_started)
    crawler.signals.connect(self.engine_stopped, signals.engine_stopped)
    crawler.signals.connect(self.spider_opened, signals.spider_opened)
    crawler.signals.connect(self.spider_closed, signals.spider_closed)
    crawler.signals.connect(self.item_scraped, signals.item_scraped)
    crawler.signals.connect(self.item_dropped, signals.item_dropped)
    crawler.signals.connect(self.response_received, signals.response_received)