def test_collector(self):
    """Run one checks.d check through the collector and verify timing metrics.

    With ``check_timings`` enabled, every check handed to the collector must
    show up as a ``check:<name>`` tag on a ``datadog.agent.check_run_time``
    metric in the resulting payload.
    """
    agentConfig = {
        "api_key": "test_apikey",
        "check_timings": True,
        "collect_ec2_tags": True,
        "collect_instance_metadata": False,
        "version": "test",
        "tags": "",
    }

    # A single redisdb instance is the only check run by the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}],
    }
    loaded_checks = [load_check("redisdb", redis_config, agentConfig)]

    collector = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = collector.run(
        {"initialized_checks": loaded_checks, "init_failed_checks": {}}
    )

    # Flatten the tags of every timing metric into one list.
    seen_tags = [
        tag
        for metric in payload["metrics"]
        if metric[0] == "datadog.agent.check_run_time"
        for tag in metric[3]["tags"]
    ]

    # Each check must be represented by its own "check:<name>" tag.
    for loaded in loaded_checks:
        expected_tag = "check:%s" % loaded.name
        assert expected_tag in seen_tags, seen_tags
def test_collector(self):
    """Run one checks.d check through the collector and verify timing metrics.

    With ``check_timings`` enabled, every check handed to the collector must
    show up as a ``check:<name>`` tag on an ``sd.agent.check_run_time``
    metric in the resulting payload.
    """
    agentConfig = {
        'agent_key': 'test_agentkey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_instance_metadata': False,
        'create_dd_check_tags': False,
        'version': 'test',
        'tags': '',
    }

    # A single redisdb instance is the only check run by the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}],
    }
    loaded_checks = [load_check('redisdb', redis_config, agentConfig)]

    collector = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    run_input = {'initialized_checks': loaded_checks, 'init_failed_checks': {}}
    payload = collector.run(run_input)

    # Flatten the tags of every timing metric into one list.
    seen_tags = [
        tag
        for metric in payload['metrics']
        if metric[0] == 'sd.agent.check_run_time'
        for tag in metric[3]['tags']
    ]

    # Each check must be represented by its own "check:<name>" tag.
    for loaded in loaded_checks:
        expected_tag = "check:%s" % loaded.name
        assert expected_tag in seen_tags, seen_tags
def test_apptags(self):
    """Tests that the app tags are sent if specified so.

    With ``create_dd_check_tags`` enabled, running the collector with the
    redisdb check must add ``dd_check:redisdb`` to the system host tags of
    the payload.
    """
    agentConfig = {
        'api_key': 'test_apikey',
        'collect_ec2_tags': False,
        'collect_instance_metadata': False,
        'create_dd_check_tags': True,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}]
    }
    checks = [load_check('redisdb', redis_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })

    # We check that the redis DD_CHECK_TAG is sent in the payload.
    # assertIn reports the actual tag list on failure, unlike assertTrue.
    self.assertIn('dd_check:redisdb', payload['host-tags']['system'])
def test_apptags(self):
    """Tests that the app tags are sent if specified so.

    With ``create_sd_check_tags`` enabled, running the collector with the
    disk check must add ``sd_check:disk`` to the system host tags of the
    payload.
    """
    agentConfig = {
        'agent_key': 'test_agentkey',
        'collect_ec2_tags': False,
        'collect_orchestrator_tags': False,
        'collect_instance_metadata': False,
        'create_sd_check_tags': True,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    disk_config = {
        "init_config": {},
        "instances": [{}]
    }
    checks = [load_check('disk', disk_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })

    # We check that the disk SD_CHECK_TAG is sent in the payload.
    # assertIn reports the actual tag list on failure, unlike assertTrue.
    self.assertIn('sd_check:disk', payload['host-tags']['system'])
def test_hostname_metadata(self):
    """Collect hostname metadata and check that the expected keys exist."""
    collector = Collector({"collect_instance_metadata": True}, None, {}, "foo")
    hostname_metadata = collector._get_hostname_metadata()

    # Same three keys, checked in the same order as before.
    for required_key in ("hostname", "socket-fqdn", "socket-hostname"):
        assert required_key in hostname_metadata
def run(self):
    """Run the collector repeatedly until ``self.running`` becomes falsy.

    The emitters, system stats and checks.d checks are gathered once up
    front; each loop iteration runs the collector and then sleeps for
    ``self.config['check_freq']``.
    """
    collector = Collector(self.config, self.get_emitters(), get_system_stats())

    # checks.d checks are loaded a single time, before the loop starts.
    loaded_checks = load_check_directory(self.config)

    # Main agent loop: runs until interrupted via the `running` flag.
    while self.running:
        collector.run(checksd=loaded_checks)
        time.sleep(self.config['check_freq'])
def run(self):
    """Run the collector repeatedly until ``self.running`` becomes falsy.

    Logs the start-up, builds the collector from the configured emitters and
    the system stats, loads the checks.d checks once, then alternates
    between a collector run and a sleep of ``self.config["check_freq"]``.
    """
    log.debug("Windows Service - Starting collector")

    active_emitters = self.get_emitters()
    stats = get_system_stats()
    collector = Collector(self.config, active_emitters, stats)

    # checks.d checks are loaded a single time, before the loop starts.
    loaded_checks = load_check_directory(self.config)

    # Main agent loop: runs until interrupted via the `running` flag.
    while self.running:
        collector.run(checksd=loaded_checks)
        time.sleep(self.config["check_freq"])
class DDAgent(multiprocessing.Process):
    """Process that runs the collector loop for the Windows service.

    ``run()`` builds the collector and loops until ``self.running`` is
    cleared; ``stop()`` stops the collector (if one was created), stops
    JMXFetch when it is running, and clears the flag.
    """

    def __init__(self, agentConfig, hostname, heartbeat=None):
        """Store the configuration; the collector itself is built in run().

        agentConfig -- agent configuration mapping (read with [] and .get()).
        hostname    -- hostname passed to the collector and the check loader.
        heartbeat   -- optional object with a ``send()`` method, pinged once
                       per loop iteration when provided.
        """
        multiprocessing.Process.__init__(self, name='ddagent')
        self.config = agentConfig
        self.hostname = hostname
        self._heartbeat = heartbeat
        # Assigned in run(). Initialised here so that stop() does not raise
        # AttributeError when it is called before run() has created it.
        self.collector = None
        # FIXME: `running` flag should be handled by the service
        self.running = True
        self.is_enabled = True

    def run(self):
        """Initialise logging and the collector, then loop until stopped."""
        from config import initialize_logging
        initialize_logging('windows_collector')
        log.debug("Windows Service - Starting collector")
        emitters = self.get_emitters()
        systemStats = get_system_stats()
        self.collector = Collector(self.config, emitters, systemStats, self.hostname)

        # Load the checks.d checks
        checksd = load_check_directory(self.config, self.hostname)

        # Main agent loop will run until interrupted
        while self.running:
            if self._heartbeat:
                self._heartbeat.send(0)
            self.collector.run(checksd=checksd)
            time.sleep(self.config['check_freq'])

    def stop(self):
        """Stop the collector and JMXFetch, then clear the running flag."""
        log.debug("Windows Service - Stopping collector")
        if self.collector is not None:
            self.collector.stop()
        if JMXFetch.is_running():
            JMXFetch.stop()
        self.running = False

    def get_emitters(self):
        """Return the HTTP emitter plus any configured custom emitters.

        ``custom_emitters`` is a comma-separated config value; blank entries
        are ignored and every other entry is loaded with ``modules.load``.
        """
        emitters = [http_emitter]
        custom = [
            s.strip()
            for s in self.config.get('custom_emitters', '').split(',')
        ]
        for emitter_spec in custom:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        return emitters
def run(self):
    """Initialise the collector and run it in a loop until stopped.

    In developer mode the loop is profiled: the profiler is enabled before a
    collector run and disabled again once ``collector_profile_interval``
    profiled runs have been counted.
    """
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")
    set_win32_requests_ca_bundle_path()
    emitters = self.get_emitters()
    systemStats = get_system_stats()
    self.collector = Collector(self.config, emitters, systemStats, self.hostname)

    in_developer_mode = self.config.get('developer_mode')

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is compared against an integer counter below, so a value that
    # arrives as a string must be converted; fall back to the default when it
    # cannot be.
    try:
        collector_profile_interval = int(
            self.config.get('collector_profile_interval',
                            DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except (TypeError, ValueError):
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
    profiled = False
    collector_profiled_runs = 0

    # Load the checks.d checks
    checksd = load_check_directory(self.config, self.hostname)

    # Main agent loop will run until interrupted
    while self.running:
        if self._heartbeat:
            self._heartbeat.send(0)

        if in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=checksd)

        if profiled:
            if collector_profiled_runs >= collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))
            else:
                collector_profiled_runs += 1

        time.sleep(self.config['check_freq'])
def run(self):
    """Initialise logging and the collector, then loop until stopped.

    Each iteration runs the collector with the loaded checks.d checks and
    ``self.start_event``, then sleeps for ``self.config['check_freq']``.
    """
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")

    self.collector = Collector(
        self.config,
        self.get_emitters(),
        get_system_stats(),
        self.hostname,
    )

    # checks.d checks are loaded a single time, before the loop starts.
    loaded_checks = load_check_directory(self.config, self.hostname)

    # Main agent loop: runs until interrupted via the `running` flag.
    while self.running:
        self.collector.run(checksd=loaded_checks, start_event=self.start_event)
        time.sleep(self.config['check_freq'])
class DDAgent(multiprocessing.Process):
    """Process that runs the collector loop for the Windows service.

    ``run()`` builds the collector and loops until ``self.running`` is
    cleared; ``stop()`` stops the collector (if one was created), stops
    JMXFetch when it is running, and clears the flag.
    """

    def __init__(self, agentConfig, hostname, heartbeat=None):
        """Store the configuration; the collector itself is built in run().

        agentConfig -- agent configuration mapping (read with [] and .get()).
        hostname    -- hostname passed to the collector and the check loader.
        heartbeat   -- optional object with a ``send()`` method, pinged once
                       per loop iteration when provided.
        """
        multiprocessing.Process.__init__(self, name='ddagent')
        self.config = agentConfig
        self.hostname = hostname
        self._heartbeat = heartbeat
        # Assigned in run(). Initialised here so that stop() does not raise
        # AttributeError when it is called before run() has created it.
        self.collector = None
        # FIXME: `running` flag should be handled by the service
        self.running = True
        self.is_enabled = True

    def run(self):
        """Initialise logging and the collector, then loop until stopped."""
        from config import initialize_logging
        initialize_logging('windows_collector')
        log.debug("Windows Service - Starting collector")
        emitters = self.get_emitters()
        systemStats = get_system_stats()
        self.collector = Collector(self.config, emitters, systemStats, self.hostname)

        # Load the checks.d checks
        checksd = load_check_directory(self.config, self.hostname)

        # Main agent loop will run until interrupted
        while self.running:
            if self._heartbeat:
                self._heartbeat.send(0)
            self.collector.run(checksd=checksd)
            time.sleep(self.config['check_freq'])

    def stop(self):
        """Stop the collector and JMXFetch, then clear the running flag."""
        log.debug("Windows Service - Stopping collector")
        if self.collector is not None:
            self.collector.stop()
        if JMXFetch.is_running():
            JMXFetch.stop()
        self.running = False

    def get_emitters(self):
        """Return the HTTP emitter plus any configured custom emitters.

        ``custom_emitters`` is a comma-separated config value; blank entries
        are ignored and every other entry is loaded with ``modules.load``.
        """
        emitters = [http_emitter]
        custom = [s.strip() for s in self.config.get('custom_emitters', '').split(',')]
        for emitter_spec in custom:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        return emitters
def run(self):
    """Main loop of the collector.

    Installs the SIGTERM handler, persists the start-up status, builds the
    collector and then runs it until ``self.run_forever`` is cleared. On the
    way out the latest status is removed (best effort) and the process exits
    with status 0.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    agentConfig = self._set_agent_config_hostname(get_config())
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)
    self.collector = Collector(agentConfig, emitters, systemStats)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checksd)

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up. Removing the status is best effort, but a bare `except:`
    # would also swallow SystemExit and KeyboardInterrupt, so only ordinary
    # exceptions are caught and the failure is recorded at debug level.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        agent_logger.debug("Unable to remove the latest collector status", exc_info=True)

    # Explicitly kill the process, because it might be running
    # as a daemon.
    agent_logger.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, agentConfig=None, run_forever=True):
    """Main loop of the collector.

    agentConfig -- configuration mapping; loaded with ``get_config()`` when
                   None is passed.
    run_forever -- the collector loop runs for as long as this is truthy; it
                   is never modified inside the loop.
    """
    agentLogger = logging.getLogger('agent')
    systemStats = get_system_stats()

    if agentConfig is None:
        agentConfig = get_config()

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # DEPRECATED: when no hostname is configured and `use_ec2_instance_id`
    # is set, try to use the EC2 instance id as the hostname.
    hostname_missing = agentConfig.get('hostname') is None
    if hostname_missing and agentConfig.get('use_ec2_instance_id'):
        instanceId = EC2.get_instance_id()
        if instanceId is None:
            agentLogger.info('Not running on EC2, using hostname to identify this server')
        else:
            agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
            agentConfig['hostname'] = instanceId

    # The HTTP emitter is always present; custom emitters come from a
    # comma-separated config value, blank entries being skipped.
    emitters = [http_emitter]
    for raw_spec in agentConfig.get('custom_emitters', '').split(','):
        spec = raw_spec.strip()
        if spec:
            emitters.append(modules.load(spec, 'emitter'))

    check_freq = int(agentConfig['check_freq'])

    # Checks instance
    collector = Collector(agentConfig, emitters, systemStats)

    # The watchdog is enabled unless the config turns it off.
    watchdog = None
    if agentConfig.get("watchdog", True):
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
        watchdog.reset()

    # Main loop
    while run_forever:
        collector.run(checksd=checksd)
        if watchdog is not None:
            watchdog.reset()
        time.sleep(check_freq)
def test_decode_tzname(self, mock_platform):
    """Check ``Collector._decode_tzname`` against sample encoded timezone names.

    ``locale.getpreferredencoding`` is patched for each case. ``mock_platform``
    is not used in the body (presumably injected by a patch decorator that is
    not visible here).
    """
    patch_target = 'locale.getpreferredencoding'

    # Korean systems
    korean_raw = (
        '\xb4\xeb\xc7\xd1\xb9\xce\xb1\xb9 \xc7\xa5\xc1\xd8\xbd\xc3',
        '\xb4\xeb\xc7\xd1\xb9\xce\xb1\xb9 \xc0\xcf\xb1\xa4 \xc0\xfd\xbe\xe0 \xbd\xc3\xb0\xa3',
    )
    with mock.patch(patch_target, return_value='cp949'):
        decoded = Collector._decode_tzname(korean_raw)
        self.assertEquals(decoded, (u'대한민국 표준시', u'대한민국 일광 절약 시간'))

    # Japanese systems
    japanese_raw = (
        '\x93\x8c\x8b\x9e (\x95W\x8f\x80\x8e\x9e)',
        '\x93\x8c\x8b\x9e (\x89\xc4\x8e\x9e\x8a\xd4)',
    )
    with mock.patch(patch_target, return_value='cp932'):
        decoded = Collector._decode_tzname(japanese_raw)
        self.assertEquals(decoded, (u'東京 (標準時)', u'東京 (夏時間)'))

    # If the preferred encoding were to be invalid, an empty timezone is returned.
    with mock.patch(patch_target, return_value='invalidencoding'):
        decoded = Collector._decode_tzname(japanese_raw)
        self.assertEquals(decoded, ('', ''))
class DDAgent(threading.Thread):
    """Thread that runs the collector loop for the Windows service.

    ``run()`` builds the collector and loops until ``self.running`` is
    cleared; ``stop()`` stops the collector (if one was created), stops
    JMXFetch when it is running, and clears the flag.
    """

    def __init__(self, agentConfig):
        """Store the configuration; the collector itself is built in run()."""
        threading.Thread.__init__(self)
        self.config = agentConfig
        # Assigned in run(). Initialised here so that stop() does not raise
        # AttributeError when it is called before run() has created it.
        self.collector = None
        # FIXME: `running` flag should be handled by the service
        self.running = True

    def run(self):
        """Build the collector and run it in a loop until stopped."""
        log.debug("Windows Service - Starting collector")
        emitters = self.get_emitters()
        systemStats = get_system_stats()
        self.collector = Collector(self.config, emitters, systemStats)

        # Load the checks.d checks
        checksd = load_check_directory(self.config)

        # Main agent loop will run until interrupted
        while self.running:
            self.collector.run(checksd=checksd)
            time.sleep(self.config['check_freq'])

    def stop(self):
        """Stop the collector and JMXFetch, then clear the running flag."""
        log.debug("Windows Service - Stopping collector")
        if self.collector is not None:
            self.collector.stop()
        if JMXFetch.is_running():
            JMXFetch.stop()
        self.running = False

    def get_emitters(self):
        """Return the HTTP emitter plus any configured custom emitters.

        ``custom_emitters`` is a comma-separated config value; blank entries
        are ignored and every other entry is loaded with ``modules.load``.
        """
        emitters = [http_emitter]
        custom = [
            s.strip()
            for s in self.config.get('custom_emitters', '').split(',')
        ]
        for emitter_spec in custom:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        return emitters
def test_collector(self):
    """Run one checks.d check through the collector and verify timing metrics.

    With ``check_timings`` enabled, every check handed to the collector must
    show up as a ``check:<name>`` tag on a ``datadog.agent.check_run_time``
    metric in the resulting payload.
    """
    agentConfig = {
        'api_key': 'test_apikey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_instance_metadata': False,
        'create_dd_check_tags': False,
        'version': 'test',
        'tags': '',
    }

    # A single redisdb instance is the only check run by the collector.
    redis_instance = {"host": "localhost", "port": 6379}
    redis_config = {"init_config": {}, "instances": [redis_instance]}
    loaded_checks = [load_check('redisdb', redis_config, agentConfig)]

    collector = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = collector.run({
        'initialized_checks': loaded_checks,
        'init_failed_checks': {},
    })

    # Flatten the tags of every timing metric into one list.
    seen_tags = [
        tag
        for metric in payload['metrics']
        if metric[0] == 'datadog.agent.check_run_time'
        for tag in metric[3]['tags']
    ]

    # Each check must be represented by its own "check:<name>" tag.
    for loaded in loaded_checks:
        expected_tag = "check:%s" % loaded.name
        assert expected_tag in seen_tags, seen_tags
class DDAgent(threading.Thread):
    """Thread that runs the collector loop for the Windows service.

    ``run()`` builds the collector and loops until ``self.running`` is
    cleared; ``stop()`` stops the collector (if one was created), stops
    JMXFetch when it is running, and clears the flag.
    """

    def __init__(self, agentConfig):
        """Store the configuration; the collector itself is built in run()."""
        threading.Thread.__init__(self)
        self.config = agentConfig
        # Assigned in run(). Initialised here so that stop() does not raise
        # AttributeError when it is called before run() has created it.
        self.collector = None
        # FIXME: `running` flag should be handled by the service
        self.running = True

    def run(self):
        """Build the collector and run it in a loop until stopped."""
        log.debug("Windows Service - Starting collector")
        emitters = self.get_emitters()
        systemStats = get_system_stats()
        self.collector = Collector(self.config, emitters, systemStats)

        # Load the checks.d checks
        checksd = load_check_directory(self.config)

        # Main agent loop will run until interrupted
        while self.running:
            self.collector.run(checksd=checksd)
            time.sleep(self.config['check_freq'])

    def stop(self):
        """Stop the collector and JMXFetch, then clear the running flag."""
        log.debug("Windows Service - Stopping collector")
        if self.collector is not None:
            self.collector.stop()
        if JMXFetch.is_running():
            JMXFetch.stop()
        self.running = False

    def get_emitters(self):
        """Return the HTTP emitter plus any configured custom emitters.

        ``custom_emitters`` is a comma-separated config value; blank entries
        are ignored and every other entry is loaded with ``modules.load``.
        """
        emitters = [http_emitter]
        custom = [s.strip() for s in self.config.get('custom_emitters', '').split(',')]
        for emitter_spec in custom:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        return emitters
def run(self):
    """Initialise logging and the collector, then loop until stopped.

    Each iteration runs the collector with the loaded checks.d checks and
    ``self.start_event``, then sleeps for ``self.config['check_freq']``.
    """
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")

    active_emitters = self.get_emitters()
    stats = get_system_stats()
    self.collector = Collector(self.config, active_emitters, stats, self.hostname)

    # checks.d checks are loaded a single time, before the loop starts.
    loaded_checks = load_check_directory(self.config, self.hostname)

    # Main agent loop: runs until interrupted via the `running` flag.
    while self.running:
        self.collector.run(checksd=loaded_checks, start_event=self.start_event)
        time.sleep(self.config['check_freq'])
def run(self, config=None):
    """Main loop of the collector.

    config -- configuration mapping; when falsy it is loaded with
              ``get_config(parse_args=True)``.

    Installs the SIGTERM handler, persists the start-up status, builds the
    collector and then runs it until ``self.run_forever`` is cleared. On the
    way out the latest status is removed (best effort) and the process exits
    with status 0.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)
    self.collector = Collector(agentConfig, emitters, systemStats)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checksd)

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up. Removing the status is best effort, but a bare `except:`
    # would also swallow SystemExit and KeyboardInterrupt, so only ordinary
    # exceptions are caught and the failure is recorded at debug level.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        log.debug("Unable to remove the latest collector status", exc_info=True)

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self):
    """Initialise the collector and run it in a loop until stopped.

    In developer mode the loop is profiled: the profiler is enabled before a
    collector run and disabled again once ``collector_profile_interval``
    profiled runs have been counted.
    """
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")
    set_win32_requests_ca_bundle_path()
    emitters = self.get_emitters()
    systemStats = get_system_stats()
    self.collector = Collector(self.config, emitters, systemStats, self.hostname)

    in_developer_mode = self.config.get('developer_mode')

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is compared against an integer counter below, so a value that
    # arrives as a string must be converted; fall back to the default when it
    # cannot be.
    try:
        collector_profile_interval = int(
            self.config.get('collector_profile_interval',
                            DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except (TypeError, ValueError):
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
    profiled = False
    collector_profiled_runs = 0

    # Load the checks.d checks
    checksd = load_check_directory(self.config, self.hostname)

    # Main agent loop will run until interrupted
    while self.running:
        if self._heartbeat:
            self._heartbeat.send(0)

        if in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=checksd)

        if profiled:
            if collector_profiled_runs >= collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))
            else:
                collector_profiled_runs += 1

        time.sleep(self.config['check_freq'])
def main():
    """Command-line entry point: parse arguments and dispatch one command.

    Returns an integer exit status: 2 when no command is given, 3 for an
    unknown command, 1 for a malformed ``check`` invocation, the result of
    ``Agent.info`` for ``info``, and 0 otherwise.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    # Commands that need an Agent instance.
    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    # Commands handled without creating an Agent instance.
    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if not args:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile('sd-agent').get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if command == 'start':
        log.info('Start daemon')
        agent.start()

    elif command == 'stop':
        log.info('Stop daemon')
        agent.stop()

    elif command == 'restart':
        log.info('Restart daemon')
        agent.restart()

    elif command == 'status':
        agent.status()

    elif command == 'info':
        return Agent.info(verbose=options.verbose)

    elif command == 'foreground':
        logging.info('Running in foreground')
        if not autorestart:
            # Run in the standard foreground.
            agent.start(foreground=True)
        else:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def _run_child():
                agent.start(foreground=True)

            def _run_parent():
                agent.start_event = False

            AgentSupervisor.start(_run_parent, _run_child)

    elif command == 'check':
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            loaded = load_check_directory(agentConfig, hostname)
            want_rates = len(args) == 3 and args[2] == 'check_rate'
            for check in loaded['initialized_checks']:
                if check.name != check_name:
                    continue

                if in_developer_mode:
                    check.run = AgentProfiler.wrap_profiling(check.run)

                cs = Collector.run_single_check(check, verbose=True)
                print(CollectorStatus.render_check_status(cs))

                if want_rates:
                    # Rates need two samples, hence the second run.
                    print("Running 2nd iteration to capture rate metrics")
                    time.sleep(1)
                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                check.stop()

    # NOTE(review): 'configtest' is not listed in COMMANDS above, so it is
    # rejected before reaching this branch; 'flare' is listed but has no
    # branch here. Confirm whether either is intentional.
    elif command in ('configcheck', 'configtest'):
        configcheck()

    elif command == 'jmx':
        jmx_command(args[1:], agentConfig)

    return 0
def run(self, config=None):
    """Main loop of the collector.

    config -- configuration mapping; when falsy it is loaded with
              ``get_config(parse_args=True)``.

    Installs the signal handlers, persists the start-up status, builds the
    collector and runs it until ``self.run_forever`` is cleared. In developer
    mode the runs are profiled in batches of ``collector_profile_interval``.
    On the way out the latest status is removed (best effort) and the
    process exits with status 0.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)
    # Load the checks.d checks
    checksd = load_check_directory(agentConfig, hostname)

    self.collector = Collector(agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is compared against an integer counter below, so a value that
    # arrives as a string must be converted; fall back to the default when it
    # cannot be.
    try:
        collector_profile_interval = int(
            agentConfig.get('collector_profile_interval',
                            DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except (TypeError, ValueError):
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(
        agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)
        if profiled:
            if collector_profiled_runs >= collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            time.sleep(check_frequency)

    # Now clean-up. Removing the latest status is best effort.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    config -- configuration mapping; when falsy it is loaded with
              ``get_config(parse_args=True)``.

    Sequence: install the signal handlers, persist the start-up status, set
    up service discovery and the JMX pipe when configured, load the checks,
    build the collector, then loop until ``self.run_forever`` is cleared.
    Each iteration optionally reloads the check configs, runs the collector,
    polls the service-discovery backend for pending reloads, handles
    profiling and auto-restart, and sleeps for ``self.check_frequency``.
    On the way out the latest status is removed (best effort) and the
    process exits with status 0.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    # The procfs location is configurable; a trailing slash is stripped.
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    )
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
        pipe_path = get_jmx_pipe_path()
        # On Windows the pipe path is a format template; elsewhere it is a
        # directory that the pipe name is joined onto.
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            # Create the FIFO only when it does not exist yet.
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval',
                                  DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        # A non-numeric setting falls back to the default interval.
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    # `profiled` is True while the profiler is enabled; the counter tracks
    # how many runs the current profile has covered.
    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # A set-valued flag requests a partial reload of just those checks;
        # any other truthy value reloads everything.
        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=True if self.reload_configs_flag else False)

        # The reload request has been served; clear it for the next iteration.
        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           self.sd_backend.reload_check_configs:
            # Hand the backend's request over to the agent and reset it there.
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            # Stop the profile once enough runs have been counted.
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up. Removing the latest status is best effort.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
class Agent(Daemon):
    """
    The agent class is a daemon that runs the collector in a background process.
    """

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        self._agentConfig = {}
        self._checksd = []
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        self.check_frequency = None
        # This flag can be set to True, False, or a set of check names
        # (the main loop triggers a partial reload when it holds a set).
        self.reload_configs_flag = False
        self.sd_backend = None
        self.supervisor_proxy = None
        # File descriptor of the JMX service discovery pipe, None when disabled.
        self.sd_pipe = None

    def _handle_sigterm(self, signum, frame):
        """Handles SIGTERM and SIGINT, which gracefully stops the agent."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Handles SIGUSR1, which signals an exit with an autorestart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """Handles SIGHUP, which signals a configuration reload."""
        log.info("SIGHUP caught! Scheduling configuration reload before next collection run.")
        self.reload_configs_flag = True

    def reload_configs(self, checks_to_reload=None):
        """Reload the agent configuration and checksd configurations.

        Can also reload only an explicit set of checks.

        :param checks_to_reload: optional collection of check names. When it is
            empty or None, every initialized check is stopped and the whole
            check directory is loaded again.
        """
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)
        jmx_sd_configs = None

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
            if self._jmx_service_discovery_enabled:
                jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        else:
            new_checksd = copy(self._checksd)

            # JMX checks are handed to the JMX config generator, the others
            # are refreshed in-process.
            jmx_checks = [check for check in checks_to_reload if check in JMX_CHECKS]
            py_checks = set(checks_to_reload) - set(jmx_checks)
            self.refresh_specific_checks(hostname, new_checksd, py_checks)
            if self._jmx_service_discovery_enabled:
                jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname, jmx_checks)

            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")

    def refresh_specific_checks(self, hostname, checksd, checks):
        """take a list of checks and for each of them:
            - remove it from the init_failed_checks if it was there
            - load a fresh config for it
            - replace its old config with the new one in initialized_checks if there was one
            - disable the check if no new config was found
            - otherwise, append it to initialized_checks

        `checksd` is modified in place.
        """
        for check_name in checks:
            idx = None
            for num, check in enumerate(checksd['initialized_checks']):
                if check.name == check_name:
                    idx = num
                    # stop the existing check before reloading it
                    check.stop()

            # `idx` may legitimately be 0, so it is always compared against
            # None rather than tested for truthiness.
            if idx is None and check_name in checksd['init_failed_checks']:
                # if the check previously failed to load, pop it from init_failed_checks
                checksd['init_failed_checks'].pop(check_name)

            fresh_check = load_check(self._agentConfig, hostname, check_name)

            # this is an error dict
            # checks that failed to load are added to init_failed_checks
            # and poped from initialized_checks
            if isinstance(fresh_check, dict) and 'error' in fresh_check.keys():
                checksd['init_failed_checks'][fresh_check.keys()[0]] = fresh_check.values()[0]
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)

            elif not fresh_check:
                # no instance left of it to monitor so the check was not loaded
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)
                # the check was not previously running so we were trying to instantiate it and it failed
                else:
                    log.error("Configuration for check %s was not found, it won't be reloaded." % check_name)

            # successfully reloaded check are added to initialized_checks
            # (appended or replacing a previous version)
            else:
                if idx is not None:
                    checksd['initialized_checks'][idx] = fresh_check
                # it didn't exist before and doesn't need to be replaced so we append it
                else:
                    checksd['initialized_checks'].append(fresh_check)

    @classmethod
    def info(cls, verbose=None):
        """Print the latest collector status with root logging limited to errors."""
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Intialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(
            proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
        )
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        # NOTE(review): nesting of this block was reconstructed from
        # whitespace-collapsed source; it is assumed to be independent of the
        # `service_discovery` test above -- confirm against the original file.
        if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
            pipe_path = get_jmx_pipe_path()
            if Platform.is_windows():
                pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
            else:
                pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

            if os.access(pipe_path, os.W_OK):
                if not os.path.exists(pipe_name):
                    os.mkfifo(pipe_name)
                self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

                # Initialize Supervisor proxy
                self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
            else:
                log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Load JMX configs if available
        if self._jmx_service_discovery_enabled:
            jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
            if jmx_sd_configs:
                self._submit_jmx_service_discovery(jmx_sd_configs)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval',
                                      DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except ValueError:
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        self.allow_profiling = self._agentConfig.get('allow_profiling', True)

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.allow_profiling and self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=bool(self.reload_configs_flag))

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn('Something went wrong while looking for config template changes: %s' % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Return the list of emitters handed to the Collector."""
        return [http_emitter]

    def _get_watchdog(self, check_freq):
        """Return a freshly reset Watchdog, or None when the `watchdog` option is falsy."""
        watchdog = None
        if self._agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
                                max_mem_mb=self._agentConfig.get('limit_memory_consumption', None))
            watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        # Try to fetch instance Id from EC2 if not hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
            instanceId = EC2.get_instance_id(agentConfig)
            if instanceId is not None:
                log.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                log.info('Not running on EC2, using hostname to identify this server')
        return agentConfig

    def _get_supervisor_socket(self, agentConfig):
        """Build an XML-RPC proxy over the supervisor unix socket (None on Windows)."""
        if Platform.is_windows():
            return None

        sockfile = agentConfig.get('supervisor_socket', DEFAULT_SUPERVISOR_SOCKET)
        supervisor_proxy = xmlrpclib.ServerProxy(
            'http://127.0.0.1',
            transport=supervisor.xmlrpc.SupervisorTransport(
                None, None, serverurl="unix://{socket}".format(socket=sockfile))
        )

        return supervisor_proxy

    @property
    def _jmx_service_discovery_enabled(self):
        return self.sd_pipe is not None

    def _submit_jmx_service_discovery(self, jmx_sd_configs):
        """Write the given JMX configs (name -> YAML text) to the service discovery pipe.

        Does nothing when there is no config or no pipe. When a supervisor
        proxy is available and the JMX process is stopped, it is started first.
        """
        # The pipe is a file descriptor, so 0 is a valid value: compare with
        # None, consistently with `_jmx_service_discovery_enabled`.
        if not jmx_sd_configs or self.sd_pipe is None:
            return

        if self.supervisor_proxy is not None:
            jmx_state = self.supervisor_proxy.supervisor.getProcessInfo(JMX_SUPERVISOR_ENTRY)
            log.debug("Current JMX check state: %s", jmx_state['statename'])
            # restart jmx if stopped
            if jmx_state['statename'] in ['STOPPED', 'EXITED', 'FATAL'] and self._agentConfig.get('sd_jmx_enable'):
                self.supervisor_proxy.supervisor.startProcess(JMX_SUPERVISOR_ENTRY)
                time.sleep(JMX_GRACE_SECS)
        else:
            log.debug("Unable to automatically start jmxfetch on Windows via supervisor.")

        buffer = ""
        for name, yaml in jmx_sd_configs.iteritems():
            try:
                buffer += SD_CONFIG_SEP
                buffer += "# {}\n".format(name)
                buffer += yaml
            except Exception as e:
                log.exception("unable to submit YAML via RPC: %s", e)
            else:
                log.info("JMX SD Config via named pip %s successfully.", name)

        if buffer:
            os.write(self.sd_pipe, buffer)

    def _should_restart(self):
        """Tell whether the agent has been running for longer than `restart_interval`."""
        if time.time() - self.agent_start > self.restart_interval:
            return True
        return False

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def main():
    """Command-line entry point: parse the arguments and run the requested command.

    Returns 2 when no command is given, 3 for an unknown command, 1 when
    `check` is called without a check name, and the result of `Agent.info`
    for the `info` command.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile('dd-agent').get_path(), autorestart, in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                    if len(args) == 3 and args[2] == 'check_rate':
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()

    elif 'jmx' == command:
        if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys():
            print("#" * 80)
            print("JMX tool to be used to help configuring your JMX checks.")
            print("See http://docs.datadoghq.com/integrations/java/ for more information")
            print("#" * 80)
            print("\n")
            print("You have to specify one of the following commands:")
            # Use dedicated loop names so the selected `command` is not rebound.
            for jmx_list_command, desc in JMX_LIST_COMMANDS.iteritems():
                print("      - %s [OPTIONAL: LIST OF CHECKS]: %s" % (jmx_list_command, desc))
            print("Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr")
            print("\n")

        else:
            jmx_command = args[1]
            checks_list = args[2:]
            confd_directory = get_confd_path(get_os())

            jmx_process = JMXFetch(confd_directory, agentConfig)
            jmx_process.configure()
            should_run = jmx_process.should_run()

            if should_run:
                jmx_process.run(jmx_command, checks_list, reporter="console")
            else:
                print("Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory)
                print("Have you enabled any JMX check ?")
                print("If you think it's not normal please get in touch with Datadog Support")

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            # Best effort: report the failure instead of crashing the command.
            print('The upload failed:\n{0}'.format(str(e)))
def run(self, config=None):
    """Main loop of the collector

    Installs the signal handlers, loads the configuration and the checks,
    then runs the collector every `check_freq` seconds until `run_forever`
    is cleared. Always ends by calling `sys.exit(0)`.

    :param config: agent configuration; when falsy it is read with
        `get_config(parse_args=True)`.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(proc_path=self._agentConfig.get(
        'procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is converted to int because it is compared with an integer
    # run counter below; an invalid value falls back to the default.
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval',
                                  DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(
            num_checks=len(self._checksd['initialized_checks'])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)

        # This flag is used to know if the check configs have been reloaded at the current
        # run of the agent yet or not. It's used by the collector to know if it needs to
        # look for the AgentMetrics check and pop it out.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn(
                    'Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(
                self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Parse the command line and dispatch to the requested agent command.

    Returns 0 on the normal path, 2 when no command is given, 3 for an
    unknown command, 1 for a refused daemon-management command or a `check`
    call without a check name, and the result of `Agent.info` for `info`.
    """
    options, args = get_parsed_args()
    agent_config = get_config(options=options)
    autorestart = agent_config.get('autorestart', False)
    hostname = get_hostname(agent_config)
    in_developer_mode = agent_config.get('developer_mode')

    daemon_commands = ['start', 'stop', 'restart', 'status', 'foreground']
    standalone_commands = ['info', 'check', 'configcheck', 'jmx', 'flare']
    known_commands = daemon_commands + standalone_commands

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(known_commands)))
        return 2

    command = args[0]
    if command not in known_commands:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # TODO: actually kill the start/stop/restart/status command for 5.11
    supervisor_managed = ['start', 'stop', 'restart', 'status']
    if not in_developer_mode and command in supervisor_managed:
        logging.error('Please use supervisor to manage the agent')
        return 1

    if command in daemon_commands:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(),
                      autorestart,
                      in_developer_mode=in_developer_mode)

    if command == 'start':
        log.info('Start daemon')
        agent.start()

    elif command == 'stop':
        log.info('Stop daemon')
        agent.stop()

    elif command == 'restart':
        log.info('Restart daemon')
        agent.restart()

    elif command == 'status':
        agent.status()

    elif command == 'info':
        return Agent.info(verbose=options.verbose)

    elif command == 'foreground':
        log.info('Agent version %s' % get_version())
        if not autorestart:
            # Plain foreground run, no supervisor involved.
            agent.start(foreground=True)
        else:
            # The supervisor forks: the child runs the agent, the parent
            # only disables the start event.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)

    elif command == 'check':
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" %
                sys.argv[0])
            return 1

        check_name = args[1]
        try:
            # Old-style checks live as attributes of checks.collector.
            import checks.collector
            legacy_check = getattr(checks.collector, check_name)(log)
            print(legacy_check.check(agent_config))
        except Exception:
            # Anything going wrong above means: fall back to checks.d.
            checks = load_check_directory(agent_config, hostname)
            for check in checks['initialized_checks']:
                if check.name != check_name:
                    continue

                if in_developer_mode:
                    check.run = AgentProfiler.wrap_profiling(check.run)

                status = Collector.run_single_check(check, verbose=True)
                print(CollectorStatus.render_check_status(status))

                if len(args) == 3 and args[2] == 'check_rate':
                    print("Running 2nd iteration to capture rate metrics")
                    time.sleep(1)
                    status = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(status))

                check.stop()

    elif command == 'configcheck' or command == 'configtest':
        configcheck()
        sd_configcheck(agent_config)

    elif command == 'jmx':
        jmx_command(args[1:], agent_config)

    elif command == 'flare':
        Flare.check_user_rights()
        if len(args) > 1:
            case_id = int(args[1])
        else:
            case_id = None
        flare = Flare(True, case_id)
        flare.collect()
        try:
            flare.upload()
        except Exception as e:
            print('The upload failed:\n{0}'.format(str(e)))

    return 0
class Agent(Daemon):
    """
    The agent class is a daemon that runs the collector in a background process.
    """

    def __init__(self, pidfile):
        Daemon.__init__(self, pidfile)
        self.run_forever = True
        self.collector = None

    def _handle_sigterm(self, signum, frame):
        """Handle SIGTERM: leave the run loop and stop the collector, if any."""
        agent_logger.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False
        if self.collector:
            self.collector.stop()

    def run(self):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Intialize the collector.
        agentConfig = self._set_agent_config_hostname(get_config())
        systemStats = get_system_stats()
        emitters = self._get_emitters(agentConfig)
        self.collector = Collector(agentConfig, emitters, systemStats)

        # Load the checks.d checks
        checksd = load_check_directory(agentConfig)

        # Configure the watchdog.
        check_frequency = int(agentConfig['check_freq'])
        watchdog = self._get_watchdog(check_frequency, agentConfig)

        # Run the main loop.
        while self.run_forever:
            # Do the work.
            self.collector.run(checksd=checksd)

            # Only plan for the next loop if we will continue,
            # otherwise just exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                time.sleep(check_frequency)

        # Now clean-up. This stays best effort, but only regular errors are
        # ignored: a bare `except:` would also swallow SystemExit and
        # KeyboardInterrupt.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running
        # as a daemon.
        agent_logger.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self, agentConfig):
        """Return the list of emitters handed to the Collector."""
        emitters = [http_emitter]
        return emitters

    def _get_watchdog(self, check_freq, agentConfig):
        """Return a freshly reset Watchdog, or None when the `watchdog` option is falsy."""
        watchdog = None
        if agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
            watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        # Try to fetch instance Id from EC2 if not hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
            instanceId = EC2.get_instance_id()
            if instanceId is not None:
                agent_logger.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                agent_logger.info('Not running on EC2, using hostname to identify this server')
        return agentConfig
def run(self, config=None):
    """Run the collector loop until `run_forever` is cleared, then exit.

    Signal handlers are installed first, then the configuration, the checks
    and the Collector are set up. When the `profile` option equals 'yes'
    (case-insensitive), every collector run is profiled with cProfile and
    the statistics are written to the debug log. Always ends with
    `sys.exit(0)`.
    """

    # SIGTERM and SIGINT stop the loop gracefully, SIGUSR1 asks for an
    # exit followed by an autorestart.
    for signum, handler in ((signal.SIGTERM, self._handle_sigterm),
                            (signal.SIGUSR1, self._handle_sigusr1),
                            (signal.SIGINT, self._handle_sigterm)):
        signal.signal(signum, handler)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Set up the collector.
    if not config:
        config = get_config(parse_args=True)

    agent_config = self._set_agent_config_hostname(config)
    hostname = get_hostname(agent_config)
    system_stats = get_system_stats()
    emitters = self._get_emitters(agent_config)
    # checks.d checks
    loaded_checks = load_check_directory(agent_config, hostname)

    self.collector = Collector(agent_config, emitters, system_stats, hostname)

    # Watchdog
    check_frequency = int(agent_config['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agent_config)

    # Auto-restarter
    self.restart_interval = int(agent_config.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    def profiling_requested():
        # Evaluated on demand so the option is re-read each time it is needed.
        return agent_config.get('profile', False) and agent_config.get('profile').lower() == 'yes'

    while self.run_forever:
        # Start the profiler when requested.
        profiled = False
        if profiling_requested():
            try:
                import cProfile
                profiler = cProfile.Profile()
                profiled = True
                profiler.enable()
                log.debug("Agent profiling is enabled")
            except Exception:
                log.warn("Cannot enable profiler")

        # One collection run.
        self.collector.run(checksd=loaded_checks, start_event=self.start_event)

        # Stop the profiler and dump its statistics to the debug log.
        if profiling_requested() and profiled:
            try:
                profiler.disable()
                import pstats
                from cStringIO import StringIO
                stats_buffer = StringIO()
                stats = pstats.Stats(profiler, stream=stats_buffer).sort_stats("cumulative")
                stats.print_stats()
                log.debug(stats_buffer.getvalue())
            except Exception:
                log.warn("Cannot disable profiler")

        # Restart when due.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Sleep only when another iteration will follow, so that a stop
        # request exits quickly.
        if not self.run_forever:
            continue
        if watchdog:
            watchdog.reset()
        time.sleep(check_frequency)

    # Best-effort clean-up of the persisted status.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Collector main loop.

    Installs the signal handlers, builds the Collector from the agent
    configuration and runs it every `check_freq` seconds while
    `run_forever` holds. With the `profile` option set to 'yes'
    (case-insensitive) each run is wrapped in a cProfile session whose
    statistics go to the debug log. The method finishes with `sys.exit(0)`.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)
    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Fall back to the parsed configuration when none is given.
    if not config:
        config = get_config(parse_args=True)
    settings = self._set_agent_config_hostname(config)

    stats_source = get_system_stats()
    emitter_list = self._get_emitters(settings)
    checks_bundle = load_check_directory(settings)
    self.collector = Collector(settings, emitter_list, stats_source)

    # Watchdog period is derived from the check frequency.
    pause = int(settings['check_freq'])
    watchdog = self._get_watchdog(pause, settings)

    # Auto-restarter bookkeeping.
    self.restart_interval = int(
        settings.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    # Re-read on every use, exactly like an inline test would.
    wants_profile = lambda: (settings.get('profile', False) and
                             settings.get('profile').lower() == 'yes')

    while self.run_forever:
        profiled = False

        if wants_profile():
            try:
                import cProfile
                profiler = cProfile.Profile()
                profiled = True
                profiler.enable()
                log.debug("Agent profiling is enabled")
            except Exception:
                log.warn("Cannot enable profiler")

        # Do the work.
        self.collector.run(checksd=checks_bundle, start_event=self.start_event)

        if wants_profile() and profiled:
            # Print the cumulative statistics of this run to the debug log.
            try:
                profiler.disable()
                import pstats
                from cStringIO import StringIO
                out = StringIO()
                report = pstats.Stats(profiler, stream=out).sort_stats("cumulative")
                report.print_stats()
                log.debug(out.getvalue())
            except Exception:
                log.warn("Cannot disable profiler")

        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(pause)

    # Clean-up is best effort.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
class Agent(Daemon):
    """
    The agent class is a daemon that runs the collector in a background process.
    """

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        """Store the run options; nothing is loaded until ``run`` is called."""
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        self._agentConfig = {}
        # Every reader of _checksd treats it as a dict
        # ('initialized_checks' / 'init_failed_checks'), so start with a dict.
        self._checksd = {}
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        self.check_frequency = None
        # this flag can be set to True, False, or a list of checks (for partial reload)
        self.reload_configs_flag = False
        self.sd_backend = None

    def _handle_sigterm(self, signum, frame):
        """Handles SIGTERM and SIGINT, which gracefully stops the agent."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Handles SIGUSR1, which signals an exit with an autorestart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """Handles SIGHUP, which signals a configuration reload."""
        log.info(
            "SIGHUP caught! Scheduling configuration reload before next collection run."
        )
        # Only raise the flag here; the reload itself happens in the run loop.
        self.reload_configs_flag = True

    def reload_configs(self, checks_to_reload=None):
        """Reload the agent configuration and checksd configurations.

        Can also reload only an explicit set of checks.

        :param checks_to_reload: collection of check names to refresh; when
                                 empty or None every check is stopped and the
                                 whole check directory is loaded again.
        """
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
        else:
            # NOTE(review): assuming `copy` is copy.copy, this copy is shallow,
            # so the nested containers are shared with the current
            # self._checksd while they are refreshed -- confirm this is intended.
            new_checksd = copy(self._checksd)

            self.refresh_specific_checks(hostname, new_checksd, checks_to_reload)
            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(
                checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")

    def refresh_specific_checks(self, hostname, checksd, checks):
        """take a list of checks and for each of them:
            - remove it from the init_failed_checks if it was there
            - load a fresh config for it
            - replace its old config with the new one in initialized_checks if there was one
            - disable the check if no new config was found
            - otherwise, append it to initialized_checks

        :param hostname: hostname handed to ``load_check``.
        :param checksd: dict with 'initialized_checks' (list) and
                        'init_failed_checks' (dict); it is modified in place.
        :param checks: iterable of check names to refresh.
        """
        for check_name in checks:
            idx = None
            for num, check in enumerate(checksd['initialized_checks']):
                if check.name == check_name:
                    idx = num
                    # stop the existing check before reloading it
                    check.stop()

            # Drop any stale failure recorded for this check.  A failed load
            # below records a fresh entry, so nothing is lost.
            if check_name in checksd['init_failed_checks']:
                checksd['init_failed_checks'].pop(check_name)

            fresh_check = load_check(self._agentConfig, hostname, check_name)

            # this is an error dict
            # checks that failed to load are added to init_failed_checks
            # and poped from initialized_checks
            if isinstance(fresh_check, dict) and 'error' in fresh_check:
                # First (key, value) pair of the error dict; equivalent to
                # keys()[0] / values()[0] without relying on list-returning
                # dict views.
                failed_key, failure = next(iter(fresh_check.items()))
                checksd['init_failed_checks'][failed_key] = failure
                # idx may legitimately be 0, so compare against None.
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)

            elif not fresh_check:
                # no instance left of it to monitor so the check was not loaded
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)
                # the check was not previously running so we were trying to instantiate it and it failed
                else:
                    log.error(
                        "Configuration for check %s was not found, it won't be reloaded."
                        % check_name)

            # successfully reloaded check are added to initialized_checks
            # (appended or replacing a previous version)
            else:
                if idx is not None:
                    checksd['initialized_checks'][idx] = fresh_check
                # it didn't exist before and doesn't need to be replaced so we append it
                else:
                    checksd['initialized_checks'].append(fresh_check)

    @classmethod
    def info(cls, verbose=None):
        """Print the latest persisted collector status and return the result."""
        # Raise the root logger level so only errors are logged alongside it.
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Main loop of the collector

        :param config: agent configuration; when falsy it is obtained from
                       ``get_config(parse_args=True)``.

        Does not return normally: it ends with ``sys.exit(0)``.
        """

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(proc_path=self._agentConfig.get(
            'procfs_path', '/proc').rstrip('/'))
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        self.collector_profile_interval = self._agentConfig.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                # A set carries the names of the checks to refresh; any other
                # truthy value requests a full reload.
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(
                        checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(
                checksd=self._checksd,
                start_event=self.start_event,
                configs_reloaded=bool(self.reload_configs_flag))

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(
                    self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            # Best effort: a missing status must not prevent the exit below.
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Return the list of emitters handed to the collector."""
        return [http_emitter]

    def _get_watchdog(self, check_freq):
        """Return a freshly reset Watchdog, or None when disabled in the config."""
        watchdog = None
        if self._agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
                                max_mem_mb=self._agentConfig.get(
                                    'limit_memory_consumption', None))
            watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill ``agentConfig['hostname']`` with the EC2 instance id when requested.

        Returns the same ``agentConfig`` object, possibly modified in place.
        """
        # Try to fetch instance Id from EC2 if not hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get(
                'use_ec2_instance_id'):
            instanceId = EC2.get_instance_id(agentConfig)
            if instanceId is not None:
                log.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                log.info(
                    'Not running on EC2, using hostname to identify this server'
                )
        return agentConfig

    def _should_restart(self):
        """Return True once more than ``restart_interval`` seconds have elapsed."""
        if time.time() - self.agent_start > self.restart_interval:
            return True
        return False

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
class Agent(Daemon):
    """
    Daemon that drives the collector from a background process.
    """

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        # Loop control and run options.
        self.run_forever = True
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        # State filled in by run().
        self.collector = None
        self.sd_backend = None
        self.check_frequency = None
        self._agentConfig = {}
        self._checksd = []
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        # Set after a reload so the next collector run is told about it.
        self.configs_reloaded = False

    def _handle_sigterm(self, signum, frame):
        """SIGTERM / SIGINT handler: end the run loop and stop the collector."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False
        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """SIGUSR1 handler: stop as for SIGTERM, then exit for an autorestart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """SIGHUP handler: reload the check configurations right away."""
        log.info("SIGHUP caught!")
        self.reload_configs()
        self.configs_reloaded = True

    def reload_configs(self):
        """Load the check directory again and log how many checks came back."""
        log.info("Attempting a configuration reload...")

        current_hostname = get_hostname(self._agentConfig)
        self._checksd = load_check_directory(self._agentConfig,
                                             current_hostname)

        loaded = len(self._checksd['initialized_checks'])
        if not loaded:
            log.info("No checksd configs found")
            return
        log.info("Successfully reloaded {num_checks} checks".format(
            num_checks=loaded))

    @classmethod
    def info(cls, verbose=None):
        """Print the latest collector status with logging limited to errors."""
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Run the collection loop until ``self.run_forever`` is cleared.

        :param config: agent configuration; when falsy it is obtained from
                       ``get_config(parse_args=True)``.

        Ends with ``sys.exit(0)`` rather than returning.
        """
        # SIGTERM / SIGINT: graceful stop.  SIGUSR1: exit with autorestart.
        # SIGHUP: configuration reload.
        for signum, handler in (
                (signal.SIGTERM, self._handle_sigterm),
                (signal.SIGUSR1, self._handle_sigusr1),
                (signal.SIGINT, self._handle_sigterm),
                (signal.SIGHUP, self._handle_sighup),
        ):
            signal.signal(signum, handler)

        # Record the agent start-up stats.
        CollectorStatus().persist()

        # Gather what the collector needs.
        if not config:
            config = get_config(parse_args=True)
        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        procfs_path = self._agentConfig.get('procfs_path', '/proc')
        system_stats = get_system_stats(proc_path=procfs_path.rstrip('/'))
        emitters = self._get_emitters()

        # Service discovery backend, when enabled.
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        # checks.d checks, then the collector itself.
        self._checksd = load_check_directory(self._agentConfig, hostname)
        self.collector = Collector(self._agentConfig, emitters, system_stats,
                                   hostname)

        # Developer mode: number of runs covered by one collector profile.
        self.collector_profile_interval = self._agentConfig.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

        # Watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Auto-restart bookkeeping.
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        def sd_enabled():
            # Evaluated afresh at each use, once per service-discovery step.
            return self._agentConfig.get('service_discovery') and self.sd_backend

        profiled = False
        collector_profiled_runs = 0

        while self.run_forever:
            log.debug("Found {num_checks} checks".format(
                num_checks=len(self._checksd['initialized_checks'])))

            # Start a profiling session when in developer mode.
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            # One collection pass.
            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=self.configs_reloaded)

            # The flag only describes the run that just happened.  The
            # collector uses it to know whether it has to look for the
            # AgentMetrics check and pop it out.
            # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
            self.configs_reloaded = False

            # Ask the config template store whether something changed, unless
            # a reload has already been requested.
            if sd_enabled() and not self.sd_backend.reload_check_configs:
                try:
                    config_store = get_config_store(self._agentConfig)
                    self.sd_backend.reload_check_configs = \
                        config_store.crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # `reload_check_configs` can be set through the docker_daemon check
            # or by ConfigStore.crawl_config_template.
            if sd_enabled() and self.sd_backend.reload_check_configs:
                self.reload_configs()
                self.configs_reloaded = True
                self.sd_backend.reload_check_configs = False

            # Close the profiling session once enough runs were recorded.
            if profiled and collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

            # Auto-restart once the restart interval has elapsed.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Nothing more to prepare when the loop is about to end.
            if not self.run_forever:
                continue
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

        # Best-effort removal of the persisted status.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Exit explicitly, because the process might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Emitters handed to the collector."""
        return [http_emitter]

    def _get_watchdog(self, check_freq):
        """Build and reset a Watchdog unless the config disables it."""
        if not self._agentConfig.get("watchdog", True):
            return None
        memory_limit = self._agentConfig.get('limit_memory_consumption', None)
        guard = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
                         max_mem_mb=memory_limit)
        guard.reset()
        return guard

    def _set_agent_config_hostname(self, agentConfig):
        """Use the EC2 instance id as hostname when the config asks for it.

        DEPRECATED.  Only applies when no hostname is configured and
        ``use_ec2_instance_id`` is set.  Returns the same ``agentConfig``.
        """
        use_instance_id = (agentConfig.get('hostname') is None
                           and agentConfig.get('use_ec2_instance_id'))
        if not use_instance_id:
            return agentConfig

        instance_id = EC2.get_instance_id(agentConfig)
        if instance_id is None:
            log.info(
                'Not running on EC2, using hostname to identify this server')
        else:
            log.info("Running on EC2, instanceId: %s" % instance_id)
            agentConfig['hostname'] = instance_id
        return agentConfig

    def _should_restart(self):
        """Tell whether more than ``restart_interval`` seconds have elapsed."""
        elapsed = time.time() - self.agent_start
        return True if elapsed > self.restart_interval else False

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def main():
    """Command-line entry point: dispatch one agent command.

    The command is the first positional argument.  Return values visible
    here: 2 when no command is given, 3 for an unknown command, 1 when
    ``check`` is called without a check name, and whatever ``Agent.info``
    returns for ``info``.  Every other path falls off the end of the
    function.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    # Commands that need an Agent instance.
    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    # Commands that run without one.
    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(),
                      autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" %
                sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            checksd = load_check_directory(agentConfig, hostname)
            for check in checksd['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                    if len(args) == 3 and args[2] == 'check_rate':
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    # NOTE(review): 'configtest' is not in COMMANDS, so it is rejected above
    # with "Unknown command" before reaching this branch -- confirm whether it
    # should be added to COMMANDS_NO_AGENT.
    elif 'configcheck' == command or 'configtest' == command:
        configcheck()

        if agentConfig.get('service_discovery', False):
            # set the TRACE_CONFIG flag to True to make load_check_directory return
            # the source of config objects.
            # Then call load_check_directory here and pass the result to sd_configcheck
            # to avoid circular imports
            agentConfig[TRACE_CONFIG] = True

            print("\nLoading check configurations...\n\n")
            # Per the original inline note, the result has the shape
            # {check_name: (config_source, config)}.
            configs = load_check_directory(agentConfig, hostname)
            sd_configcheck(agentConfig, configs)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print('The upload failed:\n{0}'.format(str(e)))
def test_metadata(self):
    """Check that the collector metadata contains the host identity keys."""
    c = Collector({}, None, {})
    # Fetch through the same method that is being asserted on, so that a
    # failure reports the metadata that was actually inspected.
    metadata = c._get_metadata()
    assert "hostname" in metadata, metadata
    assert "fqdn" in metadata, metadata
def test_topology_collection(self):
    """Run two dummy topology checks through the collector.

    Verifies that the topologies found in the returned payload and the ones
    handed to the emitter match what each check instance is expected to
    produce.
    """
    agentConfig = {
        'api_key': 'test_apikey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_orchestrator_tags': False,
        'collect_instance_metadata': False,
        'create_dd_check_tags': False,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    dummy_topology_check_config = {
        "init_config": {},
        "instances": [{"dummy_instance": "dummy_instance"}]
    }

    # create dummy checks, creating two component and 1 relation
    check1 = DummyTopologyCheck(
        1,
        'dummy_topology_check',
        dummy_topology_check_config.get('init_config'),
        agentConfig,
        instances=[{"instance_id": 1, "pass": True},
                   {"instance_id": 2, "pass": True}])
    check2 = DummyTopologyCheck(
        2,
        'dummy_topology_check',
        dummy_topology_check_config.get('init_config'),
        agentConfig,
        instances=[{"instance_id": 3, "pass": True},
                   {"instance_id": 4, "pass": True}],
        snapshot=True)

    emitted_topologies = []

    # mock emitter to pick up data emitted by the collector
    def mock_emitter(message, log, agentConfig, endpoint):
        emitted_topologies.extend(message['topologies'])

    c = Collector(agentConfig, [mock_emitter], {}, get_hostname(agentConfig))
    payload, _ = c.run({
        'initialized_checks': [check1, check2],
        'init_failed_checks': {}
    })
    topologies = payload['topologies']

    def assertTopology(topology, check, instance_id):
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(topology['instance'], check.instance_key(instance_id))
        self.assertEqual(len(topology['components']), 2)
        self.assertEqual(len(topology['relations']), 1)
        self.assertEqual(check.expected_components(instance_id),
                         topology['components'])
        self.assertEqual(check.expected_relations(), topology['relations'])
        if check.snapshot:
            self.assertTrue(topology["start_snapshot"])
            self.assertTrue(topology["stop_snapshot"])
        else:
            self.assertTrue("start_snapshot" not in topology)
            self.assertTrue("stop_snapshot" not in topology)

    # Make sure the emissions of the collector are observed.  Note the
    # expected order for check2: instance 4 comes before instance 3.
    assertTopology(topologies[0], check1, 1)
    assertTopology(topologies[1], check1, 2)
    assertTopology(topologies[2], check2, 4)
    assertTopology(topologies[3], check2, 3)

    assertTopology(emitted_topologies[0], check1, 1)
    assertTopology(emitted_topologies[1], check1, 2)
    assertTopology(emitted_topologies[2], check2, 4)
    assertTopology(emitted_topologies[3], check2, 3)
def run(self, config=None):
    """Run the collection loop until ``self.run_forever`` is cleared.

    :param config: agent configuration; when falsy it is obtained from
                   ``get_config(parse_args=True)``.

    Ends with ``sys.exit(0)`` rather than returning.
    """
    # SIGTERM / SIGINT: graceful stop.  SIGUSR1: exit with autorestart.
    # SIGHUP: configuration reload.
    for signum, handler in (
            (signal.SIGTERM, self._handle_sigterm),
            (signal.SIGUSR1, self._handle_sigusr1),
            (signal.SIGINT, self._handle_sigterm),
            (signal.SIGHUP, self._handle_sighup),
    ):
        signal.signal(signum, handler)

    # Record the agent start-up stats.
    CollectorStatus().persist()

    # Gather what the collector needs.
    if not config:
        config = get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    system_stats = get_system_stats()
    emitters = self._get_emitters()

    # checks.d checks, then the collector itself.
    self._checksd = load_check_directory(self._agentConfig, hostname)
    self.collector = Collector(self._agentConfig, emitters, system_stats,
                               hostname)

    # Developer mode: number of runs covered by one collector profile.
    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Auto-restart bookkeeping.
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    while self.run_forever:
        check_count = len(self._checksd['initialized_checks'])
        log.debug("Found {num_checks} checks".format(num_checks=check_count))

        # Start a profiling session when in developer mode.
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # One collection pass.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)
        # The reload flag only applies to the run that just happened.
        if self.configs_reloaded:
            self.configs_reloaded = False

        # Close the profiling session once enough runs were recorded.
        if profiled and collector_profiled_runs >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
                profiled = False
                collector_profiled_runs = 0
            except Exception as e:
                log.warn("Cannot disable profiler: %s" % str(e))

        # Auto-restart once the restart interval has elapsed.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Nothing more to prepare when the loop is about to end.
        if not self.run_forever:
            continue
        if watchdog:
            watchdog.reset()
        if profiled:
            collector_profiled_runs += 1
        log.info("Sleeping for {0} seconds".format(self.check_frequency))
        time.sleep(self.check_frequency)

    # Best-effort removal of the persisted status.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Exit explicitly, because the process might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def test_metadata(self):
    """Metadata must carry the hostname plus both socket-derived names."""
    collector = Collector({"collect_instance_metadata": True}, None, {}, "foo")
    # The metadata is fetched afresh for every key.
    for expected_key in ("hostname", "socket-fqdn", "socket-hostname"):
        assert expected_key in collector._get_metadata()
class DDAgent(multiprocessing.Process):
    """Process that runs the collector loop for the Windows service."""

    def __init__(self, agentConfig, hostname, **options):
        """Store the configuration; the collector is only built in ``run``.

        :param agentConfig: agent configuration mapping.
        :param hostname: hostname passed to the collector and check loader.
        :param options: extra options; ``heartbeat``, when given, is an
                        object whose ``send(0)`` is called at the start of
                        every loop iteration.
        """
        multiprocessing.Process.__init__(self, name='ddagent')
        self.config = agentConfig
        self.hostname = hostname
        self.options = options
        self._heartbeat = options.get('heartbeat')
        # Created in run(); defined here so stop() is safe to call before that.
        self.collector = None
        # FIXME: `running` flag should be handled by the service
        self.running = True
        self.is_enabled = True

    def run(self):
        """Build the collector and run it in a loop while ``self.running``."""
        from config import initialize_logging
        initialize_logging('windows_collector')
        log.debug("Windows Service - Starting collector")
        set_win32_requests_ca_bundle_path()
        emitters = self.get_emitters()
        systemStats = get_system_stats()
        self.collector = Collector(self.config, emitters, systemStats,
                                   self.hostname)

        in_developer_mode = self.config.get('developer_mode')

        # In developer mode, the number of runs to be included in a single collector profile
        collector_profile_interval = self.config.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        profiled = False
        collector_profiled_runs = 0

        # Load the checks.d checks
        checksd = load_check_directory(self.config, self.hostname)

        # Main agent loop will run until interrupted
        while self.running:
            if self._heartbeat:
                self._heartbeat.send(0)

            if in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            self.collector.run(checksd=checksd)

            if profiled:
                if collector_profiled_runs >= collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))
                else:
                    collector_profiled_runs += 1

            time.sleep(self.config['check_freq'])

    def stop(self):
        """Stop the collector, if one was created, and end the loop."""
        log.debug("Windows Service - Stopping collector")
        # The collector only exists once run() has started.
        if self.collector is not None:
            self.collector.stop()
        self.running = False

    def get_emitters(self):
        """Return the HTTP emitter plus any emitter named in ``custom_emitters``.

        ``custom_emitters`` is read from the config as a comma-separated
        string; empty entries are ignored.
        """
        emitters = [http_emitter]
        custom = [
            s.strip()
            for s in self.config.get('custom_emitters', '').split(',')
        ]
        for emitter_spec in custom:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        return emitters
def run(self, config=None):
    """Run the collection loop until ``self.run_forever`` is cleared.

    :param config: agent configuration; when falsy it is obtained from
                   ``get_config(parse_args=True)``.

    Ends with ``sys.exit(0)`` rather than returning.
    """
    # SIGTERM and SIGINT stop the loop gracefully; SIGUSR1 exits with an
    # autorestart.
    for signum, handler in (
            (signal.SIGTERM, self._handle_sigterm),
            (signal.SIGUSR1, self._handle_sigusr1),
            (signal.SIGINT, self._handle_sigterm),
    ):
        signal.signal(signum, handler)

    # Record the agent start-up stats.
    CollectorStatus().persist()

    # Build everything the collector needs.
    if not config:
        config = get_config(parse_args=True)
    agent_config = self._set_agent_config_hostname(config)
    system_stats = get_system_stats()
    emitters = self._get_emitters(agent_config)
    loaded_checks = load_check_directory(agent_config)
    self.collector = Collector(agent_config, emitters, system_stats)

    # Watchdog set-up.
    sleep_seconds = int(agent_config['check_freq'])
    watchdog = self._get_watchdog(sleep_seconds, agent_config)

    # Auto-restart bookkeeping.
    self.restart_interval = int(
        agent_config.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    while self.run_forever:
        # One collection pass.
        self.collector.run(checksd=loaded_checks,
                           start_event=self.start_event)

        # Auto-restart once the restart interval has elapsed.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Skip the watchdog reset and the sleep when we are about to exit.
        if not self.run_forever:
            continue
        if watchdog:
            watchdog.reset()
        time.sleep(sleep_seconds)

    # Best-effort removal of the persisted status.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Exit explicitly, because the process might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
class DDAgent(multiprocessing.Process):
    """Process that runs the collector loop for the Windows service."""

    def __init__(self, agentConfig, hostname, heartbeat=None):
        """Store the configuration; the collector is only built in ``run``.

        :param agentConfig: agent configuration mapping.
        :param hostname: hostname passed to the collector and check loader.
        :param heartbeat: optional object whose ``send(0)`` is called at the
                          start of every loop iteration.
        """
        multiprocessing.Process.__init__(self, name='ddagent')
        self.config = agentConfig
        self.hostname = hostname
        self._heartbeat = heartbeat
        # Created in run(); defined here so stop() is safe to call before that.
        self.collector = None
        # FIXME: `running` flag should be handled by the service
        self.running = True
        self.is_enabled = True

    def run(self):
        """Build the collector and run it in a loop while ``self.running``."""
        from config import initialize_logging
        initialize_logging('windows_collector')
        log.debug("Windows Service - Starting collector")
        emitters = self.get_emitters()
        systemStats = get_system_stats()
        self.collector = Collector(self.config, emitters, systemStats,
                                   self.hostname)

        in_developer_mode = self.config.get('developer_mode')

        # In developer mode, the number of runs to be included in a single collector profile
        collector_profile_interval = self.config.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        profiled = False
        collector_profiled_runs = 0

        # Load the checks.d checks
        checksd = load_check_directory(self.config, self.hostname)

        # Main agent loop will run until interrupted
        while self.running:
            if self._heartbeat:
                self._heartbeat.send(0)

            if in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            self.collector.run(checksd=checksd)

            if profiled:
                if collector_profiled_runs >= collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))
                else:
                    collector_profiled_runs += 1

            time.sleep(self.config['check_freq'])

    def stop(self):
        """Stop the collector, if one was created, and end the loop."""
        log.debug("Windows Service - Stopping collector")
        # The collector only exists once run() has started.
        if self.collector is not None:
            self.collector.stop()
        self.running = False

    def get_emitters(self):
        """Return the HTTP emitter plus any emitter named in ``custom_emitters``.

        ``custom_emitters`` is read from the config as a comma-separated
        string; empty entries are ignored.
        """
        emitters = [http_emitter]
        custom = [s.strip()
                  for s in self.config.get('custom_emitters', '').split(',')]
        for emitter_spec in custom:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))

        return emitters
class Agent(Daemon):
    """Daemon that runs the collector loop in a background process."""

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        self._agentConfig = {}
        self._checksd = []
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        self.check_frequency = None
        self.configs_reloaded = False
        self.sd_backend = None

    def _handle_sigterm(self, signum, frame):
        """Handler for SIGTERM and SIGINT: end the run loop and stop the collector."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Handler for SIGUSR1: stop like SIGTERM, then trigger a restart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """Handler for SIGHUP: reload the check configurations right away."""
        log.info("SIGHUP caught!")
        self.reload_configs()
        self.configs_reloaded = True

    def reload_configs(self):
        """Reload the checksd configurations and log how many checks were loaded."""
        log.info("Attempting a configuration reload...")

        # Reload checksd configs
        self._checksd = load_check_directory(self._agentConfig, get_hostname(self._agentConfig))

        loaded = len(self._checksd["initialized_checks"])
        if loaded <= 0:
            log.info("No checksd configs found")
            return
        log.info("Successfully reloaded {num_checks} checks".format(num_checks=loaded))

    @classmethod
    def info(cls, verbose=None):
        """Print the latest collector status and return the printer's result."""
        # Root logger is restricted to errors before printing the status.
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Run the collector loop until the agent is asked to stop.

        :param config: agent configuration; when falsy it is obtained from
            ``get_config(parse_args=True)``.
        """
        # SIGTERM/SIGINT stop the agent, SIGUSR1 exits with an autorestart,
        # SIGHUP reloads the configuration.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)
        signal.signal(signal.SIGINT, self._handle_sigterm)
        signal.signal(signal.SIGHUP, self._handle_sighup)

        # Record the agent start-up stats.
        CollectorStatus().persist()

        # Gather what the collector needs.
        if not config:
            config = get_config(parse_args=True)
        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        system_stats = get_system_stats()
        emitter_list = self._get_emitters()

        # Service discovery backend, only when enabled in the configuration.
        if self._agentConfig.get("service_discovery"):
            self.sd_backend = get_sd_backend(self._agentConfig)

        # checks.d checks, then the collector itself.
        self._checksd = load_check_directory(self._agentConfig, hostname)
        self.collector = Collector(self._agentConfig, emitter_list, system_stats, hostname)

        # In developer mode, the number of runs included in one collector profile.
        self.collector_profile_interval = self._agentConfig.get(
            "collector_profile_interval", DEFAULT_COLLECTOR_PROFILE_INTERVAL
        )

        # Watchdog configuration.
        self.check_frequency = int(self._agentConfig["check_freq"])
        watchdog = self._get_watchdog(self.check_frequency)

        # Auto-restarter bookkeeping.
        self.restart_interval = int(self._agentConfig.get("restart_interval", RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        profiled_runs = 0

        while self.run_forever:
            log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd["initialized_checks"])))

            # Start a profiling session when in developer mode and none is active.
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            self.collector.run(
                checksd=self._checksd, start_event=self.start_event, configs_reloaded=self.configs_reloaded
            )

            # The flag tells the collector whether the check configs were reloaded
            # since its previous run; it is consumed by the run above.
            # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
            self.configs_reloaded = False

            # Look for a change in the config template store, unless a reload is
            # already pending on the backend.
            if (
                self._agentConfig.get("service_discovery")
                and self.sd_backend
                and not self.sd_backend.reload_check_configs
            ):
                try:
                    self.sd_backend.reload_check_configs = get_config_store(self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn("Something went wrong while looking for config template changes: %s" % str(e))

            # Reload when the backend flag is set (by the crawl above or from
            # elsewhere, e.g. the docker_daemon check).
            if self._agentConfig.get("service_discovery") and self.sd_backend and self.sd_backend.reload_check_configs:
                self.reload_configs()
                self.configs_reloaded = True
                self.sd_backend.reload_check_configs = False

            # Close the profiling session once enough runs were recorded.
            if profiled and profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

            if self.autorestart and self._should_restart():
                self._do_restart()

            # Exit quickly when asked to stop; otherwise prepare the next loop.
            if not self.run_forever:
                break
            if watchdog:
                watchdog.reset()
            if profiled:
                profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

        # Best-effort clean-up of the persisted status.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Exit explicitly, because the process might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Return the list of emitters (only the HTTP emitter)."""
        return [http_emitter]

    def _get_watchdog(self, check_freq):
        """Return a freshly reset Watchdog, or None when disabled in the config."""
        if not self._agentConfig.get("watchdog", True):
            return None
        memory_limit = self._agentConfig.get("limit_memory_consumption", None)
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER, max_mem_mb=memory_limit)
        watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill ``hostname`` with the EC2 instance id when configured to (DEPRECATED).

        Only applies when no hostname is set and ``use_ec2_instance_id`` is
        truthy. The (possibly modified) ``agentConfig`` is returned.
        """
        wants_instance_id = agentConfig.get("hostname") is None and agentConfig.get("use_ec2_instance_id")
        if not wants_instance_id:
            return agentConfig

        instance_id = EC2.get_instance_id(agentConfig)
        if instance_id is None:
            log.info("Not running on EC2, using hostname to identify this server")
        else:
            log.info("Running on EC2, instanceId: %s" % instance_id)
            agentConfig["hostname"] = instance_id
        return agentConfig

    def _should_restart(self):
        """Tell whether more than ``restart_interval`` seconds passed since start."""
        return time.time() - self.agent_start > self.restart_interval

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def test_metadata(self):
    """The collector metadata must contain the host identification keys."""
    collector = Collector({"collect_instance_metadata": True}, None, {})
    # The metadata is fetched afresh for each key.
    for expected_key in ("hostname", "socket-fqdn", "socket-hostname"):
        assert expected_key in collector._get_metadata()
class Agent(Daemon):
    """Daemon that runs the collector loop in a background process."""

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        self._agentConfig = {}
        self._checksd = []
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        self.check_frequency = None
        self.configs_reloaded = False

    def _handle_sigterm(self, signum, frame):
        """Signal handler: end the run loop and stop the collector."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Signal handler: stop like SIGTERM, then trigger a restart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """Signal handler: reload the check configurations right away."""
        log.info("SIGHUP caught!")
        self.reload_configs()
        self.configs_reloaded = True

    def reload_configs(self):
        """Reload the checksd configurations and log how many checks were loaded."""
        log.info("Attempting a configuration reload...")

        self._checksd = load_check_directory(self._agentConfig, get_hostname(self._agentConfig))

        loaded = len(self._checksd['initialized_checks'])
        if loaded <= 0:
            log.info("No checksd configs found")
            return
        log.info("Successfully reloaded {num_checks} checks".format(num_checks=loaded))

    @classmethod
    def info(cls, verbose=None):
        """Print the latest collector status and return the printer's result."""
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Run the collector loop until the agent is asked to stop.

        :param config: agent configuration; when falsy it is obtained from
            ``get_config(parse_args=True)``.
        """
        # Install the handlers defined on this class.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)
        signal.signal(signal.SIGINT, self._handle_sigterm)
        signal.signal(signal.SIGHUP, self._handle_sighup)

        CollectorStatus().persist()

        if not config:
            config = get_config(parse_args=True)
        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        system_stats = get_system_stats()
        emitter_list = self._get_emitters()

        self._checksd = load_check_directory(self._agentConfig, hostname)
        self.collector = Collector(self._agentConfig, emitter_list, system_stats, hostname)

        self.collector_profile_interval = self._agentConfig.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

        self.check_frequency = int(self._agentConfig['check_freq'])
        watchmonitor = self._get_watchmonitor(self.check_frequency)

        # Bookkeeping used by `_should_restart`.
        self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        profiled_runs = 0

        while self.run_forever:
            log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

            # Start a profiling session when in developer mode and none is active.
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=self.configs_reloaded)

            # The reload flag has been handed to the collector; clear it.
            if self.configs_reloaded:
                self.configs_reloaded = False

            # Close the profiling session once enough runs were recorded.
            if profiled and profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

            if self.autorestart and self._should_restart():
                self._do_restart()

            # Exit quickly when asked to stop; otherwise prepare the next loop.
            if not self.run_forever:
                break
            if watchmonitor:
                watchmonitor.reset()
            if profiled:
                profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

        # Best-effort clean-up of the persisted status.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Return the list of emitters (only the HTTP emitter)."""
        return [http_emitter]

    def _get_watchmonitor(self, check_freq):
        """Return a freshly reset Watchmonitor, or None when disabled in the config."""
        if not self._agentConfig.get("watchmonitor", True):
            return None
        memory_limit = self._agentConfig.get('limit_memory_consumption', None)
        watchmonitor = Watchmonitor(check_freq * WATCHmonitor_MULTIPLIER, max_mem_mb=memory_limit)
        watchmonitor.reset()
        return watchmonitor

    def _set_agent_config_hostname(self, agentConfig):
        """Fill ``hostname`` with the EC2 instance id when configured to.

        Only applies when no hostname is set and ``use_ec2_instance_id`` is
        truthy. The (possibly modified) ``agentConfig`` is returned.
        """
        wants_instance_id = agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id')
        if not wants_instance_id:
            return agentConfig

        instance_id = EC2.get_instance_id(agentConfig)
        if instance_id is None:
            log.info('Not running on EC2, using hostname to identify this server')
        else:
            log.info("Running on EC2, instanceId: %s" % instance_id)
            agentConfig['hostname'] = instance_id
        return agentConfig

    def _should_restart(self):
        """Tell whether more than ``restart_interval`` seconds passed since start."""
        return time.time() - self.agent_start > self.restart_interval

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
class Agent(Daemon):
    """
    The agent class is a daemon that runs the collector in a background process.
    """

    def __init__(self, pidfile):
        Daemon.__init__(self, pidfile)
        self.run_forever = True
        self.collector = None

    def _handle_sigterm(self, signum, frame):
        """SIGTERM handler: end the run loop and stop the collector, if any."""
        agent_logger.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False
        if self.collector:
            self.collector.stop()

    def run(self):
        """Main loop of the collector.

        Runs the collector every ``check_freq`` seconds while
        ``self.run_forever`` is true, then removes the latest status and exits
        the process with status 0.
        """
        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        agentConfig = self._set_agent_config_hostname(get_config())
        systemStats = get_system_stats()
        emitters = self._get_emitters(agentConfig)
        self.collector = Collector(agentConfig, emitters, systemStats)

        # Load the checks.d checks
        checksd = load_check_directory(agentConfig)

        # Configure the watchdog.
        check_frequency = int(agentConfig['check_freq'])
        watchdog = self._get_watchdog(check_frequency, agentConfig)

        # Run the main loop.
        while self.run_forever:
            # Do the work.
            self.collector.run(checksd=checksd)

            # Only plan for the next loop if we will continue,
            # otherwise just exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                time.sleep(check_frequency)

        # Now clean-up. This is deliberately best-effort, but only ordinary
        # errors are ignored: a bare `except:` would also swallow SystemExit
        # and KeyboardInterrupt.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running
        # as a daemon.
        agent_logger.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self, agentConfig):
        """Return the HTTP emitter plus any emitters named in ``custom_emitters``.

        ``custom_emitters`` is read as a comma-separated string; empty entries
        are ignored and the others are loaded with ``modules.load``.
        """
        emitters = [http_emitter]
        custom_specs = [s.strip() for s in agentConfig.get('custom_emitters', '').split(',')]
        for emitter_spec in custom_specs:
            if not emitter_spec:
                continue
            emitters.append(modules.load(emitter_spec, 'emitter'))
        return emitters

    def _get_watchdog(self, check_freq, agentConfig):
        """Return a freshly reset Watchdog, or None when disabled in the config."""
        watchdog = None
        if agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
            watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill ``hostname`` with the EC2 instance id when configured to.

        Returns the (possibly modified) ``agentConfig``.
        """
        # Try to fetch instance Id from EC2 if no hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
            instanceId = EC2.get_instance_id()
            if instanceId is not None:
                agent_logger.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                agent_logger.info('Not running on EC2, using hostname to identify this server')
        return agentConfig
def run(self, config=None):
    """Run the collector loop until the agent is asked to stop.

    :param config: agent configuration; when falsy it is obtained from
        ``get_config(parse_args=True)``.
    """
    # Install the handlers defined on this object.
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)
    signal.signal(signal.SIGINT, self._handle_sigterm)
    signal.signal(signal.SIGHUP, self._handle_sighup)

    CollectorStatus().persist()

    if not config:
        config = get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    system_stats = get_system_stats()
    emitter_list = self._get_emitters()

    self._checksd = load_check_directory(self._agentConfig, hostname)
    self.collector = Collector(self._agentConfig, emitter_list, system_stats, hostname)

    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    self.check_frequency = int(self._agentConfig['check_freq'])
    watchmonitor = self._get_watchmonitor(self.check_frequency)

    # Bookkeeping for the auto-restart check.
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    profiled_runs = 0

    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

        # Start a profiling session when in developer mode and none is active.
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)

        # The reload flag has been handed to the collector; clear it.
        if self.configs_reloaded:
            self.configs_reloaded = False

        # Close the profiling session once enough runs were recorded.
        if profiled and profiled_runs >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
                profiled = False
                profiled_runs = 0
            except Exception as e:
                log.warn("Cannot disable profiler: %s" % str(e))

        if self.autorestart and self._should_restart():
            self._do_restart()

        # Exit quickly when asked to stop; otherwise prepare the next loop.
        if not self.run_forever:
            break
        if watchmonitor:
            watchmonitor.reset()
        if profiled:
            profiled_runs += 1
        log.debug("Sleeping for {0} seconds".format(self.check_frequency))
        time.sleep(self.check_frequency)

    # Best-effort clean-up of the persisted status.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Command-line entry point of the agent.

    Reads the command from the parsed arguments and dispatches it.

    :returns: 2 when no command is given, 3 for an unknown command, 1 when a
        daemon-management command is used outside developer mode or when
        ``check`` lacks a check name, the result of ``Agent.info`` for
        ``info``, and 0 otherwise.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    # Alternative spellings, resolved before validation. Without this step
    # 'configtest' was rejected as unknown and its branch could never run.
    COMMAND_ALIASES = {
        'configtest': 'configcheck',
    }

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = COMMAND_ALIASES.get(args[0], args[0])
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # TODO: actually kill the start/stop/restart/status command for 5.11
    if command in ['start', 'stop', 'restart', 'status'] and not in_developer_mode:
        logging.error('Please use supervisor to manage the agent')
        return 1

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart, in_developer_mode=in_developer_mode)

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        log.info('Agent version %s' % get_version())
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                    if len(args) == 3 and args[2] == 'check_rate':
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    elif 'configcheck' == command:
        configcheck()
        sd_configcheck(agentConfig)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        # The optional second argument is the case id.
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print('The upload failed:\n{0}'.format(str(e)))

    return 0
def test_metadata(self):
    """The collector metadata must contain the host identification keys."""
    collector = Collector({}, None, {})
    # The metadata is fetched afresh for each key.
    for expected_key in ("hostname", "socket-fqdn", "socket-hostname"):
        assert expected_key in collector._get_metadata()
class Agent(Daemon):
    """Daemon that runs the collector loop in a background process."""

    def __init__(self, pidfile, autorestart, start_event=True):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event

    def _handle_sigterm(self, signum, frame):
        """Signal handler: end the run loop, then stop JMXFetch and the collector."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if JMXFetch.is_running():
            JMXFetch.stop()

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Signal handler: stop like SIGTERM, then trigger a restart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def info(self, verbose=None):
        """Print the latest collector status and return the printer's result."""
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Run the collector loop until the agent is asked to stop.

        :param config: agent configuration; when falsy it is obtained from
            ``get_config(parse_args=True)``.
        """
        # SIGTERM and SIGINT (keyboard interrupt) share the same handler;
        # SIGUSR1 signals an exit with an autorestart.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Record the agent start-up stats.
        CollectorStatus().persist()

        # Build everything the collector needs.
        if not config:
            config = get_config(parse_args=True)
        agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(agentConfig)
        system_stats = get_system_stats()
        emitter_list = self._get_emitters(agentConfig)
        loaded_checks = load_check_directory(agentConfig, hostname)
        self.collector = Collector(agentConfig, emitter_list, system_stats, hostname)

        # The watchdog is configured from the check frequency.
        sleep_seconds = int(agentConfig['check_freq'])
        watchdog = self._get_watchdog(sleep_seconds, agentConfig)

        # Bookkeeping used by the auto-restarter.
        self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        def profiling_requested():
            # Evaluated on every use, so the configuration is re-read each time.
            return agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes'

        while self.run_forever:
            # Enable the profiler for this run if requested.
            profiled = False
            if profiling_requested():
                try:
                    import cProfile
                    profiler = cProfile.Profile()
                    profiled = True
                    profiler.enable()
                    log.debug("Agent profiling is enabled")
                except Exception:
                    log.warn("Cannot enable profiler")

            self.collector.run(checksd=loaded_checks, start_event=self.start_event)

            # Disable the profiler and log the collected stats.
            if profiling_requested() and profiled:
                try:
                    profiler.disable()
                    import pstats
                    from cStringIO import StringIO
                    report = StringIO()
                    pstats.Stats(profiler, stream=report).sort_stats("cumulative").print_stats()
                    log.debug(report.getvalue())
                except Exception:
                    log.warn("Cannot disable profiler")

            if self.autorestart and self._should_restart():
                self._do_restart()

            # Exit quickly when asked to stop; otherwise prepare the next loop.
            if not self.run_forever:
                break
            if watchdog:
                watchdog.reset()
            time.sleep(sleep_seconds)

        # Best-effort clean-up of the persisted status.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Exit explicitly, because the process might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self, agentConfig):
        """Return the list of emitters (only the HTTP emitter)."""
        return [http_emitter]

    def _get_watchdog(self, check_freq, agentConfig):
        """Return a freshly reset Watchdog, or None when disabled in the config."""
        if not agentConfig.get("watchdog", True):
            return None
        memory_limit = agentConfig.get('limit_memory_consumption', None)
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER, max_mem_mb=memory_limit)
        watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill ``hostname`` with the EC2 instance id when configured to (DEPRECATED).

        Only applies when no hostname is set and ``use_ec2_instance_id`` is
        truthy. The (possibly modified) ``agentConfig`` is returned.
        """
        wants_instance_id = agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id')
        if not wants_instance_id:
            return agentConfig

        instance_id = EC2.get_instance_id(agentConfig)
        if instance_id is None:
            log.info('Not running on EC2, using hostname to identify this server')
        else:
            log.info("Running on EC2, instanceId: %s" % instance_id)
            agentConfig['hostname'] = instance_id
        return agentConfig

    def _should_restart(self):
        """Tell whether more than ``restart_interval`` seconds passed since start."""
        return time.time() - self.agent_start > self.restart_interval

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def run(self, config=None):
    """Main loop of the collector.

    Runs the collector every ``check_freq`` seconds while
    ``self.run_forever`` is true, then removes the latest status and exits the
    process with status 0.

    :param config: agent configuration; when falsy it is obtained from
        ``get_config(parse_args=True)``.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    self.collector = Collector(agentConfig, emitters, systemStats)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up. This is deliberately best-effort, but only ordinary
    # errors are ignored: a bare `except:` would also swallow SystemExit and
    # KeyboardInterrupt.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
class Agent(Daemon):
    """Daemon that runs the collector loop in a background process."""

    def __init__(self, pidfile, autorestart, start_event=True):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event

    def _handle_sigterm(self, signum, frame):
        """Signal handler: end the run loop, then stop JMXFetch and the collector."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if JMXFetch.is_running():
            JMXFetch.stop()

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Signal handler: stop like SIGTERM, then trigger a restart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def info(self, verbose=None):
        """Print the latest collector status and return the printer's result."""
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Run the collector loop until the agent is asked to stop.

        :param config: agent configuration; when falsy it is obtained from
            ``get_config(parse_args=True)``.
        """
        # SIGTERM and SIGINT (keyboard interrupt) share the same handler;
        # SIGUSR1 signals an exit with an autorestart.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Record the agent start-up stats.
        CollectorStatus().persist()

        # Build everything the collector needs.
        if not config:
            config = get_config(parse_args=True)
        agentConfig = self._set_agent_config_hostname(config)
        system_stats = get_system_stats()
        emitter_list = self._get_emitters(agentConfig)
        loaded_checks = load_check_directory(agentConfig)
        self.collector = Collector(agentConfig, emitter_list, system_stats)

        # The watchdog is configured from the check frequency.
        sleep_seconds = int(agentConfig['check_freq'])
        watchdog = self._get_watchdog(sleep_seconds, agentConfig)

        # Bookkeeping used by the auto-restarter.
        self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        def profiling_requested():
            # Evaluated on every use, so the configuration is re-read each time.
            return agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes'

        while self.run_forever:
            # Enable the profiler for this run if requested.
            profiled = False
            if profiling_requested():
                try:
                    import cProfile
                    profiler = cProfile.Profile()
                    profiled = True
                    profiler.enable()
                    log.debug("Agent profiling is enabled")
                except Exception:
                    log.warn("Cannot enable profiler")

            self.collector.run(checksd=loaded_checks, start_event=self.start_event)

            # Disable the profiler and log the collected stats.
            if profiling_requested() and profiled:
                try:
                    profiler.disable()
                    import pstats
                    from cStringIO import StringIO
                    report = StringIO()
                    pstats.Stats(profiler, stream=report).sort_stats("cumulative").print_stats()
                    log.debug(report.getvalue())
                except Exception:
                    log.warn("Cannot disable profiler")

            if self.autorestart and self._should_restart():
                self._do_restart()

            # Exit quickly when asked to stop; otherwise prepare the next loop.
            if not self.run_forever:
                break
            if watchdog:
                watchdog.reset()
            time.sleep(sleep_seconds)

        # Best-effort clean-up of the persisted status.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Exit explicitly, because the process might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self, agentConfig):
        """Return the list of emitters (only the HTTP emitter)."""
        return [http_emitter]

    def _get_watchdog(self, check_freq, agentConfig):
        """Return a freshly reset Watchdog, or None when disabled in the config."""
        if not agentConfig.get("watchdog", True):
            return None
        memory_limit = agentConfig.get('limit_memory_consumption', None)
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER, max_mem_mb=memory_limit)
        watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill ``hostname`` with the EC2 instance id when configured to (DEPRECATED).

        Only applies when no hostname is set and ``use_ec2_instance_id`` is
        truthy. The (possibly modified) ``agentConfig`` is returned.
        """
        wants_instance_id = agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id')
        if not wants_instance_id:
            return agentConfig

        instance_id = EC2.get_instance_id()
        if instance_id is None:
            log.info('Not running on EC2, using hostname to identify this server')
        else:
            log.info("Running on EC2, instanceId: %s" % instance_id)
            agentConfig['hostname'] = instance_id
        return agentConfig

    def _should_restart(self):
        """Tell whether more than ``restart_interval`` seconds passed since start."""
        return time.time() - self.agent_start > self.restart_interval

    def _do_restart(self):
        """Stop the collector and exit with the supervisor's restart status."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
class Agent(Daemon):
    """
    The agent class is a daemon that runs the collector in a background process.
    """

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        self._agentConfig = {}
        self._checksd = []
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        self.check_frequency = None
        # this flag can be set to True, False, or a set of checks (for partial reload)
        self.reload_configs_flag = False
        self.sd_backend = None

    def _handle_sigterm(self, signum, frame):
        """Handles SIGTERM and SIGINT, which gracefully stops the agent."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Handles SIGUSR1, which signals an exit with an autorestart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """Handles SIGHUP, which signals a configuration reload."""
        log.info("SIGHUP caught! Scheduling configuration reload before next collection run.")
        self.reload_configs_flag = True

    def reload_configs(self, checks_to_reload=None):
        """Reload the agent configuration and checksd configurations.

        Can also reload only an explicit set of checks: when `checks_to_reload`
        is empty or None every check is stopped and reloaded, otherwise only
        the named checks are refreshed.
        """
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
        else:
            new_checksd = copy(self._checksd)
            self.refresh_specific_checks(hostname, new_checksd, checks_to_reload)
            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")

    def refresh_specific_checks(self, hostname, checksd, checks):
        """take a list of checks and for each of them:
            - remove it from the init_failed_checks if it was there
            - load a fresh config for it
            - replace its old config with the new one in initialized_checks if there was one
            - disable the check if no new config was found
            - otherwise, append it to initialized_checks

        `checksd` is modified in place.
        """
        for check_name in checks:
            idx = None
            for num, check in enumerate(checksd['initialized_checks']):
                if check.name == check_name:
                    idx = num
                    # stop the existing check before reloading it
                    check.stop()

            # Drop any stale failure record for this check. `idx` is an index
            # and may legitimately be 0, so it must never be tested for truthiness.
            if check_name in checksd['init_failed_checks']:
                checksd['init_failed_checks'].pop(check_name)

            fresh_check = load_check(self._agentConfig, hostname, check_name)

            # this is an error dict
            # checks that failed to load are added to init_failed_checks
            # and popped from initialized_checks
            if isinstance(fresh_check, dict) and 'error' in fresh_check.keys():
                checksd['init_failed_checks'][fresh_check.keys()[0]] = fresh_check.values()[0]
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)

            elif not fresh_check:
                # no instance left of it to monitor so the check was not loaded
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)
                # the check was not previously running so we were trying to instantiate it and it failed
                else:
                    log.error("Configuration for check %s was not found, it won't be reloaded." % check_name)

            # successfully reloaded check are added to initialized_checks
            # (appended or replacing a previous version)
            else:
                if idx is not None:
                    checksd['initialized_checks'][idx] = fresh_check
                # it didn't exist before and doesn't need to be replaced so we append it
                else:
                    checksd['initialized_checks'].append(fresh_check)

    @classmethod
    def info(cls, verbose=None):
        """Print the latest collector status, with root logging limited to errors."""
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(
            proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
        )
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

        # In developer mode, the number of runs to be included in a single collector profile.
        # The value is compared against an integer run counter below, so it has to be an int.
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval',
                                      DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except (TypeError, ValueError):
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=True if self.reload_configs_flag else False)

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn('Something went wrong while looking for config template changes: %s' % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up. Removing the status is best effort: a failure here
        # must not prevent the process from exiting.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Return the list of emitters handed to the Collector."""
        return [http_emitter]

    def _get_watchdog(self, check_freq):
        """Return a freshly reset Watchdog, or None when the `watchdog` option is off."""
        watchdog = None
        if self._agentConfig.get("watchdog", True):
            watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
                                max_mem_mb=self._agentConfig.get('limit_memory_consumption', None))
            watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill in `hostname` from the EC2 instance id when requested; return agentConfig."""
        # Try to fetch instance Id from EC2 if not hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
            instanceId = EC2.get_instance_id(agentConfig)
            if instanceId is not None:
                log.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                log.info('Not running on EC2, using hostname to identify this server')
        return agentConfig

    def _should_restart(self):
        """Return True once the agent has been up for longer than `restart_interval`."""
        if time.time() - self.agent_start > self.restart_interval:
            return True
        return False

    def _do_restart(self):
        """Stop the collector and exit with the status the supervisor treats as a restart."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def run(self, config=None):
    """Main loop of the collector.

    Registers the signal handlers, builds the collector from the agent
    configuration and then runs collection cycles until `self.run_forever`
    is cleared. The method always ends by calling `sys.exit`.

    `config` is the agent configuration; when it is falsy the configuration
    is obtained from `get_config(parse_args=True)`.
    """
    # SIGTERM requests a graceful stop.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    if not Platform.is_windows():
        posix_handlers = (
            (signal.SIGUSR1, self._handle_sigusr1),  # exit with an autorestart
            (signal.SIGINT, self._handle_sigterm),   # keyboard interrupt
            (signal.SIGHUP, self._handle_sighup),    # configuration reload
        )
        for signum, handler in posix_handlers:
            signal.signal(signum, handler)

    # Record the start-up status of the agent.
    CollectorStatus().persist()

    # Resolve the configuration and everything the collector needs from it.
    if not config:
        config = get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    procfs_path = self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    system_stats = get_system_stats(proc_path=procfs_path)
    emitters = self._get_emitters()

    # Service discovery backend, when enabled.
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Named pipe used to hand JMX service discovery configs over.
    if _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if not os.access(pipe_path, os.W_OK):
            log.debug(
                'Unable to create pipe in temporary directory. JMX service discovery disabled.'
            )
        else:
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            # Opened read-write to avoid blocking (the agent only writes).
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)
            # Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)

    # checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # JMX configs, if any
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    self.collector = Collector(self._agentConfig, emitters, system_stats, hostname)

    # Developer mode: number of runs covered by a single collector profile.
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval',
                                  DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Watchdog
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Auto-restarter
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    while self.run_forever:
        # Start a profiling session when one is wanted and none is active.
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # The flag is either a set of check names (partial reload) or a
        # plain truthy value (reload everything).
        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. `configs_reloaded` lets the collector know whether it
        # needs to look for the AgentMetrics check and pop it out.
        self.collector.run(
            checksd=self._checksd,
            start_event=self.start_event,
            configs_reloaded=bool(self.reload_configs_flag))

        self.reload_configs_flag = False

        # Look for changes in the config template store; the backend flag
        # becomes truthy when a reload is needed.
        if (self._agentConfig.get('service_discovery') and self.sd_backend
                and not self.sd_backend.reload_check_configs):
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn(
                    'Something went wrong while looking for config template changes: %s'
                    % str(e))

        # Turn a pending backend request (set by the docker_daemon check or by
        # ConfigStore.crawl_config_template) into a reload for the next run.
        if (self._agentConfig.get('service_discovery') and self.sd_backend
                and self.sd_backend.reload_check_configs):
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        # Close the profiling session once enough runs were recorded.
        if profiled and collector_profiled_runs >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
                profiled = False
                collector_profiled_runs = 0
            except Exception as e:
                log.warn("Cannot disable profiler: %s" % str(e))

        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only prepare the next iteration if there will be one.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Clean-up is best effort.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Exit explicitly: the process might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    Registers the signal handlers, builds the collector from the agent
    configuration and runs collection cycles until `self.run_forever` is
    cleared. The method always ends by calling `sys.exit`.

    `config` is the agent configuration; when it is falsy the configuration
    is obtained from `get_config(parse_args=True)`.
    """

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get("service_discovery"):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is compared against an integer run counter below, so it has to be an int;
    # a value that cannot be converted falls back to the default.
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get("collector_profile_interval", DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        )
    except (TypeError, ValueError):
        log.warn(
            "collector_profile_interval is invalid. "
            "Using default value instead (%s)." % DEFAULT_COLLECTOR_PROFILE_INTERVAL
        )
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig["check_freq"])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get("restart_interval", RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd["initialized_checks"])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(
            checksd=self._checksd, start_event=self.start_event, configs_reloaded=self.configs_reloaded
        )

        # This flag is used to know if the check configs have been reloaded at the current
        # run of the agent yet or not. It's used by the collector to know if it needs to
        # look for the AgentMetrics check and pop it out.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if (
            self._agentConfig.get("service_discovery")
            and self.sd_backend
            and not self.sd_backend.reload_check_configs
        ):
            try:
                self.sd_backend.reload_check_configs = get_config_store(self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn("Something went wrong while looking for config template changes: %s" % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get("service_discovery") and self.sd_backend and self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up. Removing the status is best effort: a failure here
    # must not prevent the process from exiting.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
class Agent(Daemon):
    """
    The agent class is a daemon that runs the collector in a background process.
    """

    def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
        Daemon.__init__(self, pidfile, autorestart=autorestart)
        self.run_forever = True
        self.collector = None
        self.start_event = start_event
        self.in_developer_mode = in_developer_mode
        self._agentConfig = {}
        self._checksd = []
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
        self.check_frequency = None
        # this flag can be set to True, False, or a set of checks (for partial reload)
        self.reload_configs_flag = False
        self.sd_backend = None
        self.supervisor_proxy = None
        # file descriptor of the JMX service discovery pipe, None while disabled
        self.sd_pipe = None

    def _handle_sigterm(self, signum, frame):
        """Handles SIGTERM and SIGINT, which gracefully stops the agent."""
        log.debug("Caught sigterm. Stopping run loop.")
        self.run_forever = False

        if self.collector:
            self.collector.stop()
        log.debug("Collector is stopped.")

    def _handle_sigusr1(self, signum, frame):
        """Handles SIGUSR1, which signals an exit with an autorestart."""
        self._handle_sigterm(signum, frame)
        self._do_restart()

    def _handle_sighup(self, signum, frame):
        """Handles SIGHUP, which signals a configuration reload."""
        log.info(
            "SIGHUP caught! Scheduling configuration reload before next collection run."
        )
        self.reload_configs_flag = True

    def reload_configs(self, checks_to_reload=None):
        """Reload the agent configuration and checksd configurations.

        Can also reload only an explicit set of checks: when `checks_to_reload`
        is empty or None every check is stopped and reloaded, otherwise only
        the named checks are refreshed. JMX configs are regenerated and
        submitted when JMX service discovery is enabled.
        """
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)
        jmx_sd_configs = None

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
            if self._jmx_service_discovery_enabled:
                jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        else:
            new_checksd = copy(self._checksd)

            # JMX checks are handed to generate_jmx_configs, the others are
            # refreshed in place.
            all_jmx_checks = get_jmx_checks(auto_conf=True)
            jmx_checks = [
                check for check in checks_to_reload if check in all_jmx_checks
            ]
            py_checks = set(checks_to_reload) - set(jmx_checks)
            self.refresh_specific_checks(hostname, new_checksd, py_checks)
            if self._jmx_service_discovery_enabled:
                jmx_sd_configs = generate_jmx_configs(self._agentConfig,
                                                      hostname, jmx_checks)

            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(
                checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")

    def refresh_specific_checks(self, hostname, checksd, checks):
        """take a set of checks and for each of them:
            - remove it from the init_failed_checks if it was there
            - load a fresh config for it
            - replace its old config with the new one in initialized_checks if there was one
            - disable the check if no new config was found
            - otherwise, append it to initialized_checks

        `checksd` is modified in place.
        """
        for check_name in checks:
            idx = None
            for num, check in enumerate(checksd['initialized_checks']):
                if check.name == check_name:
                    idx = num
                    # stop the existing check before reloading it
                    check.stop()

            # Drop any stale failure record for this check. `idx` is an index
            # and may legitimately be 0, so it must never be tested for truthiness.
            if check_name in checksd['init_failed_checks']:
                checksd['init_failed_checks'].pop(check_name)

            fresh_check = load_check(self._agentConfig, hostname, check_name)

            # this is an error dict
            # checks that failed to load are added to init_failed_checks
            # and popped from initialized_checks
            if isinstance(fresh_check, dict) and 'error' in fresh_check.keys():
                checksd['init_failed_checks'][fresh_check.keys()[0]] = fresh_check.values()[0]
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)

            elif not fresh_check:
                # no instance left of it to monitor so the check was not loaded
                if idx is not None:
                    checksd['initialized_checks'].pop(idx)
                # the check was not previously running so we were trying to instantiate it and it failed
                else:
                    log.error(
                        "Configuration for check %s was not found, it won't be reloaded."
                        % check_name)

            # successfully reloaded check are added to initialized_checks
            # (appended or replacing a previous version)
            else:
                if idx is not None:
                    checksd['initialized_checks'][idx] = fresh_check
                # it didn't exist before and doesn't need to be replaced so we append it
                else:
                    checksd['initialized_checks'].append(fresh_check)

    @classmethod
    def info(cls, verbose=None):
        """Print the latest collector status, with root logging limited to errors."""
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=verbose)

    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        if not Platform.is_windows():
            # A SIGUSR1 signals an exit with an autorestart
            signal.signal(signal.SIGUSR1, self._handle_sigusr1)

            # Handle Keyboard Interrupt
            signal.signal(signal.SIGINT, self._handle_sigterm)

            # A SIGHUP signals a configuration reload
            signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(proc_path=self._agentConfig.get(
            'procfs_path', '/proc').rstrip('/'))
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        if _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
            pipe_path = get_jmx_pipe_path()
            if Platform.is_windows():
                pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
            else:
                pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

            if os.access(pipe_path, os.W_OK):
                if not os.path.exists(pipe_name):
                    os.mkfifo(pipe_name)
                self.sd_pipe = os.open(
                    pipe_name,
                    os.O_RDWR)  # RW to avoid blocking (will only W)

                # Initialize Supervisor proxy
                self.supervisor_proxy = self._get_supervisor_socket(
                    self._agentConfig)
            else:
                log.debug(
                    'Unable to create pipe in temporary directory. JMX service discovery disabled.'
                )

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Load JMX configs if available
        if self._jmx_service_discovery_enabled:
            jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
            if jmx_sd_configs:
                self._submit_jmx_service_discovery(jmx_sd_configs)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        try:
            self.collector_profile_interval = int(
                self._agentConfig.get('collector_profile_interval',
                                      DEFAULT_COLLECTOR_PROFILE_INTERVAL))
        except ValueError:
            log.warn('collector_profile_interval is invalid. '
                     'Using default value instead (%s).' %
                     DEFAULT_COLLECTOR_PROFILE_INTERVAL)
            self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        self.allow_profiling = self._agentConfig.get('allow_profiling', True)

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.allow_profiling and self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            if self.reload_configs_flag:
                if isinstance(self.reload_configs_flag, set):
                    self.reload_configs(
                        checks_to_reload=self.reload_configs_flag)
                else:
                    self.reload_configs()

            # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
            # look for the AgentMetrics check and pop it out.
            self.collector.run(
                checksd=self._checksd,
                start_event=self.start_event,
                configs_reloaded=True if self.reload_configs_flag else False)

            self.reload_configs_flag = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(
                    self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up. Removing the status is best effort: a failure here
        # must not prevent the process from exiting.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)

    def _get_emitters(self):
        """Return the list of emitters handed to the Collector."""
        return [http_emitter]

    def _get_watchdog(self, check_freq):
        """Return a freshly reset Watchdog, or None when the `watchdog` option is off."""
        watchdog = None
        if self._agentConfig.get("watchdog", True):
            watchdog = Watchdog.create(check_freq * WATCHDOG_MULTIPLIER)
            watchdog.reset()
        return watchdog

    def _set_agent_config_hostname(self, agentConfig):
        """Fill in `hostname` from the EC2 instance id when requested; return agentConfig."""
        # Try to fetch instance Id from EC2 if not hostname has been set
        # in the config file.
        # DEPRECATED
        if agentConfig.get('hostname') is None and agentConfig.get(
                'use_ec2_instance_id'):
            instanceId = EC2.get_instance_id(agentConfig)
            if instanceId is not None:
                log.info("Running on EC2, instanceId: %s" % instanceId)
                agentConfig['hostname'] = instanceId
            else:
                log.info(
                    'Not running on EC2, using hostname to identify this server'
                )
        return agentConfig

    def _get_supervisor_socket(self, agentConfig):
        """Return an XML-RPC proxy bound to the supervisor unix socket (None on Windows)."""
        if Platform.is_windows():
            return None

        sockfile = agentConfig.get('supervisor_socket',
                                   DEFAULT_SUPERVISOR_SOCKET)
        supervisor_proxy = xmlrpclib.ServerProxy(
            'http://127.0.0.1',
            transport=supervisor.xmlrpc.SupervisorTransport(
                None, None,
                serverurl="unix://{socket}".format(socket=sockfile)))

        return supervisor_proxy

    @property
    def _jmx_service_discovery_enabled(self):
        """True once the JMX service discovery pipe has been opened."""
        return self.sd_pipe is not None

    def _submit_jmx_service_discovery(self, jmx_sd_configs):
        """Write the given JMX configs (name -> YAML text) to the service discovery pipe.

        Does nothing when there is no config or no open pipe. When a supervisor
        proxy is available, the JMX process is started first if it is stopped.
        """
        # Same test as `_jmx_service_discovery_enabled`: a descriptor is valid
        # whenever it is not None, including the value 0.
        if not jmx_sd_configs or self.sd_pipe is None:
            return

        if self.supervisor_proxy is not None:
            jmx_state = self.supervisor_proxy.supervisor.getProcessInfo(
                JMX_SUPERVISOR_ENTRY)
            log.debug("Current JMX check state: %s", jmx_state['statename'])
            # restart jmx if stopped
            if jmx_state['statename'] in [
                    'STOPPED', 'EXITED', 'FATAL'
            ] and self._agentConfig.get('sd_jmx_enable'):
                self.supervisor_proxy.supervisor.startProcess(
                    JMX_SUPERVISOR_ENTRY)
                time.sleep(JMX_GRACE_SECS)
        else:
            log.debug(
                "Unable to automatically start jmxfetch on Windows via supervisor."
            )

        payload = ""
        for name, yaml in jmx_sd_configs.iteritems():
            try:
                payload += SD_CONFIG_SEP
                payload += "# {}\n".format(name)
                payload += yaml
            except Exception as e:
                log.exception("unable to submit YAML via RPC: %s", e)
            else:
                log.info("JMX SD Config via named pip %s successfully.", name)

        if payload:
            os.write(self.sd_pipe, payload)

    def _should_restart(self):
        """Return True once the agent has been up for longer than `restart_interval`."""
        if time.time() - self.agent_start > self.restart_interval:
            return True
        return False

    def _do_restart(self):
        """Stop the collector and exit with the status the supervisor treats as a restart."""
        log.info("Running an auto-restart.")
        if self.collector:
            self.collector.stop()
        sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def main():
    """Command-line entry point: parse arguments and dispatch the requested command.

    The first positional argument selects the command. Commands listed in
    COMMANDS_AGENT operate on an `Agent` instance; the others run without one.

    Returns:
        2 when no command is given, 3 for an unknown command, 1 when `check`
        is invoked without a check name, and the result of `Agent.info` for
        `info`. The other branches shown here fall through to the end of the
        dispatch chain without returning a value of their own.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get("autorestart", False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get("developer_mode")

    COMMANDS_AGENT = ["start", "stop", "restart", "status", "foreground"]
    COMMANDS_NO_AGENT = ["info", "check", "configcheck", "jmx", "flare"]
    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    # Only the daemon-control commands need an Agent instance.
    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info("Agent version %s", get_version())

    if "start" == command:
        log.info("Start daemon")
        agent.start()

    elif "stop" == command:
        log.info("Stop daemon")
        agent.stop()

    elif "restart" == command:
        log.info("Restart daemon")
        agent.restart()

    elif "status" == command:
        agent.status()

    elif "info" == command:
        return Agent.info(verbose=options.verbose)

    elif "foreground" == command:
        logging.info("Running in foreground")
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info("Running Agent with auto-restart ON")

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif "check" == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            loaded_checks = load_check_directory(agentConfig, hostname)
            for check in loaded_checks["initialized_checks"]:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                    if len(args) == 3 and args[2] == "check_rate":
                        # Rates need two samples, so run the check a second time.
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    # NOTE(review): "configtest" is not listed in COMMANDS, so the validation
    # above rejects it before this branch can ever match it.
    elif "configcheck" == command or "configtest" == command:
        configcheck()
        if agentConfig.get("service_discovery", False):
            # set the TRACE_CONFIG flag to True to make load_check_directory return
            # the source of config objects.
            # Then call load_check_directory here and pass the result to sd_configcheck
            # to avoid circular imports
            agentConfig[TRACE_CONFIG] = True
            print("\nLoading check configurations...\n\n")
            # configs maps check_name -> (config_source, config)
            configs = load_check_directory(agentConfig, hostname)
            sd_configcheck(agentConfig, configs)

    elif "jmx" == command:
        jmx_command(args[1:], agentConfig)

    elif "flare" == command:
        Flare.check_user_rights()
        # Optional second argument is the case id to attach the flare to.
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print("The upload failed:\n{0}".format(str(e)))