def get_filtered_list_of_containers(
        options=defaults.DEFAULT_CRAWL_OPTIONS,
        host_namespace=misc.get_host_ipaddr()):
    """
    Returns a partition of all the Container objects currently running in
    the system and sets the `namespace` and metadata of these containers.
    The partitioning is given by `partition_strategy`.
    """
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    container_opts = {'host_namespace': host_namespace,
                      'environment': environment,
                      'long_id_to_namespace_map': _map,
                      }

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert(partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, container_opts)
    for container in containers_list:

        # There are docker and non-docker containers in this list. An example
        # of a non-docker container is a chromium-browser process.
        # TODO(kollerr): the logic that defines whether a container is
        # acceptable to a plugin or not should be in the plugin itself.
        if (environment != defaults.DEFAULT_ENVIRONMENT and
                not container.is_docker_container()):
            continue

        # The partition strategy is to split all the containers equally by
        # process pid. We do it by hashing the long_id of the container.
        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:
            filtered_list.append(container)

    return filtered_list
def get_filtered_list_of_containers(
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    host_namespace=misc.get_host_ipaddr(),
):
    """
    Returns a partition of all the Container objects currently running in
    the system and sets the `namespace` and metadata of these containers.
    The partitioning is given by `partition_strategy`.
    """
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    namespace_opts = {'host_namespace': host_namespace,
                      'environment': environment,
                      'long_id_to_namespace_map': _map}

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert(partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, namespace_opts)
    for container in containers_list:

        # The partition strategy is to split all the containers equally by
        # process pid. We do it by hashing the long_id of the container.
        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:
            try:
                container.setup_namespace_and_metadata(namespace_opts)
            except ContainerInvalidEnvironment:
                continue
            if not container.namespace:
                continue
            filtered_list.append(container)

    return filtered_list
def get_filtered_list_of_containers(
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    host_namespace=misc.get_host_ipaddr(),
):
    """
    Returns a partition of all the Container objects currently running in
    the system and sets the `namespace` and metadata of these containers.
    The partitioning is given by `partition_strategy`.
    """
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    container_opts = {'host_namespace': host_namespace,
                      'environment': environment,
                      'long_id_to_namespace_map': _map,
                      'container_logs': options['logcrawler']['default_log_files'],
                      }

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert(partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, container_opts)
    for container in containers_list:

        # The partition strategy is to split all the containers equally by
        # process pid. We do it by hashing the long_id of the container.
        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:
            try:
                container.setup_namespace_and_metadata(container_opts)
            except ContainerInvalidEnvironment:
                continue
            if not container.namespace:
                continue
            filtered_list.append(container)

    return filtered_list
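
# Illustrative sketch, not part of the crawler code above: the partitioning
# maps a container's long_id (a hex string) onto one of `num_processes`
# buckets, so each crawler process gets a stable, disjoint subset of the
# running containers. The helper name `partition_bucket` and the sample IDs
# are made up for this example.
def partition_bucket(long_id, num_processes):
    """Return the index of the crawler process that would own `long_id`."""
    return int(long_id, 16) % int(num_processes)

# Example: with 4 crawler processes, these (fake) 64-char hex IDs always land
# in the same bucket, so repeated listings keep the same split:
#   partition_bucket('a3f4' * 16, 4)   # deterministic value in 0..3
#   partition_bucket('01ff' * 16, 4)   # deterministic value in 0..3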
def snapshot(
    urls=['stdout://'],
    namespace=misc.get_host_ipaddr(),
    features=defaults.DEFAULT_FEATURES_TO_CRAWL,
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    since='BOOT',
    frequency=-1,
    crawlmode=Modes.INVM,
    inputfile='Undefined',
    format='csv',
    overwrite=False,
):
    """Entrypoint for crawler functionality.

    This is the function executed by long running crawler processes. It
    just loops, sleeping for `frequency` seconds at each crawl interval.
    During each interval, it collects the features listed in `features`,
    and sends them to the outputs listed in `urls`.

    :param urls: The urls used as the outputs of the snapshot.
    :param namespace: This is a pointer to a specific system (e.g. IP for INVM).
    :param features: List of features to crawl.
    :param options: Tree of options with details like what config files.
    :param since: Calculate deltas or not. XXX needs some work.
    :param frequency: Sleep duration between iterations. -1 means just one run.
    :param crawlmode: What's the system we want to crawl.
    :param inputfile: Applies to mode.FILE. The frame emitted is this file.
    :param format: The format of the frame, defaults to csv.
    """
    global should_exit
    saved_args = locals()
    logger.debug('snapshot args: %s' % (saved_args))

    assert('metadata' in options)
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)

    since_timestamp, last_snapshot_time = get_initial_since_values(since)

    snapshot_num = 0

    # Die if the parent dies
    PR_SET_PDEATHSIG = 1
    libc.prctl(PR_SET_PDEATHSIG, signal.SIGHUP)
    signal.signal(signal.SIGHUP, signal_handler_exit)

    if crawlmode == Modes.OUTCONTAINER:
        containers = get_filtered_list_of_containers(options, namespace)

    # This is the main loop of the system, taking a snapshot and sleeping at
    # every iteration.
    while True:
        snapshot_time = int(time.time())

        if crawlmode == Modes.OUTCONTAINER:

            curr_containers = get_filtered_list_of_containers(
                options, namespace)
            deleted = [c for c in containers if c not in curr_containers]
            containers = curr_containers

            for container in deleted:
                if options.get('link_container_log_files', False):
                    container.unlink_logfiles(options)

            logger.debug('Crawling %d containers' % (len(containers)))

            for container in containers:

                logger.info(
                    'Crawling container %s %s %s' %
                    (container.pid, container.short_id, container.namespace))

                if options.get('link_container_log_files', False):
                    # This is a NOP if files are already linked (which is
                    # pretty much always).
                    container.link_logfiles(options=options)

                snapshot_container(urls=urls,
                                   snapshot_num=snapshot_num,
                                   features=features,
                                   options=options,
                                   format=format,
                                   inputfile=inputfile,
                                   container=container,
                                   since=since,
                                   since_timestamp=since_timestamp,
                                   overwrite=overwrite)

        elif crawlmode in (Modes.INVM, Modes.MOUNTPOINT, Modes.DEVICE,
                           Modes.FILE, Modes.ISCSI):
            snapshot_generic(crawlmode=crawlmode,
                             urls=urls,
                             snapshot_num=snapshot_num,
                             features=features,
                             options=options,
                             format=format,
                             inputfile=inputfile,
                             namespace=namespace,
                             since=since,
                             since_timestamp=since_timestamp,
                             overwrite=overwrite)
        else:
            raise RuntimeError('Unknown Mode')

        # Frequency < 0 means only one run.
        if frequency < 0 or should_exit:
            logger.info('Bye')
            break

        if since == 'LASTSNAPSHOT':
            # Subsequent snapshots will update this value.
            since_timestamp = snapshot_time

        time.sleep(frequency)
        snapshot_num += 1
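
# `libc`, `signal_handler_exit` and `should_exit` are used above but not
# defined in this snippet. A minimal sketch, assuming the usual pattern, is
# given below: libc is loaded through ctypes, and prctl(PR_SET_PDEATHSIG,
# SIGHUP) asks the kernel to deliver SIGHUP to this process when its parent
# dies, so orphaned crawler processes shut down instead of lingering.
import ctypes
import ctypes.util
import signal

libc = ctypes.CDLL(ctypes.util.find_library('c'))
should_exit = False


def signal_handler_exit(signum, stack):
    # Assumed behavior: flag the main loop to stop after the current
    # iteration rather than killing the process mid-snapshot.
    global should_exit
    should_exit = True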
import os
import logging.handlers
import time
import multiprocessing
import argparse
import json

from config_parser import (get_config, apply_user_args)

# External dependencies that must be pip install'ed separately
import misc
import crawlutils
from crawlmodes import Modes

CRAWLER_HOST = misc.get_host_ipaddr()

logger = None


def csv_list(string):
    return string.split(',')


def setup_logger(logger_name, logfile='crawler.log', process_id=None):
    _logger = logging.getLogger(logger_name)
    _logger.setLevel(logging.INFO)
    (logfile_name, logfile_xtnsion) = os.path.splitext(logfile)
    if process_id is None:
        fname = logfile
    else:
        # The original snippet is truncated here; the assumed intent is one
        # log file per worker process, e.g. crawler-0.log, crawler-1.log.
        fname = '{0}-{1}{2}'.format(logfile_name, process_id, logfile_xtnsion)
def snapshot(
    urls=['stdout://'],
    namespace=misc.get_host_ipaddr(),
    features=config_parser.get_config()['general']['features_to_crawl'],
    options={},
    frequency=-1,
    crawlmode=Modes.INVM,
    format='csv',
    overwrite=False,
    first_snapshot_num=0,
    max_snapshots=-1
):
    """Entrypoint for crawler functionality.

    This is the function executed by long running crawler processes. It
    just loops, sleeping for `frequency` seconds at each crawl interval.
    During each interval, it collects the features listed in `features`,
    and sends them to the outputs listed in `urls`.

    :param urls: The urls used as the outputs of the snapshot.
    :param namespace: This is a pointer to a specific system (e.g. IP for INVM).
    :param features: List of features to crawl.
    :param options: Tree of options with details like what config files.
    :param frequency: Target time period for iterations. -1 means just one run.
    :param crawlmode: What's the system we want to crawl.
    :param format: The format of the frame, defaults to csv.
    """
    global should_exit
    saved_args = locals()
    logger.debug('snapshot args: %s' % (saved_args))

    environment = options.get(
        'environment', config_parser.get_config()['general']['environment'])
    plugin_places = options.get(
        'plugin_places', config_parser.get_config()['general']['plugin_places'])
    plugin_mode = config_parser.get_config()['general']['plugin_mode']

    plugins_manager.reload_env_plugin(plugin_places=plugin_places,
                                      environment=environment)
    plugins_manager.reload_container_crawl_plugins(plugin_places=plugin_places,
                                                   features=features,
                                                   plugin_mode=plugin_mode)
    plugins_manager.reload_vm_crawl_plugins(plugin_places=plugin_places,
                                            features=features,
                                            plugin_mode=plugin_mode)
    plugins_manager.reload_host_crawl_plugins(plugin_places=plugin_places,
                                              features=features,
                                              plugin_mode=plugin_mode)

    next_iteration_time = None

    snapshot_num = first_snapshot_num

    # Die if the parent dies
    PR_SET_PDEATHSIG = 1
    try:
        libc.prctl(PR_SET_PDEATHSIG, signal.SIGHUP)
        signal.signal(signal.SIGHUP, signal_handler_exit)
    except AttributeError:
        logger.warning('prctl is not available. MacOS is not supported.')

    containers = []

    # This is the main loop of the system, taking a snapshot and sleeping at
    # every iteration.
    while True:
        snapshot_time = int(time.time())

        if crawlmode == Modes.OUTCONTAINER:
            containers = snapshot_containers(
                containers=containers,
                urls=urls,
                snapshot_num=snapshot_num,
                features=features,
                options=options,
                format=format,
                overwrite=overwrite,
                host_namespace=namespace,
            )
        elif crawlmode == Modes.MESOS:
            snapshot_mesos(
                crawlmode=crawlmode,
                urls=urls,
                snapshot_num=snapshot_num,
                options=options,
                format=format,
                overwrite=overwrite,
                namespace=namespace,
            )
        elif crawlmode == Modes.OUTVM:
            snapshot_vms(
                urls=urls,
                snapshot_num=snapshot_num,
                features=features,
                options=options,
                format=format,
                overwrite=overwrite,
                namespace=namespace,
            )
        elif crawlmode in [Modes.INVM, Modes.MOUNTPOINT]:
            snapshot_generic(crawlmode=crawlmode,
                             urls=urls,
                             snapshot_num=snapshot_num,
                             features=features,
                             options=options,
                             format=format,
                             namespace=namespace,
                             overwrite=overwrite)
        else:
            raise NotImplementedError('Crawl mode %s is not implemented' %
                                      crawlmode)

        # Frequency < 0 means only one run.
        if (frequency < 0 or should_exit or snapshot_num == max_snapshots):
            logger.info('Bye')
            break

        time_to_sleep, next_iteration_time = _get_next_iteration_time(
            next_iteration_time, frequency, snapshot_time)
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)

        snapshot_num += 1
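
# _get_next_iteration_time() is called above but not defined in this snippet.
# A minimal sketch is given here, assuming it mirrors the inline scheduling
# logic of the older snapshot() variant further below: iterations stay
# aligned to a fixed period, and whole periods are skipped when a crawl took
# longer than `frequency`, rather than catching up in a burst.
def _get_next_iteration_time(next_iteration_time, frequency, snapshot_time):
    """Return (time_to_sleep, next_iteration_time) for the next crawl."""
    if next_iteration_time is None:
        next_iteration_time = snapshot_time + frequency
    else:
        next_iteration_time += frequency

    # Drop iterations we already missed instead of running them back to back.
    while next_iteration_time + frequency < time.time():
        next_iteration_time += frequency

    time_to_sleep = next_iteration_time - time.time()
    return time_to_sleep, next_iteration_time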
def snapshot(
    urls=['stdout://'],
    namespace=misc.get_host_ipaddr(),
    features=defaults.DEFAULT_FEATURES_TO_CRAWL,
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    since='BOOT',
    frequency=-1,
    crawlmode=Modes.INVM,
    inputfile='Undefined',
    format='csv',
    overwrite=False,
):
    """Entrypoint for crawler functionality.

    This is the function executed by long running crawler processes. It
    just loops, sleeping for `frequency` seconds at each crawl interval.
    During each interval, it collects the features listed in `features`,
    and sends them to the outputs listed in `urls`.

    :param urls: The urls used as the outputs of the snapshot.
    :param namespace: This is a pointer to a specific system (e.g. IP for INVM).
    :param features: List of features to crawl.
    :param options: Tree of options with details like what config files.
    :param since: Calculate deltas or not. XXX needs some work.
    :param frequency: Target time period for iterations. -1 means just one run.
    :param crawlmode: What's the system we want to crawl.
    :param inputfile: Applies to mode.FILE. The frame emitted is this file.
    :param format: The format of the frame, defaults to csv.
    """
    global should_exit
    saved_args = locals()
    logger.debug('snapshot args: %s' % (saved_args))

    assert('metadata' in options)
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    plugin_places = options.get('plugin_places',
                                defaults.DEFAULT_PLUGIN_PLACES).split(',')
    plugins_manager.reload_env_plugin(plugin_places=plugin_places,
                                      environment=environment)

    since_timestamp, last_snapshot_time = get_initial_since_values(since)

    next_iteration_time = None

    snapshot_num = 0

    # Die if the parent dies
    PR_SET_PDEATHSIG = 1
    libc.prctl(PR_SET_PDEATHSIG, signal.SIGHUP)
    signal.signal(signal.SIGHUP, signal_handler_exit)

    if crawlmode == Modes.OUTCONTAINER:
        containers = get_filtered_list_of_containers(options, namespace)

    # This is the main loop of the system, taking a snapshot and sleeping at
    # every iteration.
    while True:
        snapshot_time = int(time.time())

        if crawlmode == Modes.OUTCONTAINER:

            curr_containers = get_filtered_list_of_containers(options,
                                                              namespace)
            deleted = [c for c in containers if c not in curr_containers]
            containers = curr_containers

            for container in deleted:
                if options.get('link_container_log_files', False):
                    try:
                        container.unlink_logfiles(options)
                    except NotImplementedError:
                        pass

            logger.debug('Crawling %d containers' % (len(containers)))

            for container in containers:

                logger.info(
                    'Crawling container %s %s %s' %
                    (container.pid, container.short_id, container.namespace))

                if options.get('link_container_log_files', False):
                    # This is a NOP if files are already linked (which is
                    # pretty much always).
                    try:
                        container.link_logfiles(options=options)
                    except NotImplementedError:
                        pass

                # No feature crawling requested for this run.
                if 'nofeatures' in features:
                    continue

                snapshot_container(
                    urls=urls,
                    snapshot_num=snapshot_num,
                    features=features,
                    options=options,
                    format=format,
                    inputfile=inputfile,
                    container=container,
                    since=since,
                    since_timestamp=since_timestamp,
                    overwrite=overwrite
                )

        elif crawlmode in (Modes.INVM, Modes.MOUNTPOINT, Modes.DEVICE,
                           Modes.FILE, Modes.ISCSI):
            snapshot_generic(
                crawlmode=crawlmode,
                urls=urls,
                snapshot_num=snapshot_num,
                features=features,
                options=options,
                format=format,
                inputfile=inputfile,
                namespace=namespace,
                since=since,
                since_timestamp=since_timestamp,
                overwrite=overwrite
            )
        else:
            raise RuntimeError('Unknown Mode')

        if since == 'LASTSNAPSHOT':
            # Subsequent snapshots will update this value.
            since_timestamp = snapshot_time

        # Frequency < 0 means only one run; frequency == 0 means crawl
        # continuously with no sleep in between.
        if frequency < 0 or should_exit:
            logger.info('Bye')
            break
        elif frequency == 0:
            continue

        if next_iteration_time is None:
            next_iteration_time = snapshot_time + frequency
        else:
            next_iteration_time = next_iteration_time + frequency

        while next_iteration_time + frequency < time.time():
            next_iteration_time = next_iteration_time + frequency

        time_to_sleep = next_iteration_time - time.time()
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)

        snapshot_num += 1
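
# Illustrative usage, not from the original module: a one-shot in-VM crawl of
# the default feature set, emitted as a csv frame to stdout. A long-running
# crawler would pass a positive frequency (in seconds) instead of -1. The
# helper name below is hypothetical.
def _example_one_shot_crawl():
    snapshot(urls=['stdout://'],
             namespace=misc.get_host_ipaddr(),
             features=defaults.DEFAULT_FEATURES_TO_CRAWL,
             options=defaults.DEFAULT_CRAWL_OPTIONS,
             crawlmode=Modes.INVM,
             frequency=-1)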
import cPickle as pickle
import json
import copy

# External dependencies that must be pip install'ed separately
import bottle

import defaults
import misc
import crawlutils
from crawlmodes import Modes

app = bottle.Bottle()

CRAWLER_HOST = misc.get_host_ipaddr()
CRAWLER_PORT = 9999

logger = None

# This dict keeps track of active snapshot tasks on this host
tasks = {}

# This string should be the same as the contents of the README.API file
apihelp = \
    '''
Crawler API
-----------