def getsnapshot():
    logger.debug('Received getsnapshot request')
    framefile = tempfile.mktemp(prefix='frame.')
    try:
        args = {}
        try:
            # bottle.request.body is a file-like object, not a callable
            crawl_options = json.loads(bottle.request.body.read())
            if crawl_options:
                args['options'] = crawl_options
        except:
            pass  # benign - no json payload found in the request
        if bottle.request.query:
            value = bottle.request.query.get('features', None)
            if value:
                args['features'] = value
            value = bottle.request.query.get('since', None)
            if value and value in ['EPOCH', 'BOOT', 'LASTSNAPSHOT']:
                args['since'] = value
        args['url'] = 'file://{0}'.format(framefile)
        args['compress'] = False
        crawlutils.snapshot(**args)
        bottle.response.content_type = 'text/csv'
        with open(framefile, 'r') as fd:
            for line in fd:
                yield line
        os.remove(framefile)
    except:
        if os.path.exists(framefile):
            os.remove(framefile)
        bottle.response.content_type = 'application/json'
        yield json.dumps(
            {'success': False,
             'stacktrace': traceback.format_exc().split('\n')},
            indent=2)
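
# Example (assumption): if the handler above is exposed via a bottle route
# such as /getsnapshot on CRAWLER_HOST:CRAWLER_PORT, a client could stream
# one CSV frame roughly like this. The route path, host and port here are
# placeholders, and the 'requests' library is not used elsewhere in this file.
'''
import requests

resp = requests.get('http://localhost:9999/getsnapshot',
                    params={'features': 'os,process', 'since': 'EPOCH'},
                    stream=True)
for line in resp.iter_lines():
    print line
'''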
def crawler_worker(process_id, logfile, params):
    setup_logger('crawlutils', logfile, process_id)

    # Starting message
    logger.info('*' * 50)
    logger.info('Crawler #%d started.' % (process_id))
    logger.info('*' * 50)

    crawlutils.snapshot(**params)
def start_autonomous_crawler(params, process_count, logfile):
    params['parent_pid'] = int(os.getpid())

    if params['crawlmode'] == 'OUTCONTAINER':
        jobs = []

        # XXX(ricarkol): when these worker processes are eventually replaced
        # by a proper pool of working threads, the caches of previous metrics
        # have to move out of the FeaturesCrawler objects and be shared among
        # all the working threads.
        for index in xrange(process_count):
            params['process_id'] = index
            params['process_count'] = process_count
            p = multiprocessing.Process(name="crawler-%s" % (index),
                                        target=crawler_worker,
                                        args=(index, logfile, params))
            jobs.append((p, index))
            p.start()
            logger.info("Crawler %s (pid=%s) started", index, p.pid)

        """
        Monitor the children. The behavior is to wait for all children to
        terminate, or to exit and raise an exception when any of the
        processes crashes.
        """
        while jobs:
            for index, (job, process_id) in enumerate(jobs):
                if not job.is_alive():
                    exitcode = job.exitcode
                    pname = job.name
                    pid = job.pid
                    if job.exitcode:
                        logger.info("%s terminated unexpectedly with "
                                    "errorcode %s" % (pname, exitcode))
                        for other_job, other_process_id in jobs:
                            if other_job != job:
                                logger.info("Terminating crawler %s (pid=%s)",
                                            other_process_id, other_job.pid)
                                os.kill(other_job.pid, 9)
                        logger.info("Exiting as all jobs were terminated.")
                        raise RuntimeError("%s terminated unexpectedly with "
                                           "errorcode %s" % (pname, exitcode))
                    else:
                        logger.info("Crawler %s (pid=%s) exited normally.",
                                    process_id, pid)
                        del jobs[index]
            time.sleep(0.1)
        logger.info("Exiting as there are no more processes running.")
    else:
        # INVM, OUTVM, and others
        setup_logger("crawlutils", logfile, 0)
        crawlutils.snapshot(**params)
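
# Sketch (assumption): each worker receives 'process_id' and 'process_count'
# so the crawl load can be split across workers. One plausible rule, matching
# the 'equally_by_pid' partition strategy name used elsewhere in this project,
# is a simple modulo split; the real partition logic lives inside crawlutils
# and may differ.
'''
def assigned_to_this_worker(container_pid, process_id, process_count):
    # A container is crawled by exactly one worker: the one whose slot
    # matches the container pid modulo the number of workers.
    return (container_pid % process_count) == process_id
'''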
    start_autonomous_crawler(params, args.numprocesses, args.logfile)
else:
    print ''
    print 'Starting crawler at URL http://{0}:{1}'.format(CRAWLER_HOST,
                                                          CRAWLER_PORT)
    print 'Log output will be in /var/log/crawler.log'
    print ''
    logging.basicConfig(filename='/var/log/crawler.log',
                        filemode='w',
                        format='%(asctime)s %(levelname)s : %(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.info('Started crawler at URL http://{0}:{1}'.format(CRAWLER_HOST,
                                                               CRAWLER_PORT))
    logger.info('Log output will be in /var/log/crawler.log')
    app.run(host=CRAWLER_HOST, port=CRAWLER_PORT, quiet=True)


# Example Usage #1: crawl all features with default options
'''
crawlutils.snapshot()
'''

# Example Usage #2: crawl selected features with custom options, emit frame
# to local file
'''
my_crawl_commands = [
    ('os', None),
    ('disk', None),
    ('process', None),
    ('connection', None),  # these features don't take options
    ('file', {'root_dir': '/',
              'exclude_dirs': ['boot', 'dev', 'mnt', 'proc', 'sys']}),
    ('config', {'root_dir': '/',
                'known_config_files': ['etc/passwd', 'etc/hosts', 'etc/issue',
                                       'etc/mtab', 'etc/group'],
                'discover_config_files': True})
]
crawlutils.snapshot(emit_to_url='file://frame.csv',
                    crawl_commands=my_crawl_commands)
'''

# Example Usage #3 (UDeploy use case): crawl "file" features and use a
# customer root_dir_alias, emit frame to local file
'''