def _StartAnalysisProcesses(
    self, knowledge_base_object, storage_writer, analysis_plugins,
    data_location, event_filter_expression=None):
  """Starts the analysis processes.

  Args:
    knowledge_base_object (KnowledgeBase): contains information from
        the source data needed for processing.
    storage_writer (StorageWriter): storage writer.
    analysis_plugins (list[AnalysisPlugin]): analysis plugins that
        should be run.
    data_location (str): path to the location that data files should
        be loaded from.
    event_filter_expression (Optional[str]): event filter expression.
  """
  logging.info(u'Starting analysis plugins.')

  for analysis_plugin in analysis_plugins:
    if self._use_zeromq:
      queue_name = u'{0:s} output event queue'.format(analysis_plugin.NAME)
      output_event_queue = zeromq_queue.ZeroMQPushBindQueue(
          name=queue_name, timeout_seconds=self._QUEUE_TIMEOUT)
      # Open the queue so it can bind to a random port, and we can get the
      # port number to use in the input queue.
      output_event_queue.Open()

    else:
      output_event_queue = multi_process_queue.MultiProcessingQueue(
          timeout=self._QUEUE_TIMEOUT)

    self._event_queues[analysis_plugin.NAME] = output_event_queue

    if self._use_zeromq:
      queue_name = u'{0:s} input event queue'.format(analysis_plugin.NAME)
      input_event_queue = zeromq_queue.ZeroMQPullConnectQueue(
          name=queue_name, delay_open=True, port=output_event_queue.port,
          timeout_seconds=self._QUEUE_TIMEOUT)

    else:
      input_event_queue = output_event_queue

    process = analysis_process.AnalysisProcess(
        input_event_queue, storage_writer, knowledge_base_object,
        analysis_plugin, data_location=data_location,
        event_filter_expression=event_filter_expression,
        name=analysis_plugin.plugin_name)

    process.start()

    logging.info(u'Started analysis plugin: {0:s} (PID: {1:d}).'.format(
        analysis_plugin.plugin_name, process.pid))

    self._RegisterProcess(process)
    self._StartMonitoringProcess(process.pid)

  logging.info(u'Analysis plugins running')
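# A minimal sketch of the bind-then-connect pattern used above, written
# against pyzmq directly (an assumption; the code above wraps this in its
# own zeromq_queue module). The PUSH side binds to a random port first so
# the PULL side, which typically lives in another process, knows which
# port to connect to.
import zmq

context = zmq.Context()

# Producer side: bind a PUSH socket to a random local port.
push_socket = context.socket(zmq.PUSH)
port = push_socket.bind_to_random_port('tcp://127.0.0.1')

# Consumer side: connect a PULL socket to the port obtained above.
pull_socket = context.socket(zmq.PULL)
pull_socket.connect('tcp://127.0.0.1:{0:d}'.format(port))

push_socket.send_pyobj({'event': 'example'})
print(pull_socket.recv_pyobj())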
def testMain(self):
  """Tests the _Main function."""
  task_queue = multi_process_queue.MultiProcessingQueue(timeout=1)

  configuration = configurations.ProcessingConfiguration()

  test_process = worker_process.WorkerProcess(
      task_queue, None, None, None, None, configuration, name='TestWorker')
  test_process._abort = True
  test_process._pid = 0

  test_process._Main()
def testPushPopItem(self):
  """Tests the PushItem and PopItem functions."""
  # A timeout is used so that the multi processing queue will close and
  # stop blocking the current process.
  test_queue = multi_process_queue.MultiProcessingQueue(timeout=0.1)

  for item in self._ITEMS:
    test_queue.PushItem(item)

  test_queue_consumer = TestQueueConsumer(test_queue)
  test_queue_consumer.ConsumeItems()

  self.assertEqual(test_queue_consumer.number_of_items, len(self._ITEMS))
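# A minimal sketch of the timeout behavior the test above relies on, using
# the standard library multiprocessing.Queue (an assumption; the
# MultiProcessingQueue above wraps it). Without a timeout, get() on an
# empty queue blocks the current process indefinitely.
import multiprocessing
import queue  # queue.Empty is raised when get() times out.

test_queue = multiprocessing.Queue()
test_queue.put('item')

print(test_queue.get(timeout=0.1))  # Returns 'item'.
try:
  test_queue.get(timeout=0.1)  # The queue is now empty.
except queue.Empty:
  print('Queue empty, get() returned after the timeout.')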
def testMain(self):
  """Tests the _Main function."""
  event_queue = multi_process_queue.MultiProcessingQueue(timeout=1)

  session = sessions.Session()
  storage_writer = self._CreateStorageWriter(session)
  analysis_plugin = TestAnalysisPlugin()

  configuration = configurations.ProcessingConfiguration()

  test_process = analysis_process.AnalysisProcess(
      event_queue, storage_writer, None, analysis_plugin, configuration,
      name='TestAnalysis')
  test_process._abort = True
  test_process._FOREMAN_STATUS_WAIT = 1
  test_process._pid = 0

  test_process._Main()
def ProcessSources(
    self, session_identifier, source_path_specs, storage_writer,
    processing_configuration, enable_sigsegv_handler=False,
    number_of_worker_processes=0, status_update_callback=None,
    worker_memory_limit=None):
  """Processes the sources and extracts events.

  Args:
    session_identifier (str): identifier of the session.
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
        should be enabled.
    number_of_worker_processes (Optional[int]): number of worker processes.
    status_update_callback (Optional[function]): callback function for
        status updates.
    worker_memory_limit (Optional[int]): maximum amount of memory a worker
        is allowed to consume, where None represents the default memory
        limit and 0 represents no limit.

  Returns:
    ProcessingStatus: processing status.
  """
  if number_of_worker_processes < 1:
    # One worker for each "available" CPU (minus other processes).
    # The number here is derived from the fact that the engine starts up:
    # * A main process.
    #
    # If we want to utilize all CPUs on the system we therefore need to
    # start up a number of workers that amounts to the total number of
    # CPUs minus the other processes.
    try:
      cpu_count = multiprocessing.cpu_count() - 1

      if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
        cpu_count = self._WORKER_PROCESSES_MINIMUM

      elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
        cpu_count = self._WORKER_PROCESSES_MAXIMUM

    except NotImplementedError:
      logger.error((
          'Unable to determine number of CPUs, defaulting to {0:d} worker '
          'processes.').format(self._WORKER_PROCESSES_MINIMUM))
      cpu_count = self._WORKER_PROCESSES_MINIMUM

    number_of_worker_processes = cpu_count

  self._enable_sigsegv_handler = enable_sigsegv_handler
  self._number_of_worker_processes = number_of_worker_processes

  if worker_memory_limit is None:
    self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT
  else:
    self._worker_memory_limit = worker_memory_limit

  # Keep track of certain values so we can spawn new extraction workers.
  self._processing_configuration = processing_configuration

  self._debug_output = processing_configuration.debug_output
  self._log_filename = processing_configuration.log_filename
  self._session_identifier = session_identifier
  self._status_update_callback = status_update_callback
  self._storage_writer = storage_writer

  # Set up the task queue.
  if not self._use_zeromq:
    self._task_queue = multi_process_queue.MultiProcessingQueue(
        maximum_number_of_queued_items=self._maximum_number_of_tasks)

  else:
    task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
        delay_open=True, linger_seconds=0, maximum_items=1,
        name='main_task_queue',
        timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
    self._task_queue = task_outbound_queue

    # The ZeroMQ backed queue must be started first, so we can save
    # its port.
    # TODO: raises: attribute-defined-outside-init
    # self._task_queue.name = 'Task queue'
    self._task_queue.Open()
    self._task_queue_port = self._task_queue.port

  self._StartProfiling(self._processing_configuration.profiling)
  self._task_manager.StartProfiling(
      self._processing_configuration.profiling, self._name)

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    storage_writer.SetStorageProfiler(self._storage_profiler)

  # Set up the storage writer before the worker processes.
  storage_writer.StartTaskStorage()

  for worker_number in range(number_of_worker_processes):
    # First argument to _StartWorkerProcess is not used.
    extraction_process = self._StartWorkerProcess('', storage_writer)
    if not extraction_process:
      logger.error('Unable to create worker process: {0:d}'.format(
          worker_number))

  self._StartStatusUpdateThread()

  try:
    # Open the storage file after creating the worker processes, otherwise
    # the ZIP storage file will remain locked as long as the worker
    # processes are alive.
    storage_writer.Open()
    storage_writer.WriteSessionStart()

    try:
      storage_writer.WritePreprocessingInformation(self.knowledge_base)

      self._ProcessSources(source_path_specs, storage_writer)

    finally:
      storage_writer.WriteSessionCompletion(aborted=self._abort)
      storage_writer.Close()

  finally:
    # Stop the status update thread after the storage writer is closed
    # so we include the storage sync to disk in the status updates.
    self._StopStatusUpdateThread()

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(None)

  if self._storage_profiler:
    storage_writer.SetStorageProfiler(None)

  self._task_manager.StopProfiling()
  self._StopProfiling()

  try:
    self._StopExtractionProcesses(abort=self._abort)

  except KeyboardInterrupt:
    self._AbortKill()

    # The abort can leave the main process unresponsive
    # due to incorrectly finalized IPC.
    self._KillProcess(os.getpid())

  # The task queue should be closed by _StopExtractionProcesses; this
  # close is a failsafe, primarily due to MultiProcessingQueue's
  # blocking behavior.
  self._task_queue.Close(abort=True)

  if self._processing_status.error_path_specs:
    task_storage_abort = True
  else:
    task_storage_abort = self._abort

  try:
    storage_writer.StopTaskStorage(abort=task_storage_abort)
  except (IOError, OSError) as exception:
    logger.error(
        'Unable to stop task storage with error: {0!s}'.format(exception))

  if self._abort:
    logger.debug('Processing aborted.')
    self._processing_status.aborted = True
  else:
    logger.debug('Processing completed.')

  # Reset values.
  self._enable_sigsegv_handler = None
  self._number_of_worker_processes = None
  self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT

  self._processing_configuration = None

  self._session_identifier = None
  self._status_update_callback = None
  self._storage_writer = None

  return self._processing_status
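# A standalone sketch of the worker-count heuristic in ProcessSources
# above: reserve one CPU for the main process and clamp the result between
# minimum and maximum bounds. The constant values here are illustrative
# assumptions, not the engine's actual limits.
import multiprocessing

WORKER_PROCESSES_MINIMUM = 2   # Assumed lower bound.
WORKER_PROCESSES_MAXIMUM = 15  # Assumed upper bound.

try:
  # Subtract one CPU to account for the main process.
  cpu_count = multiprocessing.cpu_count() - 1
except NotImplementedError:
  cpu_count = WORKER_PROCESSES_MINIMUM

number_of_worker_processes = min(
    max(cpu_count, WORKER_PROCESSES_MINIMUM), WORKER_PROCESSES_MAXIMUM)
print(number_of_worker_processes)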
def _StartWorkerProcess(self, process_name, storage_writer):
  """Creates, starts, monitors and registers a worker process.

  Args:
    process_name (str): process name.
    storage_writer (StorageWriter): storage writer for a session storage
        used to create task storage.

  Returns:
    MultiProcessWorkerProcess: extraction worker process or None on error.
  """
  analysis_plugin = self._analysis_plugins.get(process_name, None)
  if not analysis_plugin:
    logger.error('Missing analysis plugin: {0:s}'.format(process_name))
    return None

  if self._use_zeromq:
    queue_name = '{0:s} output event queue'.format(process_name)
    output_event_queue = zeromq_queue.ZeroMQPushBindQueue(
        name=queue_name, timeout_seconds=self._QUEUE_TIMEOUT)
    # Open the queue so it can bind to a random port, and we can get the
    # port number to use in the input queue.
    output_event_queue.Open()

  else:
    output_event_queue = multi_process_queue.MultiProcessingQueue(
        timeout=self._QUEUE_TIMEOUT)

  self._event_queues[process_name] = output_event_queue

  if self._use_zeromq:
    queue_name = '{0:s} input event queue'.format(process_name)
    input_event_queue = zeromq_queue.ZeroMQPullConnectQueue(
        name=queue_name, delay_open=True, port=output_event_queue.port,
        timeout_seconds=self._QUEUE_TIMEOUT)

  else:
    input_event_queue = output_event_queue

  process = analysis_process.AnalysisProcess(
      input_event_queue, storage_writer, self._knowledge_base,
      analysis_plugin, self._processing_configuration,
      data_location=self._data_location,
      event_filter_expression=self._event_filter_expression,
      name=process_name)

  process.start()

  logger.info('Started analysis plugin: {0:s} (PID: {1:d}).'.format(
      process_name, process.pid))

  try:
    self._StartMonitoringProcess(process)
  except (IOError, KeyError) as exception:
    logger.error((
        'Unable to monitor analysis plugin: {0:s} (PID: {1:d}) '
        'with error: {2!s}').format(process_name, process.pid, exception))

    process.terminate()
    return None

  self._RegisterProcess(process)
  return process
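# A minimal sketch of the start-then-monitor pattern above using the
# standard library (an assumption; _StartMonitoringProcess attaches the
# engine's own process monitoring). If monitoring cannot be set up, the
# freshly started process is terminated so it does not linger unmanaged.
import multiprocessing

def start_monitored_process(target, monitor):
  """Starts a process and attaches monitoring, terminating it on failure."""
  process = multiprocessing.Process(target=target)
  process.start()

  try:
    monitor(process.pid)  # Hypothetical monitoring hook.
  except (IOError, KeyError):
    process.terminate()
    process.join()
    return None

  return process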
def ProcessSources(
    self, session_identifier, source_path_specs, storage_writer,
    enable_sigsegv_handler=False, filter_find_specs=None,
    filter_object=None, hasher_names_string=None, mount_path=None,
    number_of_worker_processes=0, parser_filter_expression=None,
    preferred_year=None, process_archives=False,
    process_compressed_streams=True, status_update_callback=None,
    show_memory_usage=False, temporary_directory=None,
    text_prepend=None, yara_rules_string=None):
  """Processes the sources and extracts event objects.

  Args:
    session_identifier (str): identifier of the session.
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
        should be enabled.
    filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
        used in path specification extraction.
    filter_object (Optional[objectfilter.Filter]): filter object.
    hasher_names_string (Optional[str]): comma separated string of names
        of hashers to use during processing.
    mount_path (Optional[str]): mount path.
    number_of_worker_processes (Optional[int]): number of worker processes.
    parser_filter_expression (Optional[str]): parser filter expression,
        where None represents all parsers and plugins.
    preferred_year (Optional[int]): preferred year.
    process_archives (Optional[bool]): True if archive files should be
        scanned for file entries.
    process_compressed_streams (Optional[bool]): True if file content in
        compressed streams should be processed.
    show_memory_usage (Optional[bool]): True if memory information should
        be included in status updates.
    status_update_callback (Optional[function]): callback function for
        status updates.
    temporary_directory (Optional[str]): path of the directory for
        temporary files.
    text_prepend (Optional[str]): text to prepend to every event.
    yara_rules_string (Optional[str]): unparsed yara rule definitions.

  Returns:
    ProcessingStatus: processing status.
  """
  if number_of_worker_processes < 1:
    # One worker for each "available" CPU (minus other processes).
    # The number here is derived from the fact that the engine starts up:
    # * A main process.
    #
    # If we want to utilize all CPUs on the system we therefore need to
    # start up a number of workers that amounts to the total number of
    # CPUs minus the other processes.
    try:
      cpu_count = multiprocessing.cpu_count() - 1

      if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
        cpu_count = self._WORKER_PROCESSES_MINIMUM

      elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
        cpu_count = self._WORKER_PROCESSES_MAXIMUM

    except NotImplementedError:
      logging.error((
          u'Unable to determine number of CPUs, defaulting to {0:d} worker '
          u'processes.').format(self._WORKER_PROCESSES_MINIMUM))
      cpu_count = self._WORKER_PROCESSES_MINIMUM

    number_of_worker_processes = cpu_count

  self._enable_sigsegv_handler = enable_sigsegv_handler
  self._number_of_worker_processes = number_of_worker_processes
  self._show_memory_usage = show_memory_usage

  # Keep track of certain values so we can spawn new extraction workers.
  self._filter_find_specs = filter_find_specs
  self._filter_object = filter_object
  self._hasher_names_string = hasher_names_string
  self._mount_path = mount_path
  self._parser_filter_expression = parser_filter_expression
  self._preferred_year = preferred_year
  self._process_archives = process_archives
  self._process_compressed_streams = process_compressed_streams
  self._session_identifier = session_identifier
  self._status_update_callback = status_update_callback
  self._storage_writer = storage_writer
  self._temporary_directory = temporary_directory
  self._text_prepend = text_prepend
  self._yara_rules_string = yara_rules_string

  # Set up the task queue.
  if not self._use_zeromq:
    self._task_queue = multi_process_queue.MultiProcessingQueue(
        maximum_number_of_queued_items=self._maximum_number_of_tasks)

  else:
    task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
        delay_open=True, linger_seconds=0, maximum_items=1,
        name=u'main_task_queue',
        timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
    self._task_queue = task_outbound_queue

    # The ZeroMQ backed queue must be started first, so we can save
    # its port.
    # TODO: raises: attribute-defined-outside-init
    # self._task_queue.name = u'Task queue'
    self._task_queue.Open()
    self._task_queue_port = self._task_queue.port

  self._StartProfiling()

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(self._serializers_profiler)

  # Set up the storage writer before the worker processes.
  storage_writer.StartTaskStorage()

  for _ in range(number_of_worker_processes):
    extraction_process = self._StartExtractionWorkerProcess(storage_writer)
    self._StartMonitoringProcess(extraction_process.pid)

  self._StartStatusUpdateThread()

  try:
    # Open the storage file after creating the worker processes, otherwise
    # the ZIP storage file will remain locked as long as the worker
    # processes are alive.
    storage_writer.Open()
    storage_writer.WriteSessionStart()

    try:
      storage_writer.WritePreprocessingInformation(self.knowledge_base)

      self._ProcessSources(
          source_path_specs, storage_writer,
          filter_find_specs=filter_find_specs)

    finally:
      storage_writer.WriteSessionCompletion(aborted=self._abort)
      storage_writer.Close()

  finally:
    # Stop the status update thread after the storage writer is closed
    # so we include the storage sync to disk in the status updates.
    self._StopStatusUpdateThread()

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(None)

  self._StopProfiling()

  try:
    self._StopExtractionProcesses(abort=self._abort)

  except KeyboardInterrupt:
    self._AbortKill()

    # The abort can leave the main process unresponsive
    # due to incorrectly finalized IPC.
    self._KillProcess(os.getpid())

  # The task queue should be closed by _StopExtractionProcesses; this
  # close is a failsafe, primarily due to MultiProcessingQueue's
  # blocking behaviour.
  self._task_queue.Close(abort=True)

  if self._processing_status.error_path_specs:
    task_storage_abort = True
  else:
    task_storage_abort = self._abort

  try:
    storage_writer.StopTaskStorage(abort=task_storage_abort)
  except (IOError, OSError) as exception:
    logging.error(u'Unable to stop task storage with error: {0!s}'.format(
        exception))

  if self._abort:
    logging.debug(u'Processing aborted.')
    self._processing_status.aborted = True
  else:
    logging.debug(u'Processing completed.')

  # Reset values.
  self._enable_sigsegv_handler = None
  self._number_of_worker_processes = None
  self._show_memory_usage = None

  self._filter_find_specs = None
  self._filter_object = None
  self._hasher_names_string = None
  self._mount_path = None
  self._parser_filter_expression = None
  self._preferred_year = None
  self._process_archives = None
  self._process_compressed_streams = None
  self._session_identifier = None
  self._status_update_callback = None
  self._storage_writer = None
  self._text_prepend = None

  return self._processing_status
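# A minimal sketch of the storage writer session contract used in both
# ProcessSources variants above (hypothetical helper; the method names
# mirror the StorageWriter calls in the source). The nested try/finally
# guarantees a session completion record is written, carrying the aborted
# flag, and the storage is closed even when processing raises.
def run_session(storage_writer, knowledge_base, process_sources, abort=False):
  """Runs a processing session against a storage writer."""
  storage_writer.Open()
  storage_writer.WriteSessionStart()

  try:
    storage_writer.WritePreprocessingInformation(knowledge_base)
    process_sources()  # Hypothetical callable doing the actual work.
  finally:
    storage_writer.WriteSessionCompletion(aborted=abort)
    storage_writer.Close()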