def testFileSystemWithFilterCollection(self):
  """Test collection on the file system with a filter.

  A temporary filter file is written, find specifications are built from it
  and a collector is run over the current directory. The collected path
  specifications are then compared against the expected files.
  """
  dirname = u'.'
  path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)

  # delete=False keeps the file on disk after the with block so that it can
  # be read by name; it is removed explicitly in the finally clause below.
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    temp_file.write('/test_data/testdir/filter_.+.txt\n')
    temp_file.write('/test_data/.+evtx\n')
    temp_file.write('/AUTHORS\n')
    temp_file.write('/does_not_exist/some_file_[0-9]+txt\n')

  try:
    test_collection_queue = queue.SingleThreadedQueue()
    test_store = queue.SingleThreadedQueue()
    resolver_context = context.Context()
    test_collector = collector.Collector(
        test_collection_queue, test_store, dirname, path_spec,
        resolver_context=resolver_context)

    find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
    test_collector.SetFilter(find_specs)

    test_collector.Collect()

    test_collector_queue_consumer = TestCollectorQueueConsumer(
        test_collection_queue)
    test_collector_queue_consumer.ConsumePathSpecs()
  finally:
    # Remove the filter file even when the collection above raises.
    try:
      os.remove(filter_name)
    except (OSError, IOError) as exception:
      # {1!s} is used since an exception object does not support the
      # 's' format specification.
      logging.warning((
          u'Unable to remove temporary file: {0:s} with error: '
          u'{1!s}').format(filter_name, exception))

  # Two files with test_data/testdir/filter_*.txt, AUTHORS
  # and test_data/System.evtx.
  self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 4)

  paths = test_collector_queue_consumer.GetFilePaths()

  current_directory = os.getcwd()

  expected_path = os.path.join(
      current_directory, 'test_data', 'testdir', 'filter_1.txt')
  self.assertIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, 'test_data', 'testdir', 'filter_2.txt')
  self.assertNotIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, 'test_data', 'testdir', 'filter_3.txt')
  self.assertIn(expected_path, paths)

  expected_path = os.path.join(current_directory, 'AUTHORS')
  self.assertIn(expected_path, paths)
def testImageWithFilterCollection(self):
  """Test collection on a storage media image file with a filter.

  A temporary filter file is written, find specifications are built from it
  and a collector is run over the TSK file system inside the test image.
  """
  test_file = self._GetTestFilePath(['image.dd'])

  volume_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)
  path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
      parent=volume_path_spec)

  # delete=False keeps the file on disk after the with block so that it can
  # be read by name; it is removed explicitly in the finally clause below.
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    temp_file.write('/a_directory/.+zip\n')
    temp_file.write('/a_directory/another.+\n')
    temp_file.write('/passwords.txt\n')

  try:
    test_collection_queue = queue.SingleThreadedQueue()
    test_storage_queue = queue.SingleThreadedQueue()
    test_storage_queue_producer = queue.EventObjectQueueProducer(
        test_storage_queue)
    resolver_context = context.Context()
    test_collector = collector.Collector(
        test_collection_queue, test_storage_queue_producer, test_file,
        path_spec, resolver_context=resolver_context)

    find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
    test_collector.SetFilter(find_specs)

    test_collector.Collect()

    test_collector_queue_consumer = TestCollectorQueueConsumer(
        test_collection_queue)
    test_collector_queue_consumer.ConsumePathSpecs()
  finally:
    # Remove the filter file even when the collection above raises.
    try:
      os.remove(filter_name)
    except (OSError, IOError) as exception:
      # {1!s} is used since an exception object does not support the
      # 's' format specification.
      logging.warning((
          u'Unable to remove temporary file: {0:s} with error: '
          u'{1!s}').format(filter_name, exception))

  self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 2)

  paths = test_collector_queue_consumer.GetFilePaths()

  # path_specs[0]
  # type: TSK
  # file_path: '/a_directory/another_file'
  # container_path: 'test_data/image.dd'
  # image_offset: 0
  self.assertEqual(paths[0], u'/a_directory/another_file')

  # path_specs[1]
  # type: TSK
  # file_path: '/passwords.txt'
  # container_path: 'test_data/image.dd'
  # image_offset: 0
  self.assertEqual(paths[1], u'/passwords.txt')
def testBuildFindSpecsFromFile(self):
  """Tests the BuildFindSpecsFromFile function.

  Find specifications are built from a temporary filter file that mixes
  valid and invalid entries; they are then used to search the current
  directory.
  """
  # delete=False keeps the file on disk after the with block so that it can
  # be read by name; it is removed explicitly below.
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    # 2 hits.
    temp_file.write('/test_data/testdir/filter_.+.txt\n')
    # A single hit.
    temp_file.write('/test_data/.+evtx\n')
    # A single hit.
    temp_file.write('/AUTHORS\n')
    temp_file.write('/does_not_exist/some_file_[0-9]+txt\n')
    # This should not compile properly, missing file information.
    temp_file.write('failing/\n')
    # This should not fail during initial loading, but fail later on.
    temp_file.write('bad re (no close on that parenthesis/file\n')

  try:
    find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
  finally:
    # Remove the filter file even when building the find specs raises.
    try:
      os.remove(filter_name)
    except (OSError, IOError) as exception:
      # {1!s} is used since an exception object does not support the
      # 's' format specification.
      logging.warning(
          u'Unable to remove temporary file: {0:s} with error: {1!s}'.format(
              filter_name, exception))

  self.assertEqual(len(find_specs), 4)

  dirname = u'.'
  path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)
  file_system = path_spec_resolver.Resolver.OpenFileSystem(path_spec)

  # Close the file system even when one of the assertions fails.
  try:
    searcher = file_system_searcher.FileSystemSearcher(
        file_system, path_spec)

    path_spec_generator = searcher.Find(find_specs=find_specs)
    self.assertIsNotNone(path_spec_generator)

    path_specs = list(path_spec_generator)
    # One evtx, one AUTHORS, two filter_*.txt files, total 4 files.
    self.assertEqual(len(path_specs), 4)

    with self.assertRaises(IOError):
      _ = engine_utils.BuildFindSpecsFromFile('thisfiledoesnotexist')
  finally:
    file_system.Close()
def testExtractPathSpecsFileSystemWithFilter(self):
  """Tests the ExtractPathSpecs function on the file system with a filter.

  A temporary filter file is written, find specifications are built from it
  and path specifications are extracted from the current directory.
  """
  source_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=u'.')

  # delete=False keeps the file on disk after the with block so that it can
  # be read by name; it is removed explicitly in the finally clause below.
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    temp_file.write('/test_data/testdir/filter_.+.txt\n')
    temp_file.write('/test_data/.+evtx\n')
    temp_file.write('/AUTHORS\n')
    temp_file.write('/does_not_exist/some_file_[0-9]+txt\n')

  try:
    resolver_context = context.Context()
    test_extractor = extractors.PathSpecExtractor(resolver_context)

    find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
    path_specs = list(test_extractor.ExtractPathSpecs(
        [source_path_spec], find_specs=find_specs))
  finally:
    # Remove the filter file even when the extraction above raises.
    try:
      os.remove(filter_name)
    except (OSError, IOError) as exception:
      # {1!s} is used since an exception object does not support the
      # 's' format specification.
      logging.warning((
          u'Unable to remove temporary file: {0:s} with error: '
          u'{1!s}').format(filter_name, exception))

  # Two files with test_data/testdir/filter_*.txt, AUTHORS
  # and test_data/System.evtx.
  self.assertEqual(len(path_specs), 4)

  paths = self._GetFilePaths(path_specs)

  current_directory = os.getcwd()

  expected_path = os.path.join(
      current_directory, u'test_data', u'testdir', u'filter_1.txt')
  self.assertIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, u'test_data', u'testdir', u'filter_2.txt')
  self.assertNotIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, u'test_data', u'testdir', u'filter_3.txt')
  self.assertIn(expected_path, paths)

  expected_path = os.path.join(current_directory, u'AUTHORS')
  self.assertIn(expected_path, paths)
def testExtractPathSpecsStorageMediaImageWithFilter(self):
  """Tests the ExtractPathSpecs function on an image file with a filter.

  A temporary filter file is written, find specifications are built from it
  and path specifications are extracted from the TSK file system inside the
  test image.
  """
  test_file = self._GetTestFilePath([u'ímynd.dd'])

  volume_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)
  source_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
      parent=volume_path_spec)

  # delete=False keeps the file on disk after the with block so that it can
  # be read by name; it is removed explicitly in the finally clause below.
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    temp_file.write('/a_directory/.+zip\n')
    temp_file.write('/a_directory/another.+\n')
    temp_file.write('/passwords.txt\n')

  try:
    resolver_context = context.Context()
    test_extractor = extractors.PathSpecExtractor(resolver_context)

    find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
    path_specs = list(test_extractor.ExtractPathSpecs(
        [source_path_spec], find_specs=find_specs))
  finally:
    # Remove the filter file even when the extraction above raises.
    try:
      os.remove(filter_name)
    except (OSError, IOError) as exception:
      # {1!s} is used since an exception object does not support the
      # 's' format specification.
      logging.warning((
          u'Unable to remove temporary file: {0:s} with error: '
          u'{1!s}').format(filter_name, exception))

  self.assertEqual(len(path_specs), 2)

  paths = self._GetFilePaths(path_specs)

  # path_specs[0]
  # type: TSK
  # file_path: '/a_directory/another_file'
  # container_path: 'test_data/ímynd.dd'
  # image_offset: 0
  self.assertEqual(paths[0], u'/a_directory/another_file')

  # path_specs[1]
  # type: TSK
  # file_path: '/passwords.txt'
  # container_path: 'test_data/ímynd.dd'
  # image_offset: 0
  self.assertEqual(paths[1], u'/passwords.txt')
def _ExtractWithFilter(
    self, source_path_specs, destination_path, output_writer,
    filter_file_path, skip_duplicates=True):
  """Extracts files using a filter expression.

  This method runs the file extraction process on the image and
  potentially on every VSS if that is wanted.

  For every source path specification the file system is opened, find
  specifications are built from the filter file and every matching path
  specification is passed to _ExtractFileEntry. The file system is closed
  again, also when an error occurs while processing it.

  Args:
    source_path_specs (list[dfvfs.PathSpec]): path specifications to extract.
    destination_path (str): path where the extracted files should be stored.
    output_writer (CLIOutputWriter): output writer.
    filter_file_path (str): path of the file that contains the filter
        expressions.
    skip_duplicates (Optional[bool]): True if files with duplicate content
        should be skipped.
  """
  for source_path_spec in source_path_specs:
    file_system, mount_point = self._GetSourceFileSystem(
        source_path_spec, resolver_context=self._resolver_context)

    # Make sure the file system is closed when any of the steps below fail.
    try:
      if self._knowledge_base is None:
        self._Preprocess(file_system, mount_point)

      display_name = path_helper.PathHelper.GetDisplayNameForPathSpec(
          source_path_spec)
      output_writer.Write(
          u'Extracting file entries from: {0:s}\n'.format(display_name))

      # The find specifications are built per source since they depend on
      # the path attributes of the knowledge base.
      path_attributes = self._knowledge_base.GetPathAttributes()
      find_specs = engine_utils.BuildFindSpecsFromFile(
          filter_file_path, path_attributes=path_attributes)

      searcher = file_system_searcher.FileSystemSearcher(
          file_system, mount_point)
      for path_spec in searcher.Find(find_specs=find_specs):
        self._ExtractFileEntry(
            path_spec, destination_path, output_writer,
            skip_duplicates=skip_duplicates)
    finally:
      file_system.Close()
def _ExtractWithFilter(
    self, source_path_specs, destination_path, filter_file_path,
    remove_duplicates=True):
  """Extracts files using a filter expression.

  This method runs the file extraction process on the image and
  potentially on every VSS if that is wanted.

  For every source path specification the file system is opened, find
  specifications are built from the filter file and every matching path
  specification is passed to _ExtractFile. The file system is closed
  again, also when an error occurs while processing it.

  Args:
    source_path_specs: list of path specifications (instances of
                       dfvfs.PathSpec) to process.
    destination_path: The path where the extracted files should be stored.
    filter_file_path: The path of the file that contains the filter
                      expressions.
    remove_duplicates: optional boolean value to indicate if files with
                       duplicate content should be removed. The default
                       is True.
  """
  for source_path_spec in source_path_specs:
    file_system, mount_point = self._GetSourceFileSystem(
        source_path_spec, resolver_context=self._resolver_context)

    # Make sure the file system is closed when any of the steps below fail.
    try:
      if self._knowledge_base is None:
        self._Preprocess(file_system, mount_point)

      if not os.path.isdir(destination_path):
        os.makedirs(destination_path)

      find_specs = engine_utils.BuildFindSpecsFromFile(
          filter_file_path, pre_obj=self._knowledge_base.pre_obj)

      # Save the regular files.
      file_saver = FileSaver(skip_duplicates=remove_duplicates)

      searcher = file_system_searcher.FileSystemSearcher(
          file_system, mount_point)
      for path_spec in searcher.Find(find_specs=find_specs):
        self._ExtractFile(file_saver, path_spec, destination_path)
    finally:
      file_system.Close()
def ProcessSources(
    self, source_path_specs, source_type, enable_sigsegv_handler=False,
    filter_file=None, hasher_names_string=None, parser_filter_string=None,
    preferred_encoding=u'utf-8', single_process_mode=False,
    status_update_callback=None,
    storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF,
    timezone=pytz.UTC):
  """Processes the sources.

  Args:
    source_path_specs: list of path specifications (instances of
                       dfvfs.PathSpec) to process.
    source_type: the dfVFS source type definition.
    enable_sigsegv_handler: optional boolean value to indicate the SIGSEGV
                            handler should be enabled. The default is False.
    filter_file: optional path to a file that contains find specifications.
                 The default is None.
    hasher_names_string: optional comma separated string of names of
                         hashers to enable. The default is None.
    parser_filter_string: optional parser filter string. The default is None.
    preferred_encoding: optional preferred encoding. The default is UTF-8.
    single_process_mode: optional boolean value to indicate if the front-end
                         should run in single process mode. The default is
                         False.
    status_update_callback: optional callback function for status updates.
                            The default is None.
    storage_serializer_format: optional storage serializer format.
                               The default is protobuf.
    timezone: optional preferred timezone. The default is UTC.

  Returns:
    The processing status (instance of ProcessingStatus) or None.

  Raises:
    SourceScannerError: if the source scanner could not find a supported
                        file system.
    UserAbort: if the user initiated an abort.
  """
  # If the source is a directory or a storage media image
  # run pre-processing.
  # TODO: move source_scanner.SourceScannerContext.SOURCE_TYPE_
  # to definitions.SOURCE_TYPE_.
  scanner_context = source_scanner.SourceScannerContext
  self.SetEnablePreprocessing(source_type in (
      scanner_context.SOURCE_TYPE_DIRECTORY,
      scanner_context.SOURCE_TYPE_STORAGE_MEDIA_DEVICE,
      scanner_context.SOURCE_TYPE_STORAGE_MEDIA_IMAGE))

  self._CheckStorageFile(self._storage_file_path)

  self._single_process_mode = single_process_mode
  # TODO: move source_scanner.SourceScannerContext.SOURCE_TYPE_
  # to definitions.SOURCE_TYPE_.
  if source_type == scanner_context.SOURCE_TYPE_FILE:
    # No need to multi process a single file source.
    self._single_process_mode = True

  if self._single_process_mode:
    self._engine = single_process.SingleProcessEngine(self._queue_size)
  else:
    self._engine = multi_process.MultiProcessEngine(
        maximum_number_of_queued_items=self._queue_size)

  self._engine.SetEnableDebugOutput(self._debug_mode)
  self._engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate,
      profiling_type=self._profiling_type)

  pre_obj = self._PreprocessSource(source_path_specs, source_type)

  self._operating_system = getattr(pre_obj, u'guessed_os', None)

  if not parser_filter_string:
    # No filter was provided; fall back to a preset based on what
    # pre-processing determined about the operating system.
    guessed_os = self._operating_system
    os_version = getattr(pre_obj, u'osversion', u'')
    parser_filter_string = self._GetParserFilterPreset(
        os_guess=guessed_os, os_version=os_version)

    if parser_filter_string:
      logging.info(
          u'Parser filter expression changed to: {0:s}'.format(
              parser_filter_string))

  self._parser_names = [
      parser_class.NAME
      for _, parser_class in parsers_manager.ParsersManager.GetParsers(
          parser_filter_string=parser_filter_string)]

  include_directory_stat = u'filestat' in self._parser_names

  self._hasher_names = list(
      hashers_manager.HashersManager.GetHasherNamesFromString(
          hasher_names_string=hasher_names_string))

  self._PreprocessSetTimezone(pre_obj, timezone=timezone)

  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)
  else:
    filter_find_specs = None

  self._PreprocessSetCollectionInformation(
      pre_obj, source_type, self._engine, filter_file=filter_file,
      parser_filter_string=parser_filter_string,
      preferred_encoding=preferred_encoding)

  if self._output_module:
    storage_writer = storage.BypassStorageWriter(
        self._engine.event_object_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)
  else:
    storage_writer = storage.FileStorageWriter(
        self._engine.event_object_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=storage_serializer_format)

    storage_writer.SetEnableProfiling(
        self._enable_profiling, profiling_type=self._profiling_type)

  processing_status = None
  try:
    if self._single_process_mode:
      logging.debug(u'Starting extraction in single process mode.')

      processing_status = self._engine.ProcessSources(
          source_path_specs, storage_writer,
          filter_find_specs=filter_find_specs,
          filter_object=self._filter_object,
          hasher_names_string=hasher_names_string,
          include_directory_stat=include_directory_stat,
          mount_path=self._mount_path,
          parser_filter_string=parser_filter_string,
          process_archive_files=self._process_archive_files,
          resolver_context=self._resolver_context,
          status_update_callback=status_update_callback,
          text_prepend=self._text_prepend)

    else:
      logging.debug(u'Starting extraction in multi process mode.')

      # TODO: pass number_of_extraction_workers.
      processing_status = self._engine.ProcessSources(
          source_path_specs, storage_writer,
          enable_sigsegv_handler=enable_sigsegv_handler,
          filter_find_specs=filter_find_specs,
          filter_object=self._filter_object,
          hasher_names_string=hasher_names_string,
          include_directory_stat=include_directory_stat,
          mount_path=self._mount_path,
          parser_filter_string=parser_filter_string,
          process_archive_files=self._process_archive_files,
          status_update_callback=status_update_callback,
          show_memory_usage=self._show_worker_memory_information,
          text_prepend=self._text_prepend)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort

  # TODO: check if this still works and if still needed.
  except Exception as exception:
    if not self._single_process_mode:
      raise

    # The tool should generally not be run in single process mode
    # for other reasons than to debug. Hence the general error
    # catching.
    # {0!s} is used since an exception object does not support the
    # 's' format specification; using it would raise a TypeError here and
    # hide the original error.
    logging.error(
        u'An uncaught exception occurred: {0!s}.\n{1:s}'.format(
            exception, traceback.format_exc()))
    if self._debug_mode:
      pdb.post_mortem()

  return processing_status
def ProcessSource(
    self, filter_file=None, parser_filter_string=None,
    hasher_names_string=None,
    storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF,
    timezone=pytz.UTC):
  """Processes the source.

  Creates the engine, runs the source pre-processing, determines the parser
  and hasher names and hands over to the single or multi process mode
  specific processing method.

  Args:
    filter_file: a path to a file that contains find specifications.
                 The default is None.
    parser_filter_string: optional parser filter string. The default is None.
    hasher_names_string: optional comma separated string of names of
                         hashers to enable. The default is None.
    storage_serializer_format: optional storage serializer format.
                               The default is protobuf.
    timezone: optional preferred timezone. The default is UTC.

  Raises:
    SourceScannerError: if the source scanner could not find a supported
                        file system.
    UserAbort: if the user initiated an abort.
  """
  # Pre-processing only applies to a directory or a storage media image.
  self._preprocess = bool(
      self.SourceIsDirectory() or self.SourceIsStorageMediaImage())

  self._CheckStorageFile(self._storage_file_path)

  # No need to multi process a single file source.
  if self.SourceIsFile():
    self._single_process_mode = True

  if not self._single_process_mode:
    self._engine = self._InitializeMultiProcessModeEngine()
  else:
    self._engine = self._InitializeSingleProcessModeEngine()

  pre_obj = self._PreprocessSource()

  if not parser_filter_string:
    # Derive a preset from what pre-processing found out about the
    # operating system.
    parser_filter_string = self._GetParserFilterPreset(
        os_guess=getattr(pre_obj, u'guessed_os', u''),
        os_version=getattr(pre_obj, u'osversion', u''))

    if parser_filter_string:
      logging.info(u'Parser filter expression changed to: {0:s}'.format(
          parser_filter_string))

  self._parser_names = [
      parser_class.NAME
      for _, parser_class in parsers_manager.ParsersManager.GetParsers(
          parser_filter_string=parser_filter_string)]

  include_directory_stat = u'filestat' in self._parser_names

  self._hasher_names = list(
      hashers_manager.HashersManager.GetHasherNamesFromString(
          hasher_names_string=hasher_names_string))

  self._PreprocessSetTimezone(pre_obj, timezone=timezone)

  filter_find_specs = None
  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)

  self._PreprocessSetCollectionInformation(
      pre_obj, self._engine, filter_file=filter_file,
      parser_filter_string=parser_filter_string)

  # Both modes take the same arguments, hence only the method differs.
  if self._single_process_mode:
    process_source_method = self._ProcessSourceSingleProcessMode
  else:
    process_source_method = self._ProcessSourceMultiProcessMode

  process_source_method(
      pre_obj, filter_find_specs=filter_find_specs,
      include_directory_stat=include_directory_stat,
      parser_filter_string=parser_filter_string,
      hasher_names_string=hasher_names_string,
      storage_serializer_format=storage_serializer_format)
def _ExtractWithFilter(self, filter_file_path, destination_path):
  """Extracts files using a filter expression.

  The files matching the filter are first extracted from the source file
  system and, when VSS processing is enabled and VSS stores are set, from
  every selected VSS store as well.

  Args:
    filter_file_path: The path of the file that contains the filter
                      expressions.
    destination_path: The path where the extracted files should be stored.
  """
  # TODO: add support to handle multiple partitions.
  self._source_path_spec = self.GetSourcePathSpec()

  source_searcher = self._GetSourceFileSystemSearcher(
      resolver_context=self._resolver_context)

  if self._knowledge_base is None:
    self._Preprocess(source_searcher)

  if not os.path.isdir(destination_path):
    os.makedirs(destination_path)

  find_specs = engine_utils.BuildFindSpecsFromFile(
      filter_file_path, pre_obj=self._knowledge_base.pre_obj)

  # Save the regular files.
  FileSaver.calc_md5 = self._remove_duplicates

  for matching_path_spec in source_searcher.Find(find_specs=find_specs):
    FileSaver.WriteFile(matching_path_spec, destination_path)

  # Nothing more to do unless VSS extraction was requested.
  if not (self._process_vss and self._vss_stores):
    return

  volume_path_spec = self._source_path_spec.parent

  logging.info(u'Extracting files from VSS.')
  vss_root_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_VSHADOW, location=u'/',
      parent=volume_path_spec)

  vss_file_entry = path_spec_resolver.Resolver.OpenFileEntry(
      vss_root_path_spec)

  number_of_vss = vss_file_entry.number_of_sub_file_entries

  # In plaso 1 represents the first store index in dfvfs and pyvshadow 0
  # represents the first store index so 1 is subtracted.
  store_indexes = [store_number - 1 for store_number in self._vss_stores]

  for store_index in store_indexes:
    logging.info(
        u'Extracting files from VSS {0:d} out of {1:d}'.format(
            store_index + 1, number_of_vss))

    store_path_spec = path_spec_factory.Factory.NewPathSpec(
        dfvfs_definitions.TYPE_INDICATOR_VSHADOW, store_index=store_index,
        parent=volume_path_spec)
    store_root_path_spec = path_spec_factory.Factory.NewPathSpec(
        dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
        parent=store_path_spec)

    filename_prefix = 'vss_{0:d}'.format(store_index)

    store_file_system = path_spec_resolver.Resolver.OpenFileSystem(
        store_root_path_spec, resolver_context=self._resolver_context)
    store_searcher = file_system_searcher.FileSystemSearcher(
        store_file_system, store_path_spec)

    for matching_path_spec in store_searcher.Find(find_specs=find_specs):
      FileSaver.WriteFile(
          matching_path_spec, destination_path,
          filename_prefix=filename_prefix)
def ProcessSources(
    self, source_path_specs, source_type, command_line_arguments=None,
    enable_sigsegv_handler=False, filter_file=None,
    hasher_names_string=None, number_of_extraction_workers=0,
    preferred_encoding=u'utf-8', parser_filter_expression=None,
    single_process_mode=False, status_update_callback=None,
    timezone=pytz.UTC):
  """Processes the sources.

  Args:
    source_path_specs: list of path specifications (instances of
                       dfvfs.PathSpec) to process.
    source_type: the dfVFS source type definition.
    command_line_arguments: optional string of the command line arguments or
                            None if not set.
    enable_sigsegv_handler: optional boolean value to indicate the SIGSEGV
                            handler should be enabled.
    filter_file: optional path to a file that contains find specifications.
    hasher_names_string: optional comma separated string of names of
                         hashers to enable.
    number_of_extraction_workers: the number of extraction workers to run. If
                                  0, the number will be selected
                                  automatically.
    preferred_encoding: optional preferred encoding.
    parser_filter_expression: optional string containing the parser filter
                              expression, where None represents all parsers
                              and plugins.
    single_process_mode: optional boolean value to indicate if the front-end
                         should run in single process mode.
    status_update_callback: optional callback function for status updates.
    timezone: optional preferred timezone.

  Returns:
    The processing status (instance of ProcessingStatus) or None.

  Raises:
    SourceScannerError: if the source scanner could not find a supported
                        file system.
    UserAbort: if the user initiated an abort.
  """
  # If the source is a directory or a storage media image
  # run pre-processing.
  self.SetEnablePreprocessing(source_type in (
      dfvfs_definitions.SOURCE_TYPE_DIRECTORY,
      dfvfs_definitions.SOURCE_TYPE_STORAGE_MEDIA_DEVICE,
      dfvfs_definitions.SOURCE_TYPE_STORAGE_MEDIA_IMAGE))

  self._CheckStorageFile(self._storage_file_path)

  self._single_process_mode = single_process_mode
  if source_type == dfvfs_definitions.SOURCE_TYPE_FILE:
    # No need to multi process a single file source.
    self._single_process_mode = True

  if self._single_process_mode:
    self._engine = single_process.SingleProcessEngine(self._queue_size)
  else:
    self._engine = multi_process.MultiProcessEngine(
        maximum_number_of_queued_items=self._queue_size,
        use_zeromq=self._use_zeromq)

  self._engine.SetEnableDebugOutput(self._debug_mode)
  self._engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate,
      profiling_type=self._profiling_type)

  pre_obj = self._PreprocessSources(source_path_specs, source_type)

  self._operating_system = getattr(pre_obj, u'guessed_os', None)

  if not parser_filter_expression:
    # No filter was provided; fall back to a preset based on what
    # pre-processing determined about the operating system.
    guessed_os = self._operating_system
    os_version = getattr(pre_obj, u'osversion', u'')
    parser_filter_expression = self._GetParserFilterPreset(
        os_guess=guessed_os, os_version=os_version)

    if parser_filter_expression:
      logging.info(
          u'Parser filter expression changed to: {0:s}'.format(
              parser_filter_expression))

  self._parser_names = [
      parser_class.NAME
      for _, parser_class in parsers_manager.ParsersManager.GetParsers(
          parser_filter_expression=parser_filter_expression)]

  self._hasher_names = list(
      hashers_manager.HashersManager.GetHasherNamesFromString(
          hasher_names_string=hasher_names_string))

  self._PreprocessSetTimezone(pre_obj, timezone=timezone)

  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)
  else:
    filter_find_specs = None

  # TODO: deprecate the need for this function.
  self._PreprocessSetCollectionInformation(pre_obj)

  session_start = self._CreateSessionStart(
      command_line_arguments=command_line_arguments,
      filter_file=filter_file,
      parser_filter_expression=parser_filter_expression,
      preferred_encoding=preferred_encoding)

  storage_writer = storage_zip_file.ZIPStorageFileWriter(
      self._storage_file_path, pre_obj, buffer_size=self._buffer_size)

  storage_writer.SetEnableProfiling(
      self._enable_profiling, profiling_type=self._profiling_type)

  storage_writer.Open()
  storage_writer.WriteSessionStart(session_start)

  processing_status = None
  try:
    if self._single_process_mode:
      logging.debug(u'Starting extraction in single process mode.')

      processing_status = self._engine.ProcessSources(
          source_path_specs, storage_writer,
          filter_find_specs=filter_find_specs,
          filter_object=self._filter_object,
          hasher_names_string=hasher_names_string,
          mount_path=self._mount_path,
          parser_filter_expression=parser_filter_expression,
          process_archive_files=self._process_archive_files,
          resolver_context=self._resolver_context,
          status_update_callback=status_update_callback,
          text_prepend=self._text_prepend)

    else:
      logging.debug(u'Starting extraction in multi process mode.')

      # TODO: pass number_of_extraction_workers.
      processing_status = self._engine.ProcessSources(
          source_path_specs, storage_writer,
          enable_sigsegv_handler=enable_sigsegv_handler,
          filter_find_specs=filter_find_specs,
          filter_object=self._filter_object,
          hasher_names_string=hasher_names_string,
          mount_path=self._mount_path,
          number_of_extraction_workers=number_of_extraction_workers,
          parser_filter_expression=parser_filter_expression,
          process_archive_files=self._process_archive_files,
          status_update_callback=status_update_callback,
          show_memory_usage=self._show_worker_memory_information,
          text_prepend=self._text_prepend)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort

  # TODO: check if this still works and if still needed.
  except Exception as exception:  # pylint: disable=broad-except
    if not self._single_process_mode:
      raise

    # The tool should generally not be run in single process mode
    # for other reasons than to debug. Hence the general error
    # catching.
    # {0!s} is used since an exception object does not support the
    # 's' format specification; using it would raise a TypeError here and
    # hide the original error.
    logging.error(
        u'An uncaught exception occurred: {0!s}.\n{1:s}'.format(
            exception, traceback.format_exc()))
    if self._debug_mode:
      pdb.post_mortem()

  return processing_status
def _StartSingleThread(self, options):
  """Starts everything up in a single process.

  This should not normally be used, since running the tool in a single
  process buffers up everything into memory until the storage is called.

  Just to make it clear, this starts up the collection, completes that
  before calling the worker that extracts all EventObjects and stores
  them in memory. When that is all done, the storage function is called
  to drain the buffer. Hence the tool's excessive use of memory in this
  mode and the reason why it is not suggested to be used except for
  debugging reasons (and mostly to get into the debugger).

  This is therefore mostly useful during debugging sessions for some
  limited parsing.

  Args:
    options: the command line arguments (instance of argparse.Namespace).
  """
  self._engine = single_process.SingleProcessEngine(self._queue_size)
  self._engine.SetEnableDebugOutput(self._debug_mode)
  self._engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate)
  self._engine.SetProcessArchiveFiles(self._process_archive_files)

  # Optional engine settings are only applied when they have a value.
  if self._filter_object:
    self._engine.SetFilterObject(self._filter_object)

  if self._mount_path:
    self._engine.SetMountPath(self._mount_path)

  if self._text_prepend:
    self._engine.SetTextPrepend(self._text_prepend)

  # TODO: add support to handle multiple partitions.
  self._engine.SetSource(
      self.GetSourcePathSpec(), resolver_context=self._resolver_context)

  logging.debug(u'Starting preprocessing.')
  pre_obj = self.PreprocessSource(options)
  logging.debug(u'Preprocessing done.')

  # TODO: make sure parsers option is not set by preprocessing.
  parser_filter_string = getattr(options, 'parsers', '')

  self._parser_names = [
      parser_class.NAME
      for _, parser_class in parsers_manager.ParsersManager.GetParsers(
          parser_filter_string=parser_filter_string)]

  self._PreprocessSetCollectionInformation(options, pre_obj)

  include_directory_stat = 'filestat' in self._parser_names

  filter_find_specs = None
  filter_file = getattr(options, 'file_filter', None)
  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)

  self._collector = self._engine.CreateCollector(
      include_directory_stat, vss_stores=self._vss_stores,
      filter_find_specs=filter_find_specs,
      resolver_context=self._resolver_context)

  self._DebugPrintCollector(options)

  # Without an output module the events are written to a storage file.
  if not self._output_module:
    storage_writer = storage.StorageFileWriter(
        self._engine.storage_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=self._storage_serializer_format)
  else:
    storage_writer = storage.BypassStorageWriter(
        self._engine.storage_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)

  hasher_names_string = getattr(options, u'hashers', u'')

  try:
    self._engine.ProcessSource(
        self._collector, storage_writer,
        parser_filter_string=parser_filter_string,
        hasher_names_string=hasher_names_string)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort(u'Process source aborted.')

  finally:
    # Always empty the resolver context, also on abort or error.
    self._resolver_context.Empty()
def _ProcessSourceMultiProcessMode(self, options):
  """Processes the source using multiple processes.

  A multi process engine is configured from the settings of this object,
  the source is preprocessed, and the engine is then asked to process the
  source with a collector and a storage writer.

  Args:
    options: the command line arguments (instance of argparse.Namespace).

  Raises:
    UserAbort: if processing the source was interrupted by the keyboard.
  """
  # TODO: replace by an option.
  start_collection_process = True

  self._number_of_worker_processes = getattr(options, 'workers', 0)

  logging.info(u'Starting extraction in multi process mode.')

  engine = multi_process.MultiProcessEngine(
      maximum_number_of_queued_items=self._queue_size)
  self._engine = engine

  engine.SetEnableDebugOutput(self._debug_mode)
  engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate)
  engine.SetProcessArchiveFiles(self._process_archive_files)

  # These settings are only passed to the engine when they have a value.
  optional_settings = (
      (self._filter_object, engine.SetFilterObject),
      (self._mount_path, engine.SetMountPath),
      (self._text_prepend, engine.SetTextPrepend))
  for value, setter in optional_settings:
    if value:
      setter(value)

  # TODO: add support to handle multiple partitions.
  engine.SetSource(
      self.GetSourcePathSpec(), resolver_context=self._resolver_context)

  logging.debug(u'Starting preprocessing.')
  pre_obj = self.PreprocessSource(options)
  logging.debug(u'Preprocessing done.')

  # TODO: make sure parsers option is not set by preprocessing.
  parser_filter_string = getattr(options, 'parsers', '')

  self._parser_names = [
      parser_class.NAME
      for _, parser_class in parsers_manager.ParsersManager.GetParsers(
          parser_filter_string=parser_filter_string)]

  hasher_names_string = getattr(options, u'hashers', u'')

  self._hasher_names = list(
      hashers_manager.HashersManager.GetHasherNamesFromString(
          hasher_names_string=hasher_names_string))

  self._PreprocessSetCollectionInformation(options, pre_obj)

  # Directory stat information is only included when the filestat parser
  # is among the selected parsers.
  include_directory_stat = 'filestat' in self._parser_names

  filter_find_specs = None
  filter_file = getattr(options, 'file_filter', None)
  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)

  # When a collection process is used the collector is created with a new
  # resolver context instead of the one held by this object.
  resolver_context = (
      context.Context() if start_collection_process
      else self._resolver_context)

  # TODO: create multi process collector.
  self._collector = self._engine.CreateCollector(
      include_directory_stat, vss_stores=self._vss_stores,
      filter_find_specs=filter_find_specs,
      resolver_context=resolver_context)

  self._DebugPrintCollector(options)

  if not self._output_module:
    storage_writer = storage.StorageFileWriter(
        self._engine.storage_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=self._storage_serializer_format)
  else:
    storage_writer = storage.BypassStorageWriter(
        self._engine.storage_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)

  try:
    self._engine.ProcessSource(
        self._collector, storage_writer,
        parser_filter_string=parser_filter_string,
        hasher_names_string=hasher_names_string,
        number_of_extraction_workers=self._number_of_worker_processes,
        have_collection_process=start_collection_process,
        have_foreman_process=self._run_foreman,
        show_memory_usage=self._show_worker_memory_information)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort(u'Process source aborted.')
def ProcessSources(
    self, session, storage_writer, source_path_specs, source_type,
    enable_sigsegv_handler=False, force_preprocessing=False,
    hasher_names_string=None, number_of_extraction_workers=0,
    process_archives=False, process_compressed_streams=True,
    single_process_mode=False, status_update_callback=None,
    temporary_directory=None, timezone=u'UTC', yara_rules_string=None):
  """Preprocesses the sources where applicable and has an engine process them.

  A single file source is always handled in single process mode. When the
  session has no parser filter expression, one is requested from
  _GetParserFilterPreset based on the operating system values stored in
  the knowledge base of the engine, and stored on the session.

  Args:
    session (Session): session the storage changes are part of.
    storage_writer (StorageWriter): storage writer.
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    source_type (str): the dfVFS source type definition.
    enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
        should be enabled. Only passed on in multi process mode.
    force_preprocessing (Optional[bool]): True if preprocessing should be
        forced.
    hasher_names_string (Optional[str]): comma separated string of names
        of hashers to use during processing.
    number_of_extraction_workers (Optional[int]): number of extraction
        workers to run. If 0, the number will be selected automatically.
        Only passed on in multi process mode.
    process_archives (Optional[bool]): True if archive files should be
        scanned for file entries.
    process_compressed_streams (Optional[bool]): True if file content in
        compressed streams should be processed.
    single_process_mode (Optional[bool]): True if the front-end should
        run in single process mode.
    status_update_callback (Optional[function]): callback function for
        status updates.
    temporary_directory (Optional[str]): path of the directory for
        temporary files.
    timezone (Optional[datetime.tzinfo]): timezone, which is handed to
        _SetTimezone together with the knowledge base of the engine. Note
        that the default value is the string u'UTC'.
    yara_rules_string (Optional[str]): unparsed yara rule definitions.

  Returns:
    ProcessingStatus: processing status or None.

  Raises:
    SourceScannerError: if the source scanner could not find a supported
        file system.
    UserAbort: if the user initiated an abort.
  """
  # No need to multi process a single file source.
  if source_type == dfvfs_definitions.SOURCE_TYPE_FILE:
    single_process_mode = True

  engine = self._CreateEngine(single_process_mode)

  # If the source is a directory or a storage media image
  # run pre-processing.
  if force_preprocessing or source_type in self._SOURCE_TYPES_TO_PREPROCESS:
    self._PreprocessSources(engine, source_path_specs)

  if not session.parser_filter_expression:
    preset_arguments = [
        engine.knowledge_base.GetValue(value_name)
        for value_name in (
            u'operating_system', u'operating_system_product',
            u'operating_system_version')]
    session.parser_filter_expression = self._GetParserFilterPreset(
        *preset_arguments)

    if session.parser_filter_expression:
      logging.info(
          u'Parser filter expression changed to: {0:s}'.format(
              session.parser_filter_expression))

  self._parser_names = [
      parser_class.NAME
      for _, parser_class in parsers_manager.ParsersManager.GetParsers(
          parser_filter_expression=session.parser_filter_expression)]

  self._SetTimezone(engine.knowledge_base, timezone)

  filter_find_specs = None
  if session.filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        session.filter_file,
        path_attributes=engine.knowledge_base.GetPathAttributes())

  # Keyword arguments that both the single process and the multi process
  # engine receive.
  # TODO: check if preferred_encoding should be passed.
  shared_arguments = dict(
      filter_find_specs=filter_find_specs,
      filter_object=self._filter_object,
      hasher_names_string=hasher_names_string,
      mount_path=self._mount_path,
      parser_filter_expression=session.parser_filter_expression,
      preferred_year=session.preferred_year,
      process_archives=process_archives,
      process_compressed_streams=process_compressed_streams,
      status_update_callback=status_update_callback,
      temporary_directory=temporary_directory,
      text_prepend=self._text_prepend,
      yara_rules_string=yara_rules_string)

  if single_process_mode:
    logging.debug(u'Starting extraction in single process mode.')

    return engine.ProcessSources(
        source_path_specs, storage_writer, self._resolver_context,
        **shared_arguments)

  logging.debug(u'Starting extraction in multi process mode.')

  return engine.ProcessSources(
      session.identifier, source_path_specs, storage_writer,
      enable_sigsegv_handler=enable_sigsegv_handler,
      number_of_worker_processes=number_of_extraction_workers,
      show_memory_usage=self._show_worker_memory_information,
      **shared_arguments)