def _ExtractWithExtensions(self, extensions, destination_path):
  """Extracts files using extensions.

  Args:
    extensions: a list of extensions.
    destination_path: the path where the extracted files should be stored.
  """
  # Join the extensions for logging; applying the {0:s} format specifier to
  # the list object itself raises TypeError on Python 3 and logs the list
  # repr on Python 2.
  logging.info(u'Finding files with extensions: {0:s}'.format(
      u', '.join(extensions)))

  if not os.path.isdir(destination_path):
    os.makedirs(destination_path)

  input_queue = queue.SingleThreadedQueue()

  # TODO: add support to handle multiple partitions.
  self._source_path_spec = self.GetSourcePathSpec()

  image_collector = collector.Collector(
      input_queue, self._source_path, self._source_path_spec)
  image_collector.Collect()

  # NOTE(review): this mutates a FileSaver *class* attribute, affecting all
  # users of FileSaver, not just this extraction run — confirm intended.
  FileSaver.calc_md5 = self._remove_duplicates

  input_queue_consumer = ImageExtractorQueueConsumer(
      input_queue, extensions, destination_path)
  input_queue_consumer.ConsumePathSpecs()
def testFileSystemCollection(self):
  """Test collection on the file system."""
  test_files = [
      self._GetTestFilePath([u'syslog.tgz']),
      self._GetTestFilePath([u'syslog.zip']),
      self._GetTestFilePath([u'syslog.bz2']),
      self._GetTestFilePath([u'wtmp.1'])]

  with shared_test_lib.TempDirectory() as temporary_directory:
    # Copy the test files into a scratch directory so collection runs
    # against a plain OS directory.
    for source_file in test_files:
      shutil.copy(source_file, temporary_directory)

    path_spec = path_spec_factory.Factory.NewPathSpec(
        dfvfs_definitions.TYPE_INDICATOR_OS, location=temporary_directory)

    test_path_spec_queue = single_process.SingleProcessQueue()
    resolver_context = context.Context()
    test_collector = collector.Collector(
        test_path_spec_queue, resolver_context=resolver_context)
    test_collector.Collect([path_spec])

    test_collector_queue_consumer = TestCollectorQueueConsumer(
        test_path_spec_queue)
    test_collector_queue_consumer.ConsumeItems()

    # One path specification is expected per copied file.
    self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 4)
def _Extract(
    self, source_path_specs, destination_path, remove_duplicates=True):
  """Extracts files.

  Args:
    source_path_specs: list of path specifications (instances of
                       dfvfs.PathSpec) to process.
    destination_path: the path where the extracted files should be stored.
    remove_duplicates: optional boolean value to indicate if files with
                       duplicate content should be removed. The default
                       is True.
  """
  # Make sure the destination directory exists before extraction starts.
  if not os.path.isdir(destination_path):
    os.makedirs(destination_path)

  path_spec_queue = single_process.SingleProcessQueue()

  extraction_collector = collector.Collector(path_spec_queue)
  extraction_collector.Collect(source_path_specs)

  saver = FileSaver(skip_duplicates=remove_duplicates)
  queue_consumer = ImageExtractorQueueConsumer(
      path_spec_queue, saver, destination_path, self._filter_collection)
  queue_consumer.ConsumeItems()
def _Extract(self, destination_path, remove_duplicates=True):
  """Extracts files.

  Args:
    destination_path: the path where the extracted files should be stored.
    remove_duplicates: optional boolean value to indicate if files with
                       duplicate content should be removed. The default
                       is True.
  """
  # Make sure the destination directory exists before extraction starts.
  if not os.path.isdir(destination_path):
    os.makedirs(destination_path)

  path_spec_queue = single_process.SingleProcessQueue()

  # TODO: add support to handle multiple partitions.
  self._source_path_spec = self.GetSourcePathSpec()

  extraction_collector = collector.Collector(
      path_spec_queue, self._source_path, self._source_path_spec)
  extraction_collector.Collect()

  saver = FileSaver(skip_duplicates=remove_duplicates)
  queue_consumer = ImageExtractorQueueConsumer(
      path_spec_queue, saver, destination_path, self._filter_collection)
  queue_consumer.ConsumeItems()
def testFileSystemWithFilterCollection(self):
  """Test collection on the file system with a filter."""
  dirname = u'.'
  path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)

  filter_name = ''
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    temp_file.write('/test_data/testdir/filter_.+.txt\n')
    temp_file.write('/test_data/.+evtx\n')
    temp_file.write('/AUTHORS\n')
    temp_file.write('/does_not_exist/some_file_[0-9]+txt\n')

  test_collection_queue = queue.SingleThreadedQueue()
  test_store = queue.SingleThreadedQueue()
  resolver_context = context.Context()
  test_collector = collector.Collector(
      test_collection_queue, test_store, dirname, path_spec,
      resolver_context=resolver_context)

  find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
  test_collector.SetFilter(find_specs)

  test_collector.Collect()

  test_collector_queue_consumer = TestCollectorQueueConsumer(
      test_collection_queue)
  test_collector_queue_consumer.ConsumePathSpecs()

  try:
    os.remove(filter_name)
  except (OSError, IOError) as exception:
    # Use the !s conversion: applying the :s format spec to an exception
    # object raises TypeError on Python 3.
    logging.warning((
        u'Unable to remove temporary file: {0:s} with error: {1!s}').format(
            filter_name, exception))

  # Two files with test_data/testdir/filter_*.txt, AUTHORS
  # and test_data/System.evtx.
  self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 4)

  paths = test_collector_queue_consumer.GetFilePaths()

  current_directory = os.getcwd()

  expected_path = os.path.join(
      current_directory, 'test_data', 'testdir', 'filter_1.txt')
  self.assertIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, 'test_data', 'testdir', 'filter_2.txt')
  self.assertNotIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, 'test_data', 'testdir', 'filter_3.txt')
  self.assertIn(expected_path, paths)

  expected_path = os.path.join(
      current_directory, 'AUTHORS')
  self.assertIn(expected_path, paths)
def testFileSystemCollection(self):
  """Test collection on the file system."""
  test_files = [
      self._GetTestFilePath(['syslog.tgz']),
      self._GetTestFilePath(['syslog.zip']),
      self._GetTestFilePath(['syslog.bz2']),
      self._GetTestFilePath(['wtmp.1'])]

  with TempDirectory() as dirname:
    # Copy the test files into a scratch directory so collection runs
    # against a plain OS directory.
    for a_file in test_files:
      shutil.copy(a_file, dirname)

    path_spec = path_spec_factory.Factory.NewPathSpec(
        dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)

    test_collection_queue = queue.SingleThreadedQueue()
    test_store = queue.SingleThreadedQueue()
    resolver_context = context.Context()
    test_collector = collector.Collector(
        test_collection_queue, test_store, dirname, path_spec,
        resolver_context=resolver_context)
    test_collector.Collect()

    test_collector_queue_consumer = TestCollectorQueueConsumer(
        test_collection_queue)
    test_collector_queue_consumer.ConsumePathSpecs()

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 4)
def testImageWithFilterCollection(self):
  """Test collection on a storage media image file with a filter."""
  test_file = self._GetTestFilePath(['image.dd'])

  volume_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)
  path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
      parent=volume_path_spec)

  filter_name = ''
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    filter_name = temp_file.name
    temp_file.write('/a_directory/.+zip\n')
    temp_file.write('/a_directory/another.+\n')
    temp_file.write('/passwords.txt\n')

  test_collection_queue = queue.SingleThreadedQueue()
  test_storage_queue = queue.SingleThreadedQueue()
  test_storage_queue_producer = queue.EventObjectQueueProducer(
      test_storage_queue)
  resolver_context = context.Context()
  test_collector = collector.Collector(
      test_collection_queue, test_storage_queue_producer, test_file,
      path_spec, resolver_context=resolver_context)

  find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
  test_collector.SetFilter(find_specs)

  test_collector.Collect()

  test_collector_queue_consumer = TestCollectorQueueConsumer(
      test_collection_queue)
  test_collector_queue_consumer.ConsumePathSpecs()

  try:
    os.remove(filter_name)
  except (OSError, IOError) as exception:
    # Use the !s conversion: applying the :s format spec to an exception
    # object raises TypeError on Python 3.
    logging.warning((
        u'Unable to remove temporary file: {0:s} with error: {1!s}').format(
            filter_name, exception))

  self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 2)

  paths = test_collector_queue_consumer.GetFilePaths()

  # path_specs[0]
  # type: TSK
  # file_path: '/a_directory/another_file'
  # container_path: 'test_data/image.dd'
  # image_offset: 0
  self.assertEqual(paths[0], u'/a_directory/another_file')

  # path_specs[1]
  # type: TSK
  # file_path: '/passwords.txt'
  # container_path: 'test_data/image.dd'
  # image_offset: 0
  self.assertEqual(paths[1], u'/passwords.txt')
def CreateCollector(
    self, include_directory_stat, vss_stores=None, filter_find_specs=None,
    resolver_context=None):
  """Creates a collector object.

  The collector discovers all the files that need to be processed by
  the workers. Once a file is discovered it is added to the process queue
  as a path specification (instance of dfvfs.PathSpec).

  Args:
    include_directory_stat: Boolean value to indicate whether directory
                            stat information should be collected.
    vss_stores: Optional list of VSS stores to include in the collection,
                where 1 represents the first store. Set to None if no
                VSS stores should be processed. The default is None.
    filter_find_specs: Optional list of filter find specifications
                      (instances of dfvfs.FindSpec). The default is None.
    resolver_context: Optional resolver context (instance of dfvfs.Context).
                      The default is None. Note that every thread or
                      process must have its own resolver context.

  Returns:
    A collector object (instance of Collector).

  Raises:
    RuntimeError: if source path specification is not set.
  """
  # A collector cannot be created without a source path specification.
  if not self._source_path_spec:
    raise RuntimeError(u'Missing source.')

  new_collector = collector.Collector(
      self._collection_queue, self._source, self._source_path_spec,
      resolver_context=resolver_context)

  new_collector.SetCollectDirectoryMetadata(include_directory_stat)

  if vss_stores:
    new_collector.SetVssInformation(vss_stores)

  if filter_find_specs:
    new_collector.SetFilter(filter_find_specs)

  return new_collector
def testImageCollection(self):
  """Test collection on a storage media image file.

  This image has two files:
  + logs/hidden.zip
  + logs/sys.tgz

  The hidden.zip file contains one file, syslog, which is the same for
  sys.tgz. The end results of a full recursive extraction would be:
  + logs/hidden.zip (unchanged)
  + logs/hidden.zip:syslog (the text file extracted out)
  + logs/sys.tgz (unchanged)
  + logs/sys.tgz (read as a GZIP file, so not compressed)
  + logs/sys.tgz:syslog.gz (A GZIP file from the TAR container)
  + logs/sys.tgz:syslog.gz:syslog (the extracted syslog file)

  NOTE(review): the collector run here does not recurse into the archive
  contents; it yields only the 2 file entries present on the file system,
  which is what the assertion below checks — confirm the 6-file figure
  belongs to the extraction test, not this one.
  """
  test_file = self._GetTestFilePath(['syslog_image.dd'])

  volume_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)
  path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
      parent=volume_path_spec)

  test_collection_queue = queue.SingleThreadedQueue()
  test_storage_queue = queue.SingleThreadedQueue()
  test_storage_queue_producer = queue.EventObjectQueueProducer(
      test_storage_queue)
  resolver_context = context.Context()
  test_collector = collector.Collector(
      test_collection_queue, test_storage_queue_producer, test_file,
      path_spec, resolver_context=resolver_context)
  test_collector.Collect()

  test_collector_queue_consumer = TestCollectorQueueConsumer(
      test_collection_queue)
  test_collector_queue_consumer.ConsumePathSpecs()

  self.assertEqual(test_collector_queue_consumer.number_of_path_specs, 2)
def CreateCollector(
    self, include_directory_stat, vss_stores=None, filter_find_specs=None,
    resolver_context=None):
  """Creates a collector.

  Args:
    include_directory_stat: Boolean value to indicate whether directory
                            stat information should be collected.
    vss_stores: Optional list of VSS stores to include in the collection,
                where 1 represents the first store. Set to None if no
                VSS stores should be processed. The default is None.
    filter_find_specs: Optional list of filter find specifications
                       (instances of dfvfs.FindSpec). The default is None.
    resolver_context: Optional resolver context (instance of dfvfs.Context).
                      The default is None. Note that every thread or
                      process must have its own resolver context.

  Returns:
    A collector object (instance of Collector).

  Raises:
    RuntimeError: if source path specification is not set.
  """
  # A collector cannot be created without a source path specification.
  if not self._source_path_spec:
    raise RuntimeError(u'Missing source.')

  new_collector = collector.Collector(
      self._collection_queue, self._storage_queue_producer, self._source,
      self._source_path_spec, resolver_context=resolver_context)

  new_collector.collect_directory_metadata = include_directory_stat

  if vss_stores:
    new_collector.SetVssInformation(vss_stores)

  if filter_find_specs:
    new_collector.SetFilter(filter_find_specs)

  return new_collector
def __init__(
    self, stop_collector_event, source_path_specs, path_spec_queue,
    filter_find_specs=None, include_directory_stat=True, **kwargs):
  """Initializes the process object.

  Args:
    stop_collector_event: the stop process event (instance of
                          multiprocessing.Event). The collector should
                          exit after this event is set.
    source_path_specs: list of path specifications (instances of
                       dfvfs.PathSpec) to process.
    path_spec_queue: the path specification queue object (instance of
                     MultiProcessingQueue).
    filter_find_specs: Optional list of filter find specifications
                       (instances of dfvfs.FindSpec). The default is None.
    include_directory_stat: Optional boolean value to indicate whether
                            directory stat information should be collected.
                            The default is True.
    kwargs: keyword arguments to pass to multiprocessing.Process.
  """
  super(MultiProcessCollectorProcess, self).__init__(
      definitions.PROCESS_TYPE_COLLECTOR, **kwargs)

  # Every process needs its own resolver context.
  self._collector = collector.Collector(
      path_spec_queue, resolver_context=context.Context())
  self._path_spec_queue = path_spec_queue
  self._source_path_specs = source_path_specs
  self._stop_collector_event = stop_collector_event

  self._collector.SetCollectDirectoryMetadata(include_directory_stat)

  if filter_find_specs:
    self._collector.SetFilter(filter_find_specs)
def _Extract(self, destination_path):
  """Extracts files.

  Args:
    destination_path: the path where the extracted files should be stored.
  """
  # Make sure the destination directory exists before extraction starts.
  if not os.path.isdir(destination_path):
    os.makedirs(destination_path)

  path_spec_queue = single_process.SingleProcessQueue()

  # TODO: add support to handle multiple partitions.
  self._source_path_spec = self.GetSourcePathSpec()

  extraction_collector = collector.Collector(
      path_spec_queue, self._source_path, self._source_path_spec)
  extraction_collector.Collect()

  # NOTE(review): this mutates a FileSaver *class* attribute, affecting all
  # users of FileSaver, not just this extraction run — confirm intended.
  FileSaver.calc_md5 = self._remove_duplicates

  queue_consumer = ImageExtractorQueueConsumer(
      path_spec_queue, destination_path, self._filter_collection)
  queue_consumer.ConsumeItems()
def testImageWithPartitionsCollections(self):
  """Test collection on a storage media image file with multiple partitions.

  The image contains 2 partitions (p1 and p2) with NTFS file systems.
  """
  test_file = self._GetTestFilePath([u'multi_partition_image.vmdk'])

  image_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)

  # Partition p1: part_index 2, starting at image offset 0x00010000.
  p1_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/p1',
      part_index=2, start_offset=0x00010000, parent=image_path_spec)
  p1_file_system_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
      parent=p1_path_spec)

  # Partition p2: part_index 3, starting at image offset 0x00510000.
  p2_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/p2',
      part_index=3, start_offset=0x00510000, parent=image_path_spec)
  p2_file_system_path_spec = path_spec_factory.Factory.NewPathSpec(
      dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
      parent=p2_path_spec)

  test_path_spec_queue = single_process.SingleProcessQueue()
  resolver_context = context.Context()
  test_collector = collector.Collector(
      test_path_spec_queue, resolver_context=resolver_context)
  # Collect from both partition file systems in one run.
  test_collector.Collect(
      [p1_file_system_path_spec, p2_file_system_path_spec])

  test_collector_queue_consumer = TestCollectorQueueConsumer(
      test_path_spec_queue)
  test_collector_queue_consumer.ConsumeItems()

  paths = test_collector_queue_consumer.GetFilePaths()

  # NTFS metadata files plus the two regular files on partition 1.
  expected_paths_p1 = [
      u'/$AttrDef',
      u'/$BadClus',
      u'/$Bitmap',
      u'/$Boot',
      u'/$Extend',
      u'/$Extend/$ObjId',
      u'/$Extend/$Quota',
      u'/$Extend/$Reparse',
      u'/$Extend/$RmMetadata',
      u'/$Extend/$RmMetadata/$Repair',
      u'/$Extend/$RmMetadata/$TxfLog',
      u'/$LogFile',
      u'/$MFT',
      u'/$MFTMirr',
      u'/$Secure',
      u'/$UpCase',
      u'/$Volume',
      u'/file1.txt',
      u'/file2.txt']

  # NTFS metadata files plus the two regular files on partition 2.
  expected_paths_p2 = [
      u'/$AttrDef',
      u'/$BadClus',
      u'/$Bitmap',
      u'/$Boot',
      u'/$Extend',
      u'/$Extend/$ObjId',
      u'/$Extend/$Quota',
      u'/$Extend/$Reparse',
      u'/$Extend/$RmMetadata',
      u'/$Extend/$RmMetadata/$Repair',
      u'/$Extend/$RmMetadata/$TxfLog',
      u'/$LogFile',
      u'/$MFT',
      u'/$MFTMirr',
      u'/$Secure',
      u'/$UpCase',
      u'/$Volume',
      u'/file1_on_part_2.txt',
      u'/file2_on_part_2.txt']

  expected_paths = []
  expected_paths.extend(expected_paths_p1)
  expected_paths.extend(expected_paths_p2)

  self.assertEqual(
      test_collector_queue_consumer.number_of_path_specs,
      len(expected_paths))

  # Compare order-insensitively: collection order is not asserted.
  self.assertEqual(sorted(paths), sorted(expected_paths))