def test_file_size():
    setup_test_files()
    assert file_size(f'{test_folder_path}/readwrite.txt') == 11
    # Need to either add logic to common to deal with the file not existing,
    # OR need to read from stderr and assert that this check raises FileNotFoundError.
    # assert file_size('C:/udp-app-master/dev/tests/working/this_does_not_exist.txt') == 0
    teardown_test_files()
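# The commented-out assertion above needs a file_size() that tolerates missing files.
# A minimal sketch of such a helper, assuming common.file_size() returns the size in
# bytes as an int (this is not the project's actual implementation):
import os

def file_size(file_name):
    """Return the size of file_name in bytes, or 0 if the file does not exist."""
    if not os.path.isfile(file_name):
        return 0
    return os.path.getsize(file_name)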
def prepare_binary():
    env = common.prepare_env()

    # make dir to put the binary in
    if not os.path.isdir(os.path.join("..", "artifact")):
        os.mkdir(os.path.join("..", "artifact"))

    BUILD_FILENAME = ""
    if isinstance(BUILD_FILENAME, str):
        BUILD_FILENAME = list(BUILD_FILENAME)
    BUILD_FILENAMES = BUILD_FILENAME

    for BUILD_FILENAME in BUILD_FILENAMES:
        DEST_FILENAME = common.prepare_filename(BUILD_FILENAME)
        print(f"OS Name: {env['OS_NAME']}")
        print(f"OS Version: {env['OS_VERSION']}")
        print(f"Build Filename: {BUILD_FILENAME}")
        print(f"Dest Filename: {DEST_FILENAME}")
        if not BUILD_FILENAME == "":
            print("Build Filesize: " + common.file_size(BUILD_FILENAME))
        else:
            exit(1)
        if not BUILD_FILENAME == "":
            move(os.path.join(".", BUILD_FILENAME),
                 os.path.join("..", "artifact", BUILD_FILENAME))
def extract(self, offset, description, file_name, size, name=None):
    '''
    Extract an embedded file from the target file, if it matches an extract rule.
    Called automatically by Binwalk.scan().

    @offset      - Offset inside the target file to begin the extraction.
    @description - Description of the embedded file to extract, as returned by libmagic.
    @file_name   - Path to the target file.
    @size        - Number of bytes to extract.
    @name        - Name to save the file as.

    Returns the name of the extracted file (blank string if nothing was extracted).
    '''
    cleanup_extracted_fname = True

    rule = self._match(description)
    if rule is not None:
        fname = self._dd(file_name, offset, size, rule['extension'], output_file_name=name)

        if rule['cmd']:
            # Many extraction utilities will extract the file to a new file, just without
            # the file extension (i.e., myfile.7z => myfile). If the presumed resulting
            # file name already exists before executing the extract command, do not attempt
            # to clean it up even if its resulting file size is 0.
            if self.remove_after_execute:
                extracted_fname = os.path.splitext(fname)[0]
                if os.path.exists(extracted_fname):
                    cleanup_extracted_fname = False

            # Execute the specified command against the extracted file
            self._execute(rule['cmd'], fname)

            # Only clean up files if remove_after_execute was specified
            if self.remove_after_execute:
                # Remove the original file that we extracted
                try:
                    os.unlink(fname)
                except:
                    pass

                # If the command worked, assume it removed the file extension from the extracted file.
                # If the extracted file still exists and is empty, remove it.
                if cleanup_extracted_fname and os.path.exists(extracted_fname) and file_size(extracted_fname) == 0:
                    try:
                        os.unlink(extracted_fname)
                    except:
                        pass
    else:
        fname = ''

    return fname
def __init__(self, file_name, binwalk, length=0, offset=0, n=MIN_STRING_LENGTH, block=0, algorithm=None, plugins=None):
    '''
    Class constructor. Preferred to be invoked from the Strings class instead of directly.

    @file_name - The file name to perform a strings analysis on.
    @binwalk   - An instance of the Binwalk class.
    @length    - The number of bytes in the file to analyze.
    @offset    - The starting offset into the file to begin analysis.
    @n         - The minimum valid string length.
    @block     - The block size to use when performing entropy analysis.
    @algorithm - The entropy algorithm to use when performing entropy analysis.
    @plugins   - An instance of the Plugins class.

    Returns None.
    '''
    self.n = n
    self.binwalk = binwalk
    self.length = length
    self.start = offset
    self.file_name = file_name
    self.fd = open(self.file_name, 'rb')
    self.data = ''
    self.dlen = 0
    self.i = 0
    self.total_read = 0
    self.entropy = {}
    self.valid_strings = []
    self.external_validators = []
    self.plugins = plugins

    if not self.n:
        self.n = self.MIN_STRING_LENGTH

    # Perform an entropy analysis over the entire file (anything less may generate poor entropy data).
    # Give fake file results list to prevent FileEntropy from doing too much analysis.
    with entropy.FileEntropy(self.file_name, block=block, file_results=['foo']) as e:
        (self.x, self.y, self.average_entropy) = e.analyze()
        for i in range(0, len(self.x)):
            self.entropy[self.x[i]] = self.y[i]
        # Make sure our block size matches the entropy analysis's block size
        self.block = e.block

    # Make sure the starting offset is a multiple of the block size; else, when later checking
    # the entropy analysis, block offsets won't line up.
    self.start -= (self.start % self.block)

    try:
        self.fd.seek(self.start)
    except:
        self.fd.read(self.start)

    # Set the total_scanned and scan_length values for plugins and status display messages
    self.binwalk.total_scanned = 0
    if self.length:
        self.binwalk.scan_length = self.length
    else:
        self.binwalk.scan_length = common.file_size(self.fd.name) - self.start
def __init__(self, file_name=None, fd=None, binwalk=None, offset=0, length=None, block=DEFAULT_BLOCK_SIZE, plugins=None, file_results=[], compcheck=False):
    '''
    Class constructor.

    @file_name    - The path to the file to analyze.
    @fd           - A file object to analyze data from.
    @binwalk      - An instance of the Binwalk class.
    @offset       - The offset into the data to begin analysis.
    @length       - The number of bytes to analyze.
    @block        - The size of the data blocks to analyze.
    @plugins      - Instance of the Plugins class.
    @file_results - Scan results to overlay on the entropy plot graph.
    @compcheck    - Set to True to enable entropy compression detection.

    Returns None.
    '''
    self.fd = fd
    self.start = offset
    self.length = length
    self.block = block
    self.binwalk = binwalk
    self.plugins = plugins
    self.total_read = 0
    self.fd_open = False
    self.file_results = file_results
    self.do_chisq = compcheck

    if file_name is None and self.fd is None:
        raise Exception("Entropy.__init__ requires at least the file_name or fd options")

    if self.fd is None:
        self.fd = open(file_name, 'rb')
        self.fd_open = True

    if not self.length:
        self.length = None

    if not self.start:
        self.start = 0

    if not self.block:
        self.block = self.DEFAULT_BLOCK_SIZE

    # Some file descriptors aren't seekable (stdin, for example)
    try:
        self.fd.seek(self.start)
    except:
        self.fd.read(self.start)

    if self.binwalk:
        # Set the total_scanned and scan_length values for plugins and status display messages
        self.binwalk.total_scanned = 0
        if self.length:
            self.binwalk.scan_length = self.length
        else:
            self.binwalk.scan_length = common.file_size(self.fd.name) - self.start
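# For reference, a self-contained sketch of the per-block Shannon entropy measurement that
# a FileEntropy-style analysis performs. The names below (block_entropy, file_block_entropy)
# are illustrative only and are not part of the binwalk API.
import math

def block_entropy(data):
    """Return the Shannon entropy of data, normalized to 0.0-1.0 (8 bits/byte maximum)."""
    if not data:
        return 0.0
    entropy = 0.0
    for byte_value in set(data):
        p = data.count(byte_value) / len(data)
        entropy -= p * math.log(p, 2)
    return entropy / 8.0

def file_block_entropy(path, block=1024):
    """Yield (offset, entropy) for each block-sized chunk of the file at path."""
    with open(path, 'rb') as fd:
        offset = 0
        while True:
            chunk = fd.read(block)
            if not chunk:
                break
            yield offset, block_entropy(chunk)
            offset += len(chunk)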
def upload_to_blobstore(self):
    """Upload publish_folder's <dataset_name>-<job_id>.zip to landing blobstore."""

    # don't upload captured data if we're in --notransfer mode
    if self.option('notransfer'):
        logger.warning('Not uploading data to landing per --notransfer option')
        return

    # upload capture file to landing blobstore
    self.events.start('upload', 'step')
    resource = self.config(self.project.blobstore_landing)
    bs_landing = BlobStore()
    bs_landing.connect(resource)
    bs_landing.put(self.zip_file_name, just_file_name(self.zip_file_name))
    bs_landing.disconnect()

    # finish
    self.events.stop('upload', 0, file_size(self.zip_file_name))
def compress_work_folder(self):
    """Compress all files in work_folder to a single file in publish_folder."""

    # setup
    self.events.start('compress', 'step')
    self.capture_file_name = f'{self.dataset_name}#{self.job_id:09}'
    self.zip_file_name = f'{self.publish_folder}/{self.capture_file_name}'

    # copy capture_state files to work folder to be included in capture zip package as well
    copy_file_if_exists(f'{self.state_folder}/last_job.log', self.work_folder)

    # compress (make_archive() appends a .zip file extension to zip_file_name)
    self.zip_file_name = shutil.make_archive(self.zip_file_name, format='zip', root_dir=self.work_folder)

    # finish
    self.events.stop('compress', 0, file_size(self.zip_file_name))
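# Standalone illustration of the shutil.make_archive() behavior relied on above: the call
# appends the archive extension itself and returns the final archive path, which is why
# zip_file_name is reassigned from its return value. The folder and file names below are
# hypothetical.
import os
import shutil
import tempfile

work_folder = tempfile.mkdtemp()
publish_folder = tempfile.mkdtemp()
open(os.path.join(work_folder, 'example.txt'), 'w').close()

zip_base = os.path.join(publish_folder, 'dataset#000000001')  # note: no extension yet
zip_path = shutil.make_archive(zip_base, format='zip', root_dir=work_folder)
print(zip_path)  # .../dataset#000000001.zip -- '.zip' appended by make_archive()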
def extract(self, offset, description, file_name, size, name=None):
    '''
    Extract an embedded file from the target file, if it matches an extract rule.
    Called automatically by Binwalk.scan().

    @offset      - Offset inside the target file to begin the extraction.
    @description - Description of the embedded file to extract, as returned by libmagic.
    @file_name   - Path to the target file.
    @size        - Number of bytes to extract.
    @name        - Name to save the file as.

    Returns the name of the extracted file (blank string if nothing was extracted).
    '''
    fname = ''
    cleanup_extracted_fname = True
    original_dir = os.getcwd()

    if not os.path.exists(self.extract_path):
        os.mkdir(self.extract_path)

    file_path = os.path.realpath(file_name)

    if os.path.isfile(file_path):
        os.chdir(self.extract_path)

        # Loop through each extraction rule until one succeeds
        for rule in self._match(description):
            # Copy out the data to disk
            fname = self._dd(file_path, offset, size, rule['extension'], output_file_name=name)

            # If there was a command specified for this rule, try to execute it.
            # If execution fails, the next rule will be attempted.
            if rule['cmd']:
                # Many extraction utilities will extract the file to a new file, just without
                # the file extension (i.e., myfile.7z -> myfile). If the presumed resulting
                # file name already exists before executing the extract command, do not attempt
                # to clean it up even if its resulting file size is 0.
                if self.remove_after_execute:
                    extracted_fname = os.path.splitext(fname)[0]
                    if os.path.exists(extracted_fname):
                        cleanup_extracted_fname = False

                # Execute the specified command against the extracted file
                extract_ok = self._execute(rule['cmd'], fname)

                # Only clean up files if remove_after_execute was specified
                if self.remove_after_execute:
                    # Remove the original file that we extracted
                    try:
                        os.unlink(fname)
                    except:
                        pass

                    # If the command worked, assume it removed the file extension from the extracted file.
                    # If the extracted file still exists and is empty, remove it.
                    if cleanup_extracted_fname and os.path.exists(extracted_fname) and file_size(extracted_fname) == 0:
                        try:
                            os.unlink(extracted_fname)
                        except:
                            pass

                # If the command executed OK, don't try any more rules
                if extract_ok:
                    break
            # If there was no command to execute, just use the first rule
            else:
                break

        os.chdir(original_dir)

    # If a file was extracted, return the full path to that file
    if fname:
        fname = os.path.join(self.extract_path, fname)

    return fname
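# A simplified, self-contained sketch of the dd-style carve that self._dd() performs above:
# copy `size` bytes starting at `offset` from the source file into a new output file. The
# helper name and signature are illustrative, not binwalk's actual implementation.
def carve(src_path, offset, size, out_path, chunk_size=64 * 1024):
    """Copy size bytes from src_path, starting at offset, into out_path."""
    with open(src_path, 'rb') as src, open(out_path, 'wb') as dst:
        src.seek(offset)
        remaining = size
        while remaining > 0:
            data = src.read(min(chunk_size, remaining))
            if not data:
                break
            dst.write(data)
            remaining -= len(data)
    return out_path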
def single_scan(self, target_file='', fd=None, offset=0, length=0, show_invalid_results=False, callback=None, plugins_whitelist=[], plugins_blacklist=[]):
    '''
    Performs a binwalk scan on one target file or file descriptor.

    @target_file          - File to scan.
    @fd                   - File descriptor to scan.
    @offset               - Starting offset at which to start the scan.
    @length               - Number of bytes to scan. Specify -1 for streams.
    @show_invalid_results - Set to True to display invalid results.
    @callback             - Callback function to be invoked when matches are found.
    @plugins_whitelist    - A list of plugin names to load. If not empty, only these plugins will be loaded.
    @plugins_blacklist    - A list of plugin names to not load.

    The callback function is passed two arguments: a list of result dictionaries containing the scan
    results (one result per dict), and the offset at which those results were identified. Example
    callback function:

        def my_callback(offset, results):
            print "Found %d results at offset %d:" % (len(results), offset)
            for result in results:
                print "\t%s" % result['description']

        binwalk.Binwalk(callback=my_callback).scan("firmware.bin")

    Upon completion, the scan method returns a sorted list of tuples containing a list of results
    dictionaries and the offsets at which those results were identified:

        scan_results = [
                (0, [{description : "LZMA compressed data..."}]),
                (112, [{description : "gzip compressed data..."}])
        ]

    See SmartSignature.parse for a more detailed description of the results dictionary structure.
    '''
    scan_results = {}
    fsize = 0
    jump_offset = 0
    i_opened_fd = False
    i_loaded_plugins = False
    plugret = PLUGIN_CONTINUE
    plugret_start = PLUGIN_CONTINUE
    self.total_read = 0
    self.total_scanned = 0
    self.scan_length = length
    self.filter.show_invalid_results = show_invalid_results
    self.start_offset = offset

    # Check to make sure either a target file or a file descriptor was supplied
    if not target_file and fd is None:
        raise Exception("Must supply Binwalk.single_scan with a valid file path or file object")

    # Load the default signatures if self.load_signatures has not already been invoked
    if self.magic is None:
        self.load_signatures()

    # Need the total size of the target file, even if we aren't scanning the whole thing
    if target_file:
        fsize = file_size(target_file)

    # Open the target file and seek to the specified start offset
    if fd is None:
        fd = open(target_file)
        i_opened_fd = True

    # Seek to the starting offset. This is invalid for some file-like objects such as stdin,
    # so if an exception is thrown try reading offset bytes from the file object.
    try:
        fd.seek(offset)
    except:
        fd.read(offset)

    # If no length was specified, make the length the size of the target file minus the starting offset
    if self.scan_length == 0:
        self.scan_length = fsize - offset

    # If the Plugins class has not already been instantiated, do that now.
    if self.plugins is None:
        self.plugins = Plugins(self, blacklist=plugins_blacklist, whitelist=plugins_whitelist)
        i_loaded_plugins = True

        if self.load_plugins:
            self.plugins._load_plugins()

    # Invoke any pre-scan plugins
    plugret_start = self.plugins._pre_scan_callbacks(fd)

    # Main loop, scan through all the data
    while not ((plugret | plugret_start) & PLUGIN_TERMINATE):
        i = 0

        # Read in the next block of data from the target file and make sure it's valid
        (data, dlen) = self._read_block(fd)
        if data is None or dlen == 0:
            break

        # The total number of bytes scanned could be bigger than the total number
        # of bytes read from the file if the previous signature result specified a
        # jump offset that was beyond the end of the then current data block.
        #
        # If this is the case, we need to index into this data block appropriately in order to
        # resume the scan from the appropriate offset, and adjust dlen accordingly.
        if jump_offset > 0:
            total_check = self.total_scanned + dlen

            if jump_offset >= total_check:
                i = -1

                # Try to seek to the jump offset; this won't work if fd == sys.stdin
                try:
                    fd.seek(jump_offset)
                    self.total_read = jump_offset
                    self.total_scanned = jump_offset - dlen
                    jump_offset = 0
                except:
                    pass
            elif jump_offset < total_check:
                # Index into this block appropriately
                i = jump_offset - self.total_scanned
                jump_offset = 0

        # Scan through each block of data looking for signatures
        if i >= 0 and i < dlen:
            # Scan this data block for a list of offsets which are candidates for possible valid signatures
            for candidate in self.parser.find_signature_candidates(data[i:dlen]):
                # If a signature specified a jump offset beyond this candidate signature offset, ignore it
                if (i + candidate + self.total_scanned) < jump_offset:
                    continue

                # Reset these values on each loop
                smart = {}
                results = []
                results_offset = -1

                # Pass the data to libmagic, and split out multiple results into a list
                for magic_result in self.parser.split(self.magic.buffer(data[i+candidate:i+candidate+self.MAX_SIGNATURE_SIZE])):
                    i_set_results_offset = False

                    # Some file names are not NULL byte terminated, but rather their length is
                    # specified in a size field. To ensure these are not marked as invalid due to
                    # non-printable characters existing in the file name, parse the filename(s) and
                    # trim them to the specified filename length, if one was specified.
                    magic_result = self.smart._parse_raw_strings(magic_result)

                    # Make sure this is a valid result before further processing
                    if not self.filter.invalid(magic_result):
                        # The smart filter parser returns a dictionary of keyword values and the signature description.
                        smart = self.smart.parse(magic_result)

                        # Validate the jump value and check if the response description should be displayed
                        if smart['jump'] > -1 and self._should_display(smart):
                            # If multiple results are returned and one of them has smart['jump'] set to a non-zero value,
                            # the calculated results offset will be wrong since i will have been incremented. Only set the
                            # results_offset value when the first match is encountered.
                            if results_offset < 0:
                                results_offset = offset + i + candidate + smart['adjust'] + self.total_scanned
                                i_set_results_offset = True

                            # Double check to make sure the smart['adjust'] value is sane.
                            # If it makes results_offset negative, then it is not sane.
                            if results_offset >= 0:
                                smart['offset'] = results_offset

                                # Invoke any scan plugins
                                if not (plugret_start & PLUGIN_STOP_PLUGINS):
                                    plugret = self.plugins._scan_callbacks(smart)
                                    results_offset = smart['offset']
                                    if (plugret & PLUGIN_TERMINATE):
                                        break

                                # Extract the result, if it matches one of the extract rules and is not a delayed extract.
                                if self.extractor.enabled and not (self.extractor.delayed and smart['delay']) and not ((plugret | plugret_start) & PLUGIN_NO_EXTRACT):
                                    # If the signature did not specify a size, extract to the end of the file.
                                    if not smart['size']:
                                        smart['size'] = fsize - results_offset

                                    smart['extract'] = self.extractor.extract(results_offset,
                                                                              smart['description'],
                                                                              target_file,
                                                                              smart['size'],
                                                                              name=smart['name'])

                                if not ((plugret | plugret_start) & PLUGIN_NO_DISPLAY):
                                    # This appears to be a valid result, so append it to the results list.
                                    results.append(smart)
                            elif i_set_results_offset:
                                results_offset = -1

                # Did we find any valid results?
                if results_offset >= 0:
                    scan_results[results_offset] = results

                    if callback is not None:
                        callback(results_offset, results)

                    # If a relative jump offset was specified, update the absolute jump_offset variable
                    if smart.has_key('jump') and smart['jump'] > 0:
                        jump_offset = results_offset + smart['jump']

        # Track the total number of bytes scanned
        self.total_scanned += dlen

        # The starting offset only affects the reported offset for results
        # in the first block of data. Zero it out after the first block has
        # been processed.
        offset = 0

    # Sort the results before returning them
    scan_items = scan_results.items()
    scan_items.sort()

    # Do delayed extraction, if specified.
    if self.extractor.enabled and self.extractor.delayed:
        scan_items = self.extractor.delayed_extract(scan_items, target_file, fsize)

    # Invoke any post-scan plugins
    #if not (plugret_start & PLUGIN_STOP_PLUGINS):
    self.plugins._post_scan_callbacks(fd)

    # Be sure to delete the Plugins instance so that there isn't a lingering reference to
    # this Binwalk class instance (lingering handles to this Binwalk instance cause the
    # __del__ deconstructor to not be called).
    if i_loaded_plugins:
        del self.plugins
        self.plugins = None

    if i_opened_fd:
        fd.close()

    return scan_items
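# A Python 3 rendering of the callback usage sketched in the docstring above, written
# against this legacy API (Binwalk.single_scan with a callback argument). Treat it as an
# illustrative sketch; "firmware.bin" is a placeholder file name.
import binwalk

def my_callback(offset, results):
    print("Found %d results at offset %d:" % (len(results), offset))
    for result in results:
        print("\t%s" % result['description'])

bw = binwalk.Binwalk()
bw.single_scan("firmware.bin", callback=my_callback)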
def prepare_release():
    env = common.prepare_env()  # get env vars

    dirs = [
        os.path.join("..", "artifact"),  # temp dir for binary
        os.path.join("..", "build"),     # temp dir for other stuff
        os.path.join("..", "deploy")     # dir for archive
    ]
    for dirname in dirs:
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

    # make dirs for each os
    # for dirname in ["linux", "macos", "windows"]:
    for dirname in ["linux"]:
        if not os.path.isdir(os.path.join("..", "deploy", dirname)):
            os.mkdir(os.path.join("..", "deploy", dirname))

    # sanity check permissions for working_dirs.json
    dirpath = "."
    for dirname in ["resources", "user", "meta", "manifests"]:
        dirpath += os.path.join(dirpath, dirname)
        if os.path.isdir(dirpath):
            os.chmod(dirpath, 0o755)

    # nuke git files
    for git in [
        os.path.join(".", ".gitattributes"),
        os.path.join(".", ".gitignore")
    ]:
        if os.path.isfile(git):
            os.remove(git)

    # nuke travis file if it exists
    for travis in [
        os.path.join(".", ".travis.yml"),
        os.path.join(".", ".travis.off")
    ]:
        if os.path.isfile(travis):
            os.remove(travis)

    # nuke test suite if it exists
    if os.path.isdir(os.path.join(".", "tests")):
        distutils.dir_util.remove_tree(os.path.join(".", "tests"))

    BUILD_FILENAME = ""
    ZIP_FILENAME = ""

    # list executables
    BUILD_FILENAME = (os.path.join("."))
    if BUILD_FILENAME == "":
        BUILD_FILENAME = (os.path.join("..", "artifact"))
    if isinstance(BUILD_FILENAME, str):
        BUILD_FILENAME = list(BUILD_FILENAME)
    BUILD_FILENAMES = BUILD_FILENAME
    print(BUILD_FILENAMES)

    if len(BUILD_FILENAMES) > 0:
        # clean the git slate
        git_clean()

        # mv dirs from source code
        dirs = [
            os.path.join(".", ".git"),
            os.path.join(".", ".github"),
            os.path.join(".", ".gitattributes"),
            os.path.join(".", ".gitignore"),
            os.path.join(".", "html"),
            os.path.join(".", "resources"),
            os.path.join(".", "schemas"),
            os.path.join(".", "CODE_OF_CONDUCT.md")
        ]
        for dirname in dirs:
            if os.path.exists(dirname):
                move(dirname, os.path.join("..", "build", dirname))

        # .zip if windows
        # .tar.gz otherwise
        ZIP_FILENAME = os.path.join("..", "deploy", env["REPO_NAME"])
        make_archive(ZIP_FILENAME, "zip")
        ZIP_FILENAME += ".zip"

        # mv dirs back
        for thisDir in dirs:
            if os.path.exists(os.path.join("..", "build", thisDir)):
                move(os.path.join("..", "build", thisDir), os.path.join(".", thisDir))

    if not ZIP_FILENAME == "":
        print(f"Zip Filename: {ZIP_FILENAME}")
        print("Zip Filesize: " + common.file_size(ZIP_FILENAME))
    else:
        print(f"No Zip to prepare: {ZIP_FILENAME}")

    print(f"Git tag: {env['GITHUB_TAG']}")

    if (ZIP_FILENAME == ""):
        exit(1)
def prepare_release():
    env = common.prepare_env()  # get env vars

    dirs = [
        os.path.join("..", "artifact"),  # temp dir for binary
        os.path.join("..", "build"),     # temp dir for other stuff
        os.path.join("..", "deploy")     # dir for archive
    ]
    for dirname in dirs:
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

    # make dirs for each os
    for dirname in ["linux", "macos", "windows"]:
        if not os.path.isdir(os.path.join("..", "deploy", dirname)):
            os.mkdir(os.path.join("..", "deploy", dirname))

    # sanity check permissions for working_dirs.json
    dirpath = "."
    for dirname in ["resources", "user", "meta", "manifests"]:
        dirpath += os.path.join(dirpath, dirname)
        if os.path.isdir(dirpath):
            os.chmod(dirpath, 0o755)

    # nuke git files
    for git in [
        os.path.join(".", ".gitattributes"),
        os.path.join(".", ".gitignore")
    ]:
        if os.path.isfile(git):
            os.remove(git)

    # nuke travis file if it exists
    for travis in [
        os.path.join(".", ".travis.yml"),
        os.path.join(".", ".travis.off")
    ]:
        if os.path.isfile(travis):
            os.remove(travis)

    # nuke test suite if it exists
    if os.path.isdir(os.path.join(".", "tests")):
        distutils.dir_util.remove_tree(os.path.join(".", "tests"))

    BUILD_FILENAME = ""
    ZIP_FILENAME = ""

    # list executables
    BUILD_FILENAME = common.find_binary(os.path.join("."))
    if BUILD_FILENAME == "":
        BUILD_FILENAME = common.find_binary(os.path.join("..", "artifact"))
    if isinstance(BUILD_FILENAME, str):
        BUILD_FILENAME = list(BUILD_FILENAME)
    BUILD_FILENAMES = BUILD_FILENAME
    print(BUILD_FILENAMES)

    if len(BUILD_FILENAMES) > 0:
        for BUILD_FILENAME in BUILD_FILENAMES:
            if not BUILD_FILENAME == "":
                if "artifact" not in BUILD_FILENAME:
                    # move the binary to temp folder
                    move(os.path.join(".", BUILD_FILENAME),
                         os.path.join("..", "artifact", BUILD_FILENAME))

        # clean the git slate
        git_clean()

        # mv dirs from source code
        dirs = [
            os.path.join(".", ".git"),
            os.path.join(".", ".github"),
            os.path.join(".", ".gitattributes"),
            os.path.join(".", ".gitignore"),
            os.path.join(".", "html"),
            os.path.join(".", "resources", "ci")
        ]
        for dirname in dirs:
            if os.path.isdir(dirname):
                move(dirname, os.path.join("..", "build", dirname))

        for BUILD_FILENAME in BUILD_FILENAMES:
            if "artifact" not in BUILD_FILENAME:
                if os.path.isfile(os.path.join("..", "artifact", BUILD_FILENAME)):
                    # move the binary back
                    move(os.path.join("..", "artifact", BUILD_FILENAME),
                         os.path.join(".", BUILD_FILENAME))

            # Make Linux/Mac binary executable
            if "linux" in env["OS_NAME"] or \
               "ubuntu" in env["OS_NAME"] or \
               "mac" in env["OS_NAME"] or \
               "osx" in env["OS_NAME"]:
                os.chmod(os.path.join(".", BUILD_FILENAME), 0o755)

        # .zip if windows
        # .tar.gz otherwise
        if len(BUILD_FILENAMES) > 1:
            ZIP_FILENAME = os.path.join("..", "deploy", env["REPO_NAME"])
        else:
            ZIP_FILENAME = os.path.join("..", "deploy", os.path.splitext(BUILD_FILENAME)[0])
        if env["OS_NAME"] == "windows":
            make_archive(ZIP_FILENAME, "zip")
            ZIP_FILENAME += ".zip"
        else:
            make_archive(ZIP_FILENAME, "gztar")
            ZIP_FILENAME += ".tar.gz"

        # mv dirs back
        for thisDir in dirs:
            if os.path.isdir(os.path.join("..", "build", thisDir)):
                move(os.path.join("..", "build", thisDir), os.path.join(".", thisDir))

    for BUILD_FILENAME in BUILD_FILENAMES:
        if not BUILD_FILENAME == "":
            print(f"Build Filename: {BUILD_FILENAME}")
            print("Build Filesize: " + common.file_size(BUILD_FILENAME))
        else:
            print(f"No Build to prepare: {BUILD_FILENAME}")

    if not ZIP_FILENAME == "":
        print(f"Zip Filename: {ZIP_FILENAME}")
        print("Zip Filesize: " + common.file_size(ZIP_FILENAME))
    else:
        print(f"No Zip to prepare: {ZIP_FILENAME}")

    print(f"Git tag: {env['GITHUB_TAG']}")

    if (len(BUILD_FILENAMES) == 0) or (ZIP_FILENAME == ""):
        exit(1)
def scan(self, target_file, offset=0, length=0, align=DEFAULT_BYTE_ALIGNMENT, show_invalid_results=False, callback=None):
    '''
    Performs a Binwalk scan on the target file.

    @target_file          - File to scan.
    @offset               - Starting offset at which to start the scan.
    @length               - Number of bytes to scan.
    @align                - Look for signatures every align bytes.
    @show_invalid_results - Set to True to display invalid results.
    @callback             - Callback function to be invoked when matches are found.

    The callback function is passed two arguments: a list of result dictionaries containing the scan
    results (one result per dict), and the offset at which those results were identified. Example
    callback function:

        def my_callback(offset, results):
            print "Found %d results at offset %d:" % (len(results), offset)
            for result in results:
                print "\t%s" % result['description']

        binwalk.Binwalk(callback=my_callback).scan("firmware.bin")

    Upon completion, the scan method returns a sorted list of tuples containing a list of results
    dictionaries and the offsets at which those results were identified:

        scan_items = [
                (0, [{description : "LZMA compressed data..."}]),
                (112, [{description : "gzip compressed data..."}])
        ]

    See SmartSignature.parse for a more detailed description of the results dictionary structure.
    '''
    scan_results = {}
    self.total_read = 0
    self.total_scanned = 0
    self.scan_length = length
    self.filter.show_invalid_results = show_invalid_results

    # Load the default signatures if self.load_signatures has not already been invoked
    if self.magic is None:
        self.load_signatures()

    # Get a local copy of the signature sets generated by self.parser.build_signature_set.
    # This is accessed heavily throughout the scan, and there is less overhead for accessing local variables in Python.
    signature_set = self.parser.build_signature_set()

    # Need the total size of the target file, even if we aren't scanning the whole thing
    fsize = file_size(target_file)

    # Open the target file and seek to the specified start offset
    fd = open(target_file)
    fd.seek(offset)

    # If no length was specified, make the length the size of the target file minus the starting offset
    if self.scan_length == 0:
        self.scan_length = fsize - offset

    # Sanity check on the byte alignment; default to 1
    if align <= 0:
        align = 1

    # Main loop, scan through all the data
    while True:
        i = 0

        # Read in the next block of data from the target file and make sure it's valid
        (data, dlen) = self._read_block(fd)
        if data is None or dlen == 0:
            break

        # The total number of bytes scanned could be bigger than the total number
        # of bytes read from the file under the following circumstances:
        #
        #   o The previous dlen was not a multiple of align
        #   o A previous result specified a jump offset that was beyond the end of the
        #     then current data block
        #
        # If this is the case, we need to index into this data block appropriately in order to
        # resume the scan from the appropriate offset, and adjust dlen accordingly.
        bufindex = self.total_scanned - self.total_read

        if bufindex > 0:
            # If the total_scanned > total_read, then the total_scanned offset is in a subsequent block.
            # Set i to bufindex, which will cause i to be greater than dlen and this block will be skipped.
            i = bufindex
        elif bufindex < 0:
            # If the total_scanned offset is less than total_read, then the total_scanned offset is
            # somewhere inside this block. Set i to index into the block appropriately.
            i = dlen + bufindex
        else:
            # If the total_scanned offset ends at the end of this block, don't scan any of this block
            i = dlen

        # Scan through each block of data looking for signatures
        while i < dlen:
            smart = {}
            results = []
            results_offset = -1
            pre_filter_ok = False
            smart_jump_done = False

            # Pre-filter data by checking to see if the parser thinks this might be a valid match.
            # This eliminates unnecessary calls into libmagic, which are very expensive.
            #
            # Ideally, this should be done in the MagicParser class, but function calls are expensive.
            # Doing it here greatly decreases the scan time.
            if self.smart.pre_filter:
                for (sig_offset, sigset) in signature_set:
                    if data[i+sig_offset:i+sig_offset+self.parser.MATCH_INDEX_SIZE] in sigset:
                        pre_filter_ok = True
                        break
            else:
                pre_filter_ok = True

            if pre_filter_ok:
                # Pass the data to libmagic, and split out multiple results into a list
                for magic_result in self.parser.split(self.magic.buffer(data[i:i+self.MAX_SIGNATURE_SIZE])):
                    # Some file names are not NULL byte terminated, but rather their length is
                    # specified in a size field. To ensure these are not marked as invalid due to
                    # non-printable characters existing in the file name, parse the filename(s) and
                    # trim them to the specified filename length, if one was specified.
                    magic_result = self.smart._parse_raw_strings(magic_result)

                    # Make sure this is a valid result before further processing
                    if not self.filter.invalid(magic_result):
                        # The smart filter parser returns a dictionary of keyword values and the signature description.
                        smart = self.smart.parse(magic_result)

                        # Validate the jump value and check if the response description should be displayed
                        if smart['jump'] > -1 and self._should_display(smart['description']):
                            # If multiple results are returned and one of them has smart['jump'] set to a non-zero value,
                            # the calculated results offset will be wrong since i will have been incremented. Only set the
                            # results_offset value when the first match is encountered.
                            if results_offset < 0:
                                results_offset = offset + smart['adjust'] + self.total_scanned

                            # Double check to make sure the smart['adjust'] value is sane.
                            # If it makes results_offset negative, then it is not sane.
                            if results_offset >= 0:
                                # Extract the result, if it matches one of the extract rules and is not a delayed extract.
                                if self.extractor.enabled and not (self.extractor.delayed and smart['delay']):
                                    # If the signature did not specify a size, extract to the end of the file.
                                    if smart['size'] == 0:
                                        smart['size'] = fsize - results_offset

                                    smart['extract'] = self.extractor.extract(results_offset,
                                                                              smart['description'],
                                                                              target_file,
                                                                              smart['size'],
                                                                              name=smart['name'])

                                # This appears to be a valid result, so append it to the results list.
                                results.append(smart)

                            # Jump to the offset specified by jump. Only do this once, so that if multiple results
                            # are returned each of which specify a jump offset, only the first will be honored.
                            if smart['jump'] > 0 and not smart_jump_done:
                                # Once a jump offset has been honored, we need to start scanning every byte since the
                                # jump offset may have thrown off the original alignment. In terms of speed this is fine,
                                # since the jump offset usually saves more time anyway. If this is not what the user
                                # wanted/intended, disabling pre filtering will disable jump offset processing completely.
                                align = self.DEFAULT_BYTE_ALIGNMENT

                                smart_jump_done = True
                                i += (smart['jump'] - align)
                                self.total_scanned += (smart['jump'] - align)

            # Did we find any valid results?
            if results_offset >= 0:
                scan_results[results_offset] = results

                if callback is not None:
                    callback(results_offset, results)

            # Track the number of bytes scanned in this block, and the total number of bytes scanned.
            i += align
            self.total_scanned += align

    # Sort the results before returning them
    scan_items = scan_results.items()
    scan_items.sort()

    # Do delayed extraction, if specified.
    if self.extractor.enabled and self.extractor.delayed:
        scan_items = self.extractor.delayed_extract(scan_items, target_file, fsize)

    return scan_items
def process_table(self, db, db_engine, schema_name, table_name, table_object, table_history, current_timestamp, current_sequence=0):
    """Process a specific table."""

    # skip default table and ignored tables
    if table_name == 'default':
        return
    # TODO: Allow ignore and drop table conditions to be passed to archive (log table state)
    #       and stage (to drop table and table references).
    elif table_object.ignore_table:
        logger.info(f'Skipping table: {table_name} (ignore_table=1)')
        return
    elif table_object.drop_table:
        logger.info(f'Skipping table: {table_name} (drop_table=1)')
        return

    # initialize table history's last timestamp to first timestamp if not set yet
    if not table_history.last_timestamp:
        # default first timestamp to 1900-01-01 if project has no first timestamp
        if not table_object.first_timestamp:
            table_object.first_timestamp = '1900-01-01'
        table_history.last_timestamp = iso_to_datetime(table_object.first_timestamp)

    # skip table if last timestamp > current timestamp, eg. tables pre-configured for the future
    if table_history.last_timestamp > current_timestamp:
        explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
        logger.info(f'Skipping table: {table_name} ({explanation})')
        return

    # if we're here then we have a legit last timestamp value to use for CDC
    last_timestamp = table_history.last_timestamp

    # initialize table's last_sequence to first_sequence if not set yet
    if not table_history.last_sequence:
        if not table_object.first_sequence:
            table_object.first_sequence = 0
        table_history.last_sequence = table_object.first_sequence

    self.events.start(table_name, 'table')
    # logger.info(f'Processing {table_name} ...')

    # create a fresh cursor for each table
    cursor = db.conn.cursor()

    # save table object for stage
    table_file_name = f'{self.work_folder}/{table_name}.table'
    save_jsonpickle(table_file_name, table_object)

    # discover table schema
    table_schema = db_engine.select_table_schema(schema_name, table_name)

    # handle non-existent tables
    if table_schema is None:
        if table_object.optional_table:
            logger.info(f'Optional table not found; skipped ({table_name})')
        else:
            logger.warning(f'Table not found; skipped ({table_name})')
        return

    # remove ignored columns from table schema
    if table_object.ignore_columns:
        # find columns to ignore (remove) based on ignore column names/glob-style patterns
        ignore_columns = []
        for column_name in table_schema.columns:
            for pattern in split(table_object.ignore_columns):
                if is_glob_match(column_name, pattern):
                    ignore_columns.append(column_name)

        # delete ignored columns from our table schema
        for column_name in ignore_columns:
            logger.info(f'Ignore_column: {table_name}.{column_name}')
            table_schema.columns.pop(column_name)

    # save table schema for stage to use
    schema_table_name = f'{self.work_folder}/{table_name}.schema'
    save_jsonpickle(schema_table_name, table_schema)

    # save table pk for stage to use
    pk_columns = db_engine.select_table_pk(schema_name, table_name)
    if not pk_columns and table_object.primary_key:
        pk_columns = table_object.primary_key
    save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

    # normalize cdc setting
    table_object.cdc = table_object.cdc.lower()
    if table_object.cdc == 'none':
        table_object.cdc = ''

    # clear unknown cdc settings
    if table_object.cdc and table_object.cdc not in ('filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
        logger.warning(f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})')
        table_object.cdc = ''

    # clear cdc setting when no pk_columns are present
    # NOTE: filehash cdc does not require pk_columns.
    if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
        logger.warning(f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})')
        table_object.cdc = ''

    # if no cdc, then clear cdc related attributes
    if not table_object.cdc:
        table_object.filehash = ''
        table_object.rowhash = ''
        table_object.rowversion = ''
        table_object.sequence = ''
        table_object.timestamp = ''

    # update table object properties for cdc select build
    column_names = list(table_schema.columns.keys())
    table_object.schema_name = schema_name
    table_object.table_name = table_name
    table_object.column_names = column_names
    select_cdc = cdc_select.SelectCDC(db_engine, table_object)
    sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

    # save generated SQL to work folder for documentation purposes
    sql_file_name = f'{self.work_folder}/{table_name}.sql'
    save_text(sql_file_name, sql)

    # run sql here vs via db_engine.capture_select
    # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
    cursor.execute(sql)

    # capture rows in fixed size batches to support unlimited size record counts
    # Note: Batching on capture side allows stage to insert multiple batches in parallel.
    if self.project.batch_size:
        batch_size = int(self.project.batch_size)
        # logger.info(f'Using project specific batch size: {self.project.batch_size}')
    else:
        batch_size = 250_000

    batch_number = 0
    row_count = 0
    data_size = 0
    while True:
        batch_number += 1
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break

        logger.info(f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}')
        self.progress_message(f'extracting({table_name}.{batch_number:04}) ...')

        # flatten rows to list of column values
        json_rows = [list(row) for row in rows]

        output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
        save_jsonpickle(output_file, json_rows)

        # track metrics
        row_count += len(json_rows)
        data_size += file_size(output_file)

    # update table history with new last timestamp and sequence values
    table_history.last_timestamp = current_timestamp
    table_history.last_sequence = current_sequence

    # track total row count and file size across all of a table's batched json files
    self.events.stop(table_name, row_count, data_size)

    # save interim metrics for diagnostics
    self.events.save()

    self.job_row_count += row_count
    self.job_data_size += data_size

    # explicitly close cursor when finished
    # cursor.close()
    return
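# A minimal sketch of the glob-style column matching used for ignore_columns above. The
# project's actual is_glob_match() helper is not shown in this snippet; this fnmatch-based
# version is an assumption about its behavior (case-insensitive shell-style wildcards).
import fnmatch

def is_glob_match(name, pattern):
    """Return True if name matches the shell-style wildcard pattern, ignoring case."""
    return fnmatch.fnmatch(name.lower(), pattern.lower())

# Example: ignore audit columns such as created_at/updated_at via a '*_at' pattern.
assert is_glob_match('updated_at', '*_at')
assert not is_glob_match('customer_id', '*_at')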