class Disasm(Module):

    THRESHOLD = 10
    DEFAULT_MIN_INSN_COUNT = 500

    TITLE = "Disassembly Scan"
    ORDER = 10

    CLI = [
        Option(short='Y',
               long='disasm',
               kwargs={'enabled': True},
               description='Identify the CPU architecture of a file using the capstone disassembler'),
        Option(short='T',
               long='minsn',
               type=int,
               kwargs={'min_insn_count': 0},
               description='Minimum number of consecutive instructions to be considered valid (default: %d)' % DEFAULT_MIN_INSN_COUNT),
        Option(long='continue',
               short='k',
               kwargs={'keep_going': True},
               description="Don't stop at the first match"),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='keep_going', default=False),
        Kwarg(name='min_insn_count', default=DEFAULT_MIN_INSN_COUNT),
    ]

    ARCHITECTURES = [
        Architecture(type=capstone.CS_ARCH_X86,
                     mode=capstone.CS_MODE_32,
                     endianness=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="x86 executable code, 32-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_X86,
                     mode=capstone.CS_MODE_64,
                     endianness=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="x86 executable code, 64-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_ARM,
                     endianness=capstone.CS_MODE_BIG_ENDIAN,
                     description="ARM executable code, 32-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_ARM,
                     endianness=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="ARM executable code, 32-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_ARM64,
                     mode=capstone.CS_MODE_ARM,
                     endianness=capstone.CS_MODE_BIG_ENDIAN,
                     description="ARM executable code, 64-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_ARM64,
                     mode=capstone.CS_MODE_ARM,
                     endianness=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="ARM executable code, 64-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_PPC,
                     mode=capstone.CS_MODE_BIG_ENDIAN,
                     endianness=capstone.CS_MODE_BIG_ENDIAN,
                     description="PPC executable code, 32/64-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_MIPS,
                     mode=capstone.CS_MODE_64,
                     endianness=capstone.CS_MODE_BIG_ENDIAN,
                     description="MIPS executable code, 32/64-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_MIPS,
                     mode=capstone.CS_MODE_64,
                     endianness=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="MIPS executable code, 32/64-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_THUMB,
                     endianness=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="ARM executable code, 16-bit (Thumb), little endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_THUMB,
                     endianness=capstone.CS_MODE_BIG_ENDIAN,
                     description="ARM executable code, 16-bit (Thumb), big endian"),
    ]

    def init(self):
        self.disassemblers = []

        if not self.min_insn_count:
            self.min_insn_count = self.DEFAULT_MIN_INSN_COUNT

        self.disasm_data_size = self.min_insn_count * 10

        for arch in self.ARCHITECTURES:
            self.disassemblers.append((capstone.Cs(arch.type, (arch.mode + arch.endianness)), arch.description))

    def scan_file(self, fp):
        total_read = 0

        while True:
            result = None

            (data, dlen) = fp.read_block()
            if dlen < 1:
                break

            # If this data block doesn't contain at least two different bytes, skip it
            # to prevent false positives (e.g., "\x00\x00\x00\x00" is a nop in MIPS).
            if len(set(data)) >= 2:
                block_offset = 0

                # Loop through the entire block, or until we're pretty sure
                # we've found some valid code in this block
                while (block_offset < dlen) and (result is None or result.count < self.THRESHOLD):
                    # Don't pass the entire data block into disasm_lite, it's horribly inefficient
                    # to pass large strings around in Python. Break it up into smaller code blocks instead.
                    code_block = binwalk.core.compat.str2bytes(data[block_offset:block_offset + self.disasm_data_size])

                    # If this code block doesn't contain at least two different bytes, skip it
                    # to prevent false positives (e.g., "\x00\x00\x00\x00" is a nop in MIPS).
                    if len(set(code_block)) >= 2:
                        for (md, description) in self.disassemblers:
                            insns = [insn for insn in md.disasm_lite(code_block, (total_read + block_offset))]
                            binwalk.core.common.debug("0x%.8X %s, at least %d valid instructions" %
                                                      ((total_read + block_offset), description, len(insns)))

                            # Did we disassemble at least self.min_insn_count instructions?
                            if len(insns) >= self.min_insn_count:
                                # If we've already found the same type of code in this block,
                                # simply update the result counter
                                if result and result.description == description:
                                    result.count += 1
                                    if result.count >= self.THRESHOLD:
                                        break
                                else:
                                    result = ArchResult(offset=total_read + block_offset + fp.offset,
                                                        description=description,
                                                        insns=insns,
                                                        count=1)

                    block_offset += 1
                    self.status.completed += 1

                if result is not None:
                    r = self.result(offset=result.offset,
                                    file=fp,
                                    description=(result.description + ", at least %d valid instructions" % len(result.insns)))

                    if r.valid and r.display:
                        if self.config.verbose:
                            for (position, size, mnem, opnds) in result.insns:
                                self.result(offset=position, file=fp, description="%s %s" % (mnem, opnds))

                        if not self.keep_going:
                            return

            total_read += dlen
            self.status.completed = total_read

    def run(self):
        for fp in iter(self.next_file, None):
            self.header()
            self.scan_file(fp)
            self.footer()
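
# Illustrative sketch (not part of the module above): the heuristic in
# scan_file() reduces to "how many instructions will capstone's disasm_lite()
# accept in a row at this offset?". disasm_lite() yields lightweight
# (address, size, mnemonic, op_str) tuples rather than full instruction
# objects, which is why the module uses it for speed. The function name and
# the ARM/little-endian choice below are arbitrary example values.
def _count_linear_insns_example(code_block, offset=0):
    import capstone
    md = capstone.Cs(capstone.CS_ARCH_ARM, capstone.CS_MODE_ARM + capstone.CS_MODE_LITTLE_ENDIAN)
    # Count how many consecutive instructions decode cleanly from this offset
    return sum(1 for _insn in md.disasm_lite(code_block, offset))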
class General(Module):

    TITLE = "General"
    ORDER = 0

    DEFAULT_DEPENDS = []

    CLI = [
        Option(long='length',
               short='l',
               type=int,
               kwargs={'length': 0},
               description='Number of bytes to scan'),
        Option(long='offset',
               short='o',
               type=int,
               kwargs={'offset': 0},
               description='Start scan at this file offset'),
        Option(long='block',
               short='K',
               type=int,
               kwargs={'block': 0},
               description='Set file block size'),
        Option(long='continue',
               short='k',
               kwargs={'keep_going': True},
               description="Don't stop at the first match"),
        Option(long='swap',
               short='g',
               type=int,
               kwargs={'swap_size': 0},
               description='Reverse every n bytes before scanning'),
        Option(short='I',
               long='invalid',
               kwargs={'show_invalid': True},
               description='Show results marked as invalid'),
        Option(short='x',
               long='exclude',
               kwargs={'exclude_filters': []},
               type=list,
               dtype=str.__name__,
               description='Exclude results that match <str>'),
        Option(short='y',
               long='include',
               kwargs={'include_filters': []},
               type=list,
               dtype=str.__name__,
               description='Only show results that match <str>'),
        Option(long='log',
               short='f',
               type=argparse.FileType,
               kwargs={'log_file': None},
               description='Log results to file'),
        Option(long='csv',
               short='c',
               kwargs={'csv': True},
               description='Log results to file in CSV format'),
        Option(long='term',
               short='t',
               kwargs={'format_to_terminal': True},
               description='Format output to fit the terminal window'),
        Option(long='quiet',
               short='q',
               kwargs={'quiet': True},
               description='Suppress output to stdout'),
        Option(long='verbose',
               short='v',
               kwargs={'verbose': True},
               description='Enable verbose output'),
        Option(short='h',
               long='help',
               kwargs={'show_help': True},
               description='Show help output'),
        Option(long=None,
               short=None,
               type=binwalk.core.common.BlockFile,
               kwargs={'files': []}),
    ]

    KWARGS = [
        Kwarg(name='length', default=0),
        Kwarg(name='offset', default=0),
        Kwarg(name='block', default=0),
        Kwarg(name='swap_size', default=0),
        Kwarg(name='show_invalid', default=False),
        Kwarg(name='include_filters', default=[]),
        Kwarg(name='exclude_filters', default=[]),
        Kwarg(name='log_file', default=None),
        Kwarg(name='csv', default=False),
        Kwarg(name='format_to_terminal', default=False),
        Kwarg(name='quiet', default=False),
        Kwarg(name='verbose', default=False),
        Kwarg(name='files', default=[]),
        Kwarg(name='show_help', default=False),
        Kwarg(name='keep_going', default=False),
    ]

    PRIMARY = False

    def load(self):
        self.target_files = []

        # Order is important with these two methods
        self._open_target_files()
        self._set_verbosity()

        #self.filter = binwalk.core.filter.Filter(self._display_invalid)
        self.filter = binwalk.core.filter.Filter(self.show_invalid)

        # Set any specified include/exclude filters
        for regex in self.exclude_filters:
            self.filter.exclude(regex)
        for regex in self.include_filters:
            self.filter.include(regex)

        self.settings = binwalk.core.settings.Settings()
        self.display = binwalk.core.display.Display(log=self.log_file,
                                                    csv=self.csv,
                                                    quiet=self.quiet,
                                                    verbose=self.verbose,
                                                    filter=self.filter,
                                                    fit_to_screen=self.format_to_terminal)

        if self.show_help:
            show_help()
            if not binwalk.core.idb.LOADED_IN_IDA:
                sys.exit(0)

    def reset(self):
        for fp in self.target_files:
            fp.reset()

    def __del__(self):
        self._cleanup()

    def __exit__(self, a, b, c):
        self._cleanup()

    def _cleanup(self):
        if hasattr(self, 'target_files'):
            for fp in self.target_files:
                fp.close()

    def _set_verbosity(self):
        '''
        Sets the appropriate verbosity.
        Must be called after self._test_target_files so that self.target_files
        is properly set.
        '''
        # If more than one target file was specified, enable verbose mode; else,
        # there is nothing in some outputs to indicate which scan corresponds to
        # which file.
        if len(self.target_files) > 1 and not self.verbose:
            self.verbose = True

    def open_file(self, fname, length=None, offset=None, swap=None, block=None, peek=None):
        '''
        Opens the specified file with all pertinent configuration settings.
        '''
        if length is None:
            length = self.length
        if offset is None:
            offset = self.offset
        if swap is None:
            swap = self.swap_size

        return binwalk.core.common.BlockFile(fname,
                                             length=length,
                                             offset=offset,
                                             swap=swap,
                                             block=block,
                                             peek=peek)

    def _open_target_files(self):
        '''
        Checks if the target files can be opened.
        Any files that cannot be opened are removed from the self.target_files list.
        '''
        # Validate the target files listed in target_files
        for tfile in self.files:
            # Ignore directories.
            if not os.path.isdir(tfile):
                # Make sure we can open the target files
                try:
                    self.target_files.append(self.open_file(tfile))
                except KeyboardInterrupt as e:
                    raise e
                except Exception as e:
                    self.error(description="Cannot open file : %s" % str(e))
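
# Illustrative sketch (not part of the module above): how the scan modules in
# this file consume files opened via General.open_file(). The
# (data, dlen) = fp.read_block() loop mirrors the scan loops used throughout
# these modules; 'firmware.bin' is a hypothetical input path.
def _blockfile_read_example(path='firmware.bin'):
    fp = binwalk.core.common.BlockFile(path, offset=0, length=0)
    total = 0
    while True:
        (data, dlen) = fp.read_block()
        if dlen < 1:
            break
        total += dlen
    fp.close()
    return total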
class Entropy(Module):

    XLABEL = 'Offset'
    YLABEL = 'Entropy'

    XUNITS = 'B'
    YUNITS = 'E'

    FILE_WIDTH = 1024
    FILE_FORMAT = 'png'

    COLORS = ['r', 'g', 'c', 'b', 'm']

    DEFAULT_BLOCK_SIZE = 1024
    DEFAULT_DATA_POINTS = 2048

    DEFAULT_TRIGGER_HIGH = .95
    DEFAULT_TRIGGER_LOW = .85

    TITLE = "Entropy Analysis"
    ORDER = 8

    # TODO: Add --dpoints option to set the number of data points?
    CLI = [
        Option(short='E',
               long='entropy',
               kwargs={'enabled': True},
               description='Calculate file entropy'),
        Option(short='F',
               long='fast',
               kwargs={'use_zlib': True},
               description='Use faster, but less detailed, entropy analysis'),
        Option(short='J',
               long='save',
               kwargs={'save_plot': True},
               description='Save plot as a PNG'),
        Option(short='Q',
               long='nlegend',
               kwargs={'show_legend': False},
               description='Omit the legend from the entropy plot graph'),
        Option(short='N',
               long='nplot',
               kwargs={'do_plot': False},
               description='Do not generate an entropy plot graph'),
        Option(short='H',
               long='high',
               type=float,
               kwargs={'trigger_high': DEFAULT_TRIGGER_HIGH},
               description='Set the rising edge entropy trigger threshold (default: %.2f)' % DEFAULT_TRIGGER_HIGH),
        Option(short='L',
               long='low',
               type=float,
               kwargs={'trigger_low': DEFAULT_TRIGGER_LOW},
               description='Set the falling edge entropy trigger threshold (default: %.2f)' % DEFAULT_TRIGGER_LOW),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='save_plot', default=False),
        Kwarg(name='trigger_high', default=DEFAULT_TRIGGER_HIGH),
        Kwarg(name='trigger_low', default=DEFAULT_TRIGGER_LOW),
        Kwarg(name='use_zlib', default=False),
        Kwarg(name='display_results', default=True),
        Kwarg(name='do_plot', default=True),
        Kwarg(name='show_legend', default=True),
        Kwarg(name='block_size', default=0),
    ]

    # Run this module last so that it can process all other modules' results
    # and overlay them on the entropy graph
    PRIORITY = 0

    def init(self):
        self.HEADER[-1] = "ENTROPY"
        self.max_description_length = 0
        self.file_markers = {}

        if self.use_zlib:
            self.algorithm = self.gzip
        else:
            self.algorithm = self.shannon

        # Get a list of all other modules' results to mark on the entropy graph
        for (module, obj) in iterator(self.modules):
            for result in obj.results:
                if result.plot and result.file and result.description:
                    description = result.description.split(',')[0]

                    if not has_key(self.file_markers, result.file.name):
                        self.file_markers[result.file.name] = []

                    if len(description) > self.max_description_length:
                        self.max_description_length = len(description)

                    self.file_markers[result.file.name].append((result.offset, description))

        # If other modules have been run and they produced results, don't spam
        # the terminal with entropy results
        if self.file_markers:
            self.display_results = False

        if not self.block_size:
            if self.config.block:
                self.block_size = self.config.block
            else:
                self.block_size = None

    def run(self):
        for fp in iter(self.next_file, None):
            if self.display_results:
                self.header()

            self.calculate_file_entropy(fp)

            if self.display_results:
                self.footer()

        if self.do_plot:
            import pyqtgraph as pg

            if not self.save_plot:
                from pyqtgraph.Qt import QtGui
                QtGui.QApplication.instance().exec_()

            pg.exit()

    def calculate_file_entropy(self, fp):
        # Tracks the last displayed rising/falling edge (0 for falling, 1 for
        # rising, None if nothing has been printed yet)
        last_edge = None
        # Auto-reset the trigger; if True, an entropy above/below
        # self.trigger_high/self.trigger_low will be printed
        trigger_reset = True

        # Clear results from any previously analyzed files
        self.clear(results=True)

        # If -K was not specified, calculate the block size to create
        # DEFAULT_DATA_POINTS data points
        if self.block_size is None:
            block_size = fp.size / self.DEFAULT_DATA_POINTS
            # Round up to the nearest DEFAULT_BLOCK_SIZE (1024)
            block_size = int(block_size + ((self.DEFAULT_BLOCK_SIZE - block_size) % self.DEFAULT_BLOCK_SIZE))
        else:
            block_size = self.block_size

        binwalk.core.common.debug("Entropy block size (%d data points): %d" %
                                  (self.DEFAULT_DATA_POINTS, block_size))

        while True:
            file_offset = fp.tell()

            (data, dlen) = fp.read_block()
            if not data:
                break

            i = 0
            while i < dlen:
                entropy = self.algorithm(data[i:i + block_size])
                display = self.display_results
                description = "%f" % entropy

                if not self.config.verbose:
                    if last_edge in [None, 0] and entropy > self.trigger_low:
                        trigger_reset = True
                    elif last_edge in [None, 1] and entropy < self.trigger_high:
                        trigger_reset = True

                    if trigger_reset and entropy >= self.trigger_high:
                        description = "Rising entropy edge (%f)" % entropy
                        display = self.display_results
                        last_edge = 1
                        trigger_reset = False
                    elif trigger_reset and entropy <= self.trigger_low:
                        description = "Falling entropy edge (%f)" % entropy
                        display = self.display_results
                        last_edge = 0
                        trigger_reset = False
                    else:
                        display = False
                        description = "%f" % entropy

                r = self.result(offset=(file_offset + i),
                                file=fp,
                                entropy=entropy,
                                description=description,
                                display=display)

                i += block_size

        if self.do_plot:
            self.plot_entropy(fp.name)

    def shannon(self, data):
        '''
        Performs a Shannon entropy analysis on a given block of data.
        '''
        entropy = 0

        if data:
            length = len(data)

            seen = dict(((chr(x), 0) for x in range(0, 256)))
            for byte in data:
                seen[byte] += 1

            for x in range(0, 256):
                p_x = float(seen[chr(x)]) / length
                if p_x > 0:
                    entropy -= p_x * math.log(p_x, 2)

        return (entropy / 8)

    def gzip(self, data, truncate=True):
        '''
        Performs an entropy analysis based on zlib compression ratio.
        This is faster than the shannon entropy analysis, but not as accurate.
        '''
        # Entropy is a simple ratio of: <zlib compressed size> / <original size>
        e = float(float(len(zlib.compress(str2bytes(data), 9))) / float(len(data)))

        if truncate and e > 1.0:
            e = 1.0

        return e

    def plot_entropy(self, fname):
        import numpy as np
        import pyqtgraph as pg
        import pyqtgraph.exporters as exporters

        i = 0
        x = []
        y = []
        plotted_colors = {}

        for r in self.results:
            x.append(r.offset)
            y.append(r.entropy)

        plt = pg.plot(title=fname, clear=True)

        # Disable auto-ranging of the Y (entropy) axis, as it can cause some
        # very un-intuitive graphs, particularly for files with only
        # high-entropy data.
        plt.setYRange(0, 1)

        if self.show_legend and has_key(self.file_markers, fname):
            plt.addLegend(size=(self.max_description_length * 10, 0))

            for (offset, description) in self.file_markers[fname]:
                # If this description has already been plotted at a different offset, we need to
                # use the same color for the marker, but set the description to None to prevent
                # duplicate entries in the graph legend.
                #
                # Else, get the next color and use it to mark descriptions of this type.
                if has_key(plotted_colors, description):
                    color = plotted_colors[description]
                    description = None
                else:
                    color = self.COLORS[i]
                    plotted_colors[description] = color

                    i += 1
                    if i >= len(self.COLORS):
                        i = 0

                plt.plot(x=[offset, offset], y=[0, 1.1], name=description, pen=pg.mkPen(color, width=2.5))

        # Plot data points
        plt.plot(x, y, pen='y')

        # TODO: legend is not displayed properly when saving plots to disk
        if self.save_plot:
            exporter = exporters.ImageExporter.ImageExporter(plt.plotItem)
            exporter.parameters()['width'] = self.FILE_WIDTH
            exporter.export(binwalk.core.common.unique_file_name(os.path.basename(fname), self.FILE_FORMAT))
        else:
            plt.setLabel('left', self.YLABEL, units=self.YUNITS)
            plt.setLabel('bottom', self.XLABEL, units=self.XUNITS)
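
# Illustrative, self-contained sketch (not part of the module above): the
# shannon() method computes the textbook H = -sum(p_x * log2(p_x)) over the
# 256 possible byte values, then divides by 8 (the maximum entropy, in bits,
# of one byte) so results land in the 0.0 - 1.0 range that the trigger
# thresholds expect. The function name below is an arbitrary example.
def _shannon_entropy_example(data):
    import math
    from collections import Counter

    if not data:
        return 0.0

    length = float(len(data))
    # Sum only over byte values that actually occur; p_x == 0 terms contribute nothing
    entropy = -sum((n / length) * math.log(n / length, 2)
                   for n in Counter(data).values())
    return entropy / 8  # normalize: 8 bits is the max entropy of one byte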
class CodeID(Module):

    DEFAULT_MIN_INSN_COUNT = 500

    TITLE = "Disassembly Scan"
    ORDER = 10

    CLI = [
        Option(short='Y',
               long='code',
               kwargs={'enabled': True},
               description='Attempts to identify the CPU architecture of a file using the capstone disassembler'),
        Option(short='T',
               long='minsn',
               type=int,
               kwargs={'min_insn_count': 0},
               description='Minimum number of consecutive instructions to be considered valid (default: %d)' % DEFAULT_MIN_INSN_COUNT),
        Option(short='V',
               long='disasm',
               kwargs={'show_disasm': True},
               description='Display the disassembled instructions'),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='show_disasm', default=False),
        Kwarg(name='min_insn_count', default=DEFAULT_MIN_INSN_COUNT),
    ]

    ARCHITECTURES = [
        Architecture(type=capstone.CS_ARCH_MIPS,
                     mode=capstone.CS_MODE_32,
                     endianess=capstone.CS_MODE_BIG_ENDIAN,
                     description="MIPS executable code, 32-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_MIPS,
                     mode=capstone.CS_MODE_32,
                     endianess=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="MIPS executable code, 32-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_ARM,
                     endianess=capstone.CS_MODE_BIG_ENDIAN,
                     description="ARM executable code, 32-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_ARM,
                     endianess=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="ARM executable code, 32-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_PPC,
                     mode=capstone.CS_MODE_BIG_ENDIAN,
                     endianess=capstone.CS_MODE_BIG_ENDIAN,
                     description="PPC executable code, 32/64-bit, big endian"),
        #Architecture(type=capstone.CS_ARCH_MIPS,
        #             mode=capstone.CS_MODE_16,
        #             endianess=capstone.CS_MODE_BIG_ENDIAN,
        #             description="MIPS executable code, 16-bit, big endian"),
        #Architecture(type=capstone.CS_ARCH_MIPS,
        #             mode=capstone.CS_MODE_16,
        #             endianess=capstone.CS_MODE_LITTLE_ENDIAN,
        #             description="MIPSEL executable code, 16-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_THUMB,
                     endianess=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="ARM executable code, 16-bit (Thumb), little endian"),
        Architecture(type=capstone.CS_ARCH_ARM,
                     mode=capstone.CS_MODE_THUMB,
                     endianess=capstone.CS_MODE_BIG_ENDIAN,
                     description="ARM executable code, 16-bit (Thumb), big endian"),
        Architecture(type=capstone.CS_ARCH_MIPS,
                     mode=capstone.CS_MODE_64,
                     endianess=capstone.CS_MODE_BIG_ENDIAN,
                     description="MIPS executable code, 64-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_MIPS,
                     mode=capstone.CS_MODE_64,
                     endianess=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="MIPS executable code, 64-bit, little endian"),
        Architecture(type=capstone.CS_ARCH_ARM64,
                     mode=capstone.CS_MODE_ARM,
                     endianess=capstone.CS_MODE_BIG_ENDIAN,
                     description="ARM executable code, 64-bit, big endian"),
        Architecture(type=capstone.CS_ARCH_ARM64,
                     mode=capstone.CS_MODE_ARM,
                     endianess=capstone.CS_MODE_LITTLE_ENDIAN,
                     description="ARM executable code, 64-bit, little endian"),
    ]

    def init(self):
        self.disassemblers = []

        if not self.min_insn_count:
            self.min_insn_count = self.DEFAULT_MIN_INSN_COUNT

        self.disasm_data_size = self.min_insn_count * 10

        for arch in self.ARCHITECTURES:
            self.disassemblers.append((capstone.Cs(arch.type, (arch.mode + arch.endianess)), arch.description))

    def scan_file(self, fp):
        total_read = 0

        while True:
            (data, dlen) = fp.read_block()
            if not data:
                break

            # If this data block doesn't contain at least two different bytes, skip it
            # to prevent false positives (e.g., "\x00\x00\x00\x00" is a nop in MIPS).
            if len(set(data)) >= 2:
                block_offset = 0

                while block_offset < dlen:
                    # Don't pass the entire data block into disasm_lite, it's horribly inefficient
                    # to pass large strings around in Python. Break it up into smaller code blocks instead.
                    code_block = binwalk.core.compat.str2bytes(data[block_offset:block_offset + self.disasm_data_size])

                    # If this code block doesn't contain at least two different bytes, skip it
                    # to prevent false positives (e.g., "\x00\x00\x00\x00" is a nop in MIPS).
                    if len(set(code_block)) >= 2:
                        for (md, description) in self.disassemblers:
                            insns = [insn for insn in md.disasm_lite(code_block, (total_read + block_offset))]
                            binwalk.core.common.debug("0x%.8X %s, at least %d valid instructions" %
                                                      ((total_read + block_offset), description, len(insns)))

                            if len(insns) >= self.min_insn_count:
                                r = self.result(offset=total_read + block_offset,
                                                file=fp,
                                                description=(description + ", at least %d valid instructions" % len(insns)))

                                if r.valid and r.display:
                                    if self.show_disasm:
                                        for (position, size, mnem, opnds) in insns:
                                            self.result(offset=position,
                                                        file=fp,
                                                        description="\t%s %s" % (mnem, opnds))

                                    if not self.config.verbose:
                                        return

                    block_offset += 1

            total_read += dlen

    def run(self):
        for fp in iter(self.next_file, None):
            self.header()
            self.scan_file(fp)
            self.footer()
class Entropy(Module):

    XLABEL = 'Offset'
    YLABEL = 'Entropy'

    XUNITS = 'B'
    YUNITS = 'E'

    FILE_WIDTH = 1024
    FILE_FORMAT = 'png'

    COLORS = ['g', 'r', 'c', 'm', 'y']

    DEFAULT_BLOCK_SIZE = 1024
    DEFAULT_DATA_POINTS = 2048

    DEFAULT_TRIGGER_HIGH = .95
    DEFAULT_TRIGGER_LOW = .85

    TITLE = "Entropy Analysis"
    ORDER = 8

    # TODO: Add --dpoints option to set the number of data points?
    CLI = [
        Option(short='E',
               long='entropy',
               kwargs={'enabled': True},
               description='Calculate file entropy'),
        Option(short='F',
               long='fast',
               kwargs={'use_zlib': True},
               description='Use faster, but less detailed, entropy analysis'),
        Option(short='J',
               long='save',
               kwargs={'save_plot': True},
               description='Save plot as a PNG'),
        Option(short='Q',
               long='nlegend',
               kwargs={'show_legend': False},
               description='Omit the legend from the entropy plot graph'),
        Option(short='N',
               long='nplot',
               kwargs={'do_plot': False},
               description='Do not generate an entropy plot graph'),
        Option(short='H',
               long='high',
               type=float,
               kwargs={'trigger_high': DEFAULT_TRIGGER_HIGH},
               description='Set the rising edge entropy trigger threshold (default: %.2f)' % DEFAULT_TRIGGER_HIGH),
        Option(short='L',
               long='low',
               type=float,
               kwargs={'trigger_low': DEFAULT_TRIGGER_LOW},
               description='Set the falling edge entropy trigger threshold (default: %.2f)' % DEFAULT_TRIGGER_LOW),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='save_plot', default=False),
        Kwarg(name='trigger_high', default=DEFAULT_TRIGGER_HIGH),
        Kwarg(name='trigger_low', default=DEFAULT_TRIGGER_LOW),
        Kwarg(name='use_zlib', default=False),
        Kwarg(name='display_results', default=True),
        Kwarg(name='do_plot', default=True),
        Kwarg(name='show_legend', default=True),
        Kwarg(name='block_size', default=0),
    ]

    # Run this module last so that it can process all other modules' results
    # and overlay them on the entropy graph
    PRIORITY = 0

    def init(self):
        self.HEADER[-1] = "ENTROPY"
        self.max_description_length = 0
        self.file_markers = {}

        if self.use_zlib:
            self.algorithm = self.gzip
        else:
            self.algorithm = self.shannon

        # Get a list of all other modules' results to mark on the entropy graph
        for (module, obj) in iterator(self.modules):
            for result in obj.results:
                if result.plot and result.file and result.description:
                    description = result.description.split(',')[0]

                    if not has_key(self.file_markers, result.file.name):
                        self.file_markers[result.file.name] = []

                    if len(description) > self.max_description_length:
                        self.max_description_length = len(description)

                    self.file_markers[result.file.name].append((result.offset, description))

        # If other modules have been run and they produced results, don't spam
        # the terminal with entropy results
        if self.file_markers:
            self.display_results = False

        if not self.block_size:
            if self.config.block:
                self.block_size = self.config.block
            else:
                self.block_size = None

    def _entropy_sigterm_handler(self, *args):
        print("F**k it all.")

    def run(self):
        self._run()

    def _run(self):
        # Sanity check and warning if matplotlib isn't found
        if self.do_plot:
            try:
                import matplotlib.pyplot as plt
            except ImportError as e:
                binwalk.core.common.warning("Failed to import matplotlib module, visual entropy graphing will be disabled")
                self.do_plot = False

        for fp in iter(self.next_file, None):
            if self.display_results:
                self.header()

            self.calculate_file_entropy(fp)

            if self.display_results:
                self.footer()

    def calculate_file_entropy(self, fp):
        # Tracks the last displayed rising/falling edge (0 for falling, 1 for
        # rising, None if nothing has been printed yet)
        last_edge = None
        # Auto-reset the trigger; if True, an entropy above/below
        # self.trigger_high/self.trigger_low will be printed
        trigger_reset = True

        # Clear results from any previously analyzed files
        self.clear(results=True)

        # If -K was not specified, calculate the block size to create
        # DEFAULT_DATA_POINTS data points
        if self.block_size is None:
            block_size = fp.size / self.DEFAULT_DATA_POINTS
            # Round up to the nearest DEFAULT_BLOCK_SIZE (1024)
            block_size = int(block_size + ((self.DEFAULT_BLOCK_SIZE - block_size) % self.DEFAULT_BLOCK_SIZE))
        else:
            block_size = self.block_size

        # Make sure block size is greater than 0
        if block_size <= 0:
            block_size = self.DEFAULT_BLOCK_SIZE

        binwalk.core.common.debug("Entropy block size (%d data points): %d" %
                                  (self.DEFAULT_DATA_POINTS, block_size))

        while True:
            file_offset = fp.tell()

            (data, dlen) = fp.read_block()
            if dlen < 1:
                break

            i = 0
            while i < dlen:
                entropy = self.algorithm(data[i:i + block_size])
                display = self.display_results
                description = "%f" % entropy

                if not self.config.verbose:
                    if last_edge in [None, 0] and entropy > self.trigger_low:
                        trigger_reset = True
                    elif last_edge in [None, 1] and entropy < self.trigger_high:
                        trigger_reset = True

                    if trigger_reset and entropy >= self.trigger_high:
                        description = "Rising entropy edge (%f)" % entropy
                        display = self.display_results
                        last_edge = 1
                        trigger_reset = False
                    elif trigger_reset and entropy <= self.trigger_low:
                        description = "Falling entropy edge (%f)" % entropy
                        display = self.display_results
                        last_edge = 0
                        trigger_reset = False
                    else:
                        display = False
                        description = "%f" % entropy

                r = self.result(offset=(file_offset + i),
                                file=fp,
                                entropy=entropy,
                                description=description,
                                display=display)

                i += block_size

        if self.do_plot:
            self.plot_entropy(fp.name)

    def shannon(self, data):
        '''
        Performs a Shannon entropy analysis on a given block of data.
        '''
        entropy = 0

        if data:
            length = len(data)

            seen = dict(((chr(x), 0) for x in range(0, 256)))
            for byte in data:
                seen[byte] += 1

            for x in range(0, 256):
                p_x = float(seen[chr(x)]) / length
                if p_x > 0:
                    entropy -= p_x * math.log(p_x, 2)

        return (entropy / 8)

    def gzip(self, data, truncate=True):
        '''
        Performs an entropy analysis based on zlib compression ratio.
        This is faster than the shannon entropy analysis, but not as accurate.
        '''
        # Entropy is a simple ratio of: <zlib compressed size> / <original size>
        e = float(float(len(zlib.compress(str2bytes(data), 9))) / float(len(data)))

        if truncate and e > 1.0:
            e = 1.0

        return e

    def plot_entropy(self, fname):
        try:
            import matplotlib.pyplot as plt
        except ImportError as e:
            return

        i = 0
        x = []
        y = []
        plotted_colors = {}

        for r in self.results:
            x.append(r.offset)
            y.append(r.entropy)

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1, autoscale_on=True)
        ax.set_title(fname)
        ax.set_xlabel(self.XLABEL)
        ax.set_ylabel(self.YLABEL)
        ax.plot(x, y, lw=2)

        # Add a fake, invisible plot entry so that offsets at/near the
        # minimum x value (0) are actually visible on the plot.
        ax.plot(-(max(x) * .001), 0, lw=0)

        if self.show_legend and has_key(self.file_markers, fname):
            for (offset, description) in self.file_markers[fname]:
                # If this description has already been plotted at a different offset, we need to
                # use the same color for the marker, but set the description to None to prevent
                # duplicate entries in the graph legend.
                #
                # Else, get the next color and use it to mark descriptions of this type.
                if has_key(plotted_colors, description):
                    color = plotted_colors[description]
                    description = None
                else:
                    color = self.COLORS[i]
                    plotted_colors[description] = color

                    i += 1
                    if i >= len(self.COLORS):
                        i = 0

                ax.plot([offset, offset], [0, 1.1], '%s-' % color, lw=2, label=description)

            ax.legend(loc='lower right', shadow=True)

        if self.save_plot:
            out_file = os.path.join(os.getcwd(), os.path.basename(fname)) + '.png'
            fig.savefig(out_file)
        else:
            plt.show()
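
# Illustrative sketch (not part of the module above): the gzip() method
# approximates entropy as <zlib compressed size> / <original size> at maximum
# compression. High-entropy data is nearly incompressible, so the ratio
# approaches (and can slightly exceed) 1.0, which is why the module clamps it.
# The function name below is an arbitrary example.
def _zlib_entropy_example(data):
    import zlib
    # data must be bytes; level 9 = best compression, worst-case size estimate
    e = float(len(zlib.compress(data, 9))) / float(len(data))
    return min(e, 1.0)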
class Extractor(Module):

    '''
    Extractor class, responsible for extracting files from the target file and
    executing external applications, if requested.
    '''

    # Extract rules are delimited with a colon.
    # <case insensitive matching string>:<file extension>[:<command to run>]
    RULE_DELIM = ':'

    # Comments in the extract.conf files start with a pound
    COMMENT_DELIM = '#'

    # Place holder for the extracted file name in the command
    FILE_NAME_PLACEHOLDER = '%e'

    # Unique path delimiter, used for generating unique output file/directory names.
    # Useful when, for example, extracting two squashfs images (squashfs-root,
    # squashfs-root-0).
    UNIQUE_PATH_DELIMITER = '%%'

    TITLE = 'Extraction'
    ORDER = 9
    PRIMARY = False

    CLI = [
        Option(short='e',
               long='extract',
               kwargs={'load_default_rules': True, 'enabled': True},
               description='Automatically extract known file types'),
        Option(short='D',
               long='dd',
               type=list,
               dtype='type:ext:cmd',
               kwargs={'manual_rules': [], 'enabled': True},
               description='Extract <type> signatures, give the files an extension of <ext>, and execute <cmd>'),
        Option(short='M',
               long='matryoshka',
               kwargs={'matryoshka': 8},
               description='Recursively scan extracted files'),
        Option(short='d',
               long='depth',
               type=int,
               kwargs={'matryoshka': 0},
               description='Limit matryoshka recursion depth (default: 8 levels deep)'),
        Option(short='C',
               long='directory',
               type=str,
               kwargs={'base_directory': 0},
               description='Extract files/folders to a custom directory (default: current working directory)'),
        Option(short='j',
               long='size',
               type=int,
               kwargs={'max_size': 0},
               description='Limit the size of each extracted file'),
        Option(short='n',
               long='count',
               type=int,
               kwargs={'max_count': 0},
               description='Limit the number of extracted files'),
        #Option(short='u',
        #       long='limit',
        #       type=int,
        #       kwargs={'recursive_max_size': 0},
        #       description="Limit the total size of all extracted files"),
        Option(short='r',
               long='rm',
               kwargs={'remove_after_execute': True},
               description='Delete carved files after extraction'),
        Option(short='z',
               long='carve',
               kwargs={'run_extractors': False},
               description="Carve data from files, but don't execute extraction utilities"),
    ]

    KWARGS = [
        Kwarg(name='max_size', default=None),
        Kwarg(name='recursive_max_size', default=None),
        Kwarg(name='max_count', default=None),
        Kwarg(name='base_directory', default=None),
        Kwarg(name='remove_after_execute', default=False),
        Kwarg(name='load_default_rules', default=False),
        Kwarg(name='run_extractors', default=True),
        Kwarg(name='manual_rules', default=[]),
        Kwarg(name='matryoshka', default=0),
        Kwarg(name='enabled', default=False),
    ]

    def load(self):
        # Holds a list of extraction rules loaded either from a file or when
        # manually specified.
        self.extract_rules = []

        # The input file specific output directory path (default to CWD)
        if self.base_directory:
            self.directory = os.path.realpath(self.base_directory)
            if not os.path.exists(self.directory):
                os.makedirs(self.directory)
        else:
            self.directory = os.getcwd()

        # Key value pairs of input file path and output extraction path
        self.output = {}

        # Number of extracted files
        self.extraction_count = 0

        # Override the directory name used for extraction output directories
        self.output_directory_override = None

        if self.load_default_rules:
            self.load_defaults()

        for manual_rule in self.manual_rules:
            self.add_rule(manual_rule)

        if self.matryoshka:
            self.config.verbose = True

    def add_pending(self, f):
        # Ignore symlinks
        if os.path.islink(f):
            return

        # Get the file mode to check and see if it's a block/char device
        try:
            file_mode = os.stat(f).st_mode
        except OSError as e:
            return

        # Only add this to the pending list of files to scan if the file is a
        # regular file. Special files (block/character devices) can be tricky;
        # they may fail to open, or worse, simply hang when an attempt to open
        # them is made. So for recursive extraction purposes, they are ignored,
        # albeit with a warning to the user.
        if stat.S_ISREG(file_mode):
            # Make sure we can open the file too...
            try:
                fp = binwalk.core.common.BlockFile(f)
                fp.close()
                self.pending.append(f)
            except IOError as e:
                binwalk.core.common.warning("Ignoring file '%s': %s" % (f, str(e)))
        else:
            binwalk.core.common.warning("Ignoring file '%s': Not a regular file" % f)

    def reset(self):
        # Holds a list of pending files that should be scanned; only populated
        # if self.matryoshka == True
        self.pending = []

        # Holds a dictionary of extraction directories created for each scanned
        # file.
        self.extraction_directories = {}

        # Holds a dictionary of the last directory listing for a given directory;
        # used for identifying newly created/extracted files that need to be
        # appended to self.pending.
        self.last_directory_listing = {}

    def callback(self, r):
        # Make sure the file attribute is set to a compatible instance of
        # binwalk.core.common.BlockFile
        try:
            r.file.size
        except KeyboardInterrupt as e:
            pass
        except Exception as e:
            return

        if not r.size:
            size = r.file.size - r.offset
        else:
            size = r.size

        # Only extract valid results that have been marked for extraction and
        # displayed to the user. Note that r.display is still True even if
        # --quiet has been specified; it is False if the result has been
        # explicitly excluded via the -y/-x options.
        if r.valid and r.extract and r.display and (not self.max_count or self.extraction_count < self.max_count):
            # Create some extract output for this file, if it doesn't already exist
            if not binwalk.core.common.has_key(self.output, r.file.path):
                self.output[r.file.path] = ExtractInfo()

            # Attempt extraction
            binwalk.core.common.debug("Extractor callback for %s @%d [%s]" % (r.file.name, r.offset, r.description))
            (extraction_directory, dd_file, scan_extracted_files, extraction_utility) = self.extract(r.offset,
                                                                                                     r.description,
                                                                                                     r.file.path,
                                                                                                     size,
                                                                                                     r.name)

            # If the extraction was successful, self.extract will have returned
            # the output directory and name of the dd'd file
            if extraction_directory and dd_file:
                # Track the number of extracted files
                self.extraction_count += 1

                # Get the full path to the dd'd file and save it in the output
                # info for this file
                dd_file_path = os.path.join(extraction_directory, dd_file)
                self.output[r.file.path].carved[r.offset] = dd_file_path
                self.output[r.file.path].extracted[r.offset] = ExtractDetails(files=[], command=extraction_utility)

                # Do a directory listing of the output directory
                directory_listing = set(os.listdir(extraction_directory))

                # If this is a newly created output directory, self.last_directory_listing
                # won't have a record of it. If we've extracted other files to
                # this directory before, it will.
                if not has_key(self.last_directory_listing, extraction_directory):
                    self.last_directory_listing[extraction_directory] = set()

                # Loop through a list of newly created files (i.e., files that
                # weren't listed in the last directory listing)
                for f in directory_listing.difference(self.last_directory_listing[extraction_directory]):
                    # Build the full file path and add it to the extractor results
                    file_path = os.path.join(extraction_directory, f)
                    real_file_path = os.path.realpath(file_path)
                    self.result(description=file_path, display=False)

                    # Also keep a list of files created by the extraction utility
                    if real_file_path != dd_file_path:
                        self.output[r.file.path].extracted[r.offset].files.append(real_file_path)

                    # If recursion was specified, and the file is not the same
                    # one we just dd'd
                    if (self.matryoshka and
                            file_path != dd_file_path and
                            scan_extracted_files and
                            self.directory in real_file_path):
                        # If the recursion level of this file is less than or
                        # equal to our desired recursion level
                        if len(real_file_path.split(self.directory)[1].split(os.path.sep)) <= self.matryoshka:
                            # If this is a directory and we are supposed to process
                            # directories for this extractor, then add all files
                            # under that directory to the list of pending files.
                            if os.path.isdir(file_path):
                                for root, dirs, files in os.walk(file_path):
                                    for f in files:
                                        full_path = os.path.join(root, f)
                                        self.add_pending(full_path)
                            # If it's just a file, add it to the list of pending files
                            else:
                                self.add_pending(file_path)

                # Update the last directory listing for the next time we
                # extract a file to this same output directory
                self.last_directory_listing[extraction_directory] = directory_listing

    def append_rule(self, r):
        self.extract_rules.append(r.copy())

    def prepend_rule(self, r):
        self.extract_rules = [r] + self.extract_rules

    def add_rule(self, txtrule=None, regex=None, extension=None, cmd=None, codes=[0, None], recurse=True, prepend=False):
        rules = self.create_rule(txtrule, regex, extension, cmd, codes, recurse)
        for r in rules:
            if prepend:
                self.prepend_rule(r)
            else:
                self.append_rule(r)

    def create_rule(self, txtrule=None, regex=None, extension=None, cmd=None, codes=[0, None], recurse=True):
        '''
        Adds a set of rules to the extraction rule list.

        @txtrule   - Rule string, or list of rule strings, in the format
                     <regular expression>:<file extension>[:<command to run>]
        @regex     - If rule string is not specified, this is the regular expression string to use.
        @extension - If rule string is not specified, this is the file extension to use.
        @cmd       - If rule string is not specified, this is the command to run.
                     Alternatively a callable object may be specified, which will be passed
                     one argument: the path to the file to extract.
        @codes     - A list of valid return codes for the extractor.
        @recurse   - If False, extracted directories will not be recursed into when
                     the matryoshka option is enabled.

        Returns None.
        '''
        rules = []
        created_rules = []
        match = False
        r = {
            'extension': '',
            'cmd': '',
            'regex': None,
            'codes': codes,
            'recurse': recurse,
        }

        # Process single explicitly specified rule
        if not txtrule and regex and extension:
            r['extension'] = extension
            r['regex'] = re.compile(regex)
            if cmd:
                r['cmd'] = cmd

            return [r]

        # Process rule string, or list of rule strings
        if not isinstance(txtrule, type([])):
            rules = [txtrule]
        else:
            rules = txtrule

        for rule in rules:
            r['cmd'] = ''
            r['extension'] = ''

            try:
                values = self._parse_rule(rule)
                match = values[0]
                r['regex'] = re.compile(values[0])
                r['extension'] = values[1]
                r['cmd'] = values[2]
                r['codes'] = values[3]
                r['recurse'] = values[4]
            except KeyboardInterrupt as e:
                raise e
            except Exception:
                pass

            # Verify that the match string was retrieved.
            if match:
                created_rules.append(r)

        return created_rules

    def remove_rules(self, description):
        '''
        Remove all rules that match a specified description.

        @description - The description to match against.

        Returns the number of rules removed.
        '''
        rm = []
        description = description.lower()

        for i in range(0, len(self.extract_rules)):
            if self.extract_rules[i]['regex'].search(description):
                rm.append(i)

        # Pop the highest indices first so that earlier indices remain valid
        for i in sorted(rm, reverse=True):
            self.extract_rules.pop(i)

        return len(rm)

    def edit_rules(self, description, key, value):
        '''
        Edit all rules that match a specified description.

        @description - The description to match against.
        @key         - The key to change for each matching rule.
        @value       - The new key value for each matching rule.

        Returns the number of rules modified.
        '''
        count = 0
        description = description.lower()

        for i in range(0, len(self.extract_rules)):
            if self.extract_rules[i]['regex'].search(description):
                if has_key(self.extract_rules[i], key):
                    self.extract_rules[i][key] = value
                    count += 1

        return count

    def clear_rules(self):
        '''
        Deletes all extraction rules.

        Returns None.
        '''
        self.extract_rules = []

    def get_rules(self, description=None):
        '''
        Returns a list of extraction rules that match a given description.

        @description - The description to match against.

        Returns a list of extraction rules that match the given description.
        If no description is provided, a list of all rules are returned.
        '''
        if description:
            rules = []
            description = description.lower()
            for i in range(0, len(self.extract_rules)):
                if self.extract_rules[i]['regex'].search(description):
                    rules.append(self.extract_rules[i])
        else:
            rules = self.extract_rules

        return rules

    def load_from_file(self, fname):
        '''
        Loads extraction rules from the specified file.

        @fname - Path to the extraction rule file.

        Returns None.
        '''
        try:
            # Process each line from the extract file, ignoring comments
            with open(fname, 'r') as f:
                for rule in f.readlines():
                    self.add_rule(rule.split(self.COMMENT_DELIM, 1)[0])
        except KeyboardInterrupt as e:
            raise e
        except Exception as e:
            raise Exception("Extractor.load_from_file failed to load file '%s': %s" % (fname, str(e)))

    def load_defaults(self):
        '''
        Loads default extraction rules from the user and system extract.conf files.

        Returns None.
        '''
        # Load the user extract file first to ensure its rules take precedence.
        extract_files = [
            self.config.settings.user.extract,
            self.config.settings.system.extract,
        ]

        for extract_file in extract_files:
            if extract_file:
                try:
                    self.load_from_file(extract_file)
                except KeyboardInterrupt as e:
                    raise e
                except Exception as e:
                    if binwalk.core.common.DEBUG:
                        raise Exception("Extractor.load_defaults failed to load file '%s': %s" % (extract_file, str(e)))

    def get_output_directory_override(self):
        '''
        Returns the current output directory basename override value.
        '''
        return self.output_directory_override

    def override_output_directory_basename(self, dirname):
        '''
        Allows the overriding of the default extraction directory basename.

        @dirname - The directory base name to use.

        Returns the current output directory basename override value.
        '''
        self.output_directory_override = dirname
        return self.output_directory_override

    def build_output_directory(self, path):
        '''
        Set the output directory for extracted files.

        @path - The path to the file that data will be extracted from.

        Returns None.
        '''
        # If we have not already created an output directory for this target
        # file, create one now
        if not has_key(self.extraction_directories, path):
            basedir = os.path.dirname(path)
            basename = os.path.basename(path)

            if basedir != self.directory:
                # During recursive extraction, extracted files will be in subdirectories
                # of the CWD. This allows us to figure out the subdirectory by simply
                # splitting the target file's base directory on our known CWD.
                #
                # However, the very *first* file being scanned is not necessarily in the
                # CWD, so this will raise an IndexError. This is easy to handle though,
                # since the very first file being scanned needs to have its contents
                # extracted to ${CWD}/_basename.extracted, so we just set the subdir
                # variable to a blank string when an IndexError is encountered.
                try:
                    subdir = basedir.split(self.directory)[1][1:]
                except IndexError as e:
                    subdir = ""
            else:
                subdir = ""

            if self.output_directory_override:
                output_directory = os.path.join(self.directory, subdir, self.output_directory_override)
            else:
                outdir = os.path.join(self.directory, subdir, '_' + basename)
                output_directory = unique_file_name(outdir, extension='extracted')

            if not os.path.exists(output_directory):
                os.mkdir(output_directory)

            self.extraction_directories[path] = output_directory
            self.output[path].directory = os.path.realpath(output_directory) + os.path.sep
        # Else, just use the already created directory
        else:
            output_directory = self.extraction_directories[path]

        return output_directory

    def cleanup_extracted_files(self, tf=None):
        '''
        Set the action to take after a file is extracted.

        @tf - If set to True, extracted files will be cleaned up after running a command against them.
              If set to False, extracted files will not be cleaned up after running a command against them.
              If set to None or not specified, the current setting will not be changed.

        Returns the current cleanup status (True/False).
        '''
        if tf is not None:
            self.remove_after_execute = tf

        return self.remove_after_execute

    def extract(self, offset, description, file_name, size, name=None):
        '''
        Extract an embedded file from the target file, if it matches an extract rule.
        Called automatically by Binwalk.scan().

        @offset      - Offset inside the target file to begin the extraction.
        @description - Description of the embedded file to extract, as returned by libmagic.
        @file_name   - Path to the target file.
        @size        - Number of bytes to extract.
        @name        - Name to save the file as.

        Returns the name of the extracted file (blank string if nothing was extracted).
        '''
        fname = ''
        rule = None
        recurse = False
        original_dir = os.getcwd()
        rules = self.match(description)
        file_path = os.path.realpath(file_name)

        # No extraction rules for this file
        if not rules:
            return (None, None, False, str(None))
        else:
            binwalk.core.common.debug("Found %d matching extraction rules" % len(rules))

        # Generate the output directory name where extracted files will be stored
        output_directory = self.build_output_directory(file_name)

        # Extract to end of file if no size was specified
        if not size:
            size = file_size(file_path) - offset

        if os.path.isfile(file_path):
            os.chdir(output_directory)

            # Loop through each extraction rule until one succeeds
            for i in range(0, len(rules)):
                rule = rules[i]

                # Make sure we don't recurse into any extracted directories if
                # instructed not to
                if rule['recurse'] in [True, False]:
                    recurse = rule['recurse']
                else:
                    recurse = True

                # Copy out the data to disk, if we haven't already
                fname = self._dd(file_path, offset, size, rule['extension'], output_file_name=name)

                # If there was a command specified for this rule, try to execute it.
                # If execution fails, the next rule will be attempted.
                if rule['cmd']:
                    # Note the hash of the original file; if --rm is specified and the
                    # extraction utility modifies the original file rather than creating
                    # a new one (AFAIK none currently do, but could happen in the future),
                    # we don't want to remove this file.
                    if self.remove_after_execute:
                        fname_md5 = file_md5(fname)

                    # Execute the specified command against the extracted file
                    if self.run_extractors:
                        extract_ok = self.execute(rule['cmd'], fname, rule['codes'])
                    else:
                        extract_ok = True

                    # Only clean up files if remove_after_execute was specified
                    if extract_ok == True and self.remove_after_execute:
                        # Remove the original file that we extracted, if it has
                        # not been modified by the extractor.
                        try:
                            if file_md5(fname) == fname_md5:
                                os.unlink(fname)
                        except KeyboardInterrupt as e:
                            raise e
                        except Exception as e:
                            pass

                    # If the command executed OK, don't try any more rules
                    if extract_ok == True:
                        break
                    # Else, remove the extracted file if this isn't the last rule in the list.
                    # If it is the last rule, leave the file on disk for the
                    # user to examine.
                    elif i != (len(rules) - 1):
                        try:
                            os.unlink(fname)
                        except KeyboardInterrupt as e:
                            raise e
                        except Exception as e:
                            pass

                # If there was no command to execute, just use the first rule
                else:
                    break

            os.chdir(original_dir)

        if rule is not None:
            return (output_directory, fname, recurse, str(rule['cmd']))
        else:
            return (output_directory, fname, recurse, '')

    def _entry_offset(self, index, entries, description):
        '''
        Gets the offset of the first entry that matches the description.

        @index       - Index into the entries list to begin searching.
        @entries     - Dictionary of result entries.
        @description - Case insensitive description.

        Returns the offset, if a matching description is found.
        Returns -1 if a matching description is not found.
        '''
        description = description.lower()

        for (offset, infos) in entries[index:]:
            for info in infos:
                if info['description'].lower().startswith(description):
                    return offset

        return -1

    def match(self, description):
        '''
        Check to see if the provided description string matches an extract rule.
        Called internally by self.extract().

        @description - Description string to check.

        Returns the associated rule dictionary if a match is found.
        Returns None if no match is found.
        '''
        rules = []
        ordered_rules = []
        description = description.lower()

        for rule in self.extract_rules:
            if rule['regex'].search(description):
                rules.append(rule)

        # Plugin rules should take precedence over external extraction commands.
        for rule in rules:
            if callable(rule['cmd']):
                ordered_rules.append(rule)
        for rule in rules:
            if not callable(rule['cmd']):
                ordered_rules.append(rule)

        return ordered_rules

    def _parse_rule(self, rule):
        '''
        Parses an extraction rule.

        @rule - Rule string.

        Returns an array of ['<case insensitive matching string>',
                             '<file extension>',
                             '<command to run>',
                             '<comma separated return codes>',
                             <recurse into extracted directories: True|False>].
        '''
        values = rule.strip().split(self.RULE_DELIM, 4)

        if len(values) >= 4:
            codes = values[3].split(',')
            for i in range(0, len(codes)):
                try:
                    codes[i] = int(codes[i], 0)
                except ValueError as e:
                    binwalk.core.common.warning("The specified return code '%s' for extractor '%s' is not a valid number!" % (codes[i], values[0]))
            values[3] = codes

        if len(values) >= 5:
            values[4] = (values[4].lower() == 'true')

        return values

    def _dd(self, file_name, offset, size, extension, output_file_name=None):
        '''
        Extracts a file embedded inside the target file.

        @file_name        - Path to the target file.
        @offset           - Offset inside the target file where the embedded file begins.
        @size             - Number of bytes to extract.
        @extension        - The file extension to assign to the extracted file on disk.
        @output_file_name - The requested name of the output file.

        Returns the extracted file name.
        '''
        total_size = 0
        # Default extracted file name is <displayed hex offset>.<extension>
        default_bname = "%X" % (offset + self.config.base)

        if self.max_size and size > self.max_size:
            size = self.max_size

        if not output_file_name or output_file_name is None:
            bname = default_bname
        else:
            # Strip the output file name of invalid/dangerous characters (like
            # file paths)
            bname = os.path.basename(output_file_name)

        fname = unique_file_name(bname, extension)

        try:
            # If byte swapping is enabled, we need to start reading at a swap-size
            # aligned offset, then index in to the read data appropriately.
            if self.config.swap_size:
                adjust = offset % self.config.swap_size
            else:
                adjust = 0

            offset -= adjust

            # Open the target file and seek to the offset
            fdin = self.config.open_file(file_name)
            fdin.seek(offset)

            # Open the output file
            try:
                fdout = BlockFile(fname, 'w')
            except KeyboardInterrupt as e:
                raise e
            except Exception as e:
                # Fall back to the default name if the requested name fails
                fname = unique_file_name(default_bname, extension)
                fdout = BlockFile(fname, 'w')

            while total_size < size:
                (data, dlen) = fdin.read_block()
                if not data:
                    break
                else:
                    total_size += (dlen - adjust)
                    if total_size > size:
                        dlen -= (total_size - size)
                    fdout.write(str2bytes(data[adjust:dlen]))
                    adjust = 0

            # Cleanup
            fdout.close()
            fdin.close()
        except KeyboardInterrupt as e:
            raise e
        except Exception as e:
            raise Exception("Extractor.dd failed to extract data from '%s' to '%s': %s" % (file_name, fname, str(e)))

        binwalk.core.common.debug("Carved data block 0x%X - 0x%X from '%s' to '%s'" % (offset, offset + size, file_name, fname))
        return fname

    def execute(self, cmd, fname, codes=[0, None]):
        '''
        Execute a command against the specified file.

        @cmd   - Command to execute.
        @fname - File to run command against.
        @codes - List of return codes indicating cmd success.

        Returns True on success, False on failure, or None if the external
        extraction utility could not be found.
        '''
        tmp = None
        rval = 0
        retval = True

        binwalk.core.common.debug("Running extractor '%s'" % str(cmd))

        try:
            if callable(cmd):
                try:
                    retval = cmd(fname)
                except KeyboardInterrupt as e:
                    raise e
                except Exception as e:
                    binwalk.core.common.warning("Internal extractor '%s' failed with exception: '%s'" % (str(cmd), str(e)))
            elif cmd:
                # If not in debug mode, create a temporary file to redirect
                # stdout and stderr to
                if not binwalk.core.common.DEBUG:
                    tmp = tempfile.TemporaryFile()

                # Generate unique file paths for all paths in the current
                # command that are surrounded by UNIQUE_PATH_DELIMITER
                while self.UNIQUE_PATH_DELIMITER in cmd:
                    need_unique_path = cmd.split(self.UNIQUE_PATH_DELIMITER)[1].split(self.UNIQUE_PATH_DELIMITER)[0]
                    unique_path = binwalk.core.common.unique_file_name(need_unique_path)
                    cmd = cmd.replace(self.UNIQUE_PATH_DELIMITER + need_unique_path + self.UNIQUE_PATH_DELIMITER, unique_path)

                # Execute.
                for command in cmd.split("&&"):
                    # Replace all instances of FILE_NAME_PLACEHOLDER in the
                    # command with fname
                    command = command.strip().replace(self.FILE_NAME_PLACEHOLDER, fname)

                    binwalk.core.common.debug("subprocess.call(%s, stdout=%s, stderr=%s)" % (command, str(tmp), str(tmp)))
                    rval = subprocess.call(shlex.split(command), stdout=tmp, stderr=tmp)

                    if rval in codes:
                        retval = True
                    else:
                        retval = False

                    binwalk.core.common.debug('External extractor command "%s" completed with return code %d (success: %s)' % (cmd, rval, str(retval)))

                    # TODO: Should errors from all commands in a command string be checked? Currently we only support
                    #       specifying one set of error codes, so at the moment, this is not done; it is up to the
                    #       final command to return success or failure (which presumably it will if previous necessary
                    #       commands were not successful, but this is an assumption).
                    # if retval == False:
                    #     break
        except KeyboardInterrupt as e:
            raise e
        except Exception as e:
            binwalk.core.common.warning("Extractor.execute failed to run external extractor '%s': %s, '%s' might not be installed correctly" % (str(cmd), str(e), str(cmd)))
            retval = None

        if tmp is not None:
            tmp.close()

        return retval
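
# Illustrative sketch (not part of the module above): the anatomy of a text
# extraction rule as consumed by _parse_rule(). The gzip rule below is a
# plausible example, not necessarily one that ships in extract.conf; %e is
# replaced with the carved file's path when the command is executed.
#
#   'gzip compressed data:gz:gunzip -f %e:0,2:True'
#
# splits on RULE_DELIM (':') into:
#   'gzip compressed data'  - case-insensitive match against result descriptions
#   'gz'                    - extension given to the carved file
#   'gunzip -f %e'          - external command run against the carved file
#   [0, 2]                  - return codes treated as success
#   True                    - recurse into extracted directories under -M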
class HeuristicCompressionAnalyzer(Module):

    '''
    Performs analysis and attempts to interpret the results.
    '''

    BLOCK_SIZE = 32
    CHI_CUTOFF = 512
    ENTROPY_TRIGGER = .90
    MIN_BLOCK_SIZE = 4096
    BLOCK_OFFSET = 1024
    ENTROPY_BLOCK_SIZE = 1024

    TITLE = "Heuristic Compression"

    DEPENDS = [
        Dependency(name='Entropy',
                   attribute='entropy',
                   kwargs={'enabled': True,
                           'do_plot': False,
                           'display_results': False,
                           'block_size': ENTROPY_BLOCK_SIZE}),
    ]

    CLI = [
        Option(short='H',
               long='heuristic',
               kwargs={'enabled': True},
               description='Heuristically classify high entropy data'),
        Option(short='a',
               long='trigger',
               kwargs={'trigger_level': 0},
               type=float,
               description='Set the entropy trigger level (0.0 - 1.0, default: %.2f)' % ENTROPY_TRIGGER),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='trigger_level', default=ENTROPY_TRIGGER),
    ]

    def init(self):
        self.blocks = {}

        self.HEADER[-1] = "HEURISTIC ENTROPY ANALYSIS"

        # Trigger level sanity check
        if self.trigger_level > 1.0:
            self.trigger_level = 1.0
        elif self.trigger_level < 0.0:
            self.trigger_level = 0.0

        if self.config.block:
            self.block_size = self.config.block
        else:
            self.block_size = self.BLOCK_SIZE

        for result in self.entropy.results:
            if not has_key(self.blocks, result.file.name):
                self.blocks[result.file.name] = []

            if result.entropy >= self.trigger_level and (not self.blocks[result.file.name] or
                                                         self.blocks[result.file.name][-1].end is not None):
                self.blocks[result.file.name].append(EntropyBlock(start=result.offset + self.BLOCK_OFFSET))
            elif result.entropy < self.trigger_level and self.blocks[result.file.name] and \
                    self.blocks[result.file.name][-1].end is None:
                self.blocks[result.file.name][-1].end = result.offset - self.BLOCK_OFFSET

    def run(self):
        for fp in iter(self.next_file, None):
            if has_key(self.blocks, fp.name):
                self.header()

                for block in self.blocks[fp.name]:
                    if block.end is None:
                        block.length = fp.offset + fp.length - block.start
                    else:
                        block.length = block.end - block.start

                    if block.length >= self.MIN_BLOCK_SIZE:
                        self.analyze(fp, block)

                self.footer()

    def analyze(self, fp, block):
        '''
        Perform analysis and interpretation.
        '''
        i = 0
        num_error = 0
        analyzer_results = []
        chi = ChiSquare()

        fp.seek(block.start)

        while i < block.length:
            j = 0
            (d, dlen) = fp.read_block()
            if not d:
                break

            while j < dlen:
                chi.reset()

                data = d[j:j + self.block_size]
                if len(data) < self.block_size:
                    break

                chi.update(data)

                if chi.chisq() >= self.CHI_CUTOFF:
                    num_error += 1

                j += self.block_size

                if (j + i) > block.length:
                    break

            i += dlen

        if num_error > 0:
            verdict = 'Moderate entropy data, best guess: compressed'
        else:
            verdict = 'High entropy data, best guess: encrypted'

        desc = '%s, size: %d, %d low entropy blocks' % (verdict, block.length, num_error)
        self.result(offset=block.start, description=desc, file=fp)
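
# Illustrative sketch (not part of the module above): the ChiSquare helper is
# defined elsewhere in binwalk, but the statistic it exposes via chisq() is,
# to a first approximation, Pearson's chi-square test of the observed byte
# histogram against a uniform distribution, as sketched below. Encrypted data
# stays near-uniform in almost every block; compressed data drifts past
# CHI_CUTOFF more often, which is what analyze() counts.
def _chi_square_example(data):
    from collections import Counter

    expected = len(data) / 256.0  # uniform expectation per byte value
    counts = Counter(bytearray(data))  # bytearray normalizes str/bytes to ints

    # Bins with zero observations still contribute expected to the sum
    return sum(((counts.get(b, 0) - expected) ** 2) / expected for b in range(256))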
class RawCompression(Module):

    TITLE = 'Raw Compression'

    CLI = [
        Option(short='X',
               long='deflate',
               kwargs={'enabled': True, 'scan_for_deflate': True},
               description='Scan for raw deflate compression streams'),
        Option(short='Z',
               long='lzma',
               kwargs={'enabled': True, 'scan_for_lzma': True},
               description='Scan for raw LZMA compression streams'),
        Option(short='P',
               long='partial',
               kwargs={'partial_scan': True},
               description='Perform a superficial, but faster, scan'),
        Option(short='S',
               long='stop',
               kwargs={'stop_on_first_hit': True},
               description='Stop after the first result'),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='partial_scan', default=False),
        Kwarg(name='stop_on_first_hit', default=False),
        Kwarg(name='scan_for_deflate', default=False),
        Kwarg(name='scan_for_lzma', default=False),
    ]

    def init(self):
        self.decompressors = []

        if self.scan_for_deflate:
            self.decompressors.append(Deflate(self))
        if self.scan_for_lzma:
            self.decompressors.append(LZMA(self))

    def run(self):
        for fp in iter(self.next_file, None):
            file_done = False

            self.header()

            while not file_done:
                (data, dlen) = fp.read_block()
                if not data:
                    break

                for i in range(0, dlen):
                    for decompressor in self.decompressors:
                        description = decompressor.decompress(data[i:i + decompressor.BLOCK_SIZE])
                        if description:
                            self.result(description=description, file=fp, offset=fp.tell() - dlen + i)
                            if self.stop_on_first_hit:
                                file_done = True
                                break

                    if file_done:
                        break

                    self.status.completed += 1

                self.status.completed = fp.tell() - fp.offset

            self.footer()
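
# Illustrative sketch (not part of the module above): the Deflate helper class
# is defined elsewhere in binwalk, but the core detection idea can be shown
# with the standard library alone: try to inflate the candidate block as a
# raw (headerless) deflate stream and treat a meaningful amount of output as
# a hit. The min_output threshold below is an arbitrary example value.
def _raw_deflate_probe_example(block, min_output=32):
    import zlib
    try:
        # wbits=-15 selects a raw deflate stream with no zlib/gzip header
        decompressed = zlib.decompressobj(-15).decompress(block)
    except zlib.error:
        return False
    return len(decompressed) >= min_output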
class Plotter(Module):

    '''
    Base class for visualizing binaries in Qt.
    Other plotter classes are derived from this.
    '''

    VIEW_DISTANCE = 1024
    MAX_2D_PLOT_POINTS = 12500
    MAX_3D_PLOT_POINTS = 25000

    TITLE = "Binary Visualization"

    CLI = [
        Option(short='3',
               long='3D',
               kwargs={'axis': 3, 'enabled': True},
               description='Generate a 3D binary visualization'),
        Option(short='2',
               long='2D',
               kwargs={'axis': 2, 'enabled': True},
               description='Project data points onto 3D cube walls only'),
        Option(short='Z',
               long='points',
               type=int,
               kwargs={'max_points': 0},
               description='Set the maximum number of plotted data points'),
        # Option(short='V',
        #        long='grids',
        #        kwargs={'show_grids': True},
        #        description='Display the x-y-z grids in the resulting plot'),
    ]

    KWARGS = [
        Kwarg(name='axis', default=3),
        Kwarg(name='max_points', default=0),
        Kwarg(name='show_grids', default=False),
        Kwarg(name='enabled', default=False),
    ]

    # There isn't really any useful data to print to the console.
    # Disable header and result output.
    HEADER = None
    RESULT = None

    def init(self):
        import pyqtgraph.opengl as gl
        from pyqtgraph.Qt import QtGui

        self.verbose = self.config.verbose
        self.offset = self.config.offset
        self.length = self.config.length
        self.plane_count = -1
        self.plot_points = None

        if self.axis == 2:
            self.MAX_PLOT_POINTS = self.MAX_2D_PLOT_POINTS
            self._generate_data_point = self._generate_2d_data_point
        elif self.axis == 3:
            self.MAX_PLOT_POINTS = self.MAX_3D_PLOT_POINTS
            self._generate_data_point = self._generate_3d_data_point
        else:
            raise Exception("Invalid Plotter axis specified: %d. Must be one of: [2,3]" % self.axis)

        if not self.max_points:
            self.max_points = self.MAX_PLOT_POINTS

        self.app = QtGui.QApplication([])
        self.window = gl.GLViewWidget()
        self.window.opts['distance'] = self.VIEW_DISTANCE

        if len(self.config.target_files) == 1:
            self.window.setWindowTitle(self.config.target_files[0].name)

    def _print(self, message):
        '''
        Print console messages. For internal use only.
        '''
        if self.verbose:
            print(message)

    def _generate_plot_points(self, data_points):
        '''
        Generates plot points from a dictionary of data points.

        @data_points - A dictionary containing each unique point and its
                       frequency of occurrence.

        Returns a set of plot points.
        '''
        total = 0
        min_weight = 0
        weightings = {}
        plot_points = {}

        # If the number of data points exceeds the maximum number of allowed
        # data points, use a weighting system to eliminate data points that
        # occur less frequently.
        if sum(data_points.values()) > self.max_points:
            # First, generate a set of weight values 1 - 10
            for i in range(1, 11):
                weightings[i] = 0

            # Go through every data point and how many times that point occurs
            for (point, count) in iterator(data_points):
                # For each data point, compare it to each remaining weight value
                for w in get_keys(weightings):
                    # If the number of times this data point occurred is >= the
                    # weight value, then increment the weight value. Since weight
                    # values are ordered lowest to highest, this means that more
                    # frequent data points also increment lower weight values.
                    # Thus, the more high-frequency data points there are, the
                    # more lower-frequency data points are eliminated.
                    if count >= w:
                        weightings[w] += 1
                    else:
                        break

                    # Throw out weight values that exceed the maximum number of
                    # data points
                    if weightings[w] > self.max_points:
                        del weightings[w]

                # If there's only one weight value left, no sense in continuing
                # the loop...
                if len(weightings) == 1:
                    break

            # The least weighted value is our minimum weight
            min_weight = min(weightings)

            # Get rid of all data points that occur less frequently than our
            # minimum weight
            for point in get_keys(data_points):
                if data_points[point] < min_weight:
                    del data_points[point]

        for point in sorted(data_points, key=data_points.get, reverse=True):
            plot_points[point] = data_points[point]
            # Register this as a result in case future modules need access to
            # the raw point information, but mark plot as False to prevent the
            # entropy module from attempting to overlay this data on its graph.
            self.result(point=point, plot=False)
            total += 1

            if total >= self.max_points:
                break

        return plot_points

    def _generate_data_point(self, data):
        '''
        Subclasses must override this to return the appropriate data point.

        @data - A string of data, self.axis bytes in length.

        Returns a data point tuple.
        '''
        return (0, 0, 0)

    def _generate_data_points(self, fp):
        '''
        Generates a dictionary of data points and their frequency of occurrence.

        @fp - The BlockFile object to generate data points from.

        Returns a dictionary.
        '''
        i = 0
        data_points = {}

        self._print("Generating data points for %s" % fp.name)

        # We don't need any extra data from BlockFile
        fp.set_block_size(peek=0)

        while True:
            (data, dlen) = fp.read_block()
            if not data or not dlen:
                break

            i = 0
            while (i + (self.axis - 1)) < dlen:
                point = self._generate_data_point(data[i:i + self.axis])
                if has_key(data_points, point):
                    data_points[point] += 1
                else:
                    data_points[point] = 1
                i += 3

        return data_points

    def _generate_plot(self, plot_points):
        import numpy as np
        import pyqtgraph.opengl as gl

        # nitems must be an integer here, since it is used as an array
        # dimension below; the frequency percentage computed in the loop
        # uses an explicit float cast so the division is never truncated.
        nitems = len(plot_points)

        pos = np.empty((nitems, 3))
        size = np.empty(nitems)
        color = np.empty((nitems, 4))

        i = 0
        for (point, weight) in iterator(plot_points):
            r = 0.0
            g = 0.0
            b = 0.0

            pos[i] = point
            frequency_percentage = weight / float(nitems)

            # Give points that occur more frequently a brighter color and a
            # larger point size. Frequency is determined as a percentage of
            # total unique data points.
            if frequency_percentage > .010:
                size[i] = .20
                r = 1.0
            elif frequency_percentage > .005:
                size[i] = .15
                b = 1.0
            elif frequency_percentage > .002:
                size[i] = .10
                g = 1.0
                r = 1.0
            else:
                size[i] = .05
                g = 1.0

            color[i] = (r, g, b, 1.0)

            i += 1

        scatter_plot = gl.GLScatterPlotItem(pos=pos, size=size, color=color, pxMode=False)
        scatter_plot.translate(-127.5, -127.5, -127.5)

        return scatter_plot

    def plot(self, wait=True):
        import pyqtgraph.opengl as gl

        self.window.show()

        if self.show_grids:
            xgrid = gl.GLGridItem()
            ygrid = gl.GLGridItem()
            zgrid = gl.GLGridItem()

            self.window.addItem(xgrid)
            self.window.addItem(ygrid)
            self.window.addItem(zgrid)

            # Rotate x and y grids to face the correct direction
            xgrid.rotate(90, 0, 1, 0)
            ygrid.rotate(90, 1, 0, 0)

            # Scale grids to the appropriate dimensions
            xgrid.scale(12.8, 12.8, 12.8)
            ygrid.scale(12.8, 12.8, 12.8)
            zgrid.scale(12.8, 12.8, 12.8)

        for fd in iter(self.next_file, None):
            data_points = self._generate_data_points(fd)

            self._print("Generating plot points from %d data points" % len(data_points))

            self.plot_points = self._generate_plot_points(data_points)
            del data_points

            self._print("Generating graph from %d plot points" % len(self.plot_points))

            self.window.addItem(self._generate_plot(self.plot_points))

        if wait:
            self.wait()

    def wait(self):
        from pyqtgraph.Qt import QtCore, QtGui

        t = QtCore.QTimer()
        t.start(50)
        QtGui.QApplication.instance().exec_()

    def _generate_3d_data_point(self, data):
        '''
        Plot data points within a 3D cube.
        '''
        return (ord(data[0]), ord(data[1]), ord(data[2]))

    def _generate_2d_data_point(self, data):
        '''
        Plot data points projected onto each cube face.
        '''
        self.plane_count += 1
        if self.plane_count > 5:
            self.plane_count = 0

        if self.plane_count == 0:
            return (0, ord(data[0]), ord(data[1]))
        elif self.plane_count == 1:
            return (ord(data[0]), 0, ord(data[1]))
        elif self.plane_count == 2:
            return (ord(data[0]), ord(data[1]), 0)
        elif self.plane_count == 3:
            return (255, ord(data[0]), ord(data[1]))
        elif self.plane_count == 4:
            return (ord(data[0]), 255, ord(data[1]))
        elif self.plane_count == 5:
            return (ord(data[0]), ord(data[1]), 255)

    def run(self):
        self.plot()
        return True
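
# Illustration (standalone, no Qt required): the heart of the visualization is
# _generate_data_points above, which maps each 3-byte window of the file to an
# (x, y, z) coordinate in a 256x256x256 cube and counts how often each point
# occurs. A minimal sketch of the same idea over a Python 3 bytes object:
def _example_count_points(data):
    points = {}
    for i in range(0, len(data) - 2, 3):
        p = (data[i], data[i + 1], data[i + 2])  # bytes indexing yields ints in Python 3
        points[p] = points.get(p, 0) + 1
    return points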
class Signature(Module):

    TITLE = "Signature Scan"
    ORDER = 10

    CLI = [
        Option(short='B',
               long='signature',
               kwargs={'enabled': True, 'explicit_signature_scan': True},
               description='Scan target file(s) for common file signatures'),
        Option(short='R',
               long='raw',
               kwargs={'enabled': True, 'raw_bytes': ''},
               type=str,
               description='Scan target file(s) for the specified sequence of bytes'),
        Option(short='A',
               long='opcodes',
               kwargs={'enabled': True, 'search_for_opcodes': True},
               description='Scan target file(s) for common executable opcodes'),
        Option(short='C',
               long='cast',
               kwargs={'enabled': True, 'cast_data_types': True},
               description='Cast offsets as a given data type (use -y to specify the data type / endianness)'),
        Option(short='m',
               long='magic',
               kwargs={'enabled': True, 'magic_files': []},
               type=list,
               dtype='file',
               description='Specify a custom magic file to use'),
        Option(short='b',
               long='dumb',
               kwargs={'dumb_scan': True},
               description='Disable smart signature keywords'),
    ]

    KWARGS = [
        Kwarg(name='enabled', default=False),
        Kwarg(name='raw_bytes', default=None),
        Kwarg(name='search_for_opcodes', default=False),
        Kwarg(name='explicit_signature_scan', default=False),
        Kwarg(name='cast_data_types', default=False),
        Kwarg(name='dumb_scan', default=False),
        Kwarg(name='magic_files', default=[]),
    ]

    VERBOSE_FORMAT = "%s %d"

    def init(self):
        # Create Signature and MagicParser class instances. These are mostly
        # for internal use.
        self.smart = binwalk.core.smart.Signature(self.config.filter,
                                                  ignore_smart_signatures=self.dumb_scan)
        self.parser = binwalk.core.parser.MagicParser(self.config.filter, self.smart)

        # If a raw byte sequence was specified, build a magic file from that
        # instead of using the default magic files
        if self.raw_bytes is not None:
            self.magic_files = [self.parser.file_from_string(self.raw_bytes)]

        # Append the user's magic file first so that those signatures take precedence
        elif self.search_for_opcodes:
            self.magic_files = [
                self.config.settings.user.binarch,
                self.config.settings.system.binarch,
            ]
        elif self.cast_data_types:
            self.magic_files = [
                self.config.settings.user.bincast,
                self.config.settings.system.bincast,
            ]

        # Use the system default magic file if no other was specified, or if
        # -B was explicitly specified
        if (not self.magic_files) or (self.explicit_signature_scan and not self.cast_data_types):
            self.magic_files.append(self.config.settings.user.binwalk)
            self.magic_files.append(self.config.settings.system.binwalk)

        # Parse the magic file(s) and initialize libmagic
        binwalk.core.common.debug("Loading magic files: %s" % str(self.magic_files))
        self.mfile = self.parser.parse(self.magic_files)
        self.magic = binwalk.core.magic.Magic(self.mfile)

        # Once the temporary magic files are loaded into libmagic, we don't
        # need them anymore; delete the temp files
        self.parser.rm_magic_files()

        self.VERBOSE = ["Signatures:", self.parser.signature_count]

    def validate(self, r):
        '''
        Called automatically by self.result.
        '''
        if not r.description:
            r.valid = False

        if r.size and (r.size + r.offset) > r.file.size:
            r.valid = False

        if r.jump and (r.jump + r.offset) > r.file.size:
            r.valid = False

        # Only consult the description filter if the result hasn't already
        # been invalidated above; an unconditional assignment here would
        # silently discard the size/jump sanity checks.
        if r.valid:
            r.valid = self.config.filter.valid_result(r.description)

    def scan_file(self, fp):
        current_file_offset = 0

        while True:
            (data, dlen) = fp.read_block()
            if not data:
                break

            current_block_offset = 0
            block_start = fp.tell() - dlen
            self.status.completed = block_start - fp.offset

            for candidate_offset in self.parser.find_signature_candidates(data, dlen):
                # current_block_offset is set when a jump-to-offset keyword is
                # encountered while processing signatures. This points to an
                # offset inside the current data block that scanning should
                # jump to, so ignore any subsequent candidate signatures that
                # occur before this offset inside the current data block.
                if candidate_offset < current_block_offset:
                    continue

                # Pass the data to libmagic for parsing
                magic_result = self.magic.buffer(data[candidate_offset:candidate_offset + fp.block_peek_size])
                if not magic_result:
                    continue

                # The smart filter parser returns a binwalk.core.module.Result object
                r = self.smart.parse(magic_result)

                # Set the absolute offset inside the target file
                r.offset = block_start + candidate_offset + r.adjust

                # Provide an instance of the current file object
                r.file = fp

                # Register the result for further processing/display.
                # self.result automatically calls self.validate for result validation.
                self.result(r=r)

                # Is this a valid result and did it specify a jump-to-offset keyword?
                if r.valid and r.jump > 0:
                    absolute_jump_offset = r.offset + r.jump
                    current_block_offset = candidate_offset + r.jump

                    # If the jump-to-offset is beyond the confines of the
                    # current block, seek the file to that offset and quit
                    # processing this block of data.
                    if absolute_jump_offset >= fp.tell():
                        fp.seek(r.offset + r.jump)
                        break

    def run(self):
        for fp in iter(self.next_file, None):
            self.header()
            self.scan_file(fp)
            self.footer()

        if hasattr(self, "magic") and self.magic:
            self.magic.close()
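
# Usage note: the Signature module is what runs in a default binwalk scan.
# Assuming this package exposes the documented binwalk.scan() convenience
# wrapper, a script-level equivalent of `binwalk -B firmware.bin` looks like
# the following ('firmware.bin' is a placeholder path):
#
#     import binwalk
#
#     for module in binwalk.scan('firmware.bin', signature=True, quiet=True):
#         for result in module.results:
#             print("0x%.8X    %s" % (result.offset, result.description))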
class HexDiff(Module):

    COLORS = {
        'red': '31',
        'green': '32',
        'blue': '34',
    }

    SEPERATORS = ['\\', '/']
    DEFAULT_BLOCK_SIZE = 16

    SKIPPED_LINE = "*"
    CUSTOM_DISPLAY_FORMAT = "0x%.8X %s"

    TITLE = "Binary Diffing"

    CLI = [
        Option(short='W',
               long='hexdump',
               kwargs={'enabled': True},
               description='Perform a hexdump / diff of a file or files'),
        Option(short='G',
               long='green',
               kwargs={'show_green': True, 'show_blue': False, 'show_red': False},
               description='Only show lines containing bytes that are the same among all files'),
        Option(short='i',
               long='red',
               kwargs={'show_red': True, 'show_blue': False, 'show_green': False},
               description='Only show lines containing bytes that are different among all files'),
        Option(short='U',
               long='blue',
               kwargs={'show_blue': True, 'show_red': False, 'show_green': False},
               description='Only show lines containing bytes that are different among some files'),
        Option(short='w',
               long='terse',
               kwargs={'terse': True},
               description='Diff all files, but only display a hex dump of the first file'),
    ]

    KWARGS = [
        Kwarg(name='show_red', default=True),
        Kwarg(name='show_blue', default=True),
        Kwarg(name='show_green', default=True),
        Kwarg(name='terse', default=False),
        Kwarg(name='enabled', default=False),
    ]

    RESULT_FORMAT = "%s\n"
    RESULT = ['display']

    def _no_colorize(self, c, color="red", bold=True):
        return c

    def _colorize(self, c, color="red", bold=True):
        attr = []

        attr.append(self.COLORS[color])
        if bold:
            attr.append('1')

        return "\x1b[%sm%s\x1b[0m" % (';'.join(attr), c)

    def _color_filter(self, data):
        red = '\x1b[' + self.COLORS['red'] + ';'
        green = '\x1b[' + self.COLORS['green'] + ';'
        blue = '\x1b[' + self.COLORS['blue'] + ';'

        if self.show_blue and blue in data:
            return True
        elif self.show_green and green in data:
            return True
        elif self.show_red and red in data:
            return True

        return False

    def hexascii(self, target_data, byte, offset):
        color = "green"

        for (fp_i, data_i) in iterator(target_data):
            diff_count = 0

            for (fp_j, data_j) in iterator(target_data):
                if fp_i == fp_j:
                    continue

                try:
                    if data_i[offset] != data_j[offset]:
                        diff_count += 1
                except IndexError:
                    diff_count += 1

            if diff_count == len(target_data) - 1:
                # This byte differs among all files
                color = "red"
            elif diff_count > 0:
                # This byte differs among some, but not all, files
                color = "blue"
                break

        hexbyte = self.colorize("%.2X" % ord(byte), color)

        if byte not in string.printable or byte in string.whitespace:
            byte = "."

        asciibyte = self.colorize(byte, color)

        return (hexbyte, asciibyte)

    def diff_files(self, target_files):
        last_line = None
        loop_count = 0
        sep_count = 0

        while True:
            line = ""
            done_files = 0
            block_data = {}
            seperator = self.SEPERATORS[sep_count % 2]

            for fp in target_files:
                block_data[fp] = fp.read(self.block)
                if not block_data[fp]:
                    done_files += 1

            # No more data from any of the target files? Done.
            if done_files == len(target_files):
                break

            for fp in target_files:
                hexline = ""
                asciiline = ""

                for i in range(0, self.block):
                    if i >= len(block_data[fp]):
                        hexbyte = "XX"
                        asciibyte = "."
                    else:
                        (hexbyte, asciibyte) = self.hexascii(block_data, block_data[fp][i], i)

                    hexline += "%s " % hexbyte
                    asciiline += "%s" % asciibyte

                line += "%s |%s|" % (hexline, asciiline)

                if self.terse:
                    break

                if fp != target_files[-1]:
                    line += " %s " % seperator

            offset = fp.offset + (self.block * loop_count)

            if not self._color_filter(line):
                display = line = self.SKIPPED_LINE
            else:
                display = self.CUSTOM_DISPLAY_FORMAT % (offset, line)
                sep_count += 1

            if line != self.SKIPPED_LINE or last_line != line:
                self.result(offset=offset, description=line, display=display)

            last_line = line
            loop_count += 1

    def init(self):
        # Disable the invalid description auto-filtering feature.
        # This will not affect our own validation.
        self.config.filter.show_invalid_results = True

        # Always disable terminal formatting, as it won't work properly with
        # colorized output
        self.config.display.fit_to_screen = False

        # Set the block size (aka, hexdump line size)
        self.block = self.config.block
        if not self.block:
            self.block = self.DEFAULT_BLOCK_SIZE

        # Build a list of files to hexdiff
        self.hex_target_files = [x for x in iter(self.next_file, None)]

        # Build the header format string
        header_width = (self.block * 4) + 2
        if self.terse:
            file_count = 1
        else:
            file_count = len(self.hex_target_files)
        self.HEADER_FORMAT = "OFFSET " + (("%%-%ds " % header_width) * file_count) + "\n"

        # Build the header argument list
        self.HEADER = [fp.name for fp in self.hex_target_files]
        if self.terse and len(self.HEADER) > 1:
            self.HEADER = self.HEADER[0]

        # Set up the tty for colorization, if it is supported
        if hasattr(sys.stderr, 'isatty') and sys.stderr.isatty() and platform.system() != 'Windows':
            curses.setupterm()
            self.colorize = self._colorize
        else:
            self.colorize = self._no_colorize

    def run(self):
        if self.hex_target_files:
            self.header()
            self.diff_files(self.hex_target_files)
            self.footer()
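
# Illustration: the escape sequences emitted by _colorize above. For example,
# _colorize("41", color="red", bold=True) yields "\x1b[31;1m41\x1b[0m":
# ESC [ 31 ; 1 m enables red (31) and bold (1), and ESC [ 0 m resets all
# attributes. This is also why _color_filter can classify an already-rendered
# line simply by searching it for the "\x1b[<color>;" prefixes.
def _example_ansi_red(text):
    return "\x1b[31;1m%s\x1b[0m" % text  # bold red, then reset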