def test_main(self):
    """Regression test: DWARF v2 CU with 8-byte addresses on ARM64.

    Verifies that pyelftools distinguishes the target address size (8)
    from the DWARF inter-DIE offset size (32-bit DWARF format).  Before
    the fix this test guards, DIE #2 was misparsed, which threw off the
    stream offset and corrupted every subsequent DIE in the unit.
    """
    def _read_section(suffix):
        # Each section of the test binary is stored as a raw byte dump
        # under test/testfiles_for_unittests.
        path = os.path.join(
            'test', 'testfiles_for_unittests',
            'arm64_on_dwarfv2.%s.dat' % suffix)
        with open(path, 'rb') as f:
            return f.read()

    # Read the three saved sections as bytestreams.
    info = _read_section('info')
    abbrev = _read_section('abbrev')
    # Renamed from `str`, which shadowed the builtin.
    str_data = _read_section('str')

    # Parse the DWARF info
    di = DWARFInfo(
        config=DwarfConfig(
            little_endian=True,
            default_address_size=8,
            machine_arch="ARM64"),
        debug_info_sec=DebugSectionDescriptor(
            io.BytesIO(info), '__debug_info', None, len(info), 0),
        debug_aranges_sec=None,
        debug_abbrev_sec=DebugSectionDescriptor(
            io.BytesIO(abbrev), '__debug_abbrev', None, len(abbrev), 0),
        debug_frame_sec=None,
        eh_frame_sec=None,
        debug_str_sec=DebugSectionDescriptor(
            io.BytesIO(str_data), '__debug_str', None, len(str_data), 0),
        debug_loc_sec=None,
        debug_ranges_sec=None,
        debug_line_sec=None,
        debug_pubtypes_sec=None,
        debug_pubnames_sec=None,
        debug_addr_sec=None,
        debug_str_offsets_sec=None,
        debug_line_str_sec=None,
    )

    CUs = list(di.iter_CUs())
    # Locate a CU that is known to have a reference in DW_FORM_ref_addr form
    CU = CUs[21]
    self.assertEqual(CU['version'], 2)
    # Make sure pyelftools appreciates the difference between the target
    # address size and the DWARF inter-DIE offset size.
    self.assertEqual(CU.structs.dwarf_format, 32)
    self.assertEqual(CU['address_size'], 8)

    DIEs = list(CU.iter_DIEs())
    # Before the patch, DIE #2 is misparsed, the current offset is off,
    # and the rest are misparsed too.
    self.assertEqual(len(DIEs), 15)
def fake_section(filename, section_name):
    """Load a saved section dump and wrap it as a DebugSectionDescriptor.

    The dump is expected on disk at ``<filename minus extension><section_name>``.
    """
    base, _ = os.path.splitext(filename)
    with open(base + section_name, 'rb') as dump:
        contents = dump.read()
    return DebugSectionDescriptor(
        stream=io.BytesIO(contents),
        name=section_name,
        global_offset=0,
        size=len(contents))
def read_macho(filename, resolve_arch, friendly_filename):
    """Parse DWARF data out of a Mach-O binary, possibly a fat one.

    filename: path to the Mach-O file.
    resolve_arch: callback given the list of slice arch names; returns the
        index of the slice to use, or None for user cancellation.
    friendly_filename: display name (only used by the commented-out
        section dump helper).

    Returns a DWARFInfo on success, None if there is no __debug_info
    section, or False if the user cancelled slice selection.
    """
    from filebytes.mach_o import MachO, CpuType, TypeFlags, LC
    fat_arch = None
    macho = MachO(filename)
    if macho.isFat:
        # `slice` shadowed the builtin; renamed to arch_slice.
        slices = [make_macho_arch_name(arch_slice)
                  for arch_slice in macho.fatArches]
        arch_no = resolve_arch(slices)
        if arch_no is None:  # User cancellation
            return False
        fat_arch = slices[arch_no]
        macho = macho.fatArches[arch_no]

    # We proceed with macho being a arch-specific file, or a slice within
    # a fat binary.
    data = {
        section.name: DebugSectionDescriptor(
            io.BytesIO(section.bytes), section.name, None,
            len(section.bytes), 0)
        for cmd in macho.loadCommands
        if cmd.header.cmd in (LC.SEGMENT, LC.SEGMENT_64)
        for section in cmd.sections
        if section.name.startswith('__debug')
    }
    #macho_save_sections(friendly_filename, macho)

    if '__debug_info' not in data:
        return None

    cpu = macho.machHeader.header.cputype
    di = DWARFInfo(
        config = DwarfConfig(
            little_endian=True,
            # ABI64 flag in the CPU type distinguishes 64-bit slices.
            default_address_size = 8 if (cpu & TypeFlags.ABI64) != 0 else 4,
            machine_arch = make_macho_arch_name(macho)
        ),
        debug_info_sec = data['__debug_info'],
        debug_aranges_sec = data.get('__debug_aranges'),
        debug_abbrev_sec = data['__debug_abbrev'],
        debug_frame_sec = data.get('__debug_frame'),
        eh_frame_sec = None,  # Haven't seen those in Mach-O
        debug_str_sec = data['__debug_str'],
        debug_loc_sec = data.get('__debug_loc'),
        debug_ranges_sec = data.get('__debug_ranges'),
        debug_line_sec = data.get('__debug_line'),
        debug_pubtypes_sec = data.get('__debug_pubtypes'),  #__debug_gnu_pubt?
        # BUG FIX: pubnames was previously wired to the __debug_pubtypes
        # section (copy-paste error).
        debug_pubnames_sec = data.get('__debug_pubnames'),  #__debug_gnu_pubn?
    )
    di._format = 1
    di._fat_arch = fat_arch
    # None (not False) is the idiomatic "not found" sentinel; truthiness
    # of the following check is unchanged.
    text_cmd = next(
        (cmd for cmd in macho.loadCommands
         if cmd.header.cmd in (LC.SEGMENT, LC.SEGMENT_64)
         and cmd.name == "__TEXT"),
        None)
    di._start_address = text_cmd.header.vmaddr if text_cmd else 0
    return di
def read_pe(filename):
    """Parse DWARF data out of a PE (Windows) executable.

    Returns a DWARFInfo, or None when the file carries no .debug_info
    section.
    """
    from filebytes.pe import PE, IMAGE_FILE_MACHINE
    pefile = PE(filename)

    # Section's real size might be padded - see
    # https://github.com/sashs/filebytes/issues/28
    data = {}
    for section in pefile.sections:
        if not section.name.startswith('.debug'):
            continue
        virtual_size = section.header.PhysicalAddress_or_VirtualSize
        raw_size = section.header.SizeOfRawData
        if virtual_size == 0:
            effective_size = raw_size
        else:
            effective_size = min((raw_size, virtual_size))
        data[section.name] = DebugSectionDescriptor(
            io.BytesIO(section.bytes), section.name, None,
            effective_size, 0)

    if '.debug_info' not in data:
        return None

    machine = pefile.imageNtHeaders.header.FileHeader.Machine
    # There are also some exotic architectures...
    is64 = machine in (IMAGE_FILE_MACHINE.AMD64,
                       IMAGE_FILE_MACHINE.ARM64,
                       IMAGE_FILE_MACHINE.IA64)

    di = DWARFInfo(
        config = DwarfConfig(
            little_endian = True,
            default_address_size = 8 if is64 else 4,
            machine_arch = IMAGE_FILE_MACHINE[machine].name
        ),
        debug_info_sec = data['.debug_info'],
        debug_aranges_sec = data.get('.debug_aranges'),
        debug_abbrev_sec = data.get('.debug_abbrev'),
        debug_frame_sec = data.get('.debug_frame'),
        eh_frame_sec = None,  # Haven't see one in the wild so far
        debug_str_sec = data.get('.debug_str'),
        debug_loc_sec = data.get('.debug_loc'),
        debug_ranges_sec = data.get('.debug_ranges'),
        debug_line_sec = data.get('.debug_line'),
        debug_pubtypes_sec = data.get('.debug_pubtypes'),
        debug_pubnames_sec = data.get('.debug_pubnames'),
    )
    di._format = 2
    return di
def _read_dwarf_section(self, section, relocate_dwarf_sections):
    """Copy a DWARF section's bytes into a fresh stream and describe it.

    The section data is duplicated into a new in-memory stream so that
    relocations (applied when relocate_dwarf_sections is true and a
    matching relocation section exists) never touch the original.
    Returns a DebugSectionDescriptor over the copied stream.
    """
    # Write (rather than construct-from-bytes) so the stream position
    # ends up where the original implementation left it.
    copied = BytesIO()
    copied.write(section.get_data())

    if relocate_dwarf_sections:
        handler = RelocationHandler(self)
        relocs = handler.find_relocations_for_section(section)
        if relocs is not None:
            handler.apply_section_relocations(copied, relocs)

    return DebugSectionDescriptor(
        stream=copied,
        name=section.name,
        global_offset=section.PointerToRawData,
        size=section.SizeOfRawData,
        address=section.get_rva_from_offset(0))
def process_file(filename):
    """Extract type and variable information from a TI COFF file's DWARF data.

    Loads every ``.debug_*`` section, walks all compile units collecting
    DIEs by tag (via die_info_rec), builds Python-side model objects
    (Type, Struct, Union, TypeDef, ...) from them, then resolves the
    DIE-offset cross-references between those objects in multiple passes.

    Returns a tuple ``(names, variables, bits_per_byte)``.

    NOTE(review): ``bits_per_byte`` is not defined anywhere in this
    function; presumably a module-level constant -- confirm.
    """
    logging.debug('Processing file: {}'.format(filename))
    logging.debug('Working directory: {}'.format(os.getcwd()))

    # Load the COFF container and pull the raw bytes of every DWARF section.
    coff = epyqlib.ticoff.Coff()
    coff.from_file(filename)

    section_bytes = {
        s.name: (io.BytesIO(s.data), len(s.data))
        for s in coff.sections
        if s.name.startswith('.debug_')
    }

    debug_sections = {
        name: DebugSectionDescriptor(
            stream=stream, name=name, global_offset=0, size=length)
        for name, (stream, length) in section_bytes.items()
    }

    from elftools.dwarf.dwarfinfo import DWARFInfo, DwarfConfig

    # default_address_size=4: the target is treated as 32-bit addressed.
    dwarfinfo = DWARFInfo(
        config=DwarfConfig(little_endian=True,
                           default_address_size=4,
                           machine_arch='<unknown>'),
        debug_info_sec=debug_sections.get('.debug_info', None),
        # debug_info_sec=DebugSectionDescriptor(
        #     stream=io.BytesIO(dwarf_debug_info_bytes),
        #     name='.debug_info',
        #     global_offset=0,
        #     size=len(dwarf_debug_info_bytes)),
        debug_aranges_sec=debug_sections.get('.debug_aranges', None),
        debug_abbrev_sec=debug_sections.get('.debug_abbrev', None),
        debug_frame_sec=debug_sections.get('.debug_frame', None),
        # TODO(eliben): reading of eh_frame is not hooked up yet
        eh_frame_sec=None,
        debug_str_sec=debug_sections.get('.debug_str', None),
        debug_loc_sec=debug_sections.get('.debug_loc', None),
        debug_ranges_sec=debug_sections.get('.debug_ranges', None),
        debug_line_sec=debug_sections.get('.debug_line', None))

    # One collection bucket per DIE tag of interest, filled by die_info_rec.
    objects = collections.OrderedDict((tag, []) for tag in [
        'DW_TAG_subprogram',
        'DW_TAG_variable',
        'DW_TAG_typedef',
        'DW_TAG_base_type',
        'DW_AT_encoding',
        'DW_TAG_structure_type',
        'DW_TAG_union_type',
        'DW_TAG_ptr_to_member_type',
        'DW_TAG_enumeration_type',
        'DW_TAG_pointer_type',
        'DW_TAG_array_type',
        'DW_TAG_volatile_type',
        'DW_TAG_const_type',
        'DW_TAG_restrict_type',
        'DW_TAG_lo_user',
        'DW_TAG_hi_user',
        'DW_TAG_unspecified_type',
        'DW_TAG_subroutine_type'
    ])

    for CU in dwarfinfo.iter_CUs():
        # it = dwarfinfo.iter_CUs()
        # while True:
        #     try:
        #         CU = next(it)
        #     except StopIteration:
        #         break
        #     except elftools.common.exceptions.DWARFError:
        #         traceback.print_exc()
        #         logging.debug('Skipping current CU')
        #         next

        # DWARFInfo allows to iterate over the compile units contained in
        # the .debug_info section. CU is a CompileUnit object, with some
        # computed attributes (such as its offset in the section) and
        # a header which conforms to the DWARF standard. The access to
        # header elements is, as usual, via item-lookup.
        logging.debug(' Found a compile unit at offset %s, length %s' % (
            CU.cu_offset, CU['unit_length']))

        # Start with the top DIE, the root for this CU's DIE tree
        top_DIE = CU.get_top_DIE()
        logging.debug(' Top DIE with tag=%s' % top_DIE.tag)

        path = top_DIE.get_full_path()
        # We're interested in the filename...
        logging.debug(' name=%s' % path)
        if path.endswith('__TI_internal'):
            # TI toolchains emit an internal CU marking the end of the
            # interesting units; stop walking here.
            logging.debug('__TI_internal found, terminating DWARF parsing')
            break
        else:
            # Display DIEs recursively starting with top_DIE
            die_info_rec(top_DIE, objects=objects)
            # pass

    def die_info_rec_structure_type(die, indent_level):
        # Debug helper: log each struct member's name and raw
        # DW_AT_data_member_location.
        # NOTE(review): not called anywhere in this function -- possibly
        # dead code kept for debugging; confirm before removing.
        for child in die.iter_children():
            # logging.debug(indent_level + str(child.attributes['DW_AT_name'].value.decode('utf-8')))
            location = str(
                child.attributes['DW_AT_data_member_location'].value)
            name = str(child.attributes['DW_AT_name'].value.decode('utf-8'))
            logging.debug(indent_level + name + ': ' + location)
            # logging.debug(indent_level + str(child.attributes['DW_AT_name'].value.decode('utf-8')) + ': ' + str(child.attributes['DW_AT_data_member_location'].value.decode('utf-u')))

    # this is yucky but the embedded system is weird with two bytes
    # per address and even sizeof() responds in units of addressable units
    # rather than actual bytes
    byte_size_fudge = 1

    # Maps DIE offset -> model object; used below to resolve the integer
    # DIE-offset cross-references into object references.
    offsets = {}

    types = []
    for die in objects['DW_TAG_base_type']:
        type = Type(name=die.attributes['DW_AT_name'].value.decode('utf-8'),
                    bytes=die.attributes['DW_AT_byte_size'].value * byte_size_fudge,
                    format=TypeFormats(die.attributes['DW_AT_encoding'].value))
        types.append(type)
        offsets[die.offset] = type
        logging.debug('{: 10d} {}'.format(die.offset, type))

    variables = []
    for die in objects['DW_TAG_variable']:
        location = die.attributes.get('DW_AT_location', [])
        if location:
            location = location.value

        # TODO: check this better
        # Only 5-byte location expressions (opcode + 4-byte little-endian
        # address) are handled; everything else is skipped.
        if len(location) != 5:
            continue

        address = int.from_bytes(bytes(location[1:5]), 'little')
        variable = Variable(
            name=die.attributes['DW_AT_name'].value.decode('utf-8'),
            # Still an integer DIE offset here; resolved to an object later.
            type=die.attributes['DW_AT_type'].value,
            address=address,
            file=get_die_path(die))
        variables.append(variable)
        offsets[die.offset] = variable
        logging.debug('{: 10d} {}'.format(die.offset, variable))

    lo_users = []
    for die in objects['DW_TAG_lo_user']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        # NOTE(review): the decoded name is not passed to LoUser -- confirm
        # whether that is intentional.
        lo_user = LoUser(type=die.attributes['DW_AT_type'].value)
        lo_users.append(lo_user)
        offsets[die.offset] = lo_user
        logging.debug('{: 10d} {}'.format(die.offset, lo_user))

    hi_users = []
    for die in objects['DW_TAG_hi_user']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        # NOTE(review): as with LoUser, the decoded name is unused.
        hi_user = HiUser(type=die.attributes['DW_AT_type'].value)
        hi_users.append(hi_user)
        offsets[die.offset] = hi_user
        logging.debug('{: 10d} {}'.format(die.offset, hi_user))

    subroutine_types = []
    for die in objects['DW_TAG_subroutine_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        # DW_AT_type is absent for subroutines returning void.
        type = die.attributes.get('DW_AT_type', None)
        if type is not None:
            type = type.value
        subroutine_type = SubroutineType(name=name, return_type=type)
        for parameter in die.iter_children():
            subroutine_type.parameters.append(
                parameter.attributes['DW_AT_type'].value)
        subroutine_types.append(subroutine_type)
        offsets[die.offset] = subroutine_type
        logging.debug('{: 10d} {}'.format(die.offset, subroutine_type))

    unspecified_types = []
    for die in objects['DW_TAG_unspecified_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        unspecified_type = UnspecifiedType(name=name)
        unspecified_types.append(unspecified_type)
        offsets[die.offset] = unspecified_type
        logging.debug('{: 10d} {}'.format(die.offset, unspecified_type))

    pointer_types = []
    for die in objects['DW_TAG_pointer_type']:
        type = die.attributes['DW_AT_type'].value
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
            pointer_type = PointerType(name=name, type=type)
        else:
            pointer_type = PointerType(type=type)
        pointer_types.append(pointer_type)
        offsets[die.offset] = pointer_type
        logging.debug('{: 10d} {}'.format(die.offset, pointer_type))

    volatile_types = []
    for die in objects['DW_TAG_volatile_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        volatile_type = VolatileType(name=name,
                                     type=die.attributes['DW_AT_type'].value)
        volatile_types.append(volatile_type)
        offsets[die.offset] = volatile_type
        logging.debug('{: 10d} {}'.format(die.offset, volatile_type))

    array_types = []
    for die in objects['DW_TAG_array_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        byte_size = die.attributes.get('DW_AT_byte_size', None)
        if byte_size is not None:
            byte_size = byte_size.value
        array_type = ArrayType(name=name,
                               bytes=byte_size,
                               type=die.attributes['DW_AT_type'].value)
        array_types.append(array_type)
        offsets[die.offset] = array_type
        logging.debug('{: 10d} {}'.format(die.offset, array_type))

    const_types = []
    for die in objects['DW_TAG_const_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        const_type = ConstType(name=name,
                               type=die.attributes['DW_AT_type'].value)
        const_types.append(const_type)
        offsets[die.offset] = const_type
        logging.debug('{: 10d} {}'.format(die.offset, const_type))

    restrict_types = []
    for die in objects['DW_TAG_restrict_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        restrict_type = RestrictType(name=name,
                                     type=die.attributes['DW_AT_type'].value)
        restrict_types.append(restrict_type)
        offsets[die.offset] = restrict_type
        logging.debug('{: 10d} {}'.format(die.offset, restrict_type))

    structure_types = []
    for die in objects['DW_TAG_structure_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        byte_size_attribute = die.attributes.get('DW_AT_byte_size')
        if byte_size_attribute is None:
            # Incomplete (forward-declared) structs have no size; skip.
            print(
                'Skipping DW_TAG_structure_type due to lack of '
                'DW_AT_byte_size', name)
            continue
        struct = Struct(name=name, bytes=byte_size_attribute.value)
        structure_types.append(struct)
        offsets[die.offset] = struct
        for member_die in die.iter_children():
            a = member_die.attributes
            bit_offset = a.get('DW_AT_bit_offset', None)
            if bit_offset is not None:
                bit_offset = bit_offset.value
            bit_size = a.get('DW_AT_bit_size', None)
            if bit_size is not None:
                bit_size = bit_size.value
            # TODO: location[1] is just based on observation
            name = a['DW_AT_name'].value.decode('utf-8')
            struct.members[name] = StructMember(
                name=name,
                type=a['DW_AT_type'].value,
                location=a['DW_AT_data_member_location'].value[1],
                bit_offset=bit_offset,
                bit_size=bit_size)
        logging.debug(list(die.iter_children()))
        logging.debug('{: 10d} {}'.format(die.offset, struct))

    union_types = []
    for die in objects['DW_TAG_union_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        byte_size_attribute = die.attributes.get('DW_AT_byte_size')
        if byte_size_attribute is None:
            # Same skip rule as for structs above.
            print(
                'Skipping DW_TAG_union_type due to lack of '
                'DW_AT_byte_size', name)
            continue
        members = collections.OrderedDict(((
            member.attributes['DW_AT_name'].value.decode('utf-8'),
            UnionMember(
                name=member.attributes['DW_AT_name'].value.decode('utf-8'),
                type=member.attributes.get('DW_AT_type').value,
            ),
        ) for member in die.iter_children()))
        union = Union(
            name=name,
            bytes=byte_size_attribute.value,
            members=members,
        )
        union_types.append(union)
        offsets[die.offset] = union
        logging.debug('{: 10d} {}'.format(die.offset, union))

    pointer_to_member_types = []
    for die in objects['DW_TAG_ptr_to_member_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        pointer_to_member = PointerToMember(name=name)
        pointer_to_member_types.append(pointer_to_member)
        offsets[die.offset] = pointer_to_member
        logging.debug('{: 10d} {}'.format(die.offset, pointer_to_member))

    enumeration_types = []
    for die in objects['DW_TAG_enumeration_type']:
        name = die.attributes.get('DW_AT_name', None)
        if name is not None:
            name = name.value.decode('utf-8')
        type = die.attributes.get('DW_AT_type', None)
        if type is not None:
            type = type.value
        enumeration = EnumerationType(
            name=name,
            bytes=die.attributes['DW_AT_byte_size'].value * byte_size_fudge,
            type=type)
        # Child DIEs are the enumerators (name/value pairs).
        for value in die.iter_children():
            enumeration.values.append(
                EnumerationValue(
                    name=value.attributes['DW_AT_name'].value.decode('utf-8'),
                    value=value.attributes['DW_AT_const_value'].value))
        enumeration_types.append(enumeration)
        offsets[die.offset] = enumeration
        logging.debug('{: 10d} {}'.format(die.offset, enumeration))

    typedefs = []
    for die in objects['DW_TAG_typedef']:
        # type holds (own DIE offset, referenced DIE offset) until resolved.
        typedef = TypeDef(
            name=die.attributes['DW_AT_name'].value.decode('utf-8'),
            type=(die.offset, die.attributes['DW_AT_type'].value))
        typedefs.append(typedef)
        offsets[die.offset] = typedef

    offset_values = sorted(offsets.keys())
    logging.debug(len(offset_values))
    logging.debug(offset_values)

    # First resolution pass: typedefs point at their target objects.
    fails = 0
    for typedef in typedefs:
        offset = typedef.type[0]
        try:
            typedef.type = offsets[typedef.type[1]]
        except KeyError:
            logging.debug('Failed to find type for {}'.format(typedef))
            fails += 1
        else:
            logging.debug('{: 10d} {}'.format(offset, typedef))
    logging.debug(fails)

    # Resolve struct/union member types from DIE offsets to objects.
    for structure in structure_types:
        for member in structure.members.values():
            member.type = offsets[member.type]

    for union in union_types:
        for member in union.members.values():
            member.type = offsets[member.type]

    # Iteratively resolve every remaining integer `.type` cross-reference.
    # Repeats until a full pass makes no forward reference fail; gives up
    # (re-raising KeyError) after 10 passes.
    passes = 0
    while True:
        logging.debug('Starting pass {}'.format(passes))
        pass_again = False

        for item in subroutine_types:
            if isinstance(item.return_type, int):
                item.return_type = offsets[item.return_type]
            for i, parameter in enumerate(item.parameters):
                if isinstance(parameter, int):
                    item.parameters[i] = offsets[parameter]

        for item in offsets.values():
            if hasattr(item, 'type') and isinstance(item.type, int):
                try:
                    item.type = offsets[item.type]
                except KeyError:
                    if passes >= 10:
                        logging.debug(item)
                        raise
                    pass_again = True

        passes += 1
        if not pass_again:
            break

    # for pointer_type in pointer_types:
    #     logging.debug(pointer_type)
    #     pointer_type.type = offsets[pointer_type.type]
    #     logging.debug(pointer_type)
    #
    # for array_type in array_types:
    #     logging.debug(array_type)
    #     array_type.type = offsets[array_type.type]
    #     logging.debug(array_type)
    #
    # for volatile_type in volatile_types:
    #     logging.debug(volatile_type)
    #     volatile_type.type = offsets[volatile_type.type]
    #     logging.debug(volatile_type)

    # Index usable objects by name; modifier types, compiler-internal
    # '$'-prefixed names, and subroutine types are excluded.
    names = collections.defaultdict(list)
    for item in offsets.values():
        if hasattr(item, 'name'):
            valid = False
            if item.name is None:
                valid = True
            elif is_modifier(item):
                pass
            elif item.name.startswith('$'):
                pass
            elif isinstance(item, SubroutineType):
                pass
            else:
                valid = True
            if valid:
                names[item.name].append(item)

    result = names, variables, bits_per_byte
    logging.debug('Finished processing file: {}'.format(filename))

    return result