def _ComputePakFileSymbols(
    file_name, contents, res_info, symbols_by_id, compression_ratio=1):
  """Creates/updates symbols for the resources of one .pak file.

  Args:
    file_name: Path of the .pak file; also keys the overhead symbol.
    contents: Parsed pak data with a |resources| dict of id -> bytes
        (assumes values support len() — confirm against caller).
    res_info: Dict of resource_id -> (name, source_path).
    symbols_by_id: Dict of resource_id -> models.Symbol, updated in place.
    compression_ratio: Multiplier applied to all sizes to model compression.
  """
  # Map each distinct value object back to one canonical resource id
  # (sorted reverse so the smallest id wins) to detect duplicated entries.
  id_map = {id(v): k
            for k, v in sorted(contents.resources.items(), reverse=True)}
  alias_map = {k: id_map[id(v)] for k, v in contents.resources.iteritems()
               if id_map[id(v)] != k}
  # Longest locale pak is es-419.pak
  if len(os.path.basename(file_name)) <= 9:
    section_name = models.SECTION_PAK_TRANSLATIONS
  else:
    section_name = models.SECTION_PAK_NONTRANSLATED
  overhead = (12 + 6) * compression_ratio  # Header size plus extra offset
  symbols_by_id[file_name] = models.Symbol(
      section_name, overhead, full_name='{}: overhead'.format(file_name))
  for resource_id in sorted(contents.resources):
    if resource_id in alias_map:
      # 4 extra bytes of metadata (2 16-bit ints)
      size = 4
      # Attribute the alias's bytes to the canonical resource's symbol.
      resource_id = alias_map[resource_id]
    else:
      # 6 extra bytes of metadata (1 32-bit int, 1 16-bit int)
      size = len(contents.resources[resource_id]) + 6
      name, source_path = res_info[resource_id]
      if resource_id not in symbols_by_id:
        full_name = '{}: {}'.format(source_path, name)
        symbols_by_id[resource_id] = models.Symbol(
            section_name, 0, address=resource_id, full_name=full_name)
    # NOTE: sizes become floats when compression_ratio != 1; presumably a
    # later pass truncates them to ints — confirm against caller.
    size *= compression_ratio
    symbols_by_id[resource_id].size += size
def test_Diff_Clustering(self):
  """Symbols differing only in numeric suffix should diff to zero changes."""
  size_info1 = self._CloneSizeInfo()
  size_info2 = self._CloneSizeInfo()
  S = '.text'
  # (size, name, object_path) triples; trailing comment = pairing index.
  before_specs = [
      (11, '.L__unnamed_1193', 'a'),  # 1
      (22, '.L__unnamed_1194', 'a'),  # 2
      (33, '.L__unnamed_1195', 'b'),  # 3
      (44, '.L__bar_195', 'b'),  # 4
      (55, '.L__bar_1195', 'b'),  # 5
  ]
  after_specs = [
      (33, '.L__unnamed_2195', 'b'),  # 3
      (11, '.L__unnamed_2194', 'a'),  # 1
      (22, '.L__unnamed_2193', 'a'),  # 2
      (44, '.L__bar_2195', 'b'),  # 4
      (55, '.L__bar_295', 'b'),  # 5
  ]
  size_info1.symbols += [
      models.Symbol(S, size, name=name, object_path=path)
      for size, name, path in before_specs
  ]
  size_info2.symbols += [
      models.Symbol(S, size, name=name, object_path=path)
      for size, name, path in after_specs
  ]
  d = diff.Diff(size_info1, size_info2)
  d.symbols = d.symbols.Sorted()
  self.assertEquals(
      d.symbols.CountsByDiffStatus()[models.DIFF_STATUS_ADDED], 0)
  self.assertEquals(d.symbols.size, 0)
def _DiffSymbolGroups(before, after):
  """Matches symbols between two groups and returns a DeltaSymbolGroup.

  Matching runs in passes of decreasing strictness (_Key1 .. _Key4);
  leftovers on either side become pure added/removed DeltaSymbols.
  """
  # For changed symbols, padding is zeroed out. In order to not lose the
  # information entirely, store it in aggregate.
  padding_by_section_name = collections.defaultdict(int)

  # Usually >90% of symbols are exact matches, so all of the time is spent in
  # this first pass.
  all_deltas, before, after = _MatchSymbols(before, after, _Key1,
                                            padding_by_section_name)
  for key_func in (_Key2, _Key3, _Key4):
    delta_syms, before, after = _MatchSymbols(before, after, key_func,
                                              padding_by_section_name)
    all_deltas.extend(delta_syms)

  logging.debug('Creating %d unmatched symbols', len(after) + len(before))
  for after_sym in after:
    all_deltas.append(models.DeltaSymbol(None, after_sym))
  for before_sym in before:
    all_deltas.append(models.DeltaSymbol(before_sym, None))

  # Create a DeltaSymbol to represent the zero'd out padding of matched symbols.
  for section_name, padding in padding_by_section_name.items():
    if padding != 0:
      after_sym = models.Symbol(section_name, padding)
      # This is after _NormalizeNames() is called, so set |full_name|,
      # |template_name|, and |name|.
      after_sym.SetName("Overhead: aggregate padding of diff'ed symbols")
      after_sym.padding = padding
      all_deltas.append(models.DeltaSymbol(None, after_sym))
  return models.DeltaSymbolGroup(all_deltas)
def _ComputePakFileSymbols( file_name, file_size, contents, res_info, symbols_by_name): total = 12 + 6 # Header size plus extra offset id_map = {id(v): k for k, v in sorted(contents.resources.items(), reverse=True)} alias_map = {k: id_map[id(v)] for k, v in contents.resources.iteritems() if id_map[id(v)] != k} # Longest locale pak is es-419.pak if len(os.path.basename(file_name)) <= 9: section_name = models.SECTION_PAK_TRANSLATIONS else: section_name = models.SECTION_PAK_NONTRANSLATED object_path = path_util.ToSrcRootRelative(file_name) for resource_id in sorted(contents.resources): if resource_id in alias_map: # 4 extra bytes of metadata (2 16-bit ints) size = 4 name = res_info[alias_map[resource_id]][0] else: # 6 extra bytes of metadata (1 32-bit int, 1 16-bit int) size = len(contents.resources[resource_id]) + 6 name, source_path = res_info[resource_id] if name not in symbols_by_name: full_name = '{}: {}'.format(source_path, name) symbols_by_name[name] = models.Symbol( section_name, 0, address=resource_id, full_name=full_name, source_path=source_path, object_path=object_path) symbols_by_name[name].size += size total += size assert file_size == total, ( '{} bytes in pak file not accounted for'.format(file_size - total))
def _DiffSymbolGroups(before, after):
  """Matches symbols between two groups and returns a DeltaSymbolGroup.

  Matching runs in passes of decreasing strictness (exact, good, poor);
  leftovers on either side become pure added/removed DeltaSymbols.
  """
  # For changed symbols, padding is zeroed out. In order to not lose the
  # information entirely, store it in aggregate.
  padding_by_section_name = collections.defaultdict(int)

  # Usually >90% of symbols are exact matches, so all of the time is spent in
  # this first pass.
  all_deltas, before, after = _MatchSymbols(before, after, _ExactMatchKey,
                                            padding_by_section_name)
  for key_func in (_GoodMatchKey, _PoorMatchKey):
    delta_syms, before, after = _MatchSymbols(before, after, key_func,
                                              padding_by_section_name)
    all_deltas.extend(delta_syms)

  logging.debug('Creating %d unmatched symbols', len(after) + len(before))
  for after_sym in after:
    all_deltas.append(models.DeltaSymbol(None, after_sym))
  for before_sym in before:
    all_deltas.append(models.DeltaSymbol(before_sym, None))

  # Create a DeltaSymbol to represent the zero'd out padding of matched symbols.
  for section_name, padding in padding_by_section_name.iteritems():
    if padding != 0:
      after_sym = models.Symbol(
          section_name, padding,
          name="** aggregate padding of diff'ed symbols")
      after_sym.padding = padding
      all_deltas.append(models.DeltaSymbol(None, after_sym))
  return models.DeltaSymbolGroup(all_deltas)
def CreateDexSymbols(apk_path, output_directory):
  """Creates DEX symbols from apkanalyzer output for the given .apk.

  Returns a list of models.Symbol in SECTION_DEX / SECTION_DEX_METHOD.
  """
  apk_name = os.path.basename(apk_path)
  source_map = _LoadSourceMap(apk_name, output_directory)
  nodes = UndoHierarchicalSizing(_RunApkAnalyzer(apk_path, output_directory))
  dex_expected_size = _ExpectedDexTotalSize(apk_path)
  total_node_size = sum(map(lambda x: x[1], nodes))
  assert dex_expected_size >= total_node_size, (
      'Node size too large, check for node processing errors.')
  # We have more than 100KB of ids for methods, strings
  id_metadata_overhead_size = dex_expected_size - total_node_size
  symbols = []
  for name, node_size in nodes:
    # Node names look like 'package.Class$Inner return_type method(...)';
    # the first token is the (possibly nested) class.
    package = name.split(' ', 1)[0]
    class_path = package.split('$')[0]
    source_path = source_map.get(class_path, '')
    if source_path:
      object_path = package
    elif package == _TOTAL_NODE_NAME:
      # Unattributed remainder absorbs the id metadata overhead.
      name = '* Unattributed Dex'
      object_path = os.path.join(apk_name, _DEX_PATH_COMPONENT)
      node_size += id_metadata_overhead_size
    else:
      object_path = os.path.join(apk_name, _DEX_PATH_COMPONENT,
                                 *package.split('.'))
    # Method entries end with their ')' parameter list.
    if name.endswith(')'):
      section_name = models.SECTION_DEX_METHOD
    else:
      section_name = models.SECTION_DEX
    symbols.append(
        models.Symbol(section_name, node_size, full_name=name,
                      object_path=object_path, source_path=source_path))
  return symbols
def CreateDexSymbol(name, size, source_map, lambda_normalizer):
  """Builds one DEX symbol, or returns None for the total placeholder node.

  |name| is 'class_name return_type method_name'; only the class part is
  present for class nodes.
  """
  fields = name.split(' ')  # (class_name, return_type, method_name)
  new_package = fields[0]
  if new_package == _TOTAL_NODE_NAME:
    return None

  # Make d8 desugared lambdas look the same as Desugar ones.
  outer_class, name = lambda_normalizer.Normalize(new_package, name)

  # Look for class merging. len(fields) == 2 for class nodes.
  old_package = new_package
  if len(fields) > 2:
    method = fields[2]
    # Strip from the last '.' before the parameter list, if any.
    # (rfind bound of -1, i.e. a field with no '(', is fine.)
    dot_idx = method.rfind('.', 0, method.find('('))
    if dot_idx != -1:
      old_package = method[:dot_idx]
      outer_class, name = lambda_normalizer.Normalize(old_package, name)

  section_name = (models.SECTION_DEX_METHOD
                  if name.endswith(')') else models.SECTION_DEX)
  return models.Symbol(
      section_name, size, full_name=name,
      object_path=posixpath.join(models.APK_PREFIX_PATH,
                                 *old_package.split('.')),
      source_path=source_map.get(outer_class, ''))
def _AddNmAliases(raw_symbols, names_by_address): """Adds symbols that were removed by identical code folding.""" # Step 1: Create list of (index_of_symbol, name_list). logging.debug('Creating alias list') replacements = [] num_new_symbols = 0 missing_names = collections.defaultdict(list) for i, s in enumerate(raw_symbols): # Don't alias padding-only symbols (e.g. ** symbol gap) if s.size_without_padding == 0: continue name_list = names_by_address.get(s.address) if name_list: if s.full_name not in name_list: missing_names[s.full_name].append(s.address) logging.warning('Name missing from aliases: %s %s', s.full_name, name_list) continue replacements.append((i, name_list)) num_new_symbols += len(name_list) - 1 if missing_names and logging.getLogger().isEnabledFor(logging.INFO): for address, names in names_by_address.iteritems(): for name in names: if name in missing_names: logging.info('Missing name %s is at address %x instead of [%s]' % (name, address, ','.join('%x' % a for a in missing_names[name]))) if float(num_new_symbols) / len(raw_symbols) < .05: logging.warning('Number of aliases is oddly low (%.0f%%). It should ' 'usually be around 25%%. Ensure --tool-prefix is correct. ', float(num_new_symbols) / len(raw_symbols) * 100) # Step 2: Create new symbols as siblings to each existing one. logging.debug('Creating %d new symbols from nm output', num_new_symbols) src_cursor_end = len(raw_symbols) raw_symbols += [None] * num_new_symbols dst_cursor_end = len(raw_symbols) for src_index, name_list in reversed(replacements): # Copy over symbols that come after the current one. chunk_size = src_cursor_end - src_index - 1 dst_cursor_end -= chunk_size src_cursor_end -= chunk_size raw_symbols[dst_cursor_end:dst_cursor_end + chunk_size] = ( raw_symbols[src_cursor_end:src_cursor_end + chunk_size]) sym = raw_symbols[src_index] src_cursor_end -= 1 # Create symbols (does not bother reusing the existing symbol). 
for i, full_name in enumerate(name_list): dst_cursor_end -= 1 # Do not set |aliases| in order to avoid being pruned by # _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ # only by path. The field will be set afterwards by _ConnectNmAliases(). raw_symbols[dst_cursor_end] = models.Symbol( sym.section_name, sym.size, address=sym.address, full_name=full_name) assert dst_cursor_end == src_cursor_end
def _AddSymbolAliases(raw_symbols, aliases_by_address):
  """Expands symbols into alias groups where nm reports multiple names.

  Grows |raw_symbols| in place; every symbol whose address has several names
  is replaced by a group of Symbols sharing one |aliases| list.
  """
  # Step 1: Create list of (index_of_symbol, name_list).
  logging.debug('Creating alias list')
  replacements = []
  num_new_symbols = 0
  for i, s in enumerate(raw_symbols):
    # Don't alias padding-only symbols (e.g. ** symbol gap)
    if s.size_without_padding == 0:
      continue
    name_list = aliases_by_address.get(s.address)
    if name_list:
      if s.full_name not in name_list:
        logging.warning('Name missing from aliases: %s %s', s.full_name,
                        name_list)
        continue
      replacements.append((i, name_list))
      num_new_symbols += len(name_list) - 1

  if float(num_new_symbols) / len(raw_symbols) < .05:
    # TODO(agrieve): Figure out if there's a way to get alias information from
    # clang-compiled nm.
    logging.warning(
        'Number of aliases is oddly low (%.0f%%). It should '
        'usually be around 25%%. Ensure --tool-prefix is correct. '
        'Ignore this if you compiled with clang.',
        float(num_new_symbols) / len(raw_symbols) * 100)

  # Step 2: Create new symbols as siblings to each existing one.
  # The list is grown once, then filled back-to-front so each original
  # symbol's tail is shifted right without extra allocations.
  logging.debug('Creating %d aliases', num_new_symbols)
  src_cursor_end = len(raw_symbols)
  raw_symbols += [None] * num_new_symbols
  dst_cursor_end = len(raw_symbols)
  for src_index, name_list in reversed(replacements):
    # Copy over symbols that come after the current one.
    chunk_size = src_cursor_end - src_index - 1
    dst_cursor_end -= chunk_size
    src_cursor_end -= chunk_size
    raw_symbols[dst_cursor_end:dst_cursor_end + chunk_size] = (
        raw_symbols[src_cursor_end:src_cursor_end + chunk_size])
    sym = raw_symbols[src_index]
    src_cursor_end -= 1

    # Create aliases (does not bother reusing the existing symbol).
    aliases = [None] * len(name_list)
    for i, full_name in enumerate(name_list):
      aliases[i] = models.Symbol(
          sym.section_name, sym.size, address=sym.address,
          full_name=full_name, aliases=aliases)

    dst_cursor_end -= len(aliases)
    raw_symbols[dst_cursor_end:dst_cursor_end + len(aliases)] = aliases

  assert dst_cursor_end == src_cursor_end
def _DiffSymbolGroups(before, after):
  """Recursively diffs two symbol groups into a SymbolDiff.

  Matches symbols via _SymbolKey(); produces similar / added / removed
  partitions plus aggregate-padding placeholder symbols.
  """
  before_symbols_by_key = collections.defaultdict(list)
  for s in before:
    before_symbols_by_key[_SymbolKey(s)].append(s)

  similar = []
  diffed_symbol_by_after_aliases = {}
  matched_before_aliases = set()
  unmatched_after_syms = []
  # For similar symbols, padding is zeroed out. In order to not lose the
  # information entirely, store it in aggregate.
  padding_by_section_name = collections.defaultdict(int)

  # Step 1: Create all delta symbols and record unmatched symbols.
  for after_sym in after:
    matching_syms = before_symbols_by_key.get(_SymbolKey(after_sym))
    if matching_syms:
      before_sym = matching_syms.pop(0)
      if before_sym.IsGroup() and after_sym.IsGroup():
        # Nested groups are diffed recursively.
        similar.append(_DiffSymbolGroups(before_sym, after_sym))
      else:
        if before_sym.aliases:
          # Track matched alias groups by identity of the shared list.
          matched_before_aliases.add(id(before_sym.aliases))
        similar.append(
            _DiffSymbol(before_sym, after_sym, diffed_symbol_by_after_aliases,
                        padding_by_section_name))
    else:
      unmatched_after_syms.append(after_sym)
      continue

  # Step 2: Copy symbols only in "after" (being careful with aliases).
  added = _CloneUnmatched(unmatched_after_syms, diffed_symbol_by_after_aliases)

  # Step 3: Negate symbols only in "before" (being careful with aliases).
  removed = []
  negated_symbol_by_before_aliases = {}
  for remaining_syms in before_symbols_by_key.itervalues():
    removed.extend(
        _NegateAndClone(remaining_syms, matched_before_aliases,
                        negated_symbol_by_before_aliases))

  # Step 4: Create ** symbols to represent padding differences.
  for section_name, padding in padding_by_section_name.iteritems():
    if padding != 0:
      similar.append(
          models.Symbol(section_name, padding,
                        name="** aggregate padding of diff'ed symbols"))

  return models.SymbolDiff(
      added, removed, similar, name=after.name, full_name=after.full_name,
      section_name=after.section_name)
def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name):
  """Assigns object paths found by nm and clones symbols per extra path.

  Returns a new symbol list; symbols whose name appears in multiple .o files
  gain one alias Symbol per additional path.
  """
  num_found_paths = 0
  num_unknown_names = 0
  num_path_mismatches = 0
  num_aliases_created = 0
  ret = []
  for symbol in raw_symbols:
    ret.append(symbol)
    full_name = symbol.full_name
    # Skip symbols that nm cannot attribute meaningfully.
    if (symbol.IsBss() or
        not full_name or
        full_name[0] in '*.' or  # e.g. ** merge symbols, .Lswitch.table
        full_name == 'startup'):
      continue

    object_paths = object_paths_by_name.get(full_name)
    if object_paths:
      num_found_paths += 1
    else:
      # Cap log spam at 10 entries; keep counting beyond that.
      if num_unknown_names < 10:
        logging.warning('Symbol not found in any .o files: %r', symbol)
      num_unknown_names += 1
      continue

    if symbol.object_path and symbol.object_path not in object_paths:
      if num_path_mismatches < 10:
        logging.warning('Symbol path reported by .map not found by nm.')
        logging.warning('sym=%r', symbol)
        logging.warning('paths=%r', object_paths)
      object_paths.append(symbol.object_path)
      object_paths.sort()
      num_path_mismatches += 1

    symbol.object_path = object_paths[0]

    if len(object_paths) > 1:
      # Create one symbol for each object_path.
      aliases = symbol.aliases or [symbol]
      symbol.aliases = aliases
      num_aliases_created += len(object_paths) - 1
      for object_path in object_paths[1:]:
        new_sym = models.Symbol(
            symbol.section_name, symbol.size, address=symbol.address,
            full_name=full_name, object_path=object_path, aliases=aliases)
        aliases.append(new_sym)
        ret.append(new_sym)

  logging.debug('Cross-referenced %d symbols with nm output. '
                'num_unknown_names=%d num_path_mismatches=%d '
                'num_aliases_created=%d',
                num_found_paths, num_unknown_names, num_path_mismatches,
                num_aliases_created)
  return ret
def _MakeSym(section, size, path, name=None):
  """Creates a test Symbol whose full/template/plain names are identical."""
  if name is None:
    # Trailing letter is important since diffing trims numbers.
    name = '{}_{}A'.format(section[1:], size)
  return models.Symbol(
      section, size, full_name=name, template_name=name, name=name,
      object_path=path)
def _ParseApkOtherSymbols(section_sizes, apk_path):
  """Creates SECTION_OTHER symbols for APK zip entries.

  Skips .so and .pak entries (counted elsewhere), adds an 'Overhead: APK
  file' symbol for bytes not covered by any entry's compressed size, and
  accumulates the total into |section_sizes|.

  Args:
    section_sizes: Dict of section name -> size, updated in place.
    apk_path: Path of the .apk file to inspect.

  Returns:
    List of models.Symbol (one per counted entry, plus the overhead symbol).
  """
  apk_symbols = []
  zip_info_total = 0
  with zipfile.ZipFile(apk_path) as z:
    for zip_info in z.infolist():
      zip_info_total += zip_info.compress_size
      # Skip shared library and pak files as they are already accounted for.
      if zip_info.filename.endswith(('.so', '.pak')):
        continue
      apk_symbols.append(models.Symbol(
          models.SECTION_OTHER, zip_info.compress_size,
          full_name=zip_info.filename))
  overhead_size = os.path.getsize(apk_path) - zip_info_total
  # A negative overhead means entries were double-counted (matches the
  # sibling variant of this parser, which makes the same check).
  assert overhead_size >= 0, 'Apk overhead must be non-negative'
  zip_overhead_symbol = models.Symbol(
      models.SECTION_OTHER, overhead_size, full_name='Overhead: APK file')
  apk_symbols.append(zip_overhead_symbol)
  prev = section_sizes.setdefault(models.SECTION_OTHER, 0)
  section_sizes[models.SECTION_OTHER] = prev + sum(s.size for s in apk_symbols)
  return apk_symbols
def _AddNmAliases(raw_symbols, names_by_address):
  """Adds symbols that were removed by identical code folding.

  Returns a new list where each symbol whose address maps to multiple nm
  names is replaced by one fresh Symbol per name.
  """
  # Step 1: Create list of (index_of_symbol, name_list).
  logging.debug('Creating alias list')
  replacements = []
  num_new_symbols = 0
  missing_names = collections.defaultdict(list)
  for i, s in enumerate(raw_symbols):
    # Don't alias padding-only symbols (e.g. ** symbol gap)
    if s.size_without_padding == 0:
      continue
    name_list = names_by_address.get(s.address)
    if name_list:
      if s.full_name not in name_list:
        missing_names[s.full_name].append(s.address)
        logging.warning('Name missing from aliases: %s %s', s.full_name,
                        name_list)
        continue
      replacements.append((i, name_list))
      num_new_symbols += len(name_list) - 1

  # Diagnostic pass: report where the missing names actually live.
  if missing_names and logging.getLogger().isEnabledFor(logging.INFO):
    for address, names in names_by_address.iteritems():
      for name in names:
        if name in missing_names:
          logging.info('Missing name %s is at address %x instead of [%s]' %
                       (name, address,
                        ','.join('%x' % a for a in missing_names[name])))

  if float(num_new_symbols) / len(raw_symbols) < .05:
    logging.warning('Number of aliases is oddly low (%.0f%%). It should '
                    'usually be around 25%%. Ensure --tool-prefix is correct. ',
                    float(num_new_symbols) / len(raw_symbols) * 100)

  # Step 2: Create new symbols as siblings to each existing one.
  logging.debug('Creating %d new symbols from nm output', num_new_symbols)
  expected_num_symbols = len(raw_symbols) + num_new_symbols
  ret = []
  prev_src = 0
  for cur_src, name_list in replacements:
    # Copy through the untouched symbols before this replacement point.
    ret += raw_symbols[prev_src:cur_src]
    prev_src = cur_src + 1
    sym = raw_symbols[cur_src]
    # Create symbols (|sym| gets recreated and discarded).
    new_syms = []
    for full_name in name_list:
      # Do not set |aliases| in order to avoid being pruned by
      # _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ
      # only by path. The field will be set afterwards by _ConnectNmAliases().
      new_syms.append(models.Symbol(
          sym.section_name, sym.size, address=sym.address,
          full_name=full_name))
    ret += new_syms
  ret += raw_symbols[prev_src:]
  assert expected_num_symbols == len(ret)
  return ret
def test_Diff_Clustering(self):
  """Symbols differing only in numeric suffix should diff to zero changes."""
  size_info1 = self._CloneSizeInfo()
  size_info2 = self._CloneSizeInfo()
  S = '.text'
  # Trailing comments give the intended before/after pairing index.
  size_info1.symbols += [
      models.Symbol(S, 11, name='.L__unnamed_1193', object_path='a'),  # 1
      models.Symbol(S, 22, name='.L__unnamed_1194', object_path='a'),  # 2
      models.Symbol(S, 33, name='.L__unnamed_1195', object_path='b'),  # 3
      models.Symbol(S, 44, name='.L__bar_195', object_path='b'),  # 4
      models.Symbol(S, 55, name='.L__bar_1195', object_path='b'),  # 5
  ]
  size_info2.symbols += [
      models.Symbol(S, 33, name='.L__unnamed_2195', object_path='b'),  # 3
      models.Symbol(S, 11, name='.L__unnamed_2194', object_path='a'),  # 1
      models.Symbol(S, 22, name='.L__unnamed_2193', object_path='a'),  # 2
      models.Symbol(S, 44, name='.L__bar_2195', object_path='b'),  # 4
      models.Symbol(S, 55, name='.L__bar_295', object_path='b'),  # 5
  ]
  d = diff.Diff(size_info1, size_info2)
  self.assertEquals(d.symbols.added_count, 0)
  self.assertEquals(d.symbols.size, 0)
def _CloneSymbol(sym, size):
  """Returns a copy of |sym| with an updated |size|.

  Padding and aliases are not copied.
  """
  copied_fields = {
      'address': sym.address,
      'name': sym.name,
      'source_path': sym.source_path,
      'object_path': sym.object_path,
      'full_name': sym.full_name,
      'flags': sym.flags,
  }
  return models.Symbol(sym.section_name, size, **copied_fields)
def _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path):
  """Creates SECTION_OTHER symbols for APK zip entries.

  Skips the main .so plus .dex/.pak entries (counted elsewhere), adds an
  'Overhead: APK file' symbol for unattributed bytes, and accumulates the
  total into |section_sizes|.
  """
  apk_name = os.path.basename(apk_path)
  apk_symbols = []
  zip_info_total = 0
  with zipfile.ZipFile(apk_path) as z:
    for zip_info in z.infolist():
      zip_info_total += zip_info.compress_size
      # Skip main shared library, pak, and dex files as they are accounted for.
      if (zip_info.filename == apk_so_path or
          zip_info.filename.endswith('.dex') or
          zip_info.filename.endswith('.pak')):
        continue
      path = os.path.join(apk_name, 'other', zip_info.filename)
      apk_symbols.append(models.Symbol(
          models.SECTION_OTHER, zip_info.compress_size,
          object_path=path,
          full_name=os.path.basename(zip_info.filename)))
  # Bytes not covered by any entry's compressed size (zip metadata, etc.).
  overhead_size = os.path.getsize(apk_path) - zip_info_total
  assert overhead_size >= 0, 'Apk overhead must be non-negative'
  zip_overhead_symbol = models.Symbol(
      models.SECTION_OTHER, overhead_size, full_name='Overhead: APK file')
  apk_symbols.append(zip_overhead_symbol)
  prev = section_sizes.setdefault(models.SECTION_OTHER, 0)
  section_sizes[models.SECTION_OTHER] = prev + sum(s.size for s in apk_symbols)
  return apk_symbols
def _AddSymbolAliases(raw_symbols, aliases_by_address):
  """Expands symbols into alias groups where nm reports multiple names.

  Grows |raw_symbols| in place; every symbol whose address has several names
  is replaced by a group of Symbols sharing one |aliases| list.
  """
  # Step 1: Create list of (index_of_symbol, name_list).
  logging.debug('Creating alias list')
  replacements = []
  num_new_symbols = 0
  for i, s in enumerate(raw_symbols):
    # Don't alias padding-only symbols (e.g. ** symbol gap)
    if s.size_without_padding == 0:
      continue
    name_list = aliases_by_address.get(s.address)
    if name_list:
      if s.name not in name_list:
        logging.warning('Name missing from aliases: %s %s', s.name, name_list)
        continue
      replacements.append((i, name_list))
      num_new_symbols += len(name_list) - 1

  # Step 2: Create new symbols as siblings to each existing one.
  # The list is grown once, then filled back-to-front so each original
  # symbol's tail is shifted right without extra allocations.
  logging.debug('Creating %d aliases', num_new_symbols)
  src_cursor_end = len(raw_symbols)
  raw_symbols += [None] * num_new_symbols
  dst_cursor_end = len(raw_symbols)
  for src_index, name_list in reversed(replacements):
    # Copy over symbols that come after the current one.
    chunk_size = src_cursor_end - src_index - 1
    dst_cursor_end -= chunk_size
    src_cursor_end -= chunk_size
    raw_symbols[dst_cursor_end:dst_cursor_end + chunk_size] = (
        raw_symbols[src_cursor_end:src_cursor_end + chunk_size])
    sym = raw_symbols[src_index]
    src_cursor_end -= 1

    # Create aliases (does not bother reusing the existing symbol).
    aliases = [None] * len(name_list)
    for i, name in enumerate(name_list):
      aliases[i] = models.Symbol(
          sym.section_name, sym.size, address=sym.address, name=name,
          aliases=aliases)

    dst_cursor_end -= len(aliases)
    raw_symbols[dst_cursor_end:dst_cursor_end + len(aliases)] = aliases

  assert dst_cursor_end == src_cursor_end
def CreateDexSymbols(apk_path, mapping_path, size_info_prefix):
  """Creates DEX symbols from apkanalyzer output for the given .apk.

  Returns a list of models.Symbol in SECTION_DEX / SECTION_DEX_METHOD.
  """
  source_map = _ParseJarInfoFile(size_info_prefix + '.jar.info')
  nodes = _RunApkAnalyzer(apk_path, mapping_path)
  nodes = UndoHierarchicalSizing(nodes)
  dex_expected_size = _ExpectedDexTotalSize(apk_path)
  total_node_size = sum(map(lambda x: x[2], nodes))
  # TODO(agrieve): Figure out why this log is triggering for
  #     ChromeModernPublic.apk (https://crbug.com/851535).
  # Reporting: dex_expected_size=6546088 total_node_size=6559549
  if dex_expected_size < total_node_size:
    logging.error(
        'Node size too large, check for node processing errors. '
        'dex_expected_size=%d total_node_size=%d', dex_expected_size,
        total_node_size)
  # We have more than 100KB of ids for methods, strings
  id_metadata_overhead_size = dex_expected_size - total_node_size
  symbols = []
  for _, name, node_size in nodes:
    # Node names look like 'package.Class$Inner return_type method(...)';
    # the first token is the (possibly nested) class.
    package = name.split(' ', 1)[0]
    class_path = package.split('$')[0]
    source_path = source_map.get(class_path, '')
    if source_path:
      object_path = package
    elif package == _TOTAL_NODE_NAME:
      # Unattributed remainder absorbs the id metadata overhead.
      name = '* Unattributed Dex'
      object_path = ''  # Categorize in the anonymous section.
      node_size += id_metadata_overhead_size
    else:
      object_path = os.path.join(models.APK_PREFIX_PATH, *package.split('.'))
    # Method entries end with their ')' parameter list.
    if name.endswith(')'):
      section_name = models.SECTION_DEX_METHOD
    else:
      section_name = models.SECTION_DEX
    symbols.append(
        models.Symbol(section_name, node_size, full_name=name,
                      object_path=object_path, source_path=source_path))
  return symbols
def _ParseCommonSymbols(self):
  """Parses the 'Common symbol' table into a list of SECTION_BSS Symbols.

  Example input (a name too long for its column wraps to the next line):
    ff_cos_131072           0x40000    obj/third_party/<snip>
    ff_cos_131072_fixed
                            0x20000    obj/third_party/<snip>
  """
  next(self._lines)  # Skip past blank line
  parsed_symbols = []
  for raw_line in self._lines:
    fields = self._ParsePossiblyWrappedParts(raw_line, 3)
    if not fields:
      break  # End of the table.
    sym_name, size_str, obj_path = fields
    parsed_symbols.append(
        models.Symbol(models.SECTION_BSS, int(size_str[2:], 16),
                      full_name=sym_name, object_path=obj_path))
  return parsed_symbols
def _ParseCommonSymbols(self):
  """Parses the 'Common symbol' table, appending .bss Symbols in place.

  Example input (a name too long for its column wraps to the next line):
  # Common symbol       size              file
  #
  # ff_cos_131072       0x40000           obj/third_party/<snip>
  # ff_cos_131072_fixed
  #                     0x20000           obj/third_party/<snip>
  """
  self._SkipToLineWithPrefix('Common symbol')
  next(self._lines)  # Skip past blank line
  name, size_str, path = None, None, None
  for l in self._lines:
    parts = self._ParsePossiblyWrappedParts(l, 3)
    if not parts:
      break
    name, size_str, path = parts
    self._symbols.append(
        models.Symbol('.bss', int(size_str[2:], 16), name=name,
                      object_path=path))
def _ParsePakSymbols(
    section_sizes, object_paths, output_directory, symbols_by_id):
  """Attributes pak symbols to object paths and finalizes their sizes.

  Reads per-object .whitelist files to assign each resource's object_path,
  truncates fractional (compression-scaled) sizes to ints, and adds a
  leftover symbol for the truncation remainder.

  Returns a sorted list of models.Symbol.
  """
  for path in object_paths:
    whitelist_path = os.path.join(output_directory, path + '.whitelist')
    if (not os.path.exists(whitelist_path) or
        os.path.getsize(whitelist_path) == 0):
      continue
    with open(whitelist_path, 'r') as f:
      for line in f:
        resource_id = int(line.rstrip())
        # There may be object files in static libraries that are removed by the
        # linker when there are no external references to its symbols. These
        # files may be included in object_paths which our apk does not use,
        # resulting in resource_ids that don't end up being in the final apk.
        if resource_id not in symbols_by_id:
          continue
        symbols_by_id[resource_id].object_path = path

  raw_symbols = sorted(symbols_by_id.values(),
                       key=lambda s: (s.section_name, s.address))
  raw_total = 0.0
  int_total = 0
  for symbol in raw_symbols:
    raw_total += symbol.size
    # We truncate rather than round to ensure that we do not over attribute. It
    # is easier to add another symbol to make up the difference.
    symbol.size = int(symbol.size)
    int_total += symbol.size
  # Attribute excess to translations since only those are compressed.
  raw_symbols.append(
      models.Symbol(models.SECTION_PAK_TRANSLATIONS,
                    int(round(raw_total - int_total)),
                    full_name='Pak compression leftover artifacts'))

  for symbol in raw_symbols:
    prev = section_sizes.setdefault(symbol.section_name, 0)
    section_sizes[symbol.section_name] = prev + symbol.size
  return raw_symbols
def _DiffSymbolGroups(containers, before, after):
  """Matches symbols across containers and returns a DeltaSymbolGroup.

  Matching runs in passes of decreasing strictness (_Key1 .. _Key4);
  leftovers on either side become pure added/removed DeltaSymbols.
  """
  # For changed symbols, padding is zeroed out. In order to not lose the
  # information entirely, store it in aggregate. These aggregations are grouped
  # by "segment names", which are (container name, section name) tuples.
  padding_by_segment = collections.defaultdict(float)

  # Usually >90% of symbols are exact matches, so all of the time is spent in
  # this first pass.
  all_deltas, before, after = _MatchSymbols(before, after, _Key1,
                                            padding_by_segment)
  for key_func in (_Key2, _Key3, _Key4):
    delta_syms, before, after = _MatchSymbols(before, after, key_func,
                                              padding_by_segment)
    all_deltas.extend(delta_syms)

  logging.debug('Creating %d unmatched symbols', len(after) + len(before))
  for after_sym in after:
    all_deltas.append(models.DeltaSymbol(None, after_sym))
  for before_sym in before:
    all_deltas.append(models.DeltaSymbol(before_sym, None))

  container_from_name = {c.name: c for c in containers}

  # Create a DeltaSymbol to represent the zero'd out padding of matched symbols.
  for (container_name, section_name), padding in padding_by_segment.items():
    # Values need to be integer (crbug.com/1132394).
    padding = round(padding)
    if padding != 0:
      after_sym = models.Symbol(section_name, padding)
      after_sym.container = container_from_name[container_name]
      # This is after _NormalizeNames() is called, so set |full_name|,
      # |template_name|, and |name|.
      after_sym.SetName("Overhead: aggregate padding of diff'ed symbols")
      after_sym.padding = padding
      all_deltas.append(models.DeltaSymbol(None, after_sym))
  return models.DeltaSymbolGroup(all_deltas)
from flaskapp import db import models from ImageFile import ImageFile from features import feature_histogram, trim, zoning_method zeros = ['0-00a.bmp', '0-00b.bmp', '0-00c.bmp', '0-00d.bmp', '0-00e.bmp'] ones = ['1-00a.bmp', '1-00b.bmp', '1-00c.bmp', '1-00d.bmp', '1-00e.bmp'] for j in range(2): if j == 0: s = models.Symbol(name="zero") else: s = models.Symbol(name="one") db.session.add(s) db.session.commit() for i in range(5): if j == 0: path = "./images/zero/" + zeros[i] else: path = "./images/one/" + ones[i] img = ImageFile(path) trimmed = trim(img) img_vector = zoning_method(trimmed) for k in range(16): if k == 0: v = models.V1(histogram_value=img_vector[k], number=s) elif k == 1: v = models.V2(histogram_value=img_vector[k], number=s) elif k == 2:
def _ParseSections(self):
  """Parses linker .map section records and their symbols into self._symbols.

  Records every section's size in |self._section_sizes| and creates Symbol
  objects (plus '** symbol gap' padding placeholders) for the .text,
  .rodata, .bss and .data* sections.
  """
  # .text           0x0028c600  0x22d3468
  #  .text.startup._GLOBAL__sub_I_bbr_sender.cc
  #                 0x0028c600       0x38 obj/net/net/bbr_sender.o
  #  .text._reset   0x00339d00       0xf0 obj/third_party/icu/icuuc/ucnv.o
  #  ** fill        0x0255fb00   0x02
  #  .text._ZN4base8AutoLockD2Ev
  #                 0x00290710        0xe obj/net/net/file_name.o
  #                 0x00290711   base::AutoLock::~AutoLock()
  #                 0x00290711   base::AutoLock::~AutoLock()
  #  .text._ZNK5blink15LayoutBlockFlow31mustSeparateMarginAfterForChildERK...
  #                 0xffffffffffffffff  0x46 obj/...
  #                 0x006808e1   blink::LayoutBlockFlow::...
  # .bss
  #  .bss._ZGVZN11GrProcessor11initClassIDI10LightingFPEEvvE8kClassID
  #                 0x02d4b294   0x4 obj/skia/skia/SkLightingShader.o
  #                 0x02d4b294   guard variable for void GrProcessor::initClassID
  # .data           0x0028c600  0x22d3468
  #  .data.rel.ro._ZTVN3gvr7android19ScopedJavaGlobalRefIP12_jfloatArrayEE
  #                 0x02d1e668   0x10 ../../third_party/.../libfoo.a(bar.o)
  #                 0x02d1e668   vtable for gvr::android::GlobalRef<_jfloatArray*>
  #  ** merge strings
  #                 0x0255fb00   0x1f2424
  #  ** merge constants
  #                 0x0255fb00   0x8
  # ** common       0x02db5700  0x13ab48
  syms = self._symbols
  while True:
    line = self._SkipToLineWithPrefix('.')
    if not line:
      break
    section_name = None
    try:
      # Parse section name and size.
      parts = self._ParsePossiblyWrappedParts(line, 3)
      if not parts:
        break
      section_name, section_address_str, section_size_str = parts
      section_address = int(section_address_str[2:], 16)
      section_size = int(section_size_str[2:], 16)
      self._section_sizes[section_name] = section_size
      if (section_name in (models.SECTION_BSS, models.SECTION_RODATA,
                           models.SECTION_TEXT) or
          section_name.startswith(models.SECTION_DATA)):
        logging.info('Parsing %s', section_name)
        if section_name == models.SECTION_BSS:
          # Common symbols have no address.
          syms.extend(self._common_symbols)
        prefix_len = len(section_name) + 1  # + 1 for the trailing .
        symbol_gap_count = 0
        merge_symbol_start_address = section_address
        sym_count_at_start = len(syms)
        line = next(self._lines)
        # Parse section symbols.
        while True:
          if not line or line.isspace():
            break
          if line.startswith(' **'):
            # A '** merge ...'-style entry.
            zero_index = line.find('0')
            if zero_index == -1:
              # Line wraps.
              name = line.strip()
              line = next(self._lines)
            else:
              # Line does not wrap.
              name = line[:zero_index].strip()
              line = line[zero_index:]
            address_str, size_str = self._ParsePossiblyWrappedParts(line, 2)
            line = next(self._lines)
            # These bytes are already accounted for.
            if name == '** common':
              continue
            address = int(address_str[2:], 16)
            size = int(size_str[2:], 16)
            path = None
            sym = models.Symbol(section_name, size, address=address,
                                full_name=name, object_path=path)
            syms.append(sym)
            if merge_symbol_start_address > 0:
              merge_symbol_start_address += size
          else:
            # A normal symbol entry.
            subsection_name, address_str, size_str, path = (
                self._ParsePossiblyWrappedParts(line, 4))
            size = int(size_str[2:], 16)
            assert subsection_name.startswith(section_name), (
                'subsection name was: ' + subsection_name)
            mangled_name = subsection_name[prefix_len:]
            name = None
            address_str2 = None
            # Consume the entry's continuation lines (demangled name, fills).
            while True:
              line = next(self._lines).rstrip()
              if not line or line.startswith(' .'):
                break
              # clang includes ** fill, but gcc does not.
              if line.startswith(' ** fill'):
                # Alignment explicitly recorded in map file. Rather than
                # record padding based on these entries, we calculate it
                # using addresses. We do this because fill lines are not
                # present when compiling with gcc (only for clang).
                continue
              elif line.startswith(' **'):
                break
              elif name is None:
                address_str2, name = self._ParsePossiblyWrappedParts(line, 2)

            if address_str == '0xffffffffffffffff':
              # The section needs special handling (e.g., a merge section)
              # It also generally has a large offset after it, so don't
              # penalize the subsequent symbol for this gap (e.g. a 50kb gap).
              # There seems to be no corelation between where these gaps occur
              # and the symbols they come in-between.
              # TODO(agrieve): Learn more about why this happens.
              if address_str2:
                address = int(address_str2[2:], 16) - 1
              elif syms and syms[-1].address > 0:
                # Merge sym with no second line showing real address.
                address = syms[-1].end_address
              else:
                logging.warning('First symbol of section had address -1')
                address = 0
              merge_symbol_start_address = address + size
            else:
              address = int(address_str[2:], 16)
              # Finish off active address gap / merge section.
              if merge_symbol_start_address:
                merge_size = address - merge_symbol_start_address
                merge_symbol_start_address = 0
                if merge_size > 0:
                  # merge_size == 0 for the initial symbol generally.
                  logging.debug('Merge symbol of size %d found at:\n %r',
                                merge_size, syms[-1])
                  # Set size=0 so that it will show up as padding.
                  sym = models.Symbol(
                      section_name, 0, address=address,
                      full_name='** symbol gap %d' % symbol_gap_count)
                  symbol_gap_count += 1
                  syms.append(sym)
              # .text.res_findResource_60
              #                0x00178de8   0x12a obj/...
              #                0x00178de9   res_findResource_60
              # .text._ZN3url6ParsedC2Ev
              #                0x0021ad62   0x2e obj/url/url/url_parse.o
              #                0x0021ad63   url::Parsed::Parsed()
              # .text.unlikely._ZN4base3CPUC2Ev
              #                0x003f9d3c   0x48 obj/base/base/cpu.o
              #                0x003f9d3d   base::CPU::CPU()
              full_name = name
              if mangled_name and (not name or mangled_name.startswith('_Z') or
                                   '._Z' in mangled_name):
                full_name = mangled_name
              sym = models.Symbol(section_name, size, address=address,
                                  full_name=full_name, object_path=path)
              syms.append(sym)
        # Record trailing padding at the end of the section.
        section_end_address = section_address + section_size
        if section_name != models.SECTION_BSS and (
            syms[-1].end_address < section_end_address):
          # Set size=0 so that it will show up as padding.
          sym = models.Symbol(
              section_name, 0, address=section_end_address,
              full_name=('** symbol gap %d (end of section)' %
                         symbol_gap_count))
          syms.append(sym)
        logging.debug('Symbol count for %s: %d', section_name,
                      len(syms) - sym_count_at_start)
    except:
      logging.error('Problem line: %r', line)
      logging.error('In section: %r', section_name)
      raise
def Create(self, *args, **kwargs):
  """Finalizes any in-progress symbol, then starts a new one.

  All arguments are forwarded verbatim to the models.Symbol constructor;
  the resulting symbol becomes the new |cur_sym|.
  """
  self.Flush()
  new_sym = models.Symbol(*args, **kwargs)
  self.cur_sym = new_sym
def _CreateMergeStringsReplacements(merge_string_syms,
                                    list_of_positions_by_object_path):
  """Creates replacement symbols for |merge_syms|.

  Args:
    merge_string_syms: '** merge strings' symbols to expand into individual
        string literal symbols.
    list_of_positions_by_object_path: Parallel list (one entry per merge
        symbol); each entry maps object_path -> list of (offset, size) pairs
        locating string literals within that merge section.

  Returns:
    A list parallel to |merge_string_syms|. Each entry is the list of
    string-literal symbols replacing the corresponding merge symbol, with
    substrings removed, overlaps trimmed, and identical (address, size)
    entries linked together as aliases.
  """
  ret = []
  STRING_LITERAL_NAME = models.STRING_LITERAL_NAME
  assert len(merge_string_syms) == len(list_of_positions_by_object_path)
  tups = itertools.izip(merge_string_syms, list_of_positions_by_object_path)
  for merge_sym, positions_by_object_path in tups:
    merge_sym_address = merge_sym.address
    new_symbols = []
    ret.append(new_symbols)
    for object_path, positions in positions_by_object_path.iteritems():
      for offset, size in positions:
        # Offsets are relative to the start of the merge section.
        address = merge_sym_address + offset
        symbol = models.Symbol(
            models.SECTION_RODATA, size, address, STRING_LITERAL_NAME,
            object_path=object_path)
        new_symbols.append(symbol)

  logging.debug('Created %d string literal symbols', sum(len(x) for x in ret))
  logging.debug('Sorting string literals')
  for symbols in ret:
    # In order to achieve a total ordering in the presence of aliases, need to
    # include both |address| and |object_path|.
    # In order to achieve consistent deduping, need to include |size|.
    # -size sorts larger strings first so smaller overlapping ones can be
    # detected as substrings below.
    symbols.sort(key=lambda x: (x.address, -x.size, x.object_path))

  logging.debug('Deduping string literals')
  num_removed = 0
  size_removed = 0
  num_aliases = 0
  for i, symbols in enumerate(ret):
    if not symbols:
      continue
    prev_symbol = symbols[0]
    new_symbols = [prev_symbol]
    for symbol in symbols[1:]:
      # Negative padding means this string starts inside the previous one.
      padding = symbol.address - prev_symbol.end_address
      if (prev_symbol.address == symbol.address and
          prev_symbol.size == symbol.size):
        # String is an alias.
        num_aliases += 1
        aliases = prev_symbol.aliases
        if aliases:
          aliases.append(symbol)
          symbol.aliases = aliases
        else:
          # First alias encountered for this address: both symbols now share
          # one alias list.
          aliases = [prev_symbol, symbol]
          prev_symbol.aliases = aliases
          symbol.aliases = aliases
      elif padding + symbol.size <= 0:
        # String is a substring of prior one.
        num_removed += 1
        size_removed += symbol.size
        continue
      elif padding < 0:
        # String overlaps previous one. Adjust to not overlap.
        symbol.address -= padding
        symbol.size += padding
      new_symbols.append(symbol)
      prev_symbol = symbol
    ret[i] = new_symbols
    # Aliases come out in random order, so sort to be deterministic.
    ret[i].sort(key=lambda s: (s.address, s.object_path))

  logging.debug(
      'Removed %d overlapping string literals (%d bytes) & created %d aliases',
      num_removed, size_removed, num_aliases)
  return ret
def CreateSectionSizesAndSymbols(
    map_path=None, tool_prefix=None, output_directory=None, elf_path=None,
    apk_path=None, track_string_literals=True, metadata=None,
    apk_elf_result=None, pak_files=None, pak_info_file=None,
    knobs=None):
  """Creates sections sizes and symbols for a SizeInfo.

  Args:
    map_path: Path to the linker .map(.gz) file to parse.
    tool_prefix: Prefix for c++filt & nm (required).
    output_directory: Build output directory. If None, source_paths and symbol
        alias information will not be recorded.
    elf_path: Path to the corresponding unstripped ELF file. Used to find
        symbol aliases and inlined functions. Can be None.
    apk_path: Path to the .apk to analyze. When set, pak symbols and other apk
        entries are extracted, and ELF section sizes are taken from the apk's
        ELF (via |apk_elf_result|).
    track_string_literals: Whether to break down "** merge string" sections
        into smaller symbols (requires output_directory).
    metadata: Forwarded to _ParseApkElfSectionSize when |apk_path| is set.
    apk_elf_result: Result holder for the apk's ELF analysis; consumed by
        _ParseApkElfSectionSize (presumably an async result — see caller).
    pak_files: List of .pak files to analyze (used when |apk_path| is unset).
    pak_info_file: Path to the .pak.info file matching |pak_files|.
    knobs: SectionSizeKnobs with tuning options. Defaults to a fresh
        SectionSizeKnobs().

  Returns:
    A tuple of (section_sizes, raw_symbols).
  """
  if knobs is None:
    # Avoid a mutable default argument: a single SectionSizeKnobs() instance
    # would otherwise be created at import time and shared across all calls.
    knobs = SectionSizeKnobs()
  source_mapper = None
  elf_object_paths = None
  if output_directory:
    # Start by finding the elf_object_paths, so that nm can run on them while
    # the linker .map is being parsed.
    logging.info('Parsing ninja files.')
    source_mapper, elf_object_paths = ninja_parser.Parse(
        output_directory, elf_path)
    logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count)
    assert not elf_path or elf_object_paths, (
        'Failed to find link command in ninja files for ' +
        os.path.relpath(elf_path, output_directory))

  section_sizes, raw_symbols = _ParseElfInfo(
      map_path, elf_path, tool_prefix, output_directory,
      track_string_literals, elf_object_paths)
  elf_overhead_size = _CalculateElfOverhead(section_sizes, elf_path)

  pak_symbols_by_id = None
  if apk_path:
    pak_symbols_by_id = _FindPakSymbolsFromApk(apk_path, output_directory,
                                               knobs)
    section_sizes, elf_overhead_size = _ParseApkElfSectionSize(
        section_sizes, metadata, apk_elf_result)
    raw_symbols.extend(_ParseApkOtherSymbols(section_sizes, apk_path))
  elif pak_files and pak_info_file:
    pak_symbols_by_id = _FindPakSymbolsFromFiles(
        pak_files, pak_info_file, output_directory)

  if elf_path:
    # Account for bytes of the ELF not covered by any section.
    elf_overhead_symbol = models.Symbol(
        models.SECTION_OTHER, elf_overhead_size,
        full_name='Overhead: ELF file')
    prev = section_sizes.setdefault(models.SECTION_OTHER, 0)
    section_sizes[models.SECTION_OTHER] = prev + elf_overhead_size
    raw_symbols.append(elf_overhead_symbol)

  if pak_symbols_by_id:
    object_paths = (p for p in source_mapper.IterAllPaths()
                    if p.endswith('.o'))
    pak_raw_symbols = _ParsePakSymbols(
        section_sizes, object_paths, output_directory, pak_symbols_by_id)
    raw_symbols.extend(pak_raw_symbols)

  _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper)
  logging.info('Converting excessive aliases into shared-path symbols')
  _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs)
  logging.debug('Connecting nm aliases')
  _ConnectNmAliases(raw_symbols)
  return section_sizes, raw_symbols
def Parse(self, lines):
  """Parses a linker map file.

  Args:
    lines: Iterable of lines, the first of which has been consumed to
    identify file type.

  Returns:
    A tuple of (section_ranges, symbols, extras), where |extras| is a dict
    containing 'thin_map' (address -> Thin-LTO cache file basename).
  """
  # Newest format:
  #     VMA      LMA     Size Align Out     In      Symbol
  #     194      194       13     1 .interp
  #     194      194       13     1         <internal>:(.interp)
  #     1a8      1a8     22d8     4 .ARM.exidx
  #     1b0      1b0        8     4         obj/sandbox/syscall.o:(.ARM.exidx)
  #     400      400   123400    64 .text
  #     600      600       14     4         ...:(.text.OUTLINED_FUNCTION_0)
  #     600      600        0     1                 $x.3
  #     600      600       14     1                 OUTLINED_FUNCTION_0
  #  123800   123800    20000   256 .rodata
  #  123800   123800        4     4         ...:o:(.rodata._ZN3fooE.llvm.1234)
  #  123800   123800        4     1                 foo (.llvm.1234)
  #  123804   123804        4     4         ...:o:(.rodata.bar.llvm.1234)
  #  123804   123804        4     1                 bar.llvm.1234
  # Older format:
  # Address          Size             Align Out     In      Symbol
  # 00000000002002a8 000000000000001c     1 .interp
  # 00000000002002a8 000000000000001c     1         <internal>:(.interp)
  # ...
  # 0000000000201000 0000000000000202    16 .text
  # 0000000000201000 000000000000002a     1         /[...]/crt1.o:(.text)
  # 0000000000201000 0000000000000000     0                 _start
  # 000000000020102a 0000000000000000     1         /[...]/crti.o:(.text)
  # 0000000000201030 00000000000000bd    16         /[...]/crtbegin.o:(.text)
  # 0000000000201030 0000000000000000     0                 deregister_tm_clones
  # 0000000000201060 0000000000000000     0                 register_tm_clones
  # 00000000002010a0 0000000000000000     0                 __do_global_dtors_aux
  # 00000000002010c0 0000000000000000     0                 frame_dummy
  # 00000000002010ed 0000000000000071     1         a.o:(.text)
  # 00000000002010ed 0000000000000071     0                 main
  syms = []
  cur_section = None
  cur_section_is_useful = False
  promoted_name_count = 0
  # |is_partial| indicates that an eligible Level 3 line should be used to
  # update |syms[-1].full_name| instead of creating a new symbol.
  is_partial = False
  # Assembly code can create consecutive Level 3 lines with |size == 0|. These
  # lines can represent
  # (1) assembly functions (should form symbol), or
  # (2) assembly labels (should NOT form symbol).
  # It seems (2) correlates with the presence of a leading Level 3 line with
  # |size > 0|. This gives rise to the following strategy: Each symbol S from
  # a Level 3 line suppresses Level 3 lines with |address| less than
  # |next_usable_address := S.address + S.size|.
  next_usable_address = 0

  # For Thin-LTO, a map from each address to the Thin-LTO cache file. This
  # provides hints downstream to identify object_paths for .L.ref.tmp symbols,
  # but is not useful in the final output. Therefore it's stored separately,
  # instead of being in Symbol.
  thin_map = {}

  tokenizer = self.Tokenize(lines)

  in_partitions = False
  in_jump_table = False
  jump_tables_count = 0
  jump_entries_count = 0

  for (line, address, size, level, span, tok) in tokenizer:
    # Level 1 data match the "Out" column. They specify sections or
    # PROVIDE_HIDDEN lines.
    if level == 1:
      # Ignore sections that belong to feature library partitions. Seeing a
      # partition name is an indicator that we've entered a list of feature
      # partitions. After these, a single .part.end section will follow to
      # reserve memory at runtime. Seeing the .part.end section also marks the
      # end of partition sections in the map file.
      if tok.endswith('_partition'):
        in_partitions = True
      elif tok == '.part.end':
        # Note that we want to retain .part.end section, so it's fine to
        # restart processing on this section, rather than the next one.
        in_partitions = False

      if in_partitions:
        # For now, completely ignore feature partitions.
        cur_section = None
        cur_section_is_useful = False
      else:
        if not tok.startswith('PROVIDE_HIDDEN'):
          self._section_ranges[tok] = (address, size)
        cur_section = tok
        # E.g., Want to convert "(.text._name)" -> "_name" later.
        mangled_start_idx = len(cur_section) + 2
        cur_section_is_useful = (
            cur_section in models.BSS_SECTIONS
            or cur_section in (models.SECTION_RODATA, models.SECTION_TEXT)
            or cur_section.startswith(models.SECTION_DATA))
    elif cur_section_is_useful:
      # Level 2 data match the "In" column. They specify object paths and
      # section names within objects, or '<internal>:...'.
      if level == 2:
        # E.g., 'path.o:(.text._name)' => ['path.o', '(.text._name)'].
        cur_obj, paren_value = tok.split(':')

        in_jump_table = '.L.cfi.jumptable' in paren_value
        if in_jump_table:
          # Store each CFI jump table as a Level 2 symbol, whose Level 3
          # details are discarded.
          jump_tables_count += 1
          cur_obj = ''  # Replaces 'lto.tmp' to prevent problem later.
          mangled_name = '** CFI jump table'
        else:
          # E.g., '(.text.unlikely._name)' -> '_name'.
          mangled_name = paren_value[mangled_start_idx:-1]

        cur_flags = _FlagsFromMangledName(mangled_name)
        is_partial = True
        # As of 2017/11 LLD does not distinguish merged strings from other
        # merged data. Feature request is filed under:
        # https://bugs.llvm.org/show_bug.cgi?id=35248
        if cur_obj == '<internal>':
          if cur_section == '.rodata' and mangled_name == '':
            # Treat all <internal> sections within .rodata as string
            # literals. Some may hold numeric constants or other data, but
            # there is currently no way to distinguish them.
            mangled_name = '** lld merge strings'
          else:
            # e.g. <internal>:(.text.thunk)
            mangled_name = '** ' + mangled_name

          is_partial = False
          cur_obj = None
        elif cur_obj == 'lto.tmp' or 'thinlto-cache' in cur_obj:
          thin_map[address] = os.path.basename(cur_obj)
          cur_obj = None

        # Create a symbol here since there may be no ensuing Level 3 lines.
        # But if there are, then the symbol can be modified later as sym[-1].
        sym = models.Symbol(cur_section, size, address=address,
                            full_name=mangled_name, object_path=cur_obj,
                            flags=cur_flags)
        syms.append(sym)

        # Level 3 |address| is nested under Level 2, don't add |size|.
        next_usable_address = address

      # Level 3 data match the "Symbol" column. They specify symbol names or
      # special names such as '.L_MergeGlobals'. Annotations such as '$d',
      # '$t.42' also appear at Level 3, but they are consumed by |tokenizer|,
      # so don't appear here.
      elif level == 3:
        # Handle .L.cfi.jumptable.
        if in_jump_table:
          # Level 3 entries in CFI jump tables are thunks with mangled names.
          # Extracting them as symbols is not worthwhile; we only store the
          # Level 2 symbol, and print the count for verbose output. For
          # counting, '__typeid_' entries are excluded since they're likely
          # just annotations.
          if not tok.startswith('__typeid_'):
            jump_entries_count += 1
          continue

        # Ignore anything with '.L_MergedGlobals' prefix. This seems to only
        # happen for ARM (32-bit) builds.
        if tok.startswith('.L_MergedGlobals'):
          continue

        # Use |span| to decide whether to use a Level 3 line for Symbols. This
        # is useful for two purposes:
        # * This is a better indicator than |size|, which can be 0 for
        #   assembly functions.
        # * If multiple Level 3 lines have the same starting address, this
        #   causes all but the last line to have |span > 0|. This dedups lines
        #   with identical symbol names (why do they exist?). Note that this
        #   also skips legitimate aliases, but that's desired because nm.py
        #   (downstream) assumes no aliases already exist.
        if span > 0:
          stripped_tok = demangle.StripLlvmPromotedGlobalNames(tok)
          if len(tok) != len(stripped_tok):
            promoted_name_count += 1
            tok = stripped_tok
          tok = _NormalizeName(tok)

          # Handle special case where a partial symbol consumes bytes before
          # the first Level 3 symbol.
          if is_partial and syms[-1].address < address:
            # Truncate the partial symbol and leave it without |full_name|.
            # The data from the current line will form a new symbol.
            syms[-1].size = address - syms[-1].address
            next_usable_address = address
            is_partial = False

          if is_partial:
            syms[-1].full_name = tok
            syms[-1].size = size if size > 0 else min(syms[-1].size, span)
            next_usable_address = address + syms[-1].size
            is_partial = False
          elif address >= next_usable_address:
            if tok.startswith('__typeid_'):
              assert size == 1
              if tok.endswith('_byte_array'):
                # CFI byte array table: |size| is inaccurate, so use |span|.
                size_to_use = span
              else:
                # Likely '_global_addr' or '_unique_member'. These should be:
                # * Skipped since they're in CFI tables.
                # * Suppressed (via |next_usable_address|) by another Level 3
                #   symbol.
                # Anything that makes it here would be an anomaly worthy of
                # investigation, so print warnings.
                # Note: logging.warn is a deprecated alias; use warning().
                logging.warning('Unrecognized __typeid_ symbol at %08X',
                                address)
                continue
            else:
              # Prefer |size|, and only fall back to |span| if |size == 0|.
              size_to_use = size if size > 0 else span
            sym = models.Symbol(cur_section, size_to_use, address=address,
                                full_name=tok, flags=cur_flags)
            syms.append(sym)

            # Suppress symbols with overlapping |address|. This eliminates
            # labels from assembly sources.
            next_usable_address = address + size_to_use
            if cur_obj is not None:
              syms[-1].object_path = cur_obj
      else:
        logging.error('Problem line: %r', line)

  if promoted_name_count:
    logging.info('Found %d promoted global names', promoted_name_count)
  if jump_tables_count:
    logging.info('Found %d CFI jump tables with %d total entries',
                 jump_tables_count, jump_entries_count)
  return self._section_ranges, syms, {'thin_map': thin_map}
def Parse(self, lines):
  """Parses a linker map file.

  Args:
    lines: Iterable of lines, the first of which has been consumed to
    identify file type.

  Returns:
    A tuple of (section_sizes, symbols).
  """
  # Newest format:
  #     VMA      LMA     Size Align Out     In      Symbol
  #     194      194       13     1 .interp
  #     194      194       13     1         <internal>:(.interp)
  #     1a8      1a8     22d8     4 .ARM.exidx
  #     1b0      1b0        8     4         obj/sandbox/syscall.o:(.ARM.exidx)
  #     400      400   123400    64 .text
  #     600      600       14     4         obj/...:(.text.OUTLINED_FUNCTION_0)
  #     600      600        0     1                 $x.3
  #     600      600       14     1                 OUTLINED_FUNCTION_0
  #  123800   123800    20000   256 .rodata
  #  123800   123800        4     4         ...:o:(.rodata._ZN3fooE.llvm.1234)
  #  123800   123800        4     1                 foo (.llvm.1234)
  #  123804   123804        4     4         ...:o:(.rodata.bar.llvm.1234)
  #  123804   123804        4     1                 bar.llvm.1234
  # Older format:
  # Address          Size             Align Out     In      Symbol
  # 00000000002002a8 000000000000001c     1 .interp
  # 00000000002002a8 000000000000001c     1         <internal>:(.interp)
  # ...
  # 0000000000201000 0000000000000202    16 .text
  # 0000000000201000 000000000000002a     1         /[...]/crt1.o:(.text)
  # 0000000000201000 0000000000000000     0                 _start
  # 000000000020102a 0000000000000000     1         /[...]/crti.o:(.text)
  # 0000000000201030 00000000000000bd    16         /[...]/crtbegin.o:(.text)
  # 0000000000201030 0000000000000000     0                 deregister_tm_clones
  # 0000000000201060 0000000000000000     0                 register_tm_clones
  # 00000000002010a0 0000000000000000     0                 __do_global_dtors_aux
  # 00000000002010c0 0000000000000000     0                 frame_dummy
  # 00000000002010ed 0000000000000071     1         a.o:(.text)
  # 00000000002010ed 0000000000000071     0                 main
  syms = []
  cur_section = None
  # NOTE(review): initialized to None here (the sibling parser uses False);
  # both are falsy, so behavior is the same.
  cur_section_is_useful = None
  promoted_name_count = 0
  # A Level 2 line does not supply |full_name| data (unless '<internal>').
  # This would be taken from a Level 3 line. |is_partial| indicates that an
  # eligible Level 3 line should be used to update |syms[-1].full_name|
  # instead of creating a new symbol.
  is_partial = False
  # Assembly code can create consecutive Level 3 lines with |size == 0|. These
  # lines can represent
  # (1) assembly functions (should form symbol), or
  # (2) assembly labels (should NOT form symbol).
  # It seems (2) correlates with the presence of a leading Level 3 line with
  # |size > 0|. This gives rise to the following strategy: Each symbol S from
  # a Level 3 line suppresses Level 3 lines with |address| less than
  # |next_usable_address := S.address + S.size|.
  next_usable_address = 0
  tokenizer = self.Tokenize(lines)
  for (line, address, size, level, span, tok) in tokenizer:
    # Level 1 data match the "Out" column. They specify sections or
    # PROVIDE_HIDDEN lines.
    if level == 1:
      if not tok.startswith('PROVIDE_HIDDEN'):
        self._section_sizes[tok] = size
      cur_section = tok
      # E.g., Want to convert "(.text._name)" -> "_name" later.
      mangled_start_idx = len(cur_section) + 2
      cur_section_is_useful = (
          cur_section in (models.SECTION_BSS,
                          models.SECTION_RODATA,
                          models.SECTION_TEXT) or
          cur_section.startswith(models.SECTION_DATA))
    elif cur_section_is_useful:
      # Level 2 data match the "In" column. They specify object paths and
      # section names within objects, or '<internal>:...'.
      if level == 2:
        # Create a symbol here since there may be no ensuing Level 3 lines.
        # But if there are, then the symbol can be modified later as sym[-1].
        syms.append(models.Symbol(cur_section, size, address=address))
        # E.g., 'path.o:(.text._name)' => ['path.o', '(.text._name)'].
        cur_obj, paren_value = tok.split(':')
        # E.g., '(.text._name)' -> '_name'.
        mangled_name = paren_value[mangled_start_idx:-1]
        # As of 2017/11 LLD does not distinguish merged strings from other
        # merged data. Feature request is filed under:
        # https://bugs.llvm.org/show_bug.cgi?id=35248
        if cur_obj == '<internal>':
          if cur_section == '.rodata' and mangled_name == '':
            # Treat all <internal> sections within .rodata as string
            # literals. Some may hold numeric constants or other data, but
            # there is currently no way to distinguish them.
            syms[-1].full_name = '** lld merge strings'
          else:
            # e.g. <internal>:(.text.thunk)
            syms[-1].full_name = '** ' + mangled_name
          cur_obj = None
        elif cur_obj == 'lto.tmp' or 'thinlto-cache' in cur_obj:
          cur_obj = None
        if cur_obj is not None:
          syms[-1].object_path = cur_obj
        # Only await a Level 3 name if none was assigned above.
        is_partial = not bool(syms[-1].full_name)
        # Level 3 |address| is nested under Level 2, don't add |size|.
        next_usable_address = address
      # Level 3 data match the "Symbol" column. They specify symbol names or
      # special names such as '.L_MergeGlobals'. Annotations such as '$d',
      # '$t.42' also appear at Level 3, but they are consumed by |tokenizer|,
      # so don't appear here.
      elif level == 3:
        # Ignore anything with '.L_MergedGlobals' prefix. This seems to only
        # happen for ARM (32-bit) builds.
        if tok.startswith('.L_MergedGlobals'):
          continue
        # Use |span| to decide whether to use a Level 3 line for Symbols. This
        # is useful for two purposes:
        # * This is a better indicator than |size|, which can be 0 for
        #   assembly functions.
        # * If multiple Level 3 lines have the same starting address, this
        #   causes all but the last line to have |span > 0|. This dedups lines
        #   with identical symbol names (why do they exist?). Note that this
        #   also skips legitimate aliases, but that's desired because nm.py
        #   (downstream) assumes no aliases already exist.
        if span > 0:
          # Outlined functions have names like OUTLINED_FUNCTION_0, which can
          # appear 1000+ time, and can cause false aliasing. We treat these as
          # special cases by designating them as a placeholder symbols and
          # renaming them to '** outlined function'.
          if tok.startswith('OUTLINED_FUNCTION_'):
            tok = '** outlined function'
          stripped_tok = demangle.StripLlvmPromotedGlobalNames(tok)
          if len(tok) != len(stripped_tok):
            promoted_name_count += 1
            tok = stripped_tok
          # Handle special case where a partial symbol consumes bytes before
          # the first Level 3 symbol.
          if is_partial and syms[-1].address < address:
            # Truncate the partial symbol and leave it without |full_name|.
            # The data from the current line will form a new symbol.
            syms[-1].size = address - syms[-1].address
            next_usable_address = address
            is_partial = False
          if is_partial:
            syms[-1].full_name = tok
            syms[-1].size = size if size > 0 else min(syms[-1].size, span)
            next_usable_address = address + syms[-1].size
            is_partial = False
          elif address >= next_usable_address:
            # Prefer |size|, and only fall back to |span| if |size == 0|.
            size_to_use = size if size > 0 else span
            syms.append(
                models.Symbol(
                    cur_section, size_to_use, address=address, full_name=tok))
            # Suppress symbols with overlapping |address|. This eliminates
            # labels from assembly sources.
            next_usable_address = address + size_to_use
            if cur_obj is not None:
              syms[-1].object_path = cur_obj
      else:
        logging.error('Problem line: %r', line)

  if promoted_name_count:
    logging.info('Found %d promoted global names', promoted_name_count)
  return self._section_sizes, syms