def CollectAliasesByAddress(elf_path, tool_prefix):
  """Runs nm on |elf_path| and returns a dict of address->[names].

  Only symbols whose nm type is 't', 'T' or 'W', whose address is non-zero and
  whose name is accepted by _IsRelevantNmName() are considered.

  Args:
    elf_path: Path of the ELF file to pass to nm.
    tool_prefix: Passed to path_util.GetNmPath() to locate nm, and to
        demangle.DemangleSetsInDicts().

  Returns:
    A dict of address (int) -> sorted list of names, containing only the
    addresses that have more than one name after demangling.

  Raises:
    subprocess.CalledProcessError: If nm exits with a non-zero status.
  """
  # Constructors often show up twice, so use sets to ensure no duplicates.
  names_by_address = collections.defaultdict(set)

  # About 60mb of output, but piping takes ~30s, and loading it into RAM
  # directly takes 3s.
  args = [path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
          elf_path]
  output = subprocess.check_output(args)
  # check_output() returns bytes on Python 3. Decode so that the str-based
  # parsing below works there too (no-op on Python 2, where bytes is str).
  if not isinstance(output, str):
    output = output.decode('utf-8')

  for line in output.splitlines():
    # Lines look like: "<hex address> <type char> <mangled name>".
    space_idx = line.find(' ')
    address_str = line[:space_idx]
    section = line[space_idx + 1]
    mangled_name = line[space_idx + 3:]

    # To verify that rodata does not have aliases:
    #   nm --no-sort --defined-only libchrome.so > nm.out
    #   grep -v '\$' nm.out | grep ' r ' | sort | cut -d' ' -f1 > addrs
    #   wc -l < addrs; uniq < addrs | wc -l
    if section not in 'tTW' or not _IsRelevantNmName(mangled_name):
      continue

    address = int(address_str, 16)
    if not address:
      continue
    names_by_address[address].add(mangled_name)

  # Demangle all names.
  names_by_address = demangle.DemangleSetsInDicts(names_by_address, tool_prefix)

  # Since this is run in a separate process, minimize data passing by returning
  # only aliased symbols.
  # Also: Sort to ensure stable ordering.
  # Note: items() rather than iteritems() so this works on Python 2 and 3.
  return {k: sorted(v) for k, v in names_by_address.items() if len(v) > 1}
def RunNmOnIntermediates(target, tool_prefix, output_directory):
  """Runs nm on an archive or on a list of object files and parses the output.

  Args:
    target: Either a single path to a .a (as a string), or a list of .o paths.
    tool_prefix: Passed to path_util.GetNmPath() to locate nm.
    output_directory: Working directory in which nm is run.

  Returns:
    A 3-tuple of:
      * encoded_symbol_names_by_path,
      * encoded_string_addresses_by_path,
      * num_no_symbols: Number of lines that nm wrote to stderr.
    The first two are encoded with parallel.EncodeDictOfLists(). The same
    3-tuple shape is returned when nm produces no output.

  Raises:
    AssertionError: If nm exits with a non-zero status.
  """
  is_archive = isinstance(target, str)
  args = [path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only']
  if is_archive:
    args.append(target)
  else:
    args.extend(target)
  # pylint: disable=unexpected-keyword-arg
  proc = subprocess.Popen(args,
                          cwd=output_directory,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          encoding='utf-8')
  # llvm-nm can print 'no symbols' to stderr. Capture and count the number of
  # lines, to be returned to the caller.
  stdout, stderr = proc.communicate()
  assert proc.returncode == 0, 'NM failed: ' + ' '.join(args)
  num_no_symbols = len(stderr.splitlines())

  lines = stdout.splitlines()
  # Empty .a file has no output.
  if not lines:
    # Keep the same 3-tuple shape as the normal return below so that callers
    # can unpack the result unconditionally.
    return (parallel.EMPTY_ENCODED_DICT, parallel.EMPTY_ENCODED_DICT,
            num_no_symbols)

  # Multi-file output starts with a blank line followed by a "<path>:" header.
  is_multi_file = not lines[0]
  lines = iter(lines)
  if is_multi_file:
    next(lines)
    path = next(lines)[:-1]  # Path ends with a colon.
  else:
    assert not is_archive
    path = target[0]

  symbol_names_by_path = {}
  string_addresses_by_path = {}
  while path:
    if is_archive:
      # E.g. foo/bar.a(baz.o)
      path = '%s(%s)' % (target, path)

    mangled_symbol_names, string_addresses = _ParseOneObjectFileNmOutput(
        lines)
    symbol_names_by_path[path] = mangled_symbol_names
    if string_addresses:
      string_addresses_by_path[path] = string_addresses
    # Next "<path>:" header. Once |lines| is exhausted, the ':' default is
    # sliced to '' which ends the loop.
    path = next(lines, ':')[:-1]

  # The multiprocess API uses pickle, which is ridiculously slow. More than 2x
  # faster to use join & split.
  # TODO(agrieve): We could use path indices as keys rather than paths to cut
  #     down on marshalling overhead.
  return (parallel.EncodeDictOfLists(symbol_names_by_path),
          parallel.EncodeDictOfLists(string_addresses_by_path), num_no_symbols)
def _RunNmOnIntermediates(target, tool_prefix, output_directory):
  """Runs nm --demangle on an archive or on a list of object files.

  Args:
    target: Either a single path to a .a (as a string), or a list of .o paths.
    tool_prefix: Passed to path_util.GetNmPath() to locate nm.
    output_directory: Working directory in which nm is run.

  Returns:
    A tuple of (encoded_symbol_names_by_path,
    encoded_string_addresses_by_path), both encoded with
    concurrent.EncodeDictOfLists().

  Raises:
    subprocess.CalledProcessError: If nm exits with a non-zero status.
  """
  # type(u'') is |unicode| on Python 2 and |str| on Python 3, so this matches
  # what isinstance(target, basestring) did without requiring the Python 2-only
  # |basestring| name.
  is_archive = isinstance(target, (str, type(u'')))
  args = [
      path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
      '--demangle'
  ]
  if is_archive:
    args.append(target)
  else:
    args.extend(target)
  output = subprocess.check_output(args, cwd=output_directory)
  # check_output() returns bytes on Python 3. Decode so that paths built with
  # '%s' below are text (no-op on Python 2, where bytes is str).
  if not isinstance(output, str):
    output = output.decode('utf-8')
  lines = output.splitlines()
  # Empty .a file has no output.
  if not lines:
    return concurrent.EMPTY_ENCODED_DICT, concurrent.EMPTY_ENCODED_DICT

  # Multi-file output starts with a blank line followed by a "<path>:" header.
  is_multi_file = not lines[0]
  lines = iter(lines)
  if is_multi_file:
    next(lines)
    path = next(lines)[:-1]  # Path ends with a colon.
  else:
    assert not is_archive
    path = target[0]

  string_addresses_by_path = {}
  symbol_names_by_path = {}
  while path:
    if is_archive:
      # E.g. foo/bar.a(baz.o)
      path = '%s(%s)' % (target, path)

    string_addresses, symbol_names = _ParseOneObjectFileNmOutput(lines)
    symbol_names_by_path[path] = symbol_names
    if string_addresses:
      string_addresses_by_path[path] = string_addresses
    # Next "<path>:" header. Once |lines| is exhausted, the ':' default is
    # sliced to '' which ends the loop.
    path = next(lines, ':')[:-1]

  # The multiprocess API uses pickle, which is ridiculously slow. More than 2x
  # faster to use join & split.
  # TODO(agrieve): We could use path indices as keys rather than paths to cut
  #     down on marshalling overhead.
  return (concurrent.EncodeDictOfLists(symbol_names_by_path),
          concurrent.EncodeDictOfLists(string_addresses_by_path))
def CollectAliasesByAddress(elf_path, tool_prefix):
  """Runs nm on |elf_path| and returns a dict of address->[names].

  Only symbols whose nm type is 't', 'T' or 'W', whose address is non-zero and
  whose name is accepted by _IsRelevantNmName() are considered. All
  OUTLINED_FUNCTION_* symbols that share an address are merged into a single
  '** outlined function' entry (with ' * N' appended when N > 1).

  Args:
    elf_path: Path of the ELF file to pass to nm.
    tool_prefix: Passed to path_util.GetNmPath() to locate nm, and to
        demangle.DemangleSetsInDicts().

  Returns:
    A dict of address (int) -> sorted list of names. An address is included
    when it has more than one name, or when more than one OUTLINED_FUNCTION_*
    symbol was found at it. Names starting with '**' sort last.
  """
  # About 60mb of output, but piping takes ~30s, and loading it into RAM
  # directly takes 3s.
  nm_cmd = [
      path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only', elf_path
  ]
  # pylint: disable=unexpected-keyword-arg
  nm_proc = subprocess.Popen(nm_cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             encoding='utf-8')
  # llvm-nm may write to stderr. Discard to denoise.
  nm_stdout = nm_proc.communicate()[0]
  assert nm_proc.returncode == 0

  # Constructors often show up twice, so use sets to ensure no duplicates.
  aliases = collections.defaultdict(set)

  # Many OUTLINED_FUNCTION_* entries can coexist on a single address, possibly
  # mixed with regular symbols. However, naively keeping these is bad because:
  # * OUTLINED_FUNCTION_* can have many duplicates. Keeping them would cause
  #   false associations downstream, when looking up object_paths from names.
  # * For addresses with multiple OUTLINED_FUNCTION_* entries, we can't get the
  #   associated object_path (exception: the one entry in the .map file, for
  #   LLD without ThinLTO). So keeping copies around is rather useless.
  # Our solution is to merge OUTLINED_FUNCTION_* entries at the same address
  # into a single symbol. We'd also like to keep track of the number of copies
  # (although it will not be used to compute PSS computation). This is done by
  # writing the count in the name, e.g., '** outlined function * 5'.
  outlined_counts = collections.Counter()

  for nm_line in nm_stdout.splitlines():
    # Lines look like: "<hex address> <type char> <mangled name>".
    sep = nm_line.find(' ')
    sym_type = nm_line[sep + 1]
    sym_name = nm_line[sep + 3:]

    # To verify that rodata does not have aliases:
    #   nm --no-sort --defined-only libchrome.so > nm.out
    #   grep -v '\$' nm.out | grep ' r ' | sort | cut -d' ' -f1 > addrs
    #   wc -l < addrs; uniq < addrs | wc -l
    if not (sym_type in 'tTW' and _IsRelevantNmName(sym_name)):
      continue

    addr = int(nm_line[:sep], 16)
    if addr == 0:
      continue

    if sym_name.startswith('OUTLINED_FUNCTION_'):
      outlined_counts[addr] += 1
    else:
      aliases[addr].add(sym_name)

  # Need to add before demangling because |aliases| changes type.
  for addr, count in outlined_counts.items():
    label = '** outlined function'
    if count > 1:
      label += ' * %d' % count
    aliases[addr].add(label)

  # Demangle all names.
  aliases = demangle.DemangleSetsInDicts(aliases, tool_prefix)

  # Since this is run in a separate process, minimize data passing by returning
  # only aliased symbols.
  # Also: Sort to ensure stable ordering.
  ret = {}
  for addr, names in aliases.items():
    if len(names) > 1 or outlined_counts.get(addr, 0) > 1:
      ret[addr] = sorted(names, key=lambda n: (n.startswith('**'), n))
  return ret
def CreateUniqueSymbols(elf_path, tool_prefix, section_ranges):
  """Creates symbols from nm --print-size output.

  Creates only one symbol for each address (does not create symbol aliases).

  Args:
    elf_path: Path of the ELF file to pass to nm.
    tool_prefix: Passed to path_util.GetNmPath() to locate nm.
    section_ranges: Dict of section name -> (start address, size). Only
        sections listed in models.NATIVE_SECTIONS are used.

  Returns:
    A list of models.Symbol ordered by address, each assigned to the section
    whose address range contains it. Symbol sizes never extend past the next
    symbol nor past the end of their section. Returns an empty list when
    |section_ranges| contains none of models.NATIVE_SECTIONS.

  Raises:
    subprocess.CalledProcessError: If nm exits with a non-zero status.
  """
  # Filter to sections we care about and sort by (address, size).
  section_ranges = [
      x for x in section_ranges.items() if x[0] in models.NATIVE_SECTIONS
  ]
  if not section_ranges:
    # Nothing to attribute symbols to (and indexing below would fail).
    return []
  section_ranges.sort(key=lambda x: x[1])
  min_address = section_ranges[0][1][0]
  max_address = sum(section_ranges[-1][1])

  args = [
      path_util.GetNmPath(tool_prefix), '--no-sort', '--defined-only',
      '--print-size', elf_path
  ]
  # pylint: disable=unexpected-keyword-arg
  stdout = subprocess.check_output(args,
                                   stderr=subprocess.DEVNULL,
                                   encoding='utf-8')
  lines = stdout.splitlines()
  logging.debug('Parsing %d lines of output', len(lines))
  symbols_by_address = {}
  # Example 32-bit output:
  # 00857f94 00000004 t __on_dlclose_late
  # 000001ec r ndk_build_number
  for line in lines:
    tokens = line.split(' ', 3)
    num_tokens = len(tokens)
    if num_tokens < 3:
      # Address with no size and no name.
      continue
    address_str = tokens[0]
    # Check if size is omitted (can happen with binutils but not llvm).
    if num_tokens == 3:
      size_str = '0'
      section = tokens[1]
      mangled_name = tokens[2]
    else:
      size_str = tokens[1]
      section = tokens[2]
      mangled_name = tokens[3]

    if section not in 'BbDdTtRrWw' or not _IsRelevantNmName(mangled_name):
      continue

    address = int(address_str, 16)
    # Ignore symbols outside of sections that we care about.
    # Symbols can still exist in sections that we do not care about if those
    # sections are interleaved. We discard such symbols in the next loop.
    if not min_address <= address < max_address:
      continue

    # Pick the alias that defines a size.
    existing_alias = symbols_by_address.get(address)
    if existing_alias and existing_alias.size > 0:
      continue

    size = int(size_str, 16)

    # E.g.: .str.2.llvm.12282370934750212
    if mangled_name.startswith('.str.'):
      mangled_name = models.STRING_LITERAL_NAME
    elif mangled_name.startswith('__ARMV7PILongThunk_'):
      # Convert thunks from prefix to suffix so that name is demangleable.
      mangled_name = mangled_name[len('__ARMV7PILongThunk_'):] + '.LongThunk'
    elif mangled_name.startswith('__ThumbV7PILongThunk_'):
      mangled_name = mangled_name[len('__ThumbV7PILongThunk_'):] + '.LongThunk'

    # Use address (next loop) to determine between .data and .data.rel.ro.
    section_name = None
    if section in 'Tt':
      section_name = models.SECTION_TEXT
    elif section in 'Rr':
      section_name = models.SECTION_RODATA
    elif section in 'Bb':
      section_name = models.SECTION_BSS

    # No need to demangle names since they will be demangled by
    # DemangleRemainingSymbols().
    symbols_by_address[address] = models.Symbol(section_name,
                                                size,
                                                address=address,
                                                full_name=mangled_name)

  logging.debug('Sorting %d NM symbols', len(symbols_by_address))
  # Sort symbols by address.
  sorted_symbols = sorted(symbols_by_address.values(), key=lambda s: s.address)

  # Assign section to symbols based on address, and size where unspecified.
  # Use address rather than nm's section character to distinguish between
  # .data.rel.ro and .data.
  logging.debug('Assigning section_name and filling in missing sizes')
  section_range_iter = iter(section_ranges)
  section_end = -1
  raw_symbols = []
  active_assembly_sym = None
  for i, sym in enumerate(sorted_symbols):
    # Move to next section if applicable.
    while sym.address >= section_end:
      section_range = next(section_range_iter)
      section_name, (section_start, section_size) = section_range
      section_end = section_start + section_size

    # Skip symbols that don't fall into a section that we care about
    # (e.g. GCC_except_table533 from .eh_frame).
    if sym.address < section_start:
      continue

    if sym.section_name and sym.section_name != section_name:
      logging.warning('Re-assigning section for %r to %s', sym, section_name)
    sym.section_name = section_name

    if i + 1 < len(sorted_symbols):
      # The next symbol may belong to a later section (or to a gap between
      # sections), so never let this symbol extend past its own section.
      next_addr = min(section_end, sorted_symbols[i + 1].address)
    else:
      next_addr = section_end

    # Heuristic: Discard subsequent assembly symbols (no size) that are ALL_CAPS
    # or .-prefixed, since they are likely labels within a function.
    if (active_assembly_sym and sym.size == 0
        and sym.section_name == models.SECTION_TEXT):
      if sym.full_name.startswith('.') or sym.full_name.isupper():
        active_assembly_sym.size += next_addr - sym.address
        # Triggers ~30 times for all of libchrome.so.
        logging.debug('Discarding assembly label: %s', sym.full_name)
        continue

    active_assembly_sym = sym if sym.size == 0 else None

    # For assembly symbols:
    # Add in a size when absent and guard against size overlapping next symbol.
    if active_assembly_sym or sym.end_address > next_addr:
      sym.size = next_addr - sym.address

    raw_symbols.append(sym)

  return raw_symbols