def Kill(self, sig, log_level, first=False):
  """Send |sig| to the process, quietly ignoring an already-dead process.

  Args:
    sig: Signal to send.
    log_level: The log level of log messages.
    first: Whether this is the first signal we've sent.
  """
  self._killing.set()
  self._WaitForStartup()

  if logging.getLogger().isEnabledFor(log_level):
    # Dump debug information about the hanging process.
    logging.log(log_level, 'Killing %r (sig=%r %s)', self.pid, sig,
                signals.StrSignal(sig))

    if first:
      # On the first signal only, walk the process tree (the main pid plus
      # every direct child found via pgrep) and dump per-pid debug info.
      parent_pid = str(self.pid)
      pgrep_output = self._DebugRunCommand(
          ('pgrep', '-P', parent_pid), debug_level=log_level, print_cmd=False,
          error_code_ok=True, capture_output=True)
      for target_pid in [parent_pid] + pgrep_output.splitlines():
        self._DumpDebugPid(log_level, target_pid)

  try:
    os.kill(self.pid, sig)
  except OSError as ex:
    if ex.errno == errno.ESRCH:
      # No such process: it already exited, which is exactly what we want.
      return
    raise
def run(self):
  """Runs the test in a proper environment (e.g. qemu).

  Forks a child that is chrooted into |self.sysroot| (optionally dropping
  root) and exec's the test binary, while the parent stays behind as a
  watcher/subreaper that reaps orphans, kills leaked processes, reports the
  result, and finally exits with the child's status.
  """
  # We know these pre-tests are fast (especially if they've already been run
  # once), so run them automatically for the user if they test by hand.
  self.pre_test()

  # Bind-mount the required host paths into the sysroot; the conditional set
  # is skipped for entries that are symlinks on the host.
  paths_to_mount = (list(self._BIND_MOUNT_PATHS) + [
      mount for mount in self._BIND_MOUNT_IF_NOT_SYMLINK_PATHS
      if not os.path.islink('/' + mount)
  ])
  for mount in paths_to_mount:
    path = os.path.join(self.sysroot, mount)
    osutils.SafeMakedirs(path)
    osutils.Mount('/' + mount, path, 'none', osutils.MS_BIND)

  # Merge the built-in gtest filters with any user-supplied ones into the
  # standard "positive_patterns-negative_patterns" gtest filter string.
  positive_filters = self.gtest_filter[0]
  negative_filters = self.gtest_filter[1]
  if self.user_gtest_filter:
    positive_filters += self.user_gtest_filter[0]
    negative_filters += self.user_gtest_filter[1]

  filters = (':'.join(positive_filters), ':'.join(negative_filters))
  gtest_filter = '%s-%s' % filters

  cmd = self.removeSysrootPrefix(self.bin)
  argv = self.args[:]
  argv[0] = self.removeSysrootPrefix(argv[0])
  # '-' means both filter halves are empty, i.e. no filtering requested.
  if gtest_filter != '-':
    argv.append('--gtest_filter=' + gtest_filter)

  # Some programs expect to find data files via $CWD, so doing a chroot
  # and dropping them into / would make them fail.
  cwd = self.removeSysrootPrefix(os.getcwd())

  # Make orphaned child processes reparent to this process instead of the init
  # process. This allows us to kill them if they do not terminate after the
  # test has finished running.
  _MakeProcessSubreaper()

  # Fork off a child to run the test.  This way we can make tweaks to the
  # env that only affect the child (gid/uid/chroot/cwd/etc...).  We have
  # to fork anyways to run the test, so might as well do it all ourselves
  # to avoid (slow) chaining through programs like:
  #   sudo -u $SUDO_UID -g $SUDO_GID chroot $SYSROOT bash -c 'cd $CWD; $BIN'
  child = os.fork()
  if child == 0:
    print('chroot: %s' % self.sysroot)
    print('cwd: %s' % cwd)
    print('cmd: {%s} %s' % (cmd, ' '.join(map(repr, argv))))
    os.chroot(self.sysroot)
    os.chdir(cwd)

    # Set the child's pgid to its pid, so we can kill any processes that the
    # child creates after the child terminates.
    os.setpgid(0, 0)

    # Remove sysroot from path environment variables.
    for var in ('OUT', 'SRC', 'T'):
      if var in os.environ:
        os.environ[var] = self.removeSysrootPrefix(os.environ[var])

    # The TERM the user is leveraging might not exist in the sysroot.
    # Force a sane default that supports standard color sequences.
    os.environ['TERM'] = 'ansi'
    # Some progs want this like bash else they get super confused.
    os.environ['PWD'] = cwd
    os.environ['GTEST_COLOR'] = 'yes'
    if not self.run_as_root:
      user, uid, gid, home = self.GetNonRootAccount()
      # Order matters: drop the gid before the uid, as a non-root uid can no
      # longer change its gid.
      os.setgid(gid)
      os.setuid(uid)
      os.environ['HOME'] = home
      os.environ['USER'] = user
    # execvp only returns on failure; sys.exit covers that error path.
    sys.exit(os.execvp(cmd, argv))

  proctitle.settitle('sysroot watcher', cmd)

  # Mask SIGINT with the assumption that the child will catch & process it.
  # We'll pass that back up below.
  signal.signal(signal.SIGINT, signal.SIG_IGN)

  # Reap any processes that were reparented to us until the child exits.
  status = _ReapUntilProcessExits(child)

  # NOTE(review): get_children() is the pre-2.0 psutil spelling of
  # children(); presumably the pinned psutil here still provides it.
  leaked_children = psutil.Process().get_children(recursive=True)
  if leaked_children:
    # It's possible the child forked and the forked processes are still
    # running.  Kill the forked processes.
    try:
      os.killpg(child, signal.SIGTERM)
    except OSError as e:
      # ESRCH: the process group is already gone; anything else is reported.
      if e.errno != errno.ESRCH:
        print(
            'Warning: while trying to kill pgid %s caught exception\n%s' %
            (child, e), file=sys.stderr)

    # Kill any orphaned processes originally created by the test that were in
    # a different process group.  This will also kill any processes that did
    # not respond to the SIGTERM.
    # NOTE(review): this loop rebinds |child| (previously the forked pid) to
    # psutil.Process objects; the pid is not needed after this point.
    for child in leaked_children:
      try:
        child.kill()
      except psutil.NoSuchProcess:
        pass

  # Translate the raw wait status into a human-readable failure message.
  failmsg = None
  if os.WIFSIGNALED(status):
    sig = os.WTERMSIG(status)
    failmsg = 'signal %s(%i)' % (signals.StrSignal(sig), sig)
  else:
    exit_status = os.WEXITSTATUS(status)
    if exit_status:
      failmsg = 'exit code %i' % exit_status
  if failmsg:
    print('Error: %s: failed with %s' % (cmd, failmsg), file=sys.stderr)

  if leaked_children:
    for p in leaked_children:
      print(
          'Error: the test leaked process %s with pid %s (it was forcefully'
          ' killed)' % (p.name(), p.pid), file=sys.stderr)
    # TODO(vapier): Make this an error.  We need to track down some scenarios
    # where processes do leak though before we can make this fatal :(.
    #sys.exit(100)

  # Presumably exits this process mirroring the child's status (exit code or
  # signal) -- confirm against process_util.ExitAsStatus.
  process_util.ExitAsStatus(status)
def _CrashCheck(ret, msg): if ret < 0: logging.PrintBuildbotStepWarnings() logging.warning('dump_syms crashed with %s; %s', signals.StrSignal(-ret), msg)
def run(self):
  """Runs the test in a proper environment (e.g. qemu).

  Forks a child that is chrooted into |self.sysroot| (optionally dropping
  root) and exec's the test binary; the parent waits for it, reports any
  failure, and exits with the child's status.
  """
  # We know these pre-tests are fast (especially if they've already been run
  # once), so run them automatically for the user if they test by hand.
  self.pre_test()

  # Bind-mount the required host paths into the sysroot so the chrooted
  # test still sees /proc, /dev, etc.
  for mount in self._BIND_MOUNT_PATHS:
    path = os.path.join(self.sysroot, mount)
    osutils.SafeMakedirs(path)
    osutils.Mount('/' + mount, path, 'none', osutils.MS_BIND)

  # Merge the built-in gtest filters with any user-supplied ones into the
  # standard "positive_patterns-negative_patterns" gtest filter string.
  positive_filters = self.gtest_filter[0]
  negative_filters = self.gtest_filter[1]
  if self.user_gtest_filter:
    positive_filters += self.user_gtest_filter[0]
    negative_filters += self.user_gtest_filter[1]
  filters = (':'.join(positive_filters), ':'.join(negative_filters))
  gtest_filter = '%s-%s' % filters

  cmd = self.removeSysrootPrefix(self.bin)
  argv = self.args[:]
  argv[0] = self.removeSysrootPrefix(argv[0])
  # '-' means both filter halves are empty, i.e. no filtering requested.
  if gtest_filter != '-':
    argv.append('--gtest_filter=' + gtest_filter)

  # Some programs expect to find data files via $CWD, so doing a chroot
  # and dropping them into / would make them fail.
  cwd = self.removeSysrootPrefix(os.getcwd())

  # Fork off a child to run the test.  This way we can make tweaks to the
  # env that only affect the child (gid/uid/chroot/cwd/etc...).  We have
  # to fork anyways to run the test, so might as well do it all ourselves
  # to avoid (slow) chaining through programs like:
  #   sudo -u $SUDO_UID -g $SUDO_GID chroot $SYSROOT bash -c 'cd $CWD; $BIN'
  child = os.fork()
  if child == 0:
    print('chroot: %s' % self.sysroot)
    print('cwd: %s' % cwd)
    print('cmd: {%s} %s' % (cmd, ' '.join(map(repr, argv))))
    os.chroot(self.sysroot)
    os.chdir(cwd)
    # The TERM the user is leveraging might not exist in the sysroot.
    # Force a sane default that supports standard color sequences.
    os.environ['TERM'] = 'ansi'
    # Some progs want this like bash else they get super confused.
    os.environ['PWD'] = cwd
    if not self.run_as_root:
      _, uid, gid, home = self.GetNonRootAccount()
      # Order matters: drop the gid before the uid, as a non-root uid can no
      # longer change its gid.
      os.setgid(gid)
      os.setuid(uid)
      os.environ['HOME'] = home
    # execvp only returns on failure; sys.exit covers that error path.
    sys.exit(os.execvp(cmd, argv))

  proctitle.settitle('sysroot watcher', cmd)

  # Mask SIGINT with the assumption that the child will catch & process it.
  # We'll pass that back up below.
  signal.signal(signal.SIGINT, signal.SIG_IGN)
  _, status = os.waitpid(child, 0)

  # Translate the raw wait status into a human-readable failure message.
  failmsg = None
  if os.WIFSIGNALED(status):
    sig = os.WTERMSIG(status)
    failmsg = 'signal %s(%i)' % (signals.StrSignal(sig), sig)
  else:
    exit_status = os.WEXITSTATUS(status)
    if exit_status:
      failmsg = 'exit code %i' % exit_status
  if failmsg:
    print('Error: %s: failed with %s' % (cmd, failmsg), file=sys.stderr)

  # Presumably exits this process mirroring the child's status (exit code or
  # signal) -- confirm against process_util.ExitAsStatus.
  process_util.ExitAsStatus(status)
def UploadSymbols(board=None, official=False, server=None, breakpad_dir=None,
                  file_limit=DEFAULT_FILE_LIMIT, sleep=DEFAULT_SLEEP_DELAY,
                  upload_limit=None, sym_paths=None, failed_list=None,
                  root=None, retry=True, dedupe_namespace=None,
                  product_name='ChromeOS'):
  """Upload all the generated symbols for |board| to the crash server

  You can use in a few ways:
    * pass |board| to locate all of its symbols
    * pass |breakpad_dir| to upload all the symbols in there
    * pass |sym_paths| to upload specific symbols (or dirs of symbols)

  Args:
    board: The board whose symbols we wish to upload
    official: Use the official symbol server rather than the staging one
    server: Explicit server to post symbols to
    breakpad_dir: The full path to the breakpad directory where symbols live
    file_limit: The max file size of a symbol file before we try to strip it
    sleep: How long to sleep in between uploads
    upload_limit: If set, only upload this many symbols (meant for testing)
    sym_paths: Specific symbol files (or dirs of sym files) to upload,
      otherwise search |breakpad_dir|
    failed_list: Write the names of all sym files we did not upload; can be a
      filename or file-like object.
    root: The tree to prefix to |breakpad_dir| (if |breakpad_dir| is not set)
    retry: Whether we should retry failures.
    dedupe_namespace: The isolateserver namespace to dedupe uploaded symbols.
    product_name: A string for stats purposes. Usually 'ChromeOS' or 'Android'.

  Returns:
    The number of errors that were encountered.
  """
  # Pick the upload endpoint: explicit |server| wins, else official/staging.
  if server is None:
    if official:
      upload_url = OFFICIAL_UPLOAD_URL
    else:
      logging.warning('unofficial builds upload to the staging server')
      upload_url = STAGING_UPLOAD_URL
  else:
    upload_url = server

  if sym_paths:
    logging.info('uploading specified symbols to %s', upload_url)
  else:
    if breakpad_dir is None:
      if root is None:
        raise ValueError('breakpad_dir requires root to be set')
      breakpad_dir = os.path.join(
          root,
          cros_generate_breakpad_symbols.FindBreakpadDir(board).lstrip('/'))
    logging.info('uploading all symbols to %s from %s', upload_url,
                 breakpad_dir)
    sym_paths = [breakpad_dir]

  # We use storage_query to ask the server about existing symbols.  The
  # storage_notify_proc process is used to post updates to the server.  We
  # cannot safely share the storage object between threads/processes, but
  # we also want to minimize creating new ones as each object has to init
  # new state (like server connections).
  storage_query = None
  if dedupe_namespace:
    dedupe_limit = DEDUPE_LIMIT
    dedupe_queue = multiprocessing.Queue()
    try:
      with timeout_util.Timeout(DEDUPE_TIMEOUT):
        storage_query = isolateserver.get_storage_api(
            constants.ISOLATESERVER, dedupe_namespace)
    except Exception:
      # Dedupe is best-effort: if the server is unreachable we upload
      # everything rather than fail the whole run.
      logging.warning('initializing dedupe server connection failed',
                      exc_info=True)
  else:
    # Without dedupe, batch size 1 means symbols upload as soon as found.
    dedupe_limit = 1
    dedupe_queue = None
  # Can't use parallel.BackgroundTaskRunner because that'll create multiple
  # processes and we want only one the whole time (see comment above).
  storage_notify_proc = multiprocessing.Process(
      target=SymbolDeduplicatorNotify, args=(dedupe_namespace, dedupe_queue))

  # Shared counters for the background uploader processes: 'i' = int error
  # count, 'f' = float error watermark.
  bg_errors = multiprocessing.Value('i')
  watermark_errors = multiprocessing.Value('f')
  failed_queue = multiprocessing.Queue()
  uploader = functools.partial(
      UploadSymbol, upload_url, product_name=product_name,
      file_limit=file_limit, sleep=sleep, num_errors=bg_errors,
      watermark_errors=watermark_errors, failed_queue=failed_queue,
      passed_queue=dedupe_queue)

  start_time = datetime.datetime.now()
  Counters = cros_build_lib.Collection(
      'Counters', upload_limit=upload_limit, uploaded_count=0,
      deduped_count=0)
  counters = Counters()

  def _Upload(queue, counters, files):
    # Dedupe |files| against the server, then enqueue the symbols the server
    # is still missing (subject to the upload limit); updates |counters|.
    if not files:
      return

    missing_count = 0
    for item in SymbolDeduplicator(storage_query, files):
      missing_count += 1

      # upload_limit == 0 means the quota is exhausted; keep iterating only
      # to finish counting what the dedupe server reported as missing.
      if counters.upload_limit == 0:
        continue

      queue.put((item,))
      counters.uploaded_count += 1
      if counters.upload_limit is not None:
        counters.upload_limit -= 1

    # Everything the server already had counts as deduped.
    counters.deduped_count += (len(files) - missing_count)

  try:
    storage_notify_proc.start()

    with osutils.TempDir(prefix='upload_symbols.') as tempdir:
      # For the first run, we collect the symbols that failed.  If the
      # overall failure rate was low, we'll retry them on the second run.
      # NOTE(review): this loop deliberately rebinds the |retry| parameter;
      # the second pass always runs with retry=False.
      for retry in (retry, False):
        # We need to limit ourselves to one upload at a time to avoid the
        # server kicking in DoS protection.  See these bugs for more details:
        #   http://crbug.com/209442
        #   http://crbug.com/212496
        with parallel.BackgroundTaskRunner(uploader, processes=1) as queue:
          dedupe_list = []
          for sym_file in SymbolFinder(tempdir, sym_paths):
            dedupe_list.append(sym_file)
            dedupe_len = len(dedupe_list)
            # Flush a batch once it reaches the dedupe batch size, or early
            # when it already covers the remaining upload quota.
            if dedupe_len < dedupe_limit:
              if (counters.upload_limit is None or
                  dedupe_len < counters.upload_limit):
                continue

            # We check the counter before _Upload so that we don't keep
            # talking to the dedupe server.  Otherwise, we end up sending one
            # symbol at a time to it and that slows things down a lot.
            if counters.upload_limit == 0:
              break

            _Upload(queue, counters, dedupe_list)
            dedupe_list = []
          # Flush whatever partial batch remains after the walk.
          _Upload(queue, counters, dedupe_list)

        # See if we need to retry, and if we haven't failed too many times
        # yet.
        if not retry or ErrorLimitHit(bg_errors, watermark_errors):
          break

        # Drain the failed queue (None is the sentinel) to build the retry
        # list for the second pass; |sym_paths| is rebound on purpose.
        sym_paths = []
        failed_queue.put(None)
        while True:
          sym_path = failed_queue.get()
          if sym_path is None:
            break
          sym_paths.append(sym_path)

        if sym_paths:
          logging.warning('retrying %i symbols', len(sym_paths))
          if counters.upload_limit is not None:
            counters.upload_limit += len(sym_paths)
          # Decrement the error count in case we recover in the second pass.
          assert bg_errors.value >= len(sym_paths), \
              'more failed files than errors?'
          bg_errors.value -= len(sym_paths)
        else:
          # No failed symbols, so just return now.
          break

    # If the user has requested it, save all the symbol files that we failed
    # to upload to a listing file.  This should help with recovery efforts
    # later.
    failed_queue.put(None)
    WriteQueueToFile(failed_list, failed_queue, breakpad_dir)

  finally:
    logging.info('finished uploading; joining background process')
    if dedupe_queue:
      # None tells the notify process to shut down once the queue drains.
      dedupe_queue.put(None)

    # The notification might be slow going, so give it some time to finish.
    # We have to poll here as the process monitor is watching for output and
    # will kill us if we go silent for too long.
    wait_minutes = DEDUPE_NOTIFY_TIMEOUT
    while storage_notify_proc.is_alive() and wait_minutes > 0:
      if dedupe_queue:
        qsize = str(dedupe_queue.qsize())
      else:
        qsize = '[None]'
      logging.info('waiting up to %i minutes for ~%s notifications',
                   wait_minutes, qsize)
      # join() with a timeout doubles as a one-minute sleep per iteration.
      storage_notify_proc.join(60)
      wait_minutes -= 1

    # The process is taking too long, so kill it and complain.
    if storage_notify_proc.is_alive():
      logging.warning('notification process took too long')
      logging.PrintBuildbotStepWarnings()

      # Kill it gracefully first (traceback) before tacking it down harder.
      pid = storage_notify_proc.pid
      for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGKILL):
        logging.warning('sending %s to %i', signals.StrSignal(sig), pid)
        # The process might have exited between the last check and the
        # actual kill below, so ignore ESRCH errors.
        try:
          os.kill(pid, sig)
        except OSError as e:
          if e.errno == errno.ESRCH:
            break
          else:
            raise
        time.sleep(5)
        if not storage_notify_proc.is_alive():
          break

      # Drain the queue so we don't hang when we finish.
      try:
        logging.warning('draining the notify queue manually')
        with timeout_util.Timeout(60):
          try:
            while dedupe_queue.get_nowait():
              pass
          except Queue.Empty:
            pass
      except timeout_util.TimeoutError:
        logging.warning('draining the notify queue failed; trashing it')
        dedupe_queue.cancel_join_thread()

  logging.info('uploaded %i symbols (%i were deduped) which took: %s',
               counters.uploaded_count, counters.deduped_count,
               datetime.datetime.now() - start_time)

  return bg_errors.value