  def Kill(self, sig, log_level, first=False):
    """Kill process with signal, ignoring if the process is dead.

    Args:
      sig: Signal to send.
      log_level: The log level of log messages.
      first: Whether this is the first signal we've sent.
    """
    self._killing.set()
    self._WaitForStartup()
    if logging.getLogger().isEnabledFor(log_level):
      # Dump debug information about the hanging process.
      logging.log(log_level, 'Killing %r (sig=%r %s)', self.pid, sig,
                  signals.StrSignal(sig))

      if first:
        ppid = str(self.pid)
        output = self._DebugRunCommand(
            ('pgrep', '-P', ppid), debug_level=log_level, print_cmd=False,
            error_code_ok=True, capture_output=True)
        for pid in [ppid] + output.splitlines():
          self._DumpDebugPid(log_level, pid)

    try:
      os.kill(self.pid, sig)
    except OSError as ex:
      if ex.errno != errno.ESRCH:
        raise
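
# `_DumpDebugPid` is used above but not shown here.  A minimal sketch of what
# such a helper might do, assuming plain /proc reads are enough; the real
# helper may differ (e.g. it could attach gdb for a full backtrace).
def _DumpDebugPidSketch(log_level, pid):
    """Log /proc diagnostics for |pid| to help debug a hung process."""
    for name in ('cmdline', 'status', 'wchan'):
        path = '/proc/%s/%s' % (pid, name)
        try:
            with open(path) as f:
                logging.log(log_level, '%s: %s', path, f.read().strip())
        except (IOError, OSError):
            # The process may have exited between pgrep and this read.
            pass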
    def run(self):
        """Runs the test in a proper environment (e.g. qemu)."""

        # We know these pre-tests are fast (especially if they've already been run
        # once), so run them automatically for the user if they test by hand.
        self.pre_test()

        paths_to_mount = (list(self._BIND_MOUNT_PATHS) + [
            mount for mount in self._BIND_MOUNT_IF_NOT_SYMLINK_PATHS
            if not os.path.islink('/' + mount)
        ])
        for mount in paths_to_mount:
            path = os.path.join(self.sysroot, mount)
            osutils.SafeMakedirs(path)
            osutils.Mount('/' + mount, path, 'none', osutils.MS_BIND)

        positive_filters = self.gtest_filter[0]
        negative_filters = self.gtest_filter[1]

        if self.user_gtest_filter:
            positive_filters += self.user_gtest_filter[0]
            negative_filters += self.user_gtest_filter[1]

        filters = (':'.join(positive_filters), ':'.join(negative_filters))
        gtest_filter = '%s-%s' % filters

        cmd = self.removeSysrootPrefix(self.bin)
        argv = self.args[:]
        argv[0] = self.removeSysrootPrefix(argv[0])
        if gtest_filter != '-':
            argv.append('--gtest_filter=' + gtest_filter)

        # Some programs expect to find data files via $CWD, so doing a chroot
        # and dropping them into / would make them fail.
        cwd = self.removeSysrootPrefix(os.getcwd())

        # Make orphaned child processes reparent to this process instead of the init
        # process.  This allows us to kill them if they do not terminate after the
        # test has finished running.
        _MakeProcessSubreaper()

        # Fork off a child to run the test.  This way we can make tweaks to the
        # env that only affect the child (gid/uid/chroot/cwd/etc...).  We have
        # to fork anyway to run the test, so we might as well do it all ourselves
        # to avoid (slow) chaining through programs like:
        #   sudo -u $SUDO_UID -g $SUDO_GID chroot $SYSROOT bash -c 'cd $CWD; $BIN'
        child = os.fork()
        if child == 0:
            print('chroot: %s' % self.sysroot)
            print('cwd: %s' % cwd)
            print('cmd: {%s} %s' % (cmd, ' '.join(map(repr, argv))))
            os.chroot(self.sysroot)
            os.chdir(cwd)

            # Put the child in its own process group (pgid == pid) so that,
            # after the child terminates, we can kill any processes it spawned.
            os.setpgid(0, 0)

            # Remove sysroot from path environment variables.
            for var in ('OUT', 'SRC', 'T'):
                if var in os.environ:
                    os.environ[var] = self.removeSysrootPrefix(os.environ[var])

            # The user's TERM may not exist in the sysroot.  Force a sane
            # default that supports standard color sequences.
            os.environ['TERM'] = 'ansi'
            # Some programs expect $PWD to be set (as bash does) and get
            # confused otherwise.
            os.environ['PWD'] = cwd
            os.environ['GTEST_COLOR'] = 'yes'
            if not self.run_as_root:
                user, uid, gid, home = self.GetNonRootAccount()
                os.setgid(gid)
                os.setuid(uid)
                os.environ['HOME'] = home
                os.environ['USER'] = user
            sys.exit(os.execvp(cmd, argv))

        proctitle.settitle('sysroot watcher', cmd)

        # Mask SIGINT with the assumption that the child will catch & process it.
        # We'll pass that back up below.
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Reap any processes that were reparented to us until the child exits.
        status = _ReapUntilProcessExits(child)

        # Find any processes that were reparented to us and are still running.
        leaked_children = psutil.Process().children(recursive=True)
        if leaked_children:
            # It's possible the child forked and the forked processes are still
            # running.  Kill the forked processes.
            try:
                os.killpg(child, signal.SIGTERM)
            except OSError as e:
                if e.errno != errno.ESRCH:
                    print(
                        'Warning: while trying to kill pgid %s caught exception\n%s'
                        % (child, e),
                        file=sys.stderr)

            # Kill any orphaned processes originally created by the test that were in
            # a different process group.  This will also kill any processes that did
            # not respond to the SIGTERM.
            for proc in leaked_children:
                try:
                    proc.kill()
                except psutil.NoSuchProcess:
                    pass

        failmsg = None
        if os.WIFSIGNALED(status):
            sig = os.WTERMSIG(status)
            failmsg = 'signal %s(%i)' % (signals.StrSignal(sig), sig)
        else:
            exit_status = os.WEXITSTATUS(status)
            if exit_status:
                failmsg = 'exit code %i' % exit_status
        if failmsg:
            print('Error: %s: failed with %s' % (cmd, failmsg),
                  file=sys.stderr)

        if leaked_children:
            for p in leaked_children:
                print(
                    'Error: the test leaked process %s with pid %s (it was forcefully'
                    ' killed)' % (p.name(), p.pid),
                    file=sys.stderr)
            # TODO(vapier): Make this an error.  We still need to track down
            # some scenarios where processes do leak before we can make this
            # fatal :(.
            #sys.exit(100)

        process_util.ExitAsStatus(status)
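
# The run() above leans on two helpers that are not shown.  Minimal sketches
# of what they might look like on Linux; PR_SET_CHILD_SUBREAPER (36) comes
# from <linux/prctl.h>, everything else is an assumption about the real code.
import ctypes
import ctypes.util

def _MakeProcessSubreaperSketch():
    """Mark this process as a child subreaper (Linux >= 3.4).

    Orphaned descendants then reparent to us instead of init, so we can reap
    and kill them once the test finishes.
    """
    PR_SET_CHILD_SUBREAPER = 36
    libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
    if libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0:
        err = ctypes.get_errno()
        raise OSError(err, os.strerror(err))

def _ReapUntilProcessExitsSketch(monitored_pid):
    """Reap children (including reparented orphans) until |monitored_pid| exits.

    Returns the wait status of |monitored_pid|.
    """
    while True:
        (pid, status) = os.wait()
        if pid == monitored_pid:
            return status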
def _CrashCheck(ret, msg):
    """Warn if dump_syms crashed (a negative return code means a signal)."""
    if ret < 0:
        logging.PrintBuildbotStepWarnings()
        logging.warning('dump_syms crashed with %s; %s',
                        signals.StrSignal(-ret), msg)
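
# A hypothetical call site for _CrashCheck, matching the RunCommand style used
# elsewhere in this file; the dump_syms command line and helper name are
# assumptions for illustration only.
def _GenerateSymbolsSketch(elf_file):
    result = cros_build_lib.RunCommand(
        ['dump_syms', elf_file], error_code_ok=True, redirect_stderr=True)
    _CrashCheck(result.returncode, 'while dumping symbols for %s' % elf_file)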
    def run(self):
        """Runs the test in a proper environment (e.g. qemu)."""

        # We know these pre-tests are fast (especially if they've already been run
        # once), so run them automatically for the user if they test by hand.
        self.pre_test()

        for mount in self._BIND_MOUNT_PATHS:
            path = os.path.join(self.sysroot, mount)
            osutils.SafeMakedirs(path)
            osutils.Mount('/' + mount, path, 'none', osutils.MS_BIND)

        positive_filters = self.gtest_filter[0]
        negative_filters = self.gtest_filter[1]

        if self.user_gtest_filter:
            positive_filters += self.user_gtest_filter[0]
            negative_filters += self.user_gtest_filter[1]

        filters = (':'.join(positive_filters), ':'.join(negative_filters))
        gtest_filter = '%s-%s' % filters

        cmd = self.removeSysrootPrefix(self.bin)
        argv = self.args[:]
        argv[0] = self.removeSysrootPrefix(argv[0])
        if gtest_filter != '-':
            argv.append('--gtest_filter=' + gtest_filter)

        # Some programs expect to find data files via $CWD, so doing a chroot
        # and dropping them into / would make them fail.
        cwd = self.removeSysrootPrefix(os.getcwd())

        # Fork off a child to run the test.  This way we can make tweaks to the
        # env that only affect the child (gid/uid/chroot/cwd/etc...).  We have
        # to fork anyway to run the test, so we might as well do it all ourselves
        # to avoid (slow) chaining through programs like:
        #   sudo -u $SUDO_UID -g $SUDO_GID chroot $SYSROOT bash -c 'cd $CWD; $BIN'
        child = os.fork()
        if child == 0:
            print('chroot: %s' % self.sysroot)
            print('cwd: %s' % cwd)
            print('cmd: {%s} %s' % (cmd, ' '.join(map(repr, argv))))
            os.chroot(self.sysroot)
            os.chdir(cwd)
            # The user's TERM may not exist in the sysroot.  Force a sane
            # default that supports standard color sequences.
            os.environ['TERM'] = 'ansi'
            # Some programs expect $PWD to be set (as bash does) and get
            # confused otherwise.
            os.environ['PWD'] = cwd
            if not self.run_as_root:
                _, uid, gid, home = self.GetNonRootAccount()
                os.setgid(gid)
                os.setuid(uid)
                os.environ['HOME'] = home
            sys.exit(os.execvp(cmd, argv))

        proctitle.settitle('sysroot watcher', cmd)

        # Mask SIGINT with the assumption that the child will catch & process it.
        # We'll pass that back up below.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        _, status = os.waitpid(child, 0)

        failmsg = None
        if os.WIFSIGNALED(status):
            sig = os.WTERMSIG(status)
            failmsg = 'signal %s(%i)' % (signals.StrSignal(sig), sig)
        else:
            exit_status = os.WEXITSTATUS(status)
            if exit_status:
                failmsg = 'exit code %i' % exit_status
        if failmsg:
            print('Error: %s: failed with %s' % (cmd, failmsg),
                  file=sys.stderr)

        process_util.ExitAsStatus(status)
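
# process_util.ExitAsStatus() makes this process exit exactly as the child
# did.  A minimal sketch of the idea (the real implementation lives in
# chromite's process_util module; treat this as an approximation):
def ExitAsStatusSketch(status):
    """Exit so our parent sees the same wait status the child produced."""
    if os.WIFSIGNALED(status):
        # Die by the same signal: restore the default handler, then signal
        # ourselves so the kernel records a signal death.
        sig = os.WTERMSIG(status)
        signal.signal(sig, signal.SIG_DFL)
        os.kill(os.getpid(), sig)
        time.sleep(0.1)  # Give the signal time to be delivered.
        sys.exit(127)    # Fallback in case the signal did not kill us.
    sys.exit(os.WEXITSTATUS(status))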
def UploadSymbols(board=None,
                  official=False,
                  server=None,
                  breakpad_dir=None,
                  file_limit=DEFAULT_FILE_LIMIT,
                  sleep=DEFAULT_SLEEP_DELAY,
                  upload_limit=None,
                  sym_paths=None,
                  failed_list=None,
                  root=None,
                  retry=True,
                  dedupe_namespace=None,
                  product_name='ChromeOS'):
    """Upload all the generated symbols for |board| to the crash server

  You can use in a few ways:
    * pass |board| to locate all of its symbols
    * pass |breakpad_dir| to upload all the symbols in there
    * pass |sym_paths| to upload specific symbols (or dirs of symbols)

  Args:
    board: The board whose symbols we wish to upload
    official: Use the official symbol server rather than the staging one
    server: Explicit server to post symbols to
    breakpad_dir: The full path to the breakpad directory where symbols live
    file_limit: The max file size of a symbol file before we try to strip it
    sleep: How long to sleep in between uploads
    upload_limit: If set, only upload this many symbols (meant for testing)
    sym_paths: Specific symbol files (or dirs of sym files) to upload,
      otherwise search |breakpad_dir|
    failed_list: Write the names of all sym files we did not upload; can be a
      filename or file-like object.
    root: The tree to prefix to |breakpad_dir| (if |breakpad_dir| is not set)
    retry: Whether we should retry failures.
    dedupe_namespace: The isolateserver namespace to dedupe uploaded symbols.
    product_name: A string for stats purposes. Usually 'ChromeOS' or 'Android'.

  Returns:
    The number of errors that were encountered.
  """
    if server is None:
        if official:
            upload_url = OFFICIAL_UPLOAD_URL
        else:
            logging.warning('unofficial builds upload to the staging server')
            upload_url = STAGING_UPLOAD_URL
    else:
        upload_url = server

    if sym_paths:
        logging.info('uploading specified symbols to %s', upload_url)
    else:
        if breakpad_dir is None:
            if root is None:
                raise ValueError('breakpad_dir requires root to be set')
            breakpad_dir = os.path.join(
                root,
                cros_generate_breakpad_symbols.FindBreakpadDir(board).lstrip(
                    '/'))
        logging.info('uploading all symbols to %s from %s', upload_url,
                     breakpad_dir)
        sym_paths = [breakpad_dir]

    # We use storage_query to ask the server about existing symbols.  The
    # storage_notify_proc process is used to post updates to the server.  We
    # cannot safely share the storage object between threads/processes, but
    # we also want to minimize creating new ones as each object has to init
    # new state (like server connections).
    storage_query = None
    if dedupe_namespace:
        dedupe_limit = DEDUPE_LIMIT
        dedupe_queue = multiprocessing.Queue()
        try:
            with timeout_util.Timeout(DEDUPE_TIMEOUT):
                storage_query = isolateserver.get_storage_api(
                    constants.ISOLATESERVER, dedupe_namespace)
        except Exception:
            logging.warning('initializing dedupe server connection failed',
                            exc_info=True)
    else:
        dedupe_limit = 1
        dedupe_queue = None
    # Can't use parallel.BackgroundTaskRunner because that'll create multiple
    # processes, and we want only one process the whole time (see the comment
    # above).
    storage_notify_proc = multiprocessing.Process(
        target=SymbolDeduplicatorNotify, args=(dedupe_namespace, dedupe_queue))

    bg_errors = multiprocessing.Value('i')
    watermark_errors = multiprocessing.Value('f')
    failed_queue = multiprocessing.Queue()
    uploader = functools.partial(UploadSymbol,
                                 upload_url,
                                 product_name=product_name,
                                 file_limit=file_limit,
                                 sleep=sleep,
                                 num_errors=bg_errors,
                                 watermark_errors=watermark_errors,
                                 failed_queue=failed_queue,
                                 passed_queue=dedupe_queue)

    start_time = datetime.datetime.now()
    Counters = cros_build_lib.Collection('Counters',
                                         upload_limit=upload_limit,
                                         uploaded_count=0,
                                         deduped_count=0)
    counters = Counters()

    def _Upload(queue, counters, files):
        if not files:
            return

        missing_count = 0
        for item in SymbolDeduplicator(storage_query, files):
            missing_count += 1

            if counters.upload_limit == 0:
                continue

            queue.put((item, ))
            counters.uploaded_count += 1
            if counters.upload_limit is not None:
                counters.upload_limit -= 1

        counters.deduped_count += (len(files) - missing_count)

    try:
        storage_notify_proc.start()

        with osutils.TempDir(prefix='upload_symbols.') as tempdir:
            # For the first run, we collect the symbols that failed.  If the
            # overall failure rate was low, we'll retry them on the second run.
            for retry in (retry, False):
                # We need to limit ourselves to one upload at a time to avoid the server
                # kicking in DoS protection.  See these bugs for more details:
                # http://crbug.com/209442
                # http://crbug.com/212496
                with parallel.BackgroundTaskRunner(uploader,
                                                   processes=1) as queue:
                    dedupe_list = []
                    for sym_file in SymbolFinder(tempdir, sym_paths):
                        dedupe_list.append(sym_file)
                        dedupe_len = len(dedupe_list)
                        if dedupe_len < dedupe_limit:
                            if (counters.upload_limit is None
                                    or dedupe_len < counters.upload_limit):
                                continue

                        # We check the counter before _Upload so that we don't keep talking
                        # to the dedupe server.  Otherwise, we end up sending one symbol at
                        # a time to it and that slows things down a lot.
                        if counters.upload_limit == 0:
                            break

                        _Upload(queue, counters, dedupe_list)
                        dedupe_list = []
                    _Upload(queue, counters, dedupe_list)

                # See if we need to retry, and if we haven't failed too many times yet.
                if not retry or ErrorLimitHit(bg_errors, watermark_errors):
                    break

                sym_paths = []
                failed_queue.put(None)
                while True:
                    sym_path = failed_queue.get()
                    if sym_path is None:
                        break
                    sym_paths.append(sym_path)

                if sym_paths:
                    logging.warning('retrying %i symbols', len(sym_paths))
                    if counters.upload_limit is not None:
                        counters.upload_limit += len(sym_paths)
                    # Decrement the error count in case we recover in the second pass.
                    assert bg_errors.value >= len(sym_paths), \
                           'more failed files than errors?'
                    bg_errors.value -= len(sym_paths)
                else:
                    # No failed symbols, so just return now.
                    break

        # If the user has requested it, save all the symbol files that we failed to
        # upload to a listing file.  This should help with recovery efforts later.
        failed_queue.put(None)
        WriteQueueToFile(failed_list, failed_queue, breakpad_dir)

    finally:
        logging.info('finished uploading; joining background process')
        if dedupe_queue:
            dedupe_queue.put(None)

        # The notification might be slow going, so give it some time to finish.
        # We have to poll here as the process monitor is watching for output and
        # will kill us if we go silent for too long.
        wait_minutes = DEDUPE_NOTIFY_TIMEOUT
        while storage_notify_proc.is_alive() and wait_minutes > 0:
            if dedupe_queue:
                qsize = str(dedupe_queue.qsize())
            else:
                qsize = '[None]'
            logging.info('waiting up to %i minutes for ~%s notifications',
                         wait_minutes, qsize)
            storage_notify_proc.join(60)
            wait_minutes -= 1

        # The process is taking too long, so kill it and complain.
        if storage_notify_proc.is_alive():
            logging.warning('notification process took too long')
            logging.PrintBuildbotStepWarnings()

            # Kill it gracefully first (traceback) before taking it down harder.
            pid = storage_notify_proc.pid
            for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGKILL):
                logging.warning('sending %s to %i', signals.StrSignal(sig),
                                pid)
                # The process might have exited between the last check and the
                # actual kill below, so ignore ESRCH errors.
                try:
                    os.kill(pid, sig)
                except OSError as e:
                    if e.errno == errno.ESRCH:
                        break
                    else:
                        raise
                time.sleep(5)
                if not storage_notify_proc.is_alive():
                    break

            # Drain the queue so we don't hang when we finish.
            try:
                logging.warning('draining the notify queue manually')
                with timeout_util.Timeout(60):
                    try:
                        while dedupe_queue.get_nowait():
                            pass
                    except Queue.Empty:
                        pass
            except timeout_util.TimeoutError:
                logging.warning(
                    'draining the notify queue failed; trashing it')
                dedupe_queue.cancel_join_thread()

    logging.info('uploaded %i symbols (%i were deduped) which took: %s',
                 counters.uploaded_count, counters.deduped_count,
                 datetime.datetime.now() - start_time)

    return bg_errors.value
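
# A hypothetical invocation, following the docstring above: upload (at most
# ten of) the symbols for one board and record failures.  The board name,
# root, and failed-list path are placeholders.
errors = UploadSymbols(board='some-board', official=False, upload_limit=10,
                       root='/path/to/chroot',
                       failed_list='/tmp/failed_syms.txt')
if errors:
    logging.error('%i symbol uploads failed', errors)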