Exemplo n.º 1
0
    def setUp(self, *args, **kwargs):
        """Prepare a namespace-based RunExecutor for the test.

        The test is skipped if container.execute_in_namespace() raises an
        OSError. Otherwise a RunExecutor is stored in self.runexecutor with
        "/" read-only, "/tmp" hidden, and container_system_config disabled.
        Additional positional and keyword arguments are forwarded to
        RunExecutor.
        """
        # Probe with a trivial callable whether namespaces can be used at all.
        try:
            container.execute_in_namespace(lambda: 0)
        except OSError as err:
            reason = os.strerror(err.errno)
            self.skipTest("Namespaces not supported: {}".format(reason))

        container_dir_modes = {
            "/": containerexecutor.DIR_READ_ONLY,
            "/tmp": containerexecutor.DIR_HIDDEN,
        }
        self.runexecutor = RunExecutor(
            *args,
            use_namespaces=True,
            dir_modes=container_dir_modes,
            container_system_config=False,
            **kwargs
        )
Exemplo n.º 2
0
    def setUp(self, *args, **kwargs):
        """Create self.runexecutor for container tests, or skip the test.

        A no-op function is executed via container.execute_in_namespace();
        an OSError from that call causes the test to be skipped with the
        corresponding error description. Extra arguments are passed through
        to the RunExecutor constructor.
        """
        try:
            container.execute_in_namespace(lambda: 0)
        except OSError as os_error:
            message = "Namespaces not supported: {}".format(
                os.strerror(os_error.errno)
            )
            self.skipTest(message)

        # Fixed configuration: read-only root, hidden /tmp, and no
        # container-specific system configuration.
        fixed_options = {
            "use_namespaces": True,
            "dir_modes": {
                "/": containerexecutor.DIR_READ_ONLY,
                "/tmp": containerexecutor.DIR_HIDDEN,
            },
            "container_system_config": False,
        }
        self.runexecutor = RunExecutor(*args, **fixed_options, **kwargs)
Exemplo n.º 3
0
    def setUp(self, *args, **kwargs):
        """Prepare a namespace-based RunExecutor for the test.

        The test is skipped if container.execute_in_namespace() raises an
        OSError. A "dir_modes" keyword argument, if given, replaces the
        default directory modes ("/" read-only, "/home" and "/tmp" hidden);
        all remaining arguments are forwarded to RunExecutor.
        """
        try:
            container.execute_in_namespace(lambda: 0)
        except OSError as err:
            self.skipTest(f"Namespaces not supported: {os.strerror(err.errno)}")

        default_dir_modes = {
            "/": containerexecutor.DIR_READ_ONLY,
            "/home": containerexecutor.DIR_HIDDEN,
            "/tmp": containerexecutor.DIR_HIDDEN,
        }
        # Remove "dir_modes" from kwargs so that it is not passed twice below.
        selected_dir_modes = kwargs.pop("dir_modes", default_dir_modes)

        self.runexecutor = RunExecutor(
            *args, use_namespaces=True, dir_modes=selected_dir_modes, **kwargs
        )
    def _start_execution_in_container(
        self,
        args,
        stdin,
        stdout,
        stderr,
        env,
        root_dir,
        cwd,
        temp_dir,
        memlimit,
        memory_nodes,
        cgroups,
        output_dir,
        result_files_patterns,
        parent_setup_fn,
        child_setup_fn,
        parent_cleanup_fn,
    ):
        """Execute the given command and measure its resource usage similarly to
        super()._start_execution(), but inside a container implemented using Linux
        namespaces.  The command has no network access (only loopback),
        a fresh directory as /tmp and no write access outside of this,
        and it does not see other processes except itself.

        @param args: command line of the tool, passed to subprocess.Popen
            (args[0] is also used in log messages)
        @param stdin: stdin for the tool (passed to subprocess.Popen), may be None
        @param stdout: stdout for the tool (passed to subprocess.Popen), may be None
        @param stderr: stderr for the tool (passed to subprocess.Popen), may be None
        @param env: environment for the tool; updated in place with
            self._env_override if root_dir is None
        @param root_dir: if not None, it is made absolute and handed to
            self._setup_root_filesystem(); otherwise
            self._setup_container_filesystem() is used
        @param cwd: working directory for the tool (made absolute);
            may be None only if root_dir is None (then the current directory is used)
        @param temp_dir: directory passed to self._setup_container_filesystem(),
            also the location from which result files are transferred
        @param memlimit: forwarded to self._setup_container_filesystem()
        @param memory_nodes: forwarded to self._setup_container_filesystem()
        @param cgroups: object whose add_task() is called with the PID of the tool
        @param output_dir: target directory for result files
        @param result_files_patterns: patterns of result files to transfer;
            if empty, no files are transferred
        @param parent_setup_fn: called without arguments in the parent
            after the tool process was added to the cgroups
        @param child_setup_fn: called without arguments in the grandchild
            before it signals readiness
        @param parent_cleanup_fn: called in the parent after the tool terminated
            with the result of parent_setup_fn, the exit code of the tool,
            and the path to the container root as seen from outside
        @return: a tuple of the PID of the tool process (in the outer namespace)
            and a function that waits for the tool and returns a triple of
            raw exit code, resource usage, and the result of parent_cleanup_fn
        @raise BenchExecException: if creating the namespace fails
            or the child reports an error while setting up the container
        @raise OSError: if the child terminates with some other non-zero exit code
        """
        assert self._use_namespaces

        if root_dir is None:
            env.update(self._env_override)

        # We have three processes involved:
        # parent: the current Python process in which RunExecutor is executing
        # child: child process in new namespace (PID 1 in inner namespace),
        #        configures inner namespace, serves as dummy init,
        #        collects result of grandchild and passes it to parent
        # grandchild: child of child process (PID 2 in inner namespace), exec()s tool

        # We need the following communication steps between these processes:
        # 1a) grandchild tells parent its PID (in outer namespace).
        # 1b) grandchild tells parent that it is ready and measurement should begin.
        # 2) parent tells grandchild that measurement has begun and tool should
        #    be exec()ed.
        # 3) child tells parent about return value and resource consumption of
        #    grandchild.
        # 1a and 1b are done together by sending the PID through a pipe.
        # 2 is done by sending a marker byte through a pipe.
        # 3 is done by sending a pickled object through the same pipe as #1.
        # We cannot use the same pipe for both directions, because otherwise a sender
        # might read the bytes it has sent itself.

        # Error codes from child to parent
        CHILD_OSERROR = 128  # noqa: N806 local constant
        CHILD_UNKNOWN_ERROR = 129  # noqa: N806 local constant

        # "downstream" pipe parent->grandchild
        from_parent, to_grandchild = os.pipe()
        # "upstream" pipe grandchild/child->parent
        from_grandchild, to_parent = os.pipe()

        # The protocol for these pipes is that first the parent sends the marker for
        # user mappings, then the grand child sends its outer PID back,
        # and finally the parent sends its completion marker.
        # After the run, the child sends the result of the grand child and then waits
        # for the post_run marker, before it terminates.
        MARKER_USER_MAPPING_COMPLETED = b"A"  # noqa: N806 local constant
        MARKER_PARENT_COMPLETED = b"B"  # noqa: N806 local constant
        MARKER_PARENT_POST_RUN_COMPLETED = b"C"  # noqa: N806 local constant

        # If the current directory is within one of the bind mounts we create,
        # we need to cd into this directory again, otherwise we would not see the
        # bind mount, but the directory behind it.
        # Thus we always set cwd to force a change of directory.
        if root_dir is None:
            cwd = os.path.abspath(cwd or os.curdir)
        else:
            root_dir = os.path.abspath(root_dir)
            cwd = os.path.abspath(cwd)

        def grandchild():
            """Setup everything inside the process that finally exec()s the tool."""
            try:
                # We know that this process has PID 2 in the inner namespace,
                # but we actually need to know its PID in the outer namespace
                # such that parent can put us into the correct cgroups.  According to
                # http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
                # there are two ways to achieve this: sending a message with the PID
                # via a socket (but Python 2 lacks a convenient API for sendmsg),
                # and reading /proc/self in the outer procfs instance
                # (that's what we do).
                my_outer_pid = container.get_my_pid_from_procfs()

                container.mount_proc(self._container_system_config)
                container.drop_capabilities()
                container.reset_signal_handling()
                child_setup_fn()  # Do some other setup the caller wants.

                # Signal readiness to parent by sending our PID
                # and wait until parent is also ready
                os.write(to_parent, str(my_outer_pid).encode())
                received = os.read(from_parent, 1)
                assert received == MARKER_PARENT_COMPLETED, received
            finally:
                # close remaining ends of pipe
                os.close(from_parent)
                os.close(to_parent)
            # here Python will exec() the tool for us

        def child():
            """Setup everything inside the container,
            start the tool, and wait for result.
            Always returns an int that is used as exit code of this process."""
            try:
                logging.debug(
                    "Child: child process of RunExecutor with PID %d started",
                    container.get_my_pid_from_procfs(),
                )

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file
                # descriptors, to avoid keeping other pipes and files open, e.g.,
                # those that the parent uses to communicate with other containers
                # (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually. We keep the relevant ends of our pipes,
                # and stdin/out/err of child and grandchild.
                necessary_fds = {
                    sys.stdin,
                    sys.stdout,
                    sys.stderr,
                    to_parent,
                    from_parent,
                    stdin,
                    stdout,
                    stderr,
                } - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if self._container_system_config:
                        # A standard hostname increases reproducibility.
                        socket.sethostname(container.CONTAINER_HOSTNAME)

                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    # Wait until user mapping is finished,
                    # this is necessary for filesystem writes
                    received = os.read(from_parent,
                                       len(MARKER_USER_MAPPING_COMPLETED))
                    assert received == MARKER_USER_MAPPING_COMPLETED, received

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(
                            temp_dir,
                            output_dir if result_files_patterns else None,
                            memlimit,
                            memory_nodes,
                        )

                    # Marking this process as "non-dumpable" (no core dumps) also
                    # forbids several other ways how other processes can access and
                    # influence it:
                    # ptrace is forbidden and much of /proc/<child>/ is inaccessible.
                    # We set this to prevent the benchmarked tool from messing with this
                    # process or using it to escape from the container. More info:
                    # http://man7.org/linux/man-pages/man5/proc.5.html
                    # It needs to be done after MARKER_USER_MAPPING_COMPLETED.
                    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0,
                               0, 0)
                except OSError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except OSError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        e)
                    return CHILD_OSERROR

                container.setup_seccomp_filter()

                try:
                    grandchild_proc = subprocess.Popen(
                        args,
                        stdin=stdin,
                        stdout=stdout,
                        stderr=stderr,
                        env=env,
                        close_fds=False,
                        preexec_fn=grandchild,
                    )
                except (OSError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                # keep capability for unmount if necessary later
                necessary_capabilities = ([libc.CAP_SYS_ADMIN]
                                          if result_files_patterns else [])
                container.drop_capabilities(keep=necessary_capabilities)

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={
                    sys.stdout, sys.stderr, to_parent, from_parent
                })

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                grandchild_result = container.wait_for_child_and_forward_signals(
                    grandchild_proc.pid, args[0])

                logging.debug(
                    "Child: process %s terminated with exit code %d.",
                    args[0],
                    grandchild_result[0],
                )

                if result_files_patterns:
                    # Remove the bind mount that _setup_container_filesystem added
                    # such that the parent can access the result files.
                    libc.umount(temp_dir.encode())

                # Re-allow access to /proc/<child>/...,
                # this is used by the parent for accessing output files
                libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_USER, 0, 0, 0)

                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                # Now the parent copies the output files, we need to wait until this is
                # finished. If the child terminates, the container file system and its
                # tmpfs go away.
                # The blocking read must not be part of the assert statement,
                # otherwise it would be skipped if Python runs with -O.
                received = os.read(from_parent, 1)
                assert received == MARKER_PARENT_POST_RUN_COMPLETED, received
                os.close(from_parent)

                return 0
            except OSError:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except BaseException:
                # Need to catch everything because this method always needs to return an
                # int (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR

        try:  # parent
            try:
                child_pid = container.execute_in_namespace(
                    child, use_network_ns=not self._allow_network)
            except OSError as e:
                if (e.errno == errno.EPERM and util.try_read_file(
                        "/proc/sys/kernel/unprivileged_userns_clone") == "0"):
                    raise BenchExecException(
                        "Unprivileged user namespaces forbidden on this system, please "
                        "enable them with 'sysctl -w kernel.unprivileged_userns_clone=1' "
                        "or disable container mode")
                elif (e.errno in {errno.ENOSPC, errno.EINVAL} and
                      util.try_read_file("/proc/sys/user/max_user_namespaces")
                      == "0"):
                    # Ubuntu has ENOSPC, Centos seems to produce EINVAL in this case
                    raise BenchExecException(
                        "Unprivileged user namespaces forbidden on this system, please "
                        "enable by using 'sysctl -w user.max_user_namespaces=10000' "
                        "(or another value) or disable container mode")
                else:
                    raise BenchExecException(
                        "Creating namespace for container mode failed: " +
                        os.strerror(e.errno))
            logging.debug(
                "Parent: child process of RunExecutor with PID %d started.",
                child_pid)

            def check_child_exit_code():
                """Check if the child process terminated cleanly
                and raise an error otherwise."""
                child_exitcode, unused_child_rusage = self._wait_for_process(
                    child_pid, args[0])
                child_exitcode = util.ProcessExitCode.from_raw(child_exitcode)
                logging.debug(
                    "Parent: child process of RunExecutor with PID %d"
                    " terminated with %s.",
                    child_pid,
                    child_exitcode,
                )

                if child_exitcode:
                    if child_exitcode.value:
                        if child_exitcode.value == CHILD_OSERROR:
                            # This was an OSError in the child,
                            # details were already logged
                            raise BenchExecException(
                                "execution in container failed, check log for details"
                            )
                        elif child_exitcode.value == CHILD_UNKNOWN_ERROR:
                            raise BenchExecException(
                                "unexpected error in container")
                        raise OSError(child_exitcode.value,
                                      os.strerror(child_exitcode.value))
                    raise OSError(
                        0,
                        "Child process of RunExecutor terminated with " +
                        str(child_exitcode),
                    )

            # Close unnecessary ends of pipes such that read() does not block forever
            # if all other processes have terminated.
            os.close(from_parent)
            os.close(to_parent)

            container.setup_user_mapping(child_pid,
                                         uid=self._uid,
                                         gid=self._gid)
            # signal child to continue
            os.write(to_grandchild, MARKER_USER_MAPPING_COMPLETED)

            try:
                # read at most 10 bytes because this is enough for 32bit int
                grandchild_pid = int(os.read(from_grandchild, 10))
            except ValueError:
                # probably empty read, i.e., pipe closed,
                # i.e., child or grandchild failed
                check_child_exit_code()
                # Raise explicitly instead of "assert False" such that we never
                # continue with an undefined grandchild_pid, even with -O.
                raise AssertionError(
                    "Child process of RunExecutor terminated cleanly"
                    " but did not send expected data.")

            logging.debug(
                "Parent: executing %s in grand child with PID %d"
                " via child with PID %d.",
                args[0],
                grandchild_pid,
                child_pid,
            )

            # start measurements
            cgroups.add_task(grandchild_pid)
            parent_setup = parent_setup_fn()

            # Signal grandchild that setup is finished
            os.write(to_grandchild, MARKER_PARENT_COMPLETED)

            # Copy file descriptor, otherwise we could not close from_grandchild in
            # finally block and would leak a file descriptor in case of exception.
            from_grandchild_copy = os.dup(from_grandchild)
            to_grandchild_copy = os.dup(to_grandchild)
        finally:
            os.close(from_grandchild)
            os.close(to_grandchild)

        def wait_for_grandchild():
            """Wait for the result of the tool, run the parent cleanup,
            transfer result files, and let the child terminate.
            @return: triple of raw exit code, resource usage of the tool,
                and the result of parent_cleanup_fn
            """
            # 1024 bytes ought to be enough for everyone^Wour pickled result
            try:
                received = os.read(from_grandchild_copy, 1024)
            except OSError as e:
                if self.PROCESS_KILLED and e.errno == errno.EINTR:
                    # Read was interrupted because of Ctrl+C, we just try again
                    received = os.read(from_grandchild_copy, 1024)
                else:
                    raise

            if not received:
                # Typically this means the child exited prematurely because an error
                # occurred, and check_child_exitcode() will handle this.
                # We close the pipe first, otherwise child could hang infinitely.
                os.close(from_grandchild_copy)
                os.close(to_grandchild_copy)
                check_child_exit_code()
                # Raise explicitly instead of "assert False" such that we never
                # try to unpickle an empty result, even with -O.
                raise AssertionError(
                    "Child process terminated cleanly without sending result")

            # The data was written by our own child process (see child() above),
            # the grandchild closes its end of this pipe before exec()ing the tool.
            exitcode, ru_child = pickle.loads(received)

            base_path = "/proc/{}/root".format(child_pid)
            parent_cleanup = parent_cleanup_fn(
                parent_setup, util.ProcessExitCode.from_raw(exitcode),
                base_path)

            if result_files_patterns:
                # As long as the child process exists
                # we can access the container file system here
                self._transfer_output_files(base_path + temp_dir, cwd,
                                            output_dir, result_files_patterns)

            os.close(from_grandchild_copy)
            os.write(to_grandchild_copy, MARKER_PARENT_POST_RUN_COMPLETED)
            os.close(to_grandchild_copy)  # signal child that it can terminate
            check_child_exit_code()

            return exitcode, ru_child, parent_cleanup

        return grandchild_pid, wait_for_grandchild
    def _start_execution_in_container(
            self, args, stdin, stdout, stderr, env, cwd, temp_dir, cgroups,
            output_dir, result_files_patterns,
            parent_setup_fn, child_setup_fn, parent_cleanup_fn):
        """Execute the given command and measure its resource usage similarly to super()._start_execution(),
        but inside a container implemented using Linux namespaces.
        The command has no network access (only loopback),
        a fresh directory as /tmp and no write access outside of this,
        and it does not see other processes except itself.

        @param args: command line of the tool; it is first passed through self._build_cmdline()
            and the result is given to subprocess.Popen
        @param stdin: stdin for the tool (passed to subprocess.Popen), may be None
        @param stdout: stdout for the tool (passed to subprocess.Popen), may be None
        @param stderr: stderr for the tool (passed to subprocess.Popen), may be None
        @param env: environment for the tool
        @param cwd: working directory for the tool (made absolute),
            the current directory is used if it is None or empty
        @param temp_dir: directory passed to self._setup_container_filesystem(),
            also the location from which result files are transferred
        @param cgroups: object whose add_task() is called with the PID of the tool
        @param output_dir: target directory for result files
        @param result_files_patterns: patterns of result files to transfer;
            if empty, no files are transferred
        @param parent_setup_fn: called without arguments in the parent
            after the tool process was added to the cgroups
        @param child_setup_fn: called without arguments in the grandchild
            before it signals readiness
        @param parent_cleanup_fn: called in the parent with the result of parent_setup_fn
            after the result of the tool was received
        @return: a tuple of the PID of the tool process (in the outer namespace)
            and a function that waits for the tool and returns a triple of
            raw exit code, resource usage, and the result of parent_cleanup_fn
        @raise BenchExecException: if creating the namespace fails
        @raise OSError: if the child terminates with a non-zero exit code
        """
        assert self._use_namespaces

        args = self._build_cmdline(args, env=env)

        # We have three processes involved:
        # parent: the current Python process in which RunExecutor is executing
        # child: child process in new namespace (PID 1 in inner namespace),
        #        configures inner namespace, serves as dummy init,
        #        collects result of grandchild and passes it to parent
        # grandchild: child of child process (PID 2 in inner namespace), exec()s tool

        # We need the following communication steps between these processes:
        # 1a) grandchild tells parent its PID (in outer namespace).
        # 1b) grandchild tells parent that it is ready and measurement should begin.
        # 2) parent tells grandchild that measurement has begun and tool should
        #    be exec()ed.
        # 3) child tells parent about return value and resource consumption of grandchild.
        # 1a and 1b are done together by sending the PID through a pipe.
        # 2 is done by sending a null byte through a pipe.
        # 3 is done by sending a pickled object through the same pipe as #1.
        # We cannot use the same pipe for both directions, because otherwise a sender might
        # read the bytes it has sent itself.

        from_parent, to_grandchild = os.pipe() # "downstream" pipe parent->grandchild
        from_grandchild, to_parent = os.pipe() # "upstream" pipe grandchild/child->parent

        # If the current directory is within one of the bind mounts we create,
        # we need to cd into this directory again, otherwise we would not see the bind mount,
        # but the directory behind it. Thus we always set cwd to force a change of directory.
        cwd = os.path.abspath(cwd or os.curdir)

        def grandchild():
            """Setup everything inside the process that finally exec()s the tool."""
            try:
                # We know that this process has PID 2 in the inner namespace,
                # but we actually need to know its PID in the outer namespace
                # such that parent can put us into the correct cgroups.
                # According to http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
                # there are two ways to achieve this: sending a message with the PID
                # via a socket (but Python < 3.3 lacks a convenient API for sendmsg),
                # and reading /proc/self in the outer procfs instance (that's what we do).
                my_outer_pid = container.get_my_pid_from_procfs()

                container.mount_proc()
                container.drop_capabilities()
                child_setup_fn() # Do some other setup the caller wants.

                # Signal readiness to parent by sending our PID and wait until parent is also ready
                os.write(to_parent, str(my_outer_pid).encode())
                received = os.read(from_parent, 1)
                assert received == b'\0', received
            finally:
                # close remaining ends of pipe
                os.close(from_parent)
                os.close(to_parent)
            # here Python will exec() the tool for us

        def child():
            """Setup everything inside the container, start the tool, and wait for result.
            Always returns an int that is used as exit code of this process."""
            try:
                logging.debug("Child: child process of RunExecutor with PID %d started",
                              container.get_my_pid_from_procfs())

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file descriptors,
                # to avoid keeping other pipes and files open, e.g., those that the parent
                # uses to communicate with other containers (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually.
                # We keep the relevant ends of our pipes, and stdin/out/err of child and grandchild.
                necessary_fds = {sys.stdin, sys.stdout, sys.stderr,
                    to_parent, from_parent, stdin, stdout, stderr} - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if not self._allow_network:
                        container.activate_network_interface("lo")
                    self._setup_container_filesystem(temp_dir)
                except EnvironmentError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return int(e.errno)

                try:
                    os.chdir(cwd)
                except EnvironmentError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s", e)
                    return int(e.errno)

                try:
                    grandchild_proc = subprocess.Popen(args,
                                        stdin=stdin,
                                        stdout=stdout, stderr=stderr,
                                        env=env,
                                        close_fds=False,
                                        preexec_fn=grandchild)
                except (EnvironmentError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    try:
                        return int(e.errno)
                    except BaseException:
                        # subprocess.Popen in Python 2.7 throws OSError with errno=None
                        # if the preexec_fn fails.
                        return -2

                container.drop_capabilities()

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                container.forward_all_signals(grandchild_proc.pid, args[0])

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={sys.stdout, sys.stderr, to_parent})

                # wait for grandchild and return its result
                grandchild_result = self._wait_for_process(grandchild_proc.pid, args[0])
                logging.debug("Child: process %s terminated with exit code %d.",
                              args[0], grandchild_result[0])
                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                return 0
            except EnvironmentError as e:
                logging.exception("Error in child process of RunExecutor")
                return int(e.errno)
            except BaseException:
                # Need to catch everything because this method always needs to return a int
                # (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return -1

        try: # parent
            try:
                child_pid = container.execute_in_namespace(child, use_network_ns=not self._allow_network)
            except OSError as e:
                raise BenchExecException(
                    "Creating namespace for container mode failed: " + os.strerror(e.errno))
            logging.debug("Parent: child process of RunExecutor with PID %d started.", child_pid)

            def check_child_exit_code():
                """Check if the child process terminated cleanly and raise an error otherwise."""
                child_exitcode, unused_child_rusage = self._wait_for_process(child_pid, args[0])
                child_exitcode = util.ProcessExitCode.from_raw(child_exitcode)
                logging.debug("Parent: child process of RunExecutor with PID %d terminated with %s.",
                              child_pid, child_exitcode)

                if child_exitcode:
                    if child_exitcode.value and child_exitcode.value <= 128:
                        # This was an OSError in the child, re-create it
                        raise OSError(child_exitcode.value, os.strerror(child_exitcode.value))
                    raise OSError(0, "Child process of RunExecutor terminated with " + str(child_exitcode))

            # Close unnecessary ends of pipes such that read() does not block forever
            # if all other processes have terminated.
            os.close(from_parent)
            os.close(to_parent)

            container.setup_user_mapping(child_pid, uid=self._uid, gid=self._gid)

            try:
                grandchild_pid = int(os.read(from_grandchild, 10)) # 10 bytes is enough for 32bit int
            except ValueError:
                # probably empty read, i.e., pipe closed, i.e., child or grandchild failed
                check_child_exit_code()
                # Raise explicitly instead of "assert False" such that we never
                # continue with an undefined grandchild_pid, even with -O.
                raise AssertionError(
                    "Child process of RunExecutor terminated cleanly but did not send expected data.")

            logging.debug("Parent: executing %s in grand child with PID %d via child with PID %d.",
                          args[0], grandchild_pid, child_pid)

            # start measurements
            cgroups.add_task(grandchild_pid)
            parent_setup = parent_setup_fn()

            # Signal grandchild that setup is finished
            os.write(to_grandchild, b'\0')

            # Copy file descriptor, otherwise we could not close from_grandchild in finally block
            # and would leak a file descriptor in case of exception.
            from_grandchild_copy = os.dup(from_grandchild)
        finally:
            os.close(from_grandchild)
            os.close(to_grandchild)

        def wait_for_grandchild():
            """Wait for the result of the tool, run the parent cleanup,
            check the exit code of the child, and transfer result files.
            @return: triple of raw exit code, resource usage of the tool,
                and the result of parent_cleanup_fn
            """
            # 1024 bytes ought to be enough for everyone^Wour pickled result
            try:
                received = os.read(from_grandchild_copy, 1024)
            except OSError as e:
                if self.PROCESS_KILLED and e.errno == errno.EINTR:
                    # Read was interrupted because of Ctrl+C, we just try again
                    received = os.read(from_grandchild_copy, 1024)
                else:
                    raise

            parent_cleanup = parent_cleanup_fn(parent_setup)

            os.close(from_grandchild_copy)
            check_child_exit_code()

            if result_files_patterns:
                self._transfer_output_files(temp_dir, cwd, output_dir, result_files_patterns)

            # The data was written by our own child process (see child() above),
            # the grandchild closes its end of this pipe before exec()ing the tool.
            exitcode, ru_child = pickle.loads(received)
            return exitcode, ru_child, parent_cleanup

        return grandchild_pid, wait_for_grandchild
Exemplo n.º 6
0
    def _start_execution_in_container(self, args, stdin, stdout, stderr, env,
                                      root_dir, cwd, temp_dir, cgroups,
                                      output_dir, result_files_patterns,
                                      parent_setup_fn, child_setup_fn,
                                      parent_cleanup_fn):
        """Execute the given command and measure its resource usage similarly to
        super()._start_execution(), but inside a container implemented using
        Linux namespaces.

        Unless self._allow_network is set, the command runs in a separate
        network namespace in which only the loopback interface is activated.
        The container file system is prepared by self._setup_root_filesystem()
        (if root_dir is given) or self._setup_container_filesystem() (otherwise),
        and the command does not see other processes except itself.

        @param args: the command line to execute; it is passed through
            self._build_cmdline() before use
        @param stdin: passed as stdin to subprocess.Popen for the tool (or None)
        @param stdout: passed as stdout to subprocess.Popen for the tool (or None)
        @param stderr: passed as stderr to subprocess.Popen for the tool (or None)
        @param env: environment mapping for the tool;
            it is updated in place with self._env_override
        @param root_dir: directory handed to self._setup_root_filesystem(),
            or None to use self._setup_container_filesystem(temp_dir) instead
        @param cwd: working directory of the tool; may be None only if root_dir
            is None (the current directory is used in that case)
        @param temp_dir: directory handed to self._setup_container_filesystem()
            and self._transfer_output_files()
        @param cgroups: object whose add_task(pid) is called with the PID
            of the tool process before the tool is started
        @param output_dir: target handed to self._transfer_output_files()
        @param result_files_patterns: if non-empty,
            self._transfer_output_files() is called after the run
        @param parent_setup_fn: called without arguments in the parent after the
            tool process was added to cgroups and before the tool is started;
            its result is handed to parent_cleanup_fn
        @param child_setup_fn: called without arguments in the process
            that later exec()s the tool
        @param parent_cleanup_fn: called in the parent with the result of
            parent_setup_fn after the result of the tool has been received
        @return: a tuple (grandchild_pid, wait_for_grandchild), where calling
            wait_for_grandchild() blocks until the result arrives and returns
            a tuple (exitcode, ru_child, parent_cleanup)
        @raise BenchExecException: if the namespace cannot be created
            or the child process reports an error via its exit code
        @raise OSError: if the child process terminates abnormally in another way
        @raise AssertionError: if the child terminated cleanly
            without sending the PID of the tool process
        """
        assert self._use_namespaces

        env.update(self._env_override)

        args = self._build_cmdline(args, env=env)

        # We have three processes involved:
        # parent: the current Python process in which RunExecutor is executing
        # child: child process in new namespace (PID 1 in inner namespace),
        #        configures inner namespace, serves as dummy init,
        #        collects result of grandchild and passes it to parent
        # grandchild: child of child process (PID 2 in inner namespace), exec()s tool

        # We need the following communication steps between these processes:
        # 1a) grandchild tells parent its PID (in outer namespace).
        # 1b) grandchild tells parent that it is ready and measurement should begin.
        # 2) parent tells grandchild that measurement has begun and tool should
        #    be exec()ed.
        # 3) child tells parent about return value and resource consumption of grandchild.
        # 1a and 1b are done together by sending the PID through a pipe.
        # 2 is done by sending a null byte through a pipe.
        # 3 is done by sending a pickled object through the same pipe as #1
        #   (the "upstream" pipe; see os.write(to_parent, ...) in child()).
        # We cannot use the same pipe for both directions, because otherwise a sender might
        # read the bytes it has sent itself.

        # Error codes from child to parent (used as exit code of the child process)
        CHILD_OSERROR = 128
        CHILD_UNKNOWN_ERROR = 129

        # "downstream" pipe parent->grandchild
        from_parent, to_grandchild = os.pipe()
        # "upstream" pipe grandchild/child->parent
        from_grandchild, to_parent = os.pipe()

        # If the current directory is within one of the bind mounts we create,
        # we need to cd into this directory again, otherwise we would not see the bind mount,
        # but the directory behind it. Thus we always set cwd to force a change of directory.
        if root_dir is None:
            cwd = os.path.abspath(cwd or os.curdir)
        else:
            root_dir = os.path.abspath(root_dir)
            cwd = os.path.abspath(cwd)

        def grandchild():
            """Setup everything inside the process that finally exec()s the tool."""
            try:
                # We know that this process has PID 2 in the inner namespace,
                # but we actually need to know its PID in the outer namespace
                # such that parent can put us into the correct cgroups.
                # According to http://man7.org/linux/man-pages/man7/pid_namespaces.7.html,
                # there are two ways to achieve this: sending a message with the PID
                # via a socket (but Python < 3.3 lacks a convenient API for sendmsg),
                # and reading /proc/self in the outer procfs instance (that's what we do).
                # This has to happen before mount_proc() below replaces the procfs.
                my_outer_pid = container.get_my_pid_from_procfs()

                container.mount_proc()
                container.drop_capabilities()
                container.reset_signal_handling()
                child_setup_fn()  # Do some other setup the caller wants.

                # Signal readiness to parent by sending our PID and wait until parent is also ready
                os.write(to_parent, str(my_outer_pid).encode())
                received = os.read(from_parent, 1)
                assert received == b'\0', received
            finally:
                # close remaining ends of pipe
                os.close(from_parent)
                os.close(to_parent)
            # here Python will exec() the tool for us

        def child():
            """Setup everything inside the container, start the tool, and wait for result.

            Always returns an int that becomes the exit code of this process:
            0 on success, CHILD_OSERROR or CHILD_UNKNOWN_ERROR on failure.
            """
            try:
                logging.debug(
                    "Child: child process of RunExecutor with PID %d started",
                    container.get_my_pid_from_procfs())

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file descriptors,
                # to avoid keeping other pipes and files open, e.g., those that the parent
                # uses to communicate with other containers (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually.
                # We keep the relevant ends of our pipes, and stdin/out/err of child and grandchild.
                necessary_fds = {
                    sys.stdin, sys.stdout, sys.stderr, to_parent, from_parent,
                    stdin, stdout, stderr
                } - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(temp_dir)
                except EnvironmentError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except EnvironmentError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        e)
                    return CHILD_OSERROR

                try:
                    grandchild_proc = subprocess.Popen(args,
                                                       stdin=stdin,
                                                       stdout=stdout,
                                                       stderr=stderr,
                                                       env=env,
                                                       close_fds=False,
                                                       preexec_fn=grandchild)
                except (EnvironmentError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                container.drop_capabilities()

                # Close other fds that were still necessary above.
                container.close_open_fds(
                    keep_files={sys.stdout, sys.stderr, to_parent})

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                if _HAS_SIGWAIT:
                    grandchild_result = container.wait_for_child_and_forward_all_signals(
                        grandchild_proc.pid, args[0])
                else:
                    container.forward_all_signals_async(
                        grandchild_proc.pid, args[0])
                    grandchild_result = self._wait_for_process(
                        grandchild_proc.pid, args[0])

                logging.debug(
                    "Child: process %s terminated with exit code %d.", args[0],
                    grandchild_result[0])
                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                return 0
            except EnvironmentError:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except BaseException:
                # Need to catch everything because this method always needs to return a int
                # (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR

        try:  # parent
            try:
                child_pid = container.execute_in_namespace(
                    child, use_network_ns=not self._allow_network)
            except OSError as e:
                # The regular close of these two pipe ends below is never reached
                # on this path, so close them here to avoid leaking them.
                os.close(from_parent)
                os.close(to_parent)
                raise BenchExecException(
                    "Creating namespace for container mode failed: " +
                    os.strerror(e.errno))
            logging.debug(
                "Parent: child process of RunExecutor with PID %d started.",
                child_pid)

            def check_child_exit_code():
                """Check if the child process terminated cleanly and raise an error otherwise."""
                child_exitcode, unused_child_rusage = self._wait_for_process(
                    child_pid, args[0])
                child_exitcode = util.ProcessExitCode.from_raw(child_exitcode)
                logging.debug(
                    "Parent: child process of RunExecutor with PID %d terminated with %s.",
                    child_pid, child_exitcode)

                if child_exitcode:
                    if child_exitcode.value:
                        if child_exitcode.value == CHILD_OSERROR:
                            # This was an OSError in the child, details were already logged
                            raise BenchExecException(
                                "execution in container failed, check log for details"
                            )
                        elif child_exitcode.value == CHILD_UNKNOWN_ERROR:
                            raise BenchExecException(
                                "unexpected error in container")
                        raise OSError(child_exitcode.value,
                                      os.strerror(child_exitcode.value))
                    raise OSError(
                        0, "Child process of RunExecutor terminated with " +
                        str(child_exitcode))

            # Close unnecessary ends of pipes such that read() does not block forever
            # if all other processes have terminated.
            os.close(from_parent)
            os.close(to_parent)

            container.setup_user_mapping(child_pid,
                                         uid=self._uid,
                                         gid=self._gid)

            try:
                grandchild_pid = int(os.read(
                    from_grandchild, 10))  # 10 bytes is enough for 32bit int
            except ValueError:
                # probably empty read, i.e., pipe closed, i.e., child or grandchild failed
                check_child_exit_code()
                # Raised explicitly instead of "assert False" such that this also
                # works with "python -O", which strips assert statements.
                raise AssertionError(
                    "Child process of RunExecutor terminated cleanly but did not send expected data.")

            logging.debug(
                "Parent: executing %s in grand child with PID %d via child with PID %d.",
                args[0], grandchild_pid, child_pid)

            # start measurements
            cgroups.add_task(grandchild_pid)
            parent_setup = parent_setup_fn()

            # Signal grandchild that setup is finished
            os.write(to_grandchild, b'\0')

            # Copy file descriptor, otherwise we could not close from_grandchild in finally block
            # and would leak a file descriptor in case of exception.
            from_grandchild_copy = os.dup(from_grandchild)
        finally:
            os.close(from_grandchild)
            os.close(to_grandchild)

        def wait_for_grandchild():
            """Wait for the result sent by the child and clean up.

            @return: a tuple (exitcode, ru_child, parent_cleanup)
            """
            try:
                # 1024 bytes ought to be enough for everyone^Wour pickled result
                try:
                    received = os.read(from_grandchild_copy, 1024)
                except OSError as e:
                    if self.PROCESS_KILLED and e.errno == errno.EINTR:
                        # Read was interrupted because of Ctrl+C, we just try again
                        received = os.read(from_grandchild_copy, 1024)
                    else:
                        raise

                parent_cleanup = parent_cleanup_fn(parent_setup)
            finally:
                # Close the pipe also if reading or cleanup failed,
                # otherwise the file descriptor would leak.
                os.close(from_grandchild_copy)

            check_child_exit_code()

            if result_files_patterns:
                self._transfer_output_files(temp_dir, cwd, output_dir,
                                            result_files_patterns)

            # NOTE: the unpickled data was written by our own child process
            # to a pipe created above; never unpickle data from untrusted sources.
            exitcode, ru_child = pickle.loads(received)
            return exitcode, ru_child, parent_cleanup

        return grandchild_pid, wait_for_grandchild