示例#1
0
def duplicate_mount_hierarchy(mount_base, temp_base, work_base, dir_modes):
    """
    Replicate the system's mount hierarchy below the given directory and
    apply the requested directory modes (e.g., read-only access or hidden)
    inside that replica.
    Once this function has returned, the replica can be chroot'ed into.
    @param mount_base: the base directory of the new mount hierarchy
    @param temp_base: the base directory for all temporary files
    @param work_base: the base directory for all overlayfs work files
    @param dir_modes: the directory modes to apply (without mount_base prefix)
    """

    def ensure_directory(path):
        # Create the directory (with parents) unless the path already exists.
        if not os.path.exists(path):
            os.makedirs(path)

    def discard_mount(path):
        # The mount at this place is about to be replaced and is thus not needed.
        # A failing unmount is only logged.
        try:
            libc.umount(path)
        except OSError as umount_error:
            logging.debug(umount_error)

    # Start with a recursive, private bind mount of "/" at mount_base.
    # MS_PRIVATE decouples the new mounts from the original mounts,
    # i.e., mounts we do are not seen outside the mount namespace,
    # and any (un)mounts that are made later in the main system are not seen by us.
    # The latter is desired such that new mounts (e.g., USB sticks being plugged in)
    # do not appear in the container.
    # Blocking host-side unmounts from being propagated has the disadvantage
    # that any unmounts done by the sysadmin won't really unmount the device
    # because it stays mounted in the container and thus keeps the device busy
    # (cf. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=739593#85).
    # We could allow unmounts being propagated with MS_SLAVE instead of MS_PRIVATE,
    # but we prefer to have the mount namespace of the container being
    # unchanged during run execution.
    make_bind_mount(b"/", mount_base, recursive=True, private=True)

    # Turn every special dir into a mountpoint of its own
    # such that the loop over all mountpoints below handles it.
    for special_dir in dir_modes:
        bind_target = mount_base + special_dir
        upper_dir = temp_base + special_dir
        try:
            make_bind_mount(bind_target, bind_target)
        except OSError as bind_error:
            if bind_error.errno != errno.EINVAL:
                logging.debug(
                    "Failed to make %s a bind mount: %s", bind_target, bind_error
                )
            else:
                # on btrfs, non-recursive bind mounts fail
                try:
                    make_bind_mount(bind_target, bind_target, recursive=True)
                except OSError as retry_error:
                    logging.debug(
                        "Failed to make %s a (recursive) bind mount: %s",
                        bind_target,
                        retry_error,
                    )
        ensure_directory(upper_dir)

    # Apply the desired mode to each mountpoint inside the replica.
    # The list of mountpoints is read completely before any of them is changed.
    for _source, full_mountpoint, fstype, options in list(get_mount_points()):
        if not util.path_is_below(full_mountpoint, mount_base):
            continue
        mountpoint = full_mountpoint[len(mount_base) :] or b"/"
        mode = determine_directory_mode(dir_modes, mountpoint, fstype)
        if not mode:
            continue

        parent = os.path.dirname(mountpoint)
        if os.access(parent, os.X_OK):
            logging.debug("Mounting '%s' as %s", mountpoint.decode(), mode)
        else:
            # If parent is not accessible we cannot mount something on mountpoint.
            # We mark the inaccessible directory as hidden
            # because otherwise the mountpoint could become accessible (directly!)
            # if the permissions on the parent are relaxed during container execution.
            original_mountpoint = mountpoint
            # Walk upwards until the parent directory is accessible.
            while not os.access(parent, os.X_OK):
                mountpoint = parent
                parent = os.path.dirname(mountpoint)
            mode = DIR_HIDDEN
            logging.debug(
                "Marking inaccessible directory '%s' as hidden "
                "because it contains a mountpoint at '%s'",
                mountpoint.decode(),
                original_mountpoint.decode(),
            )

        target = mount_base + mountpoint
        upper_dir = temp_base + mountpoint
        work_dir = work_base + mountpoint

        if mode == DIR_OVERLAY:
            ensure_directory(upper_dir)
            ensure_directory(work_dir)
            discard_mount(target)
            try:
                make_overlay_mount(target, mountpoint, upper_dir, work_dir)
            except OSError as overlay_error:
                shown_path = mountpoint.decode()
                message = (
                    "Creating overlay mount for '{}' failed: {}. Please use "
                    "other directory modes, for example '--read-only-dir {}'.".format(
                        shown_path,
                        os.strerror(overlay_error.errno),
                        util.escape_string_shell(shown_path),
                    )
                )
                raise OSError(overlay_error.errno, message)

        elif mode == DIR_HIDDEN:
            ensure_directory(upper_dir)
            discard_mount(target)
            # Cover the original content with the directory below temp_base.
            make_bind_mount(upper_dir, target)

        elif mode == DIR_READ_ONLY:
            try:
                remount_with_additional_flags(target, options, libc.MS_RDONLY)
            except OSError as remount_error:
                if remount_error.errno != errno.EACCES:
                    # If this mountpoint is below an overlay/hidden dir,
                    # re-create mountpoint.
                    # Linux does not support making read-only bind mounts in one step:
                    # https://lwn.net/Articles/281157/
                    # http://man7.org/linux/man-pages/man8/mount.8.html
                    make_bind_mount(mountpoint, target, recursive=True, private=True)
                    remount_with_additional_flags(target, options, libc.MS_RDONLY)
                else:
                    logging.warning(
                        "Cannot mount '%s', directory may be missing from container.",
                        mountpoint.decode(),
                    )

        elif mode == DIR_FULL_ACCESS:
            try:
                # Ensure directory is still a mountpoint by attempting to remount.
                remount_with_additional_flags(target, options, 0)
            except OSError as remount_error:
                if remount_error.errno != errno.EACCES:
                    # If this mountpoint is below an overlay/hidden dir,
                    # re-create mountpoint.
                    make_bind_mount(mountpoint, target, recursive=True, private=True)
                else:
                    logging.warning(
                        "Cannot mount '%s', directory may be missing from container.",
                        mountpoint.decode(),
                    )

        else:
            assert False
        def child():
            """Setup everything inside the container,
            start the tool, and wait for result.

            This function always returns an int and never lets an exception escape.
            @return: 0 on success, CHILD_OSERROR if an OSError occurred,
                CHILD_UNKNOWN_ERROR for every other exception
            """
            try:
                logging.debug(
                    "Child: child process of RunExecutor with PID %d started",
                    container.get_my_pid_from_procfs(),
                )

                # Put all received signals on hold until we handle them later.
                container.block_all_signals()

                # We want to avoid leaking file descriptors to the executed child.
                # It is also nice if the child has only the minimal necessary file
                # descriptors, to avoid keeping other pipes and files open, e.g.,
                # those that the parent uses to communicate with other containers
                # (if containers are started in parallel).
                # Thus we do not use the close_fds feature of subprocess.Popen,
                # but do the same here manually. We keep the relevant ends of our pipes,
                # and stdin/out/err of child and grandchild.
                necessary_fds = {
                    sys.stdin,
                    sys.stdout,
                    sys.stderr,
                    to_parent,
                    from_parent,
                    stdin,
                    stdout,
                    stderr,
                } - {None}
                container.close_open_fds(keep_files=necessary_fds)

                try:
                    if self._container_system_config:
                        # A standard hostname increases reproducibility.
                        socket.sethostname(container.CONTAINER_HOSTNAME)

                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    # Wait until user mapping is finished,
                    # this is necessary for filesystem writes
                    received = os.read(from_parent,
                                       len(MARKER_USER_MAPPING_COMPLETED))
                    assert received == MARKER_USER_MAPPING_COMPLETED, received

                    if root_dir is not None:
                        self._setup_root_filesystem(root_dir)
                    else:
                        self._setup_container_filesystem(
                            temp_dir,
                            output_dir if result_files_patterns else None,
                            memlimit,
                            memory_nodes,
                        )

                    # Marking this process as "non-dumpable" (no core dumps) also
                    # forbids several other ways how other processes can access and
                    # influence it:
                    # ptrace is forbidden and much of /proc/<child>/ is inaccessible.
                    # We set this to prevent the benchmarked tool from messing with this
                    # process or using it to escape from the container. More info:
                    # http://man7.org/linux/man-pages/man5/proc.5.html
                    # It needs to be done after MARKER_USER_MAPPING_COMPLETED.
                    libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_DISABLE, 0,
                               0, 0)
                except OSError as e:
                    logging.critical("Failed to configure container: %s", e)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except OSError as e:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        e)
                    return CHILD_OSERROR

                container.setup_seccomp_filter()

                try:
                    grandchild_proc = subprocess.Popen(
                        args,
                        stdin=stdin,
                        stdout=stdout,
                        stderr=stderr,
                        env=env,
                        close_fds=False,
                        preexec_fn=grandchild,
                    )
                except (OSError, RuntimeError) as e:
                    logging.critical("Cannot start process: %s", e)
                    return CHILD_OSERROR

                # keep capability for unmount if necessary later
                necessary_capabilities = ([libc.CAP_SYS_ADMIN]
                                          if result_files_patterns else [])
                container.drop_capabilities(keep=necessary_capabilities)

                # Close other fds that were still necessary above.
                container.close_open_fds(keep_files={
                    sys.stdout, sys.stderr, to_parent, from_parent
                })

                # Set up signal handlers to forward signals to grandchild
                # (because we are PID 1, there is a special signal handling otherwise).
                # cf. dumb-init project: https://github.com/Yelp/dumb-init
                # Also wait for grandchild and return its result.
                grandchild_result = container.wait_for_child_and_forward_signals(
                    grandchild_proc.pid, args[0])

                logging.debug(
                    "Child: process %s terminated with exit code %d.",
                    args[0],
                    grandchild_result[0],
                )

                if result_files_patterns:
                    # Remove the bind mount that _setup_container_filesystem added
                    # such that the parent can access the result files.
                    libc.umount(temp_dir.encode())

                # Re-allow access to /proc/<child>/...,
                # this is used by the parent for accessing output files
                libc.prctl(libc.PR_SET_DUMPABLE, libc.SUID_DUMP_USER, 0, 0, 0)

                os.write(to_parent, pickle.dumps(grandchild_result))
                os.close(to_parent)

                # Now the parent copies the output files, we need to wait until this is
                # finished. If the child terminates, the container file system and its
                # tmpfs go away.
                # The blocking read must not be part of the assert statement:
                # with "python -O" asserts are removed completely, and then the child
                # would no longer wait for the parent at all.
                post_run_marker = os.read(from_parent, 1)
                assert post_run_marker == MARKER_PARENT_POST_RUN_COMPLETED, (
                    post_run_marker)
                os.close(from_parent)

                return 0
            except OSError:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except BaseException:
                # Need to catch everything because this method always needs to return an
                # int (we are inside a C callback that requires returning int).
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR
    def _setup_container_filesystem(self, temp_dir):
        """Setup the filesystem layout in the container.
        As first step, we create a copy of all existing mountpoints in mount_base, recursively,
        and as "private" mounts (i.e., changes to existing mountpoints afterwards won't propagate
        to our copy).
        Then we iterate over all mountpoints and change them
        according to the mode the user has specified (hidden, read-only, overlay, or full-access).
        This has to be done for each mountpoint because overlays are not recursive.
        Then we chroot into the new mount hierarchy.

        The new filesystem layout still has a view of the host's /proc.
        We do not mount a fresh /proc here because the grandchild still needs the old /proc.

        We do not simply change all existing mount points in place (setting them to
        read-only or overlaying them), because it is easier to create a new hierarchy
        and chroot into it.
        First, we still have access to the original mountpoints while doing so,
        and second, we avoid race conditions if someone else changes the existing mountpoints.

        @param temp_dir: The base directory under which all our directories should be created.
        @raise OSError: if a directory cannot be created or a required mount operation fails,
            e.g., if an overlay mount cannot be created
        """
        # All strings here are bytes to avoid issues if existing mountpoints are invalid UTF-8.
        temp_dir = temp_dir.encode()
        mount_base = os.path.join(temp_dir, b"mount") # base dir for container mounts
        temp_base = os.path.join(temp_dir, b"temp") # directory with files created by tool
        os.mkdir(mount_base)
        os.mkdir(temp_base)

        def _is_below(path, target_path):
            # compare with trailing slashes for cases like /foo and /foobar
            path = os.path.join(path, b"")
            target_path = os.path.join(target_path, b"")
            return path.startswith(target_path)

        def find_mode_for_dir(path, fstype):
            # Return the mode to apply to the mountpoint at path,
            # or None if this mountpoint should be left as it is.
            if (path == b"/proc"):
                # /proc is necessary for the grandchild to read PID, will be replaced later.
                return DIR_READ_ONLY
            if _is_below(path, b"/proc"):
                # Irrelevant.
                return None

            parent_mode = None
            result_mode = None
            # The last matching entry of self._dir_modes determines the result.
            for special_dir, mode in self._dir_modes.items():
                if _is_below(path, special_dir):
                    if path != special_dir:
                        parent_mode = mode
                    result_mode = mode
            assert result_mode is not None

            if result_mode == DIR_OVERLAY and (
                    _is_below(path, b"/dev") or
                    _is_below(path, b"/sys") or
                    fstype == b"autofs" or
                    fstype == b"cgroup"):
                # Import /dev, /sys, cgroup, and autofs from host into the container,
                # overlay does not work for them.
                return DIR_READ_ONLY

            if result_mode == DIR_HIDDEN and parent_mode == DIR_HIDDEN:
                # No need to recursively recreate mountpoints in hidden dirs.
                return None
            return result_mode

        # Overlayfs needs its own additional temporary directory ("work" directory).
        # temp_base will be the "upper" layer, the host FS the "lower" layer,
        # and mount_base the mount target.
        work_base = os.path.join(temp_dir, b"overlayfs")
        os.mkdir(work_base)

        if self._container_system_config:
            container.setup_container_system_config(temp_base)

        # Create a copy of host's mountpoints.
        container.make_bind_mount(b"/", mount_base, recursive=True, private=True)

        # Ensure each special dir is a mountpoint such that the next loop covers it.
        for special_dir in self._dir_modes.keys():
            mount_path = mount_base + special_dir
            temp_path = temp_base + special_dir
            try:
                container.make_bind_mount(mount_path, mount_path)
            except OSError as e:
                if e.errno == errno.EINVAL:
                    # On btrfs, non-recursive bind mounts fail,
                    # so retry with a recursive bind mount before giving up.
                    try:
                        container.make_bind_mount(mount_path, mount_path, recursive=True)
                    except OSError as e2:
                        logging.debug(
                            "Failed to make %s a (recursive) bind mount: %s", mount_path, e2)
                else:
                    logging.debug("Failed to make %s a bind mount: %s", mount_path, e)
            if not os.path.exists(temp_path):
                os.makedirs(temp_path)

        # Set desired access mode for each mountpoint.
        # The list of mountpoints is read completely before any of them is changed.
        for unused_source, full_mountpoint, fstype, options in list(container.get_mount_points()):
            if not _is_below(full_mountpoint, mount_base):
                continue
            mountpoint = full_mountpoint[len(mount_base):] or b"/"

            mount_path = mount_base + mountpoint
            temp_path = temp_base + mountpoint
            work_path = work_base + mountpoint

            mode = find_mode_for_dir(mountpoint, fstype)
            if mode == DIR_OVERLAY:
                if not os.path.exists(temp_path):
                    os.makedirs(temp_path)
                if not os.path.exists(work_path):
                    os.makedirs(work_path)
                try:
                    # Previous mount in this place not needed if replaced with overlay dir.
                    libc.umount(mount_path)
                except OSError as e:
                    logging.debug(e)
                try:
                    container.make_overlay_mount(mount_path, mountpoint, temp_path, work_path)
                except OSError as e:
                    # Re-raise with a message that names the directory,
                    # keeping the original error as explicit cause.
                    raise OSError(e.errno,
                        "Creating overlay mount for '{}' failed: {}. "
                        "Please use other directory modes."
                            .format(mountpoint.decode(), os.strerror(e.errno))) from e

            elif mode == DIR_HIDDEN:
                if not os.path.exists(temp_path):
                    os.makedirs(temp_path)
                try:
                    # Previous mount in this place not needed if replaced with hidden dir.
                    libc.umount(mount_path)
                except OSError as e:
                    logging.debug(e)
                container.make_bind_mount(temp_path, mount_path)

            elif mode == DIR_READ_ONLY:
                try:
                    container.remount_with_additional_flags(mount_path, options, libc.MS_RDONLY)
                except OSError as e:
                    if e.errno == errno.EACCES:
                        logging.warning(
                            "Cannot mount '%s', directory may be missing from container.",
                            mountpoint.decode())
                    else:
                        # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                        # Linux does not support making read-only bind mounts in one step:
                        # https://lwn.net/Articles/281157/ http://man7.org/linux/man-pages/man8/mount.8.html
                        container.make_bind_mount(
                            mountpoint, mount_path, recursive=True, private=True)
                        container.remount_with_additional_flags(mount_path, options, libc.MS_RDONLY)

            elif mode == DIR_FULL_ACCESS:
                try:
                    # Ensure directory is still a mountpoint by attempting to remount.
                    container.remount_with_additional_flags(mount_path, options, 0)
                except OSError as e:
                    if e.errno == errno.EACCES:
                        logging.warning(
                            "Cannot mount '%s', directory may be missing from container.",
                            mountpoint.decode())
                    else:
                        # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                        container.make_bind_mount(
                            mountpoint, mount_path, recursive=True, private=True)

            elif mode is None:
                pass

            else:
                assert False

        # If necessary, (i.e., if /tmp is not already hidden),
        # hide the directory where we store our files from processes in the container
        # by mounting an empty directory over it.
        if os.path.exists(mount_base + temp_dir):
            os.makedirs(temp_base + temp_dir)
            container.make_bind_mount(temp_base + temp_dir, mount_base + temp_dir)

        os.chroot(mount_base)
示例#4
0
    def _setup_container_filesystem(self, temp_dir):
        """Build the filesystem layout that is visible inside the container.

        First, all existing mountpoints are copied recursively to a new base directory
        as "private" mounts (i.e., later changes to the existing mountpoints
        do not propagate to the copy).
        Next, every mountpoint of the copy is changed according to the mode
        that the user has specified (hidden, read-only, overlay, or full-access).
        This needs to be done per mountpoint because overlays are not recursive.
        Finally, we chroot into the new mount hierarchy.

        The new layout still has a view of the host's /proc.
        No fresh /proc is mounted here because the grandchild still needs the old /proc.

        The existing mount points are not changed in place
        because it is easier to create a new hierarchy and chroot into it:
        the original mountpoints stay accessible while doing so,
        and race conditions are avoided if someone else changes the existing mountpoints.

        @param temp_dir: The base directory under which all our directories should be created.
        """

        def is_within(path, ancestor):
            # Compare with trailing separators such that /foobar does not count as below /foo.
            return os.path.join(path, b"").startswith(os.path.join(ancestor, b""))

        def select_mode(path, fstype):
            # /proc is necessary for the grandchild to read PID, will be replaced later.
            if path == b"/proc":
                return DIR_READ_ONLY
            # Everything else below /proc is irrelevant.
            if is_within(path, b"/proc"):
                return None

            enclosing_mode = None
            chosen_mode = None
            # The last matching entry wins.
            for special_dir, configured_mode in self._dir_modes.items():
                if not is_within(path, special_dir):
                    continue
                if path != special_dir:
                    enclosing_mode = configured_mode
                chosen_mode = configured_mode
            assert chosen_mode is not None

            if chosen_mode == DIR_OVERLAY:
                if (is_within(path, b"/dev") or is_within(path, b"/sys")
                        or fstype == b"cgroup"):
                    # Overlay does not make sense for /dev, /sys, and all cgroups.
                    return DIR_READ_ONLY

                if fstype in (b"autofs", b"vfat", b"ntfs"):
                    # Overlayfs does not support these as underlying file systems.
                    logging.debug(
                        "Cannot use overlay mode for %s because it has file system %s. "
                        "Using read-only mode instead.", path.decode(),
                        fstype.decode())
                    return DIR_READ_ONLY

            if chosen_mode == DIR_HIDDEN and enclosing_mode == DIR_HIDDEN:
                # No need to recursively recreate mountpoints in hidden dirs.
                return None
            return chosen_mode

        # All paths are handled as bytes to avoid issues
        # if existing mountpoints are invalid UTF-8.
        # files_base is the directory with files created by the tool,
        # container_root the base dir for the container mounts.
        files_base = self._get_result_files_base(temp_dir).encode()
        base_dir = temp_dir.encode()
        container_root = os.path.join(base_dir, b"mount")
        os.mkdir(container_root)
        os.mkdir(files_base)

        # Overlayfs needs its own additional temporary directory ("work" directory).
        # files_base will be the "upper" layer, the host FS the "lower" layer,
        # and container_root the mount target.
        overlay_work_base = os.path.join(base_dir, b"overlayfs")
        os.mkdir(overlay_work_base)

        if self._container_system_config:
            container.setup_container_system_config(files_base)

        # Create a copy of the host's mountpoints.
        # MS_PRIVATE decouples our mount namespace from the host's,
        # i.e., mounts we do are not seen by the host, and any (un)mounts the host
        # does afterwards are not seen by us. The latter is desired such that new mounts
        # (e.g., USB sticks being plugged in) do not appear in the container.
        # Blocking host-side unmounts from being propagated has the disadvantage
        # that any unmounts done by the sysadmin won't really unmount the device
        # because it stays mounted in the container and thus keeps the device busy
        # (cf. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=739593#85).
        # We could allow unmounts being propagated with MS_SLAVE instead of MS_PRIVATE,
        # but we prefer to have the mount namespace of the container being
        # unchanged during run execution.
        container.make_bind_mount(
            b"/", container_root, recursive=True, private=True)

        # Turn every special dir into a mountpoint of its own
        # such that the loop over all mountpoints below handles it.
        for special_dir in self._dir_modes:
            bind_target = container_root + special_dir
            upper_dir = files_base + special_dir
            try:
                container.make_bind_mount(bind_target, bind_target)
            except OSError as bind_error:
                if bind_error.errno != errno.EINVAL:
                    logging.debug("Failed to make %s a bind mount: %s",
                                  bind_target, bind_error)
                else:
                    # on btrfs, non-recursive bind mounts fail
                    try:
                        container.make_bind_mount(
                            bind_target, bind_target, recursive=True)
                    except OSError as retry_error:
                        logging.debug(
                            "Failed to make %s a (recursive) bind mount: %s",
                            bind_target, retry_error)
            if not os.path.exists(upper_dir):
                os.makedirs(upper_dir)

        # Set desired access mode for each mountpoint.
        # The list of mountpoints is read completely before any of them is changed.
        all_mounts = list(container.get_mount_points())
        for _source, full_mountpoint, fstype, options in all_mounts:
            if not is_within(full_mountpoint, container_root):
                continue
            mountpoint = full_mountpoint[len(container_root):] or b"/"

            target = container_root + mountpoint
            upper_dir = files_base + mountpoint
            work_dir = overlay_work_base + mountpoint

            mode = select_mode(mountpoint, fstype)
            if mode == DIR_OVERLAY:
                for needed_dir in (upper_dir, work_dir):
                    if not os.path.exists(needed_dir):
                        os.makedirs(needed_dir)
                try:
                    # Previous mount in this place not needed if replaced with overlay dir.
                    libc.umount(target)
                except OSError as umount_error:
                    logging.debug(umount_error)
                try:
                    container.make_overlay_mount(
                        target, mountpoint, upper_dir, work_dir)
                except OSError as overlay_error:
                    message = (
                        "Creating overlay mount for '{}' failed: {}. "
                        "Please use other directory modes.".format(
                            mountpoint.decode(),
                            os.strerror(overlay_error.errno)))
                    raise OSError(overlay_error.errno, message)

            elif mode == DIR_HIDDEN:
                if not os.path.exists(upper_dir):
                    os.makedirs(upper_dir)
                try:
                    # Previous mount in this place not needed if replaced with hidden dir.
                    libc.umount(target)
                except OSError as umount_error:
                    logging.debug(umount_error)
                container.make_bind_mount(upper_dir, target)

            elif mode == DIR_READ_ONLY:
                try:
                    container.remount_with_additional_flags(
                        target, options, libc.MS_RDONLY)
                except OSError as remount_error:
                    if remount_error.errno != errno.EACCES:
                        # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                        # Linux does not support making read-only bind mounts in one step:
                        # https://lwn.net/Articles/281157/ http://man7.org/linux/man-pages/man8/mount.8.html
                        container.make_bind_mount(
                            mountpoint, target, recursive=True, private=True)
                        container.remount_with_additional_flags(
                            target, options, libc.MS_RDONLY)
                    else:
                        logging.warning(
                            "Cannot mount '%s', directory may be missing from container.",
                            mountpoint.decode())

            elif mode == DIR_FULL_ACCESS:
                try:
                    # Ensure directory is still a mountpoint by attempting to remount.
                    container.remount_with_additional_flags(target, options, 0)
                except OSError as remount_error:
                    if remount_error.errno != errno.EACCES:
                        # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                        container.make_bind_mount(
                            mountpoint, target, recursive=True, private=True)
                    else:
                        logging.warning(
                            "Cannot mount '%s', directory may be missing from container.",
                            mountpoint.decode())

            else:
                # The only remaining valid result is None (leave the mountpoint as it is).
                assert mode is None

        # If necessary, (i.e., if /tmp is not already hidden),
        # hide the directory where we store our files from processes in the container
        # by mounting an empty directory over it.
        own_dir_in_container = container_root + base_dir
        if os.path.exists(own_dir_in_container):
            empty_cover = files_base + base_dir
            os.makedirs(empty_cover)
            container.make_bind_mount(empty_cover, own_dir_in_container)

        os.chroot(container_root)
示例#5
0
    def _setup_container_filesystem(self, temp_dir, output_dir, memlimit, memory_nodes):
        """Setup the filesystem layout in the container and chroot into it.

        As first step, we create a copy of all existing mountpoints in mount_base, recursively,
        and as "private" mounts (i.e., changes to existing mountpoints afterwards won't propagate
        to our copy).
        Then we iterate over all mountpoints and change them
        according to the mode the user has specified (hidden, read-only, overlay, or full-access).
        This has to be done for each mountpoint because overlays are not recursive.
        Then we chroot into the new mount hierarchy.

        The new filesystem layout still has a view of the host's /proc.
        We do not mount a fresh /proc here because the grandchild still needs the old /proc.

        We do not simply iterate over all existing mount points and set them to
        read-only/overlay them in place,
        because it is easier to create a new hierarchy and chroot into it.
        First, we still have access to the original mountpoints while doing so,
        and second, we avoid race conditions if someone else changes the existing mountpoints.

        @param temp_dir: The base directory under which all our directories should be created
            (a str, it is encoded to bytes here).
        @param output_dir: Only checked for truthiness here: if set, the directory with the
            files created by the tool is additionally bind-mounted read-only at temp_dir
            inside the new hierarchy (and hidden below an empty directory afterwards).
        @param memlimit: Value for the "size=" option of tmpfs mounts created here
            ("100%" is used if this is not set).
        @param memory_nodes: If non-empty, its items are joined into an "mpol=bind:" option
            for tmpfs mounts created here.
        """
        # All strings here are bytes to avoid issues if existing mountpoints are invalid UTF-8.
        temp_base = self._get_result_files_base(temp_dir).encode() # directory with files created by tool
        temp_dir = temp_dir.encode()

        # Build the option string for tmpfs mounts (size and optionally mpol=bind),
        # it is used below and again in make_tmpfs_dir().
        tmpfs_opts = ["size=" + str(memlimit or "100%")]
        if memory_nodes:
            tmpfs_opts.append("mpol=bind:" + ",".join(map(str, memory_nodes)))
        tmpfs_opts = (",".join(tmpfs_opts)).encode()
        if self._container_tmpfs:
            # Put a tmpfs on temp_dir such that everything we create below it is in that tmpfs.
            libc.mount(None, temp_dir, b"tmpfs", 0, tmpfs_opts)

        mount_base = os.path.join(temp_dir, b"mount") # base dir for container mounts
        os.mkdir(mount_base)
        os.mkdir(temp_base)

        # Helper: True if path is target_path itself or lies somewhere below it.
        def _is_below(path, target_path):
            # compare with trailing slashes for cases like /foo and /foobar
            path = os.path.join(path, b"")
            target_path = os.path.join(target_path, b"")
            return path.startswith(target_path)

        # Helper: determine the directory mode to apply to the mountpoint at path
        # (path is given without the mount_base prefix).
        # Returns None if nothing needs to be done for this mountpoint.
        def find_mode_for_dir(path, fstype=None):
            if (path == b"/proc"):
                # /proc is necessary for the grandchild to read PID, will be replaced later.
                return DIR_READ_ONLY
            if _is_below(path, b"/proc"):
                # Irrelevant.
                return None

            # result_mode is the mode of the last entry in self._dir_modes that contains path,
            # parent_mode is the mode of the last such entry that is a strict parent of path.
            # NOTE(review): this presumably relies on self._dir_modes being ordered such that
            # more specific directories come after their parents - confirm against the caller.
            parent_mode = None
            result_mode = None
            for special_dir, mode in self._dir_modes.items():
                if _is_below(path, special_dir):
                    if path != special_dir:
                        parent_mode = mode
                    result_mode = mode
            assert result_mode is not None

            if result_mode == DIR_OVERLAY and (
                    _is_below(path, b"/dev") or
                    _is_below(path, b"/sys") or
                    fstype == b"cgroup"):
                # Overlay does not make sense for /dev, /sys, and all cgroups.
                return DIR_READ_ONLY

            if result_mode == DIR_OVERLAY and (
                    fstype == b"autofs" or
                    fstype == b"vfat" or
                    fstype == b"ntfs"):
                # Overlayfs does not support these as underlying file systems.
                logging.debug("Cannot use overlay mode for %s because it has file system %s. "
                              "Using read-only mode instead.",
                              path.decode(), fstype.decode())
                return DIR_READ_ONLY

            if result_mode == DIR_HIDDEN and parent_mode == DIR_HIDDEN:
                # No need to recursively recreate mountpoints in hidden dirs.
                return None
            return result_mode

        # Overlayfs needs its own additional temporary directory ("work" directory).
        # temp_base will be the "upper" layer, the host FS the "lower" layer,
        # and mount_base the mount target.
        work_base = os.path.join(temp_dir, b"overlayfs")
        os.mkdir(work_base)

        # Create a copy of host's mountpoints.
        # Setting MS_PRIVATE flag decouples our mount namespace from the host's,
        # i.e., mounts we do are not seen by the host, and any (un)mounts the host does afterward
        # are not seen by us. The latter is desired such that new mounts (e.g.,
        # USB sticks being plugged in) do not appear in the container.
        # Blocking host-side unmounts from being propagated has the disadvantage
        # that any unmounts done by the sysadmin won't really unmount the device
        # because it stays mounted in the container and thus keep the device busy
        # (cf. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=739593#85).
        # We could allow unmounts being propagated with MS_SLAVE instead of MS_PRIVATE,
        # but we prefer to have the mount namespace of the container being
        # unchanged during run execution.
        container.make_bind_mount(b"/", mount_base, recursive=True, private=True)

        # Ensure each special dir is a mountpoint such that the next loop covers it.
        # Failures are only logged here (best effort).
        for special_dir in self._dir_modes.keys():
            mount_path = mount_base + special_dir
            temp_path = temp_base + special_dir
            try:
                container.make_bind_mount(mount_path, mount_path)
            except OSError as e:
                # on btrfs, non-recursive bind mounts fail
                if e.errno == errno.EINVAL:
                    try:
                        container.make_bind_mount(mount_path, mount_path, recursive=True)
                    except OSError as e2:
                        logging.debug("Failed to make %s a (recursive) bind mount: %s", mount_path, e2)
                else:
                    logging.debug("Failed to make %s a bind mount: %s", mount_path, e)
            # Also create the matching directory below temp_base.
            if not os.path.exists(temp_path):
                os.makedirs(temp_path)

        # Set desired access mode for each mountpoint.
        # NOTE(review): list() presumably takes a snapshot of the mount points
        # because the loop body changes mounts while iterating - confirm.
        for unused_source, full_mountpoint, fstype, options in list(container.get_mount_points()):
            # Only mountpoints inside our new hierarchy are relevant.
            if not _is_below(full_mountpoint, mount_base):
                continue
            # Path of the mountpoint without the mount_base prefix ("/" for mount_base itself).
            mountpoint = full_mountpoint[len(mount_base):] or b"/"
            mode = find_mode_for_dir(mountpoint, fstype)
            if not mode:
                continue

            # Note that accessibility is checked on the path without the mount_base prefix.
            if not os.access(os.path.dirname(mountpoint), os.X_OK):
                # If parent is not accessible we cannot mount something on mountpoint.
                # We mark the inaccessible directory as hidden because otherwise the mountpoint
                # could become accessible (directly!) if the permissions on the parent
                # are relaxed during container execution.
                original_mountpoint = mountpoint
                parent = os.path.dirname(mountpoint)
                # Walk upwards until the parent is accessible,
                # mountpoint is then the topmost inaccessible directory.
                while not os.access(parent, os.X_OK):
                    mountpoint = parent
                    parent = os.path.dirname(mountpoint)
                mode = DIR_HIDDEN
                logging.debug(
                    "Marking inaccessible directory '%s' as hidden "
                    "because it contains a mountpoint at '%s'",
                    mountpoint.decode(), original_mountpoint.decode())
            else:
                logging.debug("Mounting '%s' as %s", mountpoint.decode(), mode)

            mount_path = mount_base + mountpoint
            temp_path = temp_base + mountpoint
            work_path = work_base + mountpoint

            if mode == DIR_OVERLAY:
                if not os.path.exists(temp_path):
                    os.makedirs(temp_path)
                if not os.path.exists(work_path):
                    os.makedirs(work_path)
                try:
                    # Previous mount in this place not needed if replaced with overlay dir.
                    libc.umount(mount_path)
                except OSError as e:
                    logging.debug(e)
                try:
                    container.make_overlay_mount(mount_path, mountpoint, temp_path, work_path)
                except OSError as e:
                    # Re-raise with the same errno but a message that names the directory.
                    raise OSError(e.errno,
                        "Creating overlay mount for '{}' failed: {}. "
                        "Please use other directory modes."
                            .format(mountpoint.decode(), os.strerror(e.errno)))

            elif mode == DIR_HIDDEN:
                if not os.path.exists(temp_path):
                    os.makedirs(temp_path)
                try:
                    # Previous mount in this place not needed if replaced with hidden dir.
                    libc.umount(mount_path)
                except OSError as e:
                    logging.debug(e)
                # Hide the directory by mounting the directory from temp_base over it.
                container.make_bind_mount(temp_path, mount_path)

            elif mode == DIR_READ_ONLY:
                try:
                    container.remount_with_additional_flags(mount_path, options, libc.MS_RDONLY)
                except OSError as e:
                    if e.errno == errno.EACCES:
                        logging.warning(
                            "Cannot mount '%s', directory may be missing from container.",
                            mountpoint.decode())
                    else:
                        # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                        # Linux does not support making read-only bind mounts in one step:
                        # https://lwn.net/Articles/281157/ http://man7.org/linux/man-pages/man8/mount.8.html
                        container.make_bind_mount(
                            mountpoint, mount_path, recursive=True, private=True)
                        container.remount_with_additional_flags(mount_path, options, libc.MS_RDONLY)

            elif mode == DIR_FULL_ACCESS:
                try:
                    # Ensure directory is still a mountpoint by attempting to remount.
                    container.remount_with_additional_flags(mount_path, options, 0)
                except OSError as e:
                    if e.errno == errno.EACCES:
                        logging.warning(
                            "Cannot mount '%s', directory may be missing from container.",
                            mountpoint.decode())
                    else:
                        # If this mountpoint is below an overlay/hidden dir re-create mountpoint.
                        container.make_bind_mount(
                            mountpoint, mount_path, recursive=True, private=True)

            else:
                # All other modes were handled above, so this must not be reached.
                assert False

        # Now configure some special hard-coded cases

        def make_tmpfs_dir(path):
            """Ensure that a tmpfs is mounted on path, if the path exists"""
            if path in self._dir_modes:
                return # explicitly configured by user
            mount_tmpfs = mount_base + path
            temp_tmpfs = temp_base + path
            util.makedirs(temp_tmpfs, exist_ok=True)
            if os.path.isdir(mount_tmpfs):
                # If we already have a tmpfs, we can just bind mount it, otherwise we need one
                if self._container_tmpfs:
                    container.make_bind_mount(temp_tmpfs, mount_tmpfs)
                else:
                    libc.mount(None, mount_tmpfs, b"tmpfs", 0, tmpfs_opts)

        # The following directories should be writable RAM disks for Posix shared memory.
        # For example, the Python multiprocessing module explicitly checks for a tmpfs instance.
        make_tmpfs_dir(b"/dev/shm")
        make_tmpfs_dir(b"/run/shm")

        if self._container_system_config:
            # If overlayfs is not used for /etc, we need additional bind mounts
            # for files in /etc that we want to override, like /etc/passwd
            config_mount_base = mount_base if find_mode_for_dir(b"/etc") != DIR_OVERLAY else None
            container.setup_container_system_config(temp_base, config_mount_base )

        if output_dir:
            # We need a way to see temp_base in the container in order to be able to copy result
            # files out of it, so we need a directory that is guaranteed to exist in order to use
            # it as mountpoint for a bind mount to temp_base.
            # Of course, the tool inside the container should not have access to temp_base,
            # so we will add another bind mount with an empty directory on top
            # (equivalent to --hidden-dir). After the tool terminates we can unmount
            # the top-level bind mount and then access temp_base. However, this works only
            # if there is no other mount point below that directory, and the user can force us
            # to create mount points at arbitrary directory if a directory mode is specified.
            # So we need an existing directory with no mount points below, and luckily temp_dir
            # fulfills all requirements (because we have just created it as fresh directory ourselves).
            # So we mount temp_base outside of the container to temp_dir inside.
            util.makedirs(mount_base + temp_dir, exist_ok=True)
            container.make_bind_mount(temp_base, mount_base + temp_dir, read_only=True)
            # And the following if branch will automatically hide the bind
            # mount below an empty directory.

        # If necessary, (i.e., if /tmp is not already hidden),
        # hide the directory where we store our files from processes in the container
        # by mounting an empty directory over it.
        if os.path.exists(mount_base + temp_dir):
            util.makedirs(temp_base + temp_dir, exist_ok=True)
            container.make_bind_mount(temp_base + temp_dir, mount_base + temp_dir)

        # From now on the new mount hierarchy is the root directory of this process.
        os.chroot(mount_base)
示例#6
0
        def child():
            """Entry point of the process that runs inside the container.

            Configures the container (hostname, loopback interface, filesystem),
            changes into the working directory, starts the tool as grandchild process,
            forwards signals to it, and sends its pickled result to the parent.
            Afterwards it waits until the parent has finished copying the output files.
            @return: 0 on success, CHILD_OSERROR or CHILD_UNKNOWN_ERROR on failure
                (never raises, because the caller is a C callback that needs an int)
            """
            try:
                own_pid = container.get_my_pid_from_procfs()
                logging.debug("Child: child process of RunExecutor with PID %d started", own_pid)

                # Signals are held back from now on and handled further below.
                container.block_all_signals()

                # No file descriptors should leak into the executed tool, and this process
                # should also hold only the descriptors it really needs, such that pipes and
                # files of the parent (e.g., for communicating with other containers started
                # in parallel) are not kept open. Instead of relying on close_fds of
                # subprocess.Popen, the descriptors are closed manually here.
                # What stays open: our ends of the pipes to the parent
                # and stdin/out/err of both this process and the grandchild.
                kept_fds = {
                    sys.stdin, sys.stdout, sys.stderr,
                    to_parent, from_parent,
                    stdin, stdout, stderr,
                }
                kept_fds.discard(None)
                container.close_open_fds(keep_files=kept_fds)

                try:
                    if self._container_system_config:
                        # Using a fixed hostname makes runs more reproducible.
                        libc.sethostname(container.CONTAINER_HOSTNAME)

                    if not self._allow_network:
                        container.activate_network_interface("lo")

                    # Filesystem writes need the user mapping,
                    # so block until the parent signals that it is set up.
                    marker = os.read(from_parent, len(MARKER_USER_MAPPING_COMPLETED))
                    assert marker == MARKER_USER_MAPPING_COMPLETED, marker

                    if root_dir is None:
                        self._setup_container_filesystem(
                            temp_dir,
                            output_dir if result_files_patterns else None,
                            memlimit,
                            memory_nodes)
                    else:
                        self._setup_root_filesystem(root_dir)
                except EnvironmentError as setup_error:
                    logging.critical("Failed to configure container: %s", setup_error)
                    return CHILD_OSERROR

                try:
                    os.chdir(cwd)
                except EnvironmentError as chdir_error:
                    logging.critical(
                        "Cannot change into working directory inside container: %s",
                        chdir_error)
                    return CHILD_OSERROR

                try:
                    tool_process = subprocess.Popen(
                        args,
                        stdin=stdin,
                        stdout=stdout,
                        stderr=stderr,
                        env=env,
                        close_fds=False,
                        preexec_fn=grandchild)
                except (EnvironmentError, RuntimeError) as start_error:
                    logging.critical("Cannot start process: %s", start_error)
                    return CHILD_OSERROR

                # The capability for unmounting is still needed later
                # if result files are to be retrieved, everything else is dropped.
                if result_files_patterns:
                    kept_capabilities = [libc.CAP_SYS_ADMIN]
                else:
                    kept_capabilities = []
                container.drop_capabilities(keep=kept_capabilities)

                # The remaining descriptors were only necessary up to this point.
                container.close_open_fds(
                    keep_files={sys.stdout, sys.stderr, to_parent, from_parent})

                # This process is PID 1 and thus has special signal handling, so signals
                # need to be forwarded explicitly to the grandchild
                # (cf. dumb-init project: https://github.com/Yelp/dumb-init).
                # At the same time wait for the grandchild and collect its result.
                tool_pid = tool_process.pid
                tool_name = args[0]
                if _HAS_SIGWAIT:
                    tool_result = container.wait_for_child_and_forward_all_signals(
                        tool_pid, tool_name)
                else:
                    container.forward_all_signals_async(tool_pid, tool_name)
                    tool_result = self._wait_for_process(tool_pid, tool_name)

                logging.debug("Child: process %s terminated with exit code %d.",
                              tool_name, tool_result[0])

                if result_files_patterns:
                    # Undo the bind mount created by _setup_container_filesystem
                    # in order to let the parent access the result files.
                    libc.umount(temp_dir.encode())

                os.write(to_parent, pickle.dumps(tool_result))
                os.close(to_parent)

                # The parent copies the output files now and tells us when it is done.
                # We must not terminate earlier,
                # because the container file system and its tmpfs vanish with this process.
                os.read(from_parent, 1)
                os.close(from_parent)

                return 0
            except EnvironmentError:
                logging.exception("Error in child process of RunExecutor")
                return CHILD_OSERROR
            except:
                # Deliberately catches everything: this function is called from a C callback
                # that requires an int as return value in all cases.
                logging.exception("Error in child process of RunExecutor")
                return CHILD_UNKNOWN_ERROR