示例#1
0
    def start_bundle(self, bundle):
        """
        Run the given bundle using an available Machine.

        The bundle must be in the QUEUED state and must not have a data_hash
        yet; both conditions are enforced with precondition().

        * RunBundle: the owner's username is looked up (falling back to the
          stringified owner id) and the bundle is handed to
          self.machine.start_bundle().  If that returns None, nothing was
          started.  If anything in that path raises, a failed status is
          built so that the bundle gets marked as failed.
        * Any other bundle is treated as a MakeBundle and counts as started
          right away; a MakeBundle gets a successful status immediately.

        Return whether something was started.
        """
        # Check that we're running a bundle in the QUEUED state.
        state_message = "Unexpected bundle state: %s" % (bundle.state,)
        precondition(bundle.state == State.QUEUED, state_message)
        data_hash_message = "Unexpected bundle data_hash: %s" % (bundle.data_hash,)
        precondition(bundle.data_hash is None, data_hash_message)

        # Run the bundle.
        with self.profile("Running bundle..."):
            started = False
            if isinstance(bundle, RunBundle):
                try:
                    # Get the username of the bundle's owner.
                    results = self.auth_handler.get_users("ids", [bundle.owner_id])
                    owner = results.get(bundle.owner_id)
                    if owner:
                        username = owner.name
                    else:
                        username = str(bundle.owner_id)

                    status = self.machine.start_bundle(
                        bundle, self.bundle_store, self.get_parent_dict(bundle), username
                    )
                    # None means the machine did not start the bundle.
                    # Identity test: None is a singleton.
                    if status is not None:
                        status["started"] = int(time.time())
                        started = True

                except Exception as e:
                    # If there's an exception, we just make the bundle fail
                    # (even if it's not the bundle's fault).
                    real_path = canonicalize.get_current_location(self.bundle_store, bundle.uuid)
                    path_util.make_directory(real_path)
                    status = {
                        "bundle": bundle,
                        "success": False,
                        "failure_message": "Internal error: " + str(e),
                        "temp_dir": real_path,
                    }
                    print("=== INTERNAL ERROR: %s" % e)
                    started = True  # Force failing
                    traceback.print_exc()
            else:  # MakeBundle
                started = True

            if started:
                print("-- START BUNDLE: %s" % (bundle,))
                self._update_events_log("start_bundle", bundle, (bundle.uuid,))

            # If we have a MakeBundle, then just process it immediately.
            if isinstance(bundle, MakeBundle):
                real_path = canonicalize.get_current_location(self.bundle_store, bundle.uuid)
                path_util.make_directory(real_path)
                status = {"bundle": bundle, "success": True, "temp_dir": real_path}

            # Update database
            if started:
                self.update_running_bundle(status)
            return started
示例#2
0
    def start_bundle(self, bundle):
        '''
        Run the given bundle using an available Machine.

        Preconditions (checked with precondition()): the bundle is QUEUED and
        has no data_hash.

        A RunBundle is dispatched through self.machine.start_bundle() with the
        owner's username; a None result means nothing was started, and an
        exception is turned into a failed status.  Any other bundle is
        treated as a MakeBundle and counts as started immediately.

        Return whether something was started.
        '''
        # Check that we're running a bundle in the QUEUED state.
        state_message = 'Unexpected bundle state: %s' % (bundle.state,)
        precondition(bundle.state == State.QUEUED, state_message)
        data_hash_message = 'Unexpected bundle data_hash: %s' % (bundle.data_hash,)
        precondition(bundle.data_hash is None, data_hash_message)

        # Run the bundle.
        with self.profile('Running bundle...'):
            started = False
            if isinstance(bundle, RunBundle):
                try:
                    # Get the username of the bundle's owner, falling back to
                    # the owner id when the user cannot be resolved.
                    results = self.auth_handler.get_users('ids', [bundle.owner_id])
                    owner = results.get(bundle.owner_id)
                    if owner:
                        username = owner.name
                    else:
                        username = str(bundle.owner_id)

                    status = self.machine.start_bundle(
                        bundle, self.bundle_store, self.get_parent_dict(bundle), username)
                    # Compare against the None singleton by identity.
                    if status is not None:
                        status['started'] = int(time.time())
                        started = True

                except Exception as e:
                    # If there's an exception, we just make the bundle fail
                    # (even if it's not the bundle's fault).
                    real_path = canonicalize.get_current_location(self.bundle_store, bundle.uuid)
                    path_util.make_directory(real_path)
                    status = {
                        'bundle': bundle,
                        'success': False,
                        'failure_message': 'Internal error: ' + str(e),
                        'temp_dir': real_path,
                    }
                    print('=== INTERNAL ERROR: %s' % e)
                    started = True  # Force failing
                    traceback.print_exc()
            else:  # MakeBundle
                started = True

            if started:
                print('-- START BUNDLE: %s' % (bundle,))
                self._update_events_log('start_bundle', bundle, (bundle.uuid,))

            # If we have a MakeBundle, then just process it immediately.
            if isinstance(bundle, MakeBundle):
                real_path = canonicalize.get_current_location(self.bundle_store, bundle.uuid)
                path_util.make_directory(real_path)
                status = {'bundle': bundle, 'success': True, 'temp_dir': real_path}

            # Update database
            if started:
                self.update_running_bundle(status)
            return started
示例#3
0
    def start_bundle(self, bundle):
        '''
        Run the given bundle using an available Machine.

        Preconditions (checked with precondition()): the bundle is in the
        RUNNING state and has no data_hash.

        A RunBundle is dispatched through self.machine.start_bundle() with the
        owner's username; a None result means nothing was started, and an
        exception is turned into a failed status carrying str(e) as the
        failure message.  Any other bundle is treated as a MakeBundle and
        counts as started immediately.

        Return whether something was started.
        '''
        # Check that we're running a bundle in the RUNNING state.
        state_message = 'Unexpected bundle state: %s' % (bundle.state,)
        precondition(bundle.state == State.RUNNING, state_message)
        data_hash_message = 'Unexpected bundle data_hash: %s' % (bundle.data_hash,)
        precondition(bundle.data_hash is None, data_hash_message)

        # Run the bundle.
        with self.profile('Running bundle...'):
            started = False
            if isinstance(bundle, RunBundle):
                try:
                    # Get the username of the bundle's owner, falling back to
                    # the owner id when the user cannot be resolved.
                    results = self.auth_handler.get_users('ids', [bundle.owner_id])
                    owner = results.get(bundle.owner_id)
                    if owner:
                        username = owner.name
                    else:
                        username = str(bundle.owner_id)

                    status = self.machine.start_bundle(
                        bundle, self.bundle_store, self.get_parent_dict(bundle), username)
                    # Compare against the None singleton by identity.
                    if status is not None:
                        started = True

                except Exception as e:
                    # If there's an exception, we just make the bundle fail
                    # (even if it's not the bundle's fault).
                    temp_dir = canonicalize.get_current_location(self.bundle_store, bundle.uuid)
                    path_util.make_directory(temp_dir)
                    status = {
                        'bundle': bundle,
                        'success': False,
                        'failure_message': str(e),
                        'temp_dir': temp_dir,
                    }
                    print('=== INTERNAL ERROR: %s' % e)
                    started = True  # Force failing
                    traceback.print_exc()
            else:  # MakeBundle
                started = True

            if started:
                print('-- START BUNDLE: %s' % (bundle,))

            # If we have a MakeBundle, then just process it immediately.
            if isinstance(bundle, MakeBundle):
                temp_dir = canonicalize.get_current_location(self.bundle_store, bundle.uuid)
                path_util.make_directory(temp_dir)
                status = {'bundle': bundle, 'success': True, 'temp_dir': temp_dir}

            # Update database
            if started:
                self.update_running_bundle(status)
            return started
示例#4
0
    def start_bundle(self, bundle, bundle_store, parent_dict, username):
        """
        Start a bundle in the background.

        Copies the bundle's dependencies into its working directory, writes a
        wrapper script next to that directory (<temp_dir>.sh) and launches it
        with bash.

        bundle_store / parent_dict: used to locate the working directory and
            the dependency paths.
        username: not used by this method.

        Returns None if this machine already has a bundle (self.bundle is
        set).  Otherwise returns a dict with the keys "bundle", "temp_dir" and
        "job_handle" (the pid of the launched process, as a string).
        """
        if self.bundle is not None:
            return None
        temp_dir = canonicalize.get_current_location(bundle_store, bundle.uuid)
        path_util.make_directory(temp_dir)

        # We don't follow symlinks (for consistency with remote
        # machine, where it is more secure, so people can't make us
        # copy random files on the system).  Of course in local mode,
        # if some of those symlinks are absolute, the run can
        # read/write those locations.  But we're not sandboxed, so
        # anything could happen.  The dependencies are copied, so in
        # practice, this is not a big worry.
        pairs = bundle.get_dependency_paths(bundle_store, parent_dict, temp_dir)
        print >> sys.stderr, "LocalMachine.start_bundle: copying dependencies of %s to %s" % (bundle.uuid, temp_dir)
        for (source, target) in pairs:
            path_util.copy(source, target, follow_symlinks=False)

        # NOTE(review): temp_dir and bundle.command are interpolated into the
        # script unquoted; bundle.command is deliberately run as shell code.
        script_file = temp_dir + ".sh"
        with open(script_file, "w") as f:
            f.write("cd %s &&\n" % temp_dir)
            f.write("(%s) > stdout 2>stderr\n" % bundle.command)

        # Launch with an argument list rather than a concatenated shell
        # string: the script path is then passed verbatim (spaces or shell
        # metacharacters in it cannot break the command), and the pid we
        # report is not that of an intermediate shell.
        # Use stdbuf (if it exists) to turn off buffering so we get real-time feedback.
        if os.path.exists("/usr/bin/stdbuf"):
            launch_args = ["/usr/bin/stdbuf", "-o0", "bash", script_file]
        else:
            launch_args = ["bash", script_file]
        process = subprocess.Popen(launch_args)

        self.bundle = bundle
        self.temp_dir = temp_dir
        self.process = process
        return {"bundle": bundle, "temp_dir": temp_dir, "job_handle": str(process.pid)}
示例#5
0
    def run_bundle(self, bundle):
        """
        Run the given bundle and then update its state to be either READY or FAILED.
        If the bundle is now READY, its data_hash should be set.

        The bundle must be in the RUNNING state with no data_hash (enforced
        with precondition()).  On failure, whatever upload_failed_bundle()
        returns is used as the result, and a "failure_message" entry is added
        to the metadata.  The temp directory is removed after the run has
        been finalized.
        """
        # Check that we're running a bundle in the RUNNING state.
        state_message = "Unexpected bundle state: %s" % (bundle.state,)
        precondition(bundle.state == State.RUNNING, state_message)
        data_hash_message = "Unexpected bundle data_hash: %s" % (bundle.data_hash,)
        precondition(bundle.data_hash is None, data_hash_message)

        # Compute a dict mapping parent_uuid -> parent for each dep of this bundle.
        parent_uuids = set(dep.parent_uuid for dep in bundle.dependencies)
        parents = self.model.batch_get_bundles(uuid=parent_uuids)
        parent_dict = {parent.uuid: parent for parent in parents}

        # Get temp directory
        temp_dir = canonicalize.get_current_location(self.bundle_store, bundle.uuid)

        # Run the bundle. Mark it READY if it is successful and FAILED otherwise.
        with self.profile("Running bundle..."):
            print("-- START RUN: %s" % (bundle,))
            try:
                (data_hash, metadata) = bundle.run(self.bundle_store, parent_dict, temp_dir)
                state = State.READY
            except Exception as error:
                # TODO(pliang): distinguish between internal CodaLab error and the program failing
                # TODO(skishore): Add metadata updates: time / CPU of run.
                # Format the traceback right away instead of keeping the
                # traceback object in a local variable: holding it in the
                # frame that handles the exception creates a reference cycle
                # (see the sys.exc_info() documentation).
                traceback_text = "".join(traceback.format_tb(sys.exc_info()[2]))[:-1]
                with self.profile("Uploading failed bundle..."):
                    (data_hash, metadata) = self.upload_failed_bundle(error, temp_dir)
                failure_message = "%s: %s" % (error.__class__.__name__, error)
                if data_hash:
                    suffix = "The results of the failed execution were uploaded."
                    failure_message = "%s\n%s" % (failure_message, suffix)
                elif not isinstance(error, UsageError):
                    # Only non-usage errors without uploaded results get the
                    # traceback prepended.
                    failure_message = "Traceback:\n%s\n%s" % (traceback_text, failure_message)
                metadata.update({"failure_message": failure_message})
                state = State.FAILED
            self.finalize_run(bundle, state, data_hash, metadata)
            print("-- END RUN: %s [%s]" % (bundle, state))

        # Clean up after the run.
        with self.profile("Cleaning up temp directory..."):
            path_util.remove(temp_dir)
示例#6
0
    def start_bundle(self, bundle, bundle_store, parent_dict, username):
        '''
        Start a bundle in the background.

        Copies the bundle's dependencies into its working directory, writes a
        wrapper script next to that directory (<temp_dir>.sh) and launches it
        with bash.

        bundle_store / parent_dict: used to locate the working directory and
            the dependency paths.
        username: not used by this method.

        Returns None if this machine already has a bundle (self.bundle is
        set).  Otherwise returns a dict with the keys 'bundle', 'temp_dir' and
        'job_handle' (the pid of the launched process, as a string).
        '''
        if self.bundle is not None:
            return None
        temp_dir = canonicalize.get_current_location(bundle_store, bundle.uuid)
        path_util.make_directory(temp_dir)

        # We don't follow symlinks (for consistency with remote
        # machine, where it is more secure, so people can't make us
        # copy random files on the system).  Of course in local mode,
        # if some of those symlinks are absolute, the run can
        # read/write those locations.  But we're not sandboxed, so
        # anything could happen.  The dependencies are copied, so in
        # practice, this is not a big worry.
        pairs = bundle.get_dependency_paths(bundle_store, parent_dict,
                                            temp_dir)
        print >> sys.stderr, 'LocalMachine.start_bundle: copying dependencies of %s to %s' % (
            bundle.uuid, temp_dir)
        for (source, target) in pairs:
            path_util.copy(source, target, follow_symlinks=False)

        # NOTE(review): temp_dir and bundle.command are interpolated into the
        # script unquoted; bundle.command is deliberately run as shell code.
        script_file = temp_dir + '.sh'
        with open(script_file, 'w') as f:
            f.write("cd %s &&\n" % temp_dir)
            f.write('(%s) > stdout 2>stderr\n' % bundle.command)

        # Launch with an argument list rather than a concatenated shell
        # string: the script path is then passed verbatim (spaces or shell
        # metacharacters in it cannot break the command), and the pid we
        # report is not that of an intermediate shell.
        # Use stdbuf (if it exists) to turn off buffering so we get real-time feedback.
        if os.path.exists('/usr/bin/stdbuf'):
            launch_args = ['/usr/bin/stdbuf', '-o0', 'bash', script_file]
        else:
            launch_args = ['bash', script_file]
        process = subprocess.Popen(launch_args)

        self.bundle = bundle
        self.temp_dir = temp_dir
        self.process = process
        return {
            'bundle': bundle,
            'temp_dir': temp_dir,
            'job_handle': str(process.pid)
        }
    def start_bundle(self, bundle, bundle_store, parent_dict, username):
        '''
        Sets up all the temporary files and then dispatches the job.

        Steps: create the bundle's temp directory, copy its dependencies
        there, write <temp_dir>.sh (plus <temp_dir>-internal.sh when a docker
        image is used), and invoke the dispatch command with 'start'.

        username: the username of the owner of the bundle; forwarded to the
            dispatcher as --username when non-empty.

        Returns the bundle information: a dict with the keys 'bundle',
        'temp_dir', 'job_handle' (the 'handle' field of the dispatcher's JSON
        output) and 'docker_image'.
        '''
        # Create a temporary directory
        temp_dir = canonicalize.get_current_location(bundle_store, bundle.uuid)
        temp_dir = os.path.realpath(temp_dir)  # Follow symlinks
        path_util.make_directory(temp_dir)

        # Copy all the dependencies to that temporary directory.
        pairs = bundle.get_dependency_paths(bundle_store, parent_dict, temp_dir)
        print >>sys.stderr, 'RemoteMachine.start_bundle: copying dependencies of %s to %s' % (bundle.uuid, temp_dir)
        for (source, target) in pairs:
            path_util.copy(source, target, follow_symlinks=False)

        # Set defaults for the dispatcher: a truthy request on the bundle
        # wins, otherwise fall back to this machine's default.
        metadata = bundle.metadata
        docker_image = metadata.request_docker_image or self.default_docker_image
        request_time = metadata.request_time or self.default_request_time
        request_memory = metadata.request_memory or self.default_request_memory
        request_cpus = metadata.request_cpus or self.default_request_cpus
        request_gpus = metadata.request_gpus or self.default_request_gpus
        request_queue = metadata.request_queue or self.default_request_queue
        request_priority = metadata.request_priority or self.default_request_priority

        script_file = temp_dir + '.sh'  # main entry point
        ptr_temp_dir = '$temp_dir'
        # 1) If no argument to script_file, use the temp_dir (e.g., Torque, master/worker share file system).
        # 2) If argument is 'use_script_for_temp_dir', use the script to determine temp_dir (e.g., qsub, no master/worker do not share file system).
        set_temp_dir_header = 'if [ -z "$1" ]; then temp_dir=' + temp_dir + '; else temp_dir=`readlink -f $0 | sed -e \'s/\\.sh$//\'`; fi\n'

        # Write the command to be executed to a script.
        if docker_image:
            internal_script_file = temp_dir + '-internal.sh'  # run inside the docker container
            # These paths depend on $temp_dir, an environment variable which will be set (referenced inside script_file)
            ptr_container_file = ptr_temp_dir + '.cid'  # contains the docker container id
            ptr_action_file = ptr_temp_dir + '.action'  # send actions to the container (e.g., kill)
            ptr_status_dir = ptr_temp_dir + '.status'  # receive information from the container (e.g., memory)
            ptr_internal_script_file = ptr_temp_dir + '-internal.sh'  # run inside the docker container
            # Names of file inside the docker container
            docker_temp_dir = bundle.uuid
            docker_internal_script_file = bundle.uuid + '-internal.sh'

            def copy_if_exists(source_template, arg, target):
                # Shell snippet that copies the file named by
                # source_template % arg into target, but only if both arg
                # and that file exist.
                source = source_template % arg
                # -f because target might be read-only
                return 'if [ -e %s ] && [ -e %s ]; then cp -f %s %s; fi' % (arg, source, source, target)

            # Monitor CPU/memory/disk
            monitor_commands = [
                # Report on status (memory, cpu, etc.)
                'mkdir -p %s' % ptr_status_dir,
                'if [ -e /cgroup ]; then cgroup=/cgroup; else cgroup=/sys/fs/cgroup; fi',  # find where cgroup is
                copy_if_exists('$cgroup/cpuacct/docker/$(cat %s)/cpuacct.stat', ptr_container_file, ptr_status_dir),
                copy_if_exists('$cgroup/memory/docker/$(cat %s)/memory.usage_in_bytes', ptr_container_file, ptr_status_dir),
                copy_if_exists('$cgroup/blkio/docker/$(cat %s)/blkio.throttle.io_service_bytes', ptr_container_file, ptr_status_dir),
                # Respond to kill action
                '[ -e %s ] && [ "$(cat %s)" == "kill" ] && docker kill $(cat %s) && rm %s' % (ptr_action_file, ptr_action_file, ptr_container_file, ptr_action_file),
                # Sleep
                'sleep 1',
            ]

            # Tell docker to constrain resources (memory).
            # Note: limiting memory is not always supported. See:
            # http://programster.blogspot.com/2014/09/docker-implementing-container-memory.html
            # Only an explicit request on the bundle limits the container;
            # the machine default is not applied here.
            docker_resource_args = ''
            if metadata.request_memory:
                docker_resource_args += ' -m %s' % int(formatting.parse_size(metadata.request_memory))
            # TODO: would constrain --cpuset=0, but difficult because don't know the CPU ids

            # 1) script_file starts the docker container and runs internal_script_file in docker.
            # --rm removes the docker container once the job terminates (note that this makes things slow)
            # -v mounts the internal and user scripts and the temp directory
            with open(script_file, 'w') as f:
                f.write(set_temp_dir_header)
                # The monitor loop runs in the background for as long as the
                # temp directory exists.
                f.write('while [ -e %s ]; do\n  %s\ndone &\n' % (ptr_temp_dir, '\n  '.join(monitor_commands)))
                f.write("docker run%s --rm --cidfile %s -u %s -v %s:/%s -v %s:/%s %s bash %s & wait $!\n" % (
                    docker_resource_args,
                    ptr_container_file,
                    os.geteuid(),
                    ptr_temp_dir, docker_temp_dir,
                    ptr_internal_script_file, docker_internal_script_file,
                    docker_image,
                    docker_internal_script_file))

            # 2) internal_script_file runs the actual command inside the docker container
            with open(internal_script_file, 'w') as f:
                # Make sure I have a username.
                # Kept in its own variable: the `username` parameter is the
                # bundle owner and must still reach --username below.
                os_username = pwd.getpwuid(os.getuid())[0]  # do this because os.getlogin() doesn't always work
                f.write("echo %s::%s:%s::/:/bin/bash >> /etc/passwd\n" % (os_username, os.geteuid(), os.getgid()))
                # Do this because .bashrc isn't sourced automatically (even with --login, though it works with docker -t -i, strange...)
                f.write(". .bashrc || exit 1\n")
                # Go into the temp directory
                f.write("cd %s &&\n" % docker_temp_dir)
                # Run the actual command
                f.write('(%s) > stdout 2>stderr\n' % bundle.command)
        else:
            # Just run the command regularly without docker
            with open(script_file, 'w') as f:
                f.write(set_temp_dir_header)
                f.write("cd %s &&\n" % ptr_temp_dir)
                f.write('(%s) > stdout 2>stderr\n' % bundle.command)

        # Determine resources to request
        resource_args = []
        if request_time:
            resource_args.extend(['--request_time', formatting.parse_duration(request_time)])
        if request_memory:
            resource_args.extend(['--request_memory', formatting.parse_size(request_memory)])
        if request_cpus:
            resource_args.extend(['--request_cpus', request_cpus])
        if request_gpus:
            resource_args.extend(['--request_gpus', request_gpus])
        if request_queue:
            resource_args.extend(['--request_queue', request_queue])
        if request_priority:
            resource_args.extend(['--request_priority', request_priority])
        if username:
            resource_args.extend(['--username', username])

        # Start the command
        args = self.dispatch_command.split() + ['start'] + [str(arg) for arg in resource_args] + [script_file]
        if self.verbose >= 1:
            print('=== start_bundle(): running %s' % args)
        result = json.loads(self.run_command_get_stdout(args))
        if self.verbose >= 1:
            print('=== start_bundle(): got %s' % result)

        # Return the information about the job.
        return {
            'bundle': bundle,
            'temp_dir': temp_dir,
            'job_handle': result['handle'],
            'docker_image': docker_image,
        }
示例#8
0
    def start_bundle(self, bundle, bundle_store, parent_dict, username):
        '''
        Sets up all the temporary files and then dispatches the job.

        Steps: create the bundle's temp directory, copy its dependencies
        there, write <temp_dir>.sh (plus <temp_dir>-internal.sh when a docker
        image is used), and invoke the dispatch command with 'start'.

        username: the username of the owner of the bundle; forwarded to the
            dispatcher as --username when non-empty.

        Returns the bundle information: a dict with the keys 'bundle',
        'temp_dir', 'job_handle' (the 'handle' field of the dispatcher's JSON
        output) and 'docker_image'.
        '''
        # Function-scope import so this method does not depend on the
        # module's import block.
        import pwd

        # Create a temporary directory
        temp_dir = canonicalize.get_current_location(bundle_store, bundle.uuid)
        temp_dir = os.path.realpath(temp_dir)  # Follow symlinks
        path_util.make_directory(temp_dir)

        # Copy all the dependencies to that temporary directory.
        pairs = bundle.get_dependency_paths(bundle_store, parent_dict, temp_dir)
        print >>sys.stderr, 'RemoteMachine.start_bundle: copying dependencies of %s to %s' % (bundle.uuid, temp_dir)
        for (source, target) in pairs:
            path_util.copy(source, target, follow_symlinks=False)

        # Set docker image: the bundle's request wins over the default.
        metadata = bundle.metadata
        docker_image = metadata.request_docker_image or self.default_docker_image

        script_file = temp_dir + '.sh'  # main entry point

        # Write the command to be executed to a script.
        if docker_image:
            container_file = temp_dir + '.cid'  # contains the docker container id
            action_file = temp_dir + '.action'  # send actions to the container (e.g., kill)
            status_dir = temp_dir + '.status'  # receive information from the container (e.g., memory)
            internal_script_file = temp_dir + '-internal.sh'  # run inside the docker container
            # Names of file inside the docker container
            docker_temp_dir = bundle.uuid
            docker_internal_script_file = bundle.uuid + '-internal.sh'

            # Monitor CPU/memory/disk
            monitor_commands = [
                # Report on status
                'mkdir -p %s' % status_dir,
                'if [ -e /cgroup ]; then cgroup=/cgroup; else cgroup=/sys/fs/cgroup; fi',  # find where cgroup is
                'cp -f $cgroup/cpuacct/docker/$(cat %s)/cpuacct.stat %s' % (container_file, status_dir),
                'cp -f $cgroup/memory/docker/$(cat %s)/memory.usage_in_bytes %s' % (container_file, status_dir),
                'cp -f $cgroup/blkio/docker/$(cat %s)/blkio.throttle.io_service_bytes %s' % (container_file, status_dir),
                # Respond to actions
                '[ -e %s ] && [ "$(cat %s)" == "kill" ] && docker kill $(cat %s) && rm %s' % (action_file, action_file, container_file, action_file),
            ]

            # Constrain resources
            docker_resource_args = ''
            if metadata.request_memory:
                docker_resource_args += ' -m %s' % int(formatting.parse_size(metadata.request_memory))
            # TODO: would constrain --cpuset=0, but difficult because don't know the CPU ids

            # 1) script_file starts the docker container and runs internal_script_file in docker.
            # --rm removes the docker container once the job terminates (note that this makes things slow)
            # -v mounts the internal and user scripts and the temp directory
            with open(script_file, 'w') as f:
                # trap doesn't quite work reliably with Torque, so don't use it
                #f.write('trap \'echo Killing docker container $(cat %s); docker kill $(cat %s); echo Killed: $?; exit 143\' TERM\n' % (container_file, container_file))
                # Inspect doesn't tell us a lot, so don't use it
                #f.write('while [ -e %s ]; do docker inspect $(cat %s) > %s; sleep 1; done &\n' % (temp_dir, container_file, status_dir))

                # The monitor loop runs in the background for as long as the
                # temp directory exists.
                f.write('while [ -e %s ]; do %s; sleep 1; done &\n' % (temp_dir, '; '.join(monitor_commands)))

                f.write("docker run%s --rm --cidfile %s -u %s -v %s:/%s -v %s:/%s %s bash %s & wait $!\n" % (
                    docker_resource_args,
                    container_file, os.geteuid(),
                    temp_dir, docker_temp_dir,
                    internal_script_file, docker_internal_script_file,
                    docker_image, docker_internal_script_file))

            # 2) internal_script_file runs the actual command inside the docker container
            with open(internal_script_file, 'w') as f:
                # Make sure I have a username.
                # os.getlogin() needs a controlling terminal and raises
                # without one, so look the name up in the password database.
                os_username = pwd.getpwuid(os.getuid())[0]
                f.write("echo %s::%s:%s::/:/bin/bash >> /etc/passwd\n" % (os_username, os.geteuid(), os.getgid()))
                # Do this because .bashrc isn't sourced automatically (even with --login, though it works with docker -t -i, strange...)
                f.write(". .bashrc || exit 1\n")
                # Go into the temp directory
                f.write("cd %s &&\n" % docker_temp_dir)
                # Run the actual command
                f.write('(%s) > stdout 2>stderr\n' % bundle.command)
        else:
            # Just run the command regularly without docker
            with open(script_file, 'w') as f:
                f.write("cd %s &&\n" % temp_dir)
                f.write('(%s) > stdout 2>stderr\n' % bundle.command)

        # Determine resources to request
        resource_args = []
        if metadata.request_time:
            resource_args.extend(['--request_time', formatting.parse_duration(metadata.request_time)])
        if metadata.request_memory:
            resource_args.extend(['--request_memory', formatting.parse_size(metadata.request_memory)])
        if metadata.request_cpus:
            resource_args.extend(['--request_cpus', metadata.request_cpus])
        if metadata.request_gpus:
            resource_args.extend(['--request_gpus', metadata.request_gpus])
        if metadata.request_queue:
            resource_args.extend(['--request_queue', metadata.request_queue])
        if username:
            resource_args.extend(['--username', username])

        # Start the command
        args = self.dispatch_command.split() + ['start'] + [str(arg) for arg in resource_args] + [script_file]
        if self.verbose >= 1:
            print('=== start_bundle(): running %s' % args)
        result = json.loads(self.run_command_get_stdout(args))
        if self.verbose >= 1:
            print('=== start_bundle(): got %s' % result)

        # Return the information about the job.
        return {
            'bundle': bundle,
            'temp_dir': temp_dir,
            'job_handle': result['handle'],
            'docker_image': docker_image,
        }
示例#9
0
    def start_bundle(self, bundle, bundle_store, parent_dict, username):
        '''
        Sets up all the temporary files and then dispatches the job.
        username: the username of the owner of the bundle
        Returns the bundle information.
        '''
        # Create a temporary directory
        temp_dir = canonicalize.get_current_location(bundle_store, bundle.uuid)
        temp_dir = os.path.realpath(temp_dir)  # Follow symlinks
        path_util.make_directory(temp_dir)

        # Copy all the dependencies to that temporary directory.
        pairs = bundle.get_dependency_paths(bundle_store, parent_dict, temp_dir)
        print >>sys.stderr, 'RemoteMachine.start_bundle: copying dependencies of %s to %s' % (bundle.uuid, temp_dir)
        for (source, target) in pairs:
            path_util.copy(source, target, follow_symlinks=False)

        # Set defaults for the dispatcher.
        docker_image = bundle.metadata.request_docker_image or self.default_docker_image
        # Parse |request_string| using |to_value|, but don't exceed |max_value|.
        def parse_and_min(to_value, request_string, default_value, max_value):
            # Use default if request value doesn't exist
            if request_string:
                request_value = to_value(request_string)
            else:
                request_value = default_value
            if request_value and max_value:
                return int(min(request_value, max_value))
            elif request_value:
                return int(request_value)
            elif max_value:
                return int(max_value)
            else:
                return None
        request_time = parse_and_min(formatting.parse_duration, bundle.metadata.request_time, self.default_request_time, self.max_request_time)
        request_memory = parse_and_min(formatting.parse_size, bundle.metadata.request_memory, self.default_request_memory, self.max_request_memory)
        request_disk = parse_and_min(formatting.parse_size, bundle.metadata.request_disk, self.default_request_disk, self.max_request_disk)

        request_cpus = bundle.metadata.request_cpus or self.default_request_cpus
        request_gpus = bundle.metadata.request_gpus or self.default_request_gpus
        request_queue = bundle.metadata.request_queue or self.default_request_queue
        request_priority = bundle.metadata.request_priority or self.default_request_priority
        request_network = bundle.metadata.request_network or self.default_request_network

        script_file = temp_dir + '.sh'  # main entry point
        ptr_temp_dir = '$temp_dir'
        # 1) If no argument to script_file, use the temp_dir (e.g., Torque, master/worker share file system).
        # 2) If argument is 'use_script_for_temp_dir', use the script to determine temp_dir (e.g., qsub, no master/worker do not share file system).
        set_temp_dir_header = 'if [ -z "$1" ]; then temp_dir=' + temp_dir + '; else temp_dir=`readlink -f $0 | sed -e \'s/\\.sh$//\'`; fi\n'

        # Write the command to be executed to a script.
        internal_script_file = temp_dir + '-internal.sh'  # run inside the docker container
        # These paths depend on $temp_dir, an environment variable which will be set (referenced inside script_file)
        ptr_container_file = ptr_temp_dir + '.cid'  # contains the docker container id
        ptr_action_file = ptr_temp_dir + '.action'  # send actions to the container (e.g., kill)
        ptr_status_dir = ptr_temp_dir + '.status'  # receive information from the container (e.g., memory)
        ptr_script_file = ptr_temp_dir + '.sh'  # main entry point
        ptr_internal_script_file = ptr_temp_dir + '-internal.sh'  # run inside the docker container
        # Names of file inside the docker container
        docker_temp_dir = '/' + bundle.uuid
        docker_internal_script_file = '/' + bundle.uuid + '-internal.sh'

        # 1) script_file starts the docker container and runs internal_script_file in docker.
        # --rm removes the docker container once the job terminates (note that this makes things slow)
        # -v mounts the internal and user scripts and the temp directory
        # Trap SIGTERM and forward it to docker.
        with open(script_file, 'w') as f:
            f.write(set_temp_dir_header)

            # Monitor CPU/memory/disk
            # Used to copy status about the docker container.
            def copy_if_exists(source_template, arg, target):
                source = source_template % arg
                # -f because target might be read-only
                return 'if [ -e %s ] && [ -e %s ]; then cp -f %s %s; fi' % (arg, source, source, target)

            def get_field(path, col):
                return 'cat %s | cut -f%s -d\'%s\'' % (path, col, BundleAction.SEPARATOR)

            monitor_commands = [
                # Report on status (memory, cpu, etc.)
                'mkdir -p %s' % ptr_status_dir,
                'if [ -e /cgroup ]; then cgroup=/cgroup; else cgroup=/sys/fs/cgroup; fi',  # find where cgroup is
                copy_if_exists('$cgroup/cpuacct/docker/$(cat %s)/cpuacct.stat', ptr_container_file, ptr_status_dir),
                copy_if_exists('$cgroup/memory/docker/$(cat %s)/memory.usage_in_bytes', ptr_container_file, ptr_status_dir),
                copy_if_exists('$cgroup/blkio/docker/$(cat %s)/blkio.throttle.io_service_bytes', ptr_container_file, ptr_status_dir),
                # Enforce memory limits
                '[ -e "%s/memory.usage_in_bytes" ] && mem=$(cat %s/memory.usage_in_bytes)' % (ptr_status_dir, ptr_status_dir),
                'echo "memory: $mem (max %s)"' % request_memory,
                'if [ -n "$mem" ] && [ "$mem" -gt "%s" ]; then echo "[CodaLab] Memory limit exceeded: $mem > %s, terminating." >> %s/stderr; docker kill $(cat %s); break; fi' % \
                    (request_memory, request_memory, ptr_temp_dir, ptr_container_file),
                # Enforce disk limits
                'disk=$(du -sb %s | cut -f1)' % ptr_temp_dir,
                'echo "disk: $disk (max %s)"' % request_disk,
                'if [ -n "$disk" ] && [ "$disk" -gt "%s" ]; then echo "[CodaLab] Disk limit exceeded: $disk > %s, terminating." >> %s/stderr; docker kill $(cat %s); break; fi' % \
                    (request_disk, request_disk, ptr_temp_dir, ptr_container_file),
                # Execute "kill"
                'if [ -e %s ] && [ "$(cat %s)" == "kill" ]; then echo "[CodaLab] Received kill command, terminating." >> %s/stderr; docker kill $(cat %s); rm %s; break; fi' % \
                    (ptr_action_file, ptr_action_file, ptr_temp_dir, ptr_container_file, ptr_action_file),
                # Execute "write <subpath> <contents>"
                'if [ -e %s ] && [ "$(%s)" == "write" ]; then echo Writing...; %s > %s/$(%s); rm %s; fi' % \
                    (ptr_action_file, get_field(ptr_action_file, 1),
                    get_field(ptr_action_file, '3-'), ptr_temp_dir, get_field(ptr_action_file, 2),
                    ptr_action_file),
                # Sleep
                'sleep 1',
            ]
            f.write('while [ -e %s ]; do\n  %s\ndone &\n' % (ptr_temp_dir, '\n  '. join(monitor_commands)))

            resource_args = ''
            # Limiting memory in docker is not (always) supported. So we rely on bash (see above).
            # http://programster.blogspot.com/2014/09/docker-implementing-container-memory.html
            #if request_memory:
            #    resource_args += ' -m %s' % int(formatting.parse_size(request_memory))
            # TODO: would constrain --cpuset=0, but difficult because don't know the CPU ids

            # Attach all GPUs if any. Note that only the 64-bit version of
            # libcuda.so is picked up.
            f.write('devices=$(/bin/ls /dev/nvidia* 2>/dev/null)\n')
            f.write('if [ -n "$devices" ]; then devices=$(for d in $devices; do echo --device $d:$d; done); fi\n')
            f.write('libcuda=$(/sbin/ldconfig -p 2>/dev/null | grep "libcuda.so$" | grep "x86-64" | head -n 1 | cut -d " " -f 4)\n')
            f.write('if [ -n "$libcuda" ]; then libcuda=" -v $libcuda:/usr/lib/x86_64-linux-gnu/libcuda.so:ro"; fi\n')
            resource_args += ' $devices$libcuda'

            # Enable network?
            if not request_network:
                resource_args += ' --net=none'

            f.write("docker run%s --rm --cidfile %s -u %s -v %s:%s -v %s:%s -e HOME=%s %s bash %s >%s/stdout 2>%s/stderr & wait $!\n" % (
                resource_args,
                ptr_container_file,
                os.geteuid(),
                ptr_temp_dir, docker_temp_dir,
                ptr_internal_script_file, docker_internal_script_file,
                docker_temp_dir,
                docker_image,
                docker_internal_script_file,
                ptr_temp_dir, ptr_temp_dir))

        # 2) internal_script_file runs the actual command inside the docker container
        with open(internal_script_file, 'w') as f:
            # Make sure I have a username
            username = pwd.getpwuid(os.getuid())[0]  # do this because os.getlogin() doesn't always work
            f.write("[ -w /etc/passwd ] && echo %s::%s:%s::/:/bin/bash >> /etc/passwd\n" % (username, os.geteuid(), os.getgid()))
            # Do this because .bashrc isn't sourced automatically (even with --login, though it works with docker -t -i, strange...)
            f.write("[ -e .bashrc ] && . .bashrc\n")
            # Go into the temp directory
            f.write("cd %s &&\n" % docker_temp_dir)
            # Run the actual command
            f.write('(%s) >>stdout 2>>stderr\n' % bundle.command)

        # Determine resources to request
        resource_args = []
        if request_time:
            resource_args.extend(['--request-time', request_time])
        if request_memory:
            resource_args.extend(['--request-memory', request_memory])
        if request_disk:
            resource_args.extend(['--request-disk', request_disk])
        if request_cpus:
            resource_args.extend(['--request-cpus', request_cpus])
        if request_gpus:
            resource_args.extend(['--request-gpus', request_gpus])
        if request_queue:
            resource_args.extend(['--request-queue', request_queue])
        if request_priority:
            resource_args.extend(['--request-priority', request_priority])
        if username:
            resource_args.extend(['--username', username])

        # Start the command
        args = self.dispatch_command.split() + ['start'] + map(str, resource_args) + [script_file]
        if self.verbose >= 1: print '=== start_bundle(): running %s' % args
        result = json.loads(self.run_command_get_stdout(args))
        if self.verbose >= 1: print '=== start_bundle(): got %s' % result

        if not result['handle']:
            raise SystemError('Starting bundle failed')

        # Return the information about the job.
        return {
            'bundle': bundle,
            'temp_dir': temp_dir,
            'job_handle': result['handle'],
            'docker_image': docker_image,
            'request_time': str(request_time) if request_time else None,
            'request_memory': str(request_memory) if request_memory else None,
            'request_disk': str(request_disk) if request_disk else None,
            'request_cpus': request_cpus,
            'request_gpus': request_gpus,
            'request_queue': request_queue,
            'request_priority': request_priority,
            'request_network': request_network,
        }
示例#10
0
    def start_bundle(self, bundle, bundle_store, parent_dict, username):
        '''
        Sets up all the temporary files for the bundle and then dispatches the job.

        bundle: the bundle to run; its uuid, metadata and command are read here.
        bundle_store: used to locate the bundle's directory and its dependencies.
        parent_dict: passed through to bundle.get_dependency_paths().
        username: the username of the owner of the bundle; forwarded to the
            dispatch command via --username.

        Returns a dict with the keys 'bundle', 'temp_dir', 'job_handle' and
        'docker_image'.
        Raises SystemError if the dispatch command's reply has an empty 'handle'.
        '''
        # Create a temporary directory
        temp_dir = canonicalize.get_current_location(bundle_store, bundle.uuid)
        temp_dir = os.path.realpath(temp_dir)  # Follow symlinks
        path_util.make_directory(temp_dir)

        # Copy all the dependencies to that temporary directory.
        pairs = bundle.get_dependency_paths(bundle_store, parent_dict, temp_dir)
        print >> sys.stderr, 'RemoteMachine.start_bundle: copying dependencies of %s to %s' % (
            bundle.uuid, temp_dir)
        for (source, target) in pairs:
            path_util.copy(source, target, follow_symlinks=False)

        # Set defaults for the dispatcher: a value requested in the bundle's
        # metadata wins, otherwise fall back to this machine's default.
        metadata = bundle.metadata
        docker_image = metadata.request_docker_image or self.default_docker_image
        request_time = metadata.request_time or self.default_request_time
        request_memory = metadata.request_memory or self.default_request_memory
        request_cpus = metadata.request_cpus or self.default_request_cpus
        request_gpus = metadata.request_gpus or self.default_request_gpus
        request_queue = metadata.request_queue or self.default_request_queue
        request_priority = metadata.request_priority or self.default_request_priority

        script_file = temp_dir + '.sh'  # main entry point
        ptr_temp_dir = '$temp_dir'
        # 1) If no argument to script_file, use the temp_dir (e.g., Torque, master/worker share file system).
        # 2) If argument is 'use_script_for_temp_dir', use the script to determine temp_dir (e.g., qsub, master/worker do not share file system).
        set_temp_dir_header = 'if [ -z "$1" ]; then temp_dir=' + temp_dir + '; else temp_dir=`readlink -f $0 | sed -e \'s/\\.sh$//\'`; fi\n'

        # Write the command to be executed to a script.
        if docker_image:
            internal_script_file = temp_dir + '-internal.sh'  # run inside the docker container
            # These paths depend on $temp_dir, a shell variable which is set by set_temp_dir_header inside script_file
            ptr_container_file = ptr_temp_dir + '.cid'  # contains the docker container id
            ptr_action_file = ptr_temp_dir + '.action'  # send actions to the container (e.g., kill)
            ptr_status_dir = ptr_temp_dir + '.status'  # receive information from the container (e.g., memory)
            ptr_internal_script_file = ptr_temp_dir + '-internal.sh'  # run inside the docker container
            # Names of files inside the docker container (mounted under '/', see the -v options below)
            docker_temp_dir = bundle.uuid
            docker_internal_script_file = bundle.uuid + '-internal.sh'

            # 1) script_file starts the docker container and runs internal_script_file in docker.
            # --rm removes the docker container once the job terminates (note that this makes things slow)
            # -v mounts the internal script and the temp directory
            with open(script_file, 'w') as f:
                f.write(set_temp_dir_header)

                # Monitor CPU/memory/disk
                def copy_if_exists(source_template, arg, target):
                    # Shell snippet: copy the file named by source_template % arg
                    # to target, but only if both arg and that file exist.
                    source = source_template % arg
                    # -f because target might be read-only
                    return 'if [ -e %s ] && [ -e %s ]; then cp -f %s %s; fi' % (
                        arg, source, source, target)

                monitor_commands = [
                    # Report on status (memory, cpu, etc.)
                    'mkdir -p %s' % ptr_status_dir,
                    'if [ -e /cgroup ]; then cgroup=/cgroup; else cgroup=/sys/fs/cgroup; fi',  # find where cgroup is
                    copy_if_exists(
                        '$cgroup/cpuacct/docker/$(cat %s)/cpuacct.stat',
                        ptr_container_file, ptr_status_dir),
                    copy_if_exists(
                        '$cgroup/memory/docker/$(cat %s)/memory.usage_in_bytes',
                        ptr_container_file, ptr_status_dir),
                    copy_if_exists(
                        '$cgroup/blkio/docker/$(cat %s)/blkio.throttle.io_service_bytes',
                        ptr_container_file, ptr_status_dir),
                    # Respond to kill action
                    '[ -e %s ] && [ "$(cat %s)" == "kill" ] && docker kill $(cat %s) && rm %s'
                    % (ptr_action_file, ptr_action_file, ptr_container_file,
                       ptr_action_file),
                    # Sleep
                    'sleep 1',
                ]
                # The monitor loop runs in the background for as long as the temp directory exists.
                f.write('while [ -e %s ]; do\n  %s\ndone &\n' %
                        (ptr_temp_dir, '\n  '.join(monitor_commands)))

                # Tell docker to constrain resources (memory).
                # Only an explicit request in the bundle's metadata is passed to
                # docker; the machine default is not applied here.
                # Note: limiting memory is not always supported. See:
                # http://programster.blogspot.com/2014/09/docker-implementing-container-memory.html
                docker_resource_args = ''
                if metadata.request_memory:
                    docker_resource_args += ' -m %s' % int(
                        formatting.parse_size(metadata.request_memory))
                # TODO: would constrain --cpuset=0, but difficult because don't know the CPU ids

                f.write(
                    "docker run%s --rm --cidfile %s -u %s -v %s:/%s -v %s:/%s %s bash %s >%s/stdout 2>%s/stderr & wait $!\n"
                    %
                    (docker_resource_args, ptr_container_file, os.geteuid(),
                     ptr_temp_dir, docker_temp_dir, ptr_internal_script_file,
                     docker_internal_script_file, docker_image,
                     docker_internal_script_file, ptr_temp_dir, ptr_temp_dir))

            # 2) internal_script_file runs the actual command inside the docker container
            with open(internal_script_file, 'w') as f:
                # Make sure the user running inside the container has a passwd entry.
                # Kept in its own variable so the |username| argument (the bundle
                # owner, passed to the dispatcher below) is not overwritten.
                os_username = pwd.getpwuid(os.getuid())[
                    0]  # do this because os.getlogin() doesn't always work
                f.write("echo %s::%s:%s::/:/bin/bash >> /etc/passwd\n" %
                        (os_username, os.geteuid(), os.getgid()))
                # Do this because .bashrc isn't sourced automatically (even with --login, though it works with docker -t -i, strange...)
                f.write(". .bashrc || exit 1\n")
                # Go into the temp directory
                f.write("cd %s &&\n" % docker_temp_dir)
                # Run the actual command
                f.write('(%s) >>stdout 2>>stderr\n' % bundle.command)
        else:
            # Just run the command regularly without docker
            with open(script_file, 'w') as f:
                f.write(set_temp_dir_header)
                f.write("cd %s &&\n" % ptr_temp_dir)
                f.write('(%s) >stdout 2>stderr\n' % bundle.command)

        # Determine resources to request
        resource_args = []
        if request_time:
            resource_args.extend(
                ['--request_time',
                 formatting.parse_duration(request_time)])
        if request_memory:
            resource_args.extend(
                ['--request_memory',
                 formatting.parse_size(request_memory)])
        if request_cpus:
            resource_args.extend(['--request_cpus', request_cpus])
        if request_gpus:
            resource_args.extend(['--request_gpus', request_gpus])
        if request_queue:
            resource_args.extend(['--request_queue', request_queue])
        if request_priority:
            resource_args.extend(['--request_priority', request_priority])
        if username:
            resource_args.extend(['--username', username])

        # Start the command
        args = (self.dispatch_command.split() + ['start'] +
                [str(arg) for arg in resource_args] + [script_file])
        if self.verbose >= 1:
            print('=== start_bundle(): running %s' % args)
        result = json.loads(self.run_command_get_stdout(args))
        if self.verbose >= 1:
            print('=== start_bundle(): got %s' % result)

        # Without a handle the job cannot be referred to later, so fail now
        # instead of returning a status with an empty job_handle.
        if not result['handle']:
            raise SystemError('Starting bundle failed')

        # Return the information about the job.
        return {
            'bundle': bundle,
            'temp_dir': temp_dir,
            'job_handle': result['handle'],
            'docker_image': docker_image,
        }