Example #1
    def _start_pilot_bulk(self, resource, schema, pilots):
        """
        For each pilot, we determine what files need to be staged and what
        job description needs to be submitted.

        We expect `_prepare_pilot(resource, pilot)` to return a dict with:

            { 
              'jd' : saga.job.Description,
              'ft' : [ 
                { 'src' : string  # absolute source file name
                  'tgt' : string  # relative target file name
                  'rem' : bool    # shall we remove src?
                }, 
                ... ]
            }
        
        When transferring data, we'll ensure that each src is only transferred
        once (in fact, we put all src files into a tarball and unpack that on
        the target side).

        The returned dicts are expected to only contain files which actually
        need staging, i.e. which have not been staged during a previous pilot
        submission.  That implies one of two things: either this component is
        stateful, and remembers what has been staged -- which makes it difficult
        to use multiple component instances; or the component inspects the
        target resource for existing files -- which involves additional
        expensive remote hops.
        FIXME: since neither is implemented at this point we won't discuss the
               tradeoffs further -- right now files are unique per pilot bulk.

        Once all dicts are collected, we create one additional file which
        contains the staging information, and then pack all src files into
        a tarball for staging.  We transfer the tarball, and *immediately*
        trigger the untarring on the target resource, which is thus *not* part of
        the bootstrapping process.
        NOTE: this is to avoid untarring race conditions for multiple pilots, and
              also to simplify bootstrapping dependencies -- the bootstrappers
              are likely within the tarball after all...
        """

        rcfg = self._session.get_resource_config(resource, schema)
        sid = self._session.uid

        # we create a fake session_sandbox with all pilot_sandboxes in /tmp, and
        # then tar it up.  Once we untar that tarball on the target machine, we
        # should have all sandboxes and all files required to bootstrap the
        # pilots
        # FIXME: on untar, there is a race between multiple launcher components
        #        within the same session toward the same target resource.
        tmp_dir = os.path.abspath(tempfile.mkdtemp(prefix='rp_agent_tar_dir'))
        tar_name = '%s.%s.tgz' % (sid, self.uid)
        tar_tgt = '%s/%s' % (tmp_dir, tar_name)
        tar_url = rs.Url('file://localhost/%s' % tar_tgt)

        # we need the session sandbox URL, but that is (at least in principle)
        # dependent on the schema to use for pilot startup.  So we confirm here
        # that the bulk is consistent w.r.t. the schema.
        # FIXME: if it is not, it needs to be split into schema-specific
        # sub-bulks
        schema = pilots[0]['description'].get('access_schema')
        for pilot in pilots[1:]:
            assert(schema == pilot['description'].get('access_schema')), \
                    'inconsistent schema on launch / staging'

        session_sandbox = self._session._get_session_sandbox(pilots[0]).path

        # we will create the session sandbox before we untar, so we can use that
        # as workdir, and pack all paths relative to that session sandbox.  That
        # implies that we have to recheck that all URLs in fact do point into
        # the session sandbox.

        ft_list = list()  # files to stage
        jd_list = list()  # jobs  to submit
        for pilot in pilots:
            info = self._prepare_pilot(resource, rcfg, pilot)
            ft_list += info['ft']
            jd_list.append(info['jd'])
            self._prof.prof('staging_in_start', uid=pilot['uid'])

        for ft in ft_list:
            src = os.path.abspath(ft['src'])
            tgt = os.path.relpath(os.path.normpath(ft['tgt']), session_sandbox)
            src_dir = os.path.dirname(src)
            tgt_dir = os.path.dirname(tgt)

            if tgt_dir.startswith('..'):
                raise ValueError('staging target %s outside of pilot sandbox' %
                                 ft['tgt'])

            if not os.path.isdir('%s/%s' % (tmp_dir, tgt_dir)):
                os.makedirs('%s/%s' % (tmp_dir, tgt_dir))

            if src == '/dev/null':
                # we want an empty file -- touch it (tar will refuse to
                # handle a symlink to /dev/null)
                open('%s/%s' % (tmp_dir, tgt), 'a').close()
            else:
                os.symlink(src, '%s/%s' % (tmp_dir, tgt))

        # tar.  If any command fails, this will raise.
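        # flags: 'z' gzip, 'c' create, 'h' dereference the symlinks created
        #        above, 'f' write to the given archive file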
        cmd = "cd %s && tar zchf %s *" % (tmp_dir, tar_tgt)
        self._log.debug('cmd: %s', cmd)
        out = sp.check_output(["/bin/sh", "-c", cmd], stderr=sp.STDOUT)
        self._log.debug('out: %s', out)

        # remove all files marked for removal-after-pack
        for ft in ft_list:
            if ft['rem']:
                os.unlink(ft['src'])

        fs_endpoint = rcfg['filesystem_endpoint']
        fs_url = rs.Url(fs_endpoint)

        self._log.debug("rs.file.Directory ('%s')", fs_url)

        with self._cache_lock:
            if fs_url in self._saga_fs_cache:
                fs = self._saga_fs_cache[fs_url]
            else:
                fs = rsfs.Directory(fs_url, session=self._session)
                self._saga_fs_cache[fs_url] = fs

        tar_rem = rs.Url(fs_url)
        tar_rem.path = "%s/%s" % (session_sandbox, tar_name)

        fs.copy(tar_url, tar_rem, flags=rsfs.CREATE_PARENTS)

        shutil.rmtree(tmp_dir)

        # we now need to untar on the target machine.
        js_url = ru.Url(pilots[0]['js_url'])

        # well, we actually don't need to talk to the LRMS, but only need
        # a shell on the headnode.  That seems true for all LRMSs we use right
        # now.  So, let's convert the URL:
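        # For example (hypothetical URLs): a 'slurm+ssh://host/' job service
        # URL would be reduced to 'ssh://host/', while a purely local
        # 'slurm://host/' would fall back to 'fork://host/'.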
        if '+' in js_url.scheme:
            parts = js_url.scheme.split('+')
            if 'gsissh' in parts: js_url.scheme = 'gsissh'
            elif 'ssh' in parts: js_url.scheme = 'ssh'
        else:
            # In the non-combined '+' case we need to distinguish between
            # a URL that was the result of a hop and a local LRMS.
            if js_url.scheme not in ['ssh', 'gsissh']:
                js_url.scheme = 'fork'

        with self._cache_lock:
            if js_url in self._saga_js_cache:
                js_tmp = self._saga_js_cache[js_url]
            else:
                js_tmp = rs.job.Service(js_url, session=self._session)
                self._saga_js_cache[js_url] = js_tmp

        # cmd = "tar zmxvf %s/%s -C / ; rm -f %s" % \
        cmd = "tar zmxvf %s/%s -C %s" % \
                (session_sandbox, tar_name, session_sandbox)
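        # ('m' skips restoring mtimes to avoid clock-skew warnings; '-C'
        #  unpacks relative to the session sandbox)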
        j = js_tmp.run_job(cmd)
        j.wait()

        self._log.debug('tar cmd : %s', cmd)
        self._log.debug('tar done: %s, %s, %s', j.state, j.stdout, j.stderr)

        for pilot in pilots:
            self._prof.prof('staging_in_stop', uid=pilot['uid'])
            self._prof.prof('submission_start', uid=pilot['uid'])

        # look up or create JS for actual pilot submission.  This might result
        # in the same js url as above, or not.
        js_ep = rcfg['job_manager_endpoint']
        with self._cache_lock:
            if js_ep in self._saga_js_cache:
                js = self._saga_js_cache[js_ep]
            else:
                js = rs.job.Service(js_ep, session=self._session)
                self._saga_js_cache[js_ep] = js

        # now that the scripts are in place and configured,
        # we can launch the agent
        jc = rs.job.Container()

        for jd in jd_list:
            self._log.debug('jd: %s', pprint.pformat(jd.as_dict()))
            jc.add(js.create_job(jd))

        jc.run()

        for j in jc.get_tasks():

            # do a quick error check
            if j.state == rs.FAILED:
                self._log.error('%s: %s : %s : %s', j.id, j.state, j.stderr,
                                j.stdout)
                raise RuntimeError("SAGA Job state is FAILED.")

            if not j.name:
                raise RuntimeError('cannot get job name for %s' % j.id)

            pilot = None
            for p in pilots:
                if p['uid'] == j.name:
                    pilot = p
                    break

            if not pilot:
                raise RuntimeError('job does not match any pilot: %s : %s' %
                                   (j.name, j.id))

            pid = pilot['uid']
            self._log.debug('pilot job: %s : %s : %s : %s', pid, j.id, j.name,
                            j.state)

            # Update the Pilot's state to 'PMGR_ACTIVE_PENDING' if SAGA job
            # submission was successful.  Since the pilot leaves the scope of
            # the PMGR for the time being, we update the complete DB document
            pilot['$all'] = True

            # FIXME: update the right pilot
            with self._pilots_lock:

                self._pilots[pid] = dict()
                self._pilots[pid]['pilot'] = pilot
                self._pilots[pid]['job'] = j

            # make sure we watch that pilot
            with self._check_lock:
                self._checking.append(pid)

        for pilot in pilots:
            self._prof.prof('submission_stop', uid=pilot['uid'])
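
The tarball staging pattern in example #1 can be illustrated in isolation.
The sketch below is not part of RADICAL-Pilot; the helper name
`pack_staging_tarball` and all paths are hypothetical.  It packs a list of
`{'src', 'tgt', 'rem'}` entries (the convention described in the docstring
above) into a gzipped tarball: each source is symlinked under a temporary
directory at its relative target path, the links are dereferenced while
packing, and sources marked for removal are deleted afterwards.

    import os
    import shutil
    import subprocess
    import tempfile


    def pack_staging_tarball(ft_list, tar_path):
        """Pack {'src', 'tgt', 'rem'} entries into a gzipped tarball at the
        absolute path `tar_path` (hypothetical helper, for illustration)."""

        tmp_dir = tempfile.mkdtemp(prefix='rp_example_tar_dir')
        try:
            for ft in ft_list:
                src = os.path.abspath(ft['src'])
                tgt = os.path.normpath(ft['tgt']).lstrip('/')
                tgt_dir = os.path.join(tmp_dir, os.path.dirname(tgt))
                if not os.path.isdir(tgt_dir):
                    os.makedirs(tgt_dir)
                if src == '/dev/null':
                    # empty target file -- tar cannot dereference /dev/null
                    open(os.path.join(tmp_dir, tgt), 'a').close()
                else:
                    os.symlink(src, os.path.join(tmp_dir, tgt))
            # 'h' dereferences the symlinks, so the tarball carries contents
            cmd = 'cd %s && tar zchf %s *' % (tmp_dir, tar_path)
            subprocess.check_output(['/bin/sh', '-c', cmd],
                                    stderr=subprocess.STDOUT)
        finally:
            shutil.rmtree(tmp_dir)

        # remove all sources marked for removal-after-pack
        for ft in ft_list:
            if ft.get('rem'):
                os.unlink(ft['src'])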
Example #2
    def _handle_pilot_input_staging(self, pilot, sds):

        pid = pilot['uid']

        # NOTE: no unit sandboxes defined!
        src_context = {
            'pwd': pilot['client_sandbox'],
            'pilot': pilot['pilot_sandbox'],
            'resource': pilot['resource_sandbox']
        }
        tgt_context = {
            'pwd': pilot['pilot_sandbox'],
            'pilot': pilot['pilot_sandbox'],
            'resource': pilot['resource_sandbox']
        }
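
        # complete_url() below is expected to resolve these context schemas;
        # e.g. (a hypothetical directive) a target of 'pilot:///input.dat'
        # would resolve against the pilot sandbox, while a plain relative
        # path resolves against 'pwd'.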

        # Iterate over all directives
        for sd in sds:

            # TODO: respect flags in directive

            action = sd['action']
            flags = sd['flags']
            did = sd['uid']
            src = sd['source']
            tgt = sd['target']

            assert (action in [COPY, LINK, MOVE, TRANSFER])

            self._prof.prof('staging_in_start', uid=pid, msg=did)

            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            if action in [COPY, LINK, MOVE]:
                self._prof.prof('staging_in_fail', uid=pid, msg=did)
                raise ValueError("invalid action '%s' on pilot level" % action)

            self._log.info('transfer %s to %s', src, tgt)

            # FIXME: make sure that tgt URL points to the right resource
            # FIXME: honor sd flags if given (recursive...)
            flags = rsfs.CREATE_PARENTS

            # Define and open the staging directory for the pilot
            # We use the target dir construct here, so that we can create
            # the directory if it does not yet exist.

            # url used for cache (sandbox url w/o path)
            tmp = rs.Url(pilot['pilot_sandbox'])
            tmp.path = '/'
            key = str(tmp)

            self._log.debug("rs.file.Directory ('%s')", key)

            with self._cache_lock:
                if key in self._saga_fs_cache:
                    fs = self._saga_fs_cache[key]

                else:
                    fs = rsfs.Directory(key, session=self._session)
                    self._saga_fs_cache[key] = fs

            fs.copy(src, tgt, flags=flags)

            sd['pmgr_state'] = rps.DONE

            self._prof.prof('staging_in_stop', uid=pid, msg=did)

        self.publish(
            rpc.CONTROL_PUBSUB, {
                'cmd': 'pilot_staging_input_result',
                'arg': {
                    'pilot': pilot,
                    'sds': sds
                }
            })
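
For context, each staging directive (`sd`) handled in example #2 is a dict
carrying at least 'uid', 'action', 'flags', 'source' and 'target'.  A
hypothetical pilot-level input directive could be shaped like the sketch
below; the field values and the schema prefix are assumptions for
illustration, and only TRANSFER actions pass the check above (COPY, LINK and
MOVE raise a ValueError on pilot level).

    # hypothetical directive, shaped after the fields read in
    # _handle_pilot_input_staging (assumes the same TRANSFER constant that
    # the surrounding module imports; all values are illustrative only)
    sd = {
        'uid'   : 'sd.0000',             # directive id
        'action': TRANSFER,              # COPY / LINK / MOVE are rejected here
        'flags' : [],                    # currently overruled by CREATE_PARENTS
        'source': 'input.dat',           # resolved via src_context
        'target': 'pilot:///input.dat',  # resolved via tgt_context
    }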