    def _start_pilot_bulk(self, resource, schema, pilots):
        """
        For each pilot, we prepare by determining what files need to be
        staged, and what job description needs to be submitted.  We expect
        `_prepare_pilot(resource, pilot)` to return a dict with:

            { 'jd' : saga.job.Description,
              'ft' : [ { 'src' : string  # absolute source file name
                         'tgt' : string  # relative target file name
                         'rem' : bool    # shall we remove src?
                       },
                       ... ]
            }

        When transferring data, we'll ensure that each src is only
        transferred once (in fact, we put all src files into a tarball and
        unpack that on the target side).

        The returned dicts are expected to only contain files which actually
        need staging, i.e. which have not been staged during a previous
        pilot submission.  That implies one of two things: either this
        component is stateful and remembers what has been staged -- which
        makes it difficult to use multiple component instances; or the
        component inspects the target resource for existing files -- which
        involves additional, expensive remote hops.
        FIXME: since neither is implemented at this point, we won't discuss
               the tradeoffs further -- right now, files are unique per
               pilot bulk.

        Once all dicts are collected, we create one additional file which
        contains the staging information, and then pack all src files into
        a tarball for staging.  We transfer the tarball, and *immediately*
        trigger the untarring on the target resource, which is thus *not*
        part of the bootstrapping process.
        NOTE: this is to avoid untarring race conditions for multiple
              pilots, and also to simplify bootstrapping dependencies --
              the bootstrappers are likely within the tarball after all...
        """

        rcfg = self._session.get_resource_config(resource, schema)
        sid  = self._session.uid

        # we create a fake session_sandbox with all pilot_sandboxes in /tmp,
        # and then tar it up.  Once we untar that tarball on the target
        # machine, we should have all sandboxes and all files required to
        # bootstrap the pilots.
        # FIXME: on untar, there is a race between multiple launcher
        #        components within the same session toward the same target
        #        resource.
        tmp_dir  = os.path.abspath(tempfile.mkdtemp(prefix='rp_agent_tar_dir'))
        tar_name = '%s.%s.tgz' % (sid, self.uid)
        tar_tgt  = '%s/%s'     % (tmp_dir, tar_name)
        tar_url  = rs.Url('file://localhost/%s' % tar_tgt)

        # we need the session sandbox url, but that is (at least in
        # principle) dependent on the schema to use for pilot startup.  So
        # we confirm here that the bulk is consistent with respect to the
        # schema.
        # FIXME: if it is not, it needs to be split into schema-specific
        #        sub-bulks.
        schema = pilots[0]['description'].get('access_schema')
        for pilot in pilots[1:]:
            assert schema == pilot['description'].get('access_schema'), \
                   'inconsistent access schema on launch / staging'

        session_sandbox = self._session._get_session_sandbox(pilots[0]).path

        # we will create the session sandbox before we untar, so we can use
        # it as workdir, and pack all paths relative to that session
        # sandbox.  That implies that we have to recheck that all URLs in
        # fact do point into the session sandbox.
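        # For illustration only (hypothetical names): for two pilots,
        # `tmp_dir` mirrors the remote session sandbox, with each staged
        # file symlinked (or touched, for /dev/null) at its sandbox-relative
        # location:
        #
        #     /tmp/rp_agent_tar_dir.XXXX/
        #         pilot.0000/bootstrap_0.sh  -> /local/src/bootstrap_0.sh
        #         pilot.0001/bootstrap_0.sh  -> /local/src/bootstrap_0.sh
        #
        # so that a single untar in the remote session sandbox recreates all
        # pilot sandboxes in one go.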
        ft_list = list()  # files to stage
        jd_list = list()  # job descriptions to submit

        for pilot in pilots:
            info     = self._prepare_pilot(resource, rcfg, pilot)
            ft_list += info['ft']
            jd_list.append(info['jd'])
            self._prof.prof('staging_in_start', uid=pilot['uid'])

        for ft in ft_list:
            src     = os.path.abspath(ft['src'])
            tgt     = os.path.relpath(os.path.normpath(ft['tgt']),
                                      session_sandbox)
            tgt_dir = os.path.dirname(tgt)

            if tgt_dir.startswith('..'):
                raise ValueError('staging target %s outside of session '
                                 'sandbox' % ft['tgt'])

            if not os.path.isdir('%s/%s' % (tmp_dir, tgt_dir)):
                os.makedirs('%s/%s' % (tmp_dir, tgt_dir))

            if src == '/dev/null':
                # we want an empty file -- touch it (tar will refuse to
                # handle a symlink to /dev/null)
                open('%s/%s' % (tmp_dir, tgt), 'a').close()
            else:
                os.symlink(src, '%s/%s' % (tmp_dir, tgt))

        # tar.  If any command fails, this will raise.
        cmd = "cd %s && tar zchf %s *" % (tmp_dir, tar_tgt)
        self._log.debug('cmd: %s', cmd)
        out = sp.check_output(["/bin/sh", "-c", cmd], stderr=sp.STDOUT)
        self._log.debug('out: %s', out)

        # remove all files marked for removal-after-pack
        for ft in ft_list:
            if ft['rem']:
                os.unlink(ft['src'])

        fs_endpoint = rcfg['filesystem_endpoint']
        fs_url      = rs.Url(fs_endpoint)

        self._log.debug("rs.file.Directory ('%s')", fs_url)

        with self._cache_lock:
            if fs_url in self._saga_fs_cache:
                fs = self._saga_fs_cache[fs_url]
            else:
                fs = rsfs.Directory(fs_url, session=self._session)
                self._saga_fs_cache[fs_url] = fs

        tar_rem      = rs.Url(fs_url)
        tar_rem.path = '%s/%s' % (session_sandbox, tar_name)

        fs.copy(tar_url, tar_rem, flags=rsfs.CREATE_PARENTS)

        shutil.rmtree(tmp_dir)

        # we now need to untar on the target machine
        js_url = ru.Url(pilots[0]['js_url'])

        # well, we actually don't need to talk to the lrms, but only need
        # a shell on the headnode.  That seems true for all LRMSs we use
        # right now.  So, let's convert the URL:
        if '+' in js_url.scheme:
            parts = js_url.scheme.split('+')
            if   'gsissh' in parts: js_url.scheme = 'gsissh'
            elif 'ssh'    in parts: js_url.scheme = 'ssh'
        else:
            # in the non-combined case (no '+') we need to distinguish
            # between a url that was the result of a hop and a local lrms
            if js_url.scheme not in ['ssh', 'gsissh']:
                js_url.scheme = 'fork'

        with self._cache_lock:
            if js_url in self._saga_js_cache:
                js_tmp = self._saga_js_cache[js_url]
            else:
                js_tmp = rs.job.Service(js_url, session=self._session)
                self._saga_js_cache[js_url] = js_tmp

      # cmd = "tar zmxvf %s/%s -C / ; rm -f %s" % \
        cmd = "tar zmxvf %s/%s -C %s" % \
              (session_sandbox, tar_name, session_sandbox)
        j = js_tmp.run_job(cmd)
        j.wait()

        self._log.debug('tar cmd : %s', cmd)
        self._log.debug('tar done: %s, %s, %s', j.state, j.stdout, j.stderr)

        for pilot in pilots:
            self._prof.prof('staging_in_stop',  uid=pilot['uid'])
            self._prof.prof('submission_start', uid=pilot['uid'])

        # look up or create JS for actual pilot submission.  This might
        # result in the same js url as above, or not.
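        # For illustration only (hypothetical endpoints): a resource config
        # may use distinct endpoints for staging, for the headnode shell
        # used to untar above, and for batch submission below, e.g.:
        #
        #     filesystem_endpoint  : sftp://login.host/
        #     js_url               : slurm+ssh://login.host/ -> ssh://login.host/
        #     job_manager_endpoint : slurm+ssh://login.host/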
        js_ep = rcfg['job_manager_endpoint']
        with self._cache_lock:
            if js_ep in self._saga_js_cache:
                js = self._saga_js_cache[js_ep]
            else:
                js = rs.job.Service(js_ep, session=self._session)
                self._saga_js_cache[js_ep] = js

        # now that the scripts are in place and configured, we can launch
        # the agent
        jc = rs.job.Container()

        for jd in jd_list:
            self._log.debug('jd: %s', pprint.pformat(jd.as_dict()))
            jc.add(js.create_job(jd))

        jc.run()

        for j in jc.get_tasks():

            # do a quick error check
            if j.state == rs.FAILED:
                self._log.error('%s: %s : %s : %s',
                                j.id, j.state, j.stderr, j.stdout)
                raise RuntimeError("SAGA Job state is FAILED.")

            if not j.name:
                raise RuntimeError('cannot get job name for %s' % j.id)

            pilot = None
            for p in pilots:
                if p['uid'] == j.name:
                    pilot = p
                    break

            if not pilot:
                raise RuntimeError('job does not match any pilot: %s : %s'
                                  % (j.name, j.id))

            pid = pilot['uid']
            self._log.debug('pilot job: %s : %s : %s : %s',
                            pid, j.id, j.name, j.state)

            # update the pilot's state to 'PMGR_ACTIVE_PENDING' if SAGA job
            # submission was successful.  Since the pilot leaves the scope
            # of the PMGR for the time being, we update the complete DB
            # document.
            pilot['$all'] = True

            # FIXME: update the right pilot
            with self._pilots_lock:
                self._pilots[pid]          = dict()
                self._pilots[pid]['pilot'] = pilot
                self._pilots[pid]['job']   = j

            # make sure we watch that pilot
            with self._check_lock:
                self._checking.append(pid)

        for pilot in pilots:
            self._prof.prof('submission_stop', uid=pilot['uid'])

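    # For illustration only (hypothetical values): a minimal dict as
    # returned by `_prepare_pilot()` and consumed by `_start_pilot_bulk()`
    # above.  Note that the job description's `name` must be set to the
    # pilot uid, since that is how submitted jobs are mapped back to their
    # pilots:
    #
    #     jd            = rs.job.Description()
    #     jd.executable = '/bin/sh'
    #     jd.arguments  = ['bootstrap_0.sh']
    #     jd.name       = pilot['uid']
    #     info          = {'jd': jd,
    #                      'ft': [{'src': '/tmp/bootstrap_0.sh',
    #                              'tgt': 'pilot.0000/bootstrap_0.sh',
    #                              'rem': False}]}
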
    def _handle_pilot_input_staging(self, pilot, sds):

        pid = pilot['uid']

        # NOTE: no unit sandboxes defined!
        src_context = {'pwd'     : pilot['client_sandbox'],
                       'pilot'   : pilot['pilot_sandbox'],
                       'resource': pilot['resource_sandbox']}
        tgt_context = {'pwd'     : pilot['pilot_sandbox'],
                       'pilot'   : pilot['pilot_sandbox'],
                       'resource': pilot['resource_sandbox']}

        # iterate over all directives
        for sd in sds:

            # TODO: respect flags in directive

            action = sd['action']
            flags  = sd['flags']
            did    = sd['uid']
            src    = sd['source']
            tgt    = sd['target']

            assert action in [COPY, LINK, MOVE, TRANSFER]

            self._prof.prof('staging_in_start', uid=pid, msg=did)

            src = complete_url(src, src_context, self._log)
            tgt = complete_url(tgt, tgt_context, self._log)

            if action in [COPY, LINK, MOVE]:
                self._prof.prof('staging_in_fail', uid=pid, msg=did)
                raise ValueError("invalid action '%s' on pilot level"
                                % action)

            self._log.info('transfer %s to %s', src, tgt)

            # FIXME: make sure that tgt URL points to the right resource
            # FIXME: honor sd flags if given (recursive, ...)
            flags = rsfs.CREATE_PARENTS

            # define and open the staging directory for the pilot -- we use
            # the target dir construct here, so that we can create the
            # directory if it does not yet exist

            # url used for cache (sandbox url w/o path)
            tmp      = rs.Url(pilot['pilot_sandbox'])
            tmp.path = '/'
            key      = str(tmp)

            self._log.debug("rs.file.Directory ('%s')", key)

            with self._cache_lock:
                if key in self._saga_fs_cache:
                    fs = self._saga_fs_cache[key]
                else:
                    fs = rsfs.Directory(key, session=self._session)
                    self._saga_fs_cache[key] = fs

            fs.copy(src, tgt, flags=flags)

            sd['pmgr_state'] = rps.DONE

            self._prof.prof('staging_in_stop', uid=pid, msg=did)

        self.publish(rpc.CONTROL_PUBSUB,
                     {'cmd': 'pilot_staging_input_result',
                      'arg': {'pilot': pilot,
                              'sds'  : sds}})
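
    # For illustration only (hypothetical values): a pilot level staging
    # directive as handled above.  Only TRANSFER is valid on this level;
    # `complete_url()` is assumed to expand the `pilot://` schema against
    # the entries in `tgt_context`, i.e., against the pilot sandbox:
    #
    #     sd = {'uid'   : 'sd.0000',
    #           'action': TRANSFER,
    #           'flags' : [],
    #           'source': '/data/input.dat',
    #           'target': 'pilot:///input.dat'}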