def _get_jobs_output(self, gc_id_jobnum_list):
    """Retrieve the output of the given jobs and yield (jobnum, output_dn) pairs.

    Unretrievable jobs are yielded as (jobnum, None); directories found while
    recovering from a retrieval error are yielded as (None, dn).
    """
    if not gc_id_jobnum_list:
        return  # PEP 479: 'raise StopIteration' in a generator is a RuntimeError in py3.7+
    root_dn = os.path.join(self._path_output, 'tmp')
    try:
        if len(gc_id_jobnum_list) == 1:  # For single jobs create single subdir
            tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
        else:
            tmp_dn = root_dn
        ensure_dir_exists(tmp_dn)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)
    map_gc_id2jobnum = dict(gc_id_jobnum_list)
    jobs = self._write_wms_id_list(gc_id_jobnum_list)
    activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
    proc = LocalProcess(self._output_exec, '--noint', '--logfile', '/dev/stderr',
        '-i', jobs, '--dir', tmp_dn)
    # yield output dirs
    # list(...) - py3 dict views have no remove(); also allows safe membership tests below
    todo = list(map_gc_id2jobnum.values())
    current_jobnum = None
    for line in imap(str.strip, proc.stdout.iter(timeout=60)):
        if line.startswith(tmp_dn):
            if current_jobnum in todo:  # guard: output line without a preceding id line
                todo.remove(current_jobnum)
            output_dn = line.strip()
            unpack_wildcard_tar(self._log, output_dn)
            yield (current_jobnum, output_dn)
            current_jobnum = None
        else:
            current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
    exit_code = proc.status(timeout=0, terminate=True)
    activity.finish()
    if exit_code != 0:
        if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
            remove_files([jobs, root_dn])
            return  # user abort - clean up and stop yielding (was 'raise StopIteration')
        self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
        self._log.error('Trying to recover from error ...')
        for dn in os.listdir(root_dn):
            yield (None, os.path.join(root_dn, dn))
    # return unretrievable jobs
    for jobnum in todo:
        yield (jobnum, None)
    remove_files([jobs, tmp_dn])
def get_jobs_output_chunk(self, tmp_dn, gc_id_jobnum_list, wms_id_list_done):
    """Retrieve the output of one chunk of jobs and yield (jobnum, output_dn) pairs.

    WMS ids of successfully retrieved jobs are appended to wms_id_list_done
    (the caller's list is mutated in place by design). Directories found while
    recovering from a retrieval error are yielded as (None, dn).
    """
    map_gc_id2jobnum = dict(gc_id_jobnum_list)
    jobs = list(self._iter_wms_ids(gc_id_jobnum_list))
    # NOTE(review): tempfile.mktemp is race-prone; mkstemp would be safer, but the
    # retrieval tool may refuse a pre-existing logfile - confirm before changing
    log = tempfile.mktemp('.log')
    proc = LocalProcess(self._output_exec, '--noint', '--logfile', log, '--dir', tmp_dn, *jobs)
    exit_code = proc.status(timeout=20 * len(jobs), terminate=True)
    # yield output dirs
    current_jobnum = None
    for line in imap(str.strip, proc.stdout.iter(timeout=20)):
        match = re.match(self._output_regex, line)
        if match:
            wms_id = match.groupdict()['rawId']
            current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(wms_id))
            wms_id_list_done.append(wms_id)
            yield (current_jobnum, match.groupdict()['output_dn'])
            current_jobnum = None
    if exit_code != 0:
        if 'Keyboard interrupt raised by user' in proc.stdout.read_log():
            remove_files([log, tmp_dn])
            return  # PEP 479: 'raise StopIteration' in a generator is a RuntimeError in py3.7+
        self._log.log_process(proc)
        self._log.error('Trying to recover from error ...')
        for dn in os.listdir(tmp_dn):
            yield (None, os.path.join(tmp_dn, dn))
    remove_files([log])
def _submit_jobs(self, jobnum_list, task):
    # submit_jobs: Submit a number of jobs and yield (jobnum, WMS ID, other data) sequentially
    # >>jobnum: internal ID of the Job
    # JobNum is linked to the actual *task* here
    (jdl_fn, submit_jdl_fn) = self._submit_jobs_prepare(jobnum_list, task)
    try:
        # submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
        activity = Activity('queuing jobs at scheduler')
        submit_args = ' -verbose -batch-name ' + task.get_description().task_name + ' ' + submit_jdl_fn
        proc = self._proc_factory.logged_execute(self._submit_exec, submit_args)
        # extract the Condor ID (WMS ID) of the jobs from output ClassAds
        jobnum_gc_id_list = []
        jobnum_seen_set = set()  # O(1) duplicate check instead of rebuilding lzip(...)[0] per line
        for line in proc.iter():
            if 'GridControl_GCIDtoWMSID' in line:
                jobnum_wms_id = line.split('=')[1].strip(' "\n').split('@')
                jobnum, wms_id = int(jobnum_wms_id[0]), jobnum_wms_id[1].strip()
                # Condor creates a default job then overwrites settings on any subsequent job
                # i.e. skip every second, but better be sure
                if jobnum not in jobnum_seen_set:
                    jobnum_seen_set.add(jobnum)
                    jobnum_gc_id_list.append((jobnum, self._create_gc_id(wms_id)))
        exit_code = proc.wait()
        activity.finish()
        if (exit_code != 0) or (len(jobnum_gc_id_list) < len(jobnum_list)):
            if not self._explain_error(proc, exit_code):
                self._log.error('Submitted %4d jobs of %4d expected',
                    len(jobnum_gc_id_list), len(jobnum_list))
                proc.log_error(self._error_log_fn, jdl=jdl_fn)
    finally:
        remove_files([jdl_fn])
    for (jobnum, gc_id) in jobnum_gc_id_list:
        yield (jobnum, gc_id, {})
def _write_jdl(self, jobnum_list, task):
    """Write a temporary JDL file for this batch of jobs and return its path.

    Raises BackendError (after removing the temp file) if the data cannot be written.
    """
    jdl_fd, jdl_fn = tempfile.mkstemp(suffix='.jdl')
    # wrap the raw fd immediately - otherwise an exception in _get_jdl_str_list
    # would leak the file descriptor returned by mkstemp
    jdl_file = os.fdopen(jdl_fd, 'w')
    try:
        data = self._get_jdl_str_list(jobnum_list, task)
        safe_write(jdl_file, data)
    except Exception:
        jdl_file.close()  # idempotent - harmless if safe_write already closed it
        remove_files([jdl_fn])
        raise BackendError('Could not write jdl data to %s.' % jdl_fn)
    return jdl_fn
def _purge_done_jobs(self, wms_id_list_done):
    """Purge the given finished jobs from the CE via glite-ce-job-purge."""
    purge_log_fn = tempfile.mktemp('.log')
    purge_proc = LocalProcess(resolve_install_path('glite-ce-job-purge'),
        '--noint', '--logfile', purge_log_fn, str.join(' ', wms_id_list_done))
    exit_code = purge_proc.status(timeout=60)
    # only log the process output if the failure could not be explained
    if (exit_code != 0) and not self._explain_error(purge_proc, exit_code):
        self._log.log_process(purge_proc)
    remove_files([purge_log_fn])
def _get_jobs_output(self, gc_id_jobnum_list):
    """Retrieve the output of the given jobs and yield (jobnum, output_dn) pairs.

    Unretrievable jobs are yielded as (jobnum, None); directories found while
    recovering from a retrieval error are yielded as (None, dn).
    """
    if not gc_id_jobnum_list:
        return  # PEP 479: 'raise StopIteration' in a generator is a RuntimeError in py3.7+
    root_dn = os.path.join(self._path_output, 'tmp')
    try:
        if len(gc_id_jobnum_list) == 1:  # For single jobs create single subdir
            tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
        else:
            tmp_dn = root_dn
        ensure_dir_exists(tmp_dn)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)
    map_gc_id2jobnum = dict(gc_id_jobnum_list)
    jobs = self._write_wms_id_list(gc_id_jobnum_list)
    activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
    proc = LocalProcess(self._output_exec, '--noint', '--logfile', '/dev/stderr',
        '-i', jobs, '--dir', tmp_dn)
    # yield output dirs
    # list(...) - py3 dict views have no remove(); also allows safe membership tests below
    todo = list(map_gc_id2jobnum.values())
    current_jobnum = None
    for line in imap(str.strip, proc.stdout.iter(timeout=60)):
        if line.startswith(tmp_dn):
            if current_jobnum in todo:  # guard: output line without a preceding id line
                todo.remove(current_jobnum)
            output_dn = line.strip()
            unpack_wildcard_tar(self._log, output_dn)
            yield (current_jobnum, output_dn)
            current_jobnum = None
        else:
            current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
    exit_code = proc.status(timeout=0, terminate=True)
    activity.finish()
    if exit_code != 0:
        if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
            remove_files([jobs, root_dn])
            return  # user abort - clean up and stop yielding (was 'raise StopIteration')
        self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
        self._log.error('Trying to recover from error ...')
        for dn in os.listdir(root_dn):
            yield (None, os.path.join(root_dn, dn))
    # return unretrievable jobs
    for jobnum in todo:
        yield (jobnum, None)
    remove_files([jobs, tmp_dn])
def _get_jobs_output(self, gc_id_jobnum_list):
    """Yield (jobnum, sandbox_path) for each job; path is None if no sandbox was found."""
    if not gc_id_jobnum_list:
        return  # PEP 479: 'raise StopIteration' in a generator is a RuntimeError in py3.7+
    activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
    for gc_id, jobnum in gc_id_jobnum_list:
        path = self._sandbox_helper.get_sandbox(gc_id)
        if path is None:
            yield (jobnum, None)
            continue
        # Cleanup sandbox - drop everything not matched by the configured output patterns
        output_fn_list = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)),
            self._output_fn_list))
        remove_files(ifilter(lambda x: x not in output_fn_list,
            imap(lambda fn: os.path.join(path, fn), os.listdir(path))))
        yield (jobnum, path)
    activity.finish()
def __new__(cls, config, job_limit=-1, job_selector=None):
    """Select the job database backend.

    Falls back to plain text files when no 'zip' executable is available;
    otherwise migrates an existing text file database to the zipped format once.
    """
    try:
        resolve_install_path('zip')
    except Exception:
        clear_current_exception()
        return TextFileJobDB.__new__(cls, config, job_limit, job_selector)
    path_db = config.get_work_path('jobs')
    db_fn = config.get_work_path('jobs.zip')
    # os.path.isdir already implies existence - no separate os.path.exists(path_db) needed
    if os.path.isdir(path_db) and not os.path.exists(db_fn):
        activity = Activity('Converting job database')
        new_db = ZippedJobDB(config)
        try:
            old_db = TextFileJobDB(config)
            for jobnum in old_db.get_job_list():
                new_db.commit(jobnum, old_db.get_job(jobnum))
        except Exception:
            remove_files([db_fn])  # do not leave a half-written zip database behind
            raise
        activity.finish()
    return ZippedJobDB.__new__(cls, config, job_limit, job_selector)
def _submit_job(self, jobnum, task):
    # Submit job and yield (jobnum, WMS ID, other data)
    jdl_fd, jdl_fn = tempfile.mkstemp('.jdl')
    try:
        jdl_line_list = self._make_jdl(jobnum, task)
        safe_write(os.fdopen(jdl_fd, 'w'), jdl_line_list)
    except Exception:
        remove_files([jdl_fn])
        raise BackendError('Could not write jdl data to %s.' % jdl_fn)
    try:
        submit_arg_list = []
        for key_value in filter_dict(self._submit_args_dict, value_filter=identity).items():
            submit_arg_list.extend(key_value)
        submit_arg_list.append(jdl_fn)
        activity = Activity('submitting job %d' % jobnum)
        proc = LocalProcess(self._submit_exec, '--nomsg', '--noint',
            '--logfile', '/dev/stderr', *submit_arg_list)
        wms_id = None
        # the last line starting with 'http' carries the WMS ID of the submitted job
        stripped_stdout_iter = imap(str.strip, proc.stdout.iter(timeout=60))
        for line in ifilter(lambda x: x.startswith('http'), stripped_stdout_iter):
            wms_id = line
        exit_code = proc.status(timeout=0, terminate=True)
        activity.finish()
        if (exit_code != 0) or (wms_id is None):
            # direct form instead of 'if ...: pass else: ...'
            if not self._explain_error(proc, exit_code):
                self._log.log_process(proc, files={'jdl': SafeFile(jdl_fn).read()})
    finally:
        remove_files([jdl_fn])
    job_data = {'jdl': str.join('', jdl_line_list)}
    return (jobnum, self._create_gc_id(wms_id), job_data)
def _recover_jobs(self):
    """Repair a broken zip job database in place and drop unreadable entries."""
    proc = LocalProcess('zip', '-FF', self._db_fn, '--out', '%s.tmp' % self._db_fn)
    proc.stdin.write('y\n')  # confirm the interactive 'zip -FF' prompt
    proc.status(timeout=None)
    os.rename(self._db_fn, self._db_fn + '.broken')
    os.rename(self._db_fn + '.tmp', self._db_fn)
    tar = zipfile.ZipFile(self._db_fn, 'r', zipfile.ZIP_DEFLATED)
    remove_files([self._db_fn + '.broken'])
    broken_fn_list = []
    for tar_info_fn in tar.namelist():
        try:
            tuple(imap(lambda s: int(s[1:]), tar_info_fn.split('_', 1)))  # check name
            fp = tar.open(tar_info_fn)
            try:
                fp.read()
            finally:
                fp.close()
        except Exception:
            clear_current_exception()
            broken_fn_list.append(tar_info_fn)
    tar.close()  # release the archive before the external 'zip' modifies the same file
    for broken_fn in broken_fn_list:
        # argv-style invocation instead of os.system: archive member names are
        # not shell-quoted, so the shell string form risks mangling/injection
        LocalProcess('zip', self._db_fn, '-d', broken_fn).status(timeout=None)
def _get_jobs_output(self, gc_id_jobnum_list):
    """Yield (jobnum, sandbox_path) for each job; path is None if no sandbox was found."""
    if not gc_id_jobnum_list:
        return  # PEP 479: 'raise StopIteration' in a generator is a RuntimeError in py3.7+
    activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
    for gc_id, jobnum in gc_id_jobnum_list:
        path = self._sandbox_helper.get_sandbox(gc_id)
        if path is None:
            yield (jobnum, None)
            continue
        # Cleanup sandbox - drop everything not matched by the configured output patterns
        output_fn_list = lchain(
            imap(lambda pat: glob.glob(os.path.join(path, pat)), self._output_fn_list))
        remove_files(
            ifilter(
                lambda x: x not in output_fn_list,
                imap(lambda fn: os.path.join(path, fn), os.listdir(path))))
        yield (jobnum, path)
    activity.finish()
def _submit_job(self, jobnum, task):
    # Submit job and yield (jobnum, WMS ID, other data)
    jdl_fd, jdl_fn = tempfile.mkstemp('.jdl')
    try:
        jdl_line_list = self._make_jdl(jobnum, task)
        safe_write(os.fdopen(jdl_fd, 'w'), jdl_line_list)
    except Exception:
        remove_files([jdl_fn])
        raise BackendError('Could not write jdl data to %s.' % jdl_fn)
    try:
        submit_arg_list = []
        for key_value in filter_dict(self._submit_args_dict, value_filter=identity).items():
            submit_arg_list.extend(key_value)
        submit_arg_list.append(jdl_fn)
        activity = Activity('submitting job %d' % jobnum)
        proc = LocalProcess(self._submit_exec, '--nomsg', '--noint',
            '--logfile', '/dev/stderr', *submit_arg_list)
        wms_id = None
        # the last line starting with 'http' carries the WMS ID of the submitted job
        stripped_stdout_iter = imap(str.strip, proc.stdout.iter(timeout=60))
        for line in ifilter(lambda x: x.startswith('http'), stripped_stdout_iter):
            wms_id = line
        exit_code = proc.status(timeout=0, terminate=True)
        activity.finish()
        if (exit_code != 0) or (wms_id is None):
            # direct form instead of 'if ...: pass else: ...'
            if not self._explain_error(proc, exit_code):
                self._log.log_process(proc, files={'jdl': SafeFile(jdl_fn).read()})
    finally:
        remove_files([jdl_fn])
    job_data = {'jdl': str.join('', jdl_line_list)}
    return (jobnum, self._create_gc_id(wms_id), job_data)
def _get_jobs_output(self, gc_id_jobnum_list):
    """Retrieve job outputs in chunks and yield (jobnum, output_dn) pairs.

    Unretrievable jobs are yielded as (jobnum, None); recovery directories
    from get_jobs_output_chunk arrive as (None, dn) and are passed through.
    """
    if not gc_id_jobnum_list:
        return  # PEP 479: 'raise StopIteration' in a generator is a RuntimeError in py3.7+
    tmp_dn = os.path.join(self._path_output, 'tmp')
    try:
        if len(gc_id_jobnum_list) == 1:  # For single jobs create single subdir
            tmp_dn = os.path.join(tmp_dn, md5_hex(gc_id_jobnum_list[0][0]))
        ensure_dir_exists(tmp_dn)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)
    map_gc_id2jobnum = dict(gc_id_jobnum_list)
    jobnum_list_todo = list(map_gc_id2jobnum.values())
    wms_id_list_done = []
    activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
    chunk_pos_iter = irange(0, len(gc_id_jobnum_list), self._chunk_size)
    for ids in imap(lambda x: gc_id_jobnum_list[x:x + self._chunk_size], chunk_pos_iter):
        for (current_jobnum, output_dn) in self.get_jobs_output_chunk(
                tmp_dn, ids, wms_id_list_done):
            unpack_wildcard_tar(self._log, output_dn)
            # guard: chunk yields (None, dn) recovery entries - removing None
            # (or an already-removed jobnum) would raise ValueError
            if current_jobnum in jobnum_list_todo:
                jobnum_list_todo.remove(current_jobnum)
            yield (current_jobnum, output_dn)
    activity.finish()
    # return unretrievable jobs
    for jobnum in jobnum_list_todo:
        yield (jobnum, None)
    self._purge_done_jobs(wms_id_list_done)
    remove_files([tmp_dn])