def getBlocksNormed(self):
    activity = Activity('Retrieving %s' % self._datasetExpr)
    try:
        # Validation, Naming:
        for block in self._getBlocksInternal():
            assert block[DataProvider.Dataset]
            block.setdefault(DataProvider.BlockName, '0')
            block.setdefault(DataProvider.Provider, self.__class__.__name__)
            block.setdefault(DataProvider.Locations, None)
            events = sum(imap(lambda x: x[DataProvider.NEntries], block[DataProvider.FileList]))
            block.setdefault(DataProvider.NEntries, events)
            if self._datasetNick:
                block[DataProvider.Nickname] = self._datasetNick
            elif self._nickProducer:
                block = self._nickProducer.processBlock(block)
                if not block:
                    raise DatasetError('Nickname producer failed!')
            yield block
    except Exception:
        raise DatasetError('Unable to retrieve dataset %s' % repr(self._datasetExpr))
    activity.finish()
def do_transfer(self, desc_source_target_list):
    for (desc, source, target) in desc_source_target_list:
        if not self._storage_paths:
            raise ConfigError("%s can't be transferred because '%s path' wasn't set" % (desc, self._storage_channel))
        for idx, se_path in enumerate(set(self._storage_paths)):
            activity = Activity('Copy %s to SE %d ' % (desc, idx + 1))
            proc = se_copy(source, os.path.join(se_path, target), self._storage_force)
            proc.status(timeout=5 * 60, terminate=True)
            activity.finish()
            if proc.status(timeout=0) == 0:
                self._log.info('Copy %s to SE %d finished', desc, idx + 1)
            else:
                self._log.info('Copy %s to SE %d failed', desc, idx + 1)
                self._log.log_process(proc)
                self._log.critical('Unable to copy %s! You can try to copy it manually.', desc)
                msg = 'Is %s (%s) available on SE %s?' % (desc, source, se_path)
                if not UserInputInterface().prompt_bool(msg, False):
                    raise StorageError('%s is missing on SE %s!' % (desc, se_path))
def _get_jobs_output(self, gc_id_jobnum_list):
    # retrieve task output files from sandbox directory
    if not len(gc_id_jobnum_list):
        raise StopIteration
    activity = Activity('retrieving job outputs')
    for gc_id, jobnum in gc_id_jobnum_list:
        sandpath = self._get_sandbox_dn(jobnum)
        if sandpath is None:
            yield (jobnum, None)
            continue
        # when working with a remote spool schedd, tell condor to return files
        if self._remote_type == PoolType.SPOOL:
            self._check_and_log_proc(self._proc_factory.logged_execute(
                self._transfer_exec, self._split_gc_id(gc_id)[1]))
        # when working with a remote [gsi]ssh schedd, manually return files
        elif self._remote_type in (PoolType.SSH, PoolType.GSISSH):
            self._check_and_log_proc(self._proc_factory.logged_copy_from_remote(
                self._get_remote_output_dn(jobnum), self._get_sandbox_dn()))
            # clean up remote working directory
            self._check_and_log_proc(self._proc_factory.logged_execute(
                'rm -rf %s' % self._get_remote_output_dn(jobnum)))
        # eventually extract wildcarded output files from the tarball
        unpack_wildcard_tar(self._log, sandpath)
        yield (jobnum, sandpath)
    # clean up if necessary
    activity.finish()
    self._cleanup_remote_output_dn()
def _read_jobs(self, job_limit):
    ensure_dir_exists(self._path_db, 'job database directory', JobError)
    candidates = []
    for job_fn in fnmatch.filter(os.listdir(self._path_db), 'job_*.txt'):
        try:  # 2xsplit is faster than regex
            jobnum = int(job_fn.split(".")[0].split("_")[1])
        except Exception:
            clear_current_exception()
            continue
        candidates.append((jobnum, job_fn))

    (job_map, max_job_len) = ({}, len(candidates))
    activity = Activity('Reading job infos')
    idx = 0
    for (jobnum, job_fn) in sorted(candidates):
        idx += 1
        if jobnum >= job_limit >= 0:
            self._log.info('Stopped reading job infos at job #%d out of %d available job files, ' +
                'since the limit of %d jobs is reached', jobnum, len(candidates), job_limit)
            break
        try:
            job_fn_full = os.path.join(self._path_db, job_fn)
            data = self._fmt.parse(SafeFile(job_fn_full).iter_close())
            job_obj = self._create_job_obj(job_fn_full, data)
        except Exception:
            raise JobError('Unable to process job file %r' % job_fn_full)
        job_map[jobnum] = job_obj
        activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / max_job_len))
    activity.finish()
    return job_map
def _readJobs(self, jobLimit):
    utils.ensureDirExists(self._dbPath, 'job database directory', JobError)
    candidates = []
    for jobFile in fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt'):
        try:  # 2xsplit is faster than regex
            jobNum = int(jobFile.split(".")[0].split("_")[1])
        except Exception:
            continue
        candidates.append((jobNum, jobFile))

    (jobMap, maxJobs) = ({}, len(candidates))
    activity = Activity('Reading job infos')
    idx = 0
    for (jobNum, jobFile) in sorted(candidates):
        idx += 1
        if (jobLimit >= 0) and (jobNum >= jobLimit):
            self._log.info('Stopped reading job infos at job #%d out of %d available job files, ' +
                'since the limit of %d jobs is reached', jobNum, len(candidates), jobLimit)
            break
        jobObj = self._load_job(os.path.join(self._dbPath, jobFile))
        jobMap[jobNum] = jobObj
        if idx % 100 == 0:
            activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
    activity.finish()
    return jobMap
def execute(self, wmsIDs, wmsName):  # yields list of (wmsID,)
    marked_wmsIDs = lmap(lambda result: result[0], self._cancel_executor.execute(wmsIDs, wmsName))
    time.sleep(5)
    activity = Activity('Purging jobs')
    for result in self._purge_executor.execute(marked_wmsIDs, wmsName):
        yield result
    activity.finish()
def _submitJob(self, jobNum, module):
    fd, jdl = tempfile.mkstemp('.jdl')
    try:
        jdlData = self.makeJDL(jobNum, module)
        utils.safeWrite(os.fdopen(fd, 'w'), jdlData)
    except Exception:
        utils.removeFiles([jdl])
        raise BackendError('Could not write jdl data to %s.' % jdl)

    try:
        submitArgs = []
        for key_value in utils.filterDict(self._submitParams, vF=lambda v: v).items():
            submitArgs.extend(key_value)
        submitArgs.append(jdl)

        activity = Activity('submitting job %d' % jobNum)
        proc = LocalProcess(self._submitExec, '--nomsg', '--noint', '--logfile', '/dev/stderr', *submitArgs)
        gcID = None
        for line in ifilter(lambda x: x.startswith('http'), imap(str.strip, proc.stdout.iter(timeout=60))):
            gcID = line
        retCode = proc.status(timeout=0, terminate=True)
        activity.finish()

        if (retCode != 0) or (gcID is None):
            if self.explainError(proc, retCode):
                pass
            else:
                self._log.log_process(proc, files={'jdl': SafeFile(jdl).read()})
    finally:
        utils.removeFiles([jdl])
    return (jobNum, utils.QM(gcID, self._createId(gcID), None), {'jdl': str.join('', jdlData)})
def iter_blocks_normed(self):
    activity = Activity('Retrieving %s' % self._dataset_expr)
    try:
        # Validation, Naming:
        for block in self._iter_blocks_raw():
            if not block.get(DataProvider.Dataset):
                raise DatasetError('Block does not contain the dataset name!')
            block.setdefault(DataProvider.BlockName, '0')
            block.setdefault(DataProvider.Provider, self.__class__.__name__)
            block.setdefault(DataProvider.Query, self._dataset_expr)
            block.setdefault(DataProvider.Locations, None)
            events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
            block.setdefault(DataProvider.NEntries, events)
            if self._dataset_nick_override:
                block[DataProvider.Nickname] = self._dataset_nick_override
            elif self._nick_producer:
                block = self._nick_producer.process_block(block)
                if not block:
                    raise DatasetError('Nickname producer failed!')
            yield block
    except Exception:
        raise DatasetRetrievalError('Unable to retrieve dataset %s' % repr(self._dataset_expr))
    activity.finish()
def _resync(self):
    if self._data_provider:
        activity = Activity('Performing resync of datasource %r' % self._name)
        # Get old and new dataset information
        ds_old = DataProvider.loadFromFile(self._getDataPath('cache.dat')).getBlocks(show_stats=False)
        self._data_provider.clearCache()
        ds_new = self._data_provider.getBlocks(show_stats=False)
        self._data_provider.saveToFile(self._getDataPath('cache-new.dat'), ds_new)

        # Use old splitting information to synchronize with new dataset infos
        old_maxN = self._data_splitter.getMaxJobs()
        jobChanges = self._data_splitter.resyncMapping(self._getDataPath('map-new.tar'), ds_old, ds_new)
        activity.finish()
        if jobChanges is not None:
            # Move current splitting to backup and use the new splitting from now on
            def backupRename(old, cur, new):
                if self._keepOld:
                    os.rename(self._getDataPath(cur), self._getDataPath(old))
                os.rename(self._getDataPath(new), self._getDataPath(cur))
            backupRename('map-old-%d.tar' % time.time(), 'map.tar', 'map-new.tar')
            backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
            self._data_splitter.importPartitions(self._getDataPath('map.tar'))
            self._maxN = self._data_splitter.getMaxJobs()
            self._log.debug('Dataset resync finished: %d -> %d partitions', old_maxN, self._maxN)
            return (set(jobChanges[0]), set(jobChanges[1]), old_maxN != self._maxN)
def _cleanup_remote_output_dn(self):
    # active remote submission should clean up when no jobs remain
    if self._remote_type in (PoolType.SSH, PoolType.GSISSH):
        activity = Activity('clearing remote work directory')
        # check whether there are any remote working directories remaining
        check_proc = self._proc_factory.logged_execute(
            'find %s -maxdepth 1 -type d | wc -l' % self._get_remote_output_dn())
        try:
            if int(check_proc.get_output()) <= 1:
                cleanup_cmd = 'rm -rf %s' % self._get_remote_output_dn()
                cleanup_proc = self._proc_factory.logged_execute(cleanup_cmd)
                if cleanup_proc.wait() != 0:
                    if self._explain_error(cleanup_proc, cleanup_proc.wait()):
                        return
                    cleanup_proc.log_error(self._error_log_fn)
                    raise BackendError('Cleanup process %s returned: %s' % (
                        cleanup_proc.cmd, cleanup_proc.get_message()))
        except Exception:
            self._log.warning('There might be some junk data left in: %s @ %s',
                self._get_remote_output_dn(), self._proc_factory.get_domain())
            raise BackendError('Unable to clean up remote working directory')
        activity.finish()
def doTransfer(self, listDescSourceTarget):
    for (desc, source, target) in listDescSourceTarget:
        if not self.smPaths:
            raise ConfigError("%s can't be transferred because '%s path' wasn't set" % (desc, self.smOptPrefix))
        for idx, sePath in enumerate(set(self.smPaths)):
            activity = Activity('Copy %s to SE %d ' % (desc, idx + 1))
            proc = se_copy(source, os.path.join(sePath, target), self.smForce)
            proc.status(timeout=5 * 60, terminate=True)
            activity.finish()
            if proc.status(timeout=0) == 0:
                self._log.info('Copy %s to SE %d finished', desc, idx + 1)
            else:
                self._log.info('Copy %s to SE %d failed', desc, idx + 1)
                self._log.critical(proc.stderr.read(timeout=0))
                self._log.critical('Unable to copy %s! You can try to copy it manually.', desc)
                if not utils.getUserBool('Is %s (%s) available on SE %s?' % (desc, source, sePath), False):
                    raise StorageError('%s is missing on SE %s!' % (desc, sePath))
def _fill_cms_fi_list(self, block, block_path):
    activity_fi = Activity('Getting file information')
    lumi_used = False
    lumi_info_dict = {}
    if self._lumi_query:  # central lumi query
        lumi_info_dict = self._get_cms_lumi_dict(block_path)
    fi_list = []
    for (fi, lumi_info_list) in self._iter_cms_files(block_path, self._only_valid, self._lumi_query):
        self._raise_on_abort()
        if lumi_info_dict and not lumi_info_list:
            lumi_info_list = lumi_info_dict.get(fi[DataProvider.URL], [])
        if lumi_info_list:
            (run_list_result, lumi_list_result) = ([], [])
            for (run, lumi_list) in sorted(lumi_info_list):
                run_list_result.extend([run] * len(lumi_list))
                lumi_list_result.extend(lumi_list)
            assert len(run_list_result) == len(lumi_list_result)
            fi[DataProvider.Metadata] = [run_list_result, lumi_list_result]
            lumi_used = True
        fi_list.append(fi)
    if lumi_used:
        block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
    block[DataProvider.FileList] = fi_list
    activity_fi.finish()
def _get_jobs_output(self, gc_id_jobnum_list):
    # Get output of jobs and yield output dirs
    if len(gc_id_jobnum_list) == 0:
        raise StopIteration

    root_dn = os.path.join(self._path_output, 'tmp')
    try:
        if len(gc_id_jobnum_list) == 1:
            # For single jobs create single subdir
            tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
        else:
            tmp_dn = root_dn
        ensure_dir_exists(tmp_dn)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

    map_gc_id2jobnum = dict(gc_id_jobnum_list)
    jobs = self._write_wms_id_list(gc_id_jobnum_list)

    activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
    proc = LocalProcess(self._output_exec, '--noint', '--logfile', '/dev/stderr', '-i', jobs, '--dir', tmp_dn)

    # yield output dirs
    todo = map_gc_id2jobnum.values()
    current_jobnum = None
    for line in imap(str.strip, proc.stdout.iter(timeout=60)):
        if line.startswith(tmp_dn):
            todo.remove(current_jobnum)
            output_dn = line.strip()
            unpack_wildcard_tar(self._log, output_dn)
            yield (current_jobnum, output_dn)
            current_jobnum = None
        else:
            current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
    exit_code = proc.status(timeout=0, terminate=True)
    activity.finish()

    if exit_code != 0:
        if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
            remove_files([jobs, root_dn])
            raise StopIteration
        else:
            self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
        self._log.error('Trying to recover from error ...')
        for dn in os.listdir(root_dn):
            yield (None, os.path.join(root_dn, dn))

    # return unretrievable jobs
    for jobnum in todo:
        yield (jobnum, None)
    remove_files([jobs, tmp_dn])
def _submit_jobs(self, jobnum_list, task):
    # submit_jobs: Submit a number of jobs and yield (jobnum, WMS ID, other data) sequentially
    # >>jobnum: internal ID of the Job
    # JobNum is linked to the actual *task* here
    (jdl_fn, submit_jdl_fn) = self._submit_jobs_prepare(jobnum_list, task)
    try:
        # submit all jobs simultaneously and temporarily store verbose (ClassAd) output
        activity = Activity('queuing jobs at scheduler')
        submit_args = ' -verbose -batch-name ' + task.get_description().task_name + ' ' + submit_jdl_fn
        proc = self._proc_factory.logged_execute(self._submit_exec, submit_args)

        # extract the Condor ID (WMS ID) of the jobs from output ClassAds
        jobnum_gc_id_list = []
        for line in proc.iter():
            if 'GridControl_GCIDtoWMSID' in line:
                jobnum_wms_id = line.split('=')[1].strip(' "\n').split('@')
                jobnum, wms_id = int(jobnum_wms_id[0]), jobnum_wms_id[1].strip()
                # Condor creates a default job then overwrites settings on any subsequent job
                # i.e. skip every second, but better be sure
                if (not jobnum_gc_id_list) or (jobnum not in lzip(*jobnum_gc_id_list)[0]):
                    jobnum_gc_id_list.append((jobnum, self._create_gc_id(wms_id)))

        exit_code = proc.wait()
        activity.finish()
        if (exit_code != 0) or (len(jobnum_gc_id_list) < len(jobnum_list)):
            if not self._explain_error(proc, exit_code):
                self._log.error('Submitted %4d jobs of %4d expected',
                    len(jobnum_gc_id_list), len(jobnum_list))
                proc.log_error(self._error_log_fn, jdl=jdl_fn)
    finally:
        remove_files([jdl_fn])

    for (jobnum, gc_id) in jobnum_gc_id_list:
        yield (jobnum, gc_id, {})
def submit_jobs(self, jobnum_list, task):
    requestLen = len(jobnum_list)
    activity = Activity('Submitting jobs (--%)')
    while jobnum_list:
        jobSubmitNumList = jobnum_list[-self._schedd.getSubmitScale():]
        del jobnum_list[-self._schedd.getSubmitScale():]
        activity = Activity('Submitting jobs (%2d%%)' % (100 * (requestLen - len(jobnum_list)) / requestLen))
        for jobnum in jobSubmitNumList:
            self._write_job_config(self.getJobCfgPath(jobnum)[0], jobnum, task, {})
        rawJobInfoMaps = self._schedd.submit_jobs(jobSubmitNumList, task, self._getQueryArgs())
        # Yield (jobnum, gc_id, other data) per job
        jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
        for htcID in jobInfoMaps:
            yield (htcID.gcJobNum, self._createGcId(htcID), jobInfoMaps[htcID])
    activity.finish()
def create_tarball(match_info_iter, **kwargs):
    tar = tarfile.open(mode='w:gz', **kwargs)
    activity = Activity('Generating tarball')
    for match_info in match_info_iter:
        if isinstance(match_info, tuple):
            (path_source, path_target) = match_info
        else:
            (path_source, path_target) = (match_info, None)
        if isinstance(path_source, str):
            if not os.path.exists(path_source):
                raise PathError('File %s does not exist!' % path_source)
            tar.add(path_source, path_target or os.path.basename(path_source), recursive=False)
        elif path_source is None:  # Update activity
            activity.update('Generating tarball: %s' % path_target)
        else:  # File handle
            info, handle = path_source.get_tar_info()
            if path_target:
                info.name = path_target
            info.mtime = time.time()
            info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
            if info.name.endswith('.sh') or info.name.endswith('.py'):
                info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
            tar.addfile(info, handle)
            handle.close()
    activity.finish()
    tar.close()
def _resync_psrc(self):
    activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
    # Get old and new dataset information
    provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
    block_list_old = provider_old.get_block_list_cached(show_stats=False)
    self._provider.clear_cache()
    block_list_new = self._provider.get_block_list_cached(show_stats=False)
    self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)

    # Use old splitting information to synchronize with new dataset infos
    partition_len_old = self.get_parameter_len()
    partition_changes = self._resync_partitions(
        self._get_data_path('map-new.tar'), block_list_old, block_list_new)
    activity.finish()
    if partition_changes is not None:
        # Move current splitting to backup and use the new splitting from now on
        def _rename_with_backup(new, cur, old):
            if self._keep_old:
                os.rename(self._get_data_path(cur), self._get_data_path(old))
            os.rename(self._get_data_path(new), self._get_data_path(cur))
        _rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
        _rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
        self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
        self._log.debug('Dataset resync finished: %d -> %d partitions', partition_len_old, self._len)
        (pnum_list_redo, pnum_list_disable) = partition_changes
        return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)
def _set_proxy_lifetime(self):
    activity = Activity('Get proxy lifetime...')
    proc = LocalProcess(resolve_install_path('voms-proxy-info'))
    output = proc.get_output(timeout=10, raise_errors=False)
    end_of_proxy = 0
    proxy_key = None
    for line in output.split('\n'):
        if 'subject' in line:
            proxy_key = line.encode("hex")[-15:]
        if 'timeleft' in line:
            h, m, s = int(line.split(':')[-3]), int(line.split(':')[-2]), int(line.split(':')[-1])
            end_of_proxy = time.time() + h * 60 * 60 + m * 60 + s
            break
    if end_of_proxy == 0:
        self._log.warning("Couldn't evaluate end of proxy. Output was:")
        self._log.warning(output)
        time.sleep(300)
        self._set_proxy_lifetime()
    else:
        self._end_of_proxy_lifetime = end_of_proxy
        if proxy_key is not None:
            self._delegated_proxy_filename = os.path.join(
                os.path.expanduser("~"), ".gcDelegatedProxy" + proxy_key)
        left_time_str = datetime.fromtimestamp(self._end_of_proxy_lifetime).strftime("%A, %B %d, %Y %I:%M:%S")
        self._log.info('End of current proxy lifetime: %s' % left_time_str)
    activity.finish()
    return 0
def delete_job(opts, work_dn, status_mon, job_db, job_obj, jobnum):
    activity = Activity('Deleting output files')
    try:
        if (job_obj.get('deleted') == 'True') and not opts.mark_ignore_rm:
            return status_mon.register_job_result(jobnum, 'Files are already deleted',
                JobDownloadStatus.JOB_ALREADY)
        if (job_obj.get('download') != 'True') and not opts.mark_ignore_dl:
            return status_mon.register_job_result(jobnum, 'Files are not yet downloaded',
                JobDownloadStatus.JOB_INCOMPLETE)
        fi_list = FileInfoProcessor().process(os.path.join(work_dn, 'output', 'job_%d' % jobnum)) or []
        if not fi_list:
            return status_mon.register_job_result(jobnum, 'Job has no output files',
                JobDownloadStatus.JOB_NO_OUTPUT)
        job_failed = job_obj.state != Job.SUCCESS
        delete_files(opts, jobnum, fi_list, download_failed=job_failed, show_se_skip=True)
        set_job_prop(job_db, jobnum, job_obj, 'deleted', 'True')
        status_mon.register_job_result(jobnum, 'All files deleted', JobDownloadStatus.JOB_OK)
    finally:
        activity.finish()
def _read_jobs(self, job_limit):
    job_map = {}
    max_job_len = 0
    if os.path.exists(self._db_fn):
        try:
            tar = zipfile.ZipFile(self._db_fn, 'r', zipfile.ZIP_DEFLATED)
            tar.testzip()
        except Exception:  # Try to recover job archive
            clear_current_exception()
            self._log.warning('Job database is corrupted - starting recovery')
            self._recover_jobs()
            self._log.info('Recover completed!')
        activity = Activity('Reading job transactions')
        max_job_len = len(tar.namelist())
        map_jobnum2tarfn = {}
        for idx, tar_info_fn in enumerate(tar.namelist()):
            (jobnum, tid) = tuple(imap(lambda s: int(s[1:]), tar_info_fn.split('_', 1)))
            if tid < map_jobnum2tarfn.get(jobnum, 0):
                continue
            try:
                data = self._fmt.parse(tar.open(tar_info_fn).read())
            except Exception:
                clear_current_exception()
                continue
            job_map[jobnum] = self._create_job_obj(tar_info_fn, data)
            map_jobnum2tarfn[jobnum] = tid
            if idx % 100 == 0:
                activity.update('Reading job transactions %d [%d%%]' % (idx, (100.0 * idx) / max_job_len))
        activity.finish()
    self._serial = max_job_len
    return job_map
def _tidyUpWorkingDirectory(self, forceCleanup=False):
    # active remote submission should clean up when no jobs remain
    if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
        self.debugOut("Revising remote working directory for cleanup. Forced CleanUp: %s" % forceCleanup)
        activity = Activity('revising remote work directory')
        # check whether there are any remote working directories remaining
        checkProcess = self.Pool.LoggedExecute('find %s -maxdepth 1 -type d | wc -l' % self.getWorkdirPath())
        try:
            if forceCleanup or (int(checkProcess.getOutput()) <= 1):
                cleanupProcess = self.Pool.LoggedExecute('rm -rf %s' % self.getWorkdirPath())
                if cleanupProcess.wait() != 0:
                    if self.explainError(cleanupProcess, cleanupProcess.wait()):
                        return
                    cleanupProcess.logError(self.errorLog)
                    raise BackendError('Cleanup process %s returned: %s' % (
                        cleanupProcess.cmd, cleanupProcess.getMessage()))
        except Exception:
            self._log.warning('There might be some junk data left in: %s @ %s',
                self.getWorkdirPath(), self.Pool.getDomain())
            raise BackendError('Unable to clean up remote working directory')
        activity.finish()
def __init__(self, block_list_old, block_list_new):
    activity = Activity('Performing resynchronization of dataset')
    block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
    (self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
    for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
        sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    activity.finish()
def _saveStateToTar(self, tar, meta, source, sourceLen, message):
    # Write the splitting info grouped into subtarfiles
    activity = Activity(message)
    (jobNum, lastValid, subTar) = (-1, -1, None)
    for jobNum, entry in enumerate(source):
        if not entry.get(DataSplitter.Invalid, False):
            lastValid = jobNum
        if jobNum % self._keySize == 0:
            self._closeSubTar(tar, subTar)
            subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / self._keySize))
            activity.update('%s [%d / %d]' % (message, jobNum, sourceLen))
        # Determine shortest way to store file list
        tmp = entry.pop(DataSplitter.FileList)
        savelist = self._getReducedFileList(entry, tmp)  # can modify entry
        # Write files with infos / filelist
        data = str.join('', self._fmt.format(entry, fkt=self._formatFileEntry) +
            lmap(lambda fn: '=%s\n' % fn, savelist))
        self._addToSubTar(subTar, '%05d' % jobNum, data)
        # Remove common prefix from info
        if DataSplitter.CommonPrefix in entry:
            entry.pop(DataSplitter.CommonPrefix)
        entry[DataSplitter.FileList] = tmp
    self._closeSubTar(tar, subTar)
    activity.finish()
    # Write metadata to allow reconstruction of data splitter
    meta['MaxJobs'] = lastValid + 1
    for (fn, data) in [('Metadata', self._fmt.format(meta)), ('Version', '2')]:
        self._addToTar(tar, fn, data)
def hash_verify(opts, status_mon, local_se_path, jobnum, fi_idx, fi):
    if not opts.verify_md5:
        return status_mon.register_file_result(jobnum, fi_idx, 'Download successful',
            FileDownloadStatus.FILE_OK)
    # Verify => compute md5hash
    remote_hash = fi[FileInfo.Hash]
    activity = Activity('Verifying checksum')
    try:
        local_hash = ignore_exception(Exception, None, hash_calc, local_se_path.replace('file://', ''))
        if local_hash is None:
            return status_mon.register_file_result(jobnum, fi_idx, 'Unable to calculate checksum',
                FileDownloadStatus.FILE_HASH_FAILED)
    finally:
        activity.finish()
    hash_match = fi[FileInfo.Hash] == local_hash
    match_map = {True: 'MATCH', False: 'FAIL'}
    if ANSI is not None:
        match_map = {True: ANSI.reset + ANSI.color_green + 'MATCH' + ANSI.reset,
            False: ANSI.reset + ANSI.color_red + 'FAIL' + ANSI.reset}
    msg = '\tLocal hash: %s\n' % local_hash + \
        log_intro(jobnum, fi_idx) + '\tRemote hash: %s\n' % remote_hash + \
        log_intro(jobnum, fi_idx) + 'Checksum comparison: ' + match_map[hash_match]
    if hash_match:
        return status_mon.register_file_result(jobnum, fi_idx, msg, FileDownloadStatus.FILE_OK)
    return status_mon.register_file_result(jobnum, fi_idx, msg, FileDownloadStatus.FILE_HASH_FAILED)
def __init__(self, lockfile):
    self._lockfile = lockfile
    activity = Activity('Trying to acquire lock file %s ...' % lockfile)
    while os.path.exists(self._lockfile):
        time.sleep(0.2)
    activity.finish()
    self._fd = open(self._lockfile, 'w')
    fcntl.flock(self._fd, fcntl.LOCK_EX)
def execute(self, wms_id_list, wms_name):  # yields list of (wms_id,)
    marked_wms_id_list = lmap(lambda result: result[0],
        self._cancel_executor.execute(wms_id_list, wms_name))
    time.sleep(5)
    activity = Activity('Purging jobs')
    for result in self._purge_executor.execute(marked_wms_id_list, wms_name):
        yield result
    activity.finish()
def _getJobsOutput(self, ids):
    if len(ids) == 0:
        raise StopIteration

    basePath = os.path.join(self._outputPath, 'tmp')
    try:
        if len(ids) == 1:
            # For single jobs create single subdir
            tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
        else:
            tmpPath = basePath
        utils.ensureDirExists(tmpPath)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % tmpPath, BackendError)

    jobNumMap = dict(ids)
    jobs = self.writeWMSIds(ids)

    activity = Activity('retrieving %d job outputs' % len(ids))
    proc = LocalProcess(self._outputExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs, '--dir', tmpPath)

    # yield output dirs
    todo = jobNumMap.values()
    currentJobNum = None
    for line in imap(str.strip, proc.stdout.iter(timeout=60)):
        if line.startswith(tmpPath):
            todo.remove(currentJobNum)
            outputDir = line.strip()
            if os.path.exists(outputDir):
                if 'GC_WC.tar.gz' in os.listdir(outputDir):
                    wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                    try:
                        tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
                        os.unlink(wildcardTar)
                    except Exception:
                        self._log.error('Can\'t unpack output files contained in %s', wildcardTar)
            yield (currentJobNum, line.strip())
            currentJobNum = None
        else:
            currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
    retCode = proc.status(timeout=0, terminate=True)
    activity.finish()

    if retCode != 0:
        if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
            utils.removeFiles([jobs, basePath])
            raise StopIteration
        else:
            self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
        self._log.error('Trying to recover from error ...')
        for dirName in os.listdir(basePath):
            yield (None, os.path.join(basePath, dirName))

    # return unretrievable jobs
    for jobNum in todo:
        yield (jobNum, None)
    utils.removeFiles([jobs, basePath])
def _delete(file_se_path, where, what):
    if se_exists(file_se_path).status(timeout=10, terminate=True) == 0:
        activity = Activity('Deleting file %s from %s' % (fi[FileInfo.NameDest], where))
        rm_proc = se_rm(file_se_path)
        if rm_proc.status(timeout=60, terminate=True) == 0:
            log.info(log_intro(jobnum, fi_idx) + 'Deleted file %s', file_se_path)
        else:
            log.log_process(rm_proc, msg=log_intro(jobnum, fi_idx) + 'Unable to remove %s' % what)
        activity.finish()
def _get_phedex_replica_list(self, block_path, replicas_dict):
    activity_fi = Activity('Getting file replica information from PhEDex')
    # Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
    replicas_dict[block_path] = []
    for phedex_block in self._pjrc.get(params={'block': block_path})['phedex']['block']:
        for replica in phedex_block['replica']:
            replica_info = (replica['node'], replica.get('se'), replica['complete'] == 'y')
            replicas_dict[block_path].append(replica_info)
    activity_fi.finish()
def get_dataset_name_list(self):
    if self._cache_dataset is None:
        self._cache_dataset = [self._dataset_path]
        if '*' in self._dataset_path:
            activity = Activity('Getting dataset list for %s' % self._dataset_path)
            self._cache_dataset = list(self._get_cms_dataset_list(self._dataset_path))
            if not self._cache_dataset:
                raise DatasetError('No datasets selected by DBS wildcard %s !' % self._dataset_path)
            activity.finish()
    return self._cache_dataset
def wait(timeout):
    activity = Activity('Waiting', parent='root')
    for remaining in irange(timeout, 0, -1):
        if abort():
            return False
        if (remaining == timeout) or (remaining < 5) or (remaining % 5 == 0):
            activity.update('Waiting for %d seconds' % remaining)
        time.sleep(1)
    activity.finish()
    return True
def getEntries(self, path, metadata, events, seList, objStore):
    metadata['GC_SOURCE_DIR'] = self._path
    counter = 0
    activity = Activity('Reading source directory')
    for fn in self._iter_path():
        activity.update('Reading source directory - [%d]' % counter)
        yield (os.path.join(self._path, fn.strip()), metadata, events, seList, objStore)
        counter += 1
    activity.finish()
def __init__(self, config, source):
    self._psrc_raw = source
    BasicParameterAdapter.__init__(self, config, source)
    self._map_jobnum2pnum = {}
    ensure_dir_exists(config.get_work_path(), 'parameter storage directory', ParameterError)
    self._path_jobnum2pnum = config.get_work_path('params.map.gz')
    self._path_params = config.get_work_path('params.dat.gz')

    # Find out if init should be performed - overrides resync_requested!
    init_requested = config.get_state('init', detail='parameters')
    init_needed = False
    if not (os.path.exists(self._path_params) and os.path.exists(self._path_jobnum2pnum)):
        init_needed = True  # Init needed if no parameter log exists
    if init_requested and not init_needed and (source.get_parameter_len() is not None):
        self._log.warning('Re-Initialization will overwrite the current mapping ' +
            'between jobs and parameter/dataset content! This can lead to invalid results!')
        user_msg = ('Do you want to perform a synchronization between ' +
            'the current mapping and the new one to avoid this?')
        if UserInputInterface().prompt_bool(user_msg, True):
            init_requested = False
    do_init = init_requested or init_needed

    # Find out if resync should be performed
    resync_by_user = config.get_state('resync', detail='parameters')
    config.set_state(False, 'resync', detail='parameters')
    psrc_hash = self._psrc_raw.get_psrc_hash()
    self._psrc_hash_stored = config.get('parameter hash', psrc_hash, persistent=True)
    psrc_hash_changed = self._psrc_hash_stored != psrc_hash  # Resync if parameters have changed
    resync_by_psrc = self._psrc_raw.get_resync_request()

    if do_init:  # Write current state
        self._write_jobnum2pnum(self._path_jobnum2pnum)
        ParameterSource.get_class('GCDumpParameterSource').write(self._path_params,
            self.get_job_len(), self.get_job_metadata(), self.iter_jobs())
    elif resync_by_user or resync_by_psrc or psrc_hash_changed:  # Perform sync
        if psrc_hash_changed:
            self._log.info('Parameter hash has changed')
            self._log.debug('\told hash: %s', self._psrc_hash_stored)
            self._log.debug('\tnew hash: %s', psrc_hash)
            self._log.log(logging.DEBUG1, '\tnew src: %s', self._psrc_raw)
            config.set_state(True, 'init', detail='config')
        elif resync_by_psrc:
            self._log.info('Parameter source requested resync')
            self._log.debug('\t%r', str.join(', ', imap(repr, resync_by_psrc)))
        elif resync_by_user:
            self._log.info('User requested resync')
        self._psrc_hash_stored = None
        self._resync_state = self.resync(force=True)
    else:  # Reuse old mapping
        activity = Activity('Loading cached parameter information')
        self._read_jobnum2pnum()
        activity.finish()
        return  # do not set parameter hash in config
    config.set('parameter hash', self._psrc_raw.get_psrc_hash())
def execute(self, wms_id_list, wms_name):  # yields list of purged (wms_id,)
    activity = Activity('waiting for jobs to finish')
    time.sleep(5)
    for wms_id in wms_id_list:
        path = self._sandbox_helper.get_sandbox('WMSID.%s.%s' % (wms_name, wms_id))
        if path is None:
            self._log.warning('Sandbox for job %r could not be found', wms_id)
            continue
        with_lock(LocalPurgeJobs.purge_lock, _purge_directory, self._log, path, wms_id)
        yield (wms_id,)
    activity.finish()
def _submit_job(self, jobnum, task):
    # Submit job and yield (jobnum, WMS ID, other data)
    activity = Activity('submitting job %d' % jobnum)
    try:
        sandbox = tempfile.mkdtemp('', '%s.%04d.' % (task.get_description().task_id, jobnum),
            self._sandbox_helper.get_path())
    except Exception:
        raise BackendError('Unable to create sandbox directory in "%s"!' % self._sandbox_helper.get_path())
    sb_prefix = sandbox.replace(self._sandbox_helper.get_path(), '').lstrip('/')

    def _translate_target(desc, src, target):
        return (desc, src, os.path.join(sb_prefix, target))
    self._sm_sb_in.do_transfer(ismap(_translate_target, self._get_in_transfer_info_list(task)))

    self._write_job_config(os.path.join(sandbox, '_jobconfig.sh'), jobnum, task,
        {'GC_SANDBOX': sandbox, 'GC_SCRATCH_SEARCH': str.join(' ', self._scratch_path)})
    reqs = self._broker_site.broker(task.get_requirement_list(jobnum), WMS.SITES)
    reqs = dict(self._broker_queue.broker(reqs, WMS.QUEUES))
    if (self._memory > 0) and (reqs.get(WMS.MEMORY, 0) < self._memory):
        # local jobs need higher (more realistic) memory requirements
        reqs[WMS.MEMORY] = self._memory

    job_name = task.get_description(jobnum).job_name
    proc = self._get_submit_proc(jobnum, sandbox, job_name, reqs)
    exit_code = proc.status(timeout=20, terminate=True)
    wms_id_str = proc.stdout.read(timeout=0).strip().strip('\n')
    wms_id = ignore_exception(Exception, None, self.parse_submit_output, wms_id_str)
    activity.finish()

    if exit_code != 0:
        self._log.warning('%s failed:', self._submit_exec)
    elif wms_id is None:
        self._log.warning('%s did not yield job id:\n%s', self._submit_exec, wms_id_str)
    gc_id = self._create_gc_id(wms_id)
    if gc_id is not None:
        open(os.path.join(sandbox, gc_id), 'w')
    else:
        self._log.log_process(proc)
    return (jobnum, gc_id, {'sandbox': sandbox})
def _readJobs(self, jobLimit):
    jobMap = {}
    maxJobs = 0
    if os.path.exists(self._dbFile):
        try:
            tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
        except Exception:  # Try to recover job archive
            self._log.warning('=' * 40 + '\nStarting recovery of broken job database => ' +
                'Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
            os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
            os.rename(self._dbFile, self._dbFile + '.broken')
            os.rename(self._dbFile + '.tmp', self._dbFile)
            tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
            removeFiles([self._dbFile + '.broken'])
            brokenList = []
            for idx, fnTarInfo in enumerate(tar.namelist()):
                (jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
                try:
                    fp = tar.open(fnTarInfo)
                    try:
                        fp.read()
                    finally:
                        fp.close()
                except Exception:
                    clear_current_exception()
                    brokenList.append(fnTarInfo)  # mark unreadable entry for removal
            for broken in brokenList:
                os.system('zip %s -d %s' % (self._dbFile, broken))
            self._log.info('Recover completed!')

        activity = Activity('Reading job transactions')
        maxJobs = len(tar.namelist())
        tMap = {}
        for idx, fnTarInfo in enumerate(tar.namelist()):
            (jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
            if tid < tMap.get(jobNum, 0):
                continue
            try:
                data = self._fmt.parse(tar.open(fnTarInfo).read())
            except Exception:
                continue
            jobMap[jobNum] = self._create_job_obj(fnTarInfo, data)
            tMap[jobNum] = tid
            if idx % 100 == 0:
                activity.update('Reading job transactions %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
        activity.finish()
    self._serial = maxJobs
    return jobMap
def getEntries(self, path, metadata, events, seList, objStore):
    activity = Activity('Reading job logs')
    for jobNum in self._selected:
        activity.update('Reading job logs - [%d / %d]' % (jobNum, self._selected[-1]))
        metadata['GC_JOBNUM'] = jobNum
        objStore.update({'GC_TASK': self._extTask, 'GC_WORKDIR': self._extWorkDir})
        yield (os.path.join(self._extWorkDir, 'output', 'job_%d' % jobNum),
            metadata, events, seList, objStore)
    activity.finish()
def execute(self, wmsIDs, wmsName):  # yields list of purged (wmsID,)
    activity = Activity('waiting for jobs to finish')
    time.sleep(5)
    for wmsID in wmsIDs:
        path = self._sandbox_helper.get_sandbox('WMSID.%s.%s' % (wmsName, wmsID))
        if path is None:
            self._log.warning('Sandbox for job %r could not be found', wmsID)
            continue
        try:
            shutil.rmtree(path)
        except Exception:
            raise BackendError('Sandbox for job %r could not be deleted' % wmsID)
        yield (wmsID,)
    activity.finish()
def _run_executor(self, desc, executor, fmt, gc_id_list, *args):
    # Perform some action with the executor, translate wms_id -> gc_id and format the result
    activity = Activity(desc)
    map_wms_id2gc_id = self._get_map_wms_id2gc_id(gc_id_list)
    wms_id_list = sorted(map_wms_id2gc_id.keys())
    for result in executor.execute(wms_id_list, *args):
        wms_id = result[0]  # result[0] is the wms_id by convention
        gc_id = map_wms_id2gc_id.pop(wms_id, None)
        if gc_id is not None:
            yield fmt((gc_id,) + result[1:])
        else:
            self._log.debug('unable to find gc_id for wms_id %r', wms_id)
    activity.finish()
def _run_executor(self, desc, executor, fmt, gcIDs, *args):
    # Perform some action with the executor, translate wmsID -> gcID and format the result
    activity = Activity(desc)
    wmsID_gcID_Map = self._get_map_wmsID_gcID(gcIDs)
    wmsIDs = sorted(wmsID_gcID_Map.keys())
    for result in executor.execute(wmsIDs, *args):
        wmsID = result[0]  # result[0] is the wmsID by convention
        gcID = wmsID_gcID_Map.pop(wmsID, None)
        if gcID is not None:
            yield fmt((gcID,) + result[1:])
        else:
            self._log.debug('unable to find gcID for wmsID %r', wmsID)
    activity.finish()
def __init__(self, config, jobLimit=-1, jobSelector=None):
    dbPath = config.getWorkPath('jobs')
    self._dbFile = config.getWorkPath('jobs.zip')
    if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
        activity = Activity('Converting job database')
        self._serial = 0
        try:
            oldDB = TextFileJobDB(config)
            for jobNum in oldDB.getJobs():
                self.commit(jobNum, oldDB.get(jobNum))
        except Exception:
            removeFiles([self._dbFile])
            raise
        activity.finish()
    ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
def resync(self, force=False):
    # Do not overwrite resync results - eg. from external or init trigger
    source_hash = self._source.getHash()
    if (self._resync_state == ParameterSource.EmptyResyncResult()) and ((source_hash != self._source_hash) or force):
        activity = Activity('Synchronizing parameter information')
        t_start = time.time()
        try:
            self._resync_state = self._resync()
        except Exception:
            raise ParameterError('Unable to resync parameters!')
        self._source_hash = self._source.getHash()
        activity.finish()
        self._log.log(logging.INFO, 'Finished resync of parameter source (%s)',
            strTimeShort(time.time() - t_start))
    result = self._resync_state
    self._resync_state = ParameterSource.EmptyResyncResult()
    return result
def cancel_jobs(self, gc_id_jobnum_list):
    if not len(gc_id_jobnum_list):
        raise StopIteration

    activity = Activity('Canceling jobs')
    assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
        self._splitGcRequests(gc_id_jobnum_list))), \
        'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
            lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
                self._splitGcRequests(gc_id_jobnum_list)), self._schedd.getURI())
    canceledJobs = self._schedd.cancel_jobs(self._splitGcRequests(gc_id_jobnum_list))
    # Yield (jobnum, wms_id) for canceled jobs
    for htcJobID in canceledJobs:
        yield (htcJobID.gcJobNum, self._createGcId(htcJobID))
    activity.finish()
def _get_jobs_output(self, gc_id_jobnum_list):
    if not len(gc_id_jobnum_list):
        raise StopIteration

    activity = Activity('Fetching jobs')
    assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
        self._splitGcRequests(gc_id_jobnum_list))), \
        'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
            lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
                self._splitGcRequests(gc_id_jobnum_list)), self._schedd.getURI())
    returnedJobs = self._schedd.getJobsOutput(self._splitGcRequests(gc_id_jobnum_list))
    # Yield (jobnum, path_output) per retrieved job
    for htcID in returnedJobs:
        yield (htcID.gcJobNum, self.getSandboxPath(htcID.gcJobNum))
    activity.finish()
def doTransfer(self, listDescSourceTarget): for (desc, source, target) in listDescSourceTarget: if not self.smPaths: raise ConfigError("%s can't be transferred because '%s path wasn't set" % (desc, self.smOptPrefix)) for idx, sePath in enumerate(set(self.smPaths)): activity = Activity('Copy %s to SE %d ' % (desc, idx + 1)) proc = se_copy(source, os.path.join(sePath, target), self.smForce) proc.status(timeout = 5*60, terminate = True) activity.finish() if proc.status(timeout = 0) == 0: self._log.info('Copy %s to SE %d finished', desc, idx + 1) else: self._log.info('Copy %s to SE %d failed', desc, idx + 1) self._log.critical(proc.stderr.read(timeout = 0)) self._log.critical('Unable to copy %s! You can try to copy it manually.', desc) if not utils.getUserBool('Is %s (%s) available on SE %s?' % (desc, source, sePath), False): raise StorageError('%s is missing on SE %s!' % (desc, sePath))