Example #1
0
 def getBlocksNormed(self):
     activity = Activity('Retrieving %s' % self._datasetExpr)
     try:
         # Validation, Naming:
         for block in self._getBlocksInternal():
             assert (block[DataProvider.Dataset])
             block.setdefault(DataProvider.BlockName, '0')
             block.setdefault(DataProvider.Provider,
                              self.__class__.__name__)
             block.setdefault(DataProvider.Locations, None)
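             # if the block does not specify an entry count, default to the sum over its file list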
             events = sum(
                 imap(lambda x: x[DataProvider.NEntries],
                      block[DataProvider.FileList]))
             block.setdefault(DataProvider.NEntries, events)
             if self._datasetNick:
                 block[DataProvider.Nickname] = self._datasetNick
             elif self._nickProducer:
                 block = self._nickProducer.processBlock(block)
                 if not block:
                     raise DatasetError('Nickname producer failed!')
             yield block
     except Exception:
         raise DatasetError('Unable to retrieve dataset %s' %
                            repr(self._datasetExpr))
     activity.finish()
Example #2
0
 def do_transfer(self, desc_source_target_list):
     for (desc, source, target) in desc_source_target_list:
         if not self._storage_paths:
             raise ConfigError(
                 "%s can't be transferred because '%s path wasn't set" %
                 (desc, self._storage_channel))
         for idx, se_path in enumerate(set(self._storage_paths)):
             activity = Activity('Copy %s to SE %d ' % (desc, idx + 1))
             proc = se_copy(source, os.path.join(se_path, target),
                            self._storage_force)
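             # wait up to 5 minutes for the copy; terminate the transfer process on timeout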
             proc.status(timeout=5 * 60, terminate=True)
             activity.finish()
             if proc.status(timeout=0) == 0:
                 self._log.info('Copy %s to SE %d finished', desc, idx + 1)
             else:
                 self._log.info('Copy %s to SE %d failed', desc, idx + 1)
                 self._log.log_process(proc)
                 self._log.critical(
                     'Unable to copy %s! You can try to copy it manually.',
                     desc)
                 msg = 'Is %s (%s) available on SE %s?' % (desc, source,
                                                           se_path)
                 if not UserInputInterface().prompt_bool(msg, False):
                     raise StorageError('%s is missing on SE %s!' %
                                        (desc, se_path))
Example #3
0
    def _get_jobs_output(self, gc_id_jobnum_list):
        # retrieve task output files from sandbox directory
        if not len(gc_id_jobnum_list):
            raise StopIteration

        activity = Activity('retrieving job outputs')
        for gc_id, jobnum in gc_id_jobnum_list:
            sandpath = self._get_sandbox_dn(jobnum)
            if sandpath is None:
                yield (jobnum, None)
                continue
            # when working with a remote spool schedd, tell condor to return files
            if self._remote_type == PoolType.SPOOL:
                self._check_and_log_proc(
                    self._proc_factory.logged_execute(
                        self._transfer_exec,
                        self._split_gc_id(gc_id)[1]))
            # when working with a remote [gsi]ssh schedd, manually return files
            elif self._remote_type in (PoolType.SSH, PoolType.GSISSH):
                self._check_and_log_proc(
                    self._proc_factory.logged_copy_from_remote(
                        self._get_remote_output_dn(jobnum),
                        self._get_sandbox_dn()))
                # clean up remote working directory
                self._check_and_log_proc(
                    self._proc_factory.logged_execute(
                        'rm -rf %s' % self._get_remote_output_dn(jobnum)))
            # eventually extract wildcarded output files from the tarball
            unpack_wildcard_tar(self._log, sandpath)
            yield (jobnum, sandpath)
        # clean up if necessary
        activity.finish()
        self._cleanup_remote_output_dn()
Example #4
0
    def _read_jobs(self, job_limit):
        ensure_dir_exists(self._path_db, 'job database directory', JobError)

        candidates = []
        for job_fn in fnmatch.filter(os.listdir(self._path_db), 'job_*.txt'):
            try:  # 2xsplit is faster than regex
                jobnum = int(job_fn.split(".")[0].split("_")[1])
            except Exception:
                clear_current_exception()
                continue
            candidates.append((jobnum, job_fn))

        (job_map, max_job_len) = ({}, len(candidates))
        activity = Activity('Reading job infos')
        idx = 0
        for (jobnum, job_fn) in sorted(candidates):
            idx += 1
            if jobnum >= job_limit >= 0:
                self._log.info(
                    'Stopped reading job infos at job #%d out of %d available job files, '
                    + 'since the limit of %d jobs is reached', jobnum,
                    len(candidates), job_limit)
                break
            try:
                job_fn_full = os.path.join(self._path_db, job_fn)
                data = self._fmt.parse(SafeFile(job_fn_full).iter_close())
                job_obj = self._create_job_obj(job_fn_full, data)
            except Exception:
                raise JobError('Unable to process job file %r' % job_fn_full)
            job_map[jobnum] = job_obj
            activity.update('Reading job infos %d [%d%%]' %
                            (idx, (100.0 * idx) / max_job_len))
        activity.finish()
        return job_map
Example #5
0
	def _readJobs(self, jobLimit):
		utils.ensureDirExists(self._dbPath, 'job database directory', JobError)

		candidates = []
		for jobFile in fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt'):
			try: # 2xsplit is faster than regex
				jobNum = int(jobFile.split(".")[0].split("_")[1])
			except Exception:
				continue
			candidates.append((jobNum, jobFile))

		(jobMap, maxJobs) = ({}, len(candidates))
		activity = Activity('Reading job infos')
		idx = 0
		for (jobNum, jobFile) in sorted(candidates):
			idx += 1
			if (jobLimit >= 0) and (jobNum >= jobLimit):
				self._log.info('Stopped reading job infos at job #%d out of %d available job files, since the limit of %d jobs is reached',
					jobNum, len(candidates), jobLimit)
				break
			jobObj = self._load_job(os.path.join(self._dbPath, jobFile))
			jobMap[jobNum] = jobObj
			if idx % 100 == 0:
				activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
		activity.finish()
		return jobMap
Example #6
0
	def execute(self, wmsIDs, wmsName): # yields list of (wmsID,)
		marked_wmsIDs = lmap(lambda result: result[0], self._cancel_executor.execute(wmsIDs, wmsName))
		time.sleep(5)
		activity = Activity('Purging jobs')
		for result in self._purge_executor.execute(marked_wmsIDs, wmsName):
			yield result
		activity.finish()
Example #7
0
	def _submitJob(self, jobNum, module):
		fd, jdl = tempfile.mkstemp('.jdl')
		try:
			jdlData = self.makeJDL(jobNum, module)
			utils.safeWrite(os.fdopen(fd, 'w'), jdlData)
		except Exception:
			utils.removeFiles([jdl])
			raise BackendError('Could not write jdl data to %s.' % jdl)

		try:
			submitArgs = []
			for key_value in utils.filterDict(self._submitParams, vF = lambda v: v).items():
				submitArgs.extend(key_value)
			submitArgs.append(jdl)

			activity = Activity('submitting job %d' % jobNum)
			proc = LocalProcess(self._submitExec, '--nomsg', '--noint', '--logfile', '/dev/stderr', *submitArgs)

			gcID = None
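			# the submission tool prints the job identifier as an URL - keep the last line starting with 'http'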
			for line in ifilter(lambda x: x.startswith('http'), imap(str.strip, proc.stdout.iter(timeout = 60))):
				gcID = line
			retCode = proc.status(timeout = 0, terminate = True)

			activity.finish()

			if (retCode != 0) or (gcID is None):
				if self.explainError(proc, retCode):
					pass
				else:
					self._log.log_process(proc, files = {'jdl': SafeFile(jdl).read()})
		finally:
			utils.removeFiles([jdl])
		return (jobNum, utils.QM(gcID, self._createId(gcID), None), {'jdl': str.join('', jdlData)})
Example #8
0
 def iter_blocks_normed(self):
     activity = Activity('Retrieving %s' % self._dataset_expr)
     try:
         # Validation, Naming:
         for block in self._iter_blocks_raw():
             if not block.get(DataProvider.Dataset):
                 raise DatasetError(
                     'Block does not contain the dataset name!')
             block.setdefault(DataProvider.BlockName, '0')
             block.setdefault(DataProvider.Provider,
                              self.__class__.__name__)
             block.setdefault(DataProvider.Query, self._dataset_expr)
             block.setdefault(DataProvider.Locations, None)
             events = sum(
                 imap(itemgetter(DataProvider.NEntries),
                      block[DataProvider.FileList]))
             block.setdefault(DataProvider.NEntries, events)
             if self._dataset_nick_override:
                 block[DataProvider.Nickname] = self._dataset_nick_override
             elif self._nick_producer:
                 block = self._nick_producer.process_block(block)
                 if not block:
                     raise DatasetError('Nickname producer failed!')
             yield block
     except Exception:
         raise DatasetRetrievalError('Unable to retrieve dataset %s' %
                                     repr(self._dataset_expr))
     activity.finish()
Example #9
0
	def _resync(self):
		if self._data_provider:
			activity = Activity('Performing resync of datasource %r' % self._name)
			# Get old and new dataset information
			ds_old = DataProvider.loadFromFile(self._getDataPath('cache.dat')).getBlocks(show_stats = False)
			self._data_provider.clearCache()
			ds_new = self._data_provider.getBlocks(show_stats = False)
			self._data_provider.saveToFile(self._getDataPath('cache-new.dat'), ds_new)

			# Use old splitting information to synchronize with new dataset infos
			old_maxN = self._data_splitter.getMaxJobs()
			jobChanges = self._data_splitter.resyncMapping(self._getDataPath('map-new.tar'), ds_old, ds_new)
			activity.finish()
			if jobChanges is not None:
				# Move current splitting to backup and use the new splitting from now on
				def backupRename(old, cur, new):
					if self._keepOld:
						os.rename(self._getDataPath(cur), self._getDataPath(old))
					os.rename(self._getDataPath(new), self._getDataPath(cur))
				backupRename(  'map-old-%d.tar' % time.time(),   'map.tar',   'map-new.tar')
				backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
				self._data_splitter.importPartitions(self._getDataPath('map.tar'))
				self._maxN = self._data_splitter.getMaxJobs()
				self._log.debug('Dataset resync finished: %d -> %d partitions', old_maxN, self._maxN)
				return (set(jobChanges[0]), set(jobChanges[1]), old_maxN != self._maxN)
Example #10
0
 def _cleanup_remote_output_dn(self):
     # active remote submission should clean up when no jobs remain
     if self._remote_type in (PoolType.SSH, PoolType.GSISSH):
         activity = Activity('clearing remote work directory')
         # check whether there are any remote working directories remaining
         check_proc = self._proc_factory.logged_execute(
             'find %s -maxdepth 1 -type d | wc -l' %
             self._get_remote_output_dn())
         try:
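             # the listing includes the base directory itself - a count of one means it is empty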
             if int(check_proc.get_output()) <= 1:
                 cleanup_cmd = 'rm -rf %s' % self._get_remote_output_dn()
                 cleanup_proc = self._proc_factory.logged_execute(
                     cleanup_cmd)
                 if cleanup_proc.wait() != 0:
                     if self._explain_error(cleanup_proc,
                                            cleanup_proc.wait()):
                         return
                     cleanup_proc.log_error(self._error_log_fn)
                     raise BackendError(
                         'Cleanup process %s returned: %s' %
                         (cleanup_proc.cmd, cleanup_proc.get_message()))
         except Exception:
             self._log.warning(
                 'There might be some junk data left in: %s @ %s',
                 self._get_remote_output_dn(),
                 self._proc_factory.get_domain())
             raise BackendError(
                 'Unable to clean up remote working directory')
         activity.finish()
Example #11
0
 def doTransfer(self, listDescSourceTarget):
     for (desc, source, target) in listDescSourceTarget:
         if not self.smPaths:
             raise ConfigError(
                 "%s can't be transferred because '%s path wasn't set" %
                 (desc, self.smOptPrefix))
         for idx, sePath in enumerate(set(self.smPaths)):
             activity = Activity('Copy %s to SE %d ' % (desc, idx + 1))
             proc = se_copy(source, os.path.join(sePath, target),
                            self.smForce)
             proc.status(timeout=5 * 60, terminate=True)
             activity.finish()
             if proc.status(timeout=0) == 0:
                 self._log.info('Copy %s to SE %d finished', desc, idx + 1)
             else:
                 self._log.info('Copy %s to SE %d failed', desc, idx + 1)
                 self._log.critical(proc.stderr.read(timeout=0))
                 self._log.critical(
                     'Unable to copy %s! You can try to copy it manually.',
                     desc)
                 if not utils.getUserBool(
                         'Is %s (%s) available on SE %s?' %
                     (desc, source, sePath), False):
                     raise StorageError('%s is missing on SE %s!' %
                                        (desc, sePath))
Example #12
0
 def _fill_cms_fi_list(self, block, block_path):
     activity_fi = Activity('Getting file information')
     lumi_used = False
     lumi_info_dict = {}
     if self._lumi_query:  # central lumi query
         lumi_info_dict = self._get_cms_lumi_dict(block_path)
     fi_list = []
     for (fi,
          lumi_info_list) in self._iter_cms_files(block_path,
                                                  self._only_valid,
                                                  self._lumi_query):
         self._raise_on_abort()
         if lumi_info_dict and not lumi_info_list:
             lumi_info_list = lumi_info_dict.get(fi[DataProvider.URL], [])
         if lumi_info_list:
             (run_list_result, lumi_list_result) = ([], [])
             for (run, lumi_list) in sorted(lumi_info_list):
                 run_list_result.extend([run] * len(lumi_list))
                 lumi_list_result.extend(lumi_list)
             assert len(run_list_result) == len(lumi_list_result)
             fi[DataProvider.Metadata] = [run_list_result, lumi_list_result]
             lumi_used = True
         fi_list.append(fi)
     if lumi_used:
         block.setdefault(DataProvider.Metadata,
                          []).extend(['Runs', 'Lumi'])
     block[DataProvider.FileList] = fi_list
     activity_fi.finish()
Example #13
0
	def _read_jobs(self, job_limit):
		ensure_dir_exists(self._path_db, 'job database directory', JobError)

		candidates = []
		for job_fn in fnmatch.filter(os.listdir(self._path_db), 'job_*.txt'):
			try:  # 2xsplit is faster than regex
				jobnum = int(job_fn.split(".")[0].split("_")[1])
			except Exception:
				clear_current_exception()
				continue
			candidates.append((jobnum, job_fn))

		(job_map, max_job_len) = ({}, len(candidates))
		activity = Activity('Reading job infos')
		idx = 0
		for (jobnum, job_fn) in sorted(candidates):
			idx += 1
			if jobnum >= job_limit >= 0:
				self._log.info('Stopped reading job infos at job #%d out of %d available job files, ' +
					'since the limit of %d jobs is reached', jobnum, len(candidates), job_limit)
				break
			try:
				job_fn_full = os.path.join(self._path_db, job_fn)
				data = self._fmt.parse(SafeFile(job_fn_full).iter_close())
				job_obj = self._create_job_obj(job_fn_full, data)
			except Exception:
				raise JobError('Unable to process job file %r' % job_fn_full)
			job_map[jobnum] = job_obj
			activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / max_job_len))
		activity.finish()
		return job_map
Example #14
0
    def _get_jobs_output(self, gc_id_jobnum_list):
        # Get output of jobs and yield output dirs
        if len(gc_id_jobnum_list) == 0:
            raise StopIteration

        root_dn = os.path.join(self._path_output, 'tmp')
        try:
            if len(gc_id_jobnum_list) == 1:
                # For single jobs create single subdir
                tmp_dn = os.path.join(root_dn,
                                      md5_hex(gc_id_jobnum_list[0][0]))
            else:
                tmp_dn = root_dn
            ensure_dir_exists(tmp_dn)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % tmp_dn,
                BackendError)

        map_gc_id2jobnum = dict(gc_id_jobnum_list)
        jobs = self._write_wms_id_list(gc_id_jobnum_list)

        activity = Activity('retrieving %d job outputs' %
                            len(gc_id_jobnum_list))
        proc = LocalProcess(self._output_exec, '--noint', '--logfile',
                            '/dev/stderr', '-i', jobs, '--dir', tmp_dn)

        # yield output dirs
        todo = map_gc_id2jobnum.values()
        current_jobnum = None
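        # lines starting with the temp dir are finished output directories; other lines identify the job they belong to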
        for line in imap(str.strip, proc.stdout.iter(timeout=60)):
            if line.startswith(tmp_dn):
                todo.remove(current_jobnum)
                output_dn = line.strip()
                unpack_wildcard_tar(self._log, output_dn)
                yield (current_jobnum, output_dn)
                current_jobnum = None
            else:
                current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line),
                                                      current_jobnum)
        exit_code = proc.status(timeout=0, terminate=True)
        activity.finish()

        if exit_code != 0:
            if 'Keyboard interrupt raised by user' in proc.stderr.read(
                    timeout=0):
                remove_files([jobs, root_dn])
                raise StopIteration
            else:
                self._log.log_process(proc,
                                      files={'jobs': SafeFile(jobs).read()})
            self._log.error('Trying to recover from error ...')
            for dn in os.listdir(root_dn):
                yield (None, os.path.join(root_dn, dn))

        # return unretrievable jobs
        for jobnum in todo:
            yield (jobnum, None)

        remove_files([jobs, tmp_dn])
Example #15
0
	def _submit_jobs(self, jobnum_list, task):
		# submit_jobs: Submit a number of jobs and yield (jobnum, WMS ID, other data) sequentially
		# >>jobnum: internal ID of the Job
		# JobNum is linked to the actual *task* here
		(jdl_fn, submit_jdl_fn) = self._submit_jobs_prepare(jobnum_list, task)
		try:
			# submit all jobs simultaneously and temporarily store verbose (ClassAd) output
			activity = Activity('queuing jobs at scheduler')
			submit_args = ' -verbose -batch-name ' + task.get_description().task_name + ' ' + submit_jdl_fn
			proc = self._proc_factory.logged_execute(self._submit_exec, submit_args)

			# extract the Condor ID (WMS ID) of the jobs from output ClassAds
			jobnum_gc_id_list = []
			for line in proc.iter():
				if 'GridControl_GCIDtoWMSID' in line:
					jobnum_wms_id = line.split('=')[1].strip(' "\n').split('@')
					jobnum, wms_id = int(jobnum_wms_id[0]), jobnum_wms_id[1].strip()
					# Condor creates a default job then overwrites settings on any subsequent job
					# i.e. skip every second, but better be sure
					if (not jobnum_gc_id_list) or (jobnum not in lzip(*jobnum_gc_id_list)[0]):
						jobnum_gc_id_list.append((jobnum, self._create_gc_id(wms_id)))

			exit_code = proc.wait()
			activity.finish()
			if (exit_code != 0) or (len(jobnum_gc_id_list) < len(jobnum_list)):
				if not self._explain_error(proc, exit_code):
					self._log.error('Submitted %4d jobs of %4d expected',
						len(jobnum_gc_id_list), len(jobnum_list))
					proc.log_error(self._error_log_fn, jdl=jdl_fn)
		finally:
			remove_files([jdl_fn])

		for (jobnum, gc_id) in jobnum_gc_id_list:
			yield (jobnum, gc_id, {})
Example #16
0
	def submit_jobs(self, jobnum_list, task):
		requestLen = len(jobnum_list)
		activity = Activity('Submitting jobs (--%)')
		while jobnum_list:
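			# submit in chunks taken from the end of the list; chunk size is given by the schedd submit scale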
			jobSubmitNumList = jobnum_list[-self._schedd.getSubmitScale():]
			del jobnum_list[-self._schedd.getSubmitScale():]
			activity = Activity('Submitting jobs (%2d%%)'%(100*(requestLen-len(jobnum_list))/requestLen))
			for jobnum in jobSubmitNumList:
				self._write_job_config(
					self.getJobCfgPath(jobnum)[0],
					jobnum,
					task, {}
					)
			rawJobInfoMaps = self._schedd.submit_jobs(
				jobSubmitNumList, 
				task,
				self._getQueryArgs()
				)
			# Yield (jobnum, gc_id, other data) per job
			jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
			for htcID in jobInfoMaps:
				yield (
					htcID.gcJobNum,
					self._createGcId(htcID),
					jobInfoMaps[htcID]
					)
		activity.finish()
Example #17
0
def create_tarball(match_info_iter, **kwargs):
	tar = tarfile.open(mode='w:gz', **kwargs)
	activity = Activity('Generating tarball')
	for match_info in match_info_iter:
		if isinstance(match_info, tuple):
			(path_source, path_target) = match_info
		else:
			(path_source, path_target) = (match_info, None)
		if isinstance(path_source, str):
			if not os.path.exists(path_source):
				raise PathError('File %s does not exist!' % path_source)
			tar.add(path_source, path_target or os.path.basename(path_source), recursive=False)
		elif path_source is None:  # Update activity
			activity.update('Generating tarball: %s' % path_target)
		else:  # File handle
			info, handle = path_source.get_tar_info()
			if path_target:
				info.name = path_target
			info.mtime = time.time()
			info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
			if info.name.endswith('.sh') or info.name.endswith('.py'):
				info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
			tar.addfile(info, handle)
			handle.close()
	activity.finish()
	tar.close()
Example #18
0
	def _resync_psrc(self):
		activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
		# Get old and new dataset information
		provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
		block_list_old = provider_old.get_block_list_cached(show_stats=False)
		self._provider.clear_cache()
		block_list_new = self._provider.get_block_list_cached(show_stats=False)
		self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)

		# Use old splitting information to synchronize with new dataset infos
		partition_len_old = self.get_parameter_len()
		partition_changes = self._resync_partitions(
			self._get_data_path('map-new.tar'), block_list_old, block_list_new)
		activity.finish()
		if partition_changes is not None:
			# Move current splitting to backup and use the new splitting from now on
			def _rename_with_backup(new, cur, old):
				if self._keep_old:
					os.rename(self._get_data_path(cur), self._get_data_path(old))
				os.rename(self._get_data_path(new), self._get_data_path(cur))
			_rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
			_rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
			self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
			self._log.debug('Dataset resync finished: %d -> %d partitions', partition_len_old, self._len)
			(pnum_list_redo, pnum_list_disable) = partition_changes
			return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)
Example #19
0
 def _set_proxy_lifetime(self):
     activity = Activity('Get proxy lifetime...')
     proc = LocalProcess(resolve_install_path('voms-proxy-info'))
     output = proc.get_output(timeout=10, raise_errors=False)
     end_of_proxy = 0
     proxy_key = None
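     # parse the text output of voms-proxy-info: the 'subject' line yields a proxy key, 'timeleft' the remaining lifetime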
     for l in output.split('\n'):
         if 'subject' in l:
             proxy_key = l.encode("hex")[-15:]
         if 'timeleft' in l:
             h, m, s = int(l.split(':')[-3]), int(l.split(':')[-2]), int(
                 l.split(':')[-1])
             end_of_proxy = time.time() + h * 60 * 60 + m * 60 + s
             break
     if end_of_proxy == 0:
         self._log.warning('could not evaluate end of proxy. Output was:')
         self._log.warning(output)
         time.sleep(300)
         self._set_proxy_lifetime()
     else:
         self._end_of_proxy_lifetime = end_of_proxy
         if proxy_key is not None:
             self._delegated_proxy_filename = os.path.join(
                 os.path.expanduser("~"), ".gcDelegatedProxy" + proxy_key)
         left_time_str = datetime.fromtimestamp(
             self._end_of_proxy_lifetime).strftime("%A, %B %d, %Y %I:%M:%S")
         self._log.info('End of current proxy lifetime: %s' % left_time_str)
         activity.finish()
     return 0
Example #20
0
def delete_job(opts, work_dn, status_mon, job_db, job_obj, jobnum):
    activity = Activity('Deleting output files')
    try:
        if (job_obj.get('deleted') == 'True') and not opts.mark_ignore_rm:
            return status_mon.register_job_result(
                jobnum, 'Files are already deleted',
                JobDownloadStatus.JOB_ALREADY)
        if (job_obj.get('download') != 'True') and not opts.mark_ignore_dl:
            return status_mon.register_job_result(
                jobnum, 'Files are not yet downloaded',
                JobDownloadStatus.JOB_INCOMPLETE)
        fi_list = FileInfoProcessor().process(
            os.path.join(work_dn, 'output', 'job_%d' % jobnum)) or []
        if not fi_list:
            return status_mon.register_job_result(
                jobnum, 'Job has no output files',
                JobDownloadStatus.JOB_NO_OUTPUT)
        job_successful = job_obj.state != Job.SUCCESS
        delete_files(opts,
                     jobnum,
                     fi_list,
                     download_failed=job_successful,
                     show_se_skip=True)
        set_job_prop(job_db, jobnum, job_obj, 'deleted', 'True')
        status_mon.register_job_result(jobnum, 'All files deleted',
                                       JobDownloadStatus.JOB_OK)
    finally:
        activity.finish()
Example #21
0
	def _read_jobs(self, job_limit):
		job_map = {}
		max_job_len = 0
		if os.path.exists(self._db_fn):
			try:
				tar = zipfile.ZipFile(self._db_fn, 'r', zipfile.ZIP_DEFLATED)
				tar.testzip()
			except Exception:  # Try to recover job archive
				clear_current_exception()
				self._log.warning('Job database is corrupted - starting recovery')
				self._recover_jobs()
				self._log.info('Recover completed!')
			activity = Activity('Reading job transactions')
			max_job_len = len(tar.namelist())
			map_jobnum2tarfn = {}
			for idx, tar_info_fn in enumerate(tar.namelist()):
				(jobnum, tid) = tuple(imap(lambda s: int(s[1:]), tar_info_fn.split('_', 1)))
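				# member names encode the job number and a transaction id - only the newest transaction per job is kept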
				if tid < map_jobnum2tarfn.get(jobnum, 0):
					continue
				try:
					data = self._fmt.parse(tar.open(tar_info_fn).read())
				except Exception:
					clear_current_exception()
					continue
				job_map[jobnum] = self._create_job_obj(tar_info_fn, data)
				map_jobnum2tarfn[jobnum] = tid
				if idx % 100 == 0:
					activity.update('Reading job transactions %d [%d%%]' % (idx, (100.0 * idx) / max_job_len))
			activity.finish()
		self._serial = max_job_len
		return job_map
Example #22
0
	def _get_jobs_output(self, gc_id_jobnum_list):
		# retrieve task output files from sandbox directory
		if not len(gc_id_jobnum_list):
			raise StopIteration

		activity = Activity('retrieving job outputs')
		for gc_id, jobnum in gc_id_jobnum_list:
			sandpath = self._get_sandbox_dn(jobnum)
			if sandpath is None:
				yield (jobnum, None)
				continue
			# when working with a remote spool schedd, tell condor to return files
			if self._remote_type == PoolType.SPOOL:
				self._check_and_log_proc(self._proc_factory.logged_execute(
					self._transfer_exec, self._split_gc_id(gc_id)[1]))
			# when working with a remote [gsi]ssh schedd, manually return files
			elif self._remote_type in (PoolType.SSH, PoolType.GSISSH):
				self._check_and_log_proc(self._proc_factory.logged_copy_from_remote(
					self._get_remote_output_dn(jobnum), self._get_sandbox_dn()))
				# clean up remote working directory
				self._check_and_log_proc(self._proc_factory.logged_execute(
					'rm -rf %s' % self._get_remote_output_dn(jobnum)))
			# eventually extract wildcarded output files from the tarball
			unpack_wildcard_tar(self._log, sandpath)
			yield (jobnum, sandpath)
		# clean up if necessary
		activity.finish()
		self._cleanup_remote_output_dn()
Example #23
0
    def _readJobs(self, jobLimit):
        utils.ensureDirExists(self._dbPath, 'job database directory', JobError)

        candidates = []
        for jobFile in fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt'):
            try:  # 2xsplit is faster than regex
                jobNum = int(jobFile.split(".")[0].split("_")[1])
            except Exception:
                continue
            candidates.append((jobNum, jobFile))

        (jobMap, maxJobs) = ({}, len(candidates))
        activity = Activity('Reading job infos')
        idx = 0
        for (jobNum, jobFile) in sorted(candidates):
            idx += 1
            if (jobLimit >= 0) and (jobNum >= jobLimit):
                self._log.info(
                    'Stopped reading job infos at job #%d out of %d available job files, since the limit of %d jobs is reached',
                    jobNum, len(candidates), jobLimit)
                break
            jobObj = self._load_job(os.path.join(self._dbPath, jobFile))
            jobMap[jobNum] = jobObj
            if idx % 100 == 0:
                activity.update('Reading job infos %d [%d%%]' %
                                (idx, (100.0 * idx) / maxJobs))
        activity.finish()
        return jobMap
Example #24
0
	def _submit_jobs(self, jobnum_list, task):
		# submit_jobs: Submit a number of jobs and yield (jobnum, WMS ID, other data) sequentially
		# >>jobnum: internal ID of the Job
		# JobNum is linked to the actual *task* here
		(jdl_fn, submit_jdl_fn) = self._submit_jobs_prepare(jobnum_list, task)
		try:
			# submit all jobs simultaneously and temporarily store verbose (ClassAd) output
			activity = Activity('queuing jobs at scheduler')
			submit_args = ' -verbose -batch-name ' + task.get_description().task_name + ' ' + submit_jdl_fn
			proc = self._proc_factory.logged_execute(self._submit_exec, submit_args)

			# extract the Condor ID (WMS ID) of the jobs from output ClassAds
			jobnum_gc_id_list = []
			for line in proc.iter():
				if 'GridControl_GCIDtoWMSID' in line:
					jobnum_wms_id = line.split('=')[1].strip(' "\n').split('@')
					jobnum, wms_id = int(jobnum_wms_id[0]), jobnum_wms_id[1].strip()
					# Condor creates a default job then overwrites settings on any subsequent job
					# i.e. skip every second, but better be sure
					if (not jobnum_gc_id_list) or (jobnum not in lzip(*jobnum_gc_id_list)[0]):
						jobnum_gc_id_list.append((jobnum, self._create_gc_id(wms_id)))

			exit_code = proc.wait()
			activity.finish()
			if (exit_code != 0) or (len(jobnum_gc_id_list) < len(jobnum_list)):
				if not self._explain_error(proc, exit_code):
					self._log.error('Submitted %4d jobs of %4d expected',
						len(jobnum_gc_id_list), len(jobnum_list))
					proc.log_error(self._error_log_fn, jdl=jdl_fn)
		finally:
			remove_files([jdl_fn])

		for (jobnum, gc_id) in jobnum_gc_id_list:
			yield (jobnum, gc_id, {})
Example #25
0
 def _tidyUpWorkingDirectory(self, forceCleanup=False):
     # active remote submission should clean up when no jobs remain
     if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
         self.debugOut(
             "Revising remote working directory for cleanup. Forced CleanUp: %s"
             % forceCleanup)
         activity = Activity('revising remote work directory')
         # check whether there are any remote working directories remaining
         checkProcess = self.Pool.LoggedExecute(
             'find %s -maxdepth 1 -type d | wc -l' % self.getWorkdirPath())
         try:
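             # 'find' counts the directory itself, so a result <= 1 means no job subdirectories are left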
             if forceCleanup or (int(checkProcess.getOutput()) <= 1):
                 cleanupProcess = self.Pool.LoggedExecute(
                     'rm -rf %s' % self.getWorkdirPath())
                 if cleanupProcess.wait() != 0:
                     if self.explainError(cleanupProcess,
                                          cleanupProcess.wait()):
                         return
                     cleanupProcess.logError(self.errorLog)
                     raise BackendError(
                         'Cleanup process %s returned: %s' %
                         (cleanupProcess.cmd, cleanupProcess.getMessage()))
         except Exception:
             self._log.warning(
                 'There might be some junk data left in: %s @ %s',
                 self.getWorkdirPath(), self.Pool.getDomain())
             raise BackendError(
                 'Unable to clean up remote working directory')
         activity.finish()
Example #26
0
	def __init__(self, block_list_old, block_list_new):
		activity = Activity('Performing resynchronization of dataset')
		block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
		(self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
		for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
			sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
		activity.finish()
Example #27
0
	def _saveStateToTar(self, tar, meta, source, sourceLen, message):
		# Write the splitting info grouped into subtarfiles
		activity = Activity(message)
		(jobNum, lastValid, subTar) = (-1, -1, None)
		for jobNum, entry in enumerate(source):
			if not entry.get(DataSplitter.Invalid, False):
				lastValid = jobNum
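			# close the current subtar and start a new one every _keySize entries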
			if jobNum % self._keySize == 0:
				self._closeSubTar(tar, subTar)
				subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / self._keySize))
				activity.update('%s [%d / %d]' % (message, jobNum, sourceLen))
			# Determine shortest way to store file list
			tmp = entry.pop(DataSplitter.FileList)
			savelist = self._getReducedFileList(entry, tmp) # can modify entry
			# Write files with infos / filelist
			data = str.join('', self._fmt.format(entry, fkt = self._formatFileEntry) + lmap(lambda fn: '=%s\n' % fn, savelist))
			self._addToSubTar(subTar, '%05d' % jobNum, data)
			# Remove common prefix from info
			if DataSplitter.CommonPrefix in entry:
				entry.pop(DataSplitter.CommonPrefix)
			entry[DataSplitter.FileList] = tmp
		self._closeSubTar(tar, subTar)
		activity.finish()
		# Write metadata to allow reconstruction of data splitter
		meta['MaxJobs'] = lastValid + 1
		for (fn, data) in [('Metadata', self._fmt.format(meta)), ('Version', '2')]:
			self._addToTar(tar, fn, data)
Example #28
0
def hash_verify(opts, status_mon, local_se_path, jobnum, fi_idx, fi):
    if not opts.verify_md5:
        return status_mon.register_file_result(jobnum, fi_idx,
                                               'Download successful',
                                               FileDownloadStatus.FILE_OK)
    # Verify => compute md5hash
    remote_hash = fi[FileInfo.Hash]
    activity = Activity('Verifying checksum')
    try:
        local_hash = ignore_exception(Exception, None, hash_calc,
                                      local_se_path.replace('file://', ''))
        if local_hash is None:
            return status_mon.register_file_result(
                jobnum, fi_idx, 'Unable to calculate checksum',
                FileDownloadStatus.FILE_HASH_FAILED)
    finally:
        activity.finish()
    hash_match = fi[FileInfo.Hash] == local_hash
    match_map = {True: 'MATCH', False: 'FAIL'}
    if ANSI is not None:
        match_map = {
            True: ANSI.reset + ANSI.color_green + 'MATCH' + ANSI.reset,
            False: ANSI.reset + ANSI.color_red + 'FAIL' + ANSI.reset
        }
    msg = '\tLocal  hash: %s\n' % local_hash + \
     log_intro(jobnum, fi_idx) + '\tRemote hash: %s\n' % remote_hash + \
     log_intro(jobnum, fi_idx) + 'Checksum comparison: ' + match_map[hash_match]
    if hash_match:
        return status_mon.register_file_result(jobnum, fi_idx, msg,
                                               FileDownloadStatus.FILE_OK)
    return status_mon.register_file_result(jobnum, fi_idx, msg,
                                           FileDownloadStatus.FILE_HASH_FAILED)
Example #29
0
def create_tarball(match_info_iter, **kwargs):
    tar = tarfile.open(mode='w:gz', **kwargs)
    activity = Activity('Generating tarball')
    for match_info in match_info_iter:
        if isinstance(match_info, tuple):
            (path_source, path_target) = match_info
        else:
            (path_source, path_target) = (match_info, None)
        if isinstance(path_source, str):
            if not os.path.exists(path_source):
                raise PathError('File %s does not exist!' % path_source)
            tar.add(path_source,
                    path_target or os.path.basename(path_source),
                    recursive=False)
        elif path_source is None:  # Update activity
            activity.update('Generating tarball: %s' % path_target)
        else:  # File handle
            info, handle = path_source.get_tar_info()
            if path_target:
                info.name = path_target
            info.mtime = time.time()
            info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
            if info.name.endswith('.sh') or info.name.endswith('.py'):
                info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
            tar.addfile(info, handle)
            handle.close()
    activity.finish()
    tar.close()
Example #30
def hash_verify(opts, status_mon, local_se_path, jobnum, fi_idx, fi):
	if not opts.verify_md5:
		return status_mon.register_file_result(jobnum, fi_idx, 'Download successful',
			FileDownloadStatus.FILE_OK)
	# Verify => compute md5hash
	remote_hash = fi[FileInfo.Hash]
	activity = Activity('Verifying checksum')
	try:
		local_hash = ignore_exception(Exception, None, hash_calc, local_se_path.replace('file://', ''))
		if local_hash is None:
			return status_mon.register_file_result(jobnum, fi_idx, 'Unable to calculate checksum',
				FileDownloadStatus.FILE_HASH_FAILED)
	finally:
		activity.finish()
	hash_match = fi[FileInfo.Hash] == local_hash
	match_map = {True: 'MATCH', False: 'FAIL'}
	if ANSI is not None:
		match_map = {True: ANSI.reset + ANSI.color_green + 'MATCH' + ANSI.reset,
			False: ANSI.reset + ANSI.color_red + 'FAIL' + ANSI.reset}
	msg = '\tLocal  hash: %s\n' % local_hash + \
		log_intro(jobnum, fi_idx) + '\tRemote hash: %s\n' % remote_hash + \
		log_intro(jobnum, fi_idx) + 'Checksum comparison: ' + match_map[hash_match]
	if hash_match:
		return status_mon.register_file_result(jobnum, fi_idx, msg, FileDownloadStatus.FILE_OK)
	return status_mon.register_file_result(jobnum, fi_idx, msg, FileDownloadStatus.FILE_HASH_FAILED)
Example #31
 def __init__(self, lockfile):
     self._lockfile = lockfile
     activity = Activity('Trying to acquire lock file %s ...' % lockfile)
     while os.path.exists(self._lockfile):
         time.sleep(0.2)
     activity.finish()
     self._fd = open(self._lockfile, 'w')
     fcntl.flock(self._fd, fcntl.LOCK_EX)
Example #32
0
	def execute(self, wms_id_list, wms_name):  # yields list of (wms_id,)
		marked_wms_id_list = lmap(lambda result: result[0],
			self._cancel_executor.execute(wms_id_list, wms_name))
		time.sleep(5)
		activity = Activity('Purging jobs')
		for result in self._purge_executor.execute(marked_wms_id_list, wms_name):
			yield result
		activity.finish()
Example #33
	def __init__(self, lockfile):
		self._lockfile = lockfile
		activity = Activity('Trying to acquire lock file %s ...' % lockfile)
		while os.path.exists(self._lockfile):
			time.sleep(0.2)
		activity.finish()
		self._fd = open(self._lockfile, 'w')
		fcntl.flock(self._fd, fcntl.LOCK_EX)
Example #34
0
	def _getJobsOutput(self, ids):
		if len(ids) == 0:
			raise StopIteration

		basePath = os.path.join(self._outputPath, 'tmp')
		try:
			if len(ids) == 1:
				# For single jobs create single subdir
				tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
			else:
				tmpPath = basePath
			utils.ensureDirExists(tmpPath)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % tmpPath, BackendError)

		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)

		activity = Activity('retrieving %d job outputs' % len(ids))
		proc = LocalProcess(self._outputExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs, '--dir', tmpPath)

		# yield output dirs
		todo = jobNumMap.values()
		currentJobNum = None
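		# the tool first prints a line identifying the job, then the path of the retrieved output directory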
		for line in imap(str.strip, proc.stdout.iter(timeout = 60)):
			if line.startswith(tmpPath):
				todo.remove(currentJobNum)
				outputDir = line.strip()
				if os.path.exists(outputDir):
					if 'GC_WC.tar.gz' in os.listdir(outputDir):
						wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
						try:
							tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
							os.unlink(wildcardTar)
						except Exception:
							self._log.error('Can\'t unpack output files contained in %s', wildcardTar)
				yield (currentJobNum, line.strip())
				currentJobNum = None
			else:
				currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
		retCode = proc.status(timeout = 0, terminate = True)
		activity.finish()

		if retCode != 0:
			if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout = 0):
				utils.removeFiles([jobs, basePath])
				raise StopIteration
			else:
				self._log.log_process(proc, files = {'jobs': SafeFile(jobs).read()})
			self._log.error('Trying to recover from error ...')
			for dirName in os.listdir(basePath):
				yield (None, os.path.join(basePath, dirName))

		# return unretrievable jobs
		for jobNum in todo:
			yield (jobNum, None)

		utils.removeFiles([jobs, basePath])
Example #35
		def _delete(file_se_path, where, what):
			if se_exists(file_se_path).status(timeout=10, terminate=True) == 0:
				activity = Activity('Deleting file %s from %s' % (fi[FileInfo.NameDest], where))
				rm_proc = se_rm(file_se_path)
				if rm_proc.status(timeout=60, terminate=True) == 0:
					log.info(log_intro(jobnum, fi_idx) + 'Deleted file %s', file_se_path)
				else:
					log.log_process(rm_proc, msg=log_intro(jobnum, fi_idx) + 'Unable to remove %s' % what)
				activity.finish()
Example #36
0
	def _get_phedex_replica_list(self, block_path, replicas_dict):
		activity_fi = Activity('Getting file replica information from PhEDex')
		# Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
		replicas_dict[block_path] = []
		for phedex_block in self._pjrc.get(params={'block': block_path})['phedex']['block']:
			for replica in phedex_block['replica']:
				replica_info = (replica['node'], replica.get('se'), replica['complete'] == 'y')
				replicas_dict[block_path].append(replica_info)
		activity_fi.finish()
Example #37
0
	def get_dataset_name_list(self):
		if self._cache_dataset is None:
			self._cache_dataset = [self._dataset_path]
			if '*' in self._dataset_path:
				activity = Activity('Getting dataset list for %s' % self._dataset_path)
				self._cache_dataset = list(self._get_cms_dataset_list(self._dataset_path))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._dataset_path)
				activity.finish()
		return self._cache_dataset
Example #38
0
def wait(timeout):
	activity = Activity('Waiting', parent='root')
	for remaining in irange(timeout, 0, -1):
		if abort():
			return False
		if (remaining == timeout) or (remaining < 5) or (remaining % 5 == 0):
			activity.update('Waiting for %d seconds' % remaining)
		time.sleep(1)
	activity.finish()
	return True
Example #39
0
 def getEntries(self, path, metadata, events, seList, objStore):
     metadata['GC_SOURCE_DIR'] = self._path
     counter = 0
     activity = Activity('Reading source directory')
     for fn in self._iter_path():
         activity.update('Reading source directory - [%d]' % counter)
         yield (os.path.join(self._path, fn.strip()), metadata, events,
                seList, objStore)
         counter += 1
     activity.finish()
Example #40
0
def wait(timeout):
    activity = Activity('Waiting', parent='root')
    for remaining in irange(timeout, 0, -1):
        if abort():
            return False
        if (remaining == timeout) or (remaining < 5) or (remaining % 5 == 0):
            activity.update('Waiting for %d seconds' % remaining)
        time.sleep(1)
    activity.finish()
    return True
Example #41
0
 def __init__(self, block_list_old, block_list_new):
     activity = Activity('Performing resynchronization of dataset')
     block_resync_tuple = DataProvider.resync_blocks(
         block_list_old, block_list_new)
     (self.block_list_added, self._block_list_missing,
      self._block_list_matching) = block_resync_tuple
     for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
         sort_inplace(block_missing[DataProvider.FileList],
                      key=itemgetter(DataProvider.URL))
     activity.finish()
Example #42
0
	def __init__(self, config, source):
		self._psrc_raw = source
		BasicParameterAdapter.__init__(self, config, source)
		self._map_jobnum2pnum = {}
		ensure_dir_exists(config.get_work_path(), 'parameter storage directory', ParameterError)
		self._path_jobnum2pnum = config.get_work_path('params.map.gz')
		self._path_params = config.get_work_path('params.dat.gz')

		# Find out if init should be performed - overrides resync_requested!
		init_requested = config.get_state('init', detail='parameters')
		init_needed = False
		if not (os.path.exists(self._path_params) and os.path.exists(self._path_jobnum2pnum)):
			init_needed = True  # Init needed if no parameter log exists
		if init_requested and not init_needed and (source.get_parameter_len() is not None):
			self._log.warning('Re-Initialization will overwrite the current mapping ' +
				'between jobs and parameter/dataset content! This can lead to invalid results!')
			user_msg = ('Do you want to perform a synchronization between ' +
				'the current mapping and the new one to avoid this?')
			if UserInputInterface().prompt_bool(user_msg, True):
				init_requested = False
		do_init = init_requested or init_needed

		# Find out if resync should be performed
		resync_by_user = config.get_state('resync', detail='parameters')
		config.set_state(False, 'resync', detail='parameters')
		psrc_hash = self._psrc_raw.get_psrc_hash()
		self._psrc_hash_stored = config.get('parameter hash', psrc_hash, persistent=True)
		psrc_hash_changed = self._psrc_hash_stored != psrc_hash  # Resync if parameters have changed
		resync_by_psrc = self._psrc_raw.get_resync_request()

		if do_init:  # Write current state
			self._write_jobnum2pnum(self._path_jobnum2pnum)
			ParameterSource.get_class('GCDumpParameterSource').write(self._path_params,
				self.get_job_len(), self.get_job_metadata(), self.iter_jobs())
		elif resync_by_user or resync_by_psrc or psrc_hash_changed:  # Perform sync
			if psrc_hash_changed:
				self._log.info('Parameter hash has changed')
				self._log.debug('\told hash: %s', self._psrc_hash_stored)
				self._log.debug('\tnew hash: %s', psrc_hash)
				self._log.log(logging.DEBUG1, '\tnew src: %s', self._psrc_raw)
				config.set_state(True, 'init', detail='config')
			elif resync_by_psrc:
				self._log.info('Parameter source requested resync')
				self._log.debug('\t%r', str.join(', ', imap(repr, resync_by_psrc)))
			elif resync_by_user:
				self._log.info('User requested resync')
			self._psrc_hash_stored = None
			self._resync_state = self.resync(force=True)
		else:  # Reuse old mapping
			activity = Activity('Loading cached parameter information')
			self._read_jobnum2pnum()
			activity.finish()
			return  # do not set parameter hash in config
		config.set('parameter hash', self._psrc_raw.get_psrc_hash())
Example #43
0
	def execute(self, wms_id_list, wms_name):  # yields list of purged (wms_id,)
		activity = Activity('waiting for jobs to finish')
		time.sleep(5)
		for wms_id in wms_id_list:
			path = self._sandbox_helper.get_sandbox('WMSID.%s.%s' % (wms_name, wms_id))
			if path is None:
				self._log.warning('Sandbox for job %r could not be found', wms_id)
				continue
			with_lock(LocalPurgeJobs.purge_lock, _purge_directory, self._log, path, wms_id)
			yield (wms_id,)
		activity.finish()
Example #44
0
 def _get_phedex_replica_list(self, block_path, replicas_dict):
     activity_fi = Activity('Getting file replica information from PhEDex')
     # Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
     replicas_dict[block_path] = []
     for phedex_block in self._pjrc.get(
             params={'block': block_path})['phedex']['block']:
         for replica in phedex_block['replica']:
             replica_info = (replica['node'], replica.get('se'),
                             replica['complete'] == 'y')
             replicas_dict[block_path].append(replica_info)
     activity_fi.finish()
Example #45
0
	def _get_jobs_output(self, gc_id_jobnum_list):
		# Get output of jobs and yield output dirs
		if len(gc_id_jobnum_list) == 0:
			raise StopIteration

		root_dn = os.path.join(self._path_output, 'tmp')
		try:
			if len(gc_id_jobnum_list) == 1:
				# For single jobs create single subdir
				tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
			else:
				tmp_dn = root_dn
			ensure_dir_exists(tmp_dn)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

		map_gc_id2jobnum = dict(gc_id_jobnum_list)
		jobs = self._write_wms_id_list(gc_id_jobnum_list)

		activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
		proc = LocalProcess(self._output_exec, '--noint',
			'--logfile', '/dev/stderr', '-i', jobs, '--dir', tmp_dn)

		# yield output dirs
		todo = map_gc_id2jobnum.values()
		current_jobnum = None
		for line in imap(str.strip, proc.stdout.iter(timeout=60)):
			if line.startswith(tmp_dn):
				todo.remove(current_jobnum)
				output_dn = line.strip()
				unpack_wildcard_tar(self._log, output_dn)
				yield (current_jobnum, output_dn)
				current_jobnum = None
			else:
				current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
		exit_code = proc.status(timeout=0, terminate=True)
		activity.finish()

		if exit_code != 0:
			if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
				remove_files([jobs, root_dn])
				raise StopIteration
			else:
				self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
			self._log.error('Trying to recover from error ...')
			for dn in os.listdir(root_dn):
				yield (None, os.path.join(root_dn, dn))

		# return unretrievable jobs
		for jobnum in todo:
			yield (jobnum, None)

		remove_files([jobs, tmp_dn])
Example #46
0
    def _submit_job(self, jobnum, task):
        # Submit job and yield (jobnum, WMS ID, other data)
        activity = Activity('submitting job %d' % jobnum)

        try:
            sandbox = tempfile.mkdtemp(
                '', '%s.%04d.' % (task.get_description().task_id, jobnum),
                self._sandbox_helper.get_path())
        except Exception:
            raise BackendError('Unable to create sandbox directory "%s"!' %
                               sandbox)
        sb_prefix = sandbox.replace(self._sandbox_helper.get_path(),
                                    '').lstrip('/')
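        # sandbox path relative to the sandbox base directory; used to prefix the input transfer targets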

        def _translate_target(desc, src, target):
            return (desc, src, os.path.join(sb_prefix, target))

        self._sm_sb_in.do_transfer(
            ismap(_translate_target, self._get_in_transfer_info_list(task)))

        self._write_job_config(
            os.path.join(sandbox, '_jobconfig.sh'), jobnum, task, {
                'GC_SANDBOX': sandbox,
                'GC_SCRATCH_SEARCH': str.join(' ', self._scratch_path)
            })
        reqs = self._broker_site.broker(task.get_requirement_list(jobnum),
                                        WMS.SITES)
        reqs = dict(self._broker_queue.broker(reqs, WMS.QUEUES))
        if (self._memory > 0) and (reqs.get(WMS.MEMORY, 0) < self._memory):
            # local jobs need higher (more realistic) memory requirements
            reqs[WMS.MEMORY] = self._memory

        job_name = task.get_description(jobnum).job_name
        proc = self._get_submit_proc(jobnum, sandbox, job_name, reqs)
        exit_code = proc.status(timeout=20, terminate=True)
        wms_id_str = proc.stdout.read(timeout=0).strip().strip('\n')
        wms_id = ignore_exception(Exception, None, self.parse_submit_output,
                                  wms_id_str)
        activity.finish()

        if exit_code != 0:
            self._log.warning('%s failed:', self._submit_exec)
        elif wms_id is None:
            self._log.warning('%s did not yield job id:\n%s',
                              self._submit_exec, wms_id_str)
        gc_id = self._create_gc_id(wms_id)
        if gc_id is not None:
            open(os.path.join(sandbox, gc_id), 'w')
        else:
            self._log.log_process(proc)
        return (jobnum, gc_id, {'sandbox': sandbox})
Example #47
0
 def _readJobs(self, jobLimit):
     jobMap = {}
     maxJobs = 0
     if os.path.exists(self._dbFile):
         try:
             tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
         except Exception:  # Try to recover job archive
             self._log.warning(
                 '=' * 40 +
                 '\nStarting recovery of broken job database => Answer "y" if asked "Is this a single-disk archive?"!\n'
                 + '=' * 40)
             os.system('zip -FF %s --out %s.tmp 2> /dev/null' %
                       (self._dbFile, self._dbFile))
             os.rename(self._dbFile, self._dbFile + '.broken')
             os.rename(self._dbFile + '.tmp', self._dbFile)
             tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
             removeFiles([self._dbFile + '.broken'])
             brokenList = []
             for idx, fnTarInfo in enumerate(tar.namelist()):
                 (jobNum, tid) = tuple(
                     imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
                 try:
                     fp = tar.open(fnTarInfo)
                     try:
                         fp.read()
                     finally:
                         fp.close()
                 except Exception:
                     clear_current_exception()
                     brokenList.append(fnTarInfo)  # remember unreadable entries so they can be removed below
             for broken in brokenList:
                 os.system('zip %s -d %s' % (self._dbFile, broken))
             self._log.info('Recover completed!')
         activity = Activity('Reading job transactions')
         maxJobs = len(tar.namelist())
         tMap = {}
         for idx, fnTarInfo in enumerate(tar.namelist()):
             (jobNum, tid) = tuple(
                 imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
             if tid < tMap.get(jobNum, 0):
                 continue
             try:
                 data = self._fmt.parse(tar.open(fnTarInfo).read())
             except Exception:
                 continue
             jobMap[jobNum] = self._create_job_obj(fnTarInfo, data)
             tMap[jobNum] = tid
             if idx % 100 == 0:
                 activity.update('Reading job transactions %d [%d%%]' %
                                 (idx, (100.0 * idx) / maxJobs))
         activity.finish()
     self._serial = maxJobs
     return jobMap
Example #48
0
 def getEntries(self, path, metadata, events, seList, objStore):
     activity = Activity('Reading job logs')
     for jobNum in self._selected:
         activity.update('Reading job logs - [%d / %d]' %
                         (jobNum, self._selected[-1]))
         metadata['GC_JOBNUM'] = jobNum
         objStore.update({
             'GC_TASK': self._extTask,
             'GC_WORKDIR': self._extWorkDir
         })
         yield (os.path.join(self._extWorkDir, 'output', 'job_%d' % jobNum),
                metadata, events, seList, objStore)
     activity.finish()
Example #49
0
	def execute(self, wmsIDs, wmsName): # yields list of purged (wmsID,)
		activity = Activity('waiting for jobs to finish')
		time.sleep(5)
		for wmsID in wmsIDs:
			path = self._sandbox_helper.get_sandbox('WMSID.%s.%s' % (wmsName, wmsID))
			if path is None:
				self._log.warning('Sandbox for job %r could not be found', wmsID)
				continue
			try:
				shutil.rmtree(path)
			except Exception:
				raise BackendError('Sandbox for job %r could not be deleted', wmsID)
			yield (wmsID,)
		activity.finish()
Example #50
0
    def _run_executor(self, desc, executor, fmt, gc_id_list, *args):
        # Perform some action with the executor, translate wms_id -> gc_id and format the result
        activity = Activity(desc)
        map_wms_id2gc_id = self._get_map_wms_id2gc_id(gc_id_list)
        wms_id_list = sorted(map_wms_id2gc_id.keys())

        for result in executor.execute(wms_id_list, *args):
            wms_id = result[0]  # result[0] is the wms_id by convention
            gc_id = map_wms_id2gc_id.pop(wms_id, None)
            if gc_id is not None:
                yield fmt((gc_id, ) + result[1:])
            else:
                self._log.debug('unable to find gc_id for wms_id %r', wms_id)
        activity.finish()
Example #51
0
	def _run_executor(self, desc, executor, fmt, gc_id_list, *args):
		# Perform some action with the executor, translate wms_id -> gc_id and format the result
		activity = Activity(desc)
		map_wms_id2gc_id = self._get_map_wms_id2gc_id(gc_id_list)
		wms_id_list = sorted(map_wms_id2gc_id.keys())

		for result in executor.execute(wms_id_list, *args):
			wms_id = result[0]  # result[0] is the wms_id by convention
			gc_id = map_wms_id2gc_id.pop(wms_id, None)
			if gc_id is not None:
				yield fmt((gc_id,) + result[1:])
			else:
				self._log.debug('unable to find gc_id for wms_id %r', wms_id)
		activity.finish()
Example #52
0
 def get_dataset_name_list(self):
     if self._cache_dataset is None:
         self._cache_dataset = [self._dataset_path]
         if '*' in self._dataset_path:
             activity = Activity('Getting dataset list for %s' %
                                 self._dataset_path)
             self._cache_dataset = list(
                 self._get_cms_dataset_list(self._dataset_path))
             if not self._cache_dataset:
                 raise DatasetError(
                     'No datasets selected by DBS wildcard %s !' %
                     self._dataset_path)
             activity.finish()
     return self._cache_dataset
Example #53
0
	def _run_executor(self, desc, executor, fmt, gcIDs, *args):
		# Perform some action with the executor, translate wmsID -> gcID and format the result
		activity = Activity(desc)
		wmsID_gcID_Map = self._get_map_wmsID_gcID(gcIDs)
		wmsIDs = sorted(wmsID_gcID_Map.keys())

		for result in executor.execute(wmsIDs, *args):
			wmsID = result[0] # result[0] is the wmsID by convention
			gcID = wmsID_gcID_Map.pop(wmsID, None)
			if gcID is not None:
				yield fmt((gcID,) + result[1:])
			else:
				self._log.debug('unable to find gcID for wmsID %r', wmsID)
		activity.finish()
Example #54
0
	def __init__(self, config, jobLimit = -1, jobSelector = None):
		dbPath = config.getWorkPath('jobs')
		self._dbFile = config.getWorkPath('jobs.zip')
		if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
			activity = Activity('Converting job database')
			self._serial = 0
			try:
				oldDB = TextFileJobDB(config)
				for jobNum in oldDB.getJobs():
					self.commit(jobNum, oldDB.get(jobNum))
			except Exception:
				removeFiles([self._dbFile])
				raise
			activity.finish()
		ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
Example #55
0
	def resync(self, force = False): # Do not overwrite resync results - e.g. from external or init trigger
		source_hash = self._source.getHash()
		if (self._resync_state == ParameterSource.EmptyResyncResult()) and ((source_hash != self._source_hash) or force):
			activity = Activity('Synchronizing parameter information')
			t_start = time.time()
			try:
				self._resync_state = self._resync()
			except Exception:
				raise ParameterError('Unable to resync parameters!')
			self._source_hash = self._source.getHash()
			activity.finish()
			self._log.log(logging.INFO, 'Finished resync of parameter source (%s)', strTimeShort(time.time() - t_start))
		result = self._resync_state
		self._resync_state = ParameterSource.EmptyResyncResult()
		return result
Example #56
0
	def cancel_jobs(self, gc_id_jobnum_list):
		if not len(gc_id_jobnum_list):
			raise StopIteration
		activity = Activity('Canceling jobs')
		assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
			self._splitGcRequests(gc_id_jobnum_list))), \
			'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
				lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
					self._splitGcRequests(gc_id_jobnum_list)), self._schedd.getURI())
		canceledJobs = self._schedd.cancel_jobs(
			self._splitGcRequests(gc_id_jobnum_list)
			)
		# Yield ( jobnum, wms_id) for canceled jobs
		for htcJobID in canceledJobs:
			yield (
				htcJobID.gcJobNum,
				self._createGcId(htcJobID)
				)
		activity.finish()
Example #57
0
	def _get_jobs_output(self, gc_id_jobnum_list):
		if not len(gc_id_jobnum_list):
			raise StopIteration
		activity = Activity('Fetching jobs')
		assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
			self._splitGcRequests(gc_id_jobnum_list))), \
			'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
				lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
					self._splitGcRequests(gc_id_jobnum_list)), self._schedd.getURI())
		returnedJobs = self._schedd.getJobsOutput(
			self._splitGcRequests(gc_id_jobnum_list)
			)
		# Yield (jobnum, path_output) per retrieved job
		for htcID in returnedJobs:
			yield (
				htcID.gcJobNum,
				self.getSandboxPath(htcID.gcJobNum)
				)
		activity.finish()
Example #58
0
	def doTransfer(self, listDescSourceTarget):
		for (desc, source, target) in listDescSourceTarget:
			if not self.smPaths:
				raise ConfigError("%s can't be transferred because '%s path wasn't set" % (desc, self.smOptPrefix))
			for idx, sePath in enumerate(set(self.smPaths)):
				activity = Activity('Copy %s to SE %d ' % (desc, idx + 1))
				proc = se_copy(source, os.path.join(sePath, target), self.smForce)
				proc.status(timeout = 5*60, terminate = True)
				activity.finish()
				if proc.status(timeout = 0) == 0:
					self._log.info('Copy %s to SE %d finished', desc, idx + 1)
				else:
					self._log.info('Copy %s to SE %d failed', desc, idx + 1)
					self._log.critical(proc.stderr.read(timeout = 0))
					self._log.critical('Unable to copy %s! You can try to copy it manually.', desc)
					if not utils.getUserBool('Is %s (%s) available on SE %s?' % (desc, source, sePath), False):
						raise StorageError('%s is missing on SE %s!' % (desc, sePath))