def send_dialog_messages(self, dialog_list):
    """Propagate harvester dialog messages to the PanDA server.

    Returns (status_bool, error_message); error_message is 'OK' on success.
    """
    tmpLog = self.make_logger(method_name='send_dialog_messages')
    tmpLog.debug('start')
    # serialize each dialog spec into the request payload
    payload = [diagSpec.convert_to_propagate() for diagSpec in dialog_list]
    data = {
        'harvesterID': harvester_config.master.harvester_id,
        'dialogs': json.dumps(payload),
    }
    tmpLog.debug('send {0} messages'.format(len(payload)))
    commStat, commRes = self.post_ssl('addHarvesterDialogs', data)
    errStr = 'OK'
    if commStat is False:
        errStr = core_utils.dump_error_message(tmpLog, commRes)
    else:
        try:
            retCode, retMsg = commRes.json()
            if not retCode:
                errStr = core_utils.dump_error_message(tmpLog, retMsg)
                commStat = False
        except Exception:
            errStr = core_utils.dump_error_message(tmpLog)
            tmpLog.error('conversion failure from {0}'.format(commRes.text))
            commStat = False
    if commStat:
        tmpLog.debug('done with {0}'.format(errStr))
    return commStat, errStr
def update_worker_stats(self, site_name, stats):
    """Report worker statistics for one site to the PanDA server.

    Returns (status_bool, error_message); error_message is 'OK' on success.
    """
    tmpLog = self.make_logger(method_name='update_worker_stats')
    tmpLog.debug('start')
    data = {
        'harvesterID': harvester_config.master.harvester_id,
        'siteName': site_name,
        'paramsList': json.dumps(stats),
    }
    tmpLog.debug('update stats for {0}, stats: {1}'.format(site_name, stats))
    commStat, commRes = self.post_ssl('reportWorkerStats', data)
    errStr = 'OK'
    if commStat is False:
        errStr = core_utils.dump_error_message(tmpLog, commRes)
    else:
        try:
            retCode, retMsg = commRes.json()
            if not retCode:
                commStat = False
                errStr = core_utils.dump_error_message(tmpLog, retMsg)
        except Exception:
            commStat = False
            errStr = core_utils.dump_error_message(tmpLog)
            tmpLog.error('conversion failure from {0}'.format(commRes.text))
    if commStat:
        tmpLog.debug('done with {0}:{1}'.format(commStat, errStr))
    return commStat, errStr
def __init__(self, **kwarg):
    """Set up the preparator and create a Globus Transfer Client.

    The client_id / refresh_token pair is fetched from the PanDA server via
    the harvester cache mechanism (cache key 'globus_secret').  On any
    failure self.tc is left as None so callers can detect the condition.
    """
    PluginBase.__init__(self, **kwarg)
    # create Globus Transfer Client
    tmpLog = self.make_logger(_logger, method_name='GoPreparator __init__ ')
    try:
        self.tc = None
        # need to get client_id and refresh_token from PanDA server via harvester cache mechanism
        tmpLog.debug('about to call dbInterface.get_cache(globus_secret)')
        c_data = self.dbInterface.get_cache('globus_secret')
        # FIX: was "not c_data == None"; identity test is the correct idiom
        if c_data is not None and c_data.data['StatusCode'] == 0:
            tmpLog.debug('Got the globus_secrets from PanDA')
            self.client_id = c_data.data['publicKey']  # client_id
            self.refresh_token = c_data.data['privateKey']  # refresh_token
            tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog, self.client_id,
                                                                          self.refresh_token)
            if not tmpStat:
                self.tc = None
                errStr = 'failed to create Globus Transfer Client'
                tmpLog.error(errStr)
        else:
            self.client_id = None
            self.refresh_token = None
            self.tc = None
            errStr = 'failed to get Globus Client ID and Refresh Token'
            tmpLog.error(errStr)
    except Exception:
        # FIX: was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt propagate
        core_utils.dump_error_message(tmpLog)
    tmpLog.debug('__init__ finished')
def feed_events(self, workspec, events_dict):
    """Write the event-range dictionary under the worker's access point and
    clear the corresponding event-request file.

    Returns True on success, False when the feed file could not be written.
    """
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='feed_events')
    retVal = True
    if workspec.mapType in (WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers):
        # put the json just under the access point
        jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName)
        tmpLog.debug('feeding events to {0}'.format(jsonFilePath))
        try:
            with open(jsonFilePath, 'w') as jsonFile:
                json.dump(events_dict, jsonFile)
        except Exception:
            core_utils.dump_error_message(tmpLog)
            retVal = False
    elif workspec.mapType == WorkSpec.MT_MultiJobs:
        # TOBEFIXED
        pass
    # best-effort removal of the request file
    try:
        os.remove(os.path.join(workspec.get_access_point(), jsonEventsRequestFileName))
    except Exception:
        pass
    tmpLog.debug('done')
    return retVal
def is_alive(self, key_values):
    """Send a heartbeat with the given key/value payload to the PanDA server.

    Returns (server_return_code, message_string).
    """
    tmpLog = self.make_logger(method_name='is_alive')
    tmpLog.debug('start')
    # datetimes are not JSON-serializable; encode them as tagged strings in place
    for tmpKey, tmpVal in iteritems(key_values):
        if isinstance(tmpVal, datetime.datetime):
            key_values[tmpKey] = 'datetime/' + tmpVal.strftime('%Y-%m-%d %H:%M:%S.%f')
    data = {
        'harvesterID': harvester_config.master.harvester_id,
        'data': json.dumps(key_values),
    }
    commStat, commRes = self.post_ssl('harvesterIsAlive', data)
    retCode = False
    if commStat is False:
        retMsg = core_utils.dump_error_message(tmpLog, commRes)
    else:
        try:
            retCode, retMsg = commRes.json()
        except Exception:
            retMsg = core_utils.dump_error_message(tmpLog)
            tmpLog.error('conversion failure from {0}'.format(commRes.text))
            commStat = False
    if commStat:
        tmpLog.debug('done with {0} : {1}'.format(retCode, retMsg))
    return retCode, retMsg
def get_proxy(self, voms_role, cert=None):
    """Fetch a VOMS proxy for the given role from the PanDA server.

    Returns (proxy_string_or_None, error_message).
    """
    retVal = None
    retMsg = ''
    tmpLog = self.make_logger(method_name='get_proxy')
    tmpLog.debug('start')
    commStat, commRes = self.post_ssl('getProxy', {'role': voms_role}, cert)
    if commStat is False:
        core_utils.dump_error_message(tmpLog, commRes)
    else:
        try:
            respDict = commRes.json()
            if respDict['StatusCode'] == 0:
                retVal = respDict['userProxy']
            else:
                retMsg = respDict['errorDialog']
                core_utils.dump_error_message(tmpLog, retMsg)
                commStat = False
        except Exception:
            retMsg = core_utils.dump_error_message(tmpLog, commRes)
            commStat = False
    if commStat:
        tmpLog.debug('done with {0}'.format(str(retVal)))
    return retVal, retMsg
def check_event_availability(self, jobspec):
    """Ask the PanDA server how many event ranges are available for a job.

    Returns (status_bool, nEventRanges_or_None).
    """
    retStat = False
    retVal = None
    tmpLog = self.make_logger('PandaID={0}'.format(jobspec.PandaID),
                              method_name='check_event_availability')
    tmpLog.debug('start')
    data = {'taskID': jobspec.taskID, 'pandaID': jobspec.PandaID}
    # fall back to the value embedded in the job parameters when jobsetID is unset
    if jobspec.jobsetID is None:
        data['jobsetID'] = jobspec.jobParams['jobsetID']
    else:
        data['jobsetID'] = jobspec.jobsetID
    commStat, commRes = self.post_ssl('checkEventsAvailability', data)
    if commStat is False:
        core_utils.dump_error_message(tmpLog, commRes)
    else:
        try:
            respDict = commRes.json()
            if respDict['StatusCode'] == 0:
                retStat = True
                retVal = respDict['nEventRanges']
        except Exception:
            core_utils.dump_error_message(tmpLog, commRes)
    tmpLog.debug('done with {0}'.format(retVal))
    return retStat, retVal
def update_workers(self, workspec_list):
    """Propagate worker status updates to the PanDA server.

    Returns (list_from_server_or_None, error_message).
    """
    tmpLog = self.make_logger(method_name='update_workers')
    tmpLog.debug('start')
    payload = [workSpec.convert_to_propagate() for workSpec in workspec_list]
    data = {
        'harvesterID': harvester_config.master.harvester_id,
        'workers': json.dumps(payload),
    }
    tmpLog.debug('update {0} workers'.format(len(payload)))
    commStat, commRes = self.post_ssl('updateWorkers', data)
    retList = None
    errStr = 'OK'
    if commStat is False:
        errStr = core_utils.dump_error_message(tmpLog, commRes)
    else:
        try:
            retCode, retList = commRes.json()
            if not retCode:
                errStr = core_utils.dump_error_message(tmpLog, retList)
                retList = None
                commStat = False
        except Exception:
            errStr = core_utils.dump_error_message(tmpLog)
            tmpLog.error('conversion failure from {0}'.format(commRes.text))
            commStat = False
    if commStat:
        tmpLog.debug('done with {0}'.format(errStr))
    return retList, errStr
def rucio_create_dataset(tmpLog, datasetScope, datasetName):
    """Create the Rucio dataset datasetScope:datasetName with a 7-day lifetime.

    Shells out to the rucio CLI.  Returns (True, msg) on success or when the
    dataset already exists, (None, msg) on a retryable session-limit error,
    and (False, msg) on any other failure.
    """
    try:
        # register dataset
        lifetime = 7 * 24 * 60 * 60
        tmpLog.debug('register {0}:{1} lifetime = {2}'.format(datasetScope, datasetName, lifetime))
        # FIX: pass the fully qualified DID (scope:name); the original passed only
        # datasetName, so the CLI fell back to the default scope instead of
        # datasetScope (the rest of this function identifies the dataset as scope:name)
        did = '{0}:{1}'.format(datasetScope, datasetName)
        executable = ['/usr/bin/env', 'rucio', 'add-dataset']
        executable += ['--lifetime', ('%d' % lifetime)]
        executable += [did]
        tmpLog.debug('rucio add-dataset command: {0} '.format(executable))
        tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable))
        process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            tmpLog.debug(stdout)
            return True, ''
        else:
            # classify the failure from the CLI output
            dataset_exists = False
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'Data Identifier Already Exists' in line:
                    dataset_exists = True
                    break
                elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
                    break
            if dataset_exists:
                # already created - treat as success
                errMsg = 'dataset {0}:{1} already exists'.format(datasetScope, datasetName)
                tmpLog.debug(errMsg)
                return True, errMsg
            elif rucio_sessions_limit_error:
                # temporary error - caller should retry
                errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout)
                tmpLog.warning(errStr)
                return None, errStr
            else:
                # some other Rucio error
                errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
                tmpLog.error(errStr)
                return False, errStr
    except Exception:
        errMsg = 'Could not create dataset {0}:{1}'.format(datasetScope, datasetName)
        core_utils.dump_error_message(tmpLog)
        tmpLog.error(errMsg)
        return False, errMsg
def feed_jobs(self, workspec, jobspec_list):
    """Materialize job specs, the pool file catalog and input-file symlinks
    under the worker's access point, then clear the job-request file.

    :param workspec: worker to feed
    :param jobspec_list: jobs assigned to the worker
    :return: True on success, False if writing any of the files failed
    """
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='feed_jobs')
    retVal = True
    # get PFC
    pfc = core_utils.make_pool_file_catalog(jobspec_list)
    pandaIDs = []
    for jobSpec in jobspec_list:
        # per-job sub-directory under the worker access point
        accessPoint = self.get_access_point(workspec, jobSpec.PandaID)
        jobSpecFilePath = os.path.join(accessPoint, jobSpecFileName)
        xmlFilePath = os.path.join(accessPoint, xmlPoolCatalogFileName)
        tmpLog.debug('feeding jobs to {0}'.format(jobSpecFilePath))
        try:
            # put job spec file; format depends on plugin configuration
            with open(jobSpecFilePath, 'w') as jobSpecFile:
                jobParams = jobSpec.get_job_params(self.stripJobParams)
                if self.jobSpecFileFormat == 'cgi':
                    jobSpecFile.write(urlencode(jobParams))
                else:
                    json.dump({jobSpec.PandaID: jobParams}, jobSpecFile)
            # put PFC.xml
            with open(xmlFilePath, 'w') as pfcFile:
                pfcFile.write(pfc)
            # make symlink for each input file next to the job spec
            inFiles = jobSpec.get_input_file_attributes()
            for inLFN, inFile in iteritems(inFiles):
                dstPath = os.path.join(accessPoint, inLFN)
                if 'path' in inFile and inFile['path'] != dstPath:
                    # test if symlink exists if so remove it
                    if os.path.exists(dstPath):
                        os.unlink(dstPath)
                        tmpLog.debug("removing existing symlink %s" % dstPath)
                    os.symlink(inFile['path'], dstPath)
            pandaIDs.append(jobSpec.PandaID)
        except Exception:
            core_utils.dump_error_message(tmpLog)
            retVal = False
    # put PandaIDs file listing the jobs that were successfully fed
    try:
        jsonFilePath = os.path.join(workspec.get_access_point(), pandaIDsFile)
        with open(jsonFilePath, 'w') as jsonPandaIDsFile:
            json.dump(pandaIDs, jsonPandaIDsFile)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        retVal = False
    # remove request file (best effort)
    try:
        reqFilePath = os.path.join(workspec.get_access_point(), jsonJobRequestFileName)
        os.remove(reqFilePath)
    except Exception:
        pass
    tmpLog.debug('done')
    return retVal
def is_alive(self, workspec, worker_heartbeat_limit):
    """Proxy the liveness check to the remote harvester over RPC.

    Returns the remote result, or None when the call failed.
    """
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='is_alive')
    tmpLog.debug('start')
    ret = None
    try:
        ret = self.conn.root.is_alive(self.original_config, workspec, worker_heartbeat_limit)
    except Exception:
        core_utils.dump_error_message(tmpLog)
    else:
        tmpLog.debug('done')
    return ret
def check_workers(self, workspec_list):
    """Proxy the worker status check to the remote harvester over RPC.

    Returns the remote result, or None when the call failed.
    """
    tmpLog = core_utils.make_logger(_logger, method_name='check_workers')
    tmpLog.debug('start')
    ret = None
    try:
        ret = self.conn.root.check_workers(self.original_config, workspec_list)
    except Exception:
        core_utils.dump_error_message(tmpLog)
    else:
        tmpLog.debug('done')
    return ret
def post_processing(self, workspec, jobspec_list, map_type):
    """Proxy post-processing of a worker to the remote harvester over RPC.

    Returns the remote result, or None when the call failed.
    """
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='post_processing')
    tmpLog.debug('start')
    ret = None
    try:
        ret = self.conn.root.post_processing(self.original_config, workspec, jobspec_list, map_type)
    except Exception:
        core_utils.dump_error_message(tmpLog)
    else:
        tmpLog.debug('done')
    return ret
def acknowledge_events_files(self, workspec):
    """Proxy the events/files acknowledgement to the remote harvester over RPC.

    Returns the remote result, or None when the call failed.
    """
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='acknowledge_events_files')
    tmpLog.debug('start')
    ret = None
    try:
        ret = self.conn.root.acknowledge_events_files(self.original_config, workspec)
    except Exception:
        core_utils.dump_error_message(tmpLog)
    else:
        tmpLog.debug('done')
    return ret
def kill_requested(self, workspec):
    """Proxy the kill-request check to the remote harvester over RPC.

    Returns the remote result, or None when the call failed.
    """
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='kill_requested')
    tmpLog.debug('start')
    ret = None
    try:
        ret = self.conn.root.kill_requested(self.original_config, workspec)
    except Exception:
        core_utils.dump_error_message(tmpLog)
    else:
        tmpLog.debug('done')
    return ret
def rucio_add_files_to_dataset(tmpLog, datasetScope, datasetName, fileList):
    """Attach a list of files to a Rucio dataset via the rucio CLI.

    :param fileList: dicts with 'scope' and 'name' keys
    :return: (True, '') on success, (None, msg) on a retryable session-limit
             error, (False, msg) on any other failure
    """
    try:
        # target DID followed by one source DID per file
        to_did = '{0}:{1}'.format(datasetScope, datasetName)
        executable = ['/usr/bin/env', 'rucio', 'attach', to_did]
        executable += ['{0}:{1}'.format(filename['scope'], filename['name'])
                       for filename in fileList]
        tmpLog.debug('rucio attach command: {0} '.format(executable))
        tmpLog.debug('rucio attach command (for human): %s ' % ' '.join(executable))
        process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            tmpLog.debug(stdout)
            return True, ''
        # non-zero exit: classify the failure from the CLI output
        rucio_sessions_limit_error = any(
            'exceeded simultaneous SESSIONS_PER_USER limit' in line
            for line in stdout.split('\n'))
        if rucio_sessions_limit_error:
            # temporary error - caller should retry
            errStr = 'Rucio returned Sessions Limit error, will retry: stdout: {0}'.format(stdout)
            tmpLog.warning(errStr)
            return None, errStr
        # some other Rucio error
        errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
        tmpLog.error(errStr)
        return False, errStr
    except Exception:
        errMsg = 'Could not add files to DS - {0}:{1} files - {2}'.format(datasetScope, datasetName,
                                                                          fileList)
        core_utils.dump_error_message(tmpLog)
        tmpLog.error(errMsg)
        return False, errMsg
def feed_events(self, workspec, events_dict): '''Havester has an event range to pass to job''' # get logger arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) tmpLog = arclog.log # Upload to jobid/jsonEventsFeedFileName, delete jobid/jsonEventsRequestFileName job = workspec.workAttributes['arcjob'] arcid = job['JobID'] # Set certificate to use for interacting with ARC CE usercfg = arc.UserConfig(self.cred_type) if not self._setup_proxy(usercfg, workspec, arcid, tmpLog): return False retVal = True if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]: # put the json just under the access point then upload to ARC CE localJsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName) tmpLog.debug('feeding events to {0}'.format(localJsonFilePath)) try: with open(localJsonFilePath, 'w') as jsonFile: json.dump(events_dict, jsonFile) except Exception: core_utils.dump_error_message(tmpLog) retVal = False remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsFeedFileName) # Try to copy the file status = self._copy_file(localJsonFilePath, remoteJsonFilePath, usercfg, tmpLog) if not status: tmpLog.error('Failed to feed events to {0}: {1}'.format(remoteJsonFilePath, str(status))) retVal = False else: remoteJsonEventsRequestFile = '%s/%s' % (arcid, jsonEventsRequestFileName) status = self._delete_file(remoteJsonEventsRequestFile, usercfg, tmpLog) if not status and status.GetErrno() != errno.ENOENT: tmpLog.error('Failed to delete event request file at {0}'.format(remoteJsonEventsRequestFile)) elif workspec.mapType == WorkSpec.MT_MultiJobs: # TOBEFIXED pass # remove request file try: jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName) os.remove(jsonFilePath) except Exception: pass tmpLog.debug('done') return retVal
def rucio_rule_info(tmpLog, rucioRule):
    """Query the state of a Rucio replication rule via the rucio CLI.

    :param rucioRule: rule id to query
    :return: (True, tokens) when a 'State:' line was found; (None, msg-or-'')
             when no state line was found or on a retryable session-limit
             error; (False, msg) on any other failure
    """
    # get rule-info
    tmpLog.debug('rucio rule-info {0}'.format(rucioRule))
    try:
        executable = ['/usr/bin/env', 'rucio', 'rule-info', rucioRule]
        tmpLog.debug('rucio rule-info command: {0} '.format(executable))
        tmpLog.debug('rucio rule-info command (for human): %s ' % ' '.join(executable))
        process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            tmpLog.debug(stdout)
            # parse the output to get the state:
            for line in stdout.split('\n'):
                if 'State:' in line:
                    # get the State varible
                    # NOTE(review): returns the full whitespace-split token list
                    # of the line (including the 'State:' label), not just the
                    # state value - callers presumably index into it; confirm
                    result = line.split()
                    return True, result
            # success but no State line found
            return None, ''
        else:
            # classify the failure from the CLI output
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
                    break
            if rucio_sessions_limit_error:
                # temporary error - caller should retry
                errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout)
                tmpLog.warning(errStr)
                return None, errStr
            else:
                # some other Rucio error
                errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
                tmpLog.error(errStr)
                return False, errStr
    except Exception:
        errMsg = 'Could not run rucio rule-info {0}'.format(rucioRule)
        core_utils.dump_error_message(tmpLog)
        tmpLog.error(errMsg)
        return False, errMsg
def catch_sigkill(sig, frame):
    """Signal handler: stop profiling, drop the pid file and hard-kill this process.

    :param sig: signal number received
    :param frame: current stack frame (unused)
    """
    disable_profiler()
    _logger.info('got signal={0} to be killed'.format(sig))
    # best-effort removal of the pid file
    try:
        os.remove(options.pid)
    except Exception:
        pass
    try:
        # when re-parented to init (ppid == 1) this is the daemon group leader,
        # so take the whole process group down; otherwise kill only this process
        if os.getppid() == 1:
            os.killpg(os.getpgrp(), signal.SIGKILL)
        else:
            os.kill(os.getpid(), signal.SIGKILL)
    except Exception:
        core_utils.dump_error_message(_logger)
        _logger.error('failed to be killed')
def rucio_add_rule(tmpLog, datasetScope, datasetName, dstRSE):
    """Create a single-copy replication rule for scope:name at dstRSE via the rucio CLI.

    :return: (True, rule_id) on success, (None, msg) on a retryable
             session-limit error, (False, msg) on any other failure
    """
    try:
        tmpLog.debug('rucio add-rule {0}:{1} 1 {2}'.format(datasetScope, datasetName, dstRSE))
        did = '{0}:{1}'.format(datasetScope, datasetName)
        executable = ['/usr/bin/env', 'rucio', 'add-rule', did, '1', dstRSE]
        tmpLog.debug('rucio add-rule command: {0} '.format(executable))
        tmpLog.debug('rucio add-rule command (for human): %s ' % ' '.join(executable))
        process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            tmpLog.debug(stdout)
            # the first line of the output carries the new rule id
            rule_id = stdout.split('\n')[0]
            return True, rule_id
        # non-zero exit: classify the failure from the CLI output
        rucio_sessions_limit_error = any(
            'exceeded simultaneous SESSIONS_PER_USER limit' in line
            for line in stdout.split('\n'))
        if rucio_sessions_limit_error:
            # temporary error - caller should retry
            errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout)
            tmpLog.warning(errStr)
            return None, errStr
        # some other Rucio error
        errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
        tmpLog.error(errStr)
        return False, errStr
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        tmpStat = False
        tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        return tmpStat, tmpMsg
def application(environ, start_response):
    """WSGI entry point: authenticate the request, parse the JSON body and
    dispatch it to an ApacheHandler.

    Returns a one-element list with the response body as bytes (PEP 3333).
    """
    try:
        # get content length; absent or malformed -> treat as empty body
        try:
            request_body_size = int(environ.get('CONTENT_LENGTH', 0))
        except (TypeError, ValueError):
            request_body_size = 0
        # check token
        try:
            auth_str = environ.get('HTTP_AUTHORIZATION', '').split()[-1]
            token = HarvesterToken()
            # raises when the token is invalid; the payload itself is unused
            payload = token.get_payload(auth_str)
        except Exception:
            # FIX: was a bare "except:"
            errMsg = 'Auth failed: Invalid token'
            start_response('403 Forbidden', [('Content-Type', 'text/plain')])
            return [errMsg.encode('ascii')]
        request_body = environ['wsgi.input'].read(request_body_size)
        params = json.loads(request_body)
        # make handler
        handler = ApacheHandler(None, None, None)
        handler.set_form(params)
        # execute
        handler.do_POST()
        # make response
        _logger.debug("{0} Phrase".format(handler.responseCode))
        start_response("{0} Phrase".format(handler.responseCode), handler.headerList)
        return [handler.message]
    except Exception:
        errMsg = core_utils.dump_error_message(_logger)
        start_response('500 Phrase', [('Content-Type', 'text/plain')])
        # FIX: WSGI response bodies must be bytes; the original returned a str
        # here (the 403 path above already encodes)
        return [errMsg.encode('ascii', 'replace')]
def zip_output(self, jobspec):
    """Zip the associated files of each output file into one uncompressed archive.

    For every fileSpec in jobspec.outFiles an archive named fileSpec.lfn is
    created (next to the sources when zipDir is "${SRCDIR}"), and the
    fileSpec's path and fsize are updated to point at the archive.

    :return: (True, '') on success, (False, error_message) on failure
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='zip_output')
    tmpLog.debug('start')
    try:
        for fileSpec in jobspec.outFiles:
            if self.zipDir == "${SRCDIR}":
                # the same directory as src
                zipDir = os.path.dirname(next(iter(fileSpec.associatedFiles)).path)
            else:
                zipDir = self.zipDir
            zipPath = os.path.join(zipDir, fileSpec.lfn)
            # remove zip file just in case
            try:
                os.remove(zipPath)
            except OSError:
                # FIX: was a bare "except:"; only a missing/busy file is expected here
                pass
            # make zip file (store only, no compression)
            with zipfile.ZipFile(zipPath, "w", zipfile.ZIP_STORED) as zf:
                for assFileSpec in fileSpec.associatedFiles:
                    zf.write(assFileSpec.path, os.path.basename(assFileSpec.path))
            # set path
            fileSpec.path = zipPath
            # get size
            statInfo = os.stat(zipPath)
            fileSpec.fsize = statInfo.st_size
    except Exception:
        # FIX: was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt propagate
        errMsg = core_utils.dump_error_message(tmpLog)
        return False, 'failed to zip with {0}'.format(errMsg)
    tmpLog.debug('done')
    return True, ''
def check_credential(self):
    """Check that the proxy file is still valid for at least 72 hours.

    Runs grid-proxy-info against self.outCertFile.
    :return: True when the proxy exists and has enough lifetime left
    """
    # make logger
    mainLog = self.make_logger(_logger, method_name='check_credential')
    comStr = "grid-proxy-info -exists -hours 72 -file {0}".format(self.outCertFile)
    mainLog.debug(comStr)
    try:
        p = subprocess.Popen(comStr.split(), shell=False,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        # FIX: was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt propagate
        core_utils.dump_error_message(mainLog)
        return False
    mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr))
    return retCode == 0
def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
    """Make a worker for each job chunk using the queue's worker-maker plugin.

    The first n_ready chunks are paired with pre-existing 'ready' workers
    from the DB instead of newly made ones.

    :param jobchunk_list: list of job chunks, one worker per chunk
    :param queue_config: configuration of the target queue
    :param n_ready: number of chunks to serve with ready workers
    :param resource_type: resource type passed to the maker plugin
    :param maker: optional pre-instantiated maker plugin (looked up when None)
    :return: ([(workSpec, jobChunk), ...] for successful chunks,
              [jobChunk, ...] for failed chunks)
    """
    tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName,
                                                                          resource_type),
                                    method_name='make_workers')
    tmpLog.debug('start')
    try:
        # get plugin
        if maker is None:
            maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
        if maker is None:
            # not found
            tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
            return [], jobchunk_list
        # get ready workers
        readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
        # loop over all chunks
        okChunks = []
        ngChunks = []
        for iChunk, jobChunk in enumerate(jobchunk_list):
            # make a worker
            if iChunk >= n_ready:
                workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
            else:
                # use ready worker
                if iChunk < len(readyWorkers):
                    workSpec = readyWorkers[iChunk]
                else:
                    workSpec = None
            # failed
            if workSpec is None:
                ngChunks.append(jobChunk)
                continue
            # set workerID for newly made workers
            if workSpec.workerID is None:
                workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                workSpec.configID = queue_config.configID
                workSpec.isNew = True
            okChunks.append((workSpec, jobChunk))
        # dump
        tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks),
                                                                      len(ngChunks)))
        return okChunks, ngChunks
    except Exception:
        # dump error
        core_utils.dump_error_message(tmpLog)
        return [], jobchunk_list
def get_job_stats(self):
    """Get per-site job statistics from the PanDA server.

    :return: (stats_dict, 'OK' | 'FAILED' | 'Exception'); stats_dict is empty
             on failure
    """
    tmp_log = self.make_logger(method_name='get_job_stats')
    tmp_log.debug('start')
    tmp_stat, tmp_res = self.post_ssl('getJobStatisticsPerSite', {})
    stats = {}
    if tmp_stat is False:
        ret_msg = 'FAILED'
        core_utils.dump_error_message(tmp_log, tmp_res)
    else:
        try:
            # NOTE(review): unpickling data received over the network; only safe
            # because the PanDA server is a trusted, mutually-authenticated peer
            stats = pickle.loads(tmp_res.content)
            ret_msg = 'OK'
        except Exception:
            ret_msg = 'Exception'
            core_utils.dump_error_message(tmp_log)
    return stats, ret_msg
def get_event_ranges(self, data_map, scattered):
    """Fetch event ranges from the PanDA server for each job in data_map.

    Requests are issued in chunks of at most getEventsChunkSize ranges to
    avoid server timeouts; fetching for a job stops early when the server
    returns an empty list or an error occurs.

    :param data_map: {PandaID: request-data dict}; each dict may carry 'nRanges'
    :param scattered: when True, mark the request as scattered
    :return: (bool at-least-one-success, {pandaID: [event ranges]})
    """
    retStat = False
    retVal = dict()
    # chunk size from configuration, with a fallback default
    try:
        getEventsChunkSize = harvester_config.pandacon.getEventsChunkSize
    except Exception:
        getEventsChunkSize = 5120
    for pandaID, data in iteritems(data_map):
        # get logger
        # NOTE(review): keyed on data['pandaID'], not the map key pandaID
        tmpLog = self.make_logger('PandaID={0}'.format(data['pandaID']),
                                  method_name='get_event_ranges')
        if 'nRanges' in data:
            nRanges = data['nRanges']
        else:
            nRanges = 1
        if scattered:
            data['scattered'] = True
        tmpLog.debug('start nRanges={0}'.format(nRanges))
        while nRanges > 0:
            # use a small chunk size to avoid timeout
            chunkSize = min(getEventsChunkSize, nRanges)
            data['nRanges'] = chunkSize
            tmpStat, tmpRes = self.post_ssl('getEventRanges', data)
            if tmpStat is False:
                core_utils.dump_error_message(tmpLog, tmpRes)
            else:
                try:
                    tmpDict = tmpRes.json()
                    if tmpDict['StatusCode'] == 0:
                        retStat = True
                        if data['pandaID'] not in retVal:
                            retVal[data['pandaID']] = []
                        retVal[data['pandaID']] += tmpDict['eventRanges']
                        # got empty: server has no more ranges for this job
                        if len(tmpDict['eventRanges']) == 0:
                            break
                except Exception:
                    core_utils.dump_error_message(tmpLog, tmpRes)
                    break
            nRanges -= chunkSize
        tmpLog.debug('done with {0}'.format(str(retVal)))
    return retStat, retVal
def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # make logger tmpLog = self.make_logger(_logger, 'ThreadID={0}'.format(threading.current_thread().ident), method_name='GlobusBulkPreparator __init__ {} ') tmpLog.debug('__init__ start') self.thread_id = threading.current_thread().ident self.id = GlobusBulkPreparator.next_id GlobusBulkPreparator.next_id += 1 with uLock: global uID self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX') uID += 1 uID %= harvester_config.preparator.nThreads # create Globus Transfer Client try: self.tc = None # need to get client_id and refresh_token from PanDA server via harvester cache mechanism tmpLog.debug('about to call dbInterface.get_cache(globus_secret)') c_data = self.dbInterface.get_cache('globus_secret') if (not c_data == None) and c_data.data['StatusCode'] == 0 : tmpLog.debug('Got the globus_secrets from PanDA') self.client_id = c_data.data['publicKey'] # client_id self.refresh_token = c_data.data['privateKey'] # refresh_token tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token) if not tmpStat: self.tc = None errStr = 'failed to create Globus Transfer Client' tmpLog.error(errStr) else : self.client_id = None self.refresh_token = None self.tc = None errStr = 'failed to get Globus Client ID and Refresh Token' tmpLog.error(errStr) except: core_utils.dump_error_message(tmpLog) # tmp debugging tmpLog.debug('self.id = {0}'.format(self.id)) tmpLog.debug('self.dummy_transfer_id = {0}'.format(self.dummy_transfer_id)) # tmp debugging tmpLog.debug('__init__ finish')
def submit_with_command(self, jdl_list, use_spool=False, tmp_str='', keep_temp_sdf=False):
    """Submit a batch of JDLs as a single condor cluster via condor_submit.

    :param jdl_list: JDL texts, concatenated into one submit description file
    :param use_spool: add '-remote -spool' when a schedd is configured
    :param tmp_str: tag embedded in the temporary sdf file name
    :param keep_temp_sdf: keep the temporary sdf file for debugging
    :return: (batchIDs_list, errStr); batchIDs_list holds
             '<clusterid>.<procid>' strings, empty on failure
    """
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobSubmit.submit_with_command')
    # Initialize
    errStr = ''
    batchIDs_list = []
    # make sdf temp file from jdls
    tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=(not keep_temp_sdf),
                                          suffix='_{0}_cluster_submit.sdf'.format(tmp_str))
    sdf_file = tmpFile.name
    tmpFile.write('\n\n'.join(jdl_list))
    tmpFile.flush()
    # make condor remote options
    name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else ''
    pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else ''
    spool_opt = '-remote -spool' if use_spool and self.condor_schedd else ''
    # command
    comStr = 'condor_submit -single-cluster {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(
        sdf_file=sdf_file, name_opt=name_opt, pool_opt=pool_opt, spool_opt=spool_opt)
    # submit
    tmpLog.debug('submit with command: {0}'.format(comStr))
    try:
        p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception as e:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
        errStr = '{0}: {1}'.format(e.__class__.__name__, e)
    finally:
        # closing deletes the temp file unless keep_temp_sdf was requested
        tmpFile.close()
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract clusterid and n_jobs
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            # FIX: raw string; '\d' in a non-raw literal is an invalid escape
            # sequence (DeprecationWarning, later SyntaxError) in Python 3
            job_id_match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                                     tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            n_jobs = int(job_id_match.group(1))
            clusterid = job_id_match.group(2)
            batchIDs_list = ['{0}.{1}'.format(clusterid, procid) for procid in range(n_jobs)]
            tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
        else:
            errStr = 'no job submitted: {0}'.format(errStr)
            tmpLog.error(errStr)
    else:
        tmpLog.error('submission failed: {0} ; {1}'.format(stdErr, errStr))
    # Return
    return (batchIDs_list, errStr)
def ack_commands(self, command_ids):
    """Acknowledge processed commands to the PanDA server.

    :param command_ids: ids of the commands to acknowledge
    :return: True when the server confirms, False otherwise
    """
    harvester_id = harvester_config.master.harvester_id
    tmpLog = self.make_logger('harvesterID={0}'.format(harvester_id), method_name='ack_commands')
    tmpLog.debug('Start acknowledging {0} commands (command_ids={1})'.format(len(command_ids),
                                                                             command_ids))
    data = {}
    data['command_ids'] = json.dumps(command_ids)
    tmp_stat, tmp_res = self.post_ssl('ackCommands', data)
    if tmp_stat is False:
        core_utils.dump_error_message(tmpLog, tmp_res)
    else:
        try:
            tmp_dict = tmp_res.json()
            if tmp_dict['StatusCode'] == 0:
                tmpLog.debug('Finished acknowledging commands')
                return True
            return False
        except Exception:
            # FIX: was "except KeyError" only; a non-JSON response raises
            # ValueError from .json() and previously propagated uncaught
            core_utils.dump_error_message(tmpLog, tmp_res)
    return False
def setup_access_points(self, workspec_list):
    """Create access-point directories for each worker and its jobs.

    :return: True on success, False when any directory could not be created
    """
    try:
        for workSpec in workspec_list:
            accessPoint = workSpec.get_access_point()
            # make the dir if missing
            if not os.path.exists(accessPoint):
                os.makedirs(accessPoint)
            jobSpecs = workSpec.get_jobspec_list()
            if jobSpecs is None:
                continue
            # one sub-directory per job when it differs from the worker's dir
            for jobSpec in jobSpecs:
                subAccessPoint = self.get_access_point(workSpec, jobSpec.PandaID)
                if subAccessPoint != accessPoint and not os.path.exists(subAccessPoint):
                    os.mkdir(subAccessPoint)
        return True
    except Exception:
        # get logger
        tmpLog = core_utils.make_logger(_logger, method_name='setup_access_points')
        core_utils.dump_error_message(tmpLog)
        return False
def execute(self):
    """Watchdog cycle: inspect the db_proxy log for stalls or slow message
    generation and, when detected, trigger the configured actions
    (email alert and/or killing the harvester process group).

    A file lock serializes the check across threads/processes; when the lock
    is held elsewhere or the check interval has not elapsed, the cycle is a
    no-op.
    """
    # avoid too early check
    if not self.singleMode and datetime.datetime.utcnow() - self.startTime \
            < datetime.timedelta(seconds=harvester_config.watcher.checkInterval):
        return
    mainLog = core_utils.make_logger(_logger, 'id={0}'.format(self.get_pid()),
                                     method_name='execute')
    mainLog.debug('start')
    # get file lock
    try:
        with core_utils.get_file_lock(lockFileName, harvester_config.watcher.checkInterval):
            logFileName = os.path.join(logDir, 'panda-db_proxy.log')
            timeNow = datetime.datetime.utcnow()
            if os.path.exists(logFileName):
                # get latest timestamp from the last log line
                try:
                    p = subprocess.Popen(['tail', '-1', logFileName],
                                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    line = p.stdout.readline()
                    lastTime = datetime.datetime.strptime(line[:23], "%Y-%m-%d %H:%M:%S,%f")
                except Exception:
                    lastTime = None
                # get processing time for last nMessages queries
                logDuration = None
                try:
                    p = subprocess.Popen('tail -{0} {1} | head -1'.format(
                        harvester_config.watcher.nMessages, logFileName),
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                    line = p.stdout.readline()
                    firstTime = datetime.datetime.strptime(line[:23], "%Y-%m-%d %H:%M:%S,%f")
                    if lastTime is not None:
                        logDuration = lastTime - firstTime
                except Exception:
                    pass
                tmpMsg = 'last log message at {0}. '.format(lastTime)
                if logDuration is not None:
                    tmpMsg += '{0} messages took {1} sec'.format(
                        harvester_config.watcher.nMessages, logDuration.total_seconds())
                mainLog.debug(tmpMsg)
                # check timestamp: stalled (no recent messages) or slow (long span)
                doAction = False
                if harvester_config.watcher.maxStalled > 0 and lastTime is not None and \
                        timeNow - lastTime > datetime.timedelta(seconds=harvester_config.watcher.maxStalled):
                    mainLog.warning('last log message is too old. seems to be stalled')
                    doAction = True
                elif harvester_config.watcher.maxDuration > 0 and logDuration is not None and \
                        logDuration.total_seconds() > harvester_config.watcher.maxDuration:
                    mainLog.warning('slow message generation. seems to be a performance issue')
                    doAction = True
                # take action
                if doAction:
                    # email
                    if 'email' in harvester_config.watcher.actions.split(','):
                        # get pass phrase; credentials are stored encrypted and
                        # decrypted with a key from the environment
                        toSkip = False
                        mailUser = None
                        mailPass = None
                        if harvester_config.watcher.mailUser != '' and \
                                harvester_config.watcher.mailPassword != '':
                            envName = harvester_config.watcher.passphraseEnv
                            if envName not in os.environ:
                                tmpMsg = '{0} is undefined in etc/sysconfig/panda_harvester'.format(
                                    envName)
                                mainLog.error(tmpMsg)
                                toSkip = True
                            else:
                                key = os.environ[envName]
                                mailUser = core_utils.decrypt_string(
                                    key, harvester_config.watcher.mailUser)
                                mailPass = core_utils.decrypt_string(
                                    key, harvester_config.watcher.mailPassword)
                        if not toSkip:
                            # message
                            msgBody = 'harvester {0} '.format(
                                harvester_config.master.harvester_id)
                            msgBody += 'is having a problem on {0} '.format(socket.getfqdn())
                            msgBody += 'at {0} (UTC)'.format(datetime.datetime.utcnow())
                            message = MIMEText(msgBody)
                            message['Subject'] = "Harvester Alarm"
                            message['From'] = harvester_config.watcher.mailFrom
                            message['To'] = harvester_config.watcher.mailTo
                            # send email
                            mainLog.debug('sending email to {0}'.format(
                                harvester_config.watcher.mailTo))
                            server = smtplib.SMTP(
                                harvester_config.watcher.mailServer,
                                harvester_config.watcher.mailPort)
                            if hasattr(harvester_config.watcher, 'mailUseSSL') and \
                                    harvester_config.watcher.mailUseSSL is True:
                                server.starttls()
                            if mailUser is not None and mailPass is not None:
                                server.login(mailUser, mailPass)
                            # NOTE(review): ehlo after login is unusual
                            # (normally ehlo precedes starttls/login) - confirm
                            server.ehlo()
                            server.sendmail(
                                harvester_config.watcher.mailFrom,
                                harvester_config.watcher.mailTo.split(','),
                                message.as_string())
                            server.quit()
                    # kill
                    if 'kill' in harvester_config.watcher.actions.split(','):
                        # send USR2 fist
                        mainLog.debug('sending SIGUSR2')
                        os.killpg(os.getpgrp(), signal.SIGUSR2)
                        time.sleep(60)
                        mainLog.debug('sending SIGKILL')
                        os.killpg(os.getpgrp(), signal.SIGKILL)
            else:
                mainLog.debug('skip as {0} is missing'.format(logFileName))
    except IOError:
        # lock held by another thread/process, or interval not yet elapsed
        mainLog.debug('skip as locked by another thread or too early to check')
    except Exception:
        core_utils.dump_error_message(mainLog)
    mainLog.debug('done')
def run(self):
    """Main loop of the sweeper agent.

    Each cycle: (1) processes killWorkers commands from the server,
    (2) kills workers flagged for termination, (3) cleans up workers in
    terminal states past their retention timeouts, (4) deletes old job
    records, and (5) optionally prunes disk areas above a configured
    high watermark. Repeats until terminated().
    """
    # unique owner tag for DB locks taken by this process
    lockedBy = 'sweeper-{0}'.format(self.get_pid())
    while True:
        sw_main = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        # get commands to kill
        sw_getcomm = core_utils.get_stopwatch()
        mainLog.debug('try to get commands')
        comStr = CommandSpec.COM_killWorkers
        commandSpecs = self.dbProxy.get_commands_for_receiver('sweeper', comStr)
        mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
        for commandSpec in commandSpecs:
            # mark matching workers to be killed; actual killing happens below
            n_to_kill = self.dbProxy.kill_workers_by_query(commandSpec.params)
            mainLog.debug('will kill {0} workers with {1}'.format(n_to_kill, commandSpec.params))
        mainLog.debug('done handling commands' + sw_getcomm.get_elapsed_time())
        # killing stage
        sw_kill = core_utils.get_stopwatch()
        mainLog.debug('try to get workers to kill')
        # get workers to kill
        workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                         harvester_config.sweeper.checkInterval)
        mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
        # loop over all workers, grouped by queue and then by configID
        sw = core_utils.get_stopwatch()
        for queueName, configIdWorkSpecList in iteritems(workersToKill):
            for configID, workspec_list in iteritems(configIdWorkSpecList):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName, configID):
                    mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                try:
                    sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                except Exception:
                    mainLog.error('failed to launch sweeper plugin for {0}/{1}'.format(queueName, configID))
                    core_utils.dump_error_message(mainLog)
                    continue
                sw.reset()
                n_workers = len(workspec_list)
                try:
                    # try bulk method
                    tmpLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
                    tmpLog.debug('start killing')
                    tmpList = sweeperCore.kill_workers(workspec_list)
                except AttributeError:
                    # fall back to single-worker method when the plugin has no kill_workers
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start killing one worker')
                            tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                            tmpLog.debug('done killing with status={0} diag={1}'.format(tmpStat, tmpOut))
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                except Exception:
                    core_utils.dump_error_message(mainLog)
                else:
                    # bulk method succeeded; count per-worker results
                    n_killed = 0
                    for workspec, (tmpStat, tmpOut) in zip(workspec_list, tmpList):
                        tmpLog.debug('done killing workerID={0} with status={1} diag={2}'.format(
                            workspec.workerID, tmpStat, tmpOut))
                        if tmpStat:
                            n_killed += 1
                    tmpLog.debug('killed {0}/{1} workers'.format(n_killed, n_workers))
                mainLog.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time())
        mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
        # cleanup stage
        sw_cleanup = core_utils.get_stopwatch()
        # timeout for missed; default to 24 when not configured
        try:
            keepMissed = harvester_config.sweeper.keepMissed
        except Exception:
            keepMissed = 24
        try:
            keepPending = harvester_config.sweeper.keepPending
        except Exception:
            keepPending = 24
        # get workers for cleanup: retention timeout per terminal status
        statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                            'failed': harvester_config.sweeper.keepFailed,
                            'cancelled': harvester_config.sweeper.keepCancelled,
                            'missed': keepMissed,
                            'pending': keepPending}
        workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                 statusTimeoutMap)
        mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
        sw = core_utils.get_stopwatch()
        for queueName, configIdWorkSpecList in iteritems(workersForCleanup):
            for configID, workspec_list in iteritems(configIdWorkSpecList):
                # get sweeper
                if not self.queueConfigMapper.has_queue(queueName, configID):
                    mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
                sw.reset()
                n_workers = len(workspec_list)
                # make sure workers to clean up are all terminated
                mainLog.debug('making sure workers to clean up are all terminated')
                try:
                    # try bulk method
                    tmpList = sweeperCore.kill_workers(workspec_list)
                except AttributeError:
                    # fall back to single-worker method
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpStat, tmpOut = sweeperCore.kill_worker(workspec)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                except Exception:
                    core_utils.dump_error_message(mainLog)
                mainLog.debug('made sure workers to clean up are all terminated')
                # start cleanup
                for workspec in workspec_list:
                    tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                              method_name='run')
                    try:
                        tmpLog.debug('start cleaning up one worker')
                        # sweep worker
                        tmpStat, tmpOut = sweeperCore.sweep_worker(workspec)
                        tmpLog.debug('swept_worker with status={0} diag={1}'.format(tmpStat, tmpOut))
                        tmpLog.debug('start messenger cleanup')
                        mc_tmpStat, mc_tmpOut = messenger.clean_up(workspec)
                        tmpLog.debug('messenger cleaned up with status={0} diag={1}'.format(
                            mc_tmpStat, mc_tmpOut))
                        # worker row is deleted only when the sweep succeeded
                        if tmpStat:
                            self.dbProxy.delete_worker(workspec.workerID)
                    except Exception:
                        core_utils.dump_error_message(tmpLog)
                mainLog.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time())
        mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
        # old-job-deletion stage
        sw_delete = core_utils.get_stopwatch()
        mainLog.debug('delete old jobs')
        # keep jobs at least as long as the longest worker retention
        jobTimeout = max(statusTimeoutMap.values()) + 1
        self.dbProxy.delete_old_jobs(jobTimeout)
        # delete orphaned job info
        self.dbProxy.delete_orphaned_job_info()
        mainLog.debug('done deletion of old jobs' + sw_delete.get_elapsed_time())
        # disk cleanup; runs only when both config options are present
        if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                hasattr(harvester_config.sweeper, 'diskHighWatermark'):
            # process lock throttles cleanup to once per diskCleanUpInterval hours
            locked = self.dbProxy.get_process_lock('sweeper', self.get_pid(),
                                                   harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
            if locked:
                try:
                    all_active_files = None
                    for item in harvester_config.sweeper.diskHighWatermark.split(','):
                        # dir name and watermark in GB
                        dir_name, watermark = item.split('|')
                        mainLog.debug('checking {0} for cleanup with watermark {1} GB'.format(
                            dir_name, watermark))
                        watermark = int(watermark) * 10**9
                        total_size = 0
                        # file_dict maps mtime -> set of (base_name, full_name, size)
                        file_dict = {}
                        # scan dir
                        for root, dirs, filenames in walk(dir_name):
                            for base_name in filenames:
                                full_name = os.path.join(root, base_name)
                                f_size = os.path.getsize(full_name)
                                total_size += f_size
                                mtime = os.path.getmtime(full_name)
                                file_dict.setdefault(mtime, set())
                                file_dict[mtime].add((base_name, full_name, f_size))
                        # delete if necessary
                        if total_size < watermark:
                            mainLog.debug('skip cleanup {0} due to total_size {1} GB < watermark {2} GB'.format(
                                dir_name, total_size // (10**9), watermark // (10**9)))
                        else:
                            mainLog.debug('cleanup {0} due to total_size {1} GB >= watermark {2} GB'.format(
                                dir_name, total_size // (10**9), watermark // (10**9)))
                            # get active input files (lazy; fetched once per cycle)
                            if all_active_files is None:
                                all_active_files = self.dbProxy.get_all_active_input_files()
                            deleted_size = 0
                            # delete oldest files first until below the watermark
                            mtimes = sorted(file_dict.keys())
                            for mtime in mtimes:
                                for base_name, full_name, f_size in file_dict[mtime]:
                                    # keep if active
                                    if base_name in all_active_files:
                                        continue
                                    try:
                                        os.remove(full_name)
                                    except Exception:
                                        core_utils.dump_error_message(mainLog)
                                    # NOTE(review): f_size is counted as deleted even when
                                    # os.remove failed — confirm this is intentional best-effort
                                    deleted_size += f_size
                                    if total_size - deleted_size < watermark:
                                        break
                                if total_size - deleted_size < watermark:
                                    break
                except Exception:
                    core_utils.dump_error_message(mainLog)
        # time the cycle
        mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.sweeper.sleepTime):
            mainLog.debug('terminated')
            return
def submit_a_worker(data):
    """Submit a single worker to HTCondor.

    Builds a submit description file from *data*, runs condor_submit
    (optionally against a remote schedd/pool, with -spool), then parses
    the batchID from stdout and records submission host, CE and log
    file locations on the workspec.

    :param data: dict with keys 'workspec', 'to_submit' and, when
        to_submit is True, 'ce_info_dict', 'batch_log_dict',
        'condor_schedd', 'condor_pool', 'use_spool'
    :return: ((status, diag), changed_attributes) where status is True
        on success and None on failure
    """
    workspec = data['workspec']
    to_submit = data['to_submit']
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='submit_a_worker')
    # no need to submit bad worker
    if not to_submit:
        errStr = 'Not submitted, due to incomplete data of the worker'
        tmpLog.warning(errStr)
        tmpRetVal = (None, errStr)
        return tmpRetVal, workspec.get_changed_attributes()
    # attributes
    try:
        ce_info_dict = data['ce_info_dict']
        batch_log_dict = data['batch_log_dict']
        condor_schedd = data['condor_schedd']
        condor_pool = data['condor_pool']
        use_spool = data['use_spool']
    except KeyError:
        errStr = 'Not submitted, due to incomplete data of the worker'
        tmpLog.warning(errStr)
        tmpRetVal = (None, errStr)
        return tmpRetVal, workspec.get_changed_attributes()
    else:
        workspec.reset_changed_list()
    # make batch script
    batchFile = make_batch_script(**data)
    # make condor remote options; -spool only makes sense with a remote schedd
    name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else ''
    pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else ''
    # FIX: was "'-spool'.format(use_spool)" — a no-op .format on a string
    # with no placeholder; the flag itself carries no value
    spool_opt = '-spool' if use_spool and condor_schedd else ''
    # command
    comStr = 'condor_submit {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(sdf_file=batchFile,
                                                                                 name_opt=name_opt,
                                                                                 pool_opt=pool_opt,
                                                                                 spool_opt=spool_opt)
    # submit
    tmpLog.debug('submit with command: {0}'.format(comStr))
    try:
        p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract batchID from the condor_submit success line
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            # raw string: the pattern contains \d and \. escapes
            job_id_match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                                     tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            # group(2) is the cluster id
            workspec.batchID = job_id_match.group(2)
            # set submissionHost
            if not condor_schedd and not condor_pool:
                workspec.submissionHost = None
            else:
                workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
            tmpLog.debug('submissionHost={0} batchID={1}'.format(workspec.submissionHost,
                                                                 workspec.batchID))
            # set computingElement
            workspec.computingElement = ce_info_dict.get('ce_endpoint', '')
            # set log paths, substituting the now-known ClusterId
            batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                              ClusterId=workspec.batchID)
            batch_stdout = _condor_macro_replace(batch_log_dict['batch_stdout'],
                                                 ClusterId=workspec.batchID)
            batch_stderr = _condor_macro_replace(batch_log_dict['batch_stderr'],
                                                 ClusterId=workspec.batchID)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            if not workspec.get_jobspec_list():
                tmpLog.debug('No jobspec associated in the worker of workerID={0}'.format(
                    workspec.workerID))
            else:
                for jobSpec in workspec.get_jobspec_list():
                    # using batchLog and stdOut URL as pilotID and pilotLog
                    jobSpec.set_one_attribute('pilotID', workspec.workAttributes['stdOut'])
                    jobSpec.set_one_attribute('pilotLog', workspec.workAttributes['batchLog'])
            tmpLog.debug('Done set_log_file after submission')
            tmpRetVal = (True, '')
        else:
            errStr = 'batchID cannot be found'
            tmpLog.error(errStr)
            tmpRetVal = (None, errStr)
    else:
        # failed
        errStr = '{0} \n {1}'.format(stdOut, stdErr)
        tmpLog.error(errStr)
        tmpRetVal = (None, errStr)
    return tmpRetVal, workspec.get_changed_attributes()
def get_files_to_stage_out(self, workspec):
    """Collect output files reported by the payload for each PandaID of a worker.

    For every PandaID, looks for the outputs JSON under the worker's
    access point; a fresh file is atomically renamed to a ``.read``
    suffix so it is not overwritten while being processed. Entries are
    validated and folded into a per-PandaID, per-LFN file dictionary;
    event-status records for event-service outputs are dumped to the
    events-update JSON in the same access point.

    :param workspec: the worker whose access point(s) are scanned
    :return: {pandaID: {lfn: [fileDict, ...]}}
    """
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='get_files_to_stage_out')
    fileDict = dict()
    # look for the json just under the access point
    for pandaID in workspec.pandaid_list:
        # look for the json just under the access point
        accessPoint = self.get_access_point(workspec, pandaID)
        jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName)
        readJsonPath = jsonFilePath + suffixReadJson
        # first look for json.read which is not yet acknowledged
        tmpLog.debug('looking for output file {0}'.format(readJsonPath))
        if os.path.exists(readJsonPath):
            # a previously renamed file is re-processed until acknowledged
            pass
        else:
            tmpLog.debug('looking for output file {0}'.format(jsonFilePath))
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found')
                continue
            try:
                tmpLog.debug('found')
                # rename to prevent from being overwritten
                os.rename(jsonFilePath, readJsonPath)
            except Exception:
                tmpLog.error('failed to rename json')
                continue
        # load json
        toSkip = False
        loadDict = None
        try:
            with open(readJsonPath) as jsonFile:
                loadDict = json.load(jsonFile)
        except Exception:
            tmpLog.error('failed to load json')
            toSkip = True
        # test validity of data format (ie it should be a Dictionary)
        if not toSkip:
            if not isinstance(loadDict, dict):
                tmpLog.error('loaded data is not a dictionary')
                toSkip = True
        # collect files and events
        nData = 0
        if not toSkip:
            # per-PFN caches so size/checksum are computed at most once
            sizeMap = dict()
            chksumMap = dict()
            eventsList = dict()
            for tmpPandaID, tmpEventMapList in iteritems(loadDict):
                # JSON keys are strings; convert back to a numeric PandaID
                tmpPandaID = long(tmpPandaID)
                # test if tmpEventMapList is a list
                if not isinstance(tmpEventMapList, list):
                    tmpLog.error('loaded data item is not a list')
                    toSkip = True
                    break
                for tmpEventInfo in tmpEventMapList:
                    try:
                        nData += 1
                        if 'eventRangeID' in tmpEventInfo:
                            tmpEventRangeID = tmpEventInfo['eventRangeID']
                        else:
                            tmpEventRangeID = None
                        tmpFileDict = dict()
                        pfn = tmpEventInfo['path']
                        lfn = os.path.basename(pfn)
                        tmpFileDict['path'] = pfn
                        if pfn not in sizeMap:
                            # prefer the reported size; fall back to stat
                            if 'fsize' in tmpEventInfo:
                                sizeMap[pfn] = tmpEventInfo['fsize']
                            else:
                                sizeMap[pfn] = os.stat(pfn).st_size
                        tmpFileDict['fsize'] = sizeMap[pfn]
                        tmpFileDict['type'] = tmpEventInfo['type']
                        if tmpEventInfo['type'] in ['log', 'output']:
                            # disable zipping
                            tmpFileDict['isZip'] = 0
                        elif tmpEventInfo['type'] == 'zip_output':
                            # already zipped
                            tmpFileDict['isZip'] = 1
                        elif 'isZip' in tmpEventInfo:
                            tmpFileDict['isZip'] = tmpEventInfo['isZip']
                        # guid: take the reported one or generate a fresh uuid4
                        if 'guid' in tmpEventInfo:
                            tmpFileDict['guid'] = tmpEventInfo['guid']
                        else:
                            tmpFileDict['guid'] = str(uuid.uuid4())
                        # get checksum
                        if pfn not in chksumMap:
                            if 'chksum' in tmpEventInfo:
                                chksumMap[pfn] = tmpEventInfo['chksum']
                            else:
                                chksumMap[pfn] = core_utils.calc_adler32(pfn)
                        tmpFileDict['chksum'] = chksumMap[pfn]
                        if tmpPandaID not in fileDict:
                            fileDict[tmpPandaID] = dict()
                        if lfn not in fileDict[tmpPandaID]:
                            fileDict[tmpPandaID][lfn] = []
                        fileDict[tmpPandaID][lfn].append(tmpFileDict)
                        # skip if unrelated to events
                        if tmpFileDict['type'] not in ['es_output', 'zip_output']:
                            continue
                        tmpFileDict['eventRangeID'] = tmpEventRangeID
                        if tmpPandaID not in eventsList:
                            eventsList[tmpPandaID] = list()
                        eventsList[tmpPandaID].append({'eventRangeID': tmpEventRangeID,
                                                       'eventStatus': tmpEventInfo['eventStatus']})
                    except Exception:
                        # one bad entry does not abort the whole file
                        core_utils.dump_error_message(tmpLog)
            # dump events
            if not toSkip:
                if len(eventsList) > 0:
                    curName = os.path.join(accessPoint, jsonEventsUpdateFileName)
                    # write to .new then rename so readers never see a partial file
                    newName = curName + '.new'
                    f = open(newName, 'w')
                    json.dump(eventsList, f)
                    f.close()
                    os.rename(newName, curName)
        # remove empty file
        if toSkip or nData == 0:
            try:
                os.remove(readJsonPath)
            except Exception:
                pass
        tmpLog.debug('got {0} files for PandaID={1}'.format(nData, pandaID))
    return fileDict
def ssh_zip_output(self, jobspec, tmp_log):
    """Zip output files of a job on a remote host over ssh.

    For each output FileSpec: checks which associated files exist on
    the remote host (via a temp argument file and ``test -f``), marks
    missing ones as 'failed', then zips the existing ones remotely with
    ssh_make_one_zip. Existence checks and zipping both run in a thread
    pool of nThreadsForZip workers.

    :param jobspec: job whose outFiles are to be zipped
    :param tmp_log: logger to use
    :return: (True, '') on success, (False/None, diag) on failure
    """
    tmp_log.debug('start')
    self.zip_tmp_log = tmp_log
    self.zip_jobSpec = jobspec
    argDictList = []
    # snapshot so pool.map order matches the later zip() pairing
    outFiles_list = list(jobspec.outFiles)
    try:
        # thread count: zipper config, else stager config, else CPU count
        try:
            if hasattr(harvester_config, 'zipper'):
                nThreadsForZip = harvester_config.zipper.nThreadsForZip
            else:
                nThreadsForZip = harvester_config.stager.nThreadsForZip
        except Exception:
            nThreadsForZip = multiprocessing.cpu_count()

        # check associate file existence
        def _check_assfile_existence(fileSpec):
            # one path per line; '\\n' becomes a literal \n in the remote
            # shell command so the remote side receives separate lines
            in_data = '\\n'.join(['{0}'.format(assFileSpec.path)
                                  for assFileSpec in fileSpec.associatedFiles])
            # com1: write the path list into a remote temp file
            com1 = ('ssh '
                    '-o StrictHostKeyChecking=no '
                    '-i {sshkey} '
                    '{userhost} '
                    '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "'
                    ).format(
                        sshkey=self.sshkey,
                        userhost=self.userhost,
                        fileop_script=self.fileop_script,
                        suffix='_check-exist.tmp',
                        dir=os.path.dirname(next(iter(fileSpec.associatedFiles)).path),
                        data=in_data,
                    )
            # execute
            p1 = subprocess.Popen(com1,
                                  shell=True,
                                  close_fds=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            stdOut, stdErr = p1.communicate()
            retCode = p1.returncode
            if retCode != 0:
                msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr)
                tmp_log.error(msgStr)
                # NOTE(review): this returns a (False, str) tuple while the
                # success path returns a set; downstream membership tests on
                # the tuple make all files look missing — confirm intended
                return False, 'failed to zip with {0}'.format(msgStr)
            # py2/py3: decode bytes stdout when necessary
            stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode()
            tmpargfile_name = stdOut_str.strip('\n')
            del p1, stdOut, stdErr
            # record set of paths that exist remotely
            existence_set = set()
            # com2: test each listed path remotely, echoing T or F per line
            com2 = ('ssh '
                    '-o StrictHostKeyChecking=no '
                    '-i {sshkey} '
                    '{userhost} '
                    '"cat {arg_file} | xargs -I%% sh -c \' test -f %% && echo T || echo F \' " '
                    ).format(
                        sshkey=self.sshkey,
                        userhost=self.userhost,
                        arg_file=tmpargfile_name,
                    )
            # execute
            p2 = subprocess.Popen(com2,
                                  shell=True,
                                  close_fds=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            stdOut, stdErr = p2.communicate()
            retCode = p2.returncode
            if retCode != 0:
                msgStr = 'failed to existence of associate files with {0}:{1}'.format(stdOut, stdErr)
                tmp_log.error(msgStr)
            else:
                try:
                    stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) \
                        else stdOut.decode()
                    ret_list = stdOut_str.strip('\n').split('\n')
                    # only trust the answer if line count matches the file count
                    if len(fileSpec.associatedFiles) == len(ret_list):
                        for (assFileSpec, retVal) in zip(fileSpec.associatedFiles, ret_list):
                            if retVal == 'T':
                                existence_set.add(assFileSpec.path)
                    else:
                        msgStr = 'returned number of files inconsistent! Skipped...'
                        tmp_log.error(msgStr)
                except Exception:
                    core_utils.dump_error_message(tmp_log)
            del p2, stdOut, stdErr, com2
            # com3: delete the remote tmpargfile (best-effort)
            com3 = ('ssh '
                    '-o StrictHostKeyChecking=no '
                    '-i {sshkey} '
                    '{userhost} '
                    '"{fileop_script} remove_file {file_path} "').format(
                        sshkey=self.sshkey,
                        userhost=self.userhost,
                        fileop_script=self.fileop_script,
                        file_path=tmpargfile_name,
                    )
            # execute
            p3 = subprocess.Popen(com3,
                                  shell=True,
                                  close_fds=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            stdOut, stdErr = p3.communicate()
            retCode = p3.returncode
            if retCode != 0:
                msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr)
                tmp_log.error(msgStr)
            del p3, stdOut, stdErr
            gc.collect()
            return existence_set

        # parallel execution of check existence
        with Pool(max_workers=nThreadsForZip) as pool:
            existence_set_list = pool.map(_check_assfile_existence, outFiles_list)
        # loop over output files, building zip arguments
        for fileSpec, existence_set in zip(outFiles_list, existence_set_list):
            if self.zipDir == "${SRCDIR}":
                # the same directory as src
                zipDir = os.path.dirname(next(iter(fileSpec.associatedFiles)).path)
            elif self.zipDir == "${WORKDIR}":
                # work dir
                workSpec = jobspec.get_workspec_list()[0]
                zipDir = workSpec.get_access_point()
            else:
                zipDir = self.zipDir
            zipPath = os.path.join(zipDir, fileSpec.lfn)
            argDict = dict()
            argDict['zipPath'] = zipPath
            argDict['associatedFiles'] = []
            # check existence of files; missing ones are marked failed
            for assFileSpec in fileSpec.associatedFiles:
                if assFileSpec.path in existence_set:
                    argDict['associatedFiles'].append(assFileSpec.path)
                else:
                    assFileSpec.status = 'failed'
            # append
            argDictList.append(argDict)
        # parallel execution of zip
        with Pool(max_workers=nThreadsForZip) as pool:
            retValList = pool.map(self.ssh_make_one_zip, argDictList)
        # check returns
        for fileSpec, retVal in zip(jobspec.outFiles, retValList):
            tmpRet, errMsg, fileInfo = retVal
            if tmpRet is True:
                # set path/size/checksum of the produced zip on the FileSpec
                fileSpec.path = fileInfo['path']
                fileSpec.fsize = fileInfo['fsize']
                fileSpec.chksum = fileInfo['chksum']
                msgStr = 'fileSpec.path - {0}, fileSpec.fsize - {1}, fileSpec.chksum(adler32) - {2}' \
                    .format(fileSpec.path, fileSpec.fsize, fileSpec.chksum)
                tmp_log.debug(msgStr)
            else:
                # one failed zip aborts the whole job's zipping
                tmp_log.error('got {0} with {1} when zipping {2}'.format(tmpRet, errMsg,
                                                                         fileSpec.lfn))
                return tmpRet, 'failed to zip with {0}'.format(errMsg)
    except Exception:
        errMsg = core_utils.dump_error_message(tmp_log)
        return False, 'failed to zip with {0}'.format(errMsg)
    tmp_log.debug('done')
    return True, ''
def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log):
    """Check the status of a list of workers with the monitor plugin.

    Workers doing job-level late binding get their status derived from
    whether a job was requested; the rest are passed to the plugin's
    check_workers. Per worker, the messenger is queried for kill
    requests, heartbeat, work attributes, files to stage out, event
    updates/requests and PandaIDs, and terminal states are held in
    'running' while output or events are still pending.

    :return: (True, retMap) on success, (False, None) on failure,
        where retMap maps workerID to the collected per-worker info
    """
    workersToCheck = []
    retMap = dict()
    for workSpec in all_workers:
        eventsRequestParams = {}
        eventsToUpdate = []
        pandaIDs = []
        workStatus = None
        workAttributes = None
        filesToStageOut = []
        nJobsToReFill = None
        # job-level late binding
        if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
            # check if job is requested
            jobRequested = messenger.job_requested(workSpec)
            if jobRequested:
                # set ready when job is requested
                workStatus = WorkSpec.ST_ready
            else:
                workStatus = workSpec.status
        elif workSpec.nJobsToReFill in [0, None]:
            # check if job is requested to refill free slots
            jobRequested = messenger.job_requested(workSpec)
            if jobRequested:
                nJobsToReFill = jobRequested
            workersToCheck.append(workSpec)
        else:
            workersToCheck.append(workSpec)
        # add a default entry; fields are filled in after the plugin check
        retMap[workSpec.workerID] = {'newStatus': workStatus,
                                     'monStatus': workStatus,
                                     'workAttributes': workAttributes,
                                     'filesToStageOut': filesToStageOut,
                                     'eventsRequestParams': eventsRequestParams,
                                     'eventsToUpdate': eventsToUpdate,
                                     'diagMessage': '',
                                     'pandaIDs': pandaIDs,
                                     'nJobsToReFill': nJobsToReFill}
    # check workers
    tmp_log.debug('checking workers with plugin')
    try:
        tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
        if not tmpStat:
            tmp_log.error('failed to check workers with: {0}'.format(tmpOut))
        else:
            tmp_log.debug('checked')
            # plugin returns (status, diag) per worker in input order
            for workSpec, (newStatus, diagMessage) in zip(workersToCheck, tmpOut):
                workerID = workSpec.workerID
                tmp_log.debug('Going to check workerID={0}'.format(workerID))
                pandaIDs = []
                if workerID in retMap:
                    # request kill
                    if messenger.kill_requested(workSpec):
                        self.dbProxy.kill_worker(workSpec.workerID)
                    # expired heartbeat - only when requested in the configuration
                    try:
                        # check if the queue configuration requires checking for worker heartbeat
                        worker_heartbeat_limit = int(queue_config.messenger['worker_heartbeat'])
                    except (AttributeError, KeyError):
                        worker_heartbeat_limit = None
                    tmp_log.debug('workerID={0} heartbeat limit is configured to {1}'.format(
                        workerID, worker_heartbeat_limit))
                    if worker_heartbeat_limit:
                        if messenger.is_alive(workSpec, worker_heartbeat_limit):
                            tmp_log.debug('heartbeat for workerID={0} is valid'.format(workerID))
                        else:
                            tmp_log.debug('heartbeat for workerID={0} expired: sending kill request'.format(
                                workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                    # get work attributes
                    workAttributes = messenger.get_work_attributes(workSpec)
                    retMap[workerID]['workAttributes'] = workAttributes
                    # get output files
                    filesToStageOut = messenger.get_files_to_stage_out(workSpec)
                    retMap[workerID]['filesToStageOut'] = filesToStageOut
                    # get events to update
                    if workSpec.eventsRequest in [WorkSpec.EV_useEvents,
                                                  WorkSpec.EV_requestEvents]:
                        eventsToUpdate = messenger.events_to_update(workSpec)
                        retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                    # request events
                    if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                        eventsRequestParams = messenger.events_requested(workSpec)
                        retMap[workerID]['eventsRequestParams'] = eventsRequestParams
                    # get PandaIDs for pull model
                    if workSpec.mapType == WorkSpec.MT_NoJob:
                        pandaIDs = messenger.get_panda_ids(workSpec)
                    retMap[workerID]['pandaIDs'] = pandaIDs
                    # keep original new status
                    retMap[workerID]['monStatus'] = newStatus
                    # set running while there are events to update or files to stage out
                    if newStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed,
                                     WorkSpec.ST_cancelled]:
                        if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                len(retMap[workerID]['eventsToUpdate']) > 0:
                            newStatus = WorkSpec.ST_running
                        elif not workSpec.is_post_processed():
                            if not queue_config.is_no_heartbeat_status(newStatus):
                                # post processing unless heartbeat is suppressed
                                jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID,
                                                                                None, True,
                                                                                only_running=True)
                                # post processing
                                messenger.post_processing(workSpec, jobSpecs, workSpec.mapType)
                            workSpec.post_processed()
                            newStatus = WorkSpec.ST_running
                        # reset modification time to immediately trigger subsequent lookup
                        workSpec.trigger_next_lookup()
                    retMap[workerID]['newStatus'] = newStatus
                    retMap[workerID]['diagMessage'] = diagMessage
                else:
                    tmp_log.debug('workerID={0} not in retMap'.format(workerID))
        return True, retMap
    # FIX: was a bare "except:" which also swallows SystemExit and
    # KeyboardInterrupt; narrowed to Exception
    except Exception:
        core_utils.dump_error_message(tmp_log)
        return False, None
def run(self): lockedBy = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo while True: sw_main = core_utils.get_stopwatch() mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit( harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval) submitted = False if siteName is not None: mainLog.debug('got {0} queues for site {1}'.format( len(curWorkers), siteName)) # get commands comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver( 'submitter', comStr) mainLog.debug('got {0} {1} commands'.format( commandSpecs, comStr)) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit( siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): # if available, overwrite new worker value with the command from panda server if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][tmpResource][ 'nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: n_workers_per_queue_and_rt = dict() else: n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers( curWorkers, siteName) if n_workers_per_queue_and_rt is None: mainLog.error( 'WorkerAdjuster failed to define the number of workers' ) elif len(n_workers_per_queue_and_rt) == 0: pass else: # loop over all queues and resource types for queueName in n_workers_per_queue_and_rt: for resource_type, tmpVal in iteritems( n_workers_per_queue_and_rt[queueName]): tmpLog = self.make_logger( _logger, 'id={0} queue={1} rtype={2}'.format( lockedBy, queueName, resource_type), method_name='run') try: tmpLog.debug('start') tmpLog.debug('workers status: %s' % tmpVal) nWorkers = 
tmpVal['nNewWorkers'] + tmpVal[ 'nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue( queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug( 'skipped since no new worker is needed based on current stats' ) continue # get queue queueConfig = self.queueConfigMapper.get_queue( queueName) workerMakerCore = self.workerMaker.get_plugin( queueConfig) # check if resource is ready if hasattr( workerMakerCore, 'dynamicSizing' ) and workerMakerCore.dynamicSizing is True: numReadyResources = self.workerMaker.num_ready_resources( queueConfig, resource_type, workerMakerCore) tmpLog.debug('numReadyResources: %s' % numReadyResources) if not numReadyResources: if hasattr(workerMakerCore, 'staticWorkers'): nQRWorkers = tmpVal[ 'nQueue'] + tmpVal['nRunning'] tmpLog.debug( 'staticWorkers: %s, nQRWorkers(Queue+Running): %s' % (workerMakerCore.staticWorkers, nQRWorkers)) if nQRWorkers >= workerMakerCore.staticWorkers: tmpLog.debug( 'No left static workers, skip' ) continue else: nWorkers = min( workerMakerCore. staticWorkers - nQRWorkers, nWorkers) tmpLog.debug( 'staticWorkers: %s, nWorkers: %s' % (workerMakerCore. staticWorkers, nWorkers)) else: tmpLog.debug( 'skip since no resources are ready' ) continue else: nWorkers = min(nWorkers, numReadyResources) # post action of worker maker if hasattr( workerMakerCore, 'skipOnFail' ) and workerMakerCore.skipOnFail is True: skipOnFail = True else: skipOnFail = False # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter. checkInterval, harvester_config. 
submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( queueConfig, nWorkers, resource_type, maker=workerMakerCore) tmpLog.debug('nJobsPerWorker={0}'.format( nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter. checkInterval, harvester_config. submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job( queueConfig, nWorkers, resource_type, maker=workerMakerCore) maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( queueConfig, resource_type, maker=workerMakerCore) maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( queueConfig, resource_type, maker=workerMakerCore) tmpLog.debug('nWorkersPerJob={0}'.format( nWorkersPerJob)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter. checkInterval, harvester_config.submitter. 
lockInterval, lockedBy, max_workers_per_job_in_total= maxWorkersPerJob, max_workers_per_job_per_cycle= maxWorkersPerJobPerCycle) else: tmpLog.error('unknown mapType={0}'.format( queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format( len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers( jobChunks, queueConfig, nReady, resource_type, maker=workerMakerCore) if len(ngChunks) == 0: tmpLog.debug( 'successfully made {0} workers'.format( len(okChunks))) else: tmpLog.debug( 'made {0} workers, while {1} workers failed' .format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() pandaIDs = set() # NG (=not good) for ngJobs in ngChunks: for jobSpec in ngJobs: if skipOnFail: # release jobs when workers are not made pandaIDs.add(jobSpec.PandaID) else: jobSpec.status = 'failed' jobSpec.subStatus = 'failed_to_make' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None errStr = 'failed to make a worker' jobSpec.set_pilot_error( PilotErrors.ERR_SETUPFAILURE, errStr) jobSpec.trigger_propagation() self.dbProxy.update_job( jobSpec, { 'lockedBy': lockedBy, 'subStatus': 'prepared' }) # OK workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [ None, 0 ]: workSpec.set_jobspec_list( okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list( okJobs[:workSpec. nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[ workSpec. 
nJobsToReFill:]: pandaIDs.add( jobSpec.PandaID) workSpec.set_num_jobs_with_list() # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger[ 'accessPoint'] # sync level workSpec.syncLevel = queueConfig.get_synchronization_level( ) # events if len(okJobs) > 0 and \ ('eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: sw = core_utils.get_stopwatch() # get plugin for submitter submitterCore = self.pluginFactory.get_plugin( queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found' .format(jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin( queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found' .format(jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs( workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}' .format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}' .format( workSpec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers( workSpecList, lockedBy) # submit sw.reset() tmpLog.info( 'submitting {0} workers'.format( len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers( submitterCore, workSpecList) tmpLog.debug('done submitting {0} workers'. 
format(len(workSpecList)) + sw.get_elapsed_time()) # collect successful jobs okPandaIDs = set() for iWorker, (tmpRet, tmpStr) in enumerate( zip(tmpRetList, tmpStrList)): if tmpRet: workSpec, jobList = okChunks[ iWorker] jobList = workSpec.get_jobspec_list( ) if jobList is not None: for jobSpec in jobList: okPandaIDs.add( jobSpec.PandaID) # loop over all workers for iWorker, (tmpRet, tmpStr) in enumerate( zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # set harvesterHost workSpec.harvesterHost = socket.gethostname( ) # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission errStr = 'failed to submit a workerID={0} with {1}'.format( workSpec.workerID, tmpStr) tmpLog.error(errStr) workSpec.set_status( WorkSpec.ST_missed) workSpec.set_dialog_message(tmpStr) workSpec.set_pilot_error( PilotErrors.ERR_SETUPFAILURE, errStr) if jobList is not None: # increment attempt number newJobList = [] for jobSpec in jobList: # skip if successful with another worker if jobSpec.PandaID in okPandaIDs: continue if jobSpec.submissionAttempts is None: jobSpec.submissionAttempts = 0 jobSpec.submissionAttempts += 1 # max attempt or permanent error if tmpRet is False or \ jobSpec.submissionAttempts >= \ queueConfig.maxSubmissionAttempts: newJobList.append( jobSpec) else: self.dbProxy.increment_submission_attempt( jobSpec.PandaID, jobSpec. 
submissionAttempts) jobList = newJobList elif queueConfig.useJobLateBinding and workSpec.hasJob == 1: # directly go to running after feeding jobs for late biding workSpec.set_status( WorkSpec.ST_running) else: # normal successful submission workSpec.set_status( WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow workSpec.checkTime = timeNow if self.monitor_fifo.enabled: workSpec.set_work_params({ 'lastCheckAt': timeNow_timestamp }) # prefetch events if tmpRet and workSpec.hasJob == 1 and \ workSpec.eventsRequest == WorkSpec.EV_useEvents and \ queueConfig.prefetchEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[jobSpec.PandaID] = \ {'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec.jobParams['jobsetID'], 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), jobSpec.jobParams['coreCount']), } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker( workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: if tmpRet: tmpStr = \ 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info( tmpStr.format( workSpec. workerID, jobSpec. PandaID, workSpec. batchID)) else: tmpStr = 'failed to submit a workerID={0} for PandaID={1}' tmpLog.error( tmpStr.format( workSpec. workerID, jobSpec.PandaID )) else: tmpStr = \ 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error( tmpStr.format( jobSpec.PandaID, workSpec.batchID)) # enqueue to monitor fifo if self.monitor_fifo.enabled \ and queueConfig.mapType != WorkSpec.MT_MultiWorkers: workSpecsToEnqueue = \ [[w] for w in workSpecList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] monitor_fifo.put( (queueName, workSpecsToEnqueue), time.time() + harvester_config. 
monitor.fifoCheckInterval) mainLog.debug( 'put workers to monitor FIFO') submitted = True # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) tmpLog.info('done') except Exception: core_utils.dump_error_message(tmpLog) mainLog.debug('done') # define sleep interval if siteName is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow( ) + datetime.timedelta(seconds=interval) self.dbProxy.update_panda_queue_attribute( 'submitTime', newTime, site_name=siteName) # time the cycle mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): mainLog.debug('terminated') return
def make_one_zip(self, arg_dict):
    """Make a single output archive from a set of associated files.

    Despite the name, the archive is a plain tar file (see the `tar -c`
    command below), built at arg_dict['zipPath'] from the paths listed in
    arg_dict['associatedFiles'].

    :param arg_dict: dict with 'zipPath' (destination archive path) and
                     'associatedFiles' (iterable of input file paths).
    :return: (True, '', fileInfo) on success where fileInfo has
             'path'/'fsize'/'chksum'; (None, errMsg, {}) on retry-able
             failure; (False, errMsg) on unexpected exception.
    """
    try:
        zipPath = arg_dict['zipPath']
        lfn = os.path.basename(zipPath)
        self.zip_tmp_log.debug(
            '{0} start zipPath={1} with {2} files'.format(
                lfn, zipPath, len(arg_dict['associatedFiles'])))
        # make archive only if it doesn't exist yet
        if not os.path.exists(zipPath):
            # temporary file names; unique suffix avoids clashes between workers
            tmpZipPath = zipPath + '.' + str(uuid.uuid4())
            tmpZipPathIn = tmpZipPath + '.in'
            with open(tmpZipPathIn, "w") as f:
                for associatedFile in arg_dict['associatedFiles']:
                    f.write("{0}\n".format(associatedFile))
            # make command; --transform strips directory components
            com = 'tar -c -f {0} -T {1} '.format(tmpZipPath, tmpZipPathIn)
            com += "--transform 's/.*\///' "
            # execute
            p = subprocess.Popen(com,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            stdOut, stdErr = p.communicate()
            retCode = p.returncode
            if retCode != 0:
                msgStr = 'failed to make zip for {0} with {1}:{2}'.format(
                    lfn, stdOut, stdErr)
                self.zip_tmp_log.error(msgStr)
                return None, msgStr, {}
            # take a DB lock so concurrent workers don't overwrite each other
            lockName = 'zip.lock.{0}'.format(lfn)
            lockInterval = 60
            tmpStat = False
            # get lock, retrying for up to lockInterval seconds
            for _ in range(lockInterval):
                tmpStat = self.dbInterface.get_object_lock(
                    lockName, lock_interval=lockInterval)
                if tmpStat:
                    break
                time.sleep(1)
            # failed to lock
            if not tmpStat:
                msgStr = 'failed to lock for {0}'.format(lfn)
                self.zip_tmp_log.error(msgStr)
                # FIX: return the same 3-tuple shape as other retry-able
                # failures (was a 2-tuple, breaking tuple-unpacking callers)
                return None, msgStr, {}
            # another worker may have made it while we waited for the lock
            if not os.path.exists(zipPath):
                os.rename(tmpZipPath, zipPath)
            # release lock
            self.dbInterface.release_object_lock(lockName)
        # make return
        fileInfo = dict()
        fileInfo['path'] = zipPath
        # get size
        statInfo = os.stat(zipPath)
        fileInfo['fsize'] = statInfo.st_size
        fileInfo['chksum'] = core_utils.calc_adler32(zipPath)
    except Exception:
        errMsg = core_utils.dump_error_message(self.zip_tmp_log)
        return False, 'failed to zip with {0}'.format(errMsg)
    self.zip_tmp_log.debug('{0} done'.format(lfn))
    return True, '', fileInfo
def run(self):
    """Main loop of the stager agent.

    Each cycle runs three sequential stages against the job DB:
      1. check the status of jobs already 'transferring';
      2. trigger stage-out for jobs in 'to_transfer' with plain output;
      3. zip output for jobs in 'to_transfer' that need zipping first.
    Loops until self.terminated(...) reports a shutdown request.
    """
    # identifier used for DB-level job locking by this agent instance
    lockedBy = 'stager-{0}'.format(self.get_pid())
    while True:
        sw = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy),
                                   method_name='run')
        mainLog.debug('try to get jobs to check')
        # get jobs to check preparation
        try:
            maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
        except Exception:
            # config knob is optional; None means no per-job file limit
            maxFilesPerJob = None
        jobsToCheck = self.dbProxy.get_jobs_for_stage_out(
            harvester_config.stager.maxJobsToCheck,
            harvester_config.stager.checkInterval,
            harvester_config.stager.lockInterval, lockedBy, 'transferring',
            JobSpec.HO_hasTransfer, max_files_per_job=maxFilesPerJob)
        mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
        # loop over all jobs
        for jobSpec in jobsToCheck:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(
                jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('start checking')
                # configID: only meaningful when dynamic plugin change is on
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(
                        jobSpec.computingSite, configID):
                    tmpLog.error(
                        'queue config for {0}/{1} not found'.format(
                            jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(
                    jobSpec.computingSite, configID)
                # get plugin
                stagerCore = self.pluginFactory.get_plugin(
                    queueConfig.stager)
                if stagerCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(
                        jobSpec.computingSite))
                    continue
                # lock job again to avoid races with other stager threads
                lockedAgain = self.dbProxy.lock_job_again(
                    jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                # check result: True=done, False=fatal, None=still going
                if tmpStat is True:
                    # succeeded
                    newSubStatus = self.dbProxy.update_job_for_stage_out(
                        jobSpec, True, lockedBy)
                    tmpLog.debug(
                        'succeeded new subStatus={0}'.format(newSubStatus))
                elif tmpStat is False:
                    # fatal error
                    tmpLog.debug(
                        'fatal error when checking status with {0}'.format(
                            tmpStr))
                    # update job: mark all unfinished output files failed
                    for fileSpec in jobSpec.outFiles:
                        if fileSpec.status != 'finished':
                            fileSpec.status = 'failed'
                    errStr = 'stage-out failed with {0}'.format(tmpStr)
                    jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED,
                                            errStr)
                    jobSpec.trigger_propagation()
                    newSubStatus = self.dbProxy.update_job_for_stage_out(
                        jobSpec, True, lockedBy)
                    tmpLog.debug(
                        'updated new subStatus={0}'.format(newSubStatus))
                else:
                    # on-going
                    tmpLog.debug(
                        'try to check later since {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        # get jobs to trigger stage-out
        try:
            maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
        except Exception:
            maxFilesPerJob = None
        jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(
            harvester_config.stager.maxJobsToTrigger,
            harvester_config.stager.triggerInterval,
            harvester_config.stager.lockInterval, lockedBy, 'to_transfer',
            JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput,
            max_files_per_job=maxFilesPerJob)
        mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
        # loop over all jobs
        for jobSpec in jobsToTrigger:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(
                jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('try to trigger stage-out')
                # configID
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(
                        jobSpec.computingSite, configID):
                    tmpLog.error(
                        'queue config for {0}/{1} not found'.format(
                            jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(
                    jobSpec.computingSite, configID)
                # get plugin
                stagerCore = self.pluginFactory.get_plugin(
                    queueConfig.stager)
                if stagerCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(
                        jobSpec.computingSite))
                    continue
                # lock job again
                lockedAgain = self.dbProxy.lock_job_again(
                    jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                # trigger stage-out
                tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                # check result: True=triggered, False=fatal, None=retry later
                if tmpStat is True:
                    # succeeded
                    jobSpec.all_files_triggered_to_stage_out()
                    newSubStatus = self.dbProxy.update_job_for_stage_out(
                        jobSpec, True, lockedBy)
                    tmpLog.debug(
                        'triggered new subStatus={0}'.format(newSubStatus))
                elif tmpStat is False:
                    # fatal error
                    tmpLog.debug(
                        'fatal error to trigger with {0}'.format(tmpStr))
                    # update job: mark all unfinished output files failed
                    for fileSpec in jobSpec.outFiles:
                        if fileSpec.status != 'finished':
                            fileSpec.status = 'failed'
                    errStr = 'stage-out failed with {0}'.format(tmpStr)
                    jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED,
                                            errStr)
                    jobSpec.trigger_propagation()
                    newSubStatus = self.dbProxy.update_job_for_stage_out(
                        jobSpec, True, lockedBy)
                    tmpLog.debug(
                        'updated new subStatus={0}'.format(newSubStatus))
                else:
                    # temporary error
                    tmpLog.debug(
                        'try to trigger later since {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        # get jobs to zip output
        try:
            maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
        except Exception:
            maxFilesPerJob = None
        try:
            zipInterval = harvester_config.stager.zipInterval
        except Exception:
            # fall back to the trigger interval when not configured
            zipInterval = harvester_config.stager.triggerInterval
        jobsToZip = self.dbProxy.get_jobs_for_stage_out(
            harvester_config.stager.maxJobsToZip, zipInterval,
            harvester_config.stager.lockInterval, lockedBy, 'to_transfer',
            JobSpec.HO_hasZipOutput, JobSpec.HO_hasOutput,
            max_files_per_job=maxFilesPerJob)
        mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
        # loop over all jobs
        for jobSpec in jobsToZip:
            tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(
                jobSpec.PandaID), method_name='run')
            try:
                tmpLog.debug('try to zip output')
                # configID
                configID = jobSpec.configID
                if not core_utils.dynamic_plugin_change():
                    configID = None
                # get queue
                if not self.queueConfigMapper.has_queue(
                        jobSpec.computingSite, configID):
                    tmpLog.error(
                        'queue config for {0}/{1} not found'.format(
                            jobSpec.computingSite, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(
                    jobSpec.computingSite, configID)
                # get plugin
                stagerCore = self.pluginFactory.get_plugin(
                    queueConfig.stager)
                if stagerCore is None:
                    # not found
                    tmpLog.error('plugin for {0} not found'.format(
                        jobSpec.computingSite))
                    continue
                # lock job again
                lockedAgain = self.dbProxy.lock_job_again(
                    jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                if not lockedAgain:
                    tmpLog.debug('skip since locked by another thread')
                    continue
                # trigger preparation
                tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                # succeeded
                if tmpStat is True:
                    # update job; second arg False = zip stage, not stage-out
                    jobSpec.all_files_zipped()
                    newSubStatus = self.dbProxy.update_job_for_stage_out(
                        jobSpec, False, lockedBy)
                    tmpLog.debug(
                        'zipped new subStatus={0}'.format(newSubStatus))
                else:
                    # failed
                    tmpLog.debug('failed to zip with {0}'.format(tmpStr))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        mainLog.debug('done' + sw.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.stager.sleepTime):
            mainLog.debug('terminated')
            return
def submit_a_worker(data):
    """Submit one worker to HTCondor.

    :param data: dict carrying 'workspec' plus the keyword arguments for
                 make_batch_script (template, log_dir, n_core_per_node,
                 panda_queue_name, x509_user_proxy, ce_info_dict,
                 batch_log_dict, special_par).
    :return: ((ok, errStr), changed_attributes) where ok is True on
             successful submission and changed_attributes reflects the
             workspec updates (batchID, log file locations).
    """
    workspec = data['workspec']
    template = data['template']
    log_dir = data['log_dir']
    n_core_per_node = data['n_core_per_node']
    panda_queue_name = data['panda_queue_name']
    x509_user_proxy = data['x509_user_proxy']
    ce_info_dict = data['ce_info_dict']
    batch_log_dict = data['batch_log_dict']
    special_par = data['special_par']
    # track attribute changes so only deltas are returned to the caller
    workspec.reset_changed_list()
    # make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='submit_a_worker')
    # make batch script from all the collected parameters
    batchFile = make_batch_script(**data)
    # command
    comStr = 'condor_submit {0}'.format(batchFile)
    # submit
    tmpLog.debug('submit with {0}'.format(batchFile))
    try:
        p = subprocess.Popen(comStr.split(),
                             shell=False,
                             universal_newlines=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        # was a bare except; narrowed so SystemExit/KeyboardInterrupt escape
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract batchID (ClusterId) from the condor_submit output
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            job_id_match = re.search(
                r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            workspec.batchID = job_id_match.group(2)
            tmpLog.debug('batchID={0}'.format(workspec.batchID))
            # resolve $(ClusterId)-style macros in the log locations
            batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                              ClusterId=workspec.batchID)
            batch_stdout = _condor_macro_replace(
                batch_log_dict['batch_stdout'], ClusterId=workspec.batchID)
            batch_stderr = _condor_macro_replace(
                batch_log_dict['batch_stderr'], ClusterId=workspec.batchID)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            tmpRetVal = (True, '')
        else:
            errStr = 'batchID cannot be found'
            tmpLog.error(errStr)
            tmpRetVal = (False, errStr)
    else:
        # failed
        errStr = '{0} \n {1}'.format(stdOut, stdErr)
        tmpLog.error(errStr)
        tmpRetVal = (False, errStr)
    return tmpRetVal, workspec.get_changed_attributes()
def trigger_preparation(self, jobspec):
    """Fetch each input file of the job to its local access path.

    Supports http(s) downloads and 'docker://' images (saved via
    `docker save`). Files already present locally are skipped.

    :param jobspec: job whose inFiles are to be fetched.
    :return: (True, '') when all inputs are in place; (False, errMsg) when
             some file exceeded maxAttempts; (None, errMsg) for a
             temporary failure that should be retried.
    """
    # make logger
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_preparation')
    tmpLog.debug('start')
    # loop over all inputs
    allDone = True
    for tmpFileSpec in jobspec.inFiles:
        # local access path
        url = tmpFileSpec.url
        accPath = self.make_local_access_path(tmpFileSpec.scope,
                                              tmpFileSpec.lfn)
        # check if already exists
        if os.path.exists(accPath):
            continue
        # make directories if needed
        if not os.path.isdir(os.path.dirname(accPath)):
            os.makedirs(os.path.dirname(accPath))
        # get
        return_code = 1
        if url.startswith('http'):
            try:
                tmpLog.debug('getting via http from {0} to {1}'.format(
                    url, accPath))
                # NOTE: verify=False disables TLS certificate checking;
                # acceptable only for trusted internal endpoints
                res = requests.get(url, timeout=180, verify=False)
                if res.status_code == 200:
                    # FIX: res.content is bytes; must write in binary mode
                    # (text mode raised TypeError on Python 3)
                    with open(accPath, 'wb') as f:
                        f.write(res.content)
                    return_code = 0
                else:
                    errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(
                        url, res.status_code, res.text)
                    tmpLog.error(errMsg)
            except requests.exceptions.ReadTimeout:
                tmpLog.error(
                    'read timeout when getting data from {0}'.format(url))
            except Exception:
                core_utils.dump_error_message(tmpLog)
        elif url.startswith('docker'):
            args = ['docker', 'save', '-o', accPath, url.split('://')[-1]]
            try:
                tmpLog.debug('executing ' + ' '.join(args))
                # FIX: universal_newlines=True yields str output so the
                # replace('\n', ...) below works on Python 3
                p = subprocess.Popen(args,
                                     universal_newlines=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
                stdout, stderr = p.communicate()
                return_code = p.returncode
                if stdout is not None:
                    stdout = stdout.replace('\n', ' ')
                if stderr is not None:
                    stderr = stderr.replace('\n', ' ')
                tmpLog.debug("stdout: %s" % stdout)
                tmpLog.debug("stderr: %s" % stderr)
            except Exception:
                core_utils.dump_error_message(tmpLog)
        else:
            tmpLog.error('unsupported protocol in {0}'.format(url))
        if return_code != 0:
            allDone = False
    if allDone:
        tmpLog.debug('succeeded')
        return True, ''
    else:
        errMsg = 'failed'
        tmpLog.error(errMsg)
        # check attemptNr: give up permanently once any file hit the limit
        for tmpFileSpec in jobspec.inFiles:
            if tmpFileSpec.attemptNr >= self.maxAttempts:
                errMsg = 'gave up due to max attempts'
                tmpLog.error(errMsg)
                return (False, errMsg)
        return None, errMsg
def trigger_stage_out(self, jobspec):
    """Trigger stage-out of the job's output files via Rucio rules.

    Copies each unprocessed output file to the transfer source area,
    registers the files in (per file type) hidden Rucio datasets with a
    replication rule to the destination RSE, and records the dataset/rule
    in each fileSpec's fileAttributes.

    :param jobspec: job whose outFiles are to be staged out.
    :return: (True, '') on success; (False, errMsg) on error.
    """
    # make logger
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # loop over all files
    files = dict()
    transferIDs = dict()
    transferDatasets = dict()
    fileAttrs = jobspec.get_output_file_attributes()
    for fileSpec in jobspec.outFiles:
        # skip zipped files; their zip container is transferred instead
        if fileSpec.zipFileID is not None:
            continue
        # skip if already processed; just recover the dataset/rule maps
        if 'transferDataset' in fileSpec.fileAttributes:
            if fileSpec.fileType not in transferDatasets:
                transferDatasets[fileSpec.fileType] = \
                    fileSpec.fileAttributes['transferDataset']
            if fileSpec.fileType not in transferIDs:
                transferIDs[fileSpec.fileType] = \
                    fileSpec.fileAttributes['transferID']
            continue
        # set OS ID
        # FIX: was `== ['es_output', 'zip_output']` which compared a string
        # to a list and was always False, so objstoreID was never set
        if fileSpec.fileType in ['es_output', 'zip_output']:
            fileSpec.objstoreID = self.objStoreID_ES
        # make path where file is copied for transfer
        if fileSpec.fileType != 'zip_output':
            scope = fileAttrs[fileSpec.lfn]['scope']
        else:
            # use panda scope for zipped files
            scope = self.scopeForTmp
        srcPath = fileSpec.path
        dstPath = mover_utils.construct_file_path(self.srcBasePath, scope,
                                                  fileSpec.lfn)
        # remove any stale copy
        if os.path.exists(dstPath):
            os.remove(dstPath)
        # copy
        tmpLog.debug('copy src={srcPath} dst={dstPath}'.format(
            srcPath=srcPath, dstPath=dstPath))
        dstDir = os.path.dirname(dstPath)
        if not os.path.exists(dstDir):
            os.makedirs(dstDir)
        shutil.copyfile(srcPath, dstPath)
        # collect files per file type
        tmpFile = dict()
        tmpFile['scope'] = scope
        tmpFile['name'] = fileSpec.lfn
        tmpFile['bytes'] = fileSpec.fsize
        if fileSpec.fileType not in files:
            files[fileSpec.fileType] = []
        files[fileSpec.fileType].append(tmpFile)
    # loop over all file types to be registered to rucio
    rucioAPI = RucioClient()
    for fileType, fileList in iteritems(files):
        # set destination RSE
        if fileType in ['es_output', 'zip_output']:
            dstRSE = self.dstRSE_ES
        elif fileType == 'output':
            dstRSE = self.dstRSE_Out
        elif fileType == 'log':
            dstRSE = self.dstRSE_Log
        else:
            errMsg = 'unsupported file type {0}'.format(fileType)
            tmpLog.error(errMsg)
            return (False, errMsg)
        # skip if destination is None
        if dstRSE is None:
            continue
        # make datasets if missing
        if fileType not in transferDatasets:
            try:
                tmpScope = self.scopeForTmp
                tmpDS = 'panda.harvester_stage_out.{0}'.format(
                    str(uuid.uuid4()))
                # hidden dataset with a 30-day lifetime
                rucioAPI.add_dataset(tmpScope,
                                     tmpDS,
                                     meta={'hidden': True},
                                     lifetime=30 * 24 * 60 * 60,
                                     files=fileList,
                                     rse=self.srcRSE)
                transferDatasets[fileType] = tmpDS
                # add rule
                tmpDID = dict()
                tmpDID['scope'] = tmpScope
                tmpDID['name'] = tmpDS
                tmpRet = rucioAPI.add_replication_rule(
                    [tmpDID], 1, dstRSE, lifetime=30 * 24 * 60 * 60)
                tmpTransferIDs = tmpRet[0]
                transferIDs[fileType] = tmpTransferIDs
                tmpLog.debug('register dataset {0} with rule {1}'.format(
                    tmpDS, str(tmpTransferIDs)))
            except Exception:
                # was a bare except; narrowed to Exception
                errMsg = core_utils.dump_error_message(tmpLog)
                return (False, errMsg)
        else:
            # add files to existing dataset
            try:
                tmpScope = self.scopeForTmp
                tmpDS = transferDatasets[fileType]
                rucioAPI.add_files_to_dataset(tmpScope, tmpDS, fileList,
                                              self.srcRSE)
                tmpLog.debug('added files to {0}'.format(tmpDS))
            except Exception:
                # was a bare except; narrowed to Exception
                errMsg = core_utils.dump_error_message(tmpLog)
                return (False, errMsg)
    # set transfer datasets and rules
    for fileSpec in jobspec.outFiles:
        # skip zipped files
        if fileSpec.zipFileID is not None:
            continue
        # skip already done
        if fileSpec.status in ['finished', 'failed']:
            continue
        # skip if already processed
        if 'transferDataset' in fileSpec.fileAttributes:
            continue
        # no destination: nothing to transfer, mark finished
        if fileSpec.fileType not in transferDatasets:
            fileSpec.status = 'finished'
            continue
        # set dataset
        fileSpec.fileAttributes['transferDataset'] = transferDatasets[
            fileSpec.fileType]
        # set rule
        fileSpec.fileAttributes['transferID'] = transferIDs[
            fileSpec.fileType]
        # force update so the attribute change is persisted
        fileSpec.force_update('fileAttributes')
    # return
    tmpLog.debug('done')
    return (True, '')
def run(self):
    """Main loop of the sweeper agent.

    Each cycle: handles kill commands, kills workers flagged for killing
    (bulk kill_workers with per-worker kill_worker fallback), cleans up
    terminated workers past their retention timeouts, and deletes old job
    records. Loops until self.terminated(...) reports shutdown.
    """
    # identifier for this agent instance used in log prefixes
    lockedBy = 'sweeper-{0}'.format(self.get_pid())
    while True:
        sw_main = core_utils.get_stopwatch()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy),
                                   method_name='run')
        # get commands to kill
        sw_getcomm = core_utils.get_stopwatch()
        mainLog.debug('try to get commands')
        comStr = CommandSpec.COM_killWorkers
        commandSpecs = self.dbProxy.get_commands_for_receiver(
            'sweeper', comStr)
        mainLog.debug('got {0} {1} commands'.format(
            len(commandSpecs), comStr))
        for commandSpec in commandSpecs:
            # flag matching workers in the DB; actual kill happens below
            n_to_kill = self.dbProxy.kill_workers_by_query(
                commandSpec.params)
            mainLog.debug('will kill {0} workers with {1}'.format(
                n_to_kill, commandSpec.params))
        mainLog.debug('done handling commands'
                      + sw_getcomm.get_elapsed_time())
        # killing stage
        sw_kill = core_utils.get_stopwatch()
        mainLog.debug('try to get workers to kill')
        # get workers to kill
        workersToKill = self.dbProxy.get_workers_to_kill(
            harvester_config.sweeper.maxWorkers,
            harvester_config.sweeper.checkInterval)
        mainLog.debug('got {0} queues to kill workers'.format(
            len(workersToKill)))
        # loop over all workers, grouped by queue and configID
        sw = core_utils.get_stopwatch()
        for queueName, configIdWorkSpecList in iteritems(workersToKill):
            for configID, workspec_list in iteritems(configIdWorkSpecList):
                # get sweeper
                if not self.queueConfigMapper.has_queue(
                        queueName, configID):
                    mainLog.error(
                        'queue config for {0}/{1} not found'.format(
                            queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(
                    queueName, configID)
                try:
                    sweeperCore = self.pluginFactory.get_plugin(
                        queueConfig.sweeper)
                except Exception:
                    mainLog.error(
                        'failed to launch sweeper plugin for {0}/{1}'.
                        format(queueName, configID))
                    core_utils.dump_error_message(mainLog)
                    continue
                sw.reset()
                n_workers = len(workspec_list)
                try:
                    # try bulk method
                    tmpLog = self.make_logger(_logger,
                                              'id={0}'.format(lockedBy),
                                              method_name='run')
                    tmpLog.debug('start killing')
                    tmpList = sweeperCore.kill_workers(workspec_list)
                except AttributeError:
                    # fall back to single-worker method when the plugin
                    # doesn't implement kill_workers
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger,
                                                  'workerID={0}'.format(
                                                      workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start killing one worker')
                            tmpStat, tmpOut = sweeperCore.kill_worker(
                                workspec)
                            tmpLog.debug(
                                'done killing with status={0} diag={1}'.
                                format(tmpStat, tmpOut))
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                except Exception:
                    core_utils.dump_error_message(mainLog)
                else:
                    # bulk method succeeded: report per-worker results
                    n_killed = 0
                    for workspec, (tmpStat, tmpOut) in zip(workspec_list,
                                                           tmpList):
                        tmpLog.debug(
                            'done killing workerID={0} with status={1} diag={2}'
                            .format(workspec.workerID, tmpStat, tmpOut))
                        if tmpStat:
                            n_killed += 1
                    tmpLog.debug('killed {0}/{1} workers'.format(
                        n_killed, n_workers))
                mainLog.debug(
                    'done killing {0} workers'.format(n_workers)
                    + sw.get_elapsed_time())
        mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
        # cleanup stage
        sw_cleanup = core_utils.get_stopwatch()
        # timeout for missed (hours); optional config with default 24
        try:
            keepMissed = harvester_config.sweeper.keepMissed
        except Exception:
            keepMissed = 24
        try:
            keepPending = harvester_config.sweeper.keepPending
        except Exception:
            keepPending = 24
        # get workers for cleanup; per-status retention timeouts
        statusTimeoutMap = {
            'finished': harvester_config.sweeper.keepFinished,
            'failed': harvester_config.sweeper.keepFailed,
            'cancelled': harvester_config.sweeper.keepCancelled,
            'missed': keepMissed,
            'pending': keepPending
        }
        workersForCleanup = self.dbProxy.get_workers_for_cleanup(
            harvester_config.sweeper.maxWorkers, statusTimeoutMap)
        mainLog.debug('got {0} queues for workers cleanup'.format(
            len(workersForCleanup)))
        sw = core_utils.get_stopwatch()
        for queueName, configIdWorkSpecList in iteritems(
                workersForCleanup):
            for configID, workspec_list in iteritems(configIdWorkSpecList):
                # get sweeper
                if not self.queueConfigMapper.has_queue(
                        queueName, configID):
                    mainLog.error(
                        'queue config for {0}/{1} not found'.format(
                            queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(
                    queueName, configID)
                sweeperCore = self.pluginFactory.get_plugin(
                    queueConfig.sweeper)
                messenger = self.pluginFactory.get_plugin(
                    queueConfig.messenger)
                sw.reset()
                n_workers = len(workspec_list)
                # make sure workers to clean up are all terminated
                # (kill result is intentionally ignored; best-effort)
                mainLog.debug(
                    'making sure workers to clean up are all terminated')
                try:
                    # try bulk method
                    tmpList = sweeperCore.kill_workers(workspec_list)
                except AttributeError:
                    # fall back to single-worker method
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger,
                                                  'workerID={0}'.format(
                                                      workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpStat, tmpOut = sweeperCore.kill_worker(
                                workspec)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                except Exception:
                    core_utils.dump_error_message(mainLog)
                mainLog.debug(
                    'made sure workers to clean up are all terminated')
                # start cleanup
                for workspec in workspec_list:
                    tmpLog = self.make_logger(_logger,
                                              'workerID={0}'.format(
                                                  workspec.workerID),
                                              method_name='run')
                    try:
                        tmpLog.debug('start cleaning up one worker')
                        # sweep worker
                        tmpStat, tmpOut = sweeperCore.sweep_worker(
                            workspec)
                        tmpLog.debug(
                            'swept_worker with status={0} diag={1}'.format(
                                tmpStat, tmpOut))
                        tmpLog.debug('start messenger cleanup')
                        mc_tmpStat, mc_tmpOut = messenger.clean_up(
                            workspec)
                        tmpLog.debug(
                            'messenger cleaned up with status={0} diag={1}'
                            .format(mc_tmpStat, mc_tmpOut))
                        # drop the DB record only when the sweep succeeded
                        if tmpStat:
                            self.dbProxy.delete_worker(workspec.workerID)
                    except Exception:
                        core_utils.dump_error_message(tmpLog)
                mainLog.debug(
                    'done cleaning up {0} workers'.format(n_workers)
                    + sw.get_elapsed_time())
        mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
        # old-job-deletion stage
        sw_delete = core_utils.get_stopwatch()
        mainLog.debug('delete old jobs')
        # keep jobs slightly longer than the longest worker retention
        jobTimeout = max(statusTimeoutMap.values()) + 1
        self.dbProxy.delete_old_jobs(jobTimeout)
        # delete orphaned job info
        self.dbProxy.delete_orphaned_job_info()
        mainLog.debug('done deletion of old jobs'
                      + sw_delete.get_elapsed_time())
        # time the cycle
        mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
        # check if being terminated
        if self.terminated(harvester_config.sweeper.sleepTime):
            mainLog.debug('terminated')
            return
def define_num_workers(self, static_num_workers, site_name):
    """Define the number of new workers per queue for one site.

    Starts from the statically computed numbers and adjusts them with the
    cached PanDA queue status, per-queue throttlers, queue/worker limits,
    and slave-mode directives from PanDA.

    :param static_num_workers: dict {queueName: {'nQueue', 'nReady',
                               'nRunning', 'nNewWorkers', ...}}.
    :param site_name: site name, used only for log context.
    :return: the adjusted dict, or None on unexpected error.
    """
    tmpLog = core_utils.make_logger(_logger,
                                    'site={0}'.format(site_name),
                                    method_name='define_num_workers')
    tmpLog.debug('start')
    dyn_num_workers = copy.copy(static_num_workers)
    try:
        # get queue status from the cached panda_queues.json
        queueStat = self.dbProxy.get_cache("panda_queues.json", None)
        if queueStat is None:
            queueStat = dict()
        else:
            queueStat = queueStat.data
        # define num of new workers
        for queueName, tmpVal in iteritems(static_num_workers):
            # set 0 to num of new workers when the queue is disabled
            if queueName in queueStat and queueStat[queueName][
                    'status'] in ['offline']:
                dyn_num_workers[queueName]['nNewWorkers'] = 0
                retMsg = 'set nNewWorkers=0 since status={0}'.format(
                    queueStat[queueName]['status'])
                tmpLog.debug(retMsg)
                continue
            # get queue
            queueConfig = self.queueConfigMapper.get_queue(queueName)
            # get throttler (instantiated once per queue, then cached)
            if queueName not in self.throttlerMap:
                if hasattr(queueConfig, 'throttler'):
                    throttler = self.pluginFactory.get_plugin(
                        queueConfig.throttler)
                else:
                    throttler = None
                self.throttlerMap[queueName] = throttler
            # check throttler
            throttler = self.throttlerMap[queueName]
            if throttler is not None:
                toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                if toThrottle:
                    dyn_num_workers[queueName]['nNewWorkers'] = 0
                    retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(
                        throttler.__class__.__name__, tmpMsg)
                    tmpLog.debug(retMsg)
                    continue
            # check stats
            nQueue = tmpVal['nQueue']
            nReady = tmpVal['nReady']
            nRunning = tmpVal['nRunning']
            nQueueLimit = queueConfig.nQueueLimitWorker
            maxWorkers = queueConfig.maxWorkers
            if queueConfig.runMode == 'slave':
                # in slave mode PanDA dictates the number of new workers
                nNewWorkersDef = tmpVal['nNewWorkers']
                if nNewWorkersDef == 0:
                    dyn_num_workers[queueName]['nNewWorkers'] = 0
                    retMsg = 'set nNewWorkers=0 by panda in slave mode'
                    tmpLog.debug(retMsg)
                    continue
            else:
                nNewWorkersDef = None
            # define num of new workers based on static site config
            nNewWorkers = 0
            if nQueueLimit > 0 and nQueue >= nQueueLimit:
                # enough queued workers
                retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(
                    nQueue, nQueueLimit)
                tmpLog.debug(retMsg)
            elif maxWorkers > 0 and (nQueue + nReady +
                                     nRunning) >= maxWorkers:
                # enough workers in the system
                retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                    nQueue, nReady, nRunning)
                retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                tmpLog.debug(retMsg)
            else:
                # get max number of queued workers
                maxQueuedWorkers = 0
                if nQueueLimit > 0:
                    maxQueuedWorkers = nQueueLimit
                if maxQueuedWorkers == 0:
                    if nNewWorkersDef is not None:
                        # slave mode
                        maxQueuedWorkers = nNewWorkersDef + nQueue
                    else:
                        # use default value
                        maxQueuedWorkers = 1
                # new workers, clipped to the remaining capacity
                nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                if maxWorkers > 0:
                    nNewWorkers = min(
                        nNewWorkers,
                        max(maxWorkers - nQueue - nReady - nRunning, 0))
            # per-cycle cap
            if queueConfig.maxNewWorkersPerCycle > 0:
                nNewWorkers = min(nNewWorkers,
                                  queueConfig.maxNewWorkersPerCycle)
            dyn_num_workers[queueName]['nNewWorkers'] = nNewWorkers
        # dump
        tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
        return dyn_num_workers
    except Exception:
        # was a bare except; narrowed so SystemExit/KeyboardInterrupt escape
        core_utils.dump_error_message(tmpLog)
        return None
def ssh_make_one_zip(self, arg_dict):
    """Make a single output archive on a remote host over ssh.

    Remote analogue of make_one_zip: writes the file list to a remote
    tmp file, tars it remotely, renames under a DB lock, then collects
    size and adler32 remotely via the fileop script.

    :param arg_dict: dict with 'zipPath' and 'associatedFiles'.
    :return: (True, '', fileInfo) on success; (None, errMsg, {}) on
             retry-able failure; (False, errMsg) on unexpected exception.
    """
    try:
        zipPath = arg_dict['zipPath']
        lfn = os.path.basename(zipPath)
        self.zip_tmp_log.debug(
            '{0} start zipPath={1} with {2} files'.format(
                lfn, zipPath, len(arg_dict['associatedFiles'])))
        # file list, newline-escaped for transport inside the ssh command
        in_data = '\\n'.join(
            ['{0}'.format(path) for path in arg_dict['associatedFiles']])
        com0 = (
            'ssh '
            '-o StrictHostKeyChecking=no '
            '-i {sshkey} '
            '{userhost} '
            '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "'
        ).format(
            sshkey=self.sshkey,
            userhost=self.userhost,
            fileop_script=self.fileop_script,
            suffix='_tar-name.tmp',
            dir=os.path.dirname(zipPath),
            data=in_data,
        )
        # execute
        p0 = subprocess.Popen(com0,
                              shell=True,
                              close_fds=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        stdOut, stdErr = p0.communicate()
        retCode = p0.returncode
        if retCode != 0:
            msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(
                stdOut, stdErr)
            # FIX: was tmp_log (undefined name -> NameError on this path)
            self.zip_tmp_log.error(msgStr)
            return False, 'failed to zip with {0}'.format(msgStr)
        stdOut_str = stdOut if (isinstance(stdOut, str)
                                or stdOut is None) else stdOut.decode()
        tmpargfile_name = stdOut_str.strip('\n')
        del p0, stdOut, stdErr
        # tmp zip file names; unique suffix avoids clashes between workers
        tmpZipPath = zipPath + '.' + str(uuid.uuid4())
        com1 = (
            'ssh '
            '-o StrictHostKeyChecking=no '
            '-i {sshkey} '
            '{userhost} '
            '"test -f {tmpZipPath} || tar -cf {tmpZipPath} -T {arg_file} --transform \'s;.*/;;\' "'
        ).format(
            sshkey=self.sshkey,
            userhost=self.userhost,
            tmpZipPath=tmpZipPath,
            arg_file=tmpargfile_name,
        )
        # execute
        p1 = subprocess.Popen(com1,
                              shell=True,
                              close_fds=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        stdOut, stdErr = p1.communicate()
        retCode = p1.returncode
        if retCode != 0:
            msgStr = 'failed to make zip for {0} with {1}:{2}'.format(
                lfn, stdOut, stdErr)
            self.zip_tmp_log.error(msgStr)
            return None, msgStr, {}
        del p1, stdOut, stdErr
        # delete tmpargfile; failure here is non-fatal (just logged)
        com1a = ('ssh '
                 '-o StrictHostKeyChecking=no '
                 '-i {sshkey} '
                 '{userhost} '
                 '"{fileop_script} remove_file {file_path} "').format(
                     sshkey=self.sshkey,
                     userhost=self.userhost,
                     fileop_script=self.fileop_script,
                     file_path=tmpargfile_name,
                 )
        # execute
        p1a = subprocess.Popen(com1a,
                               shell=True,
                               close_fds=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        stdOut, stdErr = p1a.communicate()
        retCode = p1a.returncode
        if retCode != 0:
            msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(
                stdOut, stdErr)
            # FIX: was tmp_log (undefined name -> NameError on this path)
            self.zip_tmp_log.error(msgStr)
        del p1a, stdOut, stdErr
        gc.collect()
        # avoid overwriting: take a DB lock shared with other workers
        lockName = 'zip.lock.{0}'.format(lfn)
        lockInterval = 60
        tmpStat = False
        # get lock, retrying for up to lockInterval seconds
        for _ in range(lockInterval):
            tmpStat = self.dbInterface.get_object_lock(
                lockName, lock_interval=lockInterval)
            if tmpStat:
                break
            time.sleep(1)
        # failed to lock
        if not tmpStat:
            msgStr = 'failed to lock for {0}'.format(lfn)
            self.zip_tmp_log.error(msgStr)
            return None, msgStr, {}
        # rename to be zipPath unless another worker already made it
        com2 = ('ssh '
                '-o StrictHostKeyChecking=no '
                '-i {sshkey} '
                '{userhost} '
                '"test -f {zipPath} || mv {tmpZipPath} {zipPath}"').format(
                    sshkey=self.sshkey,
                    userhost=self.userhost,
                    zipPath=zipPath,
                    tmpZipPath=tmpZipPath,
                )
        p2 = subprocess.Popen(com2,
                              shell=True,
                              close_fds=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        p2.communicate()
        del p2
        gc.collect()
        # release lock
        self.dbInterface.release_object_lock(lockName)
        # make return
        fileInfo = dict()
        fileInfo['path'] = zipPath
        # get size
        com3 = ('ssh '
                '-o StrictHostKeyChecking=no '
                '-i {sshkey} '
                '{userhost} '
                '"stat -c %s {zipPath}"').format(
                    sshkey=self.sshkey,
                    userhost=self.userhost,
                    zipPath=zipPath,
                )
        p3 = subprocess.Popen(com3,
                              shell=True,
                              close_fds=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        stdOut, stdErr = p3.communicate()
        retCode = p3.returncode
        if retCode != 0:
            msgStr = 'failed to get file size of {0} with {1}:{2}'.format(
                zipPath, stdOut, stdErr)
            self.zip_tmp_log.error(msgStr)
            return None, msgStr, {}
        else:
            stdOut_str = stdOut if (isinstance(stdOut, str)
                                    or stdOut is None) else stdOut.decode()
            file_size = int(stdOut_str.strip('\n'))
            fileInfo['fsize'] = file_size
        del p3, stdOut, stdErr
        gc.collect()
        # get checksum
        com4 = ('ssh '
                '-o StrictHostKeyChecking=no '
                '-i {sshkey} '
                '{userhost} '
                '"{fileop_script} adler32 {zipPath}"').format(
                    sshkey=self.sshkey,
                    userhost=self.userhost,
                    fileop_script=self.fileop_script,
                    zipPath=zipPath,
                )
        p4 = subprocess.Popen(com4,
                              shell=True,
                              close_fds=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        stdOut, stdErr = p4.communicate()
        retCode = p4.returncode
        if retCode != 0:
            msgStr = 'failed to get file adler32 of {0} with {1}:{2}'.format(
                zipPath, stdOut, stdErr)
            self.zip_tmp_log.error(msgStr)
            return None, msgStr, {}
        else:
            stdOut_str = stdOut if (isinstance(stdOut, str)
                                    or stdOut is None) else stdOut.decode()
            file_chksum = stdOut_str.strip('\n')
            fileInfo['chksum'] = file_chksum
        del p4, stdOut, stdErr
        gc.collect()
    except Exception:
        errMsg = core_utils.dump_error_message(self.zip_tmp_log)
        return False, 'failed to zip with {0}'.format(errMsg)
    self.zip_tmp_log.debug('{0} done'.format(lfn))
    return True, '', fileInfo
def _check_assfile_existence(fileSpec):
    """Check which of fileSpec's associated files exist on the remote host.

    Stages the list of associated-file paths to a remote temp file over ssh,
    runs a remote test -f on each path, and collects the paths that exist.

    :param fileSpec: file spec whose ``associatedFiles`` are to be checked
    :return: set of remote paths that exist
        NOTE(review): on failure to create the remote temp file this returns a
        ``(False, message)`` tuple instead of a set — inconsistent return type;
        confirm callers handle both shapes.

    Uses ``self``, ``tmp_log`` from the enclosing scope (closure variables).
    """
    # one path per line; literal backslash-n separator is interpreted by the
    # remote write_tmpfile helper, not by Python
    in_data = '\\n'.join(['{0}'.format(assFileSpec.path) for assFileSpec in fileSpec.associatedFiles])
    # command 1: write the path list into a remote temp file next to the files
    com1 = ('ssh '
            '-o StrictHostKeyChecking=no '
            '-i {sshkey} '
            '{userhost} '
            '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "').format(
                sshkey=self.sshkey,
                userhost=self.userhost,
                fileop_script=self.fileop_script,
                suffix='_check-exist.tmp',
                dir=os.path.dirname(next(iter(fileSpec.associatedFiles)).path),
                data=in_data,
            )
    # execute
    p1 = subprocess.Popen(com1, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdOut, stdErr = p1.communicate()
    retCode = p1.returncode
    if retCode != 0:
        msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr)
        tmp_log.error(msgStr)
        return False, 'failed to zip with {0}'.format(msgStr)
    # stdout of write_tmpfile is the remote temp file's path (bytes on py3)
    stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode()
    tmpargfile_name = stdOut_str.strip('\n')
    del p1, stdOut, stdErr
    # record set
    existence_set = set()
    # command 2: remote test -f on every listed path; echoes one T/F per line
    com2 = ('ssh '
            '-o StrictHostKeyChecking=no '
            '-i {sshkey} '
            '{userhost} '
            '"cat {arg_file} | xargs -I%% sh -c \' test -f %% && echo T || echo F \' " ').format(
                sshkey=self.sshkey,
                userhost=self.userhost,
                arg_file=tmpargfile_name,
            )
    # execute
    p2 = subprocess.Popen(com2, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdOut, stdErr = p2.communicate()
    retCode = p2.returncode
    if retCode != 0:
        msgStr = 'failed to existence of associate files with {0}:{1}'.format(stdOut, stdErr)
        tmp_log.error(msgStr)
    else:
        try:
            stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode()
            ret_list = stdOut_str.strip('\n').split('\n')
            # pair T/F answers with the files in order; relies on the remote
            # command echoing exactly one line per input path
            if len(fileSpec.associatedFiles) == len(ret_list):
                for (assFileSpec, retVal) in zip(fileSpec.associatedFiles, ret_list):
                    if retVal == 'T':
                        existence_set.add(assFileSpec.path)
            else:
                # count mismatch: answers cannot be attributed; leave set empty
                msgStr = 'returned number of files inconsistent! Skipped...'
                tmp_log.error(msgStr)
        except Exception:
            core_utils.dump_error_message(tmp_log)
    del p2, stdOut, stdErr, com2
    # command 3: best-effort cleanup of the remote temp file (errors only logged)
    com3 = ('ssh '
            '-o StrictHostKeyChecking=no '
            '-i {sshkey} '
            '{userhost} '
            '"{fileop_script} remove_file {file_path} "').format(
                sshkey=self.sshkey,
                userhost=self.userhost,
                fileop_script=self.fileop_script,
                file_path=tmpargfile_name,
            )
    # execute
    p3 = subprocess.Popen(com3, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdOut, stdErr = p3.communicate()
    retCode = p3.returncode
    if retCode != 0:
        msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr)
        tmp_log.error(msgStr)
    del p3, stdOut, stdErr
    gc.collect()
    return existence_set
def update_jobs(self, jobspec_list, id):
    """Propagate job states and event updates to the PanDA server.

    First pushes event-range updates for each job (marking final-status
    events 'done' on success), then posts job attributes to the
    updateJobsInBulk API in chunks of 100.

    :param jobspec_list: list of JobSpec objects to report
    :param id: caller/thread identifier used only for log tagging
        (named ``id`` for interface compatibility although it shadows
        the builtin)
    :return: list of per-job result dicts, each with at least
        'StatusCode' (999 on failure) and, on failure, 'ErrorDiag'
    """
    sw = core_utils.get_stopwatch()
    tmpLogG = self.make_logger('id={0}'.format(id), method_name='update_jobs')
    tmpLogG.debug('update {0} jobs'.format(len(jobspec_list)))
    retList = []
    # update events
    for jobSpec in jobspec_list:
        eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000)
        if eventRanges != []:
            tmpLogG.debug('update {0} events for PandaID={1}'.format(len(eventSpecs), jobSpec.PandaID))
            tmpRet = self.update_event_ranges(eventRanges, tmpLogG)
            if tmpRet['StatusCode'] == 0:
                for eventSpec, retVal in zip(eventSpecs, tmpRet['Returns']):
                    # a definitive True/False answer from the server closes
                    # out events that are already in a final state
                    if retVal in [True, False] and eventSpec.is_final_status():
                        eventSpec.subStatus = 'done'
    # update jobs in bulk, nLookup jobs per request
    nLookup = 100
    iLookup = 0
    while iLookup < len(jobspec_list):
        dataList = []
        jobSpecSubList = jobspec_list[iLookup:iLookup+nLookup]
        for jobSpec in jobSpecSubList:
            data = jobSpec.get_job_attributes_for_panda()
            data['jobId'] = jobSpec.PandaID
            data['siteName'] = jobSpec.computingSite
            data['state'] = jobSpec.get_status()
            data['attemptNr'] = jobSpec.attemptNr
            data['jobSubStatus'] = jobSpec.subStatus
            # change cancelled to failed to be accepted by panda server
            if data['state'] in ['cancelled', 'missed']:
                if jobSpec.is_pilot_closed():
                    data['jobSubStatus'] = 'pilot_closed'
                else:
                    data['jobSubStatus'] = data['state']
                data['state'] = 'failed'
            # fill timing/core info only when not already reported by the job
            if jobSpec.startTime is not None and 'startTime' not in data:
                data['startTime'] = jobSpec.startTime.strftime('%Y-%m-%d %H:%M:%S')
            if jobSpec.endTime is not None and 'endTime' not in data:
                data['endTime'] = jobSpec.endTime.strftime('%Y-%m-%d %H:%M:%S')
            if 'coreCount' not in data and jobSpec.nCore is not None:
                data['coreCount'] = jobSpec.nCore
            # attach metadata/xml only for the final report of a final state
            if jobSpec.is_final_status() and jobSpec.status == jobSpec.get_status():
                if jobSpec.metaData is not None:
                    data['metaData'] = json.dumps(jobSpec.metaData)
                if jobSpec.outputFilesToReport is not None:
                    data['xml'] = jobSpec.outputFilesToReport
            dataList.append(data)
        harvester_id = harvester_config.master.harvester_id
        tmpData = {'jobList': json.dumps(dataList), 'harvester_id': harvester_id}
        tmpStat, tmpRes = self.post_ssl('updateJobsInBulk', tmpData)
        retMaps = None
        errStr = ''
        if tmpStat is False:
            errStr = core_utils.dump_error_message(tmpLogG, tmpRes)
        else:
            try:
                tmpStat, retMaps = tmpRes.json()
                if tmpStat is False:
                    tmpLogG.error('updateJobsInBulk failed with {0}'.format(retMaps))
                    retMaps = None
            except Exception:
                errStr = core_utils.dump_error_message(tmpLogG)
        if retMaps is None:
            # build a local error reply per job. Mirror the server's shape,
            # {'content': <json string>}, so the json.loads below succeeds and
            # the real diagnostic in errStr is propagated. (Previously the
            # whole dict was dumped to a plain string, making retMap['content']
            # raise and errStr get silently replaced by a generic message.)
            retMap = {'content': json.dumps({'StatusCode': 999, 'ErrorDiag': errStr})}
            retMaps = [retMap] * len(jobSpecSubList)
        for jobSpec, retMap, data in zip(jobSpecSubList, retMaps, dataList):
            tmpLog = self.make_logger('id={0} PandaID={1}'.format(id, jobSpec.PandaID),
                                      method_name='update_jobs')
            try:
                retMap = json.loads(retMap['content'])
            except Exception:
                errStr = 'failed to load json'
                retMap = {}
                retMap['StatusCode'] = 999
                retMap['ErrorDiag'] = errStr
            tmpLog.debug('data={0}'.format(str(data)))
            tmpLog.debug('done with {0}'.format(str(retMap)))
            retList.append(retMap)
        iLookup += nLookup
    tmpLogG.debug('done' + sw.get_elapsed_time())
    return retList
def do_POST(self):
    """Handle a POST request from a worker via the shared-file messenger.

    Validates the posted JSON form (methodName, workerID, data), looks up
    the worker, maps the method name to a file under the worker's access
    point, then either writes the posted data to that file ('w' methods)
    or reads and returns the file's content ('r' methods). The reply text
    is handed to self.do_postprocessing at the end.
    """
    # logger (created lazily on first request)
    if self.tmpLog is None:
        self.tmpLog = core_utils.make_logger(_logger)
    toSkip = False
    form = None
    methodName = None
    dataStr = None  # NOTE(review): never assigned below; always logged as None
    message = ''
    # parse the form data posted
    try:
        form = self.get_form()
    except Exception:
        message = 'corrupted json'
        toSkip = True
    # check that all required parameters are present
    if not toSkip:
        toSkip = True
        # method is not set
        if 'methodName' not in form:
            message = 'methodName is not given'
            self.send_response(400)
        elif 'workerID' not in form:
            message = 'workerID is not given'
            self.send_response(400)
        elif 'data' not in form:
            message = 'data is not given'
            self.send_response(400)
        else:
            toSkip = False
    # get worker
    if not toSkip:
        try:
            workerID = form['workerID']
            workSpec = self.dbProxy.get_worker_with_id(workerID)
            if workSpec is None:
                message = 'workerID={0} not found in DB'.format(workerID)
                self.send_response(400)
            else:
                # chose file and operation for each action:
                # 'w' = store posted data for harvester to pick up,
                # 'r' = return a file harvester has prepared for the worker
                methodName = form['methodName']
                opType = None
                filePath = ''
                if methodName == 'requestJobs':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jsonJobRequestFileName)
                    opType = 'w'
                elif methodName == 'getJobs':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jobSpecFileName)
                    opType = 'r'
                elif methodName == 'requestEventRanges':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jsonEventsRequestFileName)
                    opType = 'w'
                elif methodName == 'getEventRanges':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jsonEventsFeedFileName)
                    opType = 'r'
                elif methodName == 'updateJobs':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jsonAttrsFileName)
                    opType = 'w'
                elif methodName == 'uploadJobReport':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jsonJobReport)
                    opType = 'w'
                elif methodName == 'uploadEventOutputDump':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.jsonOutputsFileName)
                    opType = 'w'
                elif methodName == 'setPandaIDs':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.pandaIDsFile)
                    opType = 'w'
                elif methodName == 'killWorker':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.killWorkerFile)
                    opType = 'w'
                elif methodName == 'heartbeat':
                    filePath = os.path.join(workSpec.get_access_point(),
                                            shared_file_messenger.heartbeatFile)
                    opType = 'w'
                else:
                    self.send_response(501)
                    message = 'method not implemented'
                    toSkip = True
                # take action
                if not toSkip:
                    # write actions
                    if opType == 'w':
                        # check if file exists. Methods such as heartbeat however need to overwrite the file
                        if os.path.exists(filePath) and methodName not in ['heartbeat']:
                            message = 'previous request is not yet processed'
                            self.send_response(503)
                        else:
                            with open(filePath, 'w') as fileHandle:
                                json.dump(form['data'], fileHandle)
                                message = 'OK'
                                self.send_response(200)
                    else:
                        # read actions
                        if os.path.exists(filePath):
                            with open(filePath) as fileHandle:
                                try:
                                    _message = json.load(fileHandle)
                                    message = json.dumps(_message)
                                    # NOTE(review): send_header is called before
                                    # send_response here; in stock
                                    # BaseHTTPRequestHandler headers must follow
                                    # send_response — presumably do_postprocessing
                                    # or a subclass compensates; confirm
                                    self.send_header('Content-Type', 'application/json')
                                except JSONDecodeError:
                                    # fall back to returning the raw file text
                                    _f_qs = open(filePath).read()
                                    # _message = dict(parse_qsl(_f_qs, keep_blank_values=True))
                                    message = _f_qs
                                    self.send_header('Content-Type', 'text/plain')
                                self.send_response(200)
                        else:
                            message = 'previous request is not yet processed'
                            self.send_response(503)
        except Exception:
            self.send_response(500)
            message = core_utils.dump_error_message(_logger)
    if harvester_config.frontend.verbose:
        self.tmpLog.debug('ip={3} - method={0} json={1} msg={2}'.format(
            methodName, dataStr, message, self.client_address[0]))
    # set the response
    self.do_postprocessing(message)
    return
def check_stage_out_status(self, jobspec):
    """Check Globus stage-out, then drive the second-hop Rucio transfer.

    Delegates the first hop to GlobusBulkStager.check_stage_out_status;
    for each output transfer group it then creates (or polls) a Rucio
    dataset+replication rule from the source RSE to the nucleus RSE and
    records the group status (hopping/hopped/failed) via dbInterface.

    :param jobspec: job spec whose output files are being staged out
    :return: (status, message) where status is True on full success,
        False on permanent failure, None on temporary error/retry
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='check_stage_out_status')
    tmpLog.debug('executing base check_stage_out_status')
    tmpStat, tmpMsg = GlobusBulkStager.check_stage_out_status(self, jobspec)
    tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg))
    if tmpStat is not True:
        return tmpStat, tmpMsg
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        return tmpStat, tmpMsg
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(
        jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if srcRSE is set.
    # Initialize to None first: previously srcRSE was left unbound when the
    # key was missing, raising NameError at the debug/validation below.
    srcRSE = None
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    # get destination endpoint from the nucleus of the job
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis
              if agis[x]["atlas_site"] == nucleus][0]
    # log source and destination RSEs (duplicate debug line removed)
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # test that srcRSE and dstRSE are defined
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Destination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    # NOTE(review): assumes self.Yodajob is initialized elsewhere (__init__)
    # when 'jobtype' is absent from the config — confirm
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(
            jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # create the Rucio Client
    try:
        rucioAPI = RucioClient()
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        # (previously this message formatted datasetScope/datasetName, which
        # are not defined yet at this point -> NameError inside the handler)
        tmpStat = None
        tmpMsg = 'failed to create Rucio client'
        return tmpStat, tmpMsg
    # loop over all transfers
    tmpStat = True
    tmpMsg = ''
    for transferID in groups:
        if transferID is None:
            continue
        datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, transferID)
        datasetScope = 'transient'
        # lock the group so only one thread handles this transfer
        have_db_lock = self.dbInterface.get_object_lock(transferID, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(transferID)
            tmpLog.debug(msgStr)
            return None, msgStr
        # get transfer status
        groupStatus = self.dbInterface.get_file_group_status(transferID)
        if 'hopped' in groupStatus:
            # already succeeded
            pass
        elif 'failed' in groupStatus:
            # transfer failure
            tmpStat = False
            tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName)
        elif 'hopping' in groupStatus:
            # check rucio rule
            ruleStatus = 'FAILED'
            try:
                tmpLog.debug('check state for {0}:{1}'.format(datasetScope, datasetName))
                for ruleInfo in rucioAPI.list_did_rules(datasetScope, datasetName):
                    if ruleInfo['rse_expression'] != dstRSE:
                        continue
                    ruleStatus = ruleInfo['state']
                    tmpLog.debug('got state={0}'.format(ruleStatus))
                    if ruleStatus == 'OK':
                        break
            except DataIdentifierNotFound:
                tmpLog.error('dataset not found')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                ruleStatus = None
            if ruleStatus in ['FAILED', 'CANCELED']:
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(
                    datasetScope, datasetName, ruleStatus)
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'failed')
            elif ruleStatus == 'OK':
                # update successful file group status
                self.dbInterface.update_file_group_status(transferID, 'hopped')
            else:
                # replicating or temporary error
                tmpStat = None
                tmpMsg = 'replicating or temporary error for {0}:{1}'.format(
                    datasetScope, datasetName)
        else:
            # make rucio rule
            fileSpecs = self.dbInterface.get_files_with_group_id(transferID)
            fileList = []
            for fileSpec in fileSpecs:
                tmpFile = dict()
                tmpFile['scope'] = datasetScope
                tmpFile['name'] = fileSpec.lfn
                tmpFile['bytes'] = fileSpec.fsize
                tmpFile['adler32'] = fileSpec.chksum
                if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                    tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                else:
                    tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                fileList.append(tmpFile)
                # derive source RSE from the file's objstoreID if not configured
                if srcRSE is None and fileSpec.objstoreID is not None:
                    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                    srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
            try:
                # register dataset
                tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'.format(
                    datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
                try:
                    rucioAPI.add_dataset(datasetScope, datasetName,
                                         meta={'hidden': True},
                                         lifetime=30 * 24 * 60 * 60,
                                         rse=srcRSE)
                except DataIdentifierAlreadyExists:
                    # ignore even if the dataset already exists
                    pass
                except Exception:
                    errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(
                        datasetScope, datasetName, srcRSE)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.error(errMsg)
                    # re-raise into the outer handler which sets a temporary
                    # error and still releases the lock (dead code after the
                    # raise removed)
                    raise
                # add files to dataset, at most maxfiles per call
                numfiles = len(fileList)
                maxfiles = 500
                # integer division: '/' yields a float on Python 3, which
                # previously broke range(numslices)
                numslices = numfiles // maxfiles
                if (numfiles % maxfiles) > 0:
                    numslices = numslices + 1
                start = 0
                for i in range(numslices):
                    try:
                        stop = start + maxfiles
                        if stop > numfiles:
                            stop = numfiles
                        rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                         'name': datasetName,
                                                         'dids': fileList[start:stop],
                                                         'rse': srcRSE}],
                                                       ignore_duplicate=True)
                        start = stop
                    except FileAlreadyExists:
                        # ignore if files already exist
                        pass
                    except Exception:
                        errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(
                            datasetScope, datasetName, srcRSE, fileList)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.error(errMsg)
                        # release the group lock before bailing out
                        # (previously leaked on this path)
                        self.dbInterface.release_object_lock(transferID)
                        return None, errMsg
                # add rule
                try:
                    tmpDID = dict()
                    tmpDID['scope'] = datasetScope
                    tmpDID['name'] = datasetName
                    tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                           lifetime=30 * 24 * 60 * 60)
                    ruleIDs = tmpRet[0]
                    tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(
                        datasetScope, datasetName, str(ruleIDs)))
                except DuplicateRule:
                    # ignore duplicated rule
                    tmpLog.debug('rule is already available')
                except Exception:
                    errMsg = 'Error creating rule for dataset {0}:{1}'.format(
                        datasetScope, datasetName)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.debug(errMsg)
                    # release the group lock before bailing out
                    # (previously leaked on this path)
                    self.dbInterface.release_object_lock(transferID)
                    return None, errMsg
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'hopping')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                # treat as a temporary error
                tmpStat = None
                tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        # release lock
        self.dbInterface.release_object_lock(transferID)
        # escape if already failed
        if tmpStat is False:
            break
    # all done
    if tmpStat is True:
        self.set_FileSpec_status(jobspec, 'finished')
    tmpLog.debug('done with {0} : {1}'.format(tmpStat, tmpMsg))
    return tmpStat, tmpMsg
def run(self):
    """Main loop of the sweeper agent.

    Each cycle: (1) kill workers flagged for termination via each queue's
    sweeper plugin, (2) sweep (clean up) workers whose final status has
    outlived its retention period and delete them from the DB, (3) purge
    old jobs. Loops until terminated() signals shutdown.
    """
    # unique id for this agent instance, used for log tagging
    lockedBy = 'sweeper-{0}'.format(self.get_pid())
    while True:
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        mainLog.debug('try to get workers to kill')
        # get workers to kill, grouped by queue and configID
        workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                         harvester_config.sweeper.checkInterval)
        mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
        # loop over all workers
        for queueName, configIdWorkSpecs in iteritems(workersToKill):
            for configID, workSpecs in iteritems(configIdWorkSpecs):
                # get sweeper plugin for this queue
                if not self.queueConfigMapper.has_queue(queueName, configID):
                    mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                              method_name='run')
                    # one worker's failure must not stop the sweep of the rest
                    try:
                        tmpLog.debug('start killing')
                        tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                        tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                    except Exception:
                        core_utils.dump_error_message(tmpLog)
        mainLog.debug('done kill')
        # timeout for missed workers; fall back to 24 (hours, presumably --
        # matches the other keep* config values) when not configured
        try:
            keepMissed = harvester_config.sweeper.keepMissed
        except Exception:
            keepMissed = 24
        keepPending = 24
        # retention period per final worker status
        statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                            'failed': harvester_config.sweeper.keepFailed,
                            'cancelled': harvester_config.sweeper.keepCancelled,
                            'missed': keepMissed,
                            'pending': keepPending
                            }
        # get workers for cleanup
        workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                 statusTimeoutMap)
        mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
        for queueName, configIdWorkSpecs in iteritems(workersForCleanup):
            for configID, workSpecs in iteritems(configIdWorkSpecs):
                # get sweeper plugin for this queue
                if not self.queueConfigMapper.has_queue(queueName, configID):
                    mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                    continue
                queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                              method_name='run')
                    try:
                        tmpLog.debug('start cleanup')
                        tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                        tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                        if tmpStat:
                            # delete from DB only after the plugin sweep succeeded
                            self.dbProxy.delete_worker(workSpec.workerID)
                    except Exception:
                        core_utils.dump_error_message(tmpLog)
        # delete old jobs; keep them at least as long as the longest worker retention
        mainLog.debug('delete old jobs')
        jobTimeout = max(statusTimeoutMap.values()) + 1
        self.dbProxy.delete_old_jobs(jobTimeout)
        mainLog.debug('done cleanup')
        # check if being terminated (also sleeps between cycles)
        if self.terminated(harvester_config.sweeper.sleepTime):
            mainLog.debug('terminated')
            return