def process(self, file_name, to_delete=False, test_mode=False, get_log=False, dump_workflow=False):
    """Process one workflow request file.

    The request is read from ``file_name`` (JSON), enriched with the cleaned
    user name and optional base platform, written to a temporary JSON file
    and handed to this module executed as a script in a separate process.
    The last line of that process's output is taken as the path of a JSON
    result file holding ``[is_OK, is_fatal, request_id, dump_str]``.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost; notification is assumed to happen only when the
    request file is being deleted - confirm against the original layout.

    :param file_name: path of the JSON request file
    :param to_delete: delete the request file regardless of the outcome
    :param test_mode: forwarded to the subprocess; also suppresses deletion
        (unless ``to_delete``) and notification
    :param get_log: when true, return a dict with the status and the log
    :param dump_workflow: forwarded to the subprocess
    :return: ``{'status': bool, 'log': str}`` when ``get_log`` is true,
        otherwise ``None``
    """
    # local import so that the block does not rely on a file-level import
    import shlex

    is_fatal = False
    is_OK = True
    request_id = None
    dump_str = None
    tmpLog = None
    try:
        with open(file_name) as f:
            ops = json.load(f)
        user_name = clean_user_id(ops["userName"])
        base_platform = ops['data'].get('base_platform')
        for task_type in ops['data']['taskParams']:
            ops['data']['taskParams'][task_type]['userName'] = user_name
            if base_platform:
                ops['data']['taskParams'][task_type]['basePlatform'] = base_platform
        log_token = '< id="{}" test={} outDS={} >'.format(user_name, test_mode, ops['data']['outDS'])
        tmpLog = LogWrapper(self.log, log_token)
        tmpLog.info('start {}'.format(file_name))
        sandbox_url = os.path.join(ops['data']['sourceURL'], 'cache', ops['data']['sandbox'])
        # IO through json files
        ops_file = tempfile.NamedTemporaryFile(delete=False, mode='w')
        json.dump(ops, ops_file)
        ops_file.close()
        # execute main in another process to avoid chdir mess
        # every argument is shell-escaped since most of them originate from
        # the request file
        command = "python {}".format(' '.join(
            shlex.quote(str(arg)) for arg in (
                __file__, sandbox_url, log_token, dump_workflow,
                ops_file.name, user_name, test_mode)))
        tmp_stat, tmp_out = commands_get_status_output(command)
        if tmp_stat:
            is_OK = False
            tmpLog.error('main execution failed with {}:{}'.format(tmp_stat, tmp_out))
        else:
            # the last output line is the path of the JSON result file
            result_path = tmp_out.split('\n')[-1]
            with open(result_path) as tmp_out_file:
                is_OK, is_fatal, request_id, dump_str = json.load(tmp_out_file)
            try:
                # remove the result file itself, not the whole captured output
                os.remove(result_path)
            except OSError:
                pass
        if not get_log:
            if is_OK:
                tmpLog.info('is_OK={} request_id={}'.format(is_OK, request_id))
            else:
                tmpLog.info('is_OK={} is_fatal={} request_id={}'.format(is_OK, is_fatal, request_id))
        if to_delete or (not test_mode and (is_OK or is_fatal)):
            # dump_str is still None when the subprocess itself failed
            dump_str = tmpLog.dumpToString() + (dump_str or '')
            tmpLog.debug('delete {}'.format(file_name))
            try:
                os.remove(file_name)
            except OSError:
                pass
            # send notification
            if not test_mode and self.taskBuffer is not None:
                toAdder = self.taskBuffer.getEmailAddr(user_name)
                if toAdder is None or toAdder.startswith('notsend'):
                    tmpLog.debug('skip to send notification since suppressed')
                else:
                    # message
                    if is_OK:
                        mailSubject = "PANDA Notification for Workflow {}".format(ops['data']['outDS'])
                        mailBody = "Hello,\n\nWorkflow:{} has been accepted with RequestID:{}\n\n".\
                            format(ops['data']['outDS'], request_id)
                    else:
                        mailSubject = "PANDA WARNING for Workflow={}".format(ops['data']['outDS'])
                        mailBody = "Hello,\n\nWorkflow {} was not accepted\n\n".\
                            format(ops['data']['outDS'])
                        mailBody += "Reason : %s\n" % dump_str
                    # send
                    tmpSM = MailUtils().send(toAdder, mailSubject, mailBody)
                    tmpLog.debug('sent message with {}'.format(tmpSM))
    except Exception as e:
        is_OK = False
        if tmpLog is None:
            # the failure happened before the per-request logger was created
            tmpLog = LogWrapper(self.log, '< file={} >'.format(file_name))
        tmpLog.error("failed to run with {} {}".format(str(e), traceback.format_exc()))
    if get_log:
        ret_val = {'status': is_OK}
        if is_OK or dump_str is not None:
            ret_val['log'] = dump_str
        else:
            ret_val['log'] = tmpLog.dumpToString()
        return ret_val
class EventPicker:
    """Handle one event-picking request described by an "evp" file.

    The evp file is a list of ``key=value`` lines.  ``run`` locks the file,
    parses it, converts the run/event list to datasets/files, registers a
    dataset container and, unless DaTRI is skipped, chooses destination
    sites through brokerage and registers dataset locations via rucioAPI.
    """

    # constructor
    def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
        """Store the collaborators and reset the per-request state.

        :param taskBuffer: task buffer used for user/task lookups and updates
        :param siteMapper: site mapper passed to the distributer and brokerage
        :param evpFileName: path of the evp file to process
        :param ignoreError: when true, a failure keeps the evp file, sends no
            email and does not move the task to 'tobroken'
        """
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.ignoreError = ignoreError
        self.evpFileName = evpFileName
        self.token = datetime.datetime.utcnow().isoformat(' ')
        # logger
        self.logger = LogWrapper(_logger, self.token)
        self.pd2p = DynDataDistributer.DynDataDistributer([], self.taskBuffer, self.siteMapper,
                                                          token=' ', logger=self.logger)
        self.userDatasetName = ''
        self.creationTime = ''
        self.params = ''
        self.lockedBy = ''
        self.evpFile = None
        self.userTaskName = ''
        # message buffer
        self.msgBuffer = []
        self.lineLimit = 100
        # JEDI
        self.jediTaskID = None
        self.prodSourceLabel = None
        self.job_label = None

    # main
    def run(self):
        """Process the evp file.

        :return: True on success or when the file is locked by someone else,
            False when the request failed (``endWithError`` was called)
        """
        try:
            self.putLog('start %s' % self.evpFileName)
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
            except Exception:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList = []
            eventPickDataType = ''
            eventPickStreamName = ''
            eventPickDS = []
            eventPickAmiTag = ''
            eventPickNumSites = 1
            inputFileList = []
            tagDsList = []
            tagQuery = ''
            tagStreamRef = ''
            skipDaTRI = False
            runEvtGuidMap = {}
            ei_api = ''
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
                # check format
                if tmpMatch is None:
                    continue
                tmpKey, tmpVal = tmpMatch.groups()
                if tmpKey == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpVal.split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpKey == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpVal
                elif tmpKey == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpVal
                elif tmpKey == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpVal.split(',')
                elif tmpKey == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpVal
                elif tmpKey == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpVal)
                    except ValueError:
                        # keep the default when the value is not an integer
                        pass
                elif tmpKey == 'userName':
                    # user name
                    self.userDN = tmpVal
                    self.putLog("user=%s" % self.userDN)
                elif tmpKey == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpVal
                elif tmpKey == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpVal
                elif tmpKey == 'lockedBy':
                    # client name
                    self.lockedBy = tmpVal
                elif tmpKey == 'creationTime':
                    # creation time
                    self.creationTime = tmpVal
                elif tmpKey == 'params':
                    # parameters
                    self.params = tmpVal
                elif tmpKey == 'ei_api':
                    # ei api parameter for MC
                    ei_api = tmpVal
                elif tmpKey == 'inputFileList':
                    # input file list
                    inputFileList = tmpVal.split(',')
                    if '' in inputFileList:
                        inputFileList.remove('')
                elif tmpKey == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpVal.split(',')
                elif tmpKey == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpVal
                elif tmpKey == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpVal
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpKey == 'runEvtGuidMap':
                    # GUIDs
                    # SECURITY(review): eval() executes whatever expression the
                    # evp file contains; if evp files can carry untrusted
                    # content this should become ast.literal_eval - confirm
                    # the value is always a plain literal before switching
                    try:
                        runEvtGuidMap = eval(tmpVal)
                    except Exception:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                if tmpMatch is not None:
                    self.userTaskName = tmpMatch.group(2)
                    if not self.userTaskName.endswith('/'):
                        self.userTaskName += '/'
            # suppress DaTRI
            if '--eventPickSkipDaTRI' in self.params:
                skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(
                compactDN, self.userTaskName)
            # get prodSourceLabel
            self.prodSourceLabel, self.job_label = self.taskBuffer.getProdSourceLabelwithTaskID(
                self.jediTaskID)
            # convert run/event list to dataset/file list
            tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(
                runEvtList, eventPickDataType, eventPickStreamName,
                eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap,
                ei_api)
            if not tmpRet:
                if 'isFatal' in locationMap and locationMap['isFatal'] is True:
                    # a fatal conversion error must not be ignored
                    self.ignoreError = False
                self.endWithError(
                    'Failed to convert the run/event list to a dataset/file list'
                )
                return False
            # use only files in the list
            if inputFileList:
                allFiles = [tmpFile for tmpFile in allFiles
                            if tmpFile['lfn'] in inputFileList]
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(
                self.userDatasetName,
                allFiles,
                locationMap,
                nSites=eventPickNumSites,
                owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' %
                                  self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet, candidateMaps = self.pd2p.getCandidates(
                    self.userDatasetName,
                    self.prodSourceLabel,
                    self.job_label,
                    checkUsedFile=False,
                    useHidden=True)
                if not tmpRet:
                    self.endWithError(
                        'Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = []
                for tmpDsVal in candidateMaps.values():
                    for tmpCloudVal in tmpDsVal.values():
                        for tmpSiteName in tmpCloudVal[0]:
                            if tmpSiteName not in allCandidates:
                                allCandidates.append(tmpSiteName)
                if not allCandidates:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(
                        self.userDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get replicas in %s' %
                                          self.userDatasetName)
                        return False
                    userDatasetNameList = list(tmpOut)
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet, totalInputSize = rucioAPI.getDatasetSize(
                        tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError(
                            'Failed to get the size of {0} with {1}'.format(
                                tmpUserDatasetName, totalInputSize))
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    # report the dataset of this iteration rather than a
                    # leftover variable of the candidate-collection loop
                    self.putLog("run brokerage for %s" % tmpUserDatasetName)
                    pandaserver.brokerage.broker.schedule(
                        [tmpJob],
                        self.taskBuffer,
                        self.siteMapper,
                        True,
                        allCandidates,
                        True,
                        datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' %
                                          tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        # resolve the owner from the DN on every iteration;
                        # tmpDN itself is left untouched so that later
                        # iterations do not look up a nickname as if it
                        # were a DN
                        ownerDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus, userInfo = rucioAPI.finger(ownerDN)
                        if not tmpStatus:
                            raise RuntimeError(
                                'user info not found for {0} with {1}'.format(
                                    ownerDN, userInfo))
                        ownerNick = userInfo['nickname']
                        tmpSiteSpec = self.siteMapper.getSite(
                            tmpJob.computingSite)
                        scope_input, scope_output = select_scope(
                            tmpSiteSpec, JobUtils.ANALY_PS, JobUtils.ANALY_PS)
                        tmpDQ2ID = tmpSiteSpec.ddm_input[scope_input]
                        tmpMsg = "%s ds=%s site=%s id=%s" % (
                            'registerDatasetLocation for DaTRI ',
                            tmpUserDatasetName, tmpDQ2ID, ownerNick)
                        self.putLog(tmpMsg)
                        # register the dataset announced in the log line above
                        rucioAPI.registerDatasetLocation(
                            tmpUserDatasetName, [tmpDQ2ID],
                            lifetime=14,
                            owner=ownerNick,
                            activity="User Subscriptions")
                        self.putLog('OK')
                    except Exception:
                        errType, errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (
                            errType, errValue)
                        tmpStr = tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True, tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except Exception:
                # best effort: the request itself already succeeded
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' %
                              (errType, errValue, traceback.format_exc()))
            return False

    # end with error
    def endWithError(self, message):
        """Log ``message`` as an error, release the evp file and update the task.

        Unless ``ignoreError`` is set, the evp file is removed, a failure email
        is sent and the task is moved to 'tobroken'.
        """
        self.putLog(message, 'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False, message)
        except Exception:
            # best effort: evpFile is still None when open() itself failed
            pass
        # upload log
        if self.jediTaskID is not None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(
                self.jediTaskID, 'event picking failed. ' + outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,
                                                      'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)

    # put log
    def putLog(self, msg, type='debug'):
        """Send ``msg`` to the logger at error level if ``type`` is 'error', else debug."""
        if type == 'error':
            self.logger.error(msg)
        else:
            self.logger.debug(msg)

    # send email notification
    def sendEmail(self, isSucceeded, message):
        """Mail the request status and ``message`` to the requesting user.

        Nothing is sent, and an error is logged, when no address is found.
        """
        # mail address
        toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN,
                        'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status  : Passed to Rucio\n"
        else:
            mailBody += "Status  : Failed\n"
        mailBody += "Created : %s\n" % self.creationTime
        mailBody += "Ended   : %s\n" % datetime.datetime.utcnow().strftime(
            '%Y-%m-%d %H:%M:%S')
        mailBody += "Dataset : %s\n" % self.userDatasetName
        mailBody += "\n"
        mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params)
        mailBody += "\n"
        mailBody += "%s\n" % message
        # send
        MailUtils().send(toAdder, mailSubject, mailBody)

    # upload log
    def uploadLog(self):
        """Upload the accumulated log and return a short text describing the outcome.

        :return: an explanatory string when there is no task ID or the upload
            failed, an HTML link when the upload returned an http URL,
            otherwise the raw upload output
        """
        if self.jediTaskID is None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s, o = Client.uploadLog(strMsg, self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
def core_exec(sandbox_url, log_token, dump_workflow, ops_file, user_name, test_mode):
    """Download the sandbox, parse the workflow and submit it.

    The outcome ``[is_OK, is_fatal, request_id, log]`` is written to a
    temporary JSON file whose name is printed, then the process exits with
    status 0.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - confirm against the original layout.

    :param sandbox_url: URL the sandbox tarball is downloaded from
    :param log_token: prefix handed to the log wrapper
    :param dump_workflow: the string 'True' enables dumping of the workflow
    :param ops_file: path of the JSON file with the request; removed once read
    :param user_name: user name passed to the workflow submission
    :param test_mode: the string 'True' skips the actual submission
    """
    # local import so that the block does not rely on a file-level import
    import shlex

    tmpLog = LogWrapper(_logger, log_token)
    is_OK = True
    is_fatal = False
    request_id = None
    # the flags are compared as strings; anything but 'True' means False
    dump_workflow = dump_workflow == 'True'
    test_mode = test_mode == 'True'
    try:
        with open(ops_file) as f:
            ops = json.load(f)
        try:
            os.remove(ops_file)
        except OSError:
            pass
        # go to temp dir
        cur_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as tmp_dirname:
            os.chdir(tmp_dirname)
            try:
                # download sandbox
                tmpLog.info('downloading sandbox from {}'.format(sandbox_url))
                # SECURITY(review): verify=False disables TLS certificate
                # verification for this download - confirm it is intended
                with requests.get(sandbox_url, allow_redirects=True, verify=False, stream=True) as r:
                    # NOTE(review): 400 (not 404) is what gets reported as
                    # "not found" - confirm against the cache server
                    if r.status_code == 400:
                        tmpLog.error("not found")
                        is_fatal = True
                        is_OK = False
                    elif r.status_code != 200:
                        tmpLog.error("bad HTTP response {}".format(r.status_code))
                        is_OK = False
                    # extract sandbox
                    if is_OK:
                        with open(ops['data']['sandbox'], 'wb') as fs:
                            for chunk in r.raw.stream(1024, decode_content=False):
                                if chunk:
                                    fs.write(chunk)
                        # the sandbox name comes from the request, so escape it
                        tmp_stat, tmp_out = commands_get_status_output(
                            'tar xvfz {}'.format(shlex.quote(ops['data']['sandbox'])))
                        if tmp_stat != 0:
                            tmpLog.error(tmp_out)
                            dump_str = 'failed to extract {}'.format(ops['data']['sandbox'])
                            tmpLog.error(dump_str)
                            is_fatal = True
                            is_OK = False
                # parse workflow files
                if is_OK:
                    tmpLog.info('parse workflow')
                    if ops['data']['language'] == 'cwl':
                        nodes, root_in = pcwl_utils.parse_workflow_file(ops['data']['workflowSpecFile'], tmpLog)
                        with open(ops['data']['workflowInputFile']) as workflow_input:
                            data = yaml.safe_load(workflow_input)
                        s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(nodes, root_in, data, 0, set(),
                                                                        ops['data']['outDS'], tmpLog)
                        workflow_utils.set_workflow_outputs(nodes)
                        id_node_map = workflow_utils.get_node_id_map(nodes)
                        for node in nodes:
                            node.resolve_params(ops['data']['taskParams'], id_node_map)
                        dump_str = "the description was internally converted as follows\n" \
                                   + workflow_utils.dump_nodes(nodes)
                        tmpLog.info(dump_str)
                        for node in nodes:
                            s_check, o_check = node.verify()
                            if not s_check:
                                tmp_str = 'Verification failure in ID:{} {}'.format(node.id, o_check)
                                tmpLog.error(tmp_str)
                                dump_str += tmp_str
                                dump_str += '\n'
                                is_fatal = True
                                is_OK = False
                    else:
                        # fill in the language that was actually requested
                        dump_str = "{} is not supported to describe the workflow".format(
                            ops['data']['language'])
                        tmpLog.error(dump_str)
                        is_fatal = True
                        is_OK = False
                # convert to workflow
                if is_OK:
                    workflow_to_submit, dump_str_list = workflow_utils.convert_nodes_to_workflow(nodes)
                    try:
                        if workflow_to_submit:
                            if not test_mode:
                                tmpLog.info('submit workflow')
                                wm = ClientManager(host=get_rest_host())
                                request_id = wm.submit(workflow_to_submit, username=user_name)
                        else:
                            dump_str = 'workflow is empty'
                            tmpLog.error(dump_str)
                            is_fatal = True
                            is_OK = False
                    except Exception as e:
                        dump_str = 'failed to submit the workflow with {}'.format(str(e))
                        tmpLog.error('{} {}'.format(dump_str, traceback.format_exc()))
                        # a failed submission must not be reported as accepted;
                        # it is left non-fatal so that the request can be retried
                        is_OK = False
                    if dump_workflow:
                        tmpLog.debug('\n' + ''.join(dump_str_list))
            finally:
                # leave the temp dir before it is removed, also on errors
                os.chdir(cur_dir)
    except Exception as e:
        is_OK = False
        is_fatal = True
        tmpLog.error("failed to run with {} {}".format(str(e), traceback.format_exc()))
    with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_json:
        json.dump([is_OK, is_fatal, request_id, tmpLog.dumpToString()], tmp_json)
        print(tmp_json.name)
    sys.exit(0)