def wn_dest_ce(ji):
    """Build dest_ce. Only run on worker node."""
    dest_ce = CommonUtil.env('GLOBUS_CE')
    if not dest_ce:
        dest_ce = CommonUtil.stdout('edg-brokerinfo getCE')
    if not dest_ce:
        dest_ce = CommonUtil.stdout('glite-brokerinfo getCE')
    return CommonUtil.strip_to_none(dest_ce)
def wn_grid_job_id(ji):
    """Build grid_job_id. Only run on worker node."""
    grid_job_id = CommonUtil.env('CREAM_JOBID')
    if not grid_job_id:
        grid_job_id = CommonUtil.env('EDG_WL_JOBID')
    if not grid_job_id:
        grid_job_id = CommonUtil.env('GLITE_WMS_JOBID')
    return grid_job_id
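# The helpers above lean on a CommonUtil module that is not shown in this
# section. Below is a minimal sketch of the behaviour assumed by the callers
# here, inferred purely from usage (env() returning None for unset variables,
# stdout() capturing command output, strip_to_none() mapping blank strings to
# None). The class name and every implementation detail are assumptions, not
# the original code.
import datetime
import os
import socket
import subprocess


class CommonUtilSketch(object):

    @staticmethod
    def env(name):
        # Return the environment variable value, or None if unset.
        return os.environ.get(name)

    @staticmethod
    def stdout(command):
        # Run a shell command and return its captured stdout; swallow
        # failures so callers can fall through to the next source.
        try:
            return subprocess.check_output(command, shell=True).decode()
        except Exception:
            return None

    @staticmethod
    def strip_to_none(value):
        # Strip whitespace; map None or empty values to None.
        if value is None:
            return None
        value = value.strip()
        return value or None

    @staticmethod
    def hostname():
        return socket.getfqdn()

    @staticmethod
    def utcnow():
        # e.g. 2009-11-25T14:59:24.754249Z
        return datetime.datetime.utcnow().isoformat() + 'Z'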
def cl_target(job):
    """Build target. Only run on client."""
    if hasattr(job.backend, 'CE'):
        targets = []
        if job.backend.CE:
            targets.append('CE_%s' % job.backend.CE)
        for site in job.backend.requirements.sites:
            if site:
                targets.append('SITE_%s' % site)
        targetcsv = ','.join(targets)
        return CommonUtil.strip_to_none(targetcsv)
    else:
        return CommonUtil.hostname()
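# For illustration (CE and site values assumed): with
# job.backend.CE = 'ce-3-fzk.gridka.de' and
# job.backend.requirements.sites = ['CSCS-LCG2_DATADISK', 'DESY-ZN_DATADISK'],
# cl_target() returns
# 'CE_ce-3-fzk.gridka.de,SITE_CSCS-LCG2_DATADISK,SITE_DESY-ZN_DATADISK';
# for backends without a CE attribute it falls back to the client hostname.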
def _cl_job_meta_message(self):
    j = self.job_info  # called on client, so job_info is Job object
    msg = {
        # e.g. https://grid-lb0.desy.de:9000/moqY5njFGurEuoDkkJmtBA
        'GRIDJOBID': self.dynamic_util.cl_grid_job_id(j),
        # e.g. fdr08_run2.0052283.physics_Muon.merge.AOD.o3_f8_m10
        'INPUTDATASET': AthenaUtil.cl_input_dataset(j),
        # subjob id e.g. 0
        'JOB_ID_INSIDE_THE_TASK': self.dynamic_util.cl_job_id_inside_the_task(j),
        # None or non-negative number e.g. 100
        'NEVENTSREQUESTED': AthenaUtil.cl_nevents_requested(j),
        # e.g. user09.DavidTuckett.ganga.420.20091125.FZK-LCG2_SCRATCHDISK
        'OUTPUTDATASET': AthenaUtil.cl_output_dataset(j),
        # Unknown at submission. e.g. FZK-LCG2_SCRATCHDISK
        'OUTPUTSE': AthenaUtil.cl_output_se(j),
        'PILOT': 0,  # 0 = not pilot, 1 = pilot
        'PILOTNAME': None,
        'REPORTER': 'ToolUI',  # e.g. ToolUI, JobWN
        # e.g. 2009-11-25T14:59:24.754249Z
        'REPORTTIME': CommonUtil.utcnow(),
        # e.g. CE_xxx,SITE_CSCS-LCG2_DATADISK,SITE_DESY-ZN_DATADISK
        'TARGET': AthenaUtil.cl_target(j),
        # e.g. ganga:6702b50a-8a31-4476-8189-62ea5b8e00b3:TrigStudy
        'TASKNAME': self.dynamic_util.cl_task_name(j),
        # Ganga uuid e.g. 1c08ff3b-904f-4f77-a481-d6fa765813cb
        'UNIQUEJOBID': self.dynamic_util.cl_unique_job_id(j),
        '___fqid': j.fqid,
    }
    return msg
def _cl_job_status_message(self, status, status_source, status_start_time=None):
    # Not null: EXECUTION_BACKEND, GRIDJOBID, JOB_ID_INSIDE_THE_TASK, TASKNAME, UNIQUEJOBID
    j = self.job_info  # called on client, so job_info is Job object
    msg = {
        # Actual CE. e.g. ce-3-fzk.gridka.de:2119/jobmanager-pbspro-atlasXS
        'DESTCE': LCGUtil.cl_dest_ce(j),
        'DESTSITE': None,  # Actual site. e.g. FZK-LCG2
        'DESTWN': None,  # Actual worker node hostname. e.g. c01-102-103.gridka.de
        'EXECUTION_BACKEND': LCGUtil.cl_execution_backend(j),  # Backend. e.g. LCG
        'GRIDEXITCODE': None,  # e.g. 0
        'GRIDEXITREASON': None,  # e.g. Job terminated successfully
        # e.g. https://grid-lb0.desy.de:9000/moqY5njFGurEuoDkkJmtBA
        'GRIDJOBID': LCGUtil.cl_grid_job_id(j),
        'JOBEXITCODE': None,  # e.g. 0
        'JOBEXITREASON': None,
        'JOB_ID_INSIDE_THE_TASK': LCGUtil.cl_job_id_inside_the_task(j),  # subjob id e.g. 0
        # Grid certificate. e.g. /DC=ch/DC=cern/OU=Organic
        # Units/OU=Users/CN=dtuckett/CN=671431/CN=David Tuckett/CN=proxy
        'OWNERDN': LCGUtil.cl_ownerdn(),
        'REPORTER': 'ToolUI',  # e.g. ToolUI, JobWN
        'REPORTTIME': CommonUtil.utcnow(),  # e.g. 2009-11-25T14:59:24.754249Z
        'STATENAME': status,  # e.g. submitted, Done (Success)
        'STATESOURCE': status_source,  # e.g. Ganga, LB
        'STATESTARTTIME': status_start_time,  # e.g. 2009-11-25T14:32:51.428988Z
        # e.g. ganga:6702b50a-8a31-4476-8189-62ea5b8e00b3:TrigStudy
        'TASKNAME': LCGUtil.cl_task_name(j),
        # Ganga uuid e.g. 1c08ff3b-904f-4f77-a481-d6fa765813cb
        'UNIQUEJOBID': LCGUtil.cl_unique_job_id(j),
        '___fqid': j.fqid,
    }
    return msg
def _wn_job_status_message(self, status, status_source, status_start_time):
    # Not null: EXECUTION_BACKEND, GRIDJOBID, JOB_ID_INSIDE_THE_TASK, TASKNAME, UNIQUEJOBID
    ji = self.job_info  # called on worker node, so job_info is dictionary
    msg = {
        'DESTCE': LCGUtil.wn_dest_ce(ji),
        'DESTSITE': LCGUtil.wn_dest_site(ji),
        'DESTWN': LCGUtil.wn_dest_wn(),
        'EXECUTION_BACKEND': ji['EXECUTION_BACKEND'],
        'GRIDEXITCODE': None,
        'GRIDEXITREASON': None,
        'GRIDJOBID': LCGUtil.wn_grid_job_id(ji),
        'JOBEXITCODE': None,
        'JOBEXITREASON': None,
        'JOB_ID_INSIDE_THE_TASK': ji['JOB_ID_INSIDE_THE_TASK'],
        'OWNERDN': ji['OWNERDN'],
        'REPORTER': 'JobWN',
        'REPORTTIME': CommonUtil.utcnow(),
        'STATENAME': status,
        'STATESOURCE': status_source,
        'STATESTARTTIME': status_start_time,
        'TASKNAME': ji['TASKNAME'],
        'UNIQUEJOBID': ji['UNIQUEJOBID'],
        '___fqid': ji['fqid'],
    }
    return msg
def start(self, **opts):
    """Log start event on worker node."""
    ji = self.job_info  # called on worker node, so job_info is dictionary
    self._log('debug', 'start %s' % ji['fqid'])
    # send Ganga running job-status message
    message = self._wn_job_status_message('running', 'Ganga', CommonUtil.utcnow())
    self._send(self.config_info['destination_job_status'], message)
def _cl_task_meta_message(self):
    j = self.job_info  # called on client, so job_info is Job object
    msg = {
        'APPLICATION': AthenaUtil.cl_application(j),  # e.g. ATHENA
        # e.g. 15.5.1
        'APPLICATIONVERSION': AthenaUtil.cl_application_version(j),
        # e.g. fdr08_run2.0052283.physics_Muon.merge.AOD.o3_f8_m10
        'INPUTDATASET': AthenaUtil.cl_input_dataset(j),
        'JSTOOL': 'Ganga',  # e.g. Ganga, Panda
        # hostname of client. e.g. lxplus246.cern.ch
        'JSTOOLUI': AthenaUtil.cl_jstoolui(),
        # Unknown at submission. e.g.
        # user09.DavidTuckett.ganga.420.20091125.FZK-LCG2_SCRATCHDISK
        'OUTPUTDATASET': AthenaUtil.cl_output_dataset(j),
        # Unknown at submission. e.g. FZK-LCG2_SCRATCHDISK
        'OUTPUTSE': AthenaUtil.cl_output_se(j),
        # Grid certificate. e.g. /DC=ch/DC=cern/OU=Organic
        # Units/OU=Users/CN=dtuckett/CN=671431/CN=David Tuckett/CN=proxy
        'OWNERDN': self.dynamic_util.cl_ownerdn(),
        'REPORTER': 'ToolUI',  # e.g. ToolUI, JobWN
        # e.g. 2009-11-25T14:59:24.754249Z
        'REPORTTIME': CommonUtil.utcnow(),
        'SUBMISSIONTYPE': 'direct',
        # e.g. CE_xxx,SITE_CSCS-LCG2_DATADISK,SITE_DESY-ZN_DATADISK
        'TARGET': AthenaUtil.cl_target(j),
        # e.g. ganga:6702b50a-8a31-4476-8189-62ea5b8e00b3:TrigStudy
        'TASKNAME': self.dynamic_util.cl_task_name(j),
        # e.g. analysis, production, hammercloud etc.
        'TASKTYPE': AthenaUtil.cl_task_type(self.config_info),
        '___fqid': j.fqid,
    }
    return msg
def _wn_job_processing_attributes_message(self):
    ji = self.job_info  # called on worker node, so job_info is dictionary
    athena_stats = AthenaUtil.wn_load_athena_stats()
    msg = {
        # e.g. https://grid-lb0.desy.de:9000/moqY5njFGurEuoDkkJmtBA
        'GRIDJOBID': self.dynamic_util.wn_grid_job_id(ji),
        # subjob id e.g. 0
        'JOB_ID_INSIDE_THE_TASK': ji['JOB_ID_INSIDE_THE_TASK'],
        # number of events processed. e.g. 100
        'NEVENTSPROCESSED': athena_stats.get('totalevents'),
        # number of files processed. e.g. 2
        'NFILESPROCESSED': athena_stats.get('numfiles'),
        'REPORTER': 'JobWN',  # e.g. ToolUI, JobWN
        # e.g. 2009-11-25T14:59:24.754249Z
        'REPORTTIME': CommonUtil.utcnow(),
        # system cpu time in seconds. e.g. 38.45
        'SYSTEMTIME': athena_stats.get('systemtime'),
        # e.g.
        # [email protected]:/afs/cern.ch/user/d/dtuckett/gangadir/repository/dtuckett/LocalAMGA
        'TASKNAME': ji['TASKNAME'],
        # Ganga uuid e.g. 1c08ff3b-904f-4f77-a481-d6fa765813cb
        'UNIQUEJOBID': ji['UNIQUEJOBID'],
        # user cpu time in seconds. e.g. 479.0
        'USERTIME': athena_stats.get('usertime'),
        # wallclock time in seconds. e.g. 1040
        'WALLCLOCK': athena_stats.get('wallclock'),
        '___fqid': ji['fqid'],
    }
    return msg
def cl_application_version(job):
    """Build application_version. Only run on client."""
    if job.application.atlas_production:
        application_version = job.application.atlas_production
    else:
        application_version = job.application.atlas_release
    return CommonUtil.strip_to_none(application_version)
def wn_grid_job_id(ji):
    """Build grid_job_id. Only run on worker node."""
    # Previous implementation read the grid job id from the environment:
    #     grid_job_id = CommonUtil.env('EDG_WL_JOBID')
    #     if not grid_job_id:
    #         grid_job_id = CommonUtil.env('GLITE_WMS_JOBID')
    #     return grid_job_id
    return CommonUtil.strip_to_none(ji['fqid'])
def fail(self, **opts):
    """Log fail event on client."""
    j = self.job_info  # called on client, so job_info is Job object
    self._log('debug', 'fail %s' % j.fqid)
    # ignore master wrapper jobs
    if j.subjobs:
        self._log('debug', 'Not sending unwanted message on fail for master wrapper job %s.' % j.fqid)
        return
    # send LB Done or Aborted job-status message
    message = self._cl_job_status_message('failed', 'LB', CommonUtil.utcnow())
    message['GRIDEXITCODE'] = self.dynamic_util.cl_grid_exit_code(j)
    message['GRIDEXITREASON'] = self.dynamic_util.cl_grid_exit_reason(j)
    self._send(self.config_info['destination_job_status'], message)
def stop(self, exitcode, **opts):
    """Log stop event on worker node."""
    ji = self.job_info  # called on worker node, so job_info is dictionary
    self._log('debug', 'stop %s' % ji['fqid'])
    if exitcode == 0:
        status = 'completed'
    else:
        status = 'failed'
    # send Ganga completed or failed job-status message
    message = self._wn_job_status_message(status, 'Ganga', CommonUtil.utcnow())
    message['JOBEXITCODE'] = exitcode
    message['JOBEXITREASON'] = None  # TODO: how can we know this?
    self._send(self.config_info['destination_job_status'], message)
def cl_output_se(job):
    """Build output_se. Only run on client."""
    if not job.outputdata:
        return None
    # job.outputdata.location can be a string or a list
    if isinstance(job.outputdata.location, list):
        locations = []
        for l in job.outputdata.location:
            if l and l not in locations:
                locations.append(l)
        locationcsv = ','.join(locations)
    else:
        locationcsv = job.outputdata.location
    return CommonUtil.strip_to_none(locationcsv)
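# For illustration (site names assumed): location = ['SITEA', '', 'SITEB', 'SITEA']
# yields 'SITEA,SITEB' (empty entries and duplicates dropped, order preserved),
# while a plain string location is passed through to strip_to_none() unchanged.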
def submit(self, **opts):
    """Log submit event on client."""
    j = self.job_info  # called on client, so job_info is Job object
    self._log('debug', 'submit %s' % j.fqid)
    # ignore master wrapper jobs
    if j.subjobs:
        self._log('debug', 'Not sending unwanted message on submit for master wrapper job %s.' % j.fqid)
        return
    # send Ganga submitted job-status message
    message = self._cl_job_status_message('submitted', 'Ganga', CommonUtil.utcnow())
    if message['GRIDJOBID'] is None:
        # This is to handle the temporary workaround in
        # LCG.master_bulk_updateMonitoringInformation() which results in two
        # submit messages being sent, one without a grid_job_id.
        self._log('debug', 'Not sending redundant message on submit without grid_job_id for job %s.' % j.fqid)
    else:
        self._send(self.config_info['destination_job_status'], message)
def submit(self, **opts):
    """Log submit event on client."""
    j = self.job_info  # called on client, so job_info is Job object
    self._log('debug', 'submit %s' % j.fqid)
    # ignore master wrapper jobs
    if j.subjobs:
        self._log('debug', 'Not sending unwanted message on submit for master wrapper job %s.' % j.fqid)
        return
    # send Ganga submitted job-status message
    message = self._cl_job_status_message('submitted', 'Ganga', CommonUtil.utcnow())
    if message['GRIDJOBID'] is None:
        # This is to handle the temporary workaround in
        # IBackend.master_bulk_updateMonitoringInformation() which results in
        # two submit messages being sent, one without a grid_job_id.
        self._log('debug', 'Not sending redundant message on submit without grid_job_id for job %s.' % j.fqid)
    else:
        self._send(self.config_info['destination_job_status'], message)
    if j.master:
        j = j.master
    from Ganga.GPIDev import Credentials
    proxy = Credentials.getCredential('GridProxy')
    ownerdn = proxy.info('-subject')
    user = '******'  # default user name if the DN cannot be parsed
    # if there is no error in the proxy, take the second CN value from
    # right to left as the user name
    if ownerdn.find('ERROR') == -1:
        if ownerdn.rfind('CN=') > -1:
            subownerdn = ownerdn[0:ownerdn.rfind('CN=') - 1]
            user = subownerdn[subownerdn.rfind('CN=') + 3:].replace(' ', '')
    task_name = 'ganga:%s:%s' % (j.info.uuid, j.name)
    task_mon_link = ('http://dashb-atlas-jobdev.cern.ch/dashboard/templates/index.html'
                     '#user=%s&from=&till=&timeRange=lastDay&refresh=0&tid=%s&p=1&uparam[]=all'
                     % (user, task_name))
    if (j.backend.__class__.__name__ == 'Panda'
            and len(j.backend.buildjobs) > 0
            and j.backend.buildjobs[0].url is not None):
        j.info.monitoring_links = [(task_mon_link, 'dashboard'),
                                   (j.backend.buildjobs[0].url, 'panda')]
    else:
        j.info.monitoring_links = [(task_mon_link, 'dashboard')]
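# A worked example of the CN extraction in submit() above, using the sample
# DN quoted in the comments earlier in this section (a sketch for
# illustration only; not called by the service itself):
def _example_user_from_ownerdn():
    ownerdn = ('/DC=ch/DC=cern/OU=Organic Units/OU=Users'
               '/CN=dtuckett/CN=671431/CN=David Tuckett/CN=proxy')
    # drop the trailing '/CN=proxy' ...
    subownerdn = ownerdn[0:ownerdn.rfind('CN=') - 1]
    # ... then take the (now) last CN value and strip spaces, giving
    # 'DavidTuckett', i.e. the second CN value from right to left
    return subownerdn[subownerdn.rfind('CN=') + 3:].replace(' ', '')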
def cl_grid_status(job):
    """Build grid_status. Only run on client."""
    return CommonUtil.strip_to_none(job.backend.status)
def cl_ownerdn():
    """Build ownerdn. Only run on client."""
    from Ganga.GPIDev import Credentials
    proxy = Credentials.getCredential('GridProxy')
    ownerdn = proxy.info('-subject')
    return CommonUtil.strip_to_none(ownerdn)
def cl_grid_job_id(job):
    """Build grid_job_id. Only run on client."""
    return CommonUtil.strip_to_none(job.backend.id)
def cl_grid_exit_code(job):
    """Build grid_exit_code. Only run on client."""
    return CommonUtil.strip_to_none(job.backend.exitcode)
def cl_dest_ce(job):
    """Build dest_ce. Only run on client."""
    return CommonUtil.strip_to_none(job.backend.actualCE)
def cl_jstoolui():
    """Build jstoolui. Only run on client."""
    return CommonUtil.hostname()
def wn_dest_ce(ji):
    """Build dest_ce. Only run on worker node."""
    dest_ce = CommonUtil.env('CE_ID')
    return CommonUtil.strip_to_none(dest_ce)
def wn_dest_site(ji):
    """Build dest_site. Only run on worker node."""
    return CommonUtil.env('SITE_NAME')
def cl_grid_exit_reason(job):
    """Build grid_exit_reason. Only run on client."""
    return CommonUtil.strip_to_none(job.backend.reason)
def cl_application(job):
    """Build application. Only run on client."""
    return CommonUtil.strip_to_none(job.application.atlas_exetype)
def wn_dest_wn():
    """Build dest_wn. Only run on worker node."""
    return CommonUtil.hostname()
def cl_output_dataset(job):
    """Build output_dataset. Only run on client."""
    if not job.outputdata:
        return None
    return CommonUtil.strip_to_none(job.outputdata.datasetname)
def cl_input_dataset(job):
    """Build input_dataset. Only run on client."""
    if not job.inputdata:
        return None
    datasetcsv = ','.join(job.inputdata.dataset)
    return CommonUtil.strip_to_none(datasetcsv)