def test():
    '''
    Test submission of a single ARC worker.

    Builds a JobSpec from a canned PanDA job description (JSON string below),
    attaches it to a worker produced by the queue's configured workerMaker,
    and submits the worker through ARCSubmitter, printing the result and the
    resulting batch ID.

    NOTE(review): QueueConfigMapper and ARCSubmitter are not imported inside
    this function -- presumably they come from module-level imports outside
    this view; verify before running standalone. Python 2 print statements.
    '''
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    import json

    # resolve the queue configuration for the test queue
    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()

    # canned PanDA job description (as received from the PanDA server)
    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)

    # convert the raw job dict into a JobSpec bound to the test queue
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]

    # make a worker via the queue's configured workerMaker plugin
    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)
    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)

    # submit and report the outcome (Python 2 print statements)
    sub = ARCSubmitter()
    print sub.submit_workers([wspec])
    print wspec.batchID
def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Fetch job output and process pilot info for sending in final heartbeat.
    The pilot pickle is loaded and some attributes corrected (schedulerid,
    pilotlog etc), then converted to dictionary and stored in
    workspec.workAttributes[pandaid]. If pilot pickle cannot be used, report
    ARC error in pilotErrorDiag and fill all possible attributes using ARC
    information.
    '''
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log
    tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
    job = workspec.workAttributes['arcjob']
    proxyrole = workspec.workAttributes['proxyrole']
    arcid = job['JobID']
    tmplog.info('Job id {0}'.format(arcid))

    # nothing to do if the submitter recorded no output files to fetch
    if 'arcdownloadfiles' not in workspec.workAttributes:
        tmplog.error('No files to download')
        return

    # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
    # it means the job was cancelled by panda or otherwise forgotten
    if not jobspec_list:
        return

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # BUG FIX: 'job' is a dict here, so the original 'job.JobID' raised
        # AttributeError inside this handler; use the already-extracted arcid.
        # Also narrowed the bare 'except:' to 'except Exception'.
        tmplog.error("Job {0}: no proxy found with role {1}".format(arcid, proxyrole))
        return

    # look up queue config of the job's site for log location settings
    queueconfigmapper = QueueConfigMapper()
    queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
    logbaseurl = queueconfig.submitter.get('logBaseURL')
    logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
    logsubdir = workspec.workAttributes['logsubdir']
    pandaid = str(jobspec_list[0].PandaID)

    # Construct log path and url (url only if a base URL is configured)
    logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
    logdir = os.path.join(logbasedir, logsubdir)

    # post_processing is only called once, so no retries are done. But keep
    # the possibility here in case it changes
    (fetched, notfetched, notfetchedretry) = self._download_outputs(
        workspec.workAttributes['arcdownloadfiles'], logdir, arcid, pandaid,
        userconfig, tmplog)
    if arcid not in fetched:
        tmplog.warning("Could not get outputs of {0}".format(arcid))

    # store the (fixed-up) pilot pickle info keyed by panda id
    workspec.workAttributes[long(pandaid)] = {}
    workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(
        job, pandaid, (arcid in fetched), logurl, tmplog)

    tmplog.debug("pilot info for {0}: {1}".format(
        pandaid, workspec.workAttributes[long(pandaid)]))
def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
    """Store run-mode flags, then initialize database and config objects."""
    # run-mode flags
    self.daemonMode = daemon_mode
    self.stopEvent = stop_event
    self.singleMode = single_mode
    # imports kept local to the constructor, as in the original layout
    from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
    self.communicatorPool = CommunicatorPool()
    self.queueConfigMapper = QueueConfigMapper()
    # create DB tables based on the queue configuration
    db_proxy = DBProxy()
    db_proxy.make_tables(self.queueConfigMapper)
def __init__(self, **kwarg):
    """
    Multinode worker maker constructor.

    Sets up the plugin factory and queue config mapper, then resolves the
    node count / walltime either from static config or dynamically via
    get_resources(), and derives the number of jobs per worker.
    """
    BaseWorkerMaker.__init__(self, **kwarg)
    self.pluginFactory = PluginFactory()
    self.queue_config_mapper = QueueConfigMapper()
    tmpLog = self.make_logger(baseLogger, method_name='__init__')
    tmpLog.info("Multinode workermaker: created.")
    tmpLog.debug("Queue name: {0}".format(self.queueName))
    if self.mode == "static":
        # static: nNodes/walltimelimit are expected to come from configuration
        tmpLog.info("Static configuration")
    elif self.mode == "dynamic":
        # dynamic: query the batch system for available resources
        tmpLog.info("Dynamic configuration")
        self.nNodes, self.walltimelimit = self.get_resources()
    # total jobs bundled into one multi-node worker
    self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
def __init__(self, **kwarg):
    """Initialize the plugin and the GCE VM status -> worker status mapping."""
    PluginBase.__init__(self, **kwarg)
    self.queue_config_mapper = QueueConfigMapper()
    # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
    status_map = dict()
    status_map['RUNNING'] = WorkSpec.ST_running
    # the VM is stopped, but has to be fully deleted
    status_map['TERMINATED'] = WorkSpec.ST_running
    status_map['STOPPING'] = WorkSpec.ST_finished
    status_map['PROVISIONING'] = WorkSpec.ST_submitted
    status_map['STAGING'] = WorkSpec.ST_submitted
    self.vm_to_worker_status = status_map
def __init__(self, pid_file, single_mode=False):
    """Service monitor constructor: resolve the pid file, inspect the master
    process tree, and prepare DB / config / credential helpers."""
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()

    # resolve the pid file: explicit argument wins, otherwise fall back to
    # the configured value, tolerating a missing config entry
    if pid_file is None:
        try:
            pid_file = harvester_config.service_monitor.pidfile
        except Exception:
            pid_file = None
    self.pid_file = pid_file

    # snapshot the master process and its children for monitoring
    self.pid = self.get_master_pid()
    self.master_process = psutil.Process(self.pid)
    self.children = self.master_process.children(recursive=True)

    self.cpu_count = multiprocessing.cpu_count()
    self.queue_config_mapper = QueueConfigMapper()
    self.cred_manager = CredManager(self.queue_config_mapper, single_mode=True)
def submit_k8s_worker(self, work_spec):
    """
    Submit one worker as a Kubernetes job.

    Reads the harvester queue config and the k8s yaml template, decides
    container image/executable, chooses a proxy, and creates the job via
    the k8s client. Returns a (bool, message) tuple.
    """
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
    # TODO: consider if we want to upload the yaml file to PanDA cache

    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = this_panda_queue_dict['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # keep only the AGIS/CRIC parameters this submitter understands
        associated_params_dict = {}
        for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
            if key in self._allowed_agis_attrs:
                associated_params_dict[key] = val

        pilot_url = associated_params_dict.get('pilot_url')
        pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
        python_version = str(this_panda_queue_dict.get('python_version', '2'))

        # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
        # special pilot types (e.g. test) override the plain settings
        pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
        if pilot_opt_dict is None:
            prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_type = work_spec.pilotType
            pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
        else:
            prod_source_label = pilot_opt_dict['prod_source_label']
            pilot_type = pilot_opt_dict['pilot_type_opt']
            pilot_url_str = pilot_opt_dict['pilot_url_str']
        pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label,
                                                                       pilot_type, pilot_url_str,
                                                                       pilot_python_option,
                                                                       container_image, executable, args, cert,
                                                                       cpu_adjust_ratio=self.cpuAdjustRatio,
                                                                       memory_adjust_ratio=self.memoryAdjustRatio,
                                                                       max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        # success: the k8s job name becomes the worker's batch id
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
def submit_workers(self, workspec_list):
    """
    Submit workers by inserting jobs into the aCT database.

    For each worker: resolves its queue config, builds the aCT job
    description (push mode uses the url-encoded panda job parameters, pull
    mode a minimal PandaID/prodSourceLabel string), inserts it into the aCT
    DB and records batch id and log file locations on the workspec.
    Returns a list of (bool, message) tuples, one per worker.
    """
    retList = []
    for workSpec in workspec_list:
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='submit_workers')

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        prodSourceLabel = queueconfig.get_source_label()

        # If jobSpec is defined we are in push mode, if not pull mode
        # Both assume one to one worker to job mapping
        jobSpec = workSpec.get_jobspec_list()
        if jobSpec:
            jobSpec = jobSpec[0]
            tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))
            # Unified queues: take prodsourcelabel from job
            prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel)

        desc = {}
        # If we need to prefetch events, set aCT status waiting.
        # feed_events in act_messenger will fill events and release the job
        if queueconfig.prefetchEvents:
            desc['pandastatus'] = 'waiting'
            desc['actpandastatus'] = 'waiting'
            desc['arcjobid'] = -1  # dummy id to prevent submission
        else:
            desc['pandastatus'] = 'sent'
            desc['actpandastatus'] = 'sent'
        desc['siteName'] = workSpec.computingSite
        # pilot proxy for analysis labels, production proxy otherwise
        desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production']
        desc['prodSourceLabel'] = prodSourceLabel
        desc['sendhb'] = 0
        metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                    'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
        desc['metadata'] = json.dumps(metadata)

        if jobSpec:
            # push mode: aCT takes the url-encoded job description (like it gets from panda server)
            pandaid = jobSpec.PandaID
            actjobdesc = urllib.parse.urlencode(jobSpec.jobParams)
        else:
            # pull mode: just set pandaid (to workerid) and prodsourcelabel
            pandaid = workSpec.workerID
            actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

        tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
        try:
            batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
        except Exception as e:
            result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
        else:
            tmpLog.info("aCT batch id {0}".format(batchid))
            workSpec.batchID = str(batchid)
            workSpec.submissionHost = self.hostname
            workSpec.nativeStatus = desc['actpandastatus']
            # Set log files in workSpec
            today = time.strftime('%Y-%m-%d', time.gmtime())
            logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)])
            workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
            workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
            workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
            workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl))
            result = (True, '')
        retList.append(result)
    return retList
def check_workers(self, workspec_list):
    """
    Check the status of workers in the aCT database.

    Maps each worker's aCT status (actpandastatus) to a harvester WorkSpec
    status; in true-pilot mode a worker stays 'running' until aCT reports a
    final state, otherwise post-running states map to 'finished' (handled by
    the stager). Returns (True, list of (status, errorMsg) per worker).
    """
    retList = []
    for workSpec in workspec_list:
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='check_workers')
        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        try:
            tmpLog.debug('Querying aCT for id {0}'.format(workSpec.batchID))
            columns = ['actpandastatus', 'pandastatus', 'computingElement', 'node', 'error']
            actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID), columns)
        except Exception as e:
            if self.actDB:
                tmpLog.error("Failed to query aCT DB: {0}".format(str(e)))
            # send back current status
            retList.append((workSpec.status, ''))
            continue

        if not actjobs:
            tmpLog.error("Job with id {0} not found in aCT".format(workSpec.batchID))
            # send back current status
            retList.append((WorkSpec.ST_failed, "Job not found in aCT"))
            continue

        actstatus = actjobs[0]['actpandastatus']
        workSpec.nativeStatus = actstatus
        newStatus = WorkSpec.ST_running
        errorMsg = ''
        # pre-running aCT states map to 'submitted'
        if actstatus in ['waiting', 'sent', 'starting']:
            newStatus = WorkSpec.ST_submitted

        # Handle post running states
        if queueconfig.truePilot:
            # True pilot: keep in running until really done
            if actstatus in ['done', 'donecancelled']:
                newStatus = WorkSpec.ST_finished
            elif actstatus == 'donefailed':
                # set failed here with workspec sup error
                errorMsg = actjobs[0]['error'] or 'Unknown error'
                error_code = WorkerErrors.error_codes.get('GENERAL_ERROR')
                workSpec.set_supplemental_error(error_code=error_code, error_diag=errorMsg)
                newStatus = WorkSpec.ST_failed
                tmpLog.info('ID {0} failed with error {1})'.format(workSpec.batchID, errorMsg))
        elif actstatus in ['done', 'donefailed', 'donecancelled', 'transferring', 'tovalidate']:
            # NG mode: all post processing is now done in the stager
            newStatus = WorkSpec.ST_finished

        if newStatus != workSpec.status:
            tmpLog.info('ID {0} updated status {1} -> {2} ({3})'.format(workSpec.batchID, workSpec.status, newStatus, actstatus))
        else:
            tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(actstatus, newStatus))

        # propagate CE and worker node info onto the workspec when available
        if actjobs[0]['computingElement']:
            workSpec.computingElement = actjobs[0]['computingElement']
        if actjobs[0]['node']:
            try:
                pandaid = workSpec.get_jobspec_list()[0].PandaID
                workSpec.set_work_attributes({pandaid: {'node': actjobs[0]['node']}})
            except:
                tmpLog.warning('Could not extract panda ID for worker {0}'.format(workSpec.batchID))

        retList.append((newStatus, errorMsg))
    return True, retList
def check_stage_out_status(self, jobspec):
    """
    Check (and, when enough files are pooled, trigger) Globus stage-out.

    Files awaiting transfer are grouped under a dummy transfer ID; under a DB
    lock this method submits a real Globus transfer once >= 10 files are
    pooled or the group is older than 10 minutes. For groups with a real
    (valid UUID) transfer ID it polls the Globus task status.
    Returns (True, ''), (False, msg) on failure, or (None, msg) to retry.
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='check_stage_out_status')
    tmpLog.debug('start')
    # show the dummy transfer id and set to a value with the PandaID if needed.
    tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id))
    if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX'):
        old_dummy_transfer_id = self.dummy_transfer_id
        self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, jobspec.PandaID)
        tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id, self.dummy_transfer_id))
    # default return
    tmpRetVal = (True, '')
    # set flag if have db lock
    have_db_lock = False
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get the scope of the log files
    outfileattrib = jobspec.get_output_file_attributes()
    scopeLog = 'xxxx'
    for key in outfileattrib.keys():
        if "log.tgz" in key:
            scopeLog = outfileattrib[key]['scope']
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    for dummy_transferID in groups:
        # skip if valid transfer ID not dummy one
        if validate_transferid(dummy_transferID):
            continue
        # lock for 120 sec
        tmpLog.debug('attempt to set DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
        have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120)
        if not have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)')
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        tmpLog.debug('After db refresh call groups=jobspec.get_groups_of_output_files()')
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
        # the dummy transfer ID is still there
        if dummy_transferID in groups:
            groupUpdateTime = groups[dummy_transferID]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transferID)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'dummy_transferID = {0} number of files = {1}'.format(dummy_transferID, len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
                    groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                self.srcEndpoint = queueConfig.stager['srcEndpoint']
                self.Globus_srcPath = self.basePath
                self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                self.dstEndpoint = queueConfig.stager['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        self.have_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None, errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = None
                    tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="checksum")
                except:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                ifile = 0
                for fileSpec in fileSpecs:
                    logfile = False
                    scope = 'panda'
                    if fileSpec.scope is not None:
                        scope = fileSpec.scope
                    # for Yoda job set the scope to transient for non log files
                    if self.Yodajob:
                        scope = 'transient'
                    if fileSpec.fileType == "log":
                        logfile = True
                        scope = scopeLog
                    # only print to log file first 25 files
                    if ifile < 25:
                        msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    if ifile == 25:
                        msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    # rucio-style destination path: <scope>/<md5[0:2]>/<md5[2:4]>/<lfn>
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    srcURL = fileSpec.path
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                              scope=correctedscope,
                                                                              hash1=hash_hex[0:2],
                                                                              hash2=hash_hex[2:4],
                                                                              lfn=fileSpec.lfn)
                    if logfile:
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    if ifile < 25:
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    if os.access(srcURL, os.R_OK):
                        if ifile < 25:
                            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
                        tdata.add_item(srcURL, dstURL)
                    else:
                        errMsg = "source file {} does not exist".format(srcURL)
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (False, errMsg)
                        return tmpRetVal
                    ifile += 1
                # submit transfer
                tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA'])))
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(dummy_transferID)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
            # return None to retry later
            return None, msgStr
        # release the db lock if needed
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
                return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    tmpLog.debug("groups = jobspec.get_groups_of_output_files()")
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('Number of transfer groups - {0}'.format(len(groups)))
    tmpLog.debug('transfer groups any state - {0}'.format(groups))
    if len(groups) == 0:
        tmpLog.debug("jobspec.get_groups_of_output_files(skip_done=True) returned no files ")
        tmpLog.debug("check_stage_out_status return status - True ")
        return True, ''
    for transferID in groups:
        # allow only valid UUID
        if validate_transferid(transferID):
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, self.tc, transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc), str(transferID))
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                if self.changeFileStatusOnSuccess:
                    self.set_FileSpec_status(jobspec, 'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec, 'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID, transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
    # end of loop over transfer groups
    tmpLog.debug('End of loop over transfers groups - ending check_stage_out_status function')
    return None, 'no valid transfer id found'
def trigger_preparation(self, jobspec):
    """
    Trigger stage-in of the job's input files via a Globus transfer.

    Skips submission if a transfer with the same label is already queued;
    otherwise builds source/destination paths for every input file and
    submits one TransferData request, recording the returned task id as the
    file group. Returns (True, ''), (False, msg), or the values produced by
    globus_utils.handle_globus_exception on error.
    """
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='trigger_preparation')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
    self.srcEndpoint = queueConfig.preparator['srcEndpoint']
    self.Globus_dstPath = self.basePath
    #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
    self.dstEndpoint = queueConfig.preparator['dstEndpoint']
    # get input files
    files = []
    lfns = []
    inFiles = jobspec.get_input_file_attributes(skip_ready=True)
    for inLFN, inFile in iteritems(inFiles):
        # set path to each file
        inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN)
        dstpath = inFile['path']
        # check if path exists if not create it.
        if not os.access(self.basePath, os.F_OK):
            os.makedirs(self.basePath)
        # create the file paths for the Globus source and destination endpoints
        Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile['scope'], inLFN)
        Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile['scope'], inLFN)
        files.append({'scope': inFile['scope'],
                      'name': inLFN,
                      'Globus_dstPath': Globus_dstpath,
                      'Globus_srcPath': Globus_srcpath})
        lfns.append(inLFN)
    tmpLog.debug('files[] {0}'.format(files))
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errStr = ''
            if not tmpStatsrc:
                errStr += ' source Endpoint not activated '
            if not tmpStatdst:
                errStr += ' destination Endpoint not activated '
            tmpLog.error(errStr)
            return False, errStr
        # both endpoints activated now prepare to transfer data
        if len(files) > 0:
            tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, label=label, sync_level="checksum")
            # loop over all input files and add
            for myfile in files:
                tdata.add_item(myfile['Globus_srcPath'], myfile['Globus_dstPath'])
            # submit
            transfer_result = self.tc.submit_transfer(tdata)
            # check status code and message
            tmpLog.debug(str(transfer_result))
            if transfer_result['code'] == "Accepted":
                # succeeded
                # set transfer ID which are used for later lookup
                transferID = transfer_result['task_id']
                jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
                tmpLog.debug('done')
                return True, ''
            else:
                return False, transfer_result['message']
        # if no files to transfer return True
        return True, 'No files to transfer'
    except:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        # NOTE(review): errMsg is computed but a dict is returned as the
        # message here ({} instead of errMsg) -- looks unintended; confirm
        # against callers before changing.
        return errStat, {}
def __init__(self, **kwarg):
    """Set up the SAGA-based plugin: base attributes, helper factories and a logger."""
    PluginBase.__init__(self, **kwarg)
    # helpers for resolving queue configurations and worker-maker plugins
    self.queue_config_mapper = QueueConfigMapper()
    self.pluginFactory = PluginFactory()
    # announce which SAGA adaptor this instance will drive
    init_log = self.make_logger(baseLogger, method_name='__init__')
    init_log.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))
def qconf_refresh(arguments):
    """Force a reload of the queue configuration, optionally refilling the table."""
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    config_mapper = QueueConfigMapper()
    # bump the reload timestamp, then drop the cached update marker so
    # load_data re-reads the configuration from scratch
    config_mapper._update_last_reload_time()
    config_mapper.lastUpdate = None
    config_mapper.load_data(refill_table=arguments.refill)
def qconf_refresh(arguments):
    """Force a reload of the queue configuration."""
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    mapper = QueueConfigMapper()
    # drop the cached update marker so load_data re-reads the configuration
    mapper.lastUpdate = None
    mapper.load_data()
def check_status(self, jobspec):
    """Check - and, for pooled dummy groups, submit - Globus transfers of a job's input files.

    Input files are first parked under a shared dummy transfer ID. Once at
    least 10 files are pooled, or the dummy group is older than 10 minutes,
    a real bulk Globus transfer is submitted and the files are re-grouped
    under the returned task ID. Access to the dummy group is serialized via
    a DB object lock that must be released on every exit path.

    :param jobspec: job specification whose input-file transfers are checked
    :return: (True, '') when a transfer task succeeded, (False, msg) on a
             hard failure, (None, msg) for transient states to retry later
    """
    # make logger
    tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='check_status')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get transfer groups
    groups = jobspec.get_groups_of_input_files(skip_ready=True)
    tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    if self.dummy_transfer_id in groups:
        # lock for 120 sec
        if not self.have_db_lock:
            tmpLog.debug('attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.get_object_lock(self.dummy_transfer_id, lock_interval=120)
        if not self.have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        groups = jobspec.get_groups_of_input_files(skip_ready=True)
        # the dummy transfer ID is still there
        if self.dummy_transfer_id in groups:
            groupUpdateTime = groups[self.dummy_transfer_id]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(self.dummy_transfer_id)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'self.dummy_transfer_id = {0} number of files = {1}'.format(self.dummy_transfer_id,len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
                    groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
                queueConfigMapper = QueueConfigMapper()
                queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
                self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
                self.srcEndpoint = queueConfig.preparator['srcEndpoint']
                self.Globus_dstPath = self.basePath
                #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
                self.dstEndpoint = queueConfig.preparator['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None,errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = TransferData(self.tc,
                                         self.srcEndpoint,
                                         self.dstEndpoint,
                                         sync_level="checksum")
                # NOTE(review): bare "except:" also catches SystemExit/KeyboardInterrupt
                except:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                    tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                for fileSpec in fileSpecs:
                    # NOTE(review): loop-invariant call re-evaluated per file
                    attrs = jobspec.get_input_file_attributes()
                    msgStr = "len(jobSpec.get_input_file_attributes()) = {0} type - {1}".format(len(attrs),type(attrs))
                    tmpLog.debug(msgStr)
                    # NOTE(review): .iteritems() is Python-2-only
                    for key, value in attrs.iteritems():
                        msgStr = "input file attributes - {0} {1}".format(key,value)
                        tmpLog.debug(msgStr)
                    msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                    tmpLog.debug(msgStr)
                    # build rucio-style deterministic path from md5 of "scope:lfn"
                    # NOTE(review): local name "hash" shadows the builtin
                    scope = fileSpec.scope
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    #srcURL = fileSpec.path
                    srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_srcPath,
                                                                               scope=correctedscope,
                                                                               hash1=hash_hex[0:2],
                                                                               hash2=hash_hex[2:4],
                                                                               lfn=fileSpec.lfn)
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                               scope=correctedscope,
                                                                               hash1=hash_hex[0:2],
                                                                               hash2=hash_hex[2:4],
                                                                               lfn=fileSpec.lfn)
                    tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL))
                    tdata.add_item(srcURL,dstURL)
                # submit transfer
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(self.dummy_transfer_id)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
            if not self.have_db_lock:
                msgStr += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                tmpLog.error(msgStr)
            # return None to retry later (even after a successful submission,
            # so the next cycle checks the real transfer ID)
            return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    groups = jobspec.get_groups_of_input_files(skip_ready=True)
    for transferID in groups:
        if transferID != self.dummy_transfer_id:
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task'
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_status(jobspec,'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec,'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
def submit_workers(self, workspec_list):
    """Submit a list of workers to HTCondor.

    Pipeline: (1) prepare per-worker submission data in a thread pool
    (_handle_one_worker), (2) submit each prepared worker via the
    module-level submit_a_worker helper, (3) propagate the attributes
    returned by submission back onto each workspec.

    :param workspec_list: list of WorkSpec objects to submit
    :return: list of per-worker (status, message) results
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')
    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))
    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    try:
        os.mkdir(log_subdir_path)
    except OSError as e:
        # already-existing directory is fine; anything else is fatal
        if e.errno != errno.EEXIST:
            raise
        else:
            pass
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # Build the submission data dict for a single worker: choose a CE
        # (ATLAS Grid CE mode), read the SDF template, and derive log URLs.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                           or this_panda_queue_dict.get('capability', '') == 'ucore'
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                # keep only ACTIVE CEs of a supported flavour
                if not (_queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                # prefer a named queue over the 'default' entry for the same endpoint
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            # NOTE(review): n_qualified_ce is never used afterwards
            n_qualified_ce = len(ce_auxilary_dict)
            queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
            worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
            ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                 queue_status_dict=queue_status_dict,
                                                 worker_ce_stats_dict=worker_ce_stats_dict)
            # good CEs which can be submitted to, duplicate by weight
            good_ce_weighted_list = []
            for _ce_endpoint in ce_auxilary_dict.keys():
                good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
            tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                            queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
            # weighted random choice of CE; fall back to any known CE
            if len(good_ce_weighted_list) > 0:
                ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
            else:
                tmpLog.info('No good CE endpoint left. Choose an arbitrary CE endpoint')
                ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
            ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
            # NOTE(review): ce_version_str is never used afterwards
            ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
            # strip the ":port" suffix to get the bare hostname
            ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            # pick the flavour-specific SDF template when a template dir is configured
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
        # template for batch script
        tmpFile = open(self.templateFile)
        sdf_template = tmpFile.read()
        tmpFile.close()
        # get batch_log, stdout, stderr filename from the template
        # NOTE(review): batch_log_value/stdout_value/stderr_value stay unbound
        # if the template lacks log/output/error lines - TODO confirm templates
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue
        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue
        # URLs for log files
        if not (self.logBaseURL is None):
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
            batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
            batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file before submission')
        tmpLog.debug('Done jobspec attribute setting')
        # set data dict consumed by submit_a_worker
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': self.condorSchedd,
                'condor_pool': self.condorPool,
                }
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy the attribute dict returned by submission onto the workspec.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')
    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))
    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))
    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)
    tmpLog.debug('done')
    return retList
def submit_k8s_worker(self, work_spec):
    """Submit a single worker as a Kubernetes job.

    Reads the job yaml template, resolves container image / executable /
    proxy, and asks the k8s client to create the job.

    :param work_spec: WorkSpec of the worker to submit
    :return: (status, error_string) tuple; (True, '') on success
    """
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))

    # TODO: consider if we want to upload the yaml file to PanDA cache
    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = panda_queues_dict.get(self.queueName)['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # submit the worker
        # NOTE(review): rsp is unused; batchID below is taken from the input
        # yaml_content rather than yaml_content_final returned here - confirm
        # the job name is not modified during creation
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, container_image,
            executable, args, cert, cert_in_secret=use_secret,
            cpu_adjust_ratio=self.cpuAdjustRatio,
            memory_adjust_ratio=self.memoryAdjustRatio, max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        # remember the k8s job name as the batch ID
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')
    return tmp_return_value
def main():
    """Deploy harvester onto a remote machine over SSH/SFTP.

    Downloads harvester and all of its pip dependencies locally, copies
    the wheels/tarballs to a remote build directory, and pip-installs them
    there with --no-index. Connection parameters come from the middleware
    section of the queue configuration.
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument('--remoteDir', action='store', dest='remoteDir', default='harvester',
                        help='directory on the remote target machine where harvester is installed')
    parser.add_argument('--remoteBuildDir', action='store', dest='remoteBuildDir', default='harvester_build',
                        help='directory on the remote target machine where harvester is build')
    parser.add_argument('--remotePythonSetup', action='store', dest='remotePythonSetup', default='',
                        help='python setup on remote target machine')
    parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware', action='store', dest='middleware', default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()
    # remove ~/ which doesn't work with sftp
    options.remoteDir = re.sub('^~/', '', options.remoteDir)
    options.remoteBuildDir = re.sub('^~/', '', options.remoteBuildDir)
    # get queue
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName))
        sys.exit(1)
    # get middleware
    if not hasattr(queueConfig, options.middleware):
        print('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware, options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)
    # get ssh parameters; optional keys fall back to defaults
    sshHost = middleware['remoteHost']
    try:
        sshPort = middleware['remotePort']
    except Exception:
        sshPort = 22
    sshUserName = middleware['sshUserName']
    try:
        sshPassword = middleware['sshPassword']
    except Exception:
        sshPassword = None
    privateKey = None
    passPhrase = None
    # key-based auth is required when no password is configured
    if sshPassword is None:
        try:
            privateKey = middleware['privateKey']
        except Exception:
            print("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware))
            sys.exit(1)
        try:
            passPhrase = middleware['passPhrase']
        except Exception:
            passPhrase = None
    try:
        jumpHost = middleware['jumpHost']
    except Exception:
        jumpHost = None
    try:
        jumpPort = middleware['jumpPort']
    except Exception:
        jumpPort = 22
    # ssh
    sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword,
                                    passPhrase, privateKey, jumpHost, jumpPort)
    # get remote python version (e.g. "27" or "36"), used for pip download
    # NOTE(review): exec_out[1].read() returns bytes on Python 3 - confirm
    # this tool runs under Python 2 or decode before formatting
    exec_out = sshClient.exec_command(';'.join([
        options.remotePythonSetup,
        """python -c 'import sys;print("{0}{1}".format(*(sys.version_info[:2])))' """
    ]))
    remotePythonVer = exec_out[1].read().rstrip()
    sshClient.close()
    print('remote python version : {0}'.format(remotePythonVer))
    # make tmp dir
    with TemporaryDirectory() as tmpDir:
        harvesterGit = "git+git://github.com/PanDAWMS/panda-harvester.git"
        # get all dependencies; the downloaded files are deleted right away -
        # only the package names from pip's output are kept
        print("getting dependencies")
        p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(tmpDir, harvesterGit),
                             stdout=subprocess.PIPE, shell=True)
        stdout, stderr = p.communicate()
        packages = []
        for line in stdout.split('\n'):
            if line.startswith('Successfully downloaded'):
                packages = line.split()[2:]
        packages.append(harvesterGit)
        packages.append('pip')
        # harvester itself is re-downloaded from git, not from the dependency list
        # NOTE(review): raises ValueError if 'pandaharvester' is not in the list
        packages.remove('pandaharvester')
        # download packages for the remote python version
        print("pip download to {0}".format(tmpDir))
        for package in packages:
            print("getting {0}".format(package))
            ret = subprocess.call("pip download --no-deps --python-version {0} -d {1} {2}".format(remotePythonVer, tmpDir, package),
                                  shell=True)
            if ret != 0:
                print("ERROR: failed to download {0}".format(package))
                sys.exit(1)
        # sftp the downloaded files to a clean remote build dir
        sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword,
                                        passPhrase, privateKey, jumpHost, jumpPort)
        try:
            sshClient.exec_command('rm -rf {0}; mkdir -p {0}'.format(options.remoteBuildDir))
        except Exception:
            pass
        sftp = sshClient.open_sftp()
        for name in os.listdir(tmpDir):
            path = os.path.join(tmpDir, name)
            if os.path.isdir(path):
                continue
            remotePath = os.path.join(options.remoteBuildDir, name)
            print("copy {0} to {1}".format(name, remotePath))
            sftp.put(path, remotePath)
        # install from the copied files only (no index access on the remote side)
        print("install harvester")
        buildDir = options.remoteBuildDir
        if not buildDir.startswith('/'):
            buildDir = '~/' + buildDir
        exec_out = sshClient.exec_command(';'.join([
            options.remotePythonSetup,
            'cd {0}'.format(options.remoteDir),
            'pip install pip pandaharvester --no-index --find-links {0}'.format(buildDir)
        ]))
        print(exec_out[1].read())
        print(exec_out[2].read())
        sshClient.close()
def trigger_stage_out(self, jobspec):
    """Stage out the job's output files by direct filesystem copy.

    Copies each output file from fileSpec.path to an RSE-style destination
    <RSE_dstPath>/<scope>/<md5[0:2]>/<md5[2:4]>/<lfn>, creating destination
    directories (group-writable, setgid) as needed.

    :param jobspec: job specification with output FileSpecs
    :return: (True, '') on success
    :raises Error: when one or more copies failed
    """
    # make logger
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,
                                                                threading.current_thread().ident),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # FIX: was "self.RSE_dstpath" (lower-case p) while the copy loop reads
    # self.RSE_dstPath - the configured destination path was never applied
    self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            # FIX: dropped the no-op .format(fileSpec.lfn, fileSpec.scope) -
            # the message has no placeholders
            msgStr = "printed first 25 files skipping the rest"
            tmpLog.debug(msgStr)
        # rucio-style deterministic path from the md5 of "scope:lfn"
        # (renamed from "hash", which shadowed the builtin)
        hash_obj = hashlib.md5()
        hash_obj.update('%s:%s' % (scope, fileSpec.lfn))
        hash_hex = hash_obj.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # copy the source file from source to destination skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # Set the file spec status
            if self.changeFileStatusOnSuccess:
                fileSpec.status = 'finished'
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        # group-writable with setgid so later writers share the group
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # Set the file spec status
                    if self.changeFileStatusOnSuccess:
                        self.set_FileSpec_status(jobspec, 'finished')
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0], error[1], error[2]))
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return True, ''
def trigger_stage_out(self, jobspec):
    """Trigger a bulk Globus transfer of the job's output files.

    Verifies endpoint activation, builds rucio-style destination paths for
    every output file, and submits one labelled transfer task; the task ID
    is recorded on the job's file groups and on each FileSpec.

    :param jobspec: job specification with output files to stage out
    :return: (status, message) - True on success or already queued,
             False on a hard error, the handler's status on Globus errors
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
    self.srcEndpoint = queueConfig.stager['srcEndpoint']
    self.Globus_srcPath = self.basePath
    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
    self.dstEndpoint = queueConfig.stager['dstEndpoint']
    # Test the endpoints and create the transfer data class
    errMsg = None
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errMsg = ''
            if not tmpStatsrc:
                errMsg += ' source Endpoint not activated '
            if not tmpStatdst:
                errMsg += ' destination Endpoint not activated '
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
        # both endpoints activated now prepare to transfer data
        tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint,
                             label=label, sync_level="checksum")
    except Exception:
        # FIX: was a bare "except:" clause
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        tmpRetVal = (errStat, errMsg)
        return tmpRetVal
    # loop over all files
    fileAttrs = jobspec.get_output_file_attributes()
    lfns = []
    for fileSpec in jobspec.outFiles:
        scope = fileAttrs[fileSpec.lfn]['scope']
        # rucio-style deterministic path from the md5 of "scope:lfn"
        # (renamed from "hash", which shadowed the builtin)
        hash_obj = hashlib.md5()
        hash_obj.update('%s:%s' % (scope, fileSpec.lfn))
        hash_hex = hash_obj.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # add files to transfer object - tdata
        if os.access(srcURL, os.R_OK):
            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
            tdata.add_item(srcURL, dstURL)
            lfns.append(fileSpec.lfn)
        else:
            errMsg = "source file {} does not exist".format(srcURL)
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
    # submit transfer
    try:
        transfer_result = self.tc.submit_transfer(tdata)
        # check status code and message
        tmpLog.debug(str(transfer_result))
        if transfer_result['code'] == "Accepted":
            # succeeded - record the transfer ID, used for later lookup
            transferID = transfer_result['task_id']
            tmpLog.debug('successfully submitted id={0}'.format(transferID))
            jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
            # set the transfer ID on each output FileSpec
            for fileSpec in jobspec.outFiles:
                if fileSpec.fileAttributes is None:  # FIX: was "== None"
                    fileSpec.fileAttributes = {}
                fileSpec.fileAttributes['transferID'] = transferID
        else:
            tmpRetVal = (False, transfer_result['message'])
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        if errMsg is None:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = "{0} {1}".format(errtype.__name__, errvalue)
        tmpRetVal = (errStat, errMsg)
    # return
    tmpLog.debug('done')
    return tmpRetVal
def submit_workers(self, workspec_list):
    """Submit workers to ARC CEs (aCT-style, jobs always pre-fetched).

    For each job of each worker: resolve the CEs from the cached panda
    queue info, convert the job parameters to XRSL, pick the proxy by
    prodSourceLabel, and submit via _arc_submit.

    :param workspec_list: list of WorkSpec objects with attached jobs
    :return: list of per-job (status, message) results
    """
    retlist = []
    # Get queue info from DB
    pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
    if pandaqueues is None:
        raise Exception("Failed to get panda queue info from database")
    pandaqueues = pandaqueues.data
    osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
    if osmap is None:
        raise Exception("Failed to get Object Store info from database")
    osmap = osmap.data
    for workspec in workspec_list:
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        # Assume for aCT that jobs are always pre-fetched (no late-binding)
        for jobspec in workspec.get_jobspec_list():
            tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))
            if jobspec.computingSite not in pandaqueues:
                retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                continue
            # Get CEs from panda queue info
            # List of (endpoint, queue) tuples
            arcces = []
            for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                ce_endpoint = endpoint['ce_endpoint']
                if not re.search('://', ce_endpoint):
                    ce_endpoint = 'gsiftp://%s' % ce_endpoint
                ce_queue = endpoint['ce_queue_name']
                arcces.append((ce_endpoint, ce_queue))
            if not arcces:
                # FIX: message was "No CEs defined for %{0}" - the stray '%'
                # produced "...for %SITE" after formatting
                retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                continue
            # Set true pilot or not
            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
            pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat
            # Set log URL for GTAG env in job description
            logbaseurl = queueconfig.submitter.get('logBaseURL')
            logsubdir = self._set_logdir(jobspec.computingSite)
            logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None
            tmplog.debug("Converting to ARC XRSL format")
            arcxrsl = ARCParser(jobspec.jobParams,
                                jobspec.computingSite,
                                pandaqueues[jobspec.computingSite],
                                logfileurl,
                                self.schedulerid,
                                osmap,
                                '/tmp',  # tmpdir, TODO common tmp dir
                                None,  #jobSpec.eventranges, # TODO event ranges
                                tmplog)
            arcxrsl.parse()
            xrsl = arcxrsl.getXrsl()
            tmplog.debug("ARC xrsl: {0}".format(xrsl))
            # Set the files to be downloaded at the end of the job
            downloadfiles = 'gmlog/errors'
            if 'logFile' in jobspec.jobParams:
                downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
            if not pandaqueues[jobspec.computingSite]['truepilot']:
                downloadfiles += ';jobSmallFiles.tgz'
            # Set certificate: user jobs run with the pilot proxy, the rest
            # with the production proxy
            userconfig = arc.UserConfig(self.cred_type)
            proxyrole = ''
            if jobspec.jobParams['prodSourceLabel'] == 'user':
                userconfig.ProxyPath(str(self.certs['pilot']))
                proxyrole = 'pilot'
            else:
                userconfig.ProxyPath(str(self.certs['production']))
                proxyrole = 'production'
            tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))
            try:
                tmplog.debug("Submission targets: {0}".format(arcces))
                arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                arc_utils.arcjob2workspec(arcjob, workspec)
                workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                workspec.workAttributes['proxyrole'] = proxyrole
                workspec.workAttributes['logsubdir'] = logsubdir
                workspec.batchID = arcjob.JobID
                tmplog.debug(workspec.workAttributes)
                result = (True, '')
            except Exception as exc:
                tmplog.error(traceback.format_exc())
                result = (False, "Failed to submit ARC job: {0}".format(str(exc)))
            retlist.append(result)
    return retlist
def __init__(self, **kwarg):
    """Construct the stager plugin.

    All keyword arguments are forwarded to the BaseStager constructor;
    a QueueConfigMapper is kept for later per-queue configuration lookups.
    """
    BaseStager.__init__(self, **kwarg)
    self.queue_config_mapper = QueueConfigMapper()
def check_status(self, jobspec):
    """Check the second-hop (Rucio) transfer status of a job's output files.

    Delegates first to GlobusBulkStager.check_status for the first hop.
    If that succeeded, each transfer group of the job is inspected:
    groups already 'hopped' are left alone, 'hopping' groups have their
    Rucio replication rule polled, and groups without a rule yet get a
    transient dataset plus a replication rule created and are marked
    'hopping'.

    :param jobspec: job specification whose output file groups are checked
    :return: (True, msg) when all transfers are done,
             (False, msg) on a fatal transfer failure,
             (None, msg) for temporary errors or transfers still running
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='check_status')
    tmpLog.debug('executing base check_status')
    tmpStat, tmpMsg = GlobusBulkStager.check_status(self, jobspec)
    tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg))
    if tmpStat is not True:
        # first hop not finished (or failed) - nothing more to do yet
        return tmpStat, tmpMsg
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        return tmpStat, tmpMsg
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if srcRSE is set
    # FIX: srcRSE must be pre-initialized, otherwise the "srcRSE is None"
    # test below raised NameError when the queue config had no srcRSE key
    srcRSE = None
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    # get destination endpoint - first 'pr' storage of the job's nucleus site
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # log source and destination RSEs (duplicate debug line removed)
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # test that srcRSE and dstRSE are defined
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Destination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    # NOTE(review): self.Yodajob is assumed to be initialized elsewhere
    # (e.g. in __init__) - confirm
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # create the Rucio Client
    try:
        rucioAPI = RucioClient()
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        # FIX: the original message formatted datasetScope/datasetName which
        # are defined only inside the loop below, so this path raised NameError
        tmpMsg = 'failed to create Rucio Client'
        return None, tmpMsg
    # loop over all transfers
    tmpStat = True
    tmpMsg = ''
    for transferID in groups:
        if transferID is None:
            continue
        datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, transferID)
        datasetScope = 'transient'
        # lock so only one thread works on this transfer group
        have_db_lock = self.dbInterface.get_object_lock(transferID, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(transferID)
            tmpLog.debug(msgStr)
            return None, msgStr
        # get transfer status
        groupStatus = self.dbInterface.get_file_group_status(transferID)
        if 'hopped' in groupStatus:
            # already succeeded
            pass
        elif 'failed' in groupStatus:
            # transfer failure
            tmpStat = False
            tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName)
        elif 'hopping' in groupStatus:
            # check rucio rule
            ruleStatus = 'FAILED'
            try:
                tmpLog.debug('check state for {0}:{1}'.format(datasetScope, datasetName))
                for ruleInfo in rucioAPI.list_did_rules(datasetScope, datasetName):
                    if ruleInfo['rse_expression'] != dstRSE:
                        continue
                    ruleStatus = ruleInfo['state']
                    tmpLog.debug('got state={0}'.format(ruleStatus))
                    if ruleStatus == 'OK':
                        break
            except DataIdentifierNotFound:
                tmpLog.error('dataset not found')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                ruleStatus = None
            if ruleStatus in ['FAILED', 'CANCELED']:
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(datasetScope, datasetName, ruleStatus)
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'failed')
            elif ruleStatus == 'OK':
                # update successful file group status
                self.dbInterface.update_file_group_status(transferID, 'hopped')
            else:
                # replicating or temporary error
                tmpStat = None
                tmpMsg = 'replicating or temporary error for {0}:{1}'.format(datasetScope, datasetName)
        else:
            # no rule yet - make rucio rule
            fileSpecs = self.dbInterface.get_files_with_group_id(transferID)
            fileList = []
            for fileSpec in fileSpecs:
                tmpFile = dict()
                tmpFile['scope'] = datasetScope
                tmpFile['name'] = fileSpec.lfn
                tmpFile['bytes'] = fileSpec.fsize
                tmpFile['adler32'] = fileSpec.chksum
                if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                    tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                else:
                    tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                fileList.append(tmpFile)
                # fall back to the file's object store for the source RSE
                if srcRSE is None and fileSpec.objstoreID is not None:
                    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                    srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
            try:
                # register dataset
                tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                             .format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
                try:
                    rucioAPI.add_dataset(datasetScope, datasetName,
                                         meta={'hidden': True},
                                         lifetime=30 * 24 * 60 * 60,
                                         rse=srcRSE
                                         )
                except DataIdentifierAlreadyExists:
                    # ignore even if the dataset already exists
                    pass
                except Exception:
                    errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, datasetName, srcRSE)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.error(errMsg)
                    raise
                # add files to dataset, 500 files at a time
                numfiles = len(fileList)
                maxfiles = 500
                # FIX: floor division; '/' yields a float under Python 3 and
                # range(float) raises TypeError
                numslices = numfiles // maxfiles
                if (numfiles % maxfiles) > 0:
                    numslices = numslices + 1
                start = 0
                for i in range(numslices):
                    try:
                        stop = start + maxfiles
                        if stop > numfiles:
                            stop = numfiles
                        rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                         'name': datasetName,
                                                         'dids': fileList[start:stop],
                                                         'rse': srcRSE}],
                                                       ignore_duplicate=True)
                        start = stop
                    except FileAlreadyExists:
                        # ignore if files already exist
                        pass
                    except Exception:
                        errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, datasetName, srcRSE, fileList)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.error(errMsg)
                        return None, errMsg
                # add replication rule to the destination RSE
                try:
                    tmpDID = dict()
                    tmpDID['scope'] = datasetScope
                    tmpDID['name'] = datasetName
                    tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                           lifetime=30 * 24 * 60 * 60)
                    ruleIDs = tmpRet[0]
                    tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, str(ruleIDs)))
                except DuplicateRule:
                    # ignore duplicated rule
                    tmpLog.debug('rule is already available')
                except Exception:
                    errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.debug(errMsg)
                    return None, errMsg
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'hopping')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                # treat as a temporary error
                tmpStat = None
                tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        # release lock
        self.dbInterface.release_object_lock(transferID)
        # escape if already failed
        if tmpStat is False:
            break
    # all done
    if tmpStat is True:
        self.set_FileSpec_status(jobspec, 'finished')
    tmpLog.debug('done with {0} : {1}'.format(tmpStat, tmpMsg))
    return tmpStat, tmpMsg
def __init__(self, **kwarg):
    """Construct the plugin.

    The default logBaseURL is assigned before PluginBase.__init__ runs, so
    that a value coming in through the plugin configuration can replace it.
    # NOTE(review): default-before-init override pattern assumed - confirm
    """
    self.logBaseURL = 'http://localhost/test'
    PluginBase.__init__(self, **kwarg)
    self.queue_config_mapper = QueueConfigMapper()
timeout=self.__worker_update_timeout) tmp_log.debug('worker update for {0} ended with {1} {2}'.format( batch_id, r.status_code, r.text)) end_time = time.time() tmp_log.debug('done (took {0})'.format(end_time - start_time)) except: tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) if __name__ == "__main__": """ Quick tests """ from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper queue_config_mapper = QueueConfigMapper() apfmon = Apfmon(queue_config_mapper) apfmon.create_factory() apfmon.create_labels() worker_a = WorkSpec() worker_a.batchID = 1 worker_a.computingSite = 'CERN-PROD-DEV_UCORE' worker_a.computingElement = 'bla1' worker_a.workAttributes = { "batchLog": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log", "stdErr": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err", "stdOut":
def trigger_stage_out(self, jobspec):
    """Trigger stage-out of a job's output files via local copy + Rucio rule.

    Each output file is copied from fileSpec.path into a destination path
    built from the Rucio deterministic (md5 of scope:lfn) layout under the
    configured RSE_dstPath.  The copied files are collected into a new
    transient dataset and a replication rule to the destination RSE
    (derived from the job's nucleus) is created; the rule ID becomes the
    file group ID tracked in the harvester DB.

    :param jobspec: job specification with output FileSpecs to stage out
    :return: (True, msg) on success, (None, msg) on temporary error;
             raises on local copy errors
    """
    # make logger
    tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # initialize some values
    tmpStat = None
    tmpMsg = ''
    srcRSE = None
    dstRSE = None
    datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, str(uuid.uuid4()))
    datasetScope = 'transient'
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # get destination endpoint - first 'pr' storage of the job's nucleus site
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    # NOTE(review): self.Yodajob is assumed to be initialized elsewhere
    # (e.g. in __init__) - confirm
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # FIX: the destination path was stored in self.RSE_dstpath but read back
    # below as self.RSE_dstPath (AttributeError unless set elsewhere); use one
    # consistent name and keep the old lowercase alias for compatibility
    self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
    self.RSE_dstpath = self.RSE_dstPath
    # check queueConfig stager section to see if srcRSE is set
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    fileList = []
    lfns = []
    fileSpec_list = jobspec.get_output_file_specs(skip_done=False)
    msgStr = '#(jobspec.get_output_file_specs(skip_done=False)) = {0}'\
        .format(len(fileSpec_list))
    tmpLog.debug(msgStr)
    # first pass - informational logging only
    for fileSpec in fileSpec_list:
        msgstr = 'fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}'\
            .format(datasetScope, fileSpec.lfn, fileSpec.fsize, fileSpec.chksum)
        if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
            msgstr += ' guid - {0}'.format(fileSpec.fileAttributes['guid'])
        tmpLog.debug(msgstr)
    #for fileSpec in jobspec.get_output_file_specs(skip_done=True):
    for fileSpec in jobspec.get_output_file_specs(skip_done=False):
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            # FIX: dropped a no-op .format(...) on a placeholder-free literal
            msgStr = "printed first 25 files skipping the rest"
            tmpLog.debug(msgStr)
        # deterministic (rucio-style) destination path from md5 of scope:lfn
        hasher = hashlib.md5()
        # FIX: hashlib requires bytes under Python 3 - encode explicitly
        # (also renamed to avoid shadowing the builtin 'hash')
        hasher.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
        hash_hex = hasher.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        tmpFile = dict()
        # copy the source file from source to destination, skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # save for adding to rucio dataset
            tmpFile['scope'] = datasetScope
            tmpFile['name'] = fileSpec.lfn
            tmpFile['bytes'] = fileSpec.fsize
            tmpFile['adler32'] = fileSpec.chksum
            if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
            else:
                tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
            tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
            fileList.append(tmpFile)
            lfns.append(fileSpec.lfn)
            # get source RSE if not already set
            if srcRSE is None and fileSpec.objstoreID is not None:
                ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # save for adding to rucio dataset
                    tmpFile['scope'] = datasetScope
                    tmpFile['name'] = fileSpec.lfn
                    tmpFile['bytes'] = fileSpec.fsize
                    tmpFile['adler32'] = fileSpec.chksum
                    if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                        tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                    else:
                        tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                    tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                    fileList.append(tmpFile)
                    lfns.append(fileSpec.lfn)
                    # get source RSE if not already set
                    if srcRSE is None and fileSpec.objstoreID is not None:
                        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                        srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                        tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # test that srcRSE and dstRSE are defined
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Destination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # test to see if there are any files to add to dataset
    if len(fileList) == 0:
        errStr = 'There are no files to add to database'
        tmpLog.error(errStr)
        return None, errStr
    # print out the file list
    tmpLog.debug('fileList - {0}'.format(fileList))
    # create the dataset, add files to it and create a transfer rule
    try:
        # register dataset
        rucioAPI = RucioClient()
        tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                     .format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
        try:
            rucioAPI.add_dataset(datasetScope, datasetName,
                                 meta={'hidden': True},
                                 lifetime=30 * 24 * 60 * 60,
                                 rse=srcRSE
                                 )
        except DataIdentifierAlreadyExists:
            # ignore even if the dataset already exists
            pass
        except Exception:
            errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, datasetName, srcRSE)
            core_utils.dump_error_message(tmpLog)
            tmpLog.error(errMsg)
            return None, errMsg
        # add files to dataset, 500 files at a time
        numfiles = len(fileList)
        maxfiles = 500
        # FIX: floor division; '/' yields a float under Python 3 and
        # range(float) raises TypeError
        numslices = numfiles // maxfiles
        if (numfiles % maxfiles) > 0:
            numslices = numslices + 1
        start = 0
        for i in range(numslices):
            try:
                stop = start + maxfiles
                if stop > numfiles:
                    stop = numfiles
                rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                 'name': datasetName,
                                                 'dids': fileList[start:stop],
                                                 'rse': srcRSE}],
                                               ignore_duplicate=True)
                start = stop
            except FileAlreadyExists:
                # ignore if files already exist
                pass
            except Exception:
                errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, datasetName, srcRSE, fileList)
                core_utils.dump_error_message(tmpLog)
                tmpLog.error(errMsg)
                return None, errMsg
        # add rule
        ruleIDs = None
        try:
            tmpDID = dict()
            tmpDID['scope'] = datasetScope
            tmpDID['name'] = datasetName
            tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                   lifetime=30 * 24 * 60 * 60)
            ruleIDs = tmpRet[0]
            tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, str(ruleIDs)))
            # group the output files together by the Rucio transfer rule
            jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns, 'groupStatus': 'pending'}})
            msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs, lfns)
            tmpLog.debug(msgStr)
            tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)')
            tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True), ruleIDs, 'transferring')
            tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
            tmpStat = True
            tmpMsg = 'created Rucio rule successfully'
        except DuplicateRule:
            # ignore duplicated rule
            tmpLog.debug('rule is already available')
        except Exception:
            errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
            core_utils.dump_error_message(tmpLog)
            tmpLog.debug(errMsg)
            return None, errMsg
        # update file group status
        # FIX: ruleIDs was unbound here when DuplicateRule was raised above,
        # turning a benign duplicate rule into a NameError; guard on it
        if ruleIDs is not None:
            self.dbInterface.update_file_group_status(ruleIDs, 'transferring')
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        tmpStat = None
        tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0], error[1], error[2]))
        # NOTE(review): 'Error' is not defined in this block; presumably
        # imported at file level (e.g. shutil.Error) - confirm
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
def submit_workers(self, workspec_list):
    """Submit workers to HTCondor (optionally via ATLAS Grid CEs).

    Prepares per-worker submission data (SDF template, chosen CE, chosen
    schedd/pool, log URLs) in parallel threads, submits via the
    module-level submit_a_worker, then propagates the returned attributes
    back onto each workspec.

    :param workspec_list: list of WorkSpec objects to submit
    :return: list of per-worker return values from submit_a_worker
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp (one dir per hour)
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    # create the log dir locally only when not spooling to a remote schedd
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            # already existing is fine; anything else is fatal
            if e.errno != errno.EEXIST:
                raise
            else:
                pass

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    # (treat a falsy corecount - None/0 - as 1)
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'

    # get override requirements from queue configured
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # If ATLAS Grid CE mode used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            # keep only active CEs of a supported flavour
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            # prefer a named queue over the 'default' one for the same endpoint
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # Get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # Build the submission data dict for one worker (CE choice, SDF
        # template, schedd/pool choice, log-file URLs); returns the dict.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit,}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                # NOTE(review): ce_version_str is currently unused
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                # strip a trailing ':port' to get the bare hostname
                ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                # pick a per-flavour SDF template when a template dir is configured
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    # ceHostname/ceEndpoint not configured - leave dict empty
                    pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                # NOTE(review): batch_log_value/stdout_value/stderr_value stay
                # unbound if the template lacks log/output/error lines; they are
                # only read below when log_base_url is set - confirm templates
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # Choose from Condor schedd and central managers
                if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
                    if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                        condor_schedd, condor_pool = random.choice(list(zip(self.condorSchedd, self.condorPool)))
                    else:
                        condor_schedd = random.choice(self.condorSchedd)
                        condor_pool = self.condorPool
                else:
                    condor_schedd = self.condorSchedd
                    condor_pool = self.condorPool
                # Log Base URL: substitute the chosen schedd's hostname
                # (user@ prefix and :port suffix stripped) for the placeholder
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if not (log_base_url is None):
                    # without a batchID yet, filenames must be guessed
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
            # set data dict
            data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'executable_file': self.executableFile,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': self.x509UserProxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy the attribute dict returned by submit_a_worker back onto
        # the workspec; returns the submission return value.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
def check_stage_out_status(self, jobspec):
    """Check Rucio transfer status for the output files of one job.

    Resolves the destination object store for the job's nucleus from the
    cached AGIS data, tags every output FileSpec with that objstoreID, then
    polls the Rucio replication rule of each output file group and updates
    file/group status accordingly.

    :param jobspec: job specification whose output file transfers are checked
    :return: (True, '') when all rules are complete,
             (False, msg) on a failed transfer,
             (None, msg) when the check should be retried later
    """
    tmpStat = True
    tmpMsg = ''
    # per-job logger; thread id included since many checker threads run at once
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,
                                                                threading.current_thread().ident),
                              method_name='check_stage_out_status')
    tmpLog.debug('start')
    # the computing site is needed to pick the right queue configuration
    if jobspec.computingSite is None:
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # Yoda jobs use a special path convention.
    # NOTE(review): self.Yodajob is only ever set to True here; it is assumed
    # to be initialised (presumably False) elsewhere, e.g. in __init__ -- confirm.
    if 'jobtype' in queueConfig.stager and queueConfig.stager['jobtype'] == "Yoda":
        self.Yodajob = True
    # resolve the destination RSE for the job's nucleus from the AGIS cache
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # map the RSE name to its object store ID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # the output files are grouped by Rucio rule ID
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        tmpLog.debug('No Rucio Rules')
        return None, 'No Rucio Rules'
    tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups), groups))
    try:
        rucioAPI = RucioClient()
    except Exception:
        # transient client problem -- ask the caller to retry later
        tmpLog.error('failure to get Rucio Client try again later')
        return None, 'failure to get Rucio Client try again later'
    # loop over the Rucio rules
    for rucioRule in groups:
        if rucioRule is None:
            continue
        # serialize checks of the same rule across harvester threads
        have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(rucioRule)
            tmpLog.debug(msgStr)
            return None, msgStr
        # status of the file group as recorded in the harvester DB
        groupStatus = self.dbInterface.get_file_group_status(rucioRule)
        tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule, groupStatus))
        if 'transferred' in groupStatus:
            # already succeeded - set the fileSpec status for these files
            self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        elif 'failed' in groupStatus:
            # transfer failure recorded earlier
            # (bug fix: the original formatted undefined names datasetScope and
            # datasetName here, raising NameError; report the rule id instead)
            tmpStat = False
            tmpMsg = 'rucio rule for {0} already failed'.format(rucioRule)
        elif 'transferring' in groupStatus or 'pending' in groupStatus:
            # transfer started in Rucio - poll the rule state
            try:
                result = rucioAPI.get_replication_rule(rucioRule, False)
                if result['state'] == "OK":
                    # files transfered to nucleus
                    tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule))
                    self.dbInterface.update_file_group_status(rucioRule, 'transferred')
                    # set the fileSpec status for these files
                    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                    self.set_FileSpec_status(jobspec, 'finished')
                elif result['state'] == "FAILED":
                    # failed Rucio Transfer
                    tmpStat = False
                    tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
                    self.set_FileSpec_status(jobspec, 'failed')
                elif result['state'] == 'STUCK':
                    tmpStat = None
                    tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
            except Exception:
                # lookup failed -- treat as "check again later"
                tmpStat = None
                tmpMsg = 'Could not get information or Rucio Rule {0}'.format(rucioRule)
                tmpLog.error(tmpMsg)
        # release the lock
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule))
            release_db_lock = self.dbInterface.release_object_lock(rucioRule)
            if release_db_lock:
                tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule))
                have_db_lock = False
            else:
                msgStr = ' Could not release DB lock for {}'.format(rucioRule)
                tmpLog.error(msgStr)
                return None, msgStr
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
fork_child_pid = os.fork() if fork_child_pid != 0: signal_utils.set_suicide_handler(None) os.wait() else: if len(sys.argv) not in (2, 4): print("Wrong number of parameters. You can either:") print(" - specify the queue name") print( " - specify the queue name, jobType (managed, user) and resourceType (SCORE, SCORE_HIMEM, MCORE, MCORE_HIMEM)" ) sys.exit(0) queueName = sys.argv[1] queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) if queueConfig.prodSourceLabel in ('user', 'managed'): jobType = queueConfig.prodSourceLabel else: jobType = 'managed' # default, can be overwritten by parameters resourceType = 'SCORE' # default, can be overwritten by parameters if len(sys.argv) == 4: # jobType should be 'managed' or 'user'. If not specified will default to a production job if sys.argv[2] in ('user', 'managed'): jobType = sys.argv[2] else: print('value for jobType not valid, defaulted to {0}'.format(
def __init__(self, **kwarg):
    """Initialise the plugin and attach a queue-configuration mapper.

    All keyword arguments are forwarded unchanged to PluginBase.
    """
    # let the plugin base class consume the constructor keywords first
    PluginBase.__init__(self, **kwarg)
    # mapper used later to look up per-queue configuration
    mapper = QueueConfigMapper()
    self.queue_config_mapper = mapper