def test():
    '''Smoke test: build a JobSpec from a canned PanDA job description and
    submit one worker to the ARC-TEST queue via the configured worker maker.

    Relies on module-level QueueConfigMapper and ARCSubmitter being in scope.
    FIX: use print() function calls instead of Python-2-only print statements
    (the single-argument form behaves identically on Python 2 and 3).
    '''
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    import json

    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()

    # Canned job description as received from the PanDA server (JSON text)
    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)

    # Build the job spec and bind it to the test queue
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]

    # Create a worker for the job list via the queue's configured worker maker
    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)
    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)

    # Submit and report the outcome
    sub = ARCSubmitter()
    print(sub.submit_workers([wspec]))
    print(wspec.batchID)
def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Fetch job output and process pilot info for sending in final heartbeat.
    The pilot pickle is loaded and some attributes corrected (schedulerid,
    pilotlog etc), then converted to dictionary and stored in
    workspec.workAttributes[pandaid]. If pilot pickle cannot be used, report
    ARC error in pilotErrorDiag and fill all possible attributes using ARC
    information.

    :param workspec: worker to post-process (workAttributes must contain
        'arcjob', 'proxyrole' and 'logsubdir')
    :param jobspec_list: jobs mapped to this worker (one-to-one mapping assumed)
    :param map_type: worker-to-job map type (unused here)
    '''
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log
    tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
    job = workspec.workAttributes['arcjob']
    proxyrole = workspec.workAttributes['proxyrole']
    arcid = job['JobID']
    tmplog.info('Job id {0}'.format(arcid))

    if 'arcdownloadfiles' not in workspec.workAttributes:
        tmplog.error('No files to download')
        return

    # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
    # it means the job was cancelled by panda or otherwise forgotten
    if not jobspec_list:
        return

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # FIX: original logged job.JobID but job is a dict (see job['JobID']
        # above), which would raise AttributeError inside the handler; also
        # narrowed the bare except so KeyboardInterrupt/SystemExit propagate.
        tmplog.error("Job {0}: no proxy found with role {1}".format(arcid, proxyrole))
        return

    queueconfigmapper = QueueConfigMapper()
    queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
    logbaseurl = queueconfig.submitter.get('logBaseURL')
    logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
    logsubdir = workspec.workAttributes['logsubdir']
    pandaid = str(jobspec_list[0].PandaID)

    # Construct log path and url
    logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
    logdir = os.path.join(logbasedir, logsubdir)

    # post_processing is only called once, so no retries are done. But keep
    # the possibility here in case it changes
    (fetched, notfetched, notfetchedretry) = self._download_outputs(workspec.workAttributes['arcdownloadfiles'],
                                                                    logdir, arcid, pandaid, userconfig, tmplog)
    if arcid not in fetched:
        tmplog.warning("Could not get outputs of {0}".format(arcid))

    # Store the (fixed-up) pilot info keyed by panda id for the final heartbeat
    # (removed a dead '= {}' assignment that was immediately overwritten)
    workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(job, pandaid, (arcid in fetched), logurl, tmplog)
    tmplog.debug("pilot info for {0}: {1}".format(pandaid, workspec.workAttributes[long(pandaid)]))
class GoogleSubmitter(PluginBase):
    """
    Plug-in for Google Cloud Engine VM submission. In this case the worker will abstract a VM running a job
    """

    def __init__(self, **kwarg):
        self.logBaseURL = 'http://localhost/test'
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    def submit_workers(self, work_spec_list):
        """Create one VM per worker and attach log-file URLs to each worker.

        :param work_spec_list: list of workers to submit
        :return: list of per-worker (return code, message) tuples
        """
        tmp_log = self.make_logger(base_logger, method_name='submit_workers')
        tmp_log.debug('start nWorkers={0}'.format(len(work_spec_list)))

        ret_list = []
        if not work_spec_list:
            tmp_log.debug('empty work_spec_list')
            return ret_list

        # All workers in a single call are assumed to belong to the same queue,
        # so resolve the queue configuration once from the first worker.
        queue_config = self.queue_config_mapper.get_queue(work_spec_list[0].computingSite)

        # Create the VMs sequentially: the Cloud API client shows
        # authentication issues when driven from a multiprocess pool.
        creation_results = [create_vm(spec, queue_config) for spec in work_spec_list]

        # Push back attributes changed during creation and register log URLs.
        for spec, outcome in zip(work_spec_list, creation_results):
            ok, changed_attrs = outcome
            spec.set_attributes_with_dict(changed_attrs)
            spec.set_log_file('batch_log', '{0}/{1}.log'.format(self.logBaseURL, spec.batchID))
            spec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, spec.batchID))
            spec.set_log_file('stderr', '{0}/{1}.err'.format(self.logBaseURL, spec.batchID))
            ret_list.append(ok)

        tmp_log.debug('done')
        return ret_list
class GoogleSweeper(PluginBase):
    """
    Sweeper with kill/clean-up functions for Google Compute Engine
    """

    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    def kill_worker(self, work_spec):
        """
        Sends the command to Google to destroy a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        try:
            vm_name = work_spec.batchID
            queue_config = self.queue_config_mapper.get_queue(work_spec.computingSite)
            # zone can be overridden per queue; fall back to the module default
            try:
                zone = queue_config.zone
            except AttributeError:
                zone = ZONE
            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
            return True, ''
        except googleapiclient.errors.HttpError as e:
            if 'was not found' in e.content:
                # the VM was already killed or does not exist for any other reason.
                # FIX: original format string had no placeholder, so vm_name was dropped
                message = 'VM {0} does not exist'.format(vm_name)
                base_logger.debug(message)
                return True, message
            else:
                # there was an issue killing the VM and it should be retried at another time
                return False, 'Problems killing the VM: {0}'.format(e)
        except Exception as e:
            return False, 'Problems killing the VM: {0}'.format(e)

    def sweep_worker(self, work_spec):
        """
        In the cloud, cleaning means destroying a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        return self.kill_worker(work_spec)
def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Fetch job output and process pilot info for sending in final heartbeat.
    The pilot pickle is loaded and some attributes corrected (schedulerid,
    pilotlog etc), then converted to dictionary and stored in
    workspec.workAttributes[pandaid]. If pilot pickle cannot be used, report
    ARC error in pilotErrorDiag and fill all possible attributes using ARC
    information.

    :param workspec: worker to post-process (workAttributes must contain
        'arcjob' and 'logsubdir')
    :param jobspec_list: jobs mapped to this worker (one-to-one mapping assumed)
    :param map_type: worker-to-job map type (unused here)
    :return: True (post-processing is one-shot; failures are logged, not retried)
    '''
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log
    tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
    job = workspec.workAttributes['arcjob']
    arcid = job['JobID']
    tmplog.info('Job id {0}'.format(arcid))

    if 'arcdownloadfiles' not in workspec.workAttributes:
        tmplog.error('No files to download')
        return True

    # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
    # it means the job was cancelled by panda or otherwise forgotten
    if not jobspec_list:
        return True

    # Set certificate to use for interacting with ARC CE
    userconfig = arc.UserConfig(self.cred_type)
    # FIX: was self._setup_proxy(usercfg, ...) — NameError, the local is 'userconfig'
    if not self._setup_proxy(userconfig, workspec, arcid, tmplog):
        return True

    queueconfigmapper = QueueConfigMapper()
    queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
    logbaseurl = queueconfig.submitter.get('logBaseURL')
    logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
    logsubdir = workspec.workAttributes['logsubdir']
    pandaid = str(jobspec_list[0].PandaID)

    # Construct log path and url
    logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
    logdir = os.path.join(logbasedir, logsubdir)

    # post_processing is only called once, so no retries are done. But keep
    # the possibility here in case it changes
    (fetched, notfetched, notfetchedretry) = self._download_outputs(workspec.workAttributes['arcdownloadfiles'],
                                                                    logdir, arcid, pandaid, userconfig, tmplog)
    if arcid not in fetched:
        tmplog.warning("Could not get outputs of {0}".format(arcid))

    # Store the (fixed-up) pilot info keyed by panda id for the final heartbeat
    # (removed a dead '= {}' assignment that was immediately overwritten)
    workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(job, pandaid, (arcid in fetched), logurl, tmplog)
    tmplog.debug("pilot info for {0}: {1}".format(pandaid, workspec.workAttributes[long(pandaid)]))
    return True
def submit_workers(self, workspec_list):
    """Insert one aCT job per worker and record the resulting batch id.

    Works in push mode (worker carries a JobSpec, url-encoded job parameters
    are sent) or pull mode (only pandaid/prodsourcelabel are sent). Returns a
    list of per-worker (success, message) tuples.
    """
    results = []
    for work_spec in workspec_list:
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(work_spec.workerID),
                                        method_name='submit_workers')
        queue_config = QueueConfigMapper().get_queue(work_spec.computingSite)
        source_label = queue_config.get_source_label()

        # If jobSpec is defined we are in push mode, if not pull mode.
        # Both assume one-to-one worker-to-job mapping.
        job_spec = work_spec.get_jobspec_list()
        if job_spec:
            job_spec = job_spec[0]
            tmpLog.debug("JobSpec: {0}".format(job_spec.values_map()))

        # Description record handed to the aCT database
        desc = {
            'pandastatus': 'sent',
            'actpandastatus': 'sent',
            'siteName': work_spec.computingSite,
            'proxyid': self.proxymap['pilot' if source_label == 'user' else 'production'],
            'sendhb': 0,
            'metadata': json.dumps({
                'harvesteraccesspoint': work_spec.get_access_point(),
                'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id),
            }),
        }

        if job_spec:
            # push mode: aCT takes the url-encoded job description (like it gets from panda server)
            pandaid = job_spec.PandaID
            act_job_desc = urllib.urlencode(job_spec.jobParams)
        else:
            # pull mode: just set pandaid (to workerid) and prodsourcelabel
            pandaid = work_spec.workerID
            act_job_desc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, source_label)

        tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
        try:
            act_batch_id = self.actDB.insertJob(pandaid, act_job_desc, desc)['LAST_INSERT_ID()']
        except Exception as e:
            outcome = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
        else:
            tmpLog.info("aCT batch id {0}".format(act_batch_id))
            work_spec.batchID = str(act_batch_id)
            # Register log-file URLs, grouped by day / site / panda id
            today = time.strftime('%Y-%m-%d', time.gmtime())
            logurl = '/'.join([queue_config.submitter.get('logBaseURL'), today,
                               work_spec.computingSite, str(pandaid)])
            work_spec.set_log_file('batch_log', '{0}.log'.format(logurl))
            work_spec.set_log_file('stdout', '{0}.out'.format(logurl))
            work_spec.set_log_file('stderr', '{0}.err'.format(logurl))
            outcome = (True, '')
        results.append(outcome)
    return results
class GoogleSweeper(PluginBase):
    """
    Sweeper with kill/clean-up functions for Google Compute Engine
    """

    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    def kill_worker(self, work_spec):
        """
        Sends the command to Google to destroy a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        try:
            vm_name = work_spec.batchID
            queue_config = self.queue_config_mapper.get_queue(work_spec.computingSite)
            # zone can be overridden per queue; fall back to the module default
            try:
                zone = queue_config.zone
            except AttributeError:
                zone = ZONE
            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
            return True, ''
        except googleapiclient.errors.HttpError as e:
            if 'was not found' in e.content:
                # the VM was already killed or does not exist for any other reason.
                # FIX: original format string had no placeholder, so vm_name was dropped
                message = 'VM {0} does not exist'.format(vm_name)
                base_logger.debug(message)
                return True, message
            else:
                # there was an issue killing the VM and it should be retried at another time
                return False, 'Problems killing the VM: {0}'.format(e)
        except Exception as e:
            return False, 'Problems killing the VM: {0}'.format(e)

    def sweep_worker(self, work_spec):
        """
        In the cloud, cleaning means destroying a VM

        :param work_spec: worker specification
        :type work_spec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        return self.kill_worker(work_spec)
class GoogleSubmitter(PluginBase):
    """
    Plug-in for Google Cloud Engine VM submission. In this case the worker will abstract a VM running a job
    """

    def __init__(self, **kwarg):
        self.logBaseURL = 'http://localhost/test'
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    def submit_workers(self, work_spec_list):
        """Spawn one VM per worker and attach batch/stdout/stderr log URLs.

        :param work_spec_list: list of workers to submit
        :return: list of per-worker (return code, message) tuples
        """
        tmp_log = self.make_logger(base_logger, method_name='submit_workers')
        tmp_log.debug('start nWorkers={0}'.format(len(work_spec_list)))

        ret_list = []
        if not work_spec_list:
            tmp_log.debug('empty work_spec_list')
            return ret_list

        # One queue per call: resolve its configuration from the first worker.
        queue_config = self.queue_config_mapper.get_queue(work_spec_list[0].computingSite)

        # VMs are created one by one — the Cloud API client has
        # authentication issues under a multiprocess pool.
        outcomes = [create_vm(spec, queue_config) for spec in work_spec_list]

        for spec, (ok, changed_attrs) in zip(work_spec_list, outcomes):
            # Propagate attributes changed during VM creation
            spec.set_attributes_with_dict(changed_attrs)
            spec.set_log_file('batch_log', '{0}/{1}.log'.format(self.logBaseURL, spec.batchID))
            spec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, spec.batchID))
            spec.set_log_file('stderr', '{0}/{1}.err'.format(self.logBaseURL, spec.batchID))
            ret_list.append(ok)

        tmp_log.debug('done')
        return ret_list
def submit_workers(self, workspec_list):
    """Prepare and submit a batch of workers to HTCondor.

    Three phases, each fanned out over a thread pool:
    1. _handle_one_worker: build the per-worker submission data (CE choice,
       SDF template, log-file URLs).
    2. submit_a_worker (module-level helper, defined elsewhere): the actual
       condor submission.
    3. _propagate_attributes: copy attributes returned by submission back
       onto each workspec.

    :param workspec_list: workers to submit
    :return: list of per-worker return values from submission
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')
    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))
    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    # create the log subdirectory; an already-existing directory is fine
    try:
        os.mkdir(log_subdir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        else:
            pass
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # Build the submission data dict for a single worker:
        # pick a CE (in ATLAS Grid CE mode), load the SDF template and
        # derive log-file names/URLs.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        # a queue is "unified" if flagged in catchall or capability is ucore
        is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                           or this_panda_queue_dict.get('capability', '') == 'ucore'
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            # Collect candidate CEs: only ACTIVE endpoints of a supported
            # flavour; a 'default'-named queue does not overwrite an entry
            # already recorded for the same endpoint.
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            # NOTE(review): n_qualified_ce is computed but not used below
            n_qualified_ce = len(ce_auxilary_dict)
            queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
            worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
            # per-CE weights from queue status and current worker stats
            ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                 queue_status_dict=queue_status_dict,
                                                 worker_ce_stats_dict=worker_ce_stats_dict)
            # good CEs which can be submitted to, duplicate by weight
            good_ce_weighted_list = []
            for _ce_endpoint in ce_auxilary_dict.keys():
                good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
            tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
            # weighted random choice; fall back to any CE if none has weight
            if len(good_ce_weighted_list) > 0:
                ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
            else:
                tmpLog.info('No good CE endpoint left. Choose an arbitrary CE endpoint')
                ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
            ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
            # NOTE(review): ce_version_str is computed but not used below
            ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
            # strip ':port' (and anything after the colon) to get the hostname
            ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            # pick the per-flavour SDF template if a template directory is configured
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
        # template for batch script
        tmpFile = open(self.templateFile)
        sdf_template = tmpFile.read()
        tmpFile.close()
        # get batch_log, stdout, stderr filename
        # (scan non-comment template lines for the condor log/output/error settings)
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue
        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue
        # URLs for log files
        if not (self.logBaseURL is None):
            # without a batchID yet, file names must be guessed
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
            batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
            batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file before submission')
        tmpLog.debug('Done jobspec attribute setting')
        # set data dict (consumed by submit_a_worker)
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': self.condorSchedd,
                'condor_pool': self.condorPool,
                }
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy attributes returned by submission back onto the workspec.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')
    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))
    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))
    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
        retList = list(retIterator)
    tmpLog.debug('done')
    return retList
def submit_k8s_worker(self, work_spec):
    """Submit a single worker as a Kubernetes job.

    Reads the job configuration (push mode only), decides container image and
    executable, chooses a proxy, then creates the k8s job from the YAML
    template via self.k8s_client.

    :param work_spec: worker specification to submit
    :return: tuple (success flag, error string — empty on success)
    """
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
    # TODO: consider if we want to upload the yaml file to PanDA cache
    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)
        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))
        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            # no usable proxy — refuse the submission
            err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value
        # get the walltime limit
        try:
            max_time = panda_queues_dict.get(self.queueName)['maxtime']
        except Exception as e:
            # missing/odd queue info: proceed without a walltime limit
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None
        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, container_image, executable, args, cert,
            cert_in_secret=use_secret, cpu_adjust_ratio=self.cpuAdjustRatio,
            memory_adjust_ratio=self.memoryAdjustRatio, max_time=max_time)
    except Exception as _e:
        # any failure during preparation or submission is reported, not raised
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        # NOTE(review): batchID is taken from the input yaml_content, not from
        # yaml_content_final returned by create_job_from_yaml — confirm the
        # job name is not modified during creation.
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')
    return tmp_return_value
def trigger_stage_out(self, jobspec):
    """Submit a Globus transfer for the job's output files.

    Checks endpoint activation, builds a TransferData with rucio-style
    destination paths (md5-hashed scope:lfn), submits the transfer and
    records the transfer ID on each output FileSpec.

    :param jobspec: job whose output files are to be staged out
    :return: tuple (success flag, error string — empty on success)
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), method_name='trigger_stage_out')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
    self.srcEndpoint = queueConfig.stager['srcEndpoint']
    self.Globus_srcPath = self.basePath
    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
    self.dstEndpoint = queueConfig.stager['dstEndpoint']
    # Test the endpoints and create the transfer data class
    errMsg = None
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errMsg = ''
            if not tmpStatsrc:
                errMsg += ' source Endpoint not activated '
            if not tmpStatdst:
                errMsg += ' destination Endpoint not activated '
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
        # both endpoints activated now prepare to transfer data
        tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, label=label, sync_level="checksum")
    except Exception:
        # FIX: narrowed from bare 'except:' so SystemExit/KeyboardInterrupt propagate
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        tmpRetVal = (errStat, errMsg)
        return tmpRetVal
    # loop over all files
    fileAttrs = jobspec.get_output_file_attributes()
    lfns = []
    for fileSpec in jobspec.outFiles:
        scope = fileAttrs[fileSpec.lfn]['scope']
        # rucio-style deterministic path: first bytes of md5("scope:lfn")
        # FIX: renamed 'hash' (shadowed the builtin) and encode before update()
        # so md5 works on Python 3 as well (identical digest on Python 2)
        hash_md5 = hashlib.md5()
        hash_md5.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
        hash_hex = hash_md5.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # add files to transfer object - tdata
        if os.access(srcURL, os.R_OK):
            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
            tdata.add_item(srcURL, dstURL)
            lfns.append(fileSpec.lfn)
        else:
            errMsg = "source file {} does not exist".format(srcURL)
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
    # submit transfer
    try:
        transfer_result = self.tc.submit_transfer(tdata)
        # check status code and message
        tmpLog.debug(str(transfer_result))
        if transfer_result['code'] == "Accepted":
            # succeeded
            # set transfer ID which are used for later lookup
            transferID = transfer_result['task_id']
            tmpLog.debug('successfully submitted id={0}'.format(transferID))
            jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
            # record the transfer ID on every output file
            for fileSpec in jobspec.outFiles:
                if fileSpec.fileAttributes is None:
                    fileSpec.fileAttributes = {}
                fileSpec.fileAttributes['transferID'] = transferID
        else:
            tmpRetVal = (False, transfer_result['message'])
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        if errMsg is None:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = "{0} {1}".format(errtype.__name__, errvalue)
        tmpRetVal = (errStat, errMsg)
    # return
    tmpLog.debug('done')
    return tmpRetVal
def trigger_stage_out(self, jobspec):
    """Stage out the output files of one job via local copy + rucio rule.

    Copies each output file from fileSpec.path to a deterministic
    rucio-style destination path, registers a transient rucio dataset
    containing the copied files on the source RSE, and creates a
    replication rule to move the dataset to the destination RSE.

    :param jobspec: job specification holding the output FileSpecs
    :return: (tmpStat, tmpMsg) — True on success, None for temporary
             errors (retry later); raises Error(errors) if any local
             copy failed
    """
    # make logger
    tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # initialize some values
    tmpStat = None
    tmpMsg = ''
    srcRSE = None
    dstRSE = None
    # per-job transient dataset; uuid keeps the name unique across retries
    datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, str(uuid.uuid4()))
    datasetScope = 'transient'
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # get destination endpoint: pick the 'pr' (production) storage of the
    # AGIS site matching the job's nucleus
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # NOTE(review): assigns self.RSE_dstpath (lowercase "p") but the dstURL
    # construction below reads self.RSE_dstPath — presumably RSE_dstPath is
    # already set from the plugin configuration elsewhere; confirm.
    self.RSE_dstpath = queueConfig.stager['RSE_dstPath']
    # check queueConfig stager section to see if srcRSE is set
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    fileList = []
    lfns = []
    fileSpec_list = []
    fileSpec_list = jobspec.get_output_file_specs(skip_done=False)
    msgStr = '#(jobspec.get_output_file_specs(skip_done=False)) = {0}'\
        .format(len(fileSpec_list))
    tmpLog.debug(msgStr)
    # first pass: log-only dump of the output file attributes
    for fileSpec in fileSpec_list:
        msgstr = 'fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}'\
            .format(datasetScope, fileSpec.lfn, fileSpec.fsize, fileSpec.chksum)
        if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
            msgstr += ' guid - {0}'.format(fileSpec.fileAttributes['guid'])
        tmpLog.debug(msgstr)
    # second pass: actually copy the files and collect rucio metadata
    #for fileSpec in jobspec.get_output_file_specs(skip_done=True):
    for fileSpec in jobspec.get_output_file_specs(skip_done=False):
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        # rucio deterministic path: <scope '.'->'/'>/<md5[0:2]>/<md5[2:4]>/<lfn>
        # NOTE(review): hash.update with a str works on Python 2 only; Python 3
        # requires bytes — confirm interpreter before porting.
        hash = hashlib.md5()
        hash.update('%s:%s' % (scope, fileSpec.lfn))
        hash_hex = hash.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                  scope=correctedscope,
                                                                  hash1=hash_hex[0:2],
                                                                  hash2=hash_hex[2:4],
                                                                  lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        tmpFile = dict()
        # copy the source file from source to destination skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # save for adding to rucio dataset
            tmpFile['scope'] = datasetScope
            tmpFile['name'] = fileSpec.lfn
            tmpFile['bytes'] = fileSpec.fsize
            tmpFile['adler32'] = fileSpec.chksum
            if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
            else:
                tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
            tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
            fileList.append(tmpFile)
            lfns.append(fileSpec.lfn)
            # get source RSE
            if srcRSE is None and fileSpec.objstoreID is not None:
                ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # save for adding to rucio dataset
                    tmpFile['scope'] = datasetScope
                    tmpFile['name'] = fileSpec.lfn
                    tmpFile['bytes'] = fileSpec.fsize
                    tmpFile['adler32'] = fileSpec.chksum
                    if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                        tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                    else:
                        tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                    tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                    fileList.append(tmpFile)
                    lfns.append(fileSpec.lfn)
                    # get source RSE if not already set
                    if srcRSE is None and fileSpec.objstoreID is not None:
                        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                        srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                        tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # test that srcRSE and dstRSE are defined
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Desitination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # test to see if there are any files to add dataset
    if len(fileList) == 0:
        errStr = 'There are no files to add to database'
        tmpLog.error(errStr)
        return None, errStr
    # print out the file list
    tmpLog.debug('fileList - {0}'.format(fileList))
    # create the dataset and add files to it and create a transfer rule
    try:
        # register dataset
        rucioAPI = RucioClient()
        tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                     .format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
        try:
            rucioAPI.add_dataset(datasetScope, datasetName,
                                 meta={'hidden': True},
                                 lifetime=30 * 24 * 60 * 60,
                                 rse=srcRSE
                                 )
        except DataIdentifierAlreadyExists:
            # ignore even if the dataset already exists
            pass
        except Exception:
            errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, datasetName, srcRSE)
            core_utils.dump_error_message(tmpLog)
            tmpLog.error(errMsg)
            return None, errMsg
        # add files to dataset
        # add 500 files at a time
        numfiles = len(fileList)
        maxfiles = 500
        # NOTE(review): relies on Python 2 integer division; under Python 3
        # this is a float and range(numslices) below would raise TypeError —
        # would need // for a py3 port.
        numslices = numfiles / maxfiles
        if (numfiles % maxfiles) > 0:
            numslices = numslices + 1
        start = 0
        for i in range(numslices):
            try:
                stop = start + maxfiles
                if stop > numfiles:
                    stop = numfiles
                rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                 'name': datasetName,
                                                 'dids': fileList[start:stop],
                                                 'rse': srcRSE}],
                                               ignore_duplicate=True)
                start = stop
            except FileAlreadyExists:
                # ignore if files already exist
                pass
            except Exception:
                errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, datasetName, srcRSE, fileList)
                core_utils.dump_error_message(tmpLog)
                tmpLog.error(errMsg)
                return None, errMsg
        # add rule
        try:
            tmpDID = dict()
            tmpDID['scope'] = datasetScope
            tmpDID['name'] = datasetName
            tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                   lifetime=30 * 24 * 60 * 60)
            ruleIDs = tmpRet[0]
            tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, str(ruleIDs)))
            # group the output files together by the Rucio transfer rule
            jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns, 'groupStatus': 'pending'}})
            msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs, lfns)
            tmpLog.debug(msgStr)
            tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)')
            tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True), ruleIDs, 'transferring')
            tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
            tmpStat = True
            tmpMsg = 'created Rucio rule successfully'
        except DuplicateRule:
            # ignore duplicated rule
            # NOTE(review): in this path ruleIDs was never bound, so the
            # update_file_group_status call below raises NameError, which is
            # swallowed by the outer except and treated as a temporary error.
            tmpLog.debug('rule is already available')
        except Exception:
            errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
            core_utils.dump_error_message(tmpLog)
            tmpLog.debug(errMsg)
            return None, errMsg
        # update file group status
        self.dbInterface.update_file_group_status(ruleIDs, 'transferring')
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        tmpStat = None
        tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0], error[1], error[2]))
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
def submit_workers(self, workspec_list):
    """Submit workers by inserting one aCT job per worker into the aCT DB.

    Supports both push mode (a JobSpec is attached to the worker: the
    url-encoded job parameters are handed to aCT) and pull mode (only the
    worker ID and prodSourceLabel are registered). Assumes a one-to-one
    worker-to-job mapping.

    :param workspec_list: list of WorkSpec objects to submit
    :return: list of (bool, errMsg) tuples, one per worker, in order
    """
    retList = []
    for workSpec in workspec_list:
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='submit_workers')

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        prodSourceLabel = queueconfig.get_source_label()

        # If jobSpec is defined we are in push mode, if not pull mode
        # Both assume one to one worker to job mapping
        jobSpec = workSpec.get_jobspec_list()
        if jobSpec:
            jobSpec = jobSpec[0]
            tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))
            # Unified queues: take prodsourcelabel from job
            prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel)

        # aCT job description; also logged below, so keep the insertion order stable
        desc = {}
        # If we need to prefetch events, set aCT status waiting.
        # feed_events in act_messenger will fill events and release the job
        if queueconfig.prefetchEvents:
            desc['pandastatus'] = 'waiting'
            desc['actpandastatus'] = 'waiting'
            desc['arcjobid'] = -1  # dummy id to prevent submission
        else:
            desc['pandastatus'] = 'sent'
            desc['actpandastatus'] = 'sent'
        desc['siteName'] = workSpec.computingSite
        # analysis labels use the pilot proxy, everything else the production proxy
        desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production']
        desc['prodSourceLabel'] = prodSourceLabel
        desc['sendhb'] = 0
        metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                    'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
        desc['metadata'] = json.dumps(metadata)

        if jobSpec:
            # push mode: aCT takes the url-encoded job description (like it gets from panda server)
            pandaid = jobSpec.PandaID
            actjobdesc = urllib.parse.urlencode(jobSpec.jobParams)
        else:
            # pull mode: just set pandaid (to workerid) and prodsourcelabel
            pandaid = workSpec.workerID
            actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

        tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
        try:
            batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
        except Exception as e:
            # DB insert failed: report per-worker error, keep processing the rest
            result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
        else:
            tmpLog.info("aCT batch id {0}".format(batchid))
            workSpec.batchID = str(batchid)
            workSpec.submissionHost = self.hostname
            workSpec.nativeStatus = desc['actpandastatus']
            # Set log files in workSpec
            today = time.strftime('%Y-%m-%d', time.gmtime())
            logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)])
            workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
            workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
            workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
            workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl))
            result = (True, '')
        retList.append(result)
    return retList
class MultiNodeWorkerMaker(BaseWorkerMaker):
    """Worker maker that packs multiple jobs into one multi-node worker.

    Builds a WorkSpec sized for self.nNodes nodes running
    self.nJobsPerWorker jobs, and composes the shell payload (environment
    setup plus executor command) that the batch system will run.

    NOTE(review): relies on attributes presumably populated from the plugin
    configuration by BaseWorkerMaker (self.mode, self.queueName, self.nNodes,
    self.nJobsPerNode, self.walltimelimit, self.env, self.executor,
    self.pilot, self.pilot_params, self.nCorePerJob) — confirm against the
    queue config.
    """

    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            # static: node count and walltime come from the configuration as-is
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            # dynamic: ask the resource plugin for the current node count/walltime
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        # return string which contain body of script for scheduler: specific enviroment setup, executor with parameters
        exe_str = ""

        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')

        # prepare static enviroment
        env_str = ""
        if self.env not in (None, "NULL"):
            # config holds a comma-separated list; one shell statement per line
            env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", ")))

        # prepare executor
        try:
            if self.executor == "aprun":  # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            # missing/bad config attribute: fall back to an empty command
            tmpLog.error("Unable to build executor command check configuration")
            exe_str = ""

        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)

        return exe_str

    # make a worker from jobs
    # NOTE(review): resource_type is accepted for interface compatibility but unused here
    def make_worker(self, jobspec_list, queue_config, resource_type):
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')

        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit,
                                                                             self.nNodes))

        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the job and set the parameters of the job
            # (missing job parameters are silently skipped)
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except Exception:
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except Exception:
                    pass
                #try:
                #    if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                #        workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                #    else:
                #        workSpec.maxWalltime = queue_config.walltimeLimit
                #except Exception:
                #    pass
        tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes,
                                                                                              workSpec.maxWalltime,
                                                                                              self.nJobsPerWorker))
        return workSpec

    # def get_num_jobs_per_worker(self, n_workers):
    #     """
    #     Function to set 'size' of worker. Define number of jobs per worker
    #     """
    #     tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
    #                                     method_name='get_num_jobs_per_worker')
    #     tmpLog.info("Get number of jobs per worker")
    #     self.nJobsPerWorker = 1
    #     if self.mode == "static":
    #         tmpLog.info("Static configuration")
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #     elif self.mode == "dynamic":
    #         tmpLog.info("Dynamic configuration")
    #         self.nNodes, self.walltimelimit = self.get_resources()
    #         self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
    #
    #     tmpLog.info("Get: {0} jobs to run for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit))
    #     return self.nJobsPerWorker

    def get_resources(self):
        """
        Function to get resourcese and map them to number of jobs
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        # NOTE(review): njobs is assigned but never used in this method
        njobs = 0
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes
        return nodes, walltime
def check_status(self, jobspec):
    """Check/drive the staged-in transfer of a job's input files via Globus.

    Two phases:
    1. If the files are still grouped under the shared dummy transfer ID,
       take a DB object lock, and once at least 10 files are pooled (or the
       group is older than 10 minutes) submit one real Globus transfer for
       the pooled files; otherwise keep waiting.
    2. For groups that already carry a real Globus transfer ID, look up the
       task status and map SUCCEEDED/FAILED/other to (True,'')/(False,msg)/
       (None,'') respectively.

    :param jobspec: job specification holding the input FileSpecs
    :return: (True, '') when done, (False, msg) on fatal error,
             (None, msg) for temporary conditions (retry later)
    """
    # make logger
    tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='check_status')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get transfer groups
    groups = jobspec.get_groups_of_input_files(skip_ready=True)
    tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    if self.dummy_transfer_id in groups:
        # lock for 120 sec
        if not self.have_db_lock:
            tmpLog.debug('attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id, self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.get_object_lock(self.dummy_transfer_id, lock_interval=120)
        if not self.have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        groups = jobspec.get_groups_of_input_files(skip_ready=True)
        # the dummy transfer ID is still there
        if self.dummy_transfer_id in groups:
            groupUpdateTime = groups[self.dummy_transfer_id]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(self.dummy_transfer_id)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'self.dummy_transfer_id = {0} number of files = {1}'.format(self.dummy_transfer_id, len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
                    groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
                queueConfigMapper = QueueConfigMapper()
                queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
                self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
                self.srcEndpoint = queueConfig.preparator['srcEndpoint']
                self.Globus_dstPath = self.basePath
                #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
                self.dstEndpoint = queueConfig.preparator['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id, self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None, errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = TransferData(self.tc,
                                         self.srcEndpoint,
                                         self.dstEndpoint,
                                         sync_level="checksum")
                except:
                    # NOTE(review): handle_globus_exception may return errMsg as
                    # None, in which case the += below would raise TypeError —
                    # confirm against globus_utils.
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id, self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                    tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                for fileSpec in fileSpecs:
                    attrs = jobspec.get_input_file_attributes()
                    msgStr = "len(jobSpec.get_input_file_attributes()) = {0} type - {1}".format(len(attrs), type(attrs))
                    tmpLog.debug(msgStr)
                    # NOTE(review): iteritems is Python 2 only
                    for key, value in attrs.iteritems():
                        msgStr = "input file attributes - {0} {1}".format(key, value)
                        tmpLog.debug(msgStr)
                    msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                    tmpLog.debug(msgStr)
                    # rucio deterministic path on both endpoints:
                    # <scope '.'->'/'>/<md5[0:2]>/<md5[2:4]>/<lfn>
                    scope = fileSpec.scope
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    #srcURL = fileSpec.path
                    srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_srcPath,
                                                                              scope=correctedscope,
                                                                              hash1=hash_hex[0:2],
                                                                              hash2=hash_hex[2:4],
                                                                              lfn=fileSpec.lfn)
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                              scope=correctedscope,
                                                                              hash1=hash_hex[0:2],
                                                                              hash2=hash_hex[2:4],
                                                                              lfn=fileSpec.lfn)
                    tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
                    tdata.add_item(srcURL, dstURL)
                # submit transfer
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id, self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(self.dummy_transfer_id)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id, self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                    tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id, self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
            if not self.have_db_lock:
                msgStr += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                tmpLog.error(msgStr)
            # return None to retry later
            return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    groups = jobspec.get_groups_of_input_files(skip_ready=True)
    for transferID in groups:
        if transferID != self.dummy_transfer_id:
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, self.tc, transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task'
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_status(jobspec, 'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec, 'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID, transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
def trigger_stage_out(self, jobspec):
    """Copy a job's output files from their local path to the RSE path.

    Builds a rucio-style deterministic destination path for each output
    file, creates missing destination directories (group-writable, setgid)
    and copies with shutil.copy2. Files whose destination already exists
    are skipped.

    :param jobspec: job specification holding the output FileSpecs
    :return: (True, '') on success or (False, msg) when
             jobspec.computingSite is undefined
    :raises Error: with the list of (src, dst, reason) tuples if any
             copy failed or a source file was missing
    """
    # make logger
    tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(
        jobspec.PandaID,
        threading.current_thread().ident),
        method_name='trigger_stage_out')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(
            jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug(
        'jobspec.computingSite - {0} queueConfig.stager {1}'.format(
            jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug(
            'Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'
            .format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(
            jobspec.PandaID, self.objstoreID))
    # BUG FIX: this was assigned to self.RSE_dstpath (lowercase "p") while the
    # dstURL construction below reads self.RSE_dstPath, making the assignment
    # dead; use one attribute name so the configured value is actually used.
    self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        # default scope is 'panda'; Yoda outputs always use 'transient'
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(
                fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            # BUG FIX: dropped the stray .format() arguments — this is a plain message
            msgStr = "printed first 25 files skipping the rest"
            tmpLog.debug(msgStr)
        # rucio deterministic path: <scope '.'->'/'>/<md5[0:2]>/<md5[2:4]>/<lfn>
        # BUG FIX: md5().update needs bytes on Python 3; encoding is a no-op
        # for the ASCII scope:lfn strings on Python 2 (same digest).
        hasher = hashlib.md5()
        hasher.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
        hash_hex = hasher.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(
            endPoint=self.RSE_dstPath,
            scope=correctedscope,
            hash1=hash_hex[0:2],
            hash2=hash_hex[2:4],
            lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL,
                                                            dstURL=dstURL))
        # copy the source file from source to destination skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # Set the file spec status
            if self.changeFileStatusOnSuccess:
                fileSpec.status = 'finished'
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        # group-writable + setgid so group ownership is inherited
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # Set the file spec status
                    if self.changeFileStatusOnSuccess:
                        self.set_FileSpec_status(jobspec, 'finished')
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug(
                'copy error source {0} destination {1} Reason {2}'.format(
                    error[0], error[1], error[2]))
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return True, ''
def main():
    """Open an SSH tunnel to the middleware host of the given queue and return the tunnel handle."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware', action='store', dest='middleware', default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()
    # resolve the queue configuration
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName))
        sys.exit(1)
    # resolve the middleware section of the queue configuration
    if not hasattr(queueConfig, options.middleware):
        print('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware, options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)

    # sentinel distinguishing "key absent" from "key present with value None"
    absent = object()

    def optional(key, fallback=None):
        # optional middleware parameter: fall back when missing/unreadable
        try:
            return middleware[key]
        except Exception:
            return fallback

    # mandatory ssh parameters
    sshHost = middleware['remoteHost']
    sshUserName = middleware['sshUserName']
    # optional ssh parameters
    sshPort = optional('remotePort', 22)
    sshPassword = optional('sshPassword')
    privateKey = None
    passPhrase = None
    if sshPassword is None:
        # no password: a private key becomes mandatory
        key = optional('privateKey', absent)
        if key is absent:
            print("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware))
            sys.exit(1)
        privateKey = key
        passPhrase = optional('passPhrase')
    jumpHost = optional('jumpHost')
    jumpPort = optional('jumpPort', 22)
    # ssh
    sshTunnelPool.make_tunnel_server(sshHost, sshPort,
                                     remote_bind_port=middleware['remoteBindPort'],
                                     num_tunnels=1,
                                     ssh_username=sshUserName,
                                     ssh_password=sshPassword,
                                     private_key=privateKey,
                                     pass_phrase=passPhrase,
                                     jump_host=jumpHost,
                                     jump_port=jumpPort
                                     )
    ssh = sshTunnelPool.get_tunnel(sshHost, sshPort)[-1]
    return ssh
def trigger_stage_out(self, jobspec):
    """Trigger a Globus transfer of all output files of the job.

    Builds a TransferData object with one item per output file (destination
    is <Globus_dstPath>/<scope with dots as dirs>/<md5[0:2]>/<md5[2:4]>/<lfn>)
    and submits it. On success the Globus task ID is recorded in the job's
    file group and in each output FileSpec's fileAttributes['transferID'].

    :param jobspec: job specification with output files to stage out
    :return: (True, '') on success/skip, (False, msg) on fatal error,
             (None/False, msg) from Globus exception handling on failure
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
    self.srcEndpoint = queueConfig.stager['srcEndpoint']
    self.Globus_srcPath = self.basePath
    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
    self.dstEndpoint = queueConfig.stager['dstEndpoint']
    # Test the endpoints and create the transfer data class
    errMsg = None
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errMsg = ''
            if not tmpStatsrc:
                errMsg += ' source Endpoint not activated '
            if not tmpStatdst:
                errMsg += ' destination Endpoint not activated '
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
        # both endpoints activated now prepare to transfer data
        tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint,
                             label=label, sync_level="checksum")
    except Exception:
        # was a bare except; narrowed so SystemExit/KeyboardInterrupt propagate
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        tmpRetVal = (errStat, errMsg)
        return tmpRetVal
    # loop over all files
    fileAttrs = jobspec.get_output_file_attributes()
    lfns = []
    for fileSpec in jobspec.outFiles:
        scope = fileAttrs[fileSpec.lfn]['scope']
        # deterministic destination path from the md5 of "scope:lfn";
        # encode() is required on Python 3 where md5 only accepts bytes,
        # and renamed to avoid shadowing the hash() builtin
        hasher = hashlib.md5()
        hasher.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
        hash_hex = hasher.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # add files to transfer object - tdata
        if os.access(srcURL, os.R_OK):
            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
            tdata.add_item(srcURL, dstURL)
            lfns.append(fileSpec.lfn)
        else:
            errMsg = "source file {} does not exist".format(srcURL)
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
    # submit transfer
    try:
        transfer_result = self.tc.submit_transfer(tdata)
        # check status code and message
        tmpLog.debug(str(transfer_result))
        if transfer_result['code'] == "Accepted":
            # succeeded
            # set transfer ID which are used for later lookup
            transferID = transfer_result['task_id']
            tmpLog.debug('successfully submitted id={0}'.format(transferID))
            jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
            # record the transfer ID on every output file
            for fileSpec in jobspec.outFiles:
                if fileSpec.fileAttributes is None:  # was "== None"
                    fileSpec.fileAttributes = {}
                fileSpec.fileAttributes['transferID'] = transferID
        else:
            tmpRetVal = (False, transfer_result['message'])
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        if errMsg is None:
            # handler gave no message; fall back to the raw exception info
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = "{0} {1}".format(errtype.__name__, errvalue)
        tmpRetVal = (errStat, errMsg)
    # return
    tmpLog.debug('done')
    return tmpRetVal
def main():
    """Install harvester and its dependencies on a remote machine over SSH.

    Downloads all pip dependencies locally for the remote python version,
    copies them to the remote build directory via sftp, and pip-installs
    them there with --no-index.
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--remoteDir',
        action='store',
        dest='remoteDir',
        default='harvester',
        help='directory on the remote target machine where harvester is installed')
    parser.add_argument(
        '--remoteBuildDir',
        action='store',
        dest='remoteBuildDir',
        default='harvester_build',
        help='directory on the remote target machine where harvester is build')
    parser.add_argument('--remotePythonSetup',
                        action='store',
                        dest='remotePythonSetup',
                        default='',
                        help='python setup on remote target machine')
    parser.add_argument('--queueName',
                        action='store',
                        dest='queueName',
                        default=None,
                        required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware',
                        action='store',
                        dest='middleware',
                        default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()
    # remove ~/ which doesn't work with sftp
    options.remoteDir = re.sub('^~/', '', options.remoteDir)
    options.remoteBuildDir = re.sub('^~/', '', options.remoteBuildDir)
    # get queue
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print('ERROR: queue={0} not found in panda_queueconfig.json'.format(
            options.queueName))
        sys.exit(1)
    # get middleware
    if not hasattr(queueConfig, options.middleware):
        print(
            'ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'
            .format(options.middleware, options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)
    # get ssh parameters; optional keys fall back to defaults
    sshHost = middleware['remoteHost']
    try:
        sshPort = middleware['remotePort']
    except Exception:
        sshPort = 22
    sshUserName = middleware['sshUserName']
    try:
        sshPassword = middleware['sshPassword']
    except Exception:
        sshPassword = None
    privateKey = None
    passPhrase = None
    if sshPassword is None:
        try:
            privateKey = middleware['privateKey']
        except Exception:
            print("ERROR: set sshPassword or privateKey in middleware={0}".
                  format(options.middleware))
            sys.exit(1)
        try:
            passPhrase = middleware['passPhrase']
        except Exception:
            passPhrase = None
    try:
        jumpHost = middleware['jumpHost']
    except Exception:
        jumpHost = None
    try:
        jumpPort = middleware['jumpPort']
    except Exception:
        jumpPort = 22
    # ssh
    sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword,
                                    passPhrase, privateKey, jumpHost, jumpPort)
    # get remote python version; paramiko channel reads return bytes,
    # so decode before interpolating into shell commands (Python 3 fix)
    exec_out = sshClient.exec_command(';'.join([
        options.remotePythonSetup,
        """python -c 'import sys;print("{0}{1}".format(*(sys.version_info[:2])))' """
    ]))
    remotePythonVer = exec_out[1].read().decode().rstrip()
    sshClient.close()
    print('remote python version : {0}'.format(remotePythonVer))
    # make tmp dir
    with TemporaryDirectory() as tmpDir:
        harvesterGit = "git+git://github.com/PanDAWMS/panda-harvester.git"
        # get all dependencies
        print("getting dependencies")
        # universal_newlines=True so stdout is text, not bytes, on Python 3
        p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(
            tmpDir, harvesterGit),
                             stdout=subprocess.PIPE,
                             shell=True,
                             universal_newlines=True)
        stdout, stderr = p.communicate()
        packages = []
        for line in stdout.split('\n'):
            if line.startswith('Successfully downloaded'):
                packages = line.split()[2:]
        packages.append(harvesterGit)
        packages.append('pip')
        # harvester itself is installed from git; guard the removal so an
        # unexpected pip output format doesn't raise ValueError
        if 'pandaharvester' in packages:
            packages.remove('pandaharvester')
        # download packages
        print("pip download to {0}".format(tmpDir))
        for package in packages:
            print("getting {0}".format(package))
            ret = subprocess.call(
                "pip download --no-deps --python-version {0} -d {1} {2}".
                format(remotePythonVer, tmpDir, package),
                shell=True)
            if ret != 0:
                print("ERROR: failed to download {0}".format(package))
                sys.exit(1)
        # sftp
        sshClient = make_ssh_connection(sshHost, sshPort, sshUserName,
                                        sshPassword, passPhrase, privateKey,
                                        jumpHost, jumpPort)
        try:
            sshClient.exec_command('rm -rf {0}; mkdir -p {0}'.format(
                options.remoteBuildDir))
        except Exception:
            pass
        sftp = sshClient.open_sftp()
        for name in os.listdir(tmpDir):
            path = os.path.join(tmpDir, name)
            if os.path.isdir(path):
                continue
            remotePath = os.path.join(options.remoteBuildDir, name)
            print("copy {0} to {1}".format(name, remotePath))
            sftp.put(path, remotePath)
        # install
        print("install harvester")
        buildDir = options.remoteBuildDir
        if not buildDir.startswith('/'):
            buildDir = '~/' + buildDir
        exec_out = sshClient.exec_command(';'.join([
            options.remotePythonSetup,
            'cd {0}'.format(options.remoteDir),
            'pip install pip pandaharvester --no-index --find-links {0}'.
            format(buildDir)
        ]))
        # decode the remote command output before printing (bytes on Python 3)
        print(exec_out[1].read().decode())
        print(exec_out[2].read().decode())
        sshClient.close()
def check_status(self, jobspec):
    """Check (and, for pooled dummy groups, submit) Globus transfers for a job.

    Output files are first pooled under a per-PandaID dummy transfer ID; once
    10 files are pooled or the group is older than 10 minutes, a real Globus
    transfer is submitted under a DB lock and the files are re-grouped under
    the real task ID. Groups that already carry a valid (UUID) transfer ID are
    then polled for SUCCEEDED/FAILED.

    :param jobspec: job specification whose output file groups are checked
    :return: (True, '') transfers done, (False, msg) fatal failure,
             (None, msg) on-going or temporary failure (retry later)
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident),
                              method_name='check_status')
    tmpLog.debug('start')
    # show the dummy transfer id and set to a value with the PandaID if needed.
    tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id))
    if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') :
        old_dummy_transfer_id = self.dummy_transfer_id
        self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.PandaID)
        tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id))
    # default return
    tmpRetVal = (True, '')
    # set flag if have db lock
    have_db_lock = False
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda" :
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob :
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID))
    # test we have a Globus Transfer Client
    if not self.tc :
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get the scope of the log files
    outfileattrib = jobspec.get_output_file_attributes()
    scopeLog = 'xxxx'
    for key in outfileattrib.keys():
        if "log.tgz" in key :
            scopeLog = outfileattrib[key]['scope']
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    for dummy_transferID in groups:
        # skip if valid transfer ID not dummy one
        if validate_transferid(dummy_transferID) :
            continue
        # lock for 120 sec
        tmpLog.debug('attempt to set DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
        have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120)
        if not have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)')
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        tmpLog.debug('After db refresh call groups=jobspec.get_groups_of_output_files()')
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
        # the dummy transfer ID is still there
        if dummy_transferID in groups:
            groupUpdateTime = groups[dummy_transferID]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transferID)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'dummy_transferID = {0} number of files = {1}'.format(dummy_transferID,len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
               groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                self.srcEndpoint = queueConfig.stager['srcEndpoint']
                self.Globus_srcPath = self.basePath
                self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                self.dstEndpoint = queueConfig.stager['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc :
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst :
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                        # NOTE(review): writes self.have_db_lock while every other
                        # branch uses a local release_db_lock / have_db_lock -
                        # looks like an inconsistency to confirm
                        self.have_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None,errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = None
                    tdata = TransferData(self.tc,
                                         self.srcEndpoint,
                                         self.dstEndpoint,
                                         sync_level="checksum")
                except:
                    # NOTE(review): bare except; also errMsg from
                    # handle_globus_exception may be None here, making the
                    # "errMsg +=" below raise TypeError - confirm
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                ifile = 0
                for fileSpec in fileSpecs:
                    logfile = False
                    scope ='panda'
                    if fileSpec.scope is not None :
                        scope = fileSpec.scope
                    # for Yoda job set the scope to transient for non log files
                    if self.Yodajob :
                        scope = 'transient'
                    if fileSpec.fileType == "log" :
                        logfile = True
                        scope = scopeLog
                    # only print to log file first 25 files
                    if ifile < 25 :
                        msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    if ifile == 25 :
                        msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    # deterministic destination path from the md5 of "scope:lfn"
                    # NOTE(review): md5().update(str) fails on Python 3 (needs
                    # bytes) - confirm the target interpreter
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    srcURL = fileSpec.path
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                               scope=correctedscope,
                                                                               hash1=hash_hex[0:2],
                                                                               hash2=hash_hex[2:4],
                                                                               lfn=fileSpec.lfn)
                    if logfile :
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    if ifile < 25 :
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    if os.access(srcURL, os.R_OK):
                        if ifile < 25 :
                            tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL))
                        tdata.add_item(srcURL,dstURL)
                    else:
                        errMsg = "source file {} does not exist".format(srcURL)
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (False,errMsg)
                        return tmpRetVal
                    ifile += 1
                # submit transfer
                tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA'])))
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(dummy_transferID)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
            # return None to retry later
            return None, msgStr
        # release the db lock if needed
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID))
                have_db_lock = False
            else:
                # NOTE(review): msgStr may be unbound on this path (it is only
                # set when dummy_transferID was still in groups) - confirm
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
                return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    tmpLog.debug("groups = jobspec.get_groups_of_output_files()")
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('Number of transfer groups - {0}'.format(len(groups)))
    tmpLog.debug('transfer groups any state - {0}'.format(groups))
    if len(groups) == 0:
        tmpLog.debug("jobspec.get_groups_of_output_files(skip_done=True) returned no files ")
        tmpLog.debug("check_status return status - True ")
        return True,''
    for transferID in groups:
        # allow only valid UUID
        if validate_transferid(transferID) :
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc),str(transferID))
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                if self.changeFileStatusOnSuccess:
                    self.set_FileSpec_status(jobspec, 'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec,'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
    # end of loop over transfer groups
    tmpLog.debug('End of loop over transfers groups - ending check_status function')
    return None,'no valid transfer id found'
def check_workers(self, workspec_list):
    """Check the status of harvester workers against the aCT database.

    Maps each worker's aCT panda status onto a harvester WorkSpec state;
    post-running states are handled differently for true-pilot and
    NG-mode queues.

    :param workspec_list: list of WorkSpec objects to check
    :return: (True, [(newStatus, errorMsg), ...]) one tuple per worker
    """
    retList = []
    for workSpec in workspec_list:
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(
            workSpec.workerID), method_name='check_workers')
        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        try:
            tmpLog.debug('Querying aCT for id {0}'.format(
                workSpec.batchID))
            columns = [
                'actpandastatus', 'pandastatus', 'computingElement', 'node',
                'error'
            ]
            actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID),
                                         columns)
        except Exception as e:
            if self.actDB:
                tmpLog.error("Failed to query aCT DB: {0}".format(str(e)))
            # send back current status
            retList.append((workSpec.status, ''))
            continue
        if not actjobs:
            tmpLog.error("Job with id {0} not found in aCT".format(
                workSpec.batchID))
            # send back current status
            retList.append((WorkSpec.ST_failed, "Job not found in aCT"))
            continue
        actstatus = actjobs[0]['actpandastatus']
        workSpec.nativeStatus = actstatus
        newStatus = WorkSpec.ST_running
        errorMsg = ''
        if actstatus in ['waiting', 'sent', 'starting']:
            newStatus = WorkSpec.ST_submitted
        # Handle post running states
        if queueconfig.truePilot:
            # True pilot: keep in running until really done
            if actstatus in ['done', 'donecancelled']:
                newStatus = WorkSpec.ST_finished
            elif actstatus == 'donefailed':
                # set failed here with workspec sup error
                errorMsg = actjobs[0]['error'] or 'Unknown error'
                error_code = WorkerErrors.error_codes.get('GENERAL_ERROR')
                workSpec.set_supplemental_error(error_code=error_code,
                                                error_diag=errorMsg)
                newStatus = WorkSpec.ST_failed
                # fixed log message: removed stray ')' after the error text
                tmpLog.info('ID {0} failed with error {1}'.format(
                    workSpec.batchID, errorMsg))
        elif actstatus in [
                'done', 'donefailed', 'donecancelled', 'transferring',
                'tovalidate'
        ]:
            # NG mode: all post processing is now done in the stager
            newStatus = WorkSpec.ST_finished
        if newStatus != workSpec.status:
            tmpLog.info('ID {0} updated status {1} -> {2} ({3})'.format(
                workSpec.batchID, workSpec.status, newStatus, actstatus))
        else:
            tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(
                actstatus, newStatus))
        if actjobs[0]['computingElement']:
            workSpec.computingElement = actjobs[0]['computingElement']
        if actjobs[0]['node']:
            try:
                pandaid = workSpec.get_jobspec_list()[0].PandaID
                workSpec.set_work_attributes(
                    {pandaid: {
                        'node': actjobs[0]['node']
                    }})
            except Exception:
                # was a bare except; narrowed so SystemExit etc. propagate
                tmpLog.warning(
                    'Could not extract panda ID for worker {0}'.format(
                        workSpec.batchID))
        retList.append((newStatus, errorMsg))
    return True, retList
class GoogleMonitor(PluginBase):
    """Monitor plugin for workers running as Google Compute Engine VMs."""

    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()
        # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
        self.vm_to_worker_status = {
            'RUNNING': WorkSpec.ST_running,
            'TERMINATED': WorkSpec.ST_running,  # the VM is stopped, but has to be fully deleted
            'STOPPING': WorkSpec.ST_finished,
            'PROVISIONING': WorkSpec.ST_submitted,
            'STAGING': WorkSpec.ST_submitted
        }

    def list_vms(self, zone):
        """
        List the status of the running VMs

        :return: (names, name-to-status dict); ([], {}) when no VMs are
                 running; (None, None) on API error
        """
        try:
            result = compute.instances().list(project=PROJECT, zone=zone).execute()
            try:
                vm_instances = result['items']
            except KeyError:
                # there are no VMs running
                return [], {}
            # make a list with the VM names
            # NOTE: must be a real list, not a Python 3 map object -
            # check_workers tests membership repeatedly and an iterator
            # would be exhausted after the first test
            vm_names = [vm_instance['name'] for vm_instance in vm_instances]
            # make a dictionary so we can retrieve a VM by its name
            vm_name_to_status = {}
            for vm_instance in vm_instances:
                vm_name_to_status[vm_instance['name']] = vm_instance['status']
            return vm_names, vm_name_to_status
        except Exception:
            # was a bare except; narrowed so SystemExit etc. propagate
            return None, None

    def kill_worker(self, vm_name, zone):
        """
        Sends the command to Google to destroy a VM
        """
        try:
            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
        except Exception as e:
            base_logger.error('Problems killing the VM: {0}'.format(e))

    def check_workers(self, workers):
        """
        This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses.
        Nth element in the return list corresponds to the status of Nth WorkSpec in the given list.

        :param worker_list: a list of work specs instances
        :return: A tuple containing the return code (True for success, False otherwise) and a list of worker's statuses
        :rtype: (bool, [string,])
        """
        if not workers:
            return False, 'Empty workers list received'
        # it assumes that all workers belong to the same queue, which is currently the case
        # we assume all work_specs in the list belong to the same queue
        queue_config = self.queue_config_mapper.get_queue(workers[0].computingSite)
        try:
            zone = queue_config.zone
        except AttributeError:
            zone = ZONE
        # running instances
        vm_names, vm_name_to_status = self.list_vms(zone)
        if vm_names is None and vm_name_to_status is None:
            error_string = 'Could not list the VMs'
            base_logger.error(error_string)
            return False, error_string
        # extract the list of batch IDs; materialized as a list so the debug
        # log shows the values (a Python 3 map object would log as <map ...>
        # and be consumed by the log call)
        batch_IDs = [str(worker.batchID) for worker in workers]
        base_logger.debug('Batch IDs: {0}'.format(batch_IDs))
        ret_list = []
        for batch_ID in batch_IDs:
            tmp_log = self.make_logger(base_logger, 'batch ID={0}'.format(batch_ID),
                                       method_name='check_workers')
            if batch_ID not in vm_names:
                new_status = WorkSpec.ST_finished
                message = 'VM not found'
            else:
                try:
                    new_status = self.vm_to_worker_status[vm_name_to_status[batch_ID]]
                    message = 'VM status returned by GCE API'
                    # Preemptible VMs: GCE terminates a VM, but a stopped VM with its disk is left and needs to be
                    # explicitly deleted
                    if vm_name_to_status[batch_ID] == 'TERMINATED':
                        self.kill_worker(batch_ID, zone)
                except KeyError:
                    new_status = WorkSpec.ST_missed
                    message = 'Unknown status to Harvester: {0}'.format(vm_name_to_status[batch_ID])
            tmp_log.debug('new_status={0}'.format(new_status))
            ret_list.append((new_status, message))
        base_logger.debug('ret_list: {0}'.format(ret_list))
        return True, ret_list
def trigger_stage_out(self, jobspec):
    """Copy the job's output files to the local RSE destination path.

    Each output file is copied with shutil.copy2 to
    <RSE_dstPath>/<scope with dots as dirs>/<md5[0:2]>/<md5[2:4]>/<lfn>;
    already-copied files are skipped. Raises Error(errors) when any copy
    failed, otherwise returns (True, '').

    :param jobspec: job specification with output files to stage out
    :return: (True, '') on success, (False, msg) when computingSite is unset
    """
    # make logger
    tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # fixed attribute name: was self.RSE_dstpath (lower-case p) while the
    # dstURL construction below reads self.RSE_dstPath
    self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            # dropped a no-op .format(...) call on a placeholder-free string
            msgStr = "printed first 25 files skipping the rest"
            tmpLog.debug(msgStr)
        # deterministic destination path from the md5 of "scope:lfn";
        # encode() is required on Python 3 where md5 only accepts bytes,
        # and renamed to avoid shadowing the hash() builtin
        hasher = hashlib.md5()
        hasher.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
        hash_hex = hasher.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # copy the source file from source to destination skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # Set the file spec status
            if self.changeFileStatusOnSuccess:
                fileSpec.status = 'finished'
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        # group-writable directory with set-gid so files
                        # inherit the group
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # Set the file spec status
                    if self.changeFileStatusOnSuccess:
                        self.set_FileSpec_status(jobspec, 'finished')
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0], error[1], error[2]))
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return True, ''
class cpCompasStagerHPC(BaseStager):
    """Stage-out plugin for COMPASS HPC jobs.

    Output files are copied synchronously (plain filesystem copy) to
    storage paths derived from the job parameters (COMPASS_SW_PREFIX,
    COMPASS_SW_PATH, production name, slot number, ...).
    """

    # constructor
    def __init__(self, **kwarg):
        BaseStager.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()

    # check status
    def check_stage_out_status(self, jobspec):
        """Check the status of stage-out procedure.

        Staging-out is done synchronously in trigger_stage_out, so this
        always reports success.

        :param jobspec: job specifications
        :type jobspec: JobSpec
        :return: A tuple of return code (True: transfer success, False: fatal
                 transfer failure, None: on-going or temporary failure) and
                 error dialog
        :rtype: (bool, string)
        """
        # make logger
        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='check_stage_out_status')
        tmpLog.debug('start')
        return True, ''

    # trigger stage out
    def trigger_stage_out(self, jobspec):
        """Trigger the stage-out procedure for the job.

        Non-log output files are copied to SE paths built from the job
        parameters; log tarballs go to queue_config.seprodpath. Each file's
        status is set to 'finished' or 'failed' and force-updated.

        :param jobspec: job specifications
        :type jobspec: JobSpec
        :return: A tuple of return code (True: success, False: fatal failure,
                 None: temporary failure) and error dialog
        :rtype: (bool, string)
        """
        # make logger
        # FIX: method_name previously said 'check_stage_out_status'
        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='trigger_stage_out')
        tmpLog.debug('start')
        allChecked = True
        ErrMsg = 'These files failed to upload: '
        tmpLog.debug('Getting seprodpath from queue_config')
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        tmpLog.debug('Requesting full spec of the job {0}'.format(jobspec.PandaID))
        # full spec is needed for jobParams['jobPars']
        proxy = DBProxy()
        jobSpec_full = proxy.get_job(jobspec.PandaID)
        for fileSpec in jobspec.get_output_file_specs(skip_done=True):
            destination = queue_config.seprodpath
            filename = fileSpec.lfn
            se_path = ''
            sw_path = ''
            prod_name = ''
            prodSlt = ''
            TMPMDSTFILE = ''
            TMPHISTFILE = ''
            EVTDUMPFILE = ''
            MERGEDMDSTFILE = ''
            MERGEDHISTFILE = ''
            MERGEDDUMPFILE = ''
            if not ".log.tgz" in fileSpec.lfn:
                # non-log files: destination derived from the job parameters
                tmpLog.debug('Getting sw path, name and hist filename from jobPars')
                sw_prefix, sw_path, prod_name, prodSlt, TMPMDSTFILE, TMPHISTFILE, \
                    EVTDUMPFILE, MERGEDMDSTFILE, MERGEDHISTFILE, MERGEDDUMPFILE, \
                    PRODSOFT, MCGENFILEOUT = self.getSWPathAndNameAndFilename(jobSpec_full.jobParams['jobPars'])
                tmpLog.debug('sw_prefix: {0}'.format(sw_prefix))
                tmpLog.debug('sw_path: {0}'.format(sw_path))
                tmpLog.debug('prod_name: {0}'.format(prod_name))
                tmpLog.debug('prodSlt: {0}'.format(prodSlt))
                tmpLog.debug('TMPMDSTFILE: {0}'.format(TMPMDSTFILE))
                tmpLog.debug('TMPHISTFILE: {0}'.format(TMPHISTFILE))
                tmpLog.debug('EVTDUMPFILE: {0}'.format(EVTDUMPFILE))
                tmpLog.debug('MERGEDMDSTFILE: {0}'.format(MERGEDMDSTFILE))
                tmpLog.debug('MERGEDHISTFILE: {0}'.format(MERGEDHISTFILE))
                tmpLog.debug('MERGEDDUMPFILE: {0}'.format(MERGEDDUMPFILE))
                tmpLog.debug('PRODSOFT: {0}'.format(PRODSOFT))
                tmpLog.debug('MCGENFILEOUT: {0}'.format(MCGENFILEOUT))
                # prod
                if fileSpec.lfn == TMPMDSTFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/mDST.chunks'
                if fileSpec.lfn == TMPHISTFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/TRAFDIC'
                if fileSpec.lfn == "testevtdump.raw":
                    se_path = sw_prefix + sw_path + PRODSOFT + '/evtdump/slot' + prodSlt
                    filename = EVTDUMPFILE
                if fileSpec.lfn == "payload_stdout.out.gz":
                    se_path = sw_prefix + sw_path + PRODSOFT + '/logFiles'
                    filename = prod_name + '.' + TMPHISTFILE.replace('.root', '.stdout.gz')
                if fileSpec.lfn == "payload_stderr.out.gz":
                    se_path = sw_prefix + sw_path + PRODSOFT + '/logFiles'
                    filename = prod_name + '.' + TMPHISTFILE.replace('.root', '.stderr.gz')
                # merge
                if fileSpec.lfn == MERGEDMDSTFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/mDST'
                if fileSpec.lfn == MERGEDHISTFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/histos'
                if fileSpec.lfn == MERGEDDUMPFILE:
                    se_path = sw_prefix + sw_path + PRODSOFT + '/mergedDump/slot' + prodSlt
                # mc generation
                if fileSpec.lfn == MCGENFILEOUT:
                    se_path = sw_prefix + '/mc/' + sw_path + PRODSOFT + '/mcgen'
                    filename = MCGENFILEOUT
                destination = se_path
            surl = "{0}/{1}".format(destination, filename)
            dst_gpfn = "{0}/{1}".format(destination, filename)
            lfcdir = destination
            tmpLog.debug('fileSpec.path = {0}'.format(fileSpec.path))
            tmpLog.debug('SURL = {0}'.format(surl))
            tmpLog.debug('dst_gpfn = {0}'.format(dst_gpfn))
            tmpLog.debug('lfcdir = {0}'.format(lfcdir))
            tmpLog.debug('Create if does not exist {0}'.format(lfcdir))
            if not os.path.exists(lfcdir):
                os.makedirs(lfcdir)
            tmpLog.debug('Copy {0} to {1}'.format(fileSpec.path, dst_gpfn))
            shutil.copyfile(fileSpec.path, dst_gpfn)
            if os.path.exists(dst_gpfn):
                fileSpec.status = 'finished'
            else:
                fileSpec.status = 'failed'
                allChecked = False
                ErrMsg += '{0} '.format(fileSpec.lfn)
            # force update
            fileSpec.force_update('status')
            tmpLog.debug('Status of file {0} is {1}'.format(fileSpec.path, fileSpec.status))
        del jobSpec_full
        tmpLog.debug('done')
        if allChecked:
            return True, ''
        else:
            return False, ErrMsg

    @staticmethod
    def _extract_jobpar(jobPars, key):
        """Return the value of a '<key>=<value>;' token embedded in jobPars.

        Reproduces the original find/slice semantics exactly, including the
        degenerate result when the key is absent (find returning -1).
        """
        fragment = jobPars[jobPars.find(key):]
        fragment = fragment[:fragment.find(';')]
        return fragment[fragment.find('=') + 1:]

    def getSWPathAndNameAndFilename(self, jobPars):
        """ Get COMPASS_SW_PATH and COMPASS_PROD_NAME from JobPars """
        # FIX: twelve copy-pasted find/slice stanzas collapsed into one helper;
        # the returned tuple keeps the original order
        keys = ('COMPASS_SW_PREFIX', 'COMPASS_SW_PATH', 'COMPASS_PROD_NAME',
                'prodSlt', 'TMPMDSTFILE', 'TMPHISTFILE', 'EVTDUMPFILE',
                'MERGEDMDSTFILE', 'MERGEDHISTFILE', 'MERGEDDUMPFILE',
                'PRODSOFT', 'MCGENFILEOUT')
        return tuple(self._extract_jobpar(jobPars, key) for key in keys)

    # zip output files
    def zip_output(self, jobspec):
        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='zip_output')
        return self.simple_zip_output(jobspec, tmpLog)

    # asynchronous zip output
    def async_zip_output(self, jobspec):
        return True, ''

    # post zipping
    def post_zip_output(self, jobspec):
        return True, ''
def trigger_preparation(self, jobspec):
    """Stage in the job's input files via a Globus transfer.

    Builds source/destination paths for every not-yet-ready input file,
    verifies both Globus endpoints are activated, and submits a single
    transfer task whose task_id is recorded as the file group ID.

    :param jobspec: job specifications
    :return: A tuple of return code (True: success/queued, False: fatal
             failure, None handled by exception path) and error dialog
    """
    # get logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_preparation')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
    self.srcEndpoint = queueConfig.preparator['srcEndpoint']
    self.Globus_dstPath = self.basePath
    #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
    self.dstEndpoint = queueConfig.preparator['dstEndpoint']
    # get input files
    files = []
    lfns = []
    inFiles = jobspec.get_input_file_attributes(skip_ready=True)
    for inLFN, inFile in iteritems(inFiles):
        # set path to each file
        inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN)
        dstpath = inFile['path']
        # check if path exists if not create it.
        if not os.access(self.basePath, os.F_OK):
            os.makedirs(self.basePath)
        # create the file paths for the Globus source and destination endpoints
        Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile['scope'], inLFN)
        Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile['scope'], inLFN)
        files.append({'scope': inFile['scope'],
                      'name': inLFN,
                      'Globus_dstPath': Globus_dstpath,
                      'Globus_srcPath': Globus_srcpath})
        lfns.append(inLFN)
    tmpLog.debug('files[] {0}'.format(files))
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errStr = ''
            if not tmpStatsrc:
                errStr += ' source Endpoint not activated '
            if not tmpStatdst:
                errStr += ' destination Endpoint not activated '
            tmpLog.error(errStr)
            return False, errStr
        # both endpoints activated now prepare to transfer data
        if len(files) > 0:
            tdata = TransferData(self.tc,
                                 self.srcEndpoint,
                                 self.dstEndpoint,
                                 label=label,
                                 sync_level="checksum")
            # loop over all input files and add
            for myfile in files:
                tdata.add_item(myfile['Globus_srcPath'], myfile['Globus_dstPath'])
            # submit
            transfer_result = self.tc.submit_transfer(tdata)
            # check status code and message
            tmpLog.debug(str(transfer_result))
            if transfer_result['code'] == "Accepted":
                # succeeded
                # set transfer ID which are used for later lookup
                transferID = transfer_result['task_id']
                jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
                tmpLog.debug('done')
                return True, ''
            else:
                return False, transfer_result['message']
        # if no files to transfer return True
        return True, 'No files to transfer'
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        # FIX: previously returned an empty dict instead of the error message,
        # so callers never saw why the submission failed
        return errStat, errMsg
def main():
    """Deploy harvester onto a remote machine over SSH.

    Downloads harvester and its pip dependencies for the remote python
    version into a temporary directory, copies them via sftp and installs
    them with 'pip --no-index --find-links' on the remote side.
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument('--remoteDir', action='store', dest='remoteDir', default='harvester',
                        help='directory on the remote target machine where harvester is installed')
    parser.add_argument('--remoteBuildDir', action='store', dest='remoteBuildDir', default='harvester_build',
                        help='directory on the remote target machine where harvester is build')
    parser.add_argument('--remotePythonSetup', action='store', dest='remotePythonSetup', default='',
                        help='python setup on remote target machine')
    parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware', action='store', dest='middleware', default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()
    # remove ~/ which doesn't work with sftp
    options.remoteDir = re.sub('^~/', '', options.remoteDir)
    options.remoteBuildDir = re.sub('^~/', '', options.remoteBuildDir)
    # get queue
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print ('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName))
        sys.exit(1)
    # get middleware
    if not hasattr(queueConfig, options.middleware):
        print ('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware,
                                                                                               options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)
    # get ssh parameters
    sshHost = middleware['remoteHost']
    try:
        sshPort = middleware['remotePort']
    except Exception:
        sshPort = 22
    sshUserName = middleware['sshUserName']
    try:
        sshPassword = middleware['sshPassword']
    except Exception:
        sshPassword = None
    privateKey = None
    passPhrase = None
    if sshPassword is None:
        try:
            privateKey = middleware['privateKey']
        except Exception:
            print ("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware))
            sys.exit(1)
        try:
            passPhrase = middleware['passPhrase']
        except Exception:
            passPhrase = None
    try:
        jumpHost = middleware['jumpHost']
    except Exception:
        jumpHost = None
    try:
        jumpPort = middleware['jumpPort']
    except Exception:
        jumpPort = 22
    # ssh
    sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase,
                                    privateKey, jumpHost, jumpPort)
    # get remote python version
    exec_out = sshClient.exec_command(
        ';'.join([options.remotePythonSetup,
                  """python -c 'import sys;print("{0}{1}".format(*(sys.version_info[:2])))' """])
    )
    remotePythonVer = exec_out[1].read().rstrip()
    # FIX: paramiko channels return bytes on python3 - decode before formatting
    if isinstance(remotePythonVer, bytes):
        remotePythonVer = remotePythonVer.decode()
    sshClient.close()
    print ('remote python version : {0}'.format(remotePythonVer))
    # make tmp dir
    with TemporaryDirectory() as tmpDir:
        harvesterGit = "git+git://github.com/PanDAWMS/panda-harvester.git"
        # get all dependencies
        print ("getting dependencies")
        p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(tmpDir, harvesterGit),
                             stdout=subprocess.PIPE,
                             shell=True)
        stdout, stderr = p.communicate()
        # FIX: Popen output is bytes on python3 - decode before splitting lines
        if isinstance(stdout, bytes):
            stdout = stdout.decode()
        packages = []
        for line in stdout.split('\n'):
            if line.startswith('Successfully downloaded'):
                packages = line.split()[2:]
        packages.append(harvesterGit)
        packages.append('pip')
        # FIX: guard removal - a change in pip's output format would otherwise
        # raise ValueError here
        if 'pandaharvester' in packages:
            packages.remove('pandaharvester')
        # download packages
        print ("pip download to {0}".format(tmpDir))
        for package in packages:
            print ("getting {0}".format(package))
            ret = subprocess.call("pip download --no-deps --python-version {0} -d {1} {2}".format(remotePythonVer,
                                                                                                  tmpDir,
                                                                                                  package),
                                  shell=True)
            if ret != 0:
                print ("ERROR: failed to download {0}".format(package))
                sys.exit(1)
        # sftp
        sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase,
                                        privateKey, jumpHost, jumpPort)
        try:
            sshClient.exec_command('rm -rf {0}; mkdir -p {0}'.format(options.remoteBuildDir))
        except Exception:
            pass
        sftp = sshClient.open_sftp()
        for name in os.listdir(tmpDir):
            path = os.path.join(tmpDir, name)
            if os.path.isdir(path):
                continue
            remotePath = os.path.join(options.remoteBuildDir, name)
            print ("copy {0} to {1}".format(name, remotePath))
            sftp.put(path, remotePath)
        # install
        print ("install harvester")
        buildDir = options.remoteBuildDir
        if not buildDir.startswith('/'):
            buildDir = '~/' + buildDir
        exec_out = sshClient.exec_command(
            ';'.join([options.remotePythonSetup,
                      'cd {0}'.format(options.remoteDir),
                      'pip install pip pandaharvester --no-index --find-links {0}'.format(buildDir)])
        )
        print (exec_out[1].read())
        print (exec_out[2].read())
        sshClient.close()
class GoogleMonitor(PluginBase):
    """Monitor plugin that maps Google Compute Engine VM states to worker states."""

    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.queue_config_mapper = QueueConfigMapper()
        # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
        self.vm_to_worker_status = {
            'RUNNING': WorkSpec.ST_running,
            'TERMINATED': WorkSpec.ST_running,  # the VM is stopped, but has to be fully deleted
            'STOPPING': WorkSpec.ST_finished,
            'PROVISIONING': WorkSpec.ST_submitted,
            'STAGING': WorkSpec.ST_submitted
        }

    def list_vms(self, zone):
        """
        List the status of the running VMs
        :return: (names list, name->status dict), or (None, None) on API failure
        """
        try:
            result = compute.instances().list(project=PROJECT, zone=zone).execute()
            try:
                vm_instances = result['items']
            except KeyError:
                # there are no VMs running
                return [], {}
            # make a list with the VM names
            # FIX: materialize as a list - a py3 map() iterator would be consumed
            # by the first membership test in check_workers
            vm_names = [vm_instance['name'] for vm_instance in vm_instances]
            # make a dictionary so we can retrieve a VM by its name
            vm_name_to_status = {}
            for vm_instance in vm_instances:
                vm_name_to_status[vm_instance['name']] = vm_instance['status']
            return vm_names, vm_name_to_status
        except Exception:
            # deliberate best-effort: callers treat (None, None) as "listing failed"
            return None, None

    def kill_worker(self, vm_name, zone):
        """
        Sends the command to Google to destroy a VM
        """
        try:
            base_logger.debug('Going to kill VM {0}'.format(vm_name))
            compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute()
            base_logger.debug('Killed VM {0}'.format(vm_name))
        except Exception as e:
            base_logger.error('Problems killing the VM: {0}'.format(e))

    def check_workers(self, workers):
        """
        This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses.
        Nth element in the return list corresponds to the status of Nth WorkSpec in the given list.

        :param worker_list: a list of work specs instances
        :return: A tuple containing the return code (True for success, False otherwise)
                 and a list of worker's statuses
        :rtype: (bool, [string,])
        """
        if not workers:
            return False, 'Empty workers list received'
        # it assumes that all workers belong to the same queue, which is currently the case
        # we assume all work_specs in the list belong to the same queue
        queue_config = self.queue_config_mapper.get_queue(workers[0].computingSite)
        try:
            zone = queue_config.zone
        except AttributeError:
            zone = ZONE
        # running instances
        vm_names, vm_name_to_status = self.list_vms(zone)
        if vm_names is None and vm_name_to_status is None:
            error_string = 'Could not list the VMs'
            base_logger.error(error_string)
            return False, error_string
        # extract the list of batch IDs
        # FIX: list(), not a bare map() - the py3 iterator would log as
        # '<map object ...>' and iterate only once
        batch_IDs = [str(worker.batchID) for worker in workers]
        base_logger.debug('Batch IDs: {0}'.format(batch_IDs))
        ret_list = []
        for batch_ID in batch_IDs:
            tmp_log = self.make_logger(base_logger, 'batch ID={0}'.format(batch_ID),
                                       method_name='check_workers')
            if batch_ID not in vm_names:
                new_status = WorkSpec.ST_finished
                message = 'VM not found'
            else:
                try:
                    new_status = self.vm_to_worker_status[vm_name_to_status[batch_ID]]
                    message = 'VM status returned by GCE API'
                    # Preemptible VMs: GCE terminates a VM, but a stopped VM with its disk is left and needs to be
                    # explicitly deleted
                    if vm_name_to_status[batch_ID] == 'TERMINATED':
                        self.kill_worker(batch_ID, zone)
                except KeyError:
                    new_status = WorkSpec.ST_missed
                    message = 'Unknown status to Harvester: {0}'.format(vm_name_to_status[batch_ID])
            tmp_log.debug('new_status={0}'.format(new_status))
            ret_list.append((new_status, message))
        base_logger.debug('ret_list: {0}'.format(ret_list))
        return True, ret_list
def check_status(self, jobspec):
    """Check the transfer status of the job's output files through their
    Rucio replication rules.

    Looks up the queue config and the destination RSE, sets objstoreID on
    the file specs, then polls each Rucio rule grouped on the job's output
    files, updating file/group status accordingly.

    :param jobspec: job specifications
    :return: A tuple of return code (True: transferred, False: fatal
             failure, None: on-going or temporary failure) and error dialog
    """
    tmpStat = True
    tmpMsg = ''
    # make logger
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,
                                                                threading.current_thread().ident),
                              method_name='check_status')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,
                                                                             queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # get destination endpoint
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,
                                                                                            self.objstoreID,
                                                                                            self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # Get the files grouped by Rucio Rule ID
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        tmpLog.debug('No Rucio Rules')
        return None, 'No Rucio Rules'
    tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups), groups))
    try:
        rucioAPI = RucioClient()
    except Exception:
        tmpLog.error('failure to get Rucio Client try again later')
        return None, 'failure to get Rucio Client try again later'
    # loop over the Rucio rules
    for rucioRule in groups:
        if rucioRule is None:
            continue
        # lock
        have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(rucioRule)
            tmpLog.debug(msgStr)
            return None, msgStr
        # get transfer status
        groupStatus = self.dbInterface.get_file_group_status(rucioRule)
        tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule, groupStatus))
        if 'transferred' in groupStatus:
            # already succeeded - set the fileSpec status for these files
            self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        elif 'failed' in groupStatus:
            # transfer failure
            tmpStat = False
            # FIX: message previously referenced undefined datasetScope/datasetName
            # and raised NameError instead of reporting the failure
            tmpMsg = 'rucio rule for {0} already failed'.format(rucioRule)
        elif 'transferring' in groupStatus or 'pending' in groupStatus:
            # transfer started in Rucio check status
            try:
                result = rucioAPI.get_replication_rule(rucioRule, False)
                if result['state'] == "OK":
                    # files transfered to nucleus
                    tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule))
                    self.dbInterface.update_file_group_status(rucioRule, 'transferred')
                    # set the fileSpec status for these files
                    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                    self.set_FileSpec_status(jobspec, 'finished')
                elif result['state'] == "FAILED":
                    # failed Rucio Transfer
                    tmpStat = False
                    tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
                    self.set_FileSpec_status(jobspec, 'failed')
                elif result['state'] == 'STUCK':
                    tmpStat = None
                    tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
            except Exception:
                # deliberate best-effort: treat lookup failure as temporary
                tmpStat = None
                tmpMsg = 'Could not get information or Rucio Rule {0}'.format(rucioRule)
                tmpLog.error(tmpMsg)
        # release the lock
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule))
            release_db_lock = self.dbInterface.release_object_lock(rucioRule)
            if release_db_lock:
                tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule))
                have_db_lock = False
            else:
                msgStr = ' Could not release DB lock for {}'.format(rucioRule)
                tmpLog.error(msgStr)
                return None, msgStr
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
class SAGAMonitor(PluginBase):
    """Monitor plugin that tracks workers through a SAGA job service."""

    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument
        and returns a list of worker's statuses.

        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = rs.job.Service(self.adaptor)
        except rs.SagaException as ex:
            time.sleep(10)
            # FIX: return the retry's result - previously the recursive call's
            # return value was dropped and execution continued into a NameError
            # on the undefined job_service
            return self.check_workers(workspec_list)
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID,
                                                                                       harvester_job_state,
                                                                                       worker.exit_code))
                    workSpec.set_status(harvester_job_state)
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.utcfromtimestamp(worker.created)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.utcfromtimestamp(worker.started)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.utcfromtimestamp(worker.finished)
                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status,
                                                                                         workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:
                            # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, \
                                endtime, errStr = self.deep_checkjob(workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                            # jsonFilePath = os.path.join(workSpec.get_access_point(), harvester_config.payload_interaction.killWorkerFile)
                            # tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath))
                            # try:
                            #     os.utime(jsonFilePath, None)
                            # except OSError:
                            #     open(jsonFilePath, 'a').close()
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))
                    if worker.state == rs.job.PENDING:
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than limit {1} worker will be canceled".format(
                                    queue_time, self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                    # proper processing of jobs for worker will be required, to avoid 'fake' fails
                    if worker.state == rs.job.RUNNING:
                        tmpLog.info("Going to check that all jobs of the worker are in the final status.")
                        dbProxy = DBProxy()
                        job_spec_list = dbProxy.get_jobs_with_worker_id(workSpec.workerID, None,
                                                                        only_running=False, slim=False)
                        allFinal = True
                        for job_spec in job_spec_list:
                            if not job_spec.is_final_status():
                                allFinal = False
                                tmpLog.info("Not all jobs are in the final status, skip till the next monitoring cycle.")
                                break
                        if allFinal:
                            tmpLog.info("All jobs are in the final status, going to cancel the worker.")
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = 0
                            cur_time = datetime.utcnow()
                            workSpec.endTime = cur_time
                            # ask the pilot to shut down through the kill-worker file
                            jsonFilePath = os.path.join(workSpec.get_access_point(),
                                                        harvester_config.payload_interaction.killWorkerFile)
                            tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath))
                            try:
                                os.utime(jsonFilePath, None)
                            except OSError:
                                open(jsonFilePath, 'a').close()
                            workSpec.set_status(workSpec.ST_finished)
                            harvester_job_state = workSpec.ST_finished
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                except rs.SagaException as ex:
                    tmpLog.info('An exception occured during retriving worker information {0}'.format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'fnished' is not proper state in this case, 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, \
                        endtime, errStr = self.deep_checkjob(workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                # FIX: use a context manager so the file is closed even on write errors
                with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
                    f.write(workSpec.status)
            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))
        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))
        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid:
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
            if batchjob_info:
                tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
                harvester_job_state = batchjob_info['status']
                nativeexitcode = batchjob_info['nativeExitCode']
                nativestatus = batchjob_info['nativeStatus']
                diagmessage = batchjob_info['nativeExitMsg']
                if batchjob_info['start_time']:
                    starttime = batchjob_info['start_time']
                if batchjob_info['finish_time']:
                    endtime = batchjob_info['finish_time']
        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
def check_stage_out_status(self, jobspec):
    """Check the status of the Rucio rules transferring this job's output files.

    Resolves the destination RSE from the job's nucleus via the cached AGIS data,
    stamps fileSpec.objstoreID accordingly, then polls each Rucio rule (one per
    output-file group) under a DB object lock.

    :param jobspec: job specification whose output file groups (keyed by Rucio rule ID) are checked
    :return: (True, msg) when all checked rules are fine, (False, msg) on transfer failure,
             (None, msg) for temporary conditions (no rules yet, lock contention, Rucio errors)
    """
    tmpStat = True
    tmpMsg = ''
    # make logger
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='check_stage_out_status')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager and queueConfig.stager['jobtype'] == "Yoda":
        self.Yodajob = True
    # get destination endpoint from the nucleus of the job
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # Get the files grouped by Rucio Rule ID
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        tmpLog.debug('No Rucio Rules')
        return None, 'No Rucio Rules'
    tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups), groups))
    try:
        rucioAPI = RucioClient()
    except Exception:
        # narrowed from a bare except; treat as a temporary error and retry later
        tmpLog.error('failure to get Rucio Client try again later')
        return None, 'failure to get Rucio Client try again later'
    # loop over the Rucio rules
    for rucioRule in groups:
        if rucioRule is None:
            continue
        # lock so that only one thread works on this rule at a time
        have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(rucioRule)
            tmpLog.debug(msgStr)
            return None, msgStr
        # get transfer status
        groupStatus = self.dbInterface.get_file_group_status(rucioRule)
        tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule, groupStatus))
        if 'transferred' in groupStatus:
            # already succeeded - set the fileSpec status for these files
            self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        elif 'failed' in groupStatus:
            # transfer failure
            tmpStat = False
            # BUG FIX: the original formatted undefined names datasetScope/datasetName here,
            # which raised NameError; report the failing rule instead
            tmpMsg = 'rucio rule for {0} already failed'.format(rucioRule)
        elif 'transferring' in groupStatus or 'pending' in groupStatus:
            # transfer started in Rucio, check the rule state
            try:
                result = rucioAPI.get_replication_rule(rucioRule, False)
                if result['state'] == "OK":
                    # files transfered to nucleus
                    tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule))
                    self.dbInterface.update_file_group_status(rucioRule, 'transferred')
                    # set the fileSpec status for these files
                    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                    self.set_FileSpec_status(jobspec, 'finished')
                elif result['state'] == "FAILED":
                    # failed Rucio Transfer
                    tmpStat = False
                    tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
                    self.set_FileSpec_status(jobspec, 'failed')
                elif result['state'] == 'STUCK':
                    tmpStat = None
                    tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
            except Exception:
                # narrowed from a bare except; treat as a temporary error
                tmpStat = None
                tmpMsg = 'Could not get information or Rucio Rule {0}'.format(rucioRule)
                tmpLog.error(tmpMsg)
        # release the lock
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule))
            release_db_lock = self.dbInterface.release_object_lock(rucioRule)
            if release_db_lock:
                tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule))
                have_db_lock = False
            else:
                msgStr = ' Could not release DB lock for {}'.format(rucioRule)
                tmpLog.error(msgStr)
                return None, msgStr
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
def trigger_preparation(self, jobspec):
    """Stage in the job's input files with a Globus transfer from the source endpoint.

    Builds the local destination paths under basePath, checks both endpoints are
    activated, then submits a single Globus transfer labelled for this job.

    :param jobspec: job specification carrying the input file attributes
    :return: (True, '') on successful submission (also when already queued or
             nothing to transfer), (False, error_message) on failure, or the
             Globus exception handler's result on unexpected errors
    """
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), method_name='trigger_preparation')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label identifying this job's transfer task
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
    self.srcEndpoint = queueConfig.preparator['srcEndpoint']
    self.Globus_dstPath = self.basePath
    # self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
    self.dstEndpoint = queueConfig.preparator['dstEndpoint']
    # make sure the local base path exists (hoisted out of the per-file loop; the
    # check does not depend on the individual file)
    if not os.access(self.basePath, os.F_OK):
        os.makedirs(self.basePath)
    # get input files
    files = []
    lfns = []
    inFiles = jobspec.get_input_file_attributes(skip_ready=True)
    for inLFN, inFile in iteritems(inFiles):
        # set path to each file
        inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN)
        # create the file paths for the Globus source and destination endpoints
        Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile['scope'], inLFN)
        Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile['scope'], inLFN)
        files.append({'scope': inFile['scope'],
                      'name': inLFN,
                      'Globus_dstPath': Globus_dstpath,
                      'Globus_srcPath': Globus_srcpath})
        lfns.append(inLFN)
    tmpLog.debug('files[] {0}'.format(files))
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errStr = ''
            if not tmpStatsrc:
                errStr += ' source Endpoint not activated '
            if not tmpStatdst:
                errStr += ' destination Endpoint not activated '
            tmpLog.error(errStr)
            return False, errStr
        # both endpoints activated now prepare to transfer data
        if len(files) > 0:
            tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, label=label, sync_level="checksum")
            # loop over all input files and add them to the transfer
            for myfile in files:
                tdata.add_item(myfile['Globus_srcPath'], myfile['Globus_dstPath'])
            # submit
            transfer_result = self.tc.submit_transfer(tdata)
            # check status code and message
            tmpLog.debug(str(transfer_result))
            if transfer_result['code'] == "Accepted":
                # succeeded - set transfer ID which is used for later lookup
                transferID = transfer_result['task_id']
                jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
                tmpLog.debug('done')
                return True, ''
            return False, transfer_result['message']
        # if no files to transfer return True
        return True, 'No files to transfer'
    except Exception:
        # narrowed from a bare except
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        # BUG FIX: the original returned (errStat, {}) and discarded the handler's message
        return errStat, errMsg
def submit_k8s_worker(self, work_spec):
    """Submit one worker to Kubernetes as a job built from the configured YAML template.

    Reads the job configuration (push model), decides image/executable, chooses a
    proxy and pilot options, and creates the k8s job via the client.

    :param work_spec: worker specification; batchID is filled in on success
    :return: tuple (True, '') on success or (False, error_message) on failure
    """
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
    # TODO: consider if we want to upload the yaml file to PanDA cache

    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            # no proxy available: refuse submission
            err_str = 'No proxy specified in proxySecretPath. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit; missing field is tolerated and maps to no limit
        try:
            max_time = this_panda_queue_dict['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # keep only the AGIS harvester params this submitter cares about
        associated_params_dict = {}
        for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
            if key in self._allowed_agis_attrs:
                associated_params_dict[key] = val

        pilot_url = associated_params_dict.get('pilot_url')
        # NOTE(review): pilot_version is computed but not used below - confirm intended
        pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
        python_version = str(this_panda_queue_dict.get('python_version', '2'))

        # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
        pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
        if pilot_opt_dict is None:
            # plain pilot type: derive options locally
            prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_type = work_spec.pilotType
            pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
        else:
            # complicated pilot type: options come pre-computed
            prod_source_label = pilot_opt_dict['prod_source_label']
            pilot_type = pilot_opt_dict['pilot_type_opt']
            pilot_url_str = pilot_opt_dict['pilot_url_str']
        pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label,
                                                                       pilot_type, pilot_url_str,
                                                                       pilot_python_option,
                                                                       container_image, executable, args, cert,
                                                                       cpu_adjust_ratio=self.cpuAdjustRatio,
                                                                       memory_adjust_ratio=self.memoryAdjustRatio,
                                                                       max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        # NOTE(review): batchID is taken from the template yaml_content, not from
        # yaml_content_final returned by create_job_from_yaml - confirm the template
        # metadata.name matches the job actually created
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
def check_stage_out_status(self, jobspec):
    """Check/trigger bulk Globus stage-out for this job's output files.

    Output file groups are first keyed by a dummy transfer ID; once enough files
    are pooled (>= 10, or the group is older than 10 minutes) a real Globus
    transfer is submitted under a DB object lock and the group is re-keyed by the
    real transfer task ID. Groups already keyed by a valid transfer ID are polled
    for completion.

    :param jobspec: job specification whose output file groups are checked
    :return: (True, '') when transfers succeeded or there is nothing to check,
             (False, msg) on a failed transfer, (None, msg) for temporary
             conditions (lock contention, pooling, transient Globus errors)
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(
        jobspec.PandaID, threading.current_thread().ident), method_name='check_stage_out_status')
    tmpLog.debug('start')
    # show the dummy transfer id and set to a value with the PandaID if needed.
    tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id))
    if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX'):
        old_dummy_transfer_id = self.dummy_transfer_id
        self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, jobspec.PandaID)
        tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id, self.dummy_transfer_id))
    # default return
    tmpRetVal = (True, '')
    # set flag if have db lock
    have_db_lock = False
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get the scope of the log files
    outfileattrib = jobspec.get_output_file_attributes()
    scopeLog = 'xxxx'
    for key in outfileattrib.keys():
        if "log.tgz" in key:
            scopeLog = outfileattrib[key]['scope']
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    for dummy_transferID in groups:
        # skip if valid transfer ID not dummy one
        if validate_transferid(dummy_transferID):
            continue
        # lock for 120 sec
        tmpLog.debug('attempt to set DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
        have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120)
        if not have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)')
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        tmpLog.debug('After db refresh call groups=jobspec.get_groups_of_output_files()')
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
        # the dummy transfer ID is still there
        if dummy_transferID in groups:
            groupUpdateTime = groups[dummy_transferID]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transferID)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'dummy_transferID = {0} number of files = {1}'.format(dummy_transferID, len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                # self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                self.srcEndpoint = queueConfig.stager['srcEndpoint']
                self.Globus_srcPath = self.basePath
                self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                self.dstEndpoint = queueConfig.stager['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        # NOTE(review): assigns self.have_db_lock here while the rest of the
                        # method uses the local have_db_lock - confirm intended
                        self.have_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None, errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = None
                    tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="checksum")
                except:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                ifile = 0
                for fileSpec in fileSpecs:
                    logfile = False
                    scope = 'panda'
                    if fileSpec.scope is not None:
                        scope = fileSpec.scope
                    # for Yoda job set the scope to transient for non log files
                    if self.Yodajob:
                        scope = 'transient'
                    if fileSpec.fileType == "log":
                        logfile = True
                        scope = scopeLog
                    # only print to log file first 25 files
                    if ifile < 25:
                        msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    if ifile == 25:
                        # NOTE(review): the .format() arguments are unused by this literal
                        msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    # build the Rucio-style hashed destination path from md5(scope:lfn)
                    hash = hashlib.md5()
                    # NOTE(review): update() with a str works on Python 2 only; on
                    # Python 3 this needs bytes - confirm target interpreter
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    srcURL = fileSpec.path
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                               scope=correctedscope,
                                                                               hash1=hash_hex[0:2],
                                                                               hash2=hash_hex[2:4],
                                                                               lfn=fileSpec.lfn)
                    if logfile:
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    if ifile < 25:
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    if os.access(srcURL, os.R_OK):
                        if ifile < 25:
                            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
                        tdata.add_item(srcURL, dstURL)
                    else:
                        errMsg = "source file {} does not exist".format(srcURL)
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (False, errMsg)
                        return tmpRetVal
                    ifile += 1
                # submit transfer
                tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA'])))
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(dummy_transferID)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
            # return None to retry later
            return None, msgStr
        # release the db lock if needed
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
                return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    tmpLog.debug("groups = jobspec.get_groups_of_output_files()")
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('Number of transfer groups - {0}'.format(len(groups)))
    tmpLog.debug('transfer groups any state - {0}'.format(groups))
    if len(groups) == 0:
        tmpLog.debug("jobspec.get_groups_of_output_files(skip_done=True) returned no files ")
        tmpLog.debug("check_stage_out_status return status - True ")
        return True, ''
    for transferID in groups:
        # allow only valid UUID
        if validate_transferid(transferID):
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, self.tc, transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc), str(transferID))
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                if self.changeFileStatusOnSuccess:
                    self.set_FileSpec_status(jobspec, 'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec, 'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID, transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
    # end of loop over transfer groups
    tmpLog.debug('End of loop over transfers groups - ending check_stage_out_status function')
    return None, 'no valid transfer id found'
def submit_workers(self, workspec_list):
    """Submit a list of workers to ARC CEs.

    For each pre-fetched job of each worker, converts the job parameters to an
    XRSL description, selects the proxy by prodSourceLabel, and submits to the
    CEs configured for the job's panda queue.

    :param workspec_list: list of WorkSpec objects to submit
    :return: list of (bool, message) tuples, one entry per job/worker processed
    :raises Exception: when the panda queue or object store caches cannot be read
    """
    retlist = []

    # Get queue info from DB
    pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
    if pandaqueues is None:
        raise Exception("Failed to get panda queue info from database")
    pandaqueues = pandaqueues.data

    osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
    if osmap is None:
        raise Exception("Failed to get Object Store info from database")
    osmap = osmap.data

    for workspec in workspec_list:
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        # Assume for aCT that jobs are always pre-fetched (no late-binding)
        for jobspec in workspec.get_jobspec_list():
            tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

            if jobspec.computingSite not in pandaqueues:
                retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                continue

            # Get CEs from panda queue info
            # List of (endpoint, queue) tuples
            arcces = []
            for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                ce_endpoint = endpoint['ce_endpoint']
                if not re.search('://', ce_endpoint):
                    # default to gsiftp when no scheme is given
                    ce_endpoint = 'gsiftp://%s' % ce_endpoint
                ce_queue = endpoint['ce_queue_name']
                arcces.append((ce_endpoint, ce_queue))

            if not arcces:
                # BUG FIX: original message had a stray '%' ("No CEs defined for %{0}")
                retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                continue

            # Set true pilot or not
            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
            pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat

            # Set log URL for GTAG env in job description
            logbaseurl = queueconfig.submitter.get('logBaseURL')
            logsubdir = self._set_logdir(jobspec.computingSite)
            logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

            tmplog.debug("Converting to ARC XRSL format")
            arcxrsl = ARCParser(jobspec.jobParams,
                                jobspec.computingSite,
                                pandaqueues[jobspec.computingSite],
                                logfileurl,
                                self.schedulerid,
                                osmap,
                                '/tmp',  # tmpdir, TODO common tmp dir
                                None,    # jobSpec.eventranges, TODO event ranges
                                tmplog)
            arcxrsl.parse()
            xrsl = arcxrsl.getXrsl()
            tmplog.debug("ARC xrsl: {0}".format(xrsl))

            # Set the files to be downloaded at the end of the job
            downloadfiles = 'gmlog/errors'
            if 'logFile' in jobspec.jobParams:
                downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
            if not pandaqueues[jobspec.computingSite]['truepilot']:
                downloadfiles += ';jobSmallFiles.tgz'

            # Set certificate: user jobs run with the pilot proxy, everything else with production
            userconfig = arc.UserConfig(self.cred_type)
            if jobspec.jobParams['prodSourceLabel'] == 'user':
                userconfig.ProxyPath(str(self.certs['pilot']))
                proxyrole = 'pilot'
            else:
                userconfig.ProxyPath(str(self.certs['production']))
                proxyrole = 'production'
            tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

            try:
                tmplog.debug("Submission targets: {0}".format(arcces))
                arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                arc_utils.arcjob2workspec(arcjob, workspec)
                # remember per-worker info needed by the monitor/stager
                workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                workspec.workAttributes['proxyrole'] = proxyrole
                workspec.workAttributes['logsubdir'] = logsubdir
                workspec.batchID = arcjob.JobID
                tmplog.debug(workspec.workAttributes)
                result = (True, '')
            except Exception as exc:
                tmplog.error(traceback.format_exc())
                result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

            retlist.append(result)

    return retlist
def main(): parser = argparse.ArgumentParser() parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True, help='the name of queue where harvester is installed') parser.add_argument('--middleware', action='store', dest='middleware', default='rpc', help='middleware to access the remote target machine') options = parser.parse_args() # get queue qcm = QueueConfigMapper() qcm.load_data() queueConfig = qcm.get_queue(options.queueName) if queueConfig is None: print('ERROR: queue={0} not found in panda_queueconfig.json'.format( options.queueName)) sys.exit(1) # get middleware if not hasattr(queueConfig, options.middleware): print( 'ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json' .format(options.middleware, options.queueName)) sys.exit(1) middleware = getattr(queueConfig, options.middleware) # get ssh parameters sshHost = middleware['remoteHost'] try: sshPort = middleware['remotePort'] except Exception: sshPort = 22 sshUserName = middleware['sshUserName'] try: sshPassword = middleware['sshPassword'] except Exception: sshPassword = None privateKey = None passPhrase = None if sshPassword is None: try: privateKey = middleware['privateKey'] except Exception: print("ERROR: set sshPassword or privateKey in middleware={0}". format(options.middleware)) sys.exit(1) try: passPhrase = middleware['passPhrase'] except Exception: passPhrase = None try: jumpHost = middleware['jumpHost'] except Exception: jumpHost = None try: jumpPort = middleware['jumpPort'] except Exception: jumpPort = 22 # ssh sshTunnelPool.make_tunnel_server( sshHost, sshPort, remote_bind_port=middleware['remoteBindPort'], num_tunnels=1, ssh_username=sshUserName, ssh_password=sshPassword, private_key=privateKey, pass_phrase=passPhrase, jump_host=jumpHost, jump_port=jumpPort) ssh = sshTunnelPool.get_tunnel(sshHost, sshPort)[-1] return ssh
def check_status(self, jobspec):
    """Check output-transfer status for a job and drive Rucio replication.

    First delegates to GlobusBulkStager.check_status; when that succeeds,
    walks every output-file transfer group of the job and, depending on the
    group's recorded state, either checks an existing Rucio replication rule
    or registers a new dataset/rule from the source RSE to the destination
    RSE derived from the job's nucleus.

    :param jobspec: job specification whose output file groups are checked
    :return: (True, msg) all transfers done, (False, msg) permanent failure,
             (None, msg) temporary error or lock contention
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='check_status')
    tmpLog.debug('executing base check_status')
    tmpStat, tmpMsg = GlobusBulkStager.check_status(self, jobspec)
    tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg))
    if tmpStat is not True:
        # base stager failed or is still in progress; propagate as-is
        return tmpStat, tmpMsg
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        return tmpStat, tmpMsg
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if srcRSE is set
    # NOTE(review): if 'srcRSE' is absent, srcRSE is never assigned and the
    # debug/format lines below raise NameError — confirm the config always
    # sets it, or initialize srcRSE = None before this branch.
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    # get destination endpoint: first 'pr' storage of the AGIS site matching the job's nucleus
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # if debugging log source and destination RSEs
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # test that srcRSE and dstRSE are defined
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Desitination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # create the Rucio Client
    try:
        # register dataset
        rucioAPI = RucioClient()
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        # NOTE(review): datasetScope/datasetName are not defined yet at this
        # point — if RucioClient() raises, this format call raises NameError.
        tmpStat = None
        tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        return tmpStat, tmpMsg
    # loop over all transfers
    tmpStat = True
    tmpMsg = ''
    for transferID in groups:
        if transferID is None:
            continue
        # per-transfer transient dataset used as the replication unit
        datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, transferID)
        datasetScope = 'transient'
        # lock so only one thread works on this transfer group at a time
        have_db_lock = self.dbInterface.get_object_lock(transferID, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(transferID)
            tmpLog.debug(msgStr)
            return None, msgStr
        # get transfer status
        groupStatus = self.dbInterface.get_file_group_status(transferID)
        if 'hopped' in groupStatus:
            # already succeeded
            pass
        elif 'failed' in groupStatus:
            # transfer failure
            tmpStat = False
            tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName)
        elif 'hopping' in groupStatus:
            # check rucio rule for the destination RSE
            ruleStatus = 'FAILED'
            try:
                tmpLog.debug('check state for {0}:{1}'.format(datasetScope, datasetName))
                for ruleInfo in rucioAPI.list_did_rules(datasetScope, datasetName):
                    if ruleInfo['rse_expression'] != dstRSE:
                        continue
                    ruleStatus = ruleInfo['state']
                    tmpLog.debug('got state={0}'.format(ruleStatus))
                    if ruleStatus == 'OK':
                        break
            except DataIdentifierNotFound:
                tmpLog.error('dataset not found')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                ruleStatus = None
            if ruleStatus in ['FAILED', 'CANCELED']:
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(datasetScope, datasetName, ruleStatus)
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'failed')
            elif ruleStatus == 'OK':
                # update successful file group status
                self.dbInterface.update_file_group_status(transferID, 'hopped')
            else:
                # replicating or temporary error
                tmpStat = None
                tmpMsg = 'replicating or temporary error for {0}:{1}'.format(datasetScope, datasetName)
        else:
            # make rucio rule: build the file list for the dataset
            fileSpecs = self.dbInterface.get_files_with_group_id(transferID)
            fileList = []
            for fileSpec in fileSpecs:
                tmpFile = dict()
                tmpFile['scope'] = datasetScope
                tmpFile['name'] = fileSpec.lfn
                tmpFile['bytes'] = fileSpec.fsize
                tmpFile['adler32'] = fileSpec.chksum
                if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                    tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                else:
                    tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                fileList.append(tmpFile)
                # get source RSE from the object store id when not configured
                if srcRSE is None and fileSpec.objstoreID is not None:
                    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                    srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
            try:
                # register dataset (hidden, 30-day lifetime) at the source RSE
                tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                             .format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
                try:
                    rucioAPI.add_dataset(datasetScope, datasetName,
                                         meta={'hidden': True},
                                         lifetime=30 * 24 * 60 * 60,
                                         rse=srcRSE
                                         )
                except DataIdentifierAlreadyExists:
                    # ignore even if the dataset already exists
                    pass
                except Exception:
                    errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, datasetName, srcRSE)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.error(errMsg)
                    raise
                    # return None,errMsg
                # add files to dataset
                # add 500 files at a time
                numfiles = len(fileList)
                maxfiles = 500
                # NOTE(review): under Python 3, / is float division, so
                # range(numslices) below raises TypeError; should be //.
                numslices = numfiles / maxfiles
                if (numfiles % maxfiles) > 0:
                    numslices = numslices + 1
                start = 0
                for i in range(numslices):
                    try:
                        stop = start + maxfiles
                        if stop > numfiles:
                            stop = numfiles
                        rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                         'name': datasetName,
                                                         'dids': fileList[start:stop],
                                                         'rse': srcRSE}],
                                                       ignore_duplicate=True)
                        start = stop
                    except FileAlreadyExists:
                        # ignore if files already exist
                        pass
                    except Exception:
                        errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, datasetName, srcRSE, fileList)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.error(errMsg)
                        return None, errMsg
                # add replication rule to copy the dataset to the destination
                try:
                    tmpDID = dict()
                    tmpDID['scope'] = datasetScope
                    tmpDID['name'] = datasetName
                    tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                           lifetime=30 * 24 * 60 * 60)
                    ruleIDs = tmpRet[0]
                    tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, str(ruleIDs)))
                except DuplicateRule:
                    # ignore duplicated rule
                    tmpLog.debug('rule is already available')
                except Exception:
                    errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.debug(errMsg)
                    # raise
                    return None, errMsg
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'hopping')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                # treat as a temporary error
                tmpStat = None
                tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        # release lock
        self.dbInterface.release_object_lock(transferID)
        # escape if already failed
        if tmpStat is False:
            break
    # all done
    if tmpStat is True:
        self.set_FileSpec_status(jobspec, 'finished')
    tmpLog.debug('done with {0} : {1}'.format(tmpStat, tmpMsg))
    return tmpStat, tmpMsg
class SAGAMonitor(PluginBase):
    """Monitor plugin that tracks worker status through a SAGA job service."""

    # constructor
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        # self.adaptor is expected to come from plugin configuration via **kwarg
        tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))

    # check workers
    def check_workers(self, workspec_list):
        """Check status of workers. This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses.

        :param workspec_list: a list of work specs instances
        :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses.
        :rtype: (bool, [string,])
        """
        try:
            job_service = saga.job.Service(self.adaptor)
        except saga.SagaException as ex:
            # NOTE(review): this retry does not `return` its result, so after
            # the recursive call execution falls through with job_service
            # unbound (NameError) — should be `return self.check_workers(...)`.
            time.sleep(10)
            self.check_workers(workspec_list)
        # timestamp format emitted by the SAGA adaptor, e.g. 'Mon Jan 01 12:00:00 2018'
        sagadateformat_str = '%a %b %d %H:%M:%S %Y'
        retList = []
        for workSpec in workspec_list:
            # make logger
            errStr = ''
            tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                      method_name='check_workers')
            tmpLog.debug("SAGA monitor started")
            if workSpec.batchID:
                saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
                try:
                    worker = job_service.get_job(saga_submission_id)
                    tmpLog.debug(
                        'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
                    # translate SAGA state into a harvester worker state
                    harvester_job_state = SAGASubmitter.status_translator(worker.state)
                    workSpec.nativeStatus = worker.state
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug(
                        'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID,
                                                                                       harvester_job_state,
                                                                                       worker.exit_code))
                    workSpec.set_status(harvester_job_state)
                    # copy SAGA timestamps onto the work spec when available
                    if worker.created:
                        tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                        workSpec.submitTime = datetime.strptime(worker.created, sagadateformat_str)
                    if worker.started:
                        tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                        workSpec.startTime = datetime.strptime(worker.started, sagadateformat_str)
                    if worker.finished:
                        tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                        workSpec.endTime = datetime.strptime(worker.finished, sagadateformat_str)
                    if workSpec.is_final_status():
                        workSpec.nativeExitCode = worker.exit_code
                        tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status,
                                                                                        workSpec.nativeExitCode))
                        if workSpec.nativeExitCode != 0:
                            # let's try to find exit code, exit message etc...
                            tmpLog.info("Deep check to find exit code and exit status required")
                            harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                                workSpec.batchID, workSpec.workerID)
                            if harvester_job_state == "":
                                harvester_job_state = workSpec.ST_finished
                            if not workSpec.startTime:
                                workSpec.startTime = starttime
                            if endtime:
                                workSpec.endTime = endtime
                            workSpec.set_status(harvester_job_state)
                        tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                            workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                        tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))
                    if worker.state == saga.job.PENDING:
                        # cancel workers that sat in the queue longer than maxqueuetime
                        queue_time = (datetime.now() - workSpec.submitTime).total_seconds()
                        tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                        if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                            tmpLog.info(
                                "Queue time {0} is longer than limit {1} worker will be canceled".format(queue_time,
                                                                                                         self.maxqueuetime))
                            worker.cancel()
                            worker.wait()
                            workSpec.nativeExitCode = worker.exit_code
                            cur_time = datetime.now()
                            workSpec.startTime = cur_time
                            workSpec.endTime = cur_time
                            workSpec.set_pilot_closed()
                            workSpec.set_status(workSpec.ST_cancelled)
                            harvester_job_state = workSpec.ST_cancelled
                            tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                         workSpec.nativeExitCode))
                        # proper processing of jobs for worker will be required, to avoid 'fake' fails
                except saga.SagaException as ex:
                    tmpLog.info('An exception occured during retriving worker information {0}'.format(workSpec.batchID))
                    tmpLog.info(ex.get_message())
                    # probably 'fnished' is not proper state in this case, 'undefined' looks a bit better
                    # some more work for SAGA to get proper state
                    harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob(
                        workSpec.batchID, workSpec.workerID)
                    if harvester_job_state == "":
                        harvester_job_state = workSpec.ST_finished
                    if not workSpec.startTime:
                        workSpec.startTime = starttime
                    if endtime:
                        workSpec.endTime = endtime
                    workSpec.set_status(harvester_job_state)
                    tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))
                retList.append((harvester_job_state, errStr))
                # for compatibility with dummy monitor
                f = open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w')
                f.write(workSpec.status)
                f.close()
            else:
                tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))
        job_service.close()
        tmpLog.debug('Results: {0}'.format(retList))
        return True, retList

    def deep_checkjob(self, batchid, workerid):
        """
        Get job state, exit code and some more parameters, from resources depending sources

        :param batchid: batch system id of the worker
        :param workerid: harvester worker id (used only for logging)
        :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage
        """
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob')
        harvester_job_state = None
        nativeexitcode = None
        nativestatus = None
        diagmessage = ""
        starttime = None
        endtime = None
        # resolve the resource-specific utility plugin for this queue, if configured
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        if hasattr(queue_config, 'resource'):
            resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        else:
            tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName))
            resource_utils = None
        if resource_utils:
            batchjob_info = resource_utils.get_batchjob_info(batchid)
            if batchjob_info:
                tmpLog.info('Batch job info collected: {0}'.format(batchjob_info))
                harvester_job_state = batchjob_info['status']
                nativeexitcode = batchjob_info['nativeExitCode']
                nativestatus = batchjob_info['nativeStatus']
                diagmessage = batchjob_info['nativeExitMsg']
                if batchjob_info['start_time']:
                    starttime = batchjob_info['start_time']
                if batchjob_info['finish_time']:
                    endtime = batchjob_info['finish_time']
        return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage
# Test-script fixture: build a FileSpec with one associated random-content
# file on disk; everything created here is cleaned up at interpreter exit.
file_prefix = 'panda.sgotest.'


def exit_func():
    """Remove every test artifact (files starting with file_prefix) from cwd."""
    for f in os.listdir('.'):
        if f.startswith(file_prefix):
            os.remove(f)


atexit.register(exit_func)

queueName = sys.argv[1]
queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)

# primary output file spec (no physical file needed for this test)
fileSpec = FileSpec()
fileSpec.fileType = 'output'
fileSpec.lfn = file_prefix + uuid.uuid4().hex + '.gz'
fileSpec.fileAttributes = {'guid': str(uuid.uuid4())}
fileSpec.checksum = '0d439274'

# associated event-service output with a real file of random content
assFileSpec = FileSpec()
assFileSpec.lfn = file_prefix + uuid.uuid4().hex
assFileSpec.fileType = 'es_output'
assFileSpec.fsize = random.randint(10, 100)
assFileSpec.path = os.path.join(os.getcwd(), assFileSpec.lfn)
# context manager guarantees the handle is closed even on write failure
# (the original left the file object open on exceptions)
with open(assFileSpec.lfn, 'w') as oFile:
    oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(assFileSpec.fsize)))
fileSpec.add_associated_file(assFileSpec)
def submit_workers(self, workspec_list):
    """Submit a list of workers to ARC CEs.

    For each pre-fetched job of each worker: build the list of candidate CEs
    from the cached panda queue info, convert the job parameters to XRSL,
    pick the proxy matching the job's prodSourceLabel, and submit through
    self._arc_submit. Submission results are recorded on the work spec.

    :param workspec_list: list of work specs to submit
    :return: list of (bool, message) tuples, one per submission attempt
    :raises Exception: when the panda queue or object store cache is missing
    """
    retlist = []

    # Get queue info from DB
    pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
    if pandaqueues is None:
        raise Exception("Failed to get panda queue info from database")
    pandaqueues = pandaqueues.data

    osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
    if osmap is None:
        raise Exception("Failed to get Object Store info from database")
    osmap = osmap.data

    for workspec in workspec_list:
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log

        # Assume for aCT that jobs are always pre-fetched (no late-binding)
        for jobspec in workspec.get_jobspec_list():
            tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))

            if jobspec.computingSite not in pandaqueues:
                retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                continue

            # Get CEs from panda queue info
            # List of (endpoint, queue) tuples
            arcces = []
            for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                ce_endpoint = endpoint['ce_endpoint']
                if not re.search('://', ce_endpoint):
                    # default to gsiftp when no scheme is given
                    ce_endpoint = 'gsiftp://%s' % ce_endpoint
                ce_queue = endpoint['ce_queue_name']
                arcces.append((ce_endpoint, ce_queue))

            if not arcces:
                # fixed: message used to read "No CEs defined for %{0}" —
                # the stray '%' leaked into the rendered error text
                retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                continue

            # Set true pilot or not
            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
            pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat

            # Set log URL for GTAG env in job description
            logbaseurl = queueconfig.submitter.get('logBaseURL')
            logsubdir = self._set_logdir(jobspec.computingSite)
            logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None

            tmplog.debug("Converting to ARC XRSL format")
            arcxrsl = ARCParser(jobspec.jobParams,
                                jobspec.computingSite,
                                pandaqueues[jobspec.computingSite],
                                logfileurl,
                                self.schedulerid,
                                osmap,
                                '/tmp',  # tmpdir, TODO common tmp dir
                                None,  # jobSpec.eventranges, # TODO event ranges
                                tmplog)
            arcxrsl.parse()
            xrsl = arcxrsl.getXrsl()
            tmplog.debug("ARC xrsl: {0}".format(xrsl))

            # Set the files to be downloaded at the end of the job
            downloadfiles = 'gmlog/errors'
            if 'logFile' in jobspec.jobParams:
                downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
            if not pandaqueues[jobspec.computingSite]['truepilot']:
                downloadfiles += ';jobSmallFiles.tgz'

            # Set certificate: user jobs go through the pilot proxy,
            # everything else through the production proxy
            userconfig = arc.UserConfig(self.cred_type)
            proxyrole = ''
            if jobspec.jobParams['prodSourceLabel'] == 'user':
                userconfig.ProxyPath(str(self.certs['pilot']))
                proxyrole = 'pilot'
            else:
                userconfig.ProxyPath(str(self.certs['production']))
                proxyrole = 'production'
            tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))

            try:
                tmplog.debug("Submission targets: {0}".format(arcces))
                arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                arc_utils.arcjob2workspec(arcjob, workspec)
                # remember per-job bookkeeping needed later by monitor/stager
                workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                workspec.workAttributes['proxyrole'] = proxyrole
                workspec.workAttributes['logsubdir'] = logsubdir
                workspec.batchID = arcjob.JobID
                tmplog.debug(workspec.workAttributes)
                result = (True, '')
            except Exception as exc:
                tmplog.error(traceback.format_exc())
                result = (False, "Failed to submit ARC job: {0}".format(str(exc)))

            retlist.append(result)

    return retlist
def submit_workers(self, workspec_list):
    """Submit a list of workers through HTCondor.

    Prepares per-worker submit data (CE choice and weighting in ATLAS Grid CE
    mode, SDF template, log file URLs, schedd/pool selection) in a thread
    pool, submits each worker via the module-level submit_a_worker, then
    propagates the returned attributes back onto the work specs.

    :param workspec_list: list of work specs to submit
    :return: list of per-worker return values from submission
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            # already existing directory is fine; anything else is fatal
            if e.errno != errno.EEXIST:
                raise
            else:
                pass

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info (guard against corecount of 0/None)
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'

    # get override requirements from queue configured
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # If ATLAS Grid CE mode used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        # keep one queue dict per CE endpoint; active CEs of known flavour only
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                # do not let a 'default' queue overwrite a more specific one
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # Get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # Build the submit-data dict for one worker (CE choice, template,
        # log URLs). Runs inside a thread pool; closes over the enclosing
        # per-queue variables computed above.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit, }
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE by weighting, falling back to a random pick
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                # NOTE(review): ce_version_str is assigned but never used
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                # strip ':port' to get the bare hostname
                ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                # pick the flavour-specific SDF template when a template dir is configured
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(
                                list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                # NOTE(review): batch_log_value/stdout_value/stderr_value stay
                # unbound if the template lacks log/output/error lines — the
                # parse_batch_job_filename calls below would raise NameError.
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # Choose from Condor schedd and central managers
                if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
                    if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                        condor_schedd, condor_pool = random.choice(list(zip(self.condorSchedd, self.condorPool)))
                    else:
                        condor_schedd = random.choice(self.condorSchedd)
                        condor_pool = self.condorPool
                else:
                    condor_schedd = self.condorSchedd
                    condor_pool = self.condorPool
                # Log Base URL: substitute the schedd hostname placeholder
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if not (log_base_url is None):
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID,
                                                                  guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID,
                                                                     guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID,
                                                                    guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
            # set data dict consumed by submit_a_worker
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
                })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy attributes returned by submit_a_worker back onto the workspec.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')

    return retList
class MultiNodeWorkerMaker(BaseWorkerMaker):
    """Worker maker that packs many jobs into one multi-node worker.

    Worker size is either static (``nNodes * nJobsPerNode`` taken from the
    queue configuration) or dynamic (nodes and walltime obtained at
    construction time from a resource plugin).
    """

    # constructor
    def __init__(self, **kwarg):
        BaseWorkerMaker.__init__(self, **kwarg)
        self.pluginFactory = PluginFactory()
        self.queue_config_mapper = QueueConfigMapper()
        tmpLog = self.make_logger(baseLogger, method_name='__init__')
        tmpLog.info("Multinode workermaker: created.")
        tmpLog.debug("Queue name: {0}".format(self.queueName))
        if self.mode == "static":
            # static: worker size fully fixed by configuration
            tmpLog.info("Static configuration")
        elif self.mode == "dynamic":
            # dynamic: ask the resource plugin for available nodes and walltime
            tmpLog.info("Dynamic configuration")
            self.nNodes, self.walltimelimit = self.get_resources()
        self.nJobsPerWorker = self.nNodes * self.nJobsPerNode

    def _get_executable(self):
        """Return the shell-script body for the scheduler.

        The body consists of the environment setup lines followed by the
        executor command (e.g. ``aprun``) wrapping the pilot.
        """
        exe_str = ""
        tmpLog = self.make_logger(baseLogger, method_name='_get_executable')
        # prepare static environment: one shell line per ", "-separated item
        # NOTE(review): splitting on ", " assumes no commas inside values — confirm
        env_str = ""
        if self.env not in (None, "NULL"):
            env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", ")))
        # prepare executor command
        try:
            if self.executor == "aprun":
                # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]"
                # - for one multicore job per node
                exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, self.nCorePerJob)
                exe_str += self.pilot
            else:
                exe_str = self.executor + " " + self.pilot
            if self.pilot_params:
                exe_str = " ".join([exe_str, self.pilot_params])
        except Exception:
            # missing configuration attributes leave the executor empty;
            # the worker will carry only the environment setup
            tmpLog.error("Unable to build executor command check configuration")
            exe_str = ""
        exe_str = "\n".join([env_str, exe_str])
        tmpLog.debug("Shell script body: \n%s" % exe_str)
        return exe_str

    # make a worker from jobs
    def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
        """Build one multi-node WorkSpec covering all jobs in jobspec_list.

        :param jobspec_list: jobs to pack into the worker (may be empty in pull mode)
        :param queue_config: queue configuration; submitter['nCorePerNode'] is read
        :param job_type: unused here, part of the common worker-maker interface
        :param resource_type: unused here, part of the common worker-maker interface
        :return: configured WorkSpec
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName),
                                        method_name='make_worker')
        tmpLog.info("Multi node worker preparation started.")
        tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker,
                                                                             self.walltimelimit,
                                                                             self.nNodes))
        workSpec = WorkSpec()
        workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode']
        workSpec.minRamCount = 0
        workSpec.maxDiskCount = 0
        workSpec.maxWalltime = self.walltimelimit
        workSpec.workParams = self._get_executable()

        if len(jobspec_list) > 0:
            # push case: we know the jobs, so accumulate their resource requests.
            # Missing jobParams keys are deliberately ignored (best effort).
            for jobSpec in jobspec_list:
                try:
                    workSpec.minRamCount += jobSpec.jobParams['minRamCount']
                except (KeyError, TypeError):
                    pass
                try:
                    workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount']
                except (KeyError, TypeError):
                    pass
        tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(
            self.nNodes, workSpec.maxWalltime, self.nJobsPerWorker))
        return workSpec

    def get_resources(self):
        """Return (nodes, walltime) for the next worker.

        Queries the resource plugin configured for the queue; falls back to
        the statically configured ``nNodes`` and ``walltimelimit`` when no
        plugin is defined.
        """
        tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName),
                                        method_name='get_resources')
        walltime = self.walltimelimit
        queue_config = self.queue_config_mapper.get_queue(self.queueName)
        resource_utils = self.pluginFactory.get_plugin(queue_config.resource)
        if resource_utils:
            nodes, walltime = resource_utils.get_resources()
        else:
            tmpLog.info("Resource plugin is not defined")
            nodes = self.nNodes
        return nodes, walltime
if fork_child_pid != 0: signal_utils.set_suicide_handler(None) os.wait() else: if len(sys.argv) not in (2, 4): print("Wrong number of parameters. You can either:") print(" - specify the queue name") print( " - specify the queue name, jobType (managed, user) and resourceType (SCORE, SCORE_HIMEM, MCORE, MCORE_HIMEM)" ) sys.exit(0) queueName = sys.argv[1] queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) if queueConfig.prodSourceLabel in ('user', 'managed'): jobType = queueConfig.prodSourceLabel else: jobType = 'managed' # default, can be overwritten by parameters resourceType = 'SCORE' # default, can be overwritten by parameters if len(sys.argv) == 4: # jobType should be 'managed' or 'user'. If not specified will default to a production job if sys.argv[2] in ('user', 'managed'): jobType = sys.argv[2] else: print('value for jobType not valid, defaulted to {0}'.format( jobType))
def submit_workers(self, workspec_list):
    """Submit a list of workers to HTCondor.

    For each workspec: choose a CE (ATLAS Grid CE mode) or use the
    configured ceHostname/ceEndpoint, read and filter the SDF template,
    attach batch/stdout/stderr log URLs, then submit the whole bag and
    propagate the returned attributes back onto each workspec.

    :param workspec_list: list of workspec objects to submit
    :return: list of per-worker return values from submission
             (presumably (status, message) tuples — TODO confirm against
             submit_bag_of_workers)
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')
    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))
    # whether to submit any worker; flipped off if no valid CE is found
    to_submit_any = True
    # get log subdirectory name from timestamp (one directory per hour)
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        # local (non-spooled) submission writes logs here; create the dir,
        # tolerating a concurrent creation by another thread/process
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            else:
                pass
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()
    # get default information from queue info; treat falsy corecount as 1
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
    pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
    # pilot 2 uses a dedicated "<flavour>_pilot2.sdf" template
    pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''
    # get override requirements from queue configured; nCorePerNode may be absent
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue
    # deal with Condor schedd and central managers; make a random list to choose from,
    # one (schedd, pool) pair per bulk of workers
    n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
    if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
        if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
            # pair schedds with pools positionally
            orig_list = list(zip(self.condorSchedd, self.condorPool))
        else:
            # single pool shared by all schedds
            orig_list = [
                (_schedd, self.condorPool)
                for _schedd in self.condorSchedd
            ]
        if n_bulks < len(orig_list):
            schedd_pool_choice_list = random.sample(orig_list, n_bulks)
        else:
            schedd_pool_choice_list = orig_list
    else:
        schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]
    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # If ATLAS Grid CE mode used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            # keep only ACTIVE CEs of a supported flavour with an endpoint
            if not (
                _queue_dict.get('ce_endpoint')
                and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])
            ):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (
                ce_endpoint in ce_auxilary_dict
                and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'
            ):
                # keep the already-stored (non-default) queue entry for this endpoint
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # Get CE weighting based on current worker statistics per CE
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # Prepare the submission data dict for a single workspec:
        # CE choice, SDF template, log-file URLs. Returns the data dict;
        # 'to_submit' is False inside it when the worker cannot be submitted.
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit,}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE according to the precomputed weighting
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                # hostname = endpoint with any ":port" suffix stripped
                ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                # pick the per-flavour (and per-pilot-version) SDF template if available
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                        ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    # ceHostname/ceEndpoint not configured; leave ce_info_dict empty
                    pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filename, and remobe commented liness
                # NOTE(review): batch_log_value/stdout_value/stderr_value stay unbound
                # if the template lacks log/output/error lines — later use would raise;
                # presumably templates always define them — confirm
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # Choose from Condor schedd and central managers
                condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)
                # set submissionHost
                if not condor_schedd and not condor_pool:
                    workspec.submissionHost = 'LOCAL'
                else:
                    workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))
                # Log Base URL: substitute the schedd hostname placeholder,
                # stripping any "user@" prefix and ":port" suffix from the schedd string
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if not (log_base_url is None):
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        # batchID unknown before submission: let the parser guess the filename
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
            # set data dict consumed by submit_bag_of_workers
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
                'pilot_version': pilot_version_orig,
                })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy attributes returned by submission back onto the workspec;
        # tmpVal is a (retVal, attribute-dict) pair.
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))
    # submit
    retValList = submit_bag_of_workers(list(dataIterator))
    tmpLog.debug('{0} workers submitted'.format(nWorkers))
    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)
    tmpLog.debug('done')
    return retList