def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True,
                         cpuadjustratio=100, memoryadjustratio=100):
    panda_queues_dict = PandaQueuesDict()
    queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)

    yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)

    yaml_content['spec']['template'].setdefault('metadata', {})
    yaml_content['spec']['template']['metadata'].update({'labels': {'resourceType': str(work_spec.resourceType)}})

    yaml_containers = yaml_content['spec']['template']['spec']['containers']
    del yaml_containers[1:]

    container_env = yaml_containers[0]
    container_env.setdefault('resources', {})

    # note that predefined values in the yaml template will NOT be overwritten;
    # merge into any pre-existing limits/requests dicts instead of skipping them
    if work_spec.nCore > 0:
        container_env['resources'].setdefault('limits', {}).setdefault('cpu', str(work_spec.nCore))
        container_env['resources'].setdefault('requests', {}).setdefault(
            'cpu', str(work_spec.nCore * cpuadjustratio / 100.0))

    if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
        container_env['resources'].setdefault('limits', {}).setdefault(
            'memory', str(work_spec.minRamCount) + 'M')
        container_env['resources'].setdefault('requests', {}).setdefault(
            'memory', str(work_spec.minRamCount * memoryadjustratio / 100.0) + 'M')

    container_env.setdefault('env', [])
    container_env['env'].extend([
        {'name': 'computingSite', 'value': work_spec.computingSite},
        {'name': 'pandaQueueName', 'value': queue_name},
        {'name': 'resourceType', 'value': work_spec.resourceType},
        {'name': 'proxySecretPath', 'value': cert if cert_in_secret else None},
        {'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert)},
        {'name': 'workerID', 'value': str(work_spec.workerID)},
        {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W},
        {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R},
        {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id},
        {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)},
        {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}
    ])

    if 'affinity' not in yaml_content['spec']['template']['spec']:
        yaml_content = self.set_affinity(yaml_content)

    rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)
    return rsp
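# The set_affinity helper called above is not shown in this excerpt. Below is a
# minimal sketch of what such a method might look like, assuming the policy is to
# co-locate pods of the same resourceType on the same nodes via pod affinity; the
# policy and the method body are assumptions for illustration, not the actual
# harvester implementation.
def set_affinity(self, yaml_content):
    # group pods of the same resource type on the same nodes (hypothetical policy)
    res_type = yaml_content['spec']['template']['metadata']['labels']['resourceType']
    affinity_spec = {
        'podAffinity': {
            'preferredDuringSchedulingIgnoredDuringExecution': [{
                'weight': 100,
                'podAffinityTerm': {
                    'labelSelector': {'matchExpressions': [
                        {'key': 'resourceType', 'operator': 'In', 'values': [res_type]}]},
                    'topologyKey': 'kubernetes.io/hostname'
                }
            }]
        }
    }
    yaml_content['spec']['template']['spec']['affinity'] = affinity_spec
    return yaml_content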
def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
    tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                              method_name='make_worker')
    tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

    workSpec = WorkSpec()
    workSpec.creationTime = datetime.datetime.utcnow()

    # get the queue configuration from the DB
    panda_queues_dict = PandaQueuesDict()
    queue_dict = panda_queues_dict.get(queue_config.queueName, {})

    workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
    workSpec.maxWalltime = queue_dict.get('maxtime', 1)
    workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

    # get info from jobs
    if len(jobspec_list) > 0:
        nRemainingEvents = 0
        for jobspec in jobspec_list:
            if jobspec.nRemainingEvents:
                nRemainingEvents += jobspec.nRemainingEvents

        nCore, maxWalltime = self.calculate_worker_requirements(nRemainingEvents)
        workSpec.nCore = nCore
        workSpec.maxWalltime = maxWalltime

    # TODO: this needs to be improved with real resource types
    if resource_type and resource_type != 'ANY':
        workSpec.resourceType = resource_type
    elif workSpec.nCore == 1:
        workSpec.resourceType = 'SCORE'
    else:
        workSpec.resourceType = 'MCORE'

    return workSpec
def make_worker(self, jobspec_list, queue_config, resource_type):
    tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                              method_name='make_worker')
    tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

    workSpec = WorkSpec()
    workSpec.creationTime = datetime.datetime.utcnow()

    # get the queue configuration from the DB
    panda_queues_dict = PandaQueuesDict()
    queue_dict = panda_queues_dict.get(queue_config.queueName, {})

    workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
    workSpec.maxWalltime = queue_dict.get('maxtime', 1)
    workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

    # get info from jobs
    if len(jobspec_list) > 0:
        nRemainingEvents = 0
        for jobspec in jobspec_list:
            if jobspec.nRemainingEvents:
                nRemainingEvents += jobspec.nRemainingEvents

        nCore, maxWalltime = self.calculate_worker_requirements(nRemainingEvents)
        workSpec.nCore = nCore
        workSpec.maxWalltime = maxWalltime

    # TODO: this needs to be improved with real resource types
    if resource_type and resource_type != 'ANY':
        workSpec.resourceType = resource_type
    elif workSpec.nCore == 1:
        workSpec.resourceType = 'SCORE'
    else:
        workSpec.resourceType = 'MCORE'

    return workSpec
def update_label(self, site, msg, data):
    """
    Updates a label (= panda queue + CE)
    """
    start_time = time.time()
    tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id),
                                     method_name='update_label')

    if not self.__active:
        tmp_log.debug('APFMon reporting not enabled')
        return

    try:
        tmp_log.debug('start')
        data = self.massage_label_data(data)

        # get the active queues from the config mapper
        all_sites = self.queue_config_mapper.get_active_queues().keys()
        panda_queues_dict = PandaQueuesDict()

        site_info = panda_queues_dict.get(site, dict())
        if not site_info:
            tmp_log.warning('No site info for {0}'.format(site))
            return

        # when no CEs are associated to a queue (e.g. P1, HPCs, etc.), try to see if there is
        # something in the local configuration, otherwise set it to a dummy value
        try:
            ce = self.queue_config_mapper.queueConfig[site].submitter['ceEndpoint']
            queues = [{'ce_endpoint': ce}]
        except KeyError:
            if site_info['queues']:
                queues = site_info['queues']
            else:
                queues = [{'ce_endpoint': NO_CE}]

        for queue in queues:
            try:
                try:
                    ce = clean_ce(queue['ce_endpoint'])
                except Exception:
                    ce = ''

                label_data = {'status': msg, 'data': data}
                label = '{0}-{1}'.format(site, ce)
                label_id = '{0}:{1}'.format(self.harvester_id, label)
                url = '{0}/labels/{1}'.format(self.base_url, label_id)

                r = requests.post(url, data=json.dumps(label_data), timeout=self.__label_timeout)
                tmp_log.debug('label update for {0} ended with {1} {2}'.format(label, r.status_code, r.text))
            except Exception:
                tmp_log.error('Exception for label {0} with: {1}'.format(label, traceback.format_exc()))

        end_time = time.time()
        tmp_log.debug('done (took {0})'.format(end_time - start_time))
    except Exception:
        tmp_log.error('Exception with: {0}'.format(traceback.format_exc()))
def create_labels(self):
    """
    Creates or updates a collection of labels (= panda queue + CE)
    """
    start_time = time.time()
    tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id),
                                     method_name='create_labels')

    if not self.__active:
        tmp_log.debug('APFMon reporting not enabled')
        return

    try:
        tmp_log.debug('start')

        url = '{0}/labels'.format(self.base_url)

        # get the active queues from the config mapper
        all_sites = self.queue_config_mapper.get_active_queues().keys()
        panda_queues_dict = PandaQueuesDict()

        # publish the active queues to APF mon in shards
        for sites in core_utils.create_shards(all_sites, 20):
            labels = []
            for site in sites:
                try:
                    site_info = panda_queues_dict.get(site, dict())
                    if not site_info:
                        tmp_log.warning('No site info for {0}'.format(site))
                        continue
                    # when no CEs are associated to a queue (e.g. P1, HPCs, etc.), try to see if there is
                    # something in the local configuration, otherwise set it to a dummy value
                    try:
                        ce = self.queue_config_mapper.queueConfig[site].submitter['ceEndpoint']
                        queues = [{'ce_endpoint': ce}]
                    except KeyError:
                        if site_info['queues']:
                            queues = site_info['queues']
                        else:
                            queues = [{'ce_endpoint': NO_CE}]
                    for queue in queues:
                        try:
                            ce = clean_ce(queue['ce_endpoint'])
                        except Exception:
                            ce = ''
                        try:
                            ce_queue_id = queue['ce_queue_id']
                        except KeyError:
                            ce_queue_id = 0
                        labels.append({'name': '{0}-{1}'.format(site, ce),
                                       'wmsqueue': site,
                                       'ce_queue_id': ce_queue_id,
                                       'factory': self.harvester_id})
                except Exception:
                    tmp_log.error('Exception for site {0} with: {1}'.format(site, traceback.format_exc()))
                    continue

            payload = json.dumps(labels)

            r = requests.put(url, data=payload, timeout=self.__label_timeout)
            tmp_log.debug('label creation for {0} ended with {1} {2}'.format(sites, r.status_code, r.text))

        end_time = time.time()
        tmp_log.debug('done (took {0})'.format(end_time - start_time))
    except Exception:
        tmp_log.error('Exception with: {0}'.format(traceback.format_exc()))
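# The clean_ce helper used in update_label and create_labels above is not defined
# in this excerpt. A plausible sketch, assuming it reduces a CE endpoint such as
# 'htcondor-ce://my-ce.example.org:9619/...' to the bare hostname
# 'my-ce.example.org' (the endpoint format and behavior are assumptions):
def clean_ce(endpoint):
    # strip scheme, path and port, keeping only the hostname
    return endpoint.split('://')[-1].split('/')[0].split(':')[0]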
def load_data(self, refill_table=False):
    mainLog = _make_logger(method_name='QueueConfigMapper.load_data')

    # check if to update
    with self.lock:
        timeNow_timestamp = time.time()
        if self.lastUpdate is not None:
            last_reload_timestamp = self._get_last_reload_time()
            if (last_reload_timestamp is not None and self.lastUpdate is not None
                    and datetime.datetime.utcfromtimestamp(last_reload_timestamp) < self.lastUpdate
                    and timeNow_timestamp - last_reload_timestamp < self.updateInterval):
                return

    # start
    with self.lock:
        # update timestamp of last reload, lock with check interval
        got_timestamp_update_lock = self.dbProxy.get_process_lock('qconf_reload', 'qconf_universal',
                                                                  self.updateInterval)
        if got_timestamp_update_lock:
            retVal = self._update_last_reload_time()
            if retVal:
                mainLog.debug('updated last reload timestamp')
            else:
                mainLog.warning('failed to update last reload timestamp. Skipped')
        else:
            mainLog.debug('did not get qconf_reload timestamp lock. Skipped to update last reload timestamp')

        # init
        newQueueConfig = dict()
        localTemplatesDict = dict()
        remoteTemplatesDict = dict()
        finalTemplatesDict = dict()
        localQueuesDict = dict()
        remoteQueuesDict = dict()
        dynamicQueuesDict = dict()
        allQueuesNameList = set()
        getQueuesDynamic = False
        invalidQueueList = set()
        pandaQueueDict = PandaQueuesDict()

        # get resolver
        resolver = self._get_resolver()
        if resolver is None:
            mainLog.debug('No resolver is configured')

        # load config json from cacher (RT & RQ)
        queueConfigJson_cacher = self._load_config_from_cache()
        if queueConfigJson_cacher is not None:
            for queueName, queueDict in iteritems(queueConfigJson_cacher):
                if queueDict.get('isTemplateQueue') is True \
                        or queueName.endswith('_TEMPLATE'):
                    # is RT
                    queueDict['isTemplateQueue'] = True
                    queueDict.pop('templateQueueName', None)
                    remoteTemplatesDict[queueName] = queueDict
                else:
                    # is RQ
                    queueDict['isTemplateQueue'] = False
                    remoteQueuesDict[queueName] = queueDict

        # load config from local json file (LT & LQ)
        queueConfigJson_local = self._load_config_from_file()
        if queueConfigJson_local is not None:
            for queueName, queueDict in iteritems(queueConfigJson_local):
                if queueDict.get('isTemplateQueue') is True \
                        or queueName.endswith('_TEMPLATE'):
                    # is LT
                    queueDict['isTemplateQueue'] = True
                    queueDict.pop('templateQueueName', None)
                    localTemplatesDict[queueName] = queueDict
                else:
                    # is LQ
                    queueDict['isTemplateQueue'] = False
                    localQueuesDict[queueName] = queueDict
        else:
            mainLog.warning('Failed to load config from local json file. Skipped')

        # fill in final template (FT)
        finalTemplatesDict.update(remoteTemplatesDict)
        finalTemplatesDict.update(localTemplatesDict)
        finalTemplatesDict.pop(None, None)

        # remove queues with invalid templateQueueName
        for acr, queuesDict in [('RQ', remoteQueuesDict), ('LQ', localQueuesDict)]:
            for queueName, queueDict in iteritems(queuesDict.copy()):
                templateQueueName = queueDict.get('templateQueueName')
                if templateQueueName is not None \
                        and templateQueueName not in finalTemplatesDict:
                    del queuesDict[queueName]
                    mainLog.warning('Invalid templateQueueName "{0}" for {1} ({2}). Skipped'.format(
                        templateQueueName, queueName, acr))

        # get queue names from resolver and fill in dynamic queue (DQ)
        if resolver is not None \
                and 'DYNAMIC' in harvester_config.qconf.queueList:
            getQueuesDynamic = True
            dynamicQueuesNameList = resolver.get_all_queue_names()
            for queueName in dynamicQueuesNameList.copy():
                queueDict = dict()
                # template and default template via workflow
                templateQueueName = None
                resolver_harvester_template = None
                if resolver is not None:
                    resolver_harvester_template = resolver.get_harvester_template(queueName)
                    resolver_type, resolver_workflow = resolver.get_type_workflow(queueName)
                if resolver_harvester_template:
                    templateQueueName = resolver_harvester_template
                elif not (resolver_type is None or resolver_workflow is None):
                    templateQueueName = '{pq_type}.{workflow}'.format(pq_type=resolver_type,
                                                                      workflow=resolver_workflow)
                else:
                    templateQueueName = harvester_config.qconf.defaultTemplateQueueName
                if templateQueueName not in finalTemplatesDict:
                    # remove queues with invalid templateQueueName
                    dynamicQueuesNameList.discard(queueName)
                    mainLog.warning('Invalid templateQueueName "{0}" for {1} (DQ). Skipped'.format(
                        templateQueueName, queueName))
                    continue
                # parameters
                resolver_harvester_params = resolver.get_harvester_params(queueName)
                for key, val in iteritems(resolver_harvester_params):
                    if key in self.dynamic_queue_generic_attrs:
                        queueDict[key] = val
                # fill in dynamic queue configs
                queueDict['templateQueueName'] = templateQueueName
                queueDict['isTemplateQueue'] = False
                dynamicQueuesDict[queueName] = queueDict

        # fill in all queue name list (names of RQ + DQ + LQ)
        allQueuesNameList |= set(remoteQueuesDict)
        allQueuesNameList |= set(dynamicQueuesDict)
        allQueuesNameList |= set(localQueuesDict)
        allQueuesNameList.discard(None)

        # set attributes
        for queueName in allQueuesNameList:
            # sources of queues and templates
            queueSourceList = []
            templateSourceList = []
            # prepare templateQueueName
            templateQueueName = None
            for queuesDict in [remoteQueuesDict, dynamicQueuesDict, localQueuesDict]:
                if queueName not in queuesDict:
                    continue
                tmp_queueDict = queuesDict[queueName]
                tmp_templateQueueName = tmp_queueDict.get('templateQueueName')
                if tmp_templateQueueName is not None:
                    templateQueueName = tmp_templateQueueName
            # prepare queueDict
            queueDict = dict()
            if templateQueueName in finalTemplatesDict:
                queueDict.update(copy.deepcopy(finalTemplatesDict[templateQueueName]))
            for acr, templatesDict in [('RT', remoteTemplatesDict), ('LT', localTemplatesDict)]:
                if templateQueueName in templatesDict:
                    templateSourceList.append(acr)
            # update queueDict
            for acr, queuesDict in [('RQ', remoteQueuesDict), ('DQ', dynamicQueuesDict),
                                    ('LQ', localQueuesDict)]:
                if queueName not in queuesDict:
                    continue
                queueSourceList.append(acr)
                tmp_queueDict = queuesDict[queueName]
                for key, val in iteritems(tmp_queueDict):
                    val = copy.deepcopy(val)
                    if key in self.updatable_plugin_attrs \
                            and isinstance(queueDict.get(key), dict) \
                            and isinstance(val, dict):
                        # update plugin parameters instead of overwriting whole plugin section
                        queueDict[key].update(val)
                    else:
                        queueDict[key] = val
            # record sources of the queue config and its templates in log
            if templateQueueName:
                mainLog.debug(('queue {queueName} comes from {queueSource} '
                               '(with template {templateName} '
                               'from {templateSource})').format(
                                   queueName=queueName, templateName=templateQueueName,
                                   queueSource=','.join(queueSourceList),
                                   templateSource=','.join(templateSourceList)))
            else:
                mainLog.debug('queue {queueName} comes from {queueSource}'.format(
                    queueName=queueName, queueSource=','.join(queueSourceList)))
            # prepare queueConfig
            if queueName in newQueueConfig:
                queueConfig = newQueueConfig[queueName]
            else:
                queueConfig = QueueConfig(queueName)
            # queueName = siteName/resourceType
            queueConfig.siteName = queueConfig.queueName.split('/')[0]
            if queueConfig.siteName != queueConfig.queueName:
                queueConfig.resourceType = queueConfig.queueName.split('/')[-1]
            # get common attributes
            commonAttrDict = dict()
            if isinstance(queueDict.get('common'), dict):
                commonAttrDict = queueDict.get('common')
            # according to queueDict
            for key, val in iteritems(queueDict):
                if isinstance(val, dict) and 'module' in val and 'name' in val:
                    # plugin attributes
                    val = copy.deepcopy(val)
                    # fill in common attributes for all plugins
                    for c_key, c_val in iteritems(commonAttrDict):
                        if c_key not in val and c_key not in ('module', 'name'):
                            val[c_key] = c_val
                    # check module and class name
                    try:
                        _t3mP_1Mp0R7_mO6U1e__ = importlib.import_module(val['module'])
                        _t3mP_1Mp0R7_N4m3__ = getattr(_t3mP_1Mp0R7_mO6U1e__, val['name'])
                    except Exception as _e:
                        invalidQueueList.add(queueConfig.queueName)
                        mainLog.error('Module or class not found. Omitted {0} in queue config ({1})'.format(
                            queueConfig.queueName, _e))
                        continue
                    else:
                        del _t3mP_1Mp0R7_mO6U1e__
                        del _t3mP_1Mp0R7_N4m3__
                    # fill in siteName and queueName
                    if 'siteName' not in val:
                        val['siteName'] = queueConfig.siteName
                    if 'queueName' not in val:
                        val['queueName'] = queueConfig.queueName
                    # middleware
                    if 'middleware' in val and val['middleware'] in queueDict:
                        # keep original config
                        val['original_config'] = copy.deepcopy(val)
                        # overwrite with middleware config
                        for m_key, m_val in iteritems(queueDict[val['middleware']]):
                            val[m_key] = m_val
                setattr(queueConfig, key, val)
            # delete isTemplateQueue attribute
            try:
                if getattr(queueConfig, 'isTemplateQueue'):
                    mainLog.error('Internal error: isTemplateQueue is True. Omitted {0} in queue config'.format(
                        queueConfig.queueName))
                    invalidQueueList.add(queueConfig.queueName)
                else:
                    delattr(queueConfig, 'isTemplateQueue')
            except AttributeError as _e:
                mainLog.error('Internal error with attr "isTemplateQueue". Omitted {0} in queue config ({1})'.format(
                    queueConfig.queueName, _e))
                invalidQueueList.add(queueConfig.queueName)
            # get Panda Queue Name
            if resolver is not None:
                queueConfig.pandaQueueName = resolver.get_panda_queue_name(queueConfig.siteName)
            # additional criteria for getJob
            if queueConfig.getJobCriteria is not None:
                tmpCriteria = dict()
                for tmpItem in queueConfig.getJobCriteria.split(','):
                    tmpKey, tmpVal = tmpItem.split('=')
                    tmpCriteria[tmpKey] = tmpVal
                if len(tmpCriteria) == 0:
                    queueConfig.getJobCriteria = None
                else:
                    queueConfig.getJobCriteria = tmpCriteria
            # nullify job attributes if NoJob mapType
            if queueConfig.mapType == WorkSpec.MT_NoJob:
                for attName in ['nQueueLimitJob', 'nQueueLimitJobRatio',
                                'nQueueLimitJobMax', 'nQueueLimitJobMin']:
                    setattr(queueConfig, attName, None)
            # heartbeat suppression
            if queueConfig.truePilot and queueConfig.noHeartbeat == '':
                queueConfig.noHeartbeat = 'running,transferring,finished,failed'
            # set unique name
            queueConfig.set_unique_name()
            # put into new queue configs
            newQueueConfig[queueName] = queueConfig
            # check existence of mandatory attributes
            if queueName in newQueueConfig:
                queueConfig = newQueueConfig[queueName]
                missing_attr_list = []
                for _attr in self.mandatory_attrs:
                    if not hasattr(queueConfig, _attr):
                        invalidQueueList.add(queueConfig.queueName)
                        missing_attr_list.append(_attr)
                if missing_attr_list:
                    mainLog.error('Missing mandatory attributes {0} . Omitted {1} in queue config'.format(
                        ','.join(missing_attr_list), queueConfig.queueName))

        # delete invalid queues
        for invalidQueueName in invalidQueueList:
            if invalidQueueName in newQueueConfig:
                del newQueueConfig[invalidQueueName]

        # auto blacklisting
        autoBlacklist = False
        if resolver is not None and hasattr(harvester_config.qconf, 'autoBlacklist') and \
                harvester_config.qconf.autoBlacklist:
            autoBlacklist = True

        # get queue dumps
        queueConfigDumps = self.dbProxy.get_queue_config_dumps()

        # get active queues
        activeQueues = dict()
        for queueName, queueConfig in iteritems(newQueueConfig):
            # get status
            if queueConfig.queueStatus is None and autoBlacklist:
                queueConfig.queueStatus = resolver.get_queue_status(queueName)
            # get dynamic information
            if 'DYNAMIC' in harvester_config.qconf.queueList:
                # UPS queue
                if resolver is not None and resolver.is_ups_queue(queueName):
                    queueConfig.runMode = 'slave'
                    queueConfig.mapType = 'NoJob'
            # set online if undefined
            if queueConfig.queueStatus is None:
                queueConfig.queueStatus = 'online'
            queueConfig.queueStatus = queueConfig.queueStatus.lower()
            # look for configID
            dumpSpec = QueueConfigDumpSpec()
            dumpSpec.queueName = queueName
            dumpSpec.set_data(vars(queueConfig))
            if dumpSpec.dumpUniqueName in queueConfigDumps:
                dumpSpec = queueConfigDumps[dumpSpec.dumpUniqueName]
            else:
                # add dump
                dumpSpec.creationTime = datetime.datetime.utcnow()
                dumpSpec.configID = self.dbProxy.get_next_seq_number('SEQ_configID')
                tmpStat = self.dbProxy.add_queue_config_dump(dumpSpec)
                if not tmpStat:
                    dumpSpec.configID = self.dbProxy.get_config_id_dump(dumpSpec)
                    if dumpSpec.configID is None:
                        mainLog.error('failed to get configID for {0}'.format(dumpSpec.dumpUniqueName))
                        continue
                queueConfigDumps[dumpSpec.dumpUniqueName] = dumpSpec
            queueConfig.configID = dumpSpec.configID
            # ignore offline
            if queueConfig.queueStatus == 'offline':
                continue
            # filter for pilot version
            if hasattr(harvester_config.qconf, 'pilotVersion') and \
                    pandaQueueDict.get(queueConfig.siteName) is not None and \
                    pandaQueueDict.get(queueConfig.siteName).get('pilot_version') != str(harvester_config.qconf.pilotVersion):
                continue
            if 'ALL' not in harvester_config.qconf.queueList and \
                    'DYNAMIC' not in harvester_config.qconf.queueList and \
                    queueName not in harvester_config.qconf.queueList:
                continue
            activeQueues[queueName] = queueConfig

        self.queueConfig = newQueueConfig
        self.activeQueues = activeQueues
        newQueueConfigWithID = dict()
        for dumpSpec in queueConfigDumps.values():
            queueConfig = QueueConfig(dumpSpec.queueName)
            queueConfig.update_attributes(dumpSpec.data)
            queueConfig.configID = dumpSpec.configID
            newQueueConfigWithID[dumpSpec.configID] = queueConfig
        self.queueConfigWithID = newQueueConfigWithID
        self.lastUpdate = datetime.datetime.utcnow()

        # update database
        if self.toUpdateDB:
            self.dbProxy.fill_panda_queue_table(self.activeQueues.keys(), self, refill_table=refill_table)
            mainLog.debug('updated to DB')

    # done
    mainLog.debug('done')
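# For illustration, here is a minimal local queue-config JSON that load_data
# would classify as one local template (LT) and one local queue (LQ) pointing at
# it. The queue name, template name, and attribute values below are hypothetical
# placeholders, not taken from a real configuration:
#
# {
#   "GRID_TEMPLATE": {
#     "isTemplateQueue": true,
#     "prodSourceLabel": "managed",
#     "submitter": {"module": "some.plugin.module", "name": "SomeSubmitter"}
#   },
#   "SOME-QUEUE/SCORE": {
#     "templateQueueName": "GRID_TEMPLATE",
#     "nQueueLimitWorker": 100
#   }
# }
#
# The queue inherits everything from the template, then overrides or adds its
# own attributes, and load_data rejects it if templateQueueName is unknown.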
def submit_k8s_worker(self, work_spec):
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))

    # TODO: consider if we want to upload the yaml file to PanDA cache
    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image,
                                                                                      executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = panda_queues_dict.get(self.queueName)['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, container_image, executable, args, cert,
            cert_in_secret=use_secret, cpu_adjust_ratio=self.cpuAdjustRatio,
            memory_adjust_ratio=self.memoryAdjustRatio, max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
def submit_k8s_worker(self, work_spec):
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))

    # TODO: consider if we want to upload the yaml file to PanDA cache
    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image,
                                                                                      executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = this_panda_queue_dict['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        associated_params_dict = {}
        for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
            if key in self._allowed_agis_attrs:
                associated_params_dict[key] = val

        pilot_url = associated_params_dict.get('pilot_url')
        pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
        python_version = str(this_panda_queue_dict.get('python_version', '2'))

        # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
        pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
        if pilot_opt_dict is None:
            prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_type = work_spec.pilotType
            pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
        else:
            prod_source_label = pilot_opt_dict['prod_source_label']
            pilot_type = pilot_opt_dict['pilot_type_opt']
            pilot_url_str = pilot_opt_dict['pilot_url_str']
        pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, pilot_type, pilot_url_str, pilot_python_option,
            container_image, executable, args, cert,
            cpu_adjust_ratio=self.cpuAdjustRatio, memory_adjust_ratio=self.memoryAdjustRatio,
            max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
def submit_workers(self, workspec_list):
    tmpLog = core_utils.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_PQ_from_PR(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_endpoint_from_queue = ''
            ce_flavour_str = ''
            ce_version_str = ''
            random.shuffle(queues_from_queue_list)
            for _queue_dict in queues_from_queue_list:
                if _queue_dict.get('ce_endpoint') and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE':
                    ce_flavour_str = str(_queue_dict.get('ce_flavour', '')).lower()
                    ce_version_str = str(_queue_dict.get('ce_version', '')).lower()
                    if ce_flavour_str in set(['arc-ce', 'cream-ce', 'htcondor-ce']):
                        ce_info_dict = _queue_dict.copy()
                        ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                        ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                        break
                    else:
                        ce_flavour_str = ''
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)

        # template for batch script
        tmpFile = open(self.templateFile)
        sdf_template = tmpFile.read()
        tmpFile.close()
        # get batch_log, stdout, stderr filename
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue

        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # URLs for log files
        if not (self.logBaseURL is None):
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=self.logDir,
                                                          batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=self.logDir,
                                                             batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=self.logDir,
                                                            batchID=batchID, guess=guess)
            batch_log = '{0}/{1}'.format(self.logBaseURL, batch_log_filename)
            batch_stdout = '{0}/{1}'.format(self.logBaseURL, stdout_path_file_name)
            batch_stderr = '{0}/{1}'.format(self.logBaseURL, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file')

        if not workspec.get_jobspec_list():
            tmpLog.debug('No jobspec associated in the worker of workerID={0}'.format(workspec.workerID))
        else:
            for jobSpec in workspec.get_jobspec_list():
                # using batchLog and stdOut URL as pilotID and pilotLog
                jobSpec.set_one_attribute('pilotID', workspec.workAttributes['stdOut'])
                jobSpec.set_one_attribute('pilotLog', workspec.workAttributes['batchLog'])
        tmpLog.debug('Done jobspec attribute setting')

        # set data dict
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                }
        return data

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    retList = []
    for workspec, tmpVal in zip(workspec_list, retValList):
        retVal, tmpDict = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        retList.append(retVal)

    tmpLog.debug('done')
    return retList
def make_worker(self, jobspec_list, queue_config, resource_type):
    tmpLog = core_utils.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                                    method_name='make_worker')
    tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

    workSpec = WorkSpec()

    # get the queue configuration from the DB
    panda_queues_dict = PandaQueuesDict()
    queue_dict = panda_queues_dict.get(queue_config.queueName, {})
    unified_queue = 'unifiedPandaQueue' in queue_dict.get('catchall', '')

    # case of traditional (non-unified) queue: look at the queue configuration
    if not unified_queue:
        workSpec.nCore = queue_dict.get('corecount', 1) or 1
        workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
    # case of unified queue: look at the resource type and queue configuration
    else:
        site_corecount = queue_dict.get('corecount', 1) or 1
        site_maxrss = queue_dict.get('maxrss', 1) or 1
        if 'SCORE' in resource_type:
            workSpec.nCore = 1
            workSpec.minRamCount = site_maxrss / site_corecount
        else:
            workSpec.nCore = site_corecount
            workSpec.minRamCount = site_maxrss

    # parameters that are independent of traditional vs unified
    workSpec.maxWalltime = queue_dict.get('maxtime', 1)
    workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

    # get info from jobs
    if len(jobspec_list) > 0:
        nCore = 0
        minRamCount = 0
        maxDiskCount = 0
        maxWalltime = 0
        for jobSpec in jobspec_list:
            try:
                nCore += jobSpec.jobParams['coreCount']
            except Exception:
                nCore += 1
            try:
                minRamCount += jobSpec.jobParams['minRamCount']
            except Exception:
                pass
            try:
                maxDiskCount += jobSpec.jobParams['maxDiskCount']
            except Exception:
                pass
            try:
                if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                    if hasattr(queue_config, 'maxWalltime'):
                        maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                    else:
                        maxWalltime = jobSpec.jobParams['maxWalltime']
                else:
                    maxWalltime = queue_config.walltimeLimit
            except Exception:
                pass
        if nCore > 0 and 'nCore' in self.jobAttributesToUse:
            workSpec.nCore = nCore
        if minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse:
            workSpec.minRamCount = minRamCount
        if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
            workSpec.maxDiskCount = maxDiskCount
        if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
            workSpec.maxWalltime = maxWalltime

    # TODO: this needs to be improved with real resource types
    if resource_type and resource_type != 'ANY':
        workSpec.resourceType = resource_type
    elif workSpec.nCore == 1:
        workSpec.resourceType = 'SCORE'
    else:
        workSpec.resourceType = 'MCORE'

    return workSpec
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    try:
        os.mkdir(log_subdir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        else:
            pass

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                           or this_panda_queue_dict.get('capability', '') == 'ucore'
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
            worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
            ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                 queue_status_dict=queue_status_dict,
                                                 worker_ce_stats_dict=worker_ce_stats_dict)
            # good CEs which can be submitted to, duplicated by weight
            good_ce_weighted_list = []
            for _ce_endpoint in ce_auxilary_dict.keys():
                good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
            tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
            if len(good_ce_weighted_list) > 0:
                ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
            else:
                tmpLog.info('No good CE endpoint left. Choose an arbitrary CE endpoint')
                ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
            ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
            ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
            ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)

        # template for batch script
        tmpFile = open(self.templateFile)
        sdf_template = tmpFile.read()
        tmpFile.close()
        # get batch_log, stdout, stderr filename
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue

        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # URLs for log files
        if not (self.logBaseURL is None):
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path,
                                                          batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path,
                                                             batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path,
                                                            batchID=batchID, guess=guess)
            batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
            batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
            batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file before submission')
        tmpLog.debug('Done jobspec attribute setting')

        # set data dict
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': self.condorSchedd,
                'condor_pool': self.condorPool,
                }
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, pilot_type, pilot_url_str,
                         pilot_python_option, container_image, executable, args, cert,
                         cpu_adjust_ratio=100, memory_adjust_ratio=100, max_time=None):
    tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml')

    # consider PULL mode as default, unless specified
    submit_mode = 'PULL'

    # create the configmap in push mode
    worker_id = None
    if work_spec.mapType != 'NoJob':
        submit_mode = 'PUSH'
        worker_id = str(work_spec.workerID)
        res = self.create_configmap(work_spec)
        if not res:  # if the configmap creation failed, don't submit a job because the pod creation will hang
            return res, 'Failed to create a configmap'

    # retrieve panda queue information
    panda_queues_dict = PandaQueuesDict()
    queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)

    # set the worker name
    yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)

    # set the resource type and other metadata to filter the pods
    yaml_content['spec']['template'].setdefault('metadata', {})
    yaml_content['spec']['template']['metadata'].update(
        {'labels': {'resourceType': str(work_spec.resourceType),
                    'prodSourceLabel': str(prod_source_label),
                    'pq': str(work_spec.computingSite)}})

    # fill the container details. we can only handle one container (take the first, delete the rest)
    yaml_containers = yaml_content['spec']['template']['spec']['containers']
    del yaml_containers[1:]

    container_env = yaml_containers[0]

    # set the container image
    if 'image' not in container_env:
        container_env['image'] = container_image

    if 'command' not in container_env:
        container_env['command'] = executable
        container_env['args'] = args

    # set the resources (CPU and memory) we need for the container
    # note that predefined values in the yaml template will NOT be overwritten
    # Be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod
    # The CPU & memory settings will affect the QoS for the pod
    container_env.setdefault('resources', {})
    if work_spec.nCore > 0:
        # CPU limits
        container_env['resources'].setdefault('limits', {})
        if 'cpu' not in container_env['resources']['limits']:
            container_env['resources']['limits']['cpu'] = str(work_spec.nCore)
        # CPU requests
        container_env['resources'].setdefault('requests', {})
        if 'cpu' not in container_env['resources']['requests']:
            container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpu_adjust_ratio / 100.0)

    if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
        # memory limits
        # container_env['resources'].setdefault('limits', {})
        # if 'memory' not in container_env['resources']['limits']:
        #     container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M'
        # memory requests
        container_env['resources'].setdefault('requests', {})
        if 'memory' not in container_env['resources']['requests']:
            container_env['resources']['requests']['memory'] = str(
                work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M'

    container_env.setdefault('env', [])

    # try to retrieve the stdout log file name
    try:
        log_file_name = work_spec.workAttributes['stdout']
    except (KeyError, AttributeError):
        tmp_log.debug('work_spec does not have stdout workAttribute, using default')
        log_file_name = ''

    container_env['env'].extend([
        {'name': 'computingSite', 'value': work_spec.computingSite},
        {'name': 'pandaQueueName', 'value': queue_name},
        {'name': 'resourceType', 'value': work_spec.resourceType},
        {'name': 'prodSourceLabel', 'value': prod_source_label},
        {'name': 'pilotType', 'value': pilot_type},
        {'name': 'pilotUrlOpt', 'value': pilot_url_str},
        {'name': 'pythonOption', 'value': pilot_python_option},
        # {'name': 'jobType', 'value': work_spec.jobType},
        {'name': 'proxySecretPath', 'value': cert},
        {'name': 'workerID', 'value': str(work_spec.workerID)},
        {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W},
        {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R},
        {'name': 'stdout_name', 'value': log_file_name},
        {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id},
        {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)},
        {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id},
        {'name': 'submit_mode', 'value': submit_mode},
        {'name': 'EXEC_DIR', 'value': EXEC_DIR},
    ])

    # add the pilots starter configmap
    yaml_content['spec']['template']['spec'].setdefault('volumes', [])
    yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
    yaml_volumes.append({'name': 'pilots-starter', 'configMap': {'name': 'pilots-starter'}})
    # mount the volume to the filesystem
    container_env.setdefault('volumeMounts', [])
    container_env['volumeMounts'].append({'name': 'pilots-starter', 'mountPath': EXEC_DIR})

    # in push mode, add the configmap as a volume to the pod
    if submit_mode == 'PUSH' and worker_id:
        yaml_content['spec']['template']['spec'].setdefault('volumes', [])
        yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
        yaml_volumes.append({'name': 'job-config', 'configMap': {'name': worker_id}})
        # mount the volume to the filesystem
        container_env.setdefault('volumeMounts', [])
        container_env['volumeMounts'].append({'name': 'job-config', 'mountPath': CONFIG_DIR})

    # if we are running the pilot in an emptyDir with "pilot-dir" name, then set the max size
    if 'volumes' in yaml_content['spec']['template']['spec']:
        yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
        for volume in yaml_volumes:
            # do not overwrite any hardcoded sizeLimit value
            if volume['name'] == 'pilot-dir' and 'emptyDir' in volume and 'sizeLimit' not in volume['emptyDir']:
                maxwdir_prorated_GB = panda_queues_dict.get_prorated_maxwdir_GB(work_spec.computingSite,
                                                                                work_spec.nCore)
                if maxwdir_prorated_GB:
                    volume['emptyDir']['sizeLimit'] = '{0}G'.format(maxwdir_prorated_GB)

    # set the affinity
    if 'affinity' not in yaml_content['spec']['template']['spec']:
        yaml_content = self.set_affinity(yaml_content)

    # set max_time to avoid having a pod running forever
    if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec']:
        if not max_time:
            # default to 4 days
            max_time = 4 * 24 * 3600
        yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time

    tmp_log.debug('creating job {0}'.format(yaml_content))

    rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)
    return rsp, yaml_content
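# get_prorated_maxwdir_GB, used above to size the pilot-dir emptyDir, is not
# shown in this excerpt. A sketch under the assumption that it prorates the
# queue's maxwdir (MB, defined for corecount cores) to the worker's core count
# and converts to GB; the exact units and rounding are assumptions:
def get_prorated_maxwdir_GB(self, panda_resource, worker_corecount):
    queue_dict = self.get(panda_resource, {})
    maxwdir = queue_dict.get('maxwdir', 0)  # MB, for a full corecount worker
    corecount = queue_dict.get('corecount', 1) or 1
    # scale the scratch space to the number of cores actually requested
    maxwdir_prorated = maxwdir * worker_corecount / corecount
    return round(maxwdir_prorated / 1024)  # convert MB to GB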
def run(self):
    while True:
        mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
        mainLog.debug('getting number of jobs to be fetched')
        # get number of jobs to be fetched
        nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues,
                                                           harvester_config.jobfetcher.lookupTime)
        mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
        # get up to date queue configuration
        pandaQueueDict = PandaQueuesDict()
        # loop over all queues
        for queueName, nJobs in iteritems(nJobsPerQueue):
            # check queue
            if not self.queueConfigMapper.has_queue(queueName):
                continue
            tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run')
            # get queue
            queueConfig = self.queueConfigMapper.get_queue(queueName)
            siteName = queueConfig.siteName
            # upper limit
            if nJobs > harvester_config.jobfetcher.maxJobs:
                nJobs = harvester_config.jobfetcher.maxJobs
            # get jobs
            try:
                is_grandly_unified_queue = pandaQueueDict.is_grandly_unified_queue(siteName)
            except Exception:
                is_grandly_unified_queue = False
            default_prodSourceLabel = queueConfig.get_source_label(is_gu=is_grandly_unified_queue)
            pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {})
            choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel)
            prodSourceLabel = random.choice(choice_list)
            tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(nJobs, prodSourceLabel))
            sw = core_utils.get_stopwatch()
            jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, prodSourceLabel,
                                                      self.nodeName, nJobs, queueConfig.getJobCriteria)
            tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time()))
            # convert to JobSpec
            if len(jobs) > 0:
                # get extractor plugin
                if hasattr(queueConfig, 'extractor'):
                    extractorCore = self.pluginFactory.get_plugin(queueConfig.extractor)
                else:
                    extractorCore = None
                jobSpecs = []
                fileStatMap = dict()
                sw_startconvert = core_utils.get_stopwatch()
                for job in jobs:
                    timeNow = datetime.datetime.utcnow()
                    jobSpec = JobSpec()
                    jobSpec.convert_job_json(job)
                    jobSpec.computingSite = queueName
                    jobSpec.status = 'starting'
                    jobSpec.subStatus = 'fetched'
                    jobSpec.creationTime = timeNow
                    jobSpec.stateChangeTime = timeNow
                    jobSpec.configID = queueConfig.configID
                    jobSpec.set_one_attribute('schedulerID',
                                              'harvester-{0}'.format(harvester_config.master.harvester_id))
                    if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                        jobSpec.zipPerMB = queueConfig.zipPerMB
                    fileGroupDictList = [jobSpec.get_input_file_attributes()]
                    if extractorCore is not None:
                        fileGroupDictList.append(extractorCore.get_aux_inputs(jobSpec))
                    for fileGroupDict in fileGroupDictList:
                        for tmpLFN, fileAttrs in iteritems(fileGroupDict):
                            # make file spec
                            fileSpec = FileSpec()
                            fileSpec.PandaID = jobSpec.PandaID
                            fileSpec.taskID = jobSpec.taskID
                            fileSpec.lfn = tmpLFN
                            fileSpec.endpoint = queueConfig.ddmEndpointIn
                            fileSpec.scope = fileAttrs['scope']
                            if 'INTERNAL_FileType' in fileAttrs:
                                fileSpec.fileType = fileAttrs['INTERNAL_FileType']
                                jobSpec.auxInput = JobSpec.AUX_hasAuxInput
                            else:
                                fileSpec.fileType = 'input'
                            # check file status
                            if tmpLFN not in fileStatMap:
                                fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, fileSpec.fileType,
                                                                                   queueConfig.ddmEndpointIn,
                                                                                   'starting')
                            # set preparing to skip stage-in if the file is (being) taken care of by another job
                            if [x for x in ['ready', 'preparing', 'to_prepare', 'triggered']
                                    if x in fileStatMap[tmpLFN]]:
                                fileSpec.status = 'preparing'
                            else:
                                fileSpec.status = 'to_prepare'
                            fileStatMap[tmpLFN].setdefault(fileSpec.status, None)
                            if 'INTERNAL_URL' in fileAttrs:
                                fileSpec.url = fileAttrs['INTERNAL_URL']
                            jobSpec.add_in_file(fileSpec)
                    jobSpec.trigger_propagation()
                    jobSpecs.append(jobSpec)
                # insert to DB
                tmpLog.debug('Converting of {0} jobs {1}'.format(len(jobs), sw_startconvert.get_elapsed_time()))
                sw_insertdb = core_utils.get_stopwatch()
                self.dbProxy.insert_jobs(jobSpecs)
                tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time()))
        mainLog.debug('done')
        # check if being terminated
        if self.terminated(harvester_config.jobfetcher.sleepTime):
            mainLog.debug('terminated')
            return
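# core_utils.make_choice_list, used in run() above to pick a prodSourceLabel, is
# not shown in this excerpt. A sketch under the assumption that it expands a
# permille-weight dict into a 1000-entry candidate list, padding the remainder
# with the default label, so that random.choice() draws with those weights:
def make_choice_list(pdpm=None, default=None):
    pdpm = pdpm or {}
    ret_list = []
    weight_sum = 0
    for candidate, weight in pdpm.items():
        # each candidate appears once per permille of weight
        ret_list += [candidate] * weight
        weight_sum += weight
    # pad the remaining permille slots with the default label
    if weight_sum < 1000:
        ret_list += [default] * (1000 - weight_sum)
    return ret_list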
def make_worker(self, jobspec_list, queue_config, resource_type):
    tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                              method_name='make_worker')
    tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))

    workSpec = WorkSpec()
    workSpec.creationTime = datetime.datetime.utcnow()

    # get the queue configuration from the DB
    panda_queues_dict = PandaQueuesDict()
    queue_dict = panda_queues_dict.get(queue_config.queueName, {})
    unified_queue = 'unifiedPandaQueue' in queue_dict.get('catchall', '') \
                    or queue_dict.get('capability', '') == 'ucore'

    # case of traditional (non-unified) queue: look at the queue configuration
    if not unified_queue:
        workSpec.nCore = queue_dict.get('corecount', 1) or 1
        workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
    # case of unified queue: look at the resource type and queue configuration
    else:
        if queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified', 'Taiwan-LCG2-HPC_Unified'):
            # temporary hack to debug killed workers in Taiwan queues
            site_corecount = queue_dict.get('corecount', 1) or 1
            site_maxrss = queue_dict.get('maxrss', 1) or 1
            # some cases need to overwrite those values
            if 'SCORE' in resource_type:
                # the usual pilot streaming use case
                workSpec.nCore = 1
                workSpec.minRamCount = site_maxrss / site_corecount
            else:
                # default values
                workSpec.nCore = site_corecount
                workSpec.minRamCount = site_maxrss
        else:
            workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(resource_type,
                                                                                                queue_dict)

    # parameters that are independent of traditional vs unified
    workSpec.maxWalltime = queue_dict.get('maxtime', 1)
    workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)

    # get info from jobs
    if len(jobspec_list) > 0:
        nCore = 0
        minRamCount = 0
        maxDiskCount = 0
        maxWalltime = 0
        for jobSpec in jobspec_list:
            job_corecount, job_memory = self.get_job_core_and_memory(queue_dict, jobSpec)
            nCore += job_corecount
            minRamCount += job_memory
            try:
                maxDiskCount += jobSpec.jobParams['maxDiskCount']
            except Exception:
                pass
            try:
                if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"):
                    if hasattr(queue_config, 'maxWalltime'):
                        maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime'])
                    else:
                        maxWalltime = jobSpec.jobParams['maxWalltime']
                else:
                    maxWalltime = queue_config.walltimeLimit
            except Exception:
                pass
        if (nCore > 0 and 'nCore' in self.jobAttributesToUse) \
                or unified_queue:
            workSpec.nCore = nCore
        if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) \
                or unified_queue:
            workSpec.minRamCount = minRamCount
        if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
            workSpec.maxDiskCount = maxDiskCount
        if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
            workSpec.maxWalltime = maxWalltime

    # TODO: this needs to be improved with real resource types
    if resource_type and resource_type != 'ANY':
        workSpec.resourceType = resource_type
    elif workSpec.nCore == 1:
        workSpec.resourceType = 'SCORE'
    else:
        workSpec.resourceType = 'MCORE'

    return workSpec
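# rt_mapper.calculate_worker_requirements, used for unified queues above, is not
# shown in this excerpt. A simplified sketch, assuming resource types scale the
# site corecount and maxrss the same way the inline SCORE/MCORE branches do
# (e.g. SCORE gets 1 core and a per-core RAM share); the handling of other
# resource types is an assumption:
def calculate_worker_requirements(self, resource_type, queue_dict):
    site_corecount = queue_dict.get('corecount', 1) or 1
    site_maxrss = queue_dict.get('maxrss', 1) or 1
    if 'SCORE' in resource_type:
        # single-core worker with its proportional share of the node RAM
        worker_cores = 1
        worker_memory = site_maxrss / site_corecount
    else:
        # multi-core worker taking the full node allocation
        worker_cores = site_corecount
        worker_memory = site_maxrss
    # hypothetically, *_HIMEM resource types could double worker_memory here
    return worker_cores, worker_memory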
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')
    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))
    # whether to submit any worker
    to_submit_any = True
    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            else:
                pass
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()
    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) \
        if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
    # get override requirements from queue configuration
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue
    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # gather info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                    self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(
                                list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # choose from Condor schedd and central managers
                if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
                    if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                        condor_schedd, condor_pool = random.choice(list(zip(self.condorSchedd, self.condorPool)))
                    else:
                        condor_schedd = random.choice(self.condorSchedd)
                        condor_pool = self.condorPool
                else:
                    condor_schedd = self.condorSchedd
                    condor_pool = self.condorPool
                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
            # set data dict
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
            })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')
    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))
    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))
    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)
    tmpLog.debug('done')
    return retList
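# Illustrative sketch (hypothetical helper, not in the original module) of the
# CE-endpoint normalisation done in _handle_one_worker above: strip the port to
# get the bare hostname, and append the flavour's default port when the
# endpoint carries none. The endpoints in the examples are made up.
import re

DEFAULT_PORT_MAP = {'cream-ce': 8443, 'arc-ce': 2811, 'htcondor-ce': 9619}

def normalise_ce_endpoint(endpoint, flavour):
    hostname = re.sub(r':\w*', '', endpoint)
    if hostname == endpoint and flavour in DEFAULT_PORT_MAP:
        # no port in the endpoint: fall back to the flavour default
        endpoint = '{0}:{1}'.format(endpoint, DEFAULT_PORT_MAP[flavour])
    return hostname, endpoint

print(normalise_ce_endpoint('ce01.example.org', 'arc-ce'))       # ('ce01.example.org', 'ce01.example.org:2811')
print(normalise_ce_endpoint('ce01.example.org:2811', 'arc-ce'))  # ('ce01.example.org', 'ce01.example.org:2811')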
def make_worker(self, jobspec_list, queue_config, resource_type):
    tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                              method_name='make_worker')
    tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))
    workSpec = WorkSpec()
    workSpec.creationTime = datetime.datetime.utcnow()
    # get the queue configuration from the DB
    panda_queues_dict = PandaQueuesDict()
    queue_dict = panda_queues_dict.get(queue_config.queueName, {})
    unified_queue = queue_dict.get('capability', '') == 'ucore'
    # case of traditional (non-unified) queue: look at the queue configuration
    if not unified_queue:
        workSpec.nCore = queue_dict.get('corecount', 1) or 1
        workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
    # case of unified queue: look at the resource type and queue configuration
    else:
        catchall = queue_dict.get('catchall', '')
        if 'useMaxRam' in catchall or queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified',
                                                                 'Taiwan-LCG2-HPC_Unified',
                                                                 'DESY-ZN_UCORE'):
            # temporary hack to debug killed workers in Taiwan queues
            site_corecount = queue_dict.get('corecount', 1) or 1
            site_maxrss = queue_dict.get('maxrss', 1) or 1
            # some cases need to overwrite those values
            if 'SCORE' in resource_type:
                # the usual pilot streaming use case
                workSpec.nCore = 1
                workSpec.minRamCount = site_maxrss / site_corecount
            else:
                # default values
                workSpec.nCore = site_corecount
                workSpec.minRamCount = site_maxrss
        else:
            workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(
                resource_type, queue_dict)
    # parameters that are independent of traditional vs unified
    workSpec.maxWalltime = queue_dict.get('maxtime', 1)
    workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)
    walltimeLimit_default = getattr(queue_config, 'walltimeLimit', 0)
    if len(jobspec_list) > 0:
        # get info from jobs
        nCore = 0
        minRamCount = 0
        maxDiskCount = 0
        maxWalltime = 0
        ioIntensity = 0
        for jobSpec in jobspec_list:
            job_corecount, job_memory = self.get_job_core_and_memory(queue_dict, jobSpec)
            nCore += job_corecount
            minRamCount += job_memory
            try:
                maxDiskCount += jobSpec.jobParams['maxDiskCount']
            except Exception:
                pass
            try:
                ioIntensity += jobSpec.jobParams['ioIntensity']
            except Exception:
                pass
            try:
                # maxWalltime from AGIS or qconf, not trusting job currently
                maxWalltime = queue_dict.get('maxtime', walltimeLimit_default)
            except Exception:
                pass
        if (nCore > 0 and 'nCore' in self.jobAttributesToUse) or unified_queue:
            workSpec.nCore = nCore
        if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) or unified_queue:
            workSpec.minRamCount = minRamCount
        if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
            workSpec.maxDiskCount = maxDiskCount
        if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
            workSpec.maxWalltime = maxWalltime
        if ioIntensity > 0 and 'ioIntensity' in self.jobAttributesToUse:
            workSpec.ioIntensity = ioIntensity
        workSpec.pilotType = jobspec_list[0].get_pilot_type()
    else:
        # when no job: randomize pilot type with weighting
        workSpec.pilotType = random.choice(self.pilotTypeRandomList)
        if workSpec.pilotType in ['RC', 'ALRB', 'PT']:
            tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType))
    # TODO: this needs to be improved with real resource types
    if resource_type and resource_type != 'ANY':
        workSpec.resourceType = resource_type
    elif workSpec.nCore == 1:
        workSpec.resourceType = 'SCORE'
    else:
        workSpec.resourceType = 'MCORE'
    return workSpec
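# Illustrative numbers (made up) for the per-core memory split above:
# site_maxrss / site_corecount yields a float under Python 3 division, which is
# why the next revision of make_worker below wraps it in math.ceil so that
# minRamCount stays an integer number of MB.
import math

site_maxrss = 16000    # hypothetical whole-node RSS limit in MB
site_corecount = 12

print(site_maxrss / site_corecount)                  # 1333.333... (float)
print(int(math.ceil(site_maxrss / site_corecount)))  # 1334 (int, rounded up)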
def make_worker(self, jobspec_list, queue_config, resource_type):
    tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                              method_name='make_worker')
    tmpLog.debug('jobspec_list: {0}'.format(jobspec_list))
    workSpec = WorkSpec()
    workSpec.creationTime = datetime.datetime.utcnow()
    # get the queue configuration from the DB
    panda_queues_dict = PandaQueuesDict()
    queue_dict = panda_queues_dict.get(queue_config.queueName, {})
    unified_queue = queue_dict.get('capability', '') == 'ucore'
    # case of traditional (non-unified) queue: look at the queue configuration
    if not unified_queue:
        workSpec.nCore = queue_dict.get('corecount', 1) or 1
        workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
    # case of unified queue: look at the resource type and queue configuration
    else:
        catchall = queue_dict.get('catchall', '')
        if 'useMaxRam' in catchall or queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified',
                                                                 'Taiwan-LCG2-HPC_Unified',
                                                                 'DESY-ZN_UCORE'):
            # temporary hack to debug killed workers in Taiwan queues
            site_corecount = queue_dict.get('corecount', 1) or 1
            site_maxrss = queue_dict.get('maxrss', 1) or 1
            # some cases need to overwrite those values
            if 'SCORE' in resource_type:
                # the usual pilot streaming use case
                workSpec.nCore = 1
                workSpec.minRamCount = int(math.ceil(site_maxrss / site_corecount))
            else:
                # default values
                workSpec.nCore = site_corecount
                workSpec.minRamCount = site_maxrss
        else:
            workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(
                resource_type, queue_dict)
    # parameters that are independent of traditional vs unified
    workSpec.maxWalltime = queue_dict.get('maxtime', 1)
    workSpec.maxDiskCount = queue_dict.get('maxwdir', 1)
    walltimeLimit_default = getattr(queue_config, 'walltimeLimit', 0)
    if len(jobspec_list) > 0:
        # get info from jobs
        nCore = 0
        minRamCount = 0
        maxDiskCount = 0
        maxWalltime = 0
        ioIntensity = 0
        for jobSpec in jobspec_list:
            job_corecount, job_memory = self.get_job_core_and_memory(queue_dict, jobSpec)
            nCore += job_corecount
            minRamCount += job_memory
            try:
                maxDiskCount += jobSpec.jobParams['maxDiskCount']
            except Exception:
                pass
            try:
                ioIntensity += jobSpec.jobParams['ioIntensity']
            except Exception:
                pass
            try:
                # maxWalltime from AGIS or qconf, not trusting job currently
                maxWalltime = queue_dict.get('maxtime', walltimeLimit_default)
            except Exception:
                pass
        if (nCore > 0 and 'nCore' in self.jobAttributesToUse) or unified_queue:
            workSpec.nCore = nCore
        if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) or unified_queue:
            workSpec.minRamCount = minRamCount
        if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
            workSpec.maxDiskCount = maxDiskCount
        if maxWalltime > 0 and 'maxWalltime' in self.jobAttributesToUse:
            workSpec.maxWalltime = maxWalltime
        if ioIntensity > 0 and 'ioIntensity' in self.jobAttributesToUse:
            workSpec.ioIntensity = ioIntensity
        workSpec.pilotType = jobspec_list[0].get_pilot_type()
    else:
        # when no job: randomize pilot type with weighting
        pdpm = getattr(queue_config, 'prodSourceLabelRandomWeightsPermille', {})
        choice_list = core_utils.make_choice_list(pdpm=pdpm, default='managed')
        tmp_prodsourcelabel = random.choice(choice_list)
        fake_job = JobSpec()
        fake_job.jobParams = {}
        fake_job.jobParams['prodSourceLabel'] = tmp_prodsourcelabel
        workSpec.pilotType = fake_job.get_pilot_type()
        del fake_job
        if workSpec.pilotType in ['RC', 'ALRB', 'PT']:
            tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType))
    # TODO: this needs to be improved with real resource types
    if resource_type and resource_type != 'ANY':
        workSpec.resourceType = resource_type
    elif workSpec.nCore == 1:
        workSpec.resourceType = 'SCORE'
    else:
        workSpec.resourceType = 'MCORE'
    return workSpec
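# A rough sketch of what core_utils.make_choice_list is assumed to do here
# (this is an assumption, not the actual harvester implementation): expand a
# permille weight dict such as prodSourceLabelRandomWeightsPermille into a
# 1000-entry list so that random.choice draws labels with the configured
# weights.
import random

def make_choice_list_sketch(pdpm, default):
    choice_list = []
    weight_sum = 0
    for label, weight in pdpm.items():
        choice_list.extend([label] * int(weight))  # weight is in permille
        weight_sum += int(weight)
    # the default label fills the remainder up to 1000
    choice_list.extend([default] * max(1000 - weight_sum, 0))
    return choice_list

# e.g. 50 permille of a hypothetical 'rc_test2' label, remainder 'managed'
labels = make_choice_list_sketch({'rc_test2': 50}, default='managed')
print(len(labels), random.choice(labels))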
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')
    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))
    # whether to submit any worker
    to_submit_any = True
    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            else:
                pass
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()
    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) \
        if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
    pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
    pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''
    # get override requirements from queue configuration
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue
    # deal with Condor schedd and central managers; make a random list, then choose from it
    n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
    if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
        if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
            orig_list = list(zip(self.condorSchedd, self.condorPool))
        else:
            orig_list = [(_schedd, self.condorPool) for _schedd in self.condorSchedd]
        if n_bulks < len(orig_list):
            schedd_pool_choice_list = random.sample(orig_list, n_bulks)
        else:
            schedd_pool_choice_list = orig_list
    else:
        schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]
    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # gather info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                        ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(
                                list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # choose from Condor schedd and central managers
                condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)
                # set submissionHost
                if not condor_schedd and not condor_pool:
                    workspec.submissionHost = 'LOCAL'
                else:
                    workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))
                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
            # set data dict
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
                'pilot_version': pilot_version_orig,
            })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')
    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))
    # submit
    retValList = submit_bag_of_workers(list(dataIterator))
    tmpLog.debug('{0} workers submitted'.format(nWorkers))
    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)
    tmpLog.debug('done')
    return retList
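# Illustrative sketch (inputs are made up) of the [ScheddHostname] substitution
# in submit_workers above: extract the bare hostname from a schedd string that
# may carry a user@ prefix and a :port suffix, then splice it into logBaseURL.
import re

def resolve_log_base_url(log_base_url, condor_schedd):
    if log_base_url and '[ScheddHostname]' in log_base_url:
        schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                 lambda m: m.group(1) if m.group(1) else '',
                                 condor_schedd)
        return re.sub(r'\[ScheddHostname\]', schedd_hostname, log_base_url)
    return log_base_url

print(resolve_log_base_url('https://[ScheddHostname]/logs', 'harvester@schedd01.example.org:9619'))
# -> https://schedd01.example.org/logs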