def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True,
                         cpuadjustratio=100, memoryadjustratio=100):
    # retrieve panda queue information
    panda_queues_dict = PandaQueuesDict()
    queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)

    # set the worker name
    yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)

    # set the resource type label to filter the pods
    yaml_content['spec']['template'].setdefault('metadata', {})
    yaml_content['spec']['template']['metadata'].update(
        {'labels': {'resourceType': str(work_spec.resourceType)}})

    # we can only handle one container: take the first and delete the rest
    yaml_containers = yaml_content['spec']['template']['spec']['containers']
    del yaml_containers[1:]

    container_env = yaml_containers[0]
    container_env.setdefault('resources', {})
    # note that predefined values in the yaml template will NOT be overwritten.
    # setdefault is applied per key (not on the whole 'limits'/'requests' dict),
    # so the memory settings are still filled in when the CPU branch ran first
    if work_spec.nCore > 0:
        container_env['resources'].setdefault('limits', {})
        container_env['resources']['limits'].setdefault('cpu', str(work_spec.nCore))
        container_env['resources'].setdefault('requests', {})
        container_env['resources']['requests'].setdefault(
            'cpu', str(work_spec.nCore * cpuadjustratio / 100.0))
    if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
        container_env['resources'].setdefault('limits', {})
        container_env['resources']['limits'].setdefault(
            'memory', str(work_spec.minRamCount) + 'M')
        container_env['resources'].setdefault('requests', {})
        container_env['resources']['requests'].setdefault(
            'memory', str(work_spec.minRamCount * memoryadjustratio / 100.0) + 'M')

    container_env.setdefault('env', [])
    container_env['env'].extend([
        {'name': 'computingSite', 'value': work_spec.computingSite},
        {'name': 'pandaQueueName', 'value': queue_name},
        {'name': 'resourceType', 'value': work_spec.resourceType},
        {'name': 'proxySecretPath', 'value': cert if cert_in_secret else None},
        {'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert)},
        {'name': 'workerID', 'value': str(work_spec.workerID)},
        {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W},
        {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R},
        {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id},
        {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)},
        {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}
    ])

    if 'affinity' not in yaml_content['spec']['template']['spec']:
        yaml_content = self.set_affinity(yaml_content)

    rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)
    return rsp
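# For reference, a minimal sketch of the kind of job template the method above
# expects. Only the keys it actually reads (metadata.name and
# spec.template.spec.containers) are required; the image name and the other
# values here are illustrative placeholders, not taken from the original code.
minimal_job_template = {
    'apiVersion': 'batch/v1',
    'kind': 'Job',
    'metadata': {'name': 'grid-job'},  # "-<workerID>" gets appended to this
    'spec': {
        'template': {
            'spec': {
                'containers': [
                    {'name': 'pilot', 'image': 'example/pilot:latest'}  # placeholder image
                ],
                'restartPolicy': 'Never',
            }
        }
    }
}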
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'

    # get override requirements from the configured queue
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode is used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choosing an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on with info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add the default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                    self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually defined site: condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = \
                                random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass

            # template for batch script
            try:
                with open(self.templateFile) as tmpFile:
                    sdf_template_raw = tmpFile.read()
            except AttributeError:
                tmpLog.error('No valid templateFile found. '
                             'Maybe templateFile or CEtemplateDir is invalid, or no valid CE was found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)

                # choose from condor schedds and central managers
                if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
                    if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                        condor_schedd, condor_pool = random.choice(list(zip(self.condorSchedd, self.condorPool)))
                    else:
                        condor_schedd = random.choice(self.condorSchedd)
                        condor_pool = self.condorPool
                else:
                    condor_schedd = self.condorSchedd
                    condor_pool = self.condorPool

                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL

                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
            })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
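# The submission flow above is a three-stage thread-pool pipeline: prepare
# per-worker data, submit, then propagate the returned attributes back onto
# the workspecs. A minimal self-contained sketch of the same pattern (names
# are illustrative, not from the original module); note that the iterator
# returned by ThreadPoolExecutor.map stays consumable after the pool exits,
# since leaving the with-block waits for all futures to finish:
from concurrent.futures import ThreadPoolExecutor

def _demo_pipeline(items, n_processes=4):
    prepare = lambda item: {'item': item}                          # stage 1: build data dict
    submit = lambda data: (True, {'batchID': str(data['item'])})   # stage 2: (retVal, attrs)
    results = {}

    def propagate(item, ret_tuple):                                # stage 3: write attrs back
        ret_val, attrs = ret_tuple
        results[item] = attrs
        return ret_val

    with ThreadPoolExecutor(n_processes * 4) as pool:
        data_iter = pool.map(prepare, items)
    with ThreadPoolExecutor(n_processes) as pool:
        ret_vals = pool.map(submit, data_iter)
    with ThreadPoolExecutor(n_processes) as pool:
        ret_list = list(pool.map(lambda t: propagate(*t), zip(items, ret_vals)))
    return ret_list, results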
def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, pilot_type,
                         pilot_url_str, pilot_python_option, container_image, executable,
                         args, cert, cpu_adjust_ratio=100, memory_adjust_ratio=100, max_time=None):
    tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml')

    # consider PULL mode as default, unless specified
    submit_mode = 'PULL'

    # create the configmap in push mode
    worker_id = None
    if work_spec.mapType != 'NoJob':
        submit_mode = 'PUSH'
        worker_id = str(work_spec.workerID)
        res = self.create_configmap(work_spec)
        if not res:  # if the configmap creation failed, don't submit a job because the pod creation will hang
            return res, 'Failed to create a configmap'

    # retrieve panda queue information
    panda_queues_dict = PandaQueuesDict()
    queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)

    # set the worker name
    yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)

    # set the resource type and other metadata to filter the pods
    yaml_content['spec']['template'].setdefault('metadata', {})
    yaml_content['spec']['template']['metadata'].update({
        'labels': {
            'resourceType': str(work_spec.resourceType),
            'prodSourceLabel': str(prod_source_label),
            'pq': str(work_spec.computingSite)
        }
    })

    # fill the container details. we can only handle one container (take the first, delete the rest)
    yaml_containers = yaml_content['spec']['template']['spec']['containers']
    del yaml_containers[1:]

    container_env = yaml_containers[0]
    container_env.setdefault('resources', {})

    # set the container image
    if 'image' not in container_env:
        container_env['image'] = container_image

    if 'command' not in container_env:
        container_env['command'] = executable
        container_env['args'] = args

    # set the resources (CPU and memory) we need for the container
    # note that predefined values in the yaml template will NOT be overwritten
    # be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod
    # the CPU & memory settings will affect the QoS for the pod
    container_env.setdefault('resources', {})
    if work_spec.nCore > 0:
        # CPU limits
        container_env['resources'].setdefault('limits', {})
        if 'cpu' not in container_env['resources']['limits']:
            container_env['resources']['limits']['cpu'] = str(work_spec.nCore)
        # CPU requests
        container_env['resources'].setdefault('requests', {})
        if 'cpu' not in container_env['resources']['requests']:
            container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpu_adjust_ratio / 100.0)

    if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
        # memory limits
        # container_env['resources'].setdefault('limits', {})
        # if 'memory' not in container_env['resources']['limits']:
        #     container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M'
        # memory requests
        container_env['resources'].setdefault('requests', {})
        if 'memory' not in container_env['resources']['requests']:
            container_env['resources']['requests']['memory'] = str(
                work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M'

    container_env.setdefault('env', [])
    # try to retrieve the stdout log file name
    try:
        log_file_name = work_spec.workAttributes['stdout']
    except (KeyError, AttributeError):
        tmp_log.debug('work_spec does not have stdout workAttribute, using default')
        log_file_name = ''

    container_env['env'].extend([
        {'name': 'computingSite', 'value': work_spec.computingSite},
        {'name': 'pandaQueueName', 'value': queue_name},
        {'name': 'resourceType', 'value': work_spec.resourceType},
        {'name': 'prodSourceLabel', 'value': prod_source_label},
        {'name': 'pilotType', 'value': pilot_type},
        {'name': 'pilotUrlOpt', 'value': pilot_url_str},
        {'name': 'pythonOption', 'value': pilot_python_option},
        # {'name': 'jobType', 'value': work_spec.jobType},
        {'name': 'proxySecretPath', 'value': cert},
        {'name': 'workerID', 'value': str(work_spec.workerID)},
        {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W},
        {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R},
        {'name': 'stdout_name', 'value': log_file_name},
        {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id},
        {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)},
        {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id},
        {'name': 'submit_mode', 'value': submit_mode},
        {'name': 'EXEC_DIR', 'value': EXEC_DIR},
    ])

    # add the pilots-starter configmap
    yaml_content['spec']['template']['spec'].setdefault('volumes', [])
    yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
    yaml_volumes.append({'name': 'pilots-starter', 'configMap': {'name': 'pilots-starter'}})
    # mount the volume to the filesystem
    container_env.setdefault('volumeMounts', [])
    container_env['volumeMounts'].append({'name': 'pilots-starter', 'mountPath': EXEC_DIR})

    # in push mode, add the configmap as a volume to the pod
    if submit_mode == 'PUSH' and worker_id:
        yaml_content['spec']['template']['spec'].setdefault('volumes', [])
        yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
        yaml_volumes.append({'name': 'job-config', 'configMap': {'name': worker_id}})
        # mount the volume to the filesystem
        container_env.setdefault('volumeMounts', [])
        container_env['volumeMounts'].append({'name': 'job-config', 'mountPath': CONFIG_DIR})

    # if we are running the pilot in an emptyDir named "pilot-dir", then set the max size
    if 'volumes' in yaml_content['spec']['template']['spec']:
        yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
        for volume in yaml_volumes:
            # do not overwrite any hardcoded sizeLimit value
            if volume['name'] == 'pilot-dir' and 'emptyDir' in volume and 'sizeLimit' not in volume['emptyDir']:
                maxwdir_prorated_GB = panda_queues_dict.get_prorated_maxwdir_GB(work_spec.computingSite,
                                                                                work_spec.nCore)
                if maxwdir_prorated_GB:
                    volume['emptyDir']['sizeLimit'] = '{0}G'.format(maxwdir_prorated_GB)

    # set the affinity
    if 'affinity' not in yaml_content['spec']['template']['spec']:
        yaml_content = self.set_affinity(yaml_content)

    # set max_time to avoid having a pod running forever
    if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec']:
        if not max_time:
            # 4 days
            max_time = 4 * 24 * 3600
        yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time

    tmp_log.debug('creating job {0}'.format(yaml_content))

    rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)
    return rsp, yaml_content
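# Worked example of the resource prorating above (the numbers are
# illustrative, not from the original code): with nCore=8,
# minRamCount=16000 MB, cpu_adjust_ratio=90 and memory_adjust_ratio=80,
# the container ends up with
#   limits.cpu      = '8'
#   requests.cpu    = str(8 * 90 / 100.0)            -> '7.2'
#   requests.memory = str(16000 * 80 / 100.0) + 'M'  -> '12800.0M'
# Since requests < limits and the memory limit stays commented out above,
# the pod falls into the Burstable QoS class rather than Guaranteed.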
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    try:
        os.mkdir(log_subdir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                           or this_panda_queue_dict.get('capability', '') == 'ucore'
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''

        if self.useAtlasGridCE:
            # ATLAS Grid CE mode is used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
            worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
            ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                 queue_status_dict=queue_status_dict,
                                                 worker_ce_stats_dict=worker_ce_stats_dict)
            # good CEs which can be submitted to, duplicated by weight
            good_ce_weighted_list = []
            for _ce_endpoint in ce_auxilary_dict.keys():
                good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
            tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
            if len(good_ce_weighted_list) > 0:
                ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
            else:
                tmpLog.info('No good CE endpoint left. Choosing an arbitrary CE endpoint')
                ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
            ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
            ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
            ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)

        # template for batch script
        with open(self.templateFile) as tmpFile:
            sdf_template = tmpFile.read()

        # get batch_log, stdout, stderr filenames
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue

        # get override requirements from the configured queue
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # URLs for log files
        if self.logBaseURL is not None:
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path,
                                                          batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path,
                                                             batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path,
                                                            batchID=batchID, guess=guess)
            batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
            batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
            batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file before submission')
        tmpLog.debug('Done jobspec attribute setting')

        # set data dict
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': self.condorSchedd,
                'condor_pool': self.condorPool,
                }
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
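# The CE selection above implements weighted random choice by duplicating
# each endpoint in a list as many times as its integer weight, then picking
# uniformly. A minimal standalone sketch of the same technique (the endpoint
# names and weights are illustrative):
import random

def weighted_choice(weight_dict):
    # build a list where each key appears `weight` times, then pick uniformly
    weighted_list = []
    for key, weight in weight_dict.items():
        weighted_list.extend([key] * weight)
    if not weighted_list:
        return None  # all weights zero: the caller falls back to an arbitrary pick
    return random.choice(weighted_list)

# e.g. weighted_choice({'ce-a.example.org': 3, 'ce-b.example.org': 1})
# returns 'ce-a.example.org' about 75% of the time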
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
    pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
    pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''

    # get override requirements from the configured queue
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with condor schedds and central managers; make a randomized list to choose from
    n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
    if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
        if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
            orig_list = list(zip(self.condorSchedd, self.condorPool))
        else:
            orig_list = [(_schedd, self.condorPool) for _schedd in self.condorSchedd]
        if n_bulks < len(orig_list):
            schedd_pool_choice_list = random.sample(orig_list, n_bulks)
        else:
            schedd_pool_choice_list = orig_list
    else:
        schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode is used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choosing an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on with info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add the default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                        ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually defined site: condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = \
                                random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass

            # template for batch script
            try:
                with open(self.templateFile) as tmpFile:
                    sdf_template_raw = tmpFile.read()
            except AttributeError:
                tmpLog.error('No valid templateFile found. '
                             'Maybe templateFile or CEtemplateDir is invalid, or no valid CE was found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)

                # choose from condor schedds and central managers
                condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)

                # set submissionHost
                if not condor_schedd and not condor_pool:
                    workspec.submissionHost = 'LOCAL'
                else:
                    workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))

                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL

                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
                'pilot_version': pilot_version_orig,
            })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # submit
    retValList = submit_bag_of_workers(list(dataIterator))
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
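# _div_round_up, used above to size schedd randomization, is not shown in
# this excerpt. A plausible minimal implementation (an assumption, matching
# how n_bulks is used: ceiling division of nWorkers by the minimum bulk
# size per schedd):
def _div_round_up(a, b):
    # ceiling of a / b using integer arithmetic, e.g. _div_round_up(10, 4) == 3
    return (a + b - 1) // b

# With nWorkers=10 and minBulkToRamdomizedSchedd=4, n_bulks=3, so at most
# three schedd/pool pairs are sampled for this submission cycle.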