def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True,
                         cpuadjustratio=100, memoryadjustratio=100):
    # retrieve panda queue information
    panda_queues_dict = PandaQueuesDict()
    queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)

    # set the worker name
    yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)

    # set the resource type label to filter the pods
    yaml_content['spec']['template'].setdefault('metadata', {})
    yaml_content['spec']['template']['metadata'].update(
        {'labels': {'resourceType': str(work_spec.resourceType)}})

    # we can only handle one container: take the first and delete the rest
    yaml_containers = yaml_content['spec']['template']['spec']['containers']
    del yaml_containers[1:]

    container_env = yaml_containers[0]
    container_env.setdefault('resources', {})
    # note that predefined values in the yaml template will NOT be overwritten.
    # setdefault is applied per key (not on the whole 'limits'/'requests' dict),
    # so the memory settings are still filled in when the CPU branch ran first
    if work_spec.nCore > 0:
        container_env['resources'].setdefault('limits', {})
        container_env['resources']['limits'].setdefault('cpu', str(work_spec.nCore))
        container_env['resources'].setdefault('requests', {})
        container_env['resources']['requests'].setdefault(
            'cpu', str(work_spec.nCore * cpuadjustratio / 100.0))
    if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
        container_env['resources'].setdefault('limits', {})
        container_env['resources']['limits'].setdefault(
            'memory', str(work_spec.minRamCount) + 'M')
        container_env['resources'].setdefault('requests', {})
        container_env['resources']['requests'].setdefault(
            'memory', str(work_spec.minRamCount * memoryadjustratio / 100.0) + 'M')

    container_env.setdefault('env', [])
    container_env['env'].extend([
        {'name': 'computingSite', 'value': work_spec.computingSite},
        {'name': 'pandaQueueName', 'value': queue_name},
        {'name': 'resourceType', 'value': work_spec.resourceType},
        {'name': 'proxySecretPath', 'value': cert if cert_in_secret else None},
        {'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert)},
        {'name': 'workerID', 'value': str(work_spec.workerID)},
        {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W},
        {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R},
        {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id},
        {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)},
        {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}
    ])

    if 'affinity' not in yaml_content['spec']['template']['spec']:
        yaml_content = self.set_affinity(yaml_content)

    rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)
    return rsp
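# For reference, a minimal sketch of the kind of job template the method above
# expects. Only the keys it actually reads (metadata.name and
# spec.template.spec.containers) are required; the image name and the other
# values here are illustrative placeholders, not taken from the original code.
minimal_job_template = {
    'apiVersion': 'batch/v1',
    'kind': 'Job',
    'metadata': {'name': 'grid-job'},  # "-<workerID>" gets appended to this
    'spec': {
        'template': {
            'spec': {
                'containers': [
                    {'name': 'pilot', 'image': 'example/pilot:latest'}  # placeholder image
                ],
                'restartPolicy': 'Never',
            }
        }
    }
}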
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'

    # get override requirements from the configured queue
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode is used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choosing an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on with info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add the default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                    self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually defined site: condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = \
                                random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass

            # template for batch script
            try:
                with open(self.templateFile) as tmpFile:
                    sdf_template_raw = tmpFile.read()
            except AttributeError:
                tmpLog.error('No valid templateFile found. '
                             'Maybe templateFile or CEtemplateDir is invalid, or no valid CE was found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)

                # choose from condor schedds and central managers
                if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
                    if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                        condor_schedd, condor_pool = random.choice(list(zip(self.condorSchedd, self.condorPool)))
                    else:
                        condor_schedd = random.choice(self.condorSchedd)
                        condor_pool = self.condorPool
                else:
                    condor_schedd = self.condorSchedd
                    condor_pool = self.condorPool

                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL

                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
            })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
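# The submission flow above is a three-stage thread-pool pipeline: prepare
# per-worker data, submit, then propagate the returned attributes back onto
# the workspecs. A minimal self-contained sketch of the same pattern (names
# are illustrative, not from the original module); note that the iterator
# returned by ThreadPoolExecutor.map stays consumable after the pool exits,
# since leaving the with-block waits for all futures to finish:
from concurrent.futures import ThreadPoolExecutor

def _demo_pipeline(items, n_processes=4):
    prepare = lambda item: {'item': item}                          # stage 1: build data dict
    submit = lambda data: (True, {'batchID': str(data['item'])})   # stage 2: (retVal, attrs)
    results = {}

    def propagate(item, ret_tuple):                                # stage 3: write attrs back
        ret_val, attrs = ret_tuple
        results[item] = attrs
        return ret_val

    with ThreadPoolExecutor(n_processes * 4) as pool:
        data_iter = pool.map(prepare, items)
    with ThreadPoolExecutor(n_processes) as pool:
        ret_vals = pool.map(submit, data_iter)
    with ThreadPoolExecutor(n_processes) as pool:
        ret_list = list(pool.map(lambda t: propagate(*t), zip(items, ret_vals)))
    return ret_list, results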
def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, pilot_type,
                         pilot_url_str, pilot_python_option, container_image, executable,
                         args, cert, cpu_adjust_ratio=100, memory_adjust_ratio=100, max_time=None):
    tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml')

    # consider PULL mode as default, unless specified
    submit_mode = 'PULL'

    # create the configmap in push mode
    worker_id = None
    if work_spec.mapType != 'NoJob':
        submit_mode = 'PUSH'
        worker_id = str(work_spec.workerID)
        res = self.create_configmap(work_spec)
        if not res:  # if the configmap creation failed, don't submit a job because the pod creation will hang
            return res, 'Failed to create a configmap'

    # retrieve panda queue information
    panda_queues_dict = PandaQueuesDict()
    queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)

    # set the worker name
    yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)

    # set the resource type and other metadata to filter the pods
    yaml_content['spec']['template'].setdefault('metadata', {})
    yaml_content['spec']['template']['metadata'].update({
        'labels': {
            'resourceType': str(work_spec.resourceType),
            'prodSourceLabel': str(prod_source_label),
            'pq': str(work_spec.computingSite)
        }
    })

    # fill the container details. we can only handle one container (take the first, delete the rest)
    yaml_containers = yaml_content['spec']['template']['spec']['containers']
    del yaml_containers[1:]

    container_env = yaml_containers[0]
    container_env.setdefault('resources', {})

    # set the container image
    if 'image' not in container_env:
        container_env['image'] = container_image

    if 'command' not in container_env:
        container_env['command'] = executable
        container_env['args'] = args

    # set the resources (CPU and memory) we need for the container
    # note that predefined values in the yaml template will NOT be overwritten
    # be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod
    # the CPU & memory settings will affect the QoS for the pod
    container_env.setdefault('resources', {})
    if work_spec.nCore > 0:
        # CPU limits
        container_env['resources'].setdefault('limits', {})
        if 'cpu' not in container_env['resources']['limits']:
            container_env['resources']['limits']['cpu'] = str(work_spec.nCore)
        # CPU requests
        container_env['resources'].setdefault('requests', {})
        if 'cpu' not in container_env['resources']['requests']:
            container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpu_adjust_ratio / 100.0)

    if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
        # memory limits
        # container_env['resources'].setdefault('limits', {})
        # if 'memory' not in container_env['resources']['limits']:
        #     container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M'
        # memory requests
        container_env['resources'].setdefault('requests', {})
        if 'memory' not in container_env['resources']['requests']:
            container_env['resources']['requests']['memory'] = str(
                work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M'

    container_env.setdefault('env', [])
    # try to retrieve the stdout log file name
    try:
        log_file_name = work_spec.workAttributes['stdout']
    except (KeyError, AttributeError):
        tmp_log.debug('work_spec does not have stdout workAttribute, using default')
        log_file_name = ''

    container_env['env'].extend([
        {'name': 'computingSite', 'value': work_spec.computingSite},
        {'name': 'pandaQueueName', 'value': queue_name},
        {'name': 'resourceType', 'value': work_spec.resourceType},
        {'name': 'prodSourceLabel', 'value': prod_source_label},
        {'name': 'pilotType', 'value': pilot_type},
        {'name': 'pilotUrlOpt', 'value': pilot_url_str},
        {'name': 'pythonOption', 'value': pilot_python_option},
        # {'name': 'jobType', 'value': work_spec.jobType},
        {'name': 'proxySecretPath', 'value': cert},
        {'name': 'workerID', 'value': str(work_spec.workerID)},
        {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W},
        {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R},
        {'name': 'stdout_name', 'value': log_file_name},
        {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id},
        {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)},
        {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id},
        {'name': 'submit_mode', 'value': submit_mode},
        {'name': 'EXEC_DIR', 'value': EXEC_DIR},
    ])

    # add the pilots-starter configmap
    yaml_content['spec']['template']['spec'].setdefault('volumes', [])
    yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
    yaml_volumes.append({'name': 'pilots-starter', 'configMap': {'name': 'pilots-starter'}})
    # mount the volume to the filesystem
    container_env.setdefault('volumeMounts', [])
    container_env['volumeMounts'].append({'name': 'pilots-starter', 'mountPath': EXEC_DIR})

    # in push mode, add the configmap as a volume to the pod
    if submit_mode == 'PUSH' and worker_id:
        yaml_content['spec']['template']['spec'].setdefault('volumes', [])
        yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
        yaml_volumes.append({'name': 'job-config', 'configMap': {'name': worker_id}})
        # mount the volume to the filesystem
        container_env.setdefault('volumeMounts', [])
        container_env['volumeMounts'].append({'name': 'job-config', 'mountPath': CONFIG_DIR})

    # if we are running the pilot in an emptyDir named "pilot-dir", then set the max size
    if 'volumes' in yaml_content['spec']['template']['spec']:
        yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
        for volume in yaml_volumes:
            # do not overwrite any hardcoded sizeLimit value
            if volume['name'] == 'pilot-dir' and 'emptyDir' in volume and 'sizeLimit' not in volume['emptyDir']:
                maxwdir_prorated_GB = panda_queues_dict.get_prorated_maxwdir_GB(work_spec.computingSite,
                                                                                work_spec.nCore)
                if maxwdir_prorated_GB:
                    volume['emptyDir']['sizeLimit'] = '{0}G'.format(maxwdir_prorated_GB)

    # set the affinity
    if 'affinity' not in yaml_content['spec']['template']['spec']:
        yaml_content = self.set_affinity(yaml_content)

    # set max_time to avoid having a pod running forever
    if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec']:
        if not max_time:
            # 4 days
            max_time = 4 * 24 * 3600
        yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time

    tmp_log.debug('creating job {0}'.format(yaml_content))

    rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)
    return rsp, yaml_content
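# Worked example of the resource prorating above (the numbers are
# illustrative, not from the original code): with nCore=8,
# minRamCount=16000 MB, cpu_adjust_ratio=90 and memory_adjust_ratio=80,
# the container ends up with
#   limits.cpu      = '8'
#   requests.cpu    = str(8 * 90 / 100.0)            -> '7.2'
#   requests.memory = str(16000 * 80 / 100.0) + 'M'  -> '12800.0M'
# Since requests < limits and the memory limit stays commented out above,
# the pod falls into the Burstable QoS class rather than Guaranteed.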
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    try:
        os.mkdir(log_subdir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                           or this_panda_queue_dict.get('capability', '') == 'ucore'
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''

        if self.useAtlasGridCE:
            # ATLAS Grid CE mode is used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
            worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
            ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                 queue_status_dict=queue_status_dict,
                                                 worker_ce_stats_dict=worker_ce_stats_dict)
            # good CEs which can be submitted to, duplicated by weight
            good_ce_weighted_list = []
            for _ce_endpoint in ce_auxilary_dict.keys():
                good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
            tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
            if len(good_ce_weighted_list) > 0:
                ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
            else:
                tmpLog.info('No good CE endpoint left. Choosing an arbitrary CE endpoint')
                ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
            ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
            ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
            ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(
                self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)

        # template for batch script
        with open(self.templateFile) as tmpFile:
            sdf_template = tmpFile.read()

        # get batch_log, stdout, stderr filenames
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue

        # get override requirements from the configured queue
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # URLs for log files
        if self.logBaseURL is not None:
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path,
                                                          batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path,
                                                             batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path,
                                                            batchID=batchID, guess=guess)
            batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
            batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
            batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file before submission')
        tmpLog.debug('Done jobspec attribute setting')

        # set data dict
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': self.condorSchedd,
                'condor_pool': self.condorPool,
                }
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
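# The CE selection above implements weighted random choice by duplicating
# each endpoint in a list as many times as its integer weight, then picking
# uniformly. A minimal standalone sketch of the same technique (the endpoint
# names and weights are illustrative):
import random

def weighted_choice(weight_dict):
    # build a list where each key appears `weight` times, then pick uniformly
    weighted_list = []
    for key, weight in weight_dict.items():
        weighted_list.extend([key] * weight)
    if not weighted_list:
        return None  # all weights zero: the caller falls back to an arbitrary pick
    return random.choice(weighted_list)

# e.g. weighted_choice({'ce-a.example.org': 3, 'ce-b.example.org': 1})
# returns 'ce-a.example.org' about 75% of the time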
def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
    pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
    pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''

    # get override requirements from the configured queue
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with condor schedds and central managers; make a randomized list to choose from
    n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
    if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
        if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
            orig_list = list(zip(self.condorSchedd, self.condorPool))
        else:
            orig_list = [(_schedd, self.condorPool) for _schedd in self.condorSchedd]
        if n_bulks < len(orig_list):
            schedd_pool_choice_list = random.sample(orig_list, n_bulks)
        else:
            schedd_pool_choice_list = orig_list
    else:
        schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode is used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choosing an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on with info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add the default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                        ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually defined site: condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = \
                                random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass

            # template for batch script
            try:
                with open(self.templateFile) as tmpFile:
                    sdf_template_raw = tmpFile.read()
            except AttributeError:
                tmpLog.error('No valid templateFile found. '
                             'Maybe templateFile or CEtemplateDir is invalid, or no valid CE was found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)

                # choose from condor schedds and central managers
                condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)

                # set submissionHost
                if not condor_schedd and not condor_pool:
                    workspec.submissionHost = 'LOCAL'
                else:
                    workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))

                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL

                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data.update({
                'workspec': workspec,
                'to_submit': to_submit,
                'template': sdf_template,
                'executable_file': self.executableFile,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': condor_schedd,
                'condor_pool': condor_pool,
                'use_spool': self.useSpool,
                'pilot_version': pilot_version_orig,
            })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # submit
    retValList = submit_bag_of_workers(list(dataIterator))
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
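# _div_round_up, used above to size schedd randomization, is not shown in
# this excerpt. A plausible minimal implementation (an assumption, matching
# how n_bulks is used: ceiling division of nWorkers by the minimum bulk
# size per schedd):
def _div_round_up(a, b):
    # ceiling of a / b using integer arithmetic, e.g. _div_round_up(10, 4) == 3
    return (a + b - 1) // b

# With nWorkers=10 and minBulkToRamdomizedSchedd=4, n_bulks=3, so at most
# three schedd/pool pairs are sampled for this submission cycle.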