def test():
    '''
    Test submission of a single ARC worker.

    Builds a JobSpec from a canned PanDA job description (JSON string below),
    attaches it to a worker produced by the queue's configured workerMaker,
    and submits the worker through ARCSubmitter, printing the result and the
    resulting batch ID.

    NOTE(review): QueueConfigMapper and ARCSubmitter are not imported inside
    this function -- presumably they come from module-level imports outside
    this view; verify before running standalone. Python 2 print statements.
    '''
    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvestercore.plugin_factory import PluginFactory
    import json

    # resolve the queue configuration for the test queue
    queuename = 'ARC-TEST'
    queueconfmapper = QueueConfigMapper()
    queueconf = queueconfmapper.get_queue(queuename)
    pluginfactory = PluginFactory()

    # canned PanDA job description (as received from the PanDA server)
    pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}'
    pandajob = json.loads(pandajob)

    # convert the raw job dict into a JobSpec bound to the test queue
    jspec = JobSpec()
    jspec.convert_job_json(pandajob)
    jspec.computingSite = queuename
    jspeclist = [jspec]

    # make a worker via the queue's configured workerMaker plugin
    maker = pluginfactory.get_plugin(queueconf.workerMaker)
    wspec = maker.make_worker(jspeclist, queueconf)
    wspec.hasJob = 1
    wspec.set_jobspec_list(jspeclist)

    # submit and report the outcome (Python 2 print statements)
    sub = ARCSubmitter()
    print sub.submit_workers([wspec])
    print wspec.batchID
def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Fetch job output and process pilot info for sending in final heartbeat.
    The pilot pickle is loaded and some attributes corrected (schedulerid,
    pilotlog etc), then converted to dictionary and stored in
    workspec.workAttributes[pandaid]. If pilot pickle cannot be used, report
    ARC error in pilotErrorDiag and fill all possible attributes using ARC
    information.
    '''
    arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
    tmplog = arclog.log
    tmplog.info('Post processing ARC job {0}'.format(workspec.batchID))
    job = workspec.workAttributes['arcjob']
    proxyrole = workspec.workAttributes['proxyrole']
    arcid = job['JobID']
    tmplog.info('Job id {0}'.format(arcid))

    # nothing to do if the submitter recorded no output files to fetch
    if 'arcdownloadfiles' not in workspec.workAttributes:
        tmplog.error('No files to download')
        return

    # Assume one-to-one mapping of workers to jobs. If jobspec_list is empty
    # it means the job was cancelled by panda or otherwise forgotten
    if not jobspec_list:
        return

    # Set certificate
    userconfig = arc.UserConfig(self.cred_type)
    try:
        userconfig.ProxyPath(str(self.certs[proxyrole]))
    except Exception:
        # BUG FIX: 'job' is a dict here, so the original 'job.JobID' raised
        # AttributeError inside this handler; use the already-extracted arcid.
        # Also narrowed the bare 'except:' to 'except Exception'.
        tmplog.error("Job {0}: no proxy found with role {1}".format(arcid, proxyrole))
        return

    # look up queue config of the job's site for log location settings
    queueconfigmapper = QueueConfigMapper()
    queueconfig = queueconfigmapper.get_queue(jobspec_list[0].computingSite)
    logbaseurl = queueconfig.submitter.get('logBaseURL')
    logbasedir = queueconfig.submitter.get('logDir', self.tmpdir)
    logsubdir = workspec.workAttributes['logsubdir']
    pandaid = str(jobspec_list[0].PandaID)

    # Construct log path and url (url only if a base URL is configured)
    logurl = '/'.join([logbaseurl, logsubdir, str(pandaid)]) if logbaseurl else None
    logdir = os.path.join(logbasedir, logsubdir)

    # post_processing is only called once, so no retries are done. But keep
    # the possibility here in case it changes
    (fetched, notfetched, notfetchedretry) = self._download_outputs(
        workspec.workAttributes['arcdownloadfiles'], logdir, arcid, pandaid,
        userconfig, tmplog)
    if arcid not in fetched:
        tmplog.warning("Could not get outputs of {0}".format(arcid))

    # store the (fixed-up) pilot pickle info keyed by panda id
    workspec.workAttributes[long(pandaid)] = {}
    workspec.workAttributes[long(pandaid)] = self._extractAndFixPilotPickle(
        job, pandaid, (arcid in fetched), logurl, tmplog)

    tmplog.debug("pilot info for {0}: {1}".format(
        pandaid, workspec.workAttributes[long(pandaid)]))
def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
    """Store run-mode flags, then initialize database and config objects."""
    # run-mode flags
    self.daemonMode = daemon_mode
    self.stopEvent = stop_event
    self.singleMode = single_mode
    # imports kept local to the constructor, as in the original layout
    from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
    self.communicatorPool = CommunicatorPool()
    self.queueConfigMapper = QueueConfigMapper()
    # create DB tables based on the queue configuration
    db_proxy = DBProxy()
    db_proxy.make_tables(self.queueConfigMapper)
def __init__(self, **kwarg):
    """
    Multinode worker maker constructor.

    Sets up the plugin factory and queue config mapper, then resolves the
    node count / walltime either from static config or dynamically via
    get_resources(), and derives the number of jobs per worker.
    """
    BaseWorkerMaker.__init__(self, **kwarg)
    self.pluginFactory = PluginFactory()
    self.queue_config_mapper = QueueConfigMapper()
    tmpLog = self.make_logger(baseLogger, method_name='__init__')
    tmpLog.info("Multinode workermaker: created.")
    tmpLog.debug("Queue name: {0}".format(self.queueName))
    if self.mode == "static":
        # static: nNodes/walltimelimit are expected to come from configuration
        tmpLog.info("Static configuration")
    elif self.mode == "dynamic":
        # dynamic: query the batch system for available resources
        tmpLog.info("Dynamic configuration")
        self.nNodes, self.walltimelimit = self.get_resources()
    # total jobs bundled into one multi-node worker
    self.nJobsPerWorker = self.nNodes * self.nJobsPerNode
def __init__(self, **kwarg):
    """Initialize the plugin and the GCE VM status -> worker status mapping."""
    PluginBase.__init__(self, **kwarg)
    self.queue_config_mapper = QueueConfigMapper()
    # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status
    status_map = dict()
    status_map['RUNNING'] = WorkSpec.ST_running
    # the VM is stopped, but has to be fully deleted
    status_map['TERMINATED'] = WorkSpec.ST_running
    status_map['STOPPING'] = WorkSpec.ST_finished
    status_map['PROVISIONING'] = WorkSpec.ST_submitted
    status_map['STAGING'] = WorkSpec.ST_submitted
    self.vm_to_worker_status = status_map
def __init__(self, pid_file, single_mode=False):
    """Service monitor constructor: resolve the pid file, inspect the master
    process tree, and prepare DB / config / credential helpers."""
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()

    # resolve the pid file: explicit argument wins, otherwise fall back to
    # the configured value, tolerating a missing config entry
    if pid_file is None:
        try:
            pid_file = harvester_config.service_monitor.pidfile
        except Exception:
            pid_file = None
    self.pid_file = pid_file

    # snapshot the master process and its children for monitoring
    self.pid = self.get_master_pid()
    self.master_process = psutil.Process(self.pid)
    self.children = self.master_process.children(recursive=True)

    self.cpu_count = multiprocessing.cpu_count()
    self.queue_config_mapper = QueueConfigMapper()
    self.cred_manager = CredManager(self.queue_config_mapper, single_mode=True)
def submit_k8s_worker(self, work_spec):
    """
    Submit one worker as a Kubernetes job.

    Reads the harvester queue config and the k8s yaml template, decides
    container image/executable, chooses a proxy, and creates the job via
    the k8s client. Returns a (bool, message) tuple.
    """
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
    # TODO: consider if we want to upload the yaml file to PanDA cache

    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = this_panda_queue_dict['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # keep only the AGIS/CRIC parameters this submitter understands
        associated_params_dict = {}
        for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
            if key in self._allowed_agis_attrs:
                associated_params_dict[key] = val

        pilot_url = associated_params_dict.get('pilot_url')
        pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
        python_version = str(this_panda_queue_dict.get('python_version', '2'))

        # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
        # special pilot types (e.g. test) override the plain settings
        pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
        if pilot_opt_dict is None:
            prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_type = work_spec.pilotType
            pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
        else:
            prod_source_label = pilot_opt_dict['prod_source_label']
            pilot_type = pilot_opt_dict['pilot_type_opt']
            pilot_url_str = pilot_opt_dict['pilot_url_str']
        pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label,
                                                                       pilot_type, pilot_url_str,
                                                                       pilot_python_option,
                                                                       container_image, executable, args, cert,
                                                                       cpu_adjust_ratio=self.cpuAdjustRatio,
                                                                       memory_adjust_ratio=self.memoryAdjustRatio,
                                                                       max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        # success: the k8s job name becomes the worker's batch id
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
def submit_workers(self, workspec_list):
    """
    Submit workers by inserting jobs into the aCT database.

    For each worker: resolves its queue config, builds the aCT job
    description (push mode uses the url-encoded panda job parameters, pull
    mode a minimal PandaID/prodSourceLabel string), inserts it into the aCT
    DB and records batch id and log file locations on the workspec.
    Returns a list of (bool, message) tuples, one per worker.
    """
    retList = []
    for workSpec in workspec_list:
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='submit_workers')

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        prodSourceLabel = queueconfig.get_source_label()

        # If jobSpec is defined we are in push mode, if not pull mode
        # Both assume one to one worker to job mapping
        jobSpec = workSpec.get_jobspec_list()
        if jobSpec:
            jobSpec = jobSpec[0]
            tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))
            # Unified queues: take prodsourcelabel from job
            prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel)

        desc = {}
        # If we need to prefetch events, set aCT status waiting.
        # feed_events in act_messenger will fill events and release the job
        if queueconfig.prefetchEvents:
            desc['pandastatus'] = 'waiting'
            desc['actpandastatus'] = 'waiting'
            desc['arcjobid'] = -1  # dummy id to prevent submission
        else:
            desc['pandastatus'] = 'sent'
            desc['actpandastatus'] = 'sent'
        desc['siteName'] = workSpec.computingSite
        # pilot proxy for analysis labels, production proxy otherwise
        desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production']
        desc['prodSourceLabel'] = prodSourceLabel
        desc['sendhb'] = 0
        metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                    'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
        desc['metadata'] = json.dumps(metadata)

        if jobSpec:
            # push mode: aCT takes the url-encoded job description (like it gets from panda server)
            pandaid = jobSpec.PandaID
            actjobdesc = urllib.parse.urlencode(jobSpec.jobParams)
        else:
            # pull mode: just set pandaid (to workerid) and prodsourcelabel
            pandaid = workSpec.workerID
            actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

        tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
        try:
            batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
        except Exception as e:
            result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
        else:
            tmpLog.info("aCT batch id {0}".format(batchid))
            workSpec.batchID = str(batchid)
            workSpec.submissionHost = self.hostname
            workSpec.nativeStatus = desc['actpandastatus']
            # Set log files in workSpec
            today = time.strftime('%Y-%m-%d', time.gmtime())
            logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)])
            workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
            workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
            workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
            workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl))
            result = (True, '')
        retList.append(result)
    return retList
def check_workers(self, workspec_list):
    """
    Check the status of workers in the aCT database.

    Maps each worker's aCT status (actpandastatus) to a harvester WorkSpec
    status; in true-pilot mode a worker stays 'running' until aCT reports a
    final state, otherwise post-running states map to 'finished' (handled by
    the stager). Returns (True, list of (status, errorMsg) per worker).
    """
    retList = []
    for workSpec in workspec_list:
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='check_workers')
        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        try:
            tmpLog.debug('Querying aCT for id {0}'.format(workSpec.batchID))
            columns = ['actpandastatus', 'pandastatus', 'computingElement', 'node', 'error']
            actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID), columns)
        except Exception as e:
            if self.actDB:
                tmpLog.error("Failed to query aCT DB: {0}".format(str(e)))
            # send back current status
            retList.append((workSpec.status, ''))
            continue

        if not actjobs:
            tmpLog.error("Job with id {0} not found in aCT".format(workSpec.batchID))
            # send back current status
            retList.append((WorkSpec.ST_failed, "Job not found in aCT"))
            continue

        actstatus = actjobs[0]['actpandastatus']
        workSpec.nativeStatus = actstatus
        newStatus = WorkSpec.ST_running
        errorMsg = ''
        # pre-running aCT states map to 'submitted'
        if actstatus in ['waiting', 'sent', 'starting']:
            newStatus = WorkSpec.ST_submitted

        # Handle post running states
        if queueconfig.truePilot:
            # True pilot: keep in running until really done
            if actstatus in ['done', 'donecancelled']:
                newStatus = WorkSpec.ST_finished
            elif actstatus == 'donefailed':
                # set failed here with workspec sup error
                errorMsg = actjobs[0]['error'] or 'Unknown error'
                error_code = WorkerErrors.error_codes.get('GENERAL_ERROR')
                workSpec.set_supplemental_error(error_code=error_code, error_diag=errorMsg)
                newStatus = WorkSpec.ST_failed
                tmpLog.info('ID {0} failed with error {1})'.format(workSpec.batchID, errorMsg))
        elif actstatus in ['done', 'donefailed', 'donecancelled', 'transferring', 'tovalidate']:
            # NG mode: all post processing is now done in the stager
            newStatus = WorkSpec.ST_finished

        if newStatus != workSpec.status:
            tmpLog.info('ID {0} updated status {1} -> {2} ({3})'.format(workSpec.batchID, workSpec.status, newStatus, actstatus))
        else:
            tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(actstatus, newStatus))

        # propagate CE and worker node info onto the workspec when available
        if actjobs[0]['computingElement']:
            workSpec.computingElement = actjobs[0]['computingElement']
        if actjobs[0]['node']:
            try:
                pandaid = workSpec.get_jobspec_list()[0].PandaID
                workSpec.set_work_attributes({pandaid: {'node': actjobs[0]['node']}})
            except:
                tmpLog.warning('Could not extract panda ID for worker {0}'.format(workSpec.batchID))

        retList.append((newStatus, errorMsg))
    return True, retList
def check_stage_out_status(self, jobspec):
    """
    Check (and, when enough files are pooled, trigger) Globus stage-out.

    Files awaiting transfer are grouped under a dummy transfer ID; under a DB
    lock this method submits a real Globus transfer once >= 10 files are
    pooled or the group is older than 10 minutes. For groups with a real
    (valid UUID) transfer ID it polls the Globus task status.
    Returns (True, ''), (False, msg) on failure, or (None, msg) to retry.
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='check_stage_out_status')
    tmpLog.debug('start')
    # show the dummy transfer id and set to a value with the PandaID if needed.
    tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id))
    if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX'):
        old_dummy_transfer_id = self.dummy_transfer_id
        self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, jobspec.PandaID)
        tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id, self.dummy_transfer_id))
    # default return
    tmpRetVal = (True, '')
    # set flag if have db lock
    have_db_lock = False
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get the scope of the log files
    outfileattrib = jobspec.get_output_file_attributes()
    scopeLog = 'xxxx'
    for key in outfileattrib.keys():
        if "log.tgz" in key:
            scopeLog = outfileattrib[key]['scope']
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    for dummy_transferID in groups:
        # skip if valid transfer ID not dummy one
        if validate_transferid(dummy_transferID):
            continue
        # lock for 120 sec
        tmpLog.debug('attempt to set DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
        have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120)
        if not have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)')
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        tmpLog.debug('After db refresh call groups=jobspec.get_groups_of_output_files()')
        groups = jobspec.get_groups_of_output_files()
        tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups))
        # the dummy transfer ID is still there
        if dummy_transferID in groups:
            groupUpdateTime = groups[dummy_transferID]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transferID)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'dummy_transferID = {0} number of files = {1}'.format(dummy_transferID, len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
                    groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                self.srcEndpoint = queueConfig.stager['srcEndpoint']
                self.Globus_srcPath = self.basePath
                self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                self.dstEndpoint = queueConfig.stager['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        self.have_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None, errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = None
                    tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="checksum")
                except:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                ifile = 0
                for fileSpec in fileSpecs:
                    logfile = False
                    scope = 'panda'
                    if fileSpec.scope is not None:
                        scope = fileSpec.scope
                    # for Yoda job set the scope to transient for non log files
                    if self.Yodajob:
                        scope = 'transient'
                    if fileSpec.fileType == "log":
                        logfile = True
                        scope = scopeLog
                    # only print to log file first 25 files
                    if ifile < 25:
                        msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    if ifile == 25:
                        msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope)
                        tmpLog.debug(msgStr)
                    # rucio-style destination path: <scope>/<md5[0:2]>/<md5[2:4]>/<lfn>
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    srcURL = fileSpec.path
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                              scope=correctedscope,
                                                                              hash1=hash_hex[0:2],
                                                                              hash2=hash_hex[2:4],
                                                                              lfn=fileSpec.lfn)
                    if logfile:
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    if ifile < 25:
                        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    if os.access(srcURL, os.R_OK):
                        if ifile < 25:
                            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
                        tdata.add_item(srcURL, dstURL)
                    else:
                        errMsg = "source file {} does not exist".format(srcURL)
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                        tmpLog.error(errMsg)
                        tmpRetVal = (False, errMsg)
                        return tmpRetVal
                    ifile += 1
                # submit transfer
                tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA'])))
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                        release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                        if not release_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(dummy_transferID)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                    release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
                    if not release_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID)
                    tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
            # return None to retry later
            return None, msgStr
        # release the db lock if needed
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
            release_db_lock = self.dbInterface.release_object_lock(dummy_transferID)
            if release_db_lock:
                tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id, dummy_transferID))
                have_db_lock = False
            else:
                msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID)
                tmpLog.error(msgStr)
                return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    tmpLog.debug("groups = jobspec.get_groups_of_output_files()")
    groups = jobspec.get_groups_of_output_files()
    tmpLog.debug('Number of transfer groups - {0}'.format(len(groups)))
    tmpLog.debug('transfer groups any state - {0}'.format(groups))
    if len(groups) == 0:
        tmpLog.debug("jobspec.get_groups_of_output_files(skip_done=True) returned no files ")
        tmpLog.debug("check_stage_out_status return status - True ")
        return True, ''
    for transferID in groups:
        # allow only valid UUID
        if validate_transferid(transferID):
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, self.tc, transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc), str(transferID))
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                if self.changeFileStatusOnSuccess:
                    self.set_FileSpec_status(jobspec, 'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec, 'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID, transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
    # end of loop over transfer groups
    tmpLog.debug('End of loop over transfers groups - ending check_stage_out_status function')
    return None, 'no valid transfer id found'
def trigger_preparation(self, jobspec):
    """
    Trigger stage-in of the job's input files via a Globus transfer.

    Skips submission if a transfer with the same label is already queued;
    otherwise builds source/destination paths for every input file and
    submits one TransferData request, recording the returned task id as the
    file group. Returns (True, ''), (False, msg), or the values produced by
    globus_utils.handle_globus_exception on error.
    """
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='trigger_preparation')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
    self.srcEndpoint = queueConfig.preparator['srcEndpoint']
    self.Globus_dstPath = self.basePath
    #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
    self.dstEndpoint = queueConfig.preparator['dstEndpoint']
    # get input files
    files = []
    lfns = []
    inFiles = jobspec.get_input_file_attributes(skip_ready=True)
    for inLFN, inFile in iteritems(inFiles):
        # set path to each file
        inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN)
        dstpath = inFile['path']
        # check if path exists if not create it.
        if not os.access(self.basePath, os.F_OK):
            os.makedirs(self.basePath)
        # create the file paths for the Globus source and destination endpoints
        Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile['scope'], inLFN)
        Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile['scope'], inLFN)
        files.append({'scope': inFile['scope'],
                      'name': inLFN,
                      'Globus_dstPath': Globus_dstpath,
                      'Globus_srcPath': Globus_srcpath})
        lfns.append(inLFN)
    tmpLog.debug('files[] {0}'.format(files))
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errStr = ''
            if not tmpStatsrc:
                errStr += ' source Endpoint not activated '
            if not tmpStatdst:
                errStr += ' destination Endpoint not activated '
            tmpLog.error(errStr)
            return False, errStr
        # both endpoints activated now prepare to transfer data
        if len(files) > 0:
            tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, label=label, sync_level="checksum")
            # loop over all input files and add
            for myfile in files:
                tdata.add_item(myfile['Globus_srcPath'], myfile['Globus_dstPath'])
            # submit
            transfer_result = self.tc.submit_transfer(tdata)
            # check status code and message
            tmpLog.debug(str(transfer_result))
            if transfer_result['code'] == "Accepted":
                # succeeded
                # set transfer ID which are used for later lookup
                transferID = transfer_result['task_id']
                jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
                tmpLog.debug('done')
                return True, ''
            else:
                return False, transfer_result['message']
        # if no files to transfer return True
        return True, 'No files to transfer'
    except:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        # NOTE(review): errMsg is computed but a dict is returned as the
        # message here ({} instead of errMsg) -- looks unintended; confirm
        # against callers before changing.
        return errStat, {}
def __init__(self, **kwarg):
    """Set up the SAGA-based plugin: base attributes, helper factories and a logger."""
    PluginBase.__init__(self, **kwarg)
    # helpers for resolving queue configurations and worker-maker plugins
    self.queue_config_mapper = QueueConfigMapper()
    self.pluginFactory = PluginFactory()
    # announce which SAGA adaptor this instance will drive
    init_log = self.make_logger(baseLogger, method_name='__init__')
    init_log.info("[{0}] SAGA adaptor will be used.".format(self.adaptor))
def qconf_refresh(arguments):
    """Force a reload of the queue configuration, optionally refilling the table."""
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    config_mapper = QueueConfigMapper()
    # bump the reload timestamp, then drop the cached update marker so
    # load_data re-reads the configuration from scratch
    config_mapper._update_last_reload_time()
    config_mapper.lastUpdate = None
    config_mapper.load_data(refill_table=arguments.refill)
def qconf_refresh(arguments):
    """Force a reload of the queue configuration."""
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    mapper = QueueConfigMapper()
    # drop the cached update marker so load_data re-reads the configuration
    mapper.lastUpdate = None
    mapper.load_data()
def check_status(self, jobspec):
    """Check - and, for pooled dummy groups, submit - Globus transfers of a job's input files.

    Input files are first parked under a shared dummy transfer ID. Once at
    least 10 files are pooled, or the dummy group is older than 10 minutes,
    a real bulk Globus transfer is submitted and the files are re-grouped
    under the returned task ID. Access to the dummy group is serialized via
    a DB object lock that must be released on every exit path.

    :param jobspec: job specification whose input-file transfers are checked
    :return: (True, '') when a transfer task succeeded, (False, msg) on a
             hard failure, (None, msg) for transient states to retry later
    """
    # make logger
    tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='check_status')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # set transferID to None
    transferID = None
    # get transfer groups
    groups = jobspec.get_groups_of_input_files(skip_ready=True)
    tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups))
    # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
    if self.dummy_transfer_id in groups:
        # lock for 120 sec
        if not self.have_db_lock:
            tmpLog.debug('attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.get_object_lock(self.dummy_transfer_id, lock_interval=120)
        if not self.have_db_lock:
            # escape since locked by another thread
            msgStr = 'escape since locked by another thread'
            tmpLog.debug(msgStr)
            return None, msgStr
        # refresh group information since that could have been updated by another thread before getting the lock
        self.dbInterface.refresh_file_group_info(jobspec)
        # get transfer groups again with refreshed info
        groups = jobspec.get_groups_of_input_files(skip_ready=True)
        # the dummy transfer ID is still there
        if self.dummy_transfer_id in groups:
            groupUpdateTime = groups[self.dummy_transfer_id]['groupUpdateTime']
            # get files with the dummy transfer ID across jobs
            fileSpecs = self.dbInterface.get_files_with_group_id(self.dummy_transfer_id)
            # submit transfer if there are more than 10 files or the group was made before more than 10 min
            msgStr = 'self.dummy_transfer_id = {0} number of files = {1}'.format(self.dummy_transfer_id,len(fileSpecs))
            tmpLog.debug(msgStr)
            if len(fileSpecs) >= 10 or \
                    groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                tmpLog.debug('prepare to transfer files')
                # submit transfer and get a real transfer ID
                # set the Globus destination Endpoint id and path will get them from Agis eventually
                from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
                queueConfigMapper = QueueConfigMapper()
                queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
                self.Globus_srcPath = queueConfig.preparator['Globus_srcPath']
                self.srcEndpoint = queueConfig.preparator['srcEndpoint']
                self.Globus_dstPath = self.basePath
                #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath']
                self.dstEndpoint = queueConfig.preparator['dstEndpoint']
                # Test the endpoints and create the transfer data class
                errMsg = None
                try:
                    # Test endpoints for activation
                    tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint)
                    tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint)
                    if tmpStatsrc and tmpStatdst:
                        errStr = 'source Endpoint and destination Endpoint activated'
                        tmpLog.debug(errStr)
                    else:
                        errMsg = ''
                        if not tmpStatsrc:
                            errMsg += ' source Endpoint not activated '
                        if not tmpStatdst:
                            errMsg += ' destination Endpoint not activated '
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                        tmpRetVal = (None,errMsg)
                        return tmpRetVal
                    # both endpoints activated now prepare to transfer data
                    tdata = TransferData(self.tc,
                                         self.srcEndpoint,
                                         self.dstEndpoint,
                                         sync_level="checksum")
                # NOTE(review): bare "except:" also catches SystemExit/KeyboardInterrupt
                except:
                    errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                    tmpLog.error(errMsg)
                    tmpRetVal = (errStat, errMsg)
                    return tmpRetVal
                # loop over all files
                for fileSpec in fileSpecs:
                    # NOTE(review): loop-invariant call re-evaluated per file
                    attrs = jobspec.get_input_file_attributes()
                    msgStr = "len(jobSpec.get_input_file_attributes()) = {0} type - {1}".format(len(attrs),type(attrs))
                    tmpLog.debug(msgStr)
                    # NOTE(review): .iteritems() is Python-2-only
                    for key, value in attrs.iteritems():
                        msgStr = "input file attributes - {0} {1}".format(key,value)
                        tmpLog.debug(msgStr)
                    msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
                    tmpLog.debug(msgStr)
                    # build rucio-style deterministic path from md5 of "scope:lfn"
                    # NOTE(review): local name "hash" shadows the builtin
                    scope = fileSpec.scope
                    hash = hashlib.md5()
                    hash.update('%s:%s' % (scope, fileSpec.lfn))
                    hash_hex = hash.hexdigest()
                    correctedscope = "/".join(scope.split('.'))
                    #srcURL = fileSpec.path
                    srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_srcPath,
                                                                               scope=correctedscope,
                                                                               hash1=hash_hex[0:2],
                                                                               hash2=hash_hex[2:4],
                                                                               lfn=fileSpec.lfn)
                    dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                               scope=correctedscope,
                                                                               hash1=hash_hex[0:2],
                                                                               hash2=hash_hex[2:4],
                                                                               lfn=fileSpec.lfn)
                    tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
                    # add files to transfer object - tdata
                    tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL))
                    tdata.add_item(srcURL,dstURL)
                # submit transfer
                try:
                    transfer_result = self.tc.submit_transfer(tdata)
                    # check status code and message
                    tmpLog.debug(str(transfer_result))
                    if transfer_result['code'] == "Accepted":
                        # succeeded
                        # set transfer ID which are used for later lookup
                        transferID = transfer_result['task_id']
                        tmpLog.debug('successfully submitted id={0}'.format(transferID))
                        # set status for files
                        self.dbInterface.set_file_group(fileSpecs, transferID, 'running')
                        msgStr = 'submitted transfer with ID={0}'.format(transferID)
                        tmpLog.debug(msgStr)
                    else:
                        # release process lock
                        tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                        self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                        if not self.have_db_lock:
                            errMsg = 'Could not release DB lock for {}'.format(self.dummy_transfer_id)
                            tmpLog.error(errMsg)
                        tmpRetVal = (None, transfer_result['message'])
                        return tmpRetVal
                except Exception as e:
                    errStat,errMsg = globus_utils.handle_globus_exception(tmpLog)
                    # release process lock
                    tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
                    self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
                    if not self.have_db_lock:
                        errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                        tmpLog.error(errMsg)
                    return errStat, errMsg
            else:
                msgStr = 'wait until enough files are pooled'
                tmpLog.debug(msgStr)
            # release the lock
            tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'.format(self.id,self.dummy_transfer_id))
            self.have_db_lock = self.dbInterface.release_object_lock(self.dummy_transfer_id)
            if not self.have_db_lock:
                msgStr += ' - Could not release DB lock for {}'.format(self.dummy_transfer_id)
                tmpLog.error(msgStr)
            # return None to retry later (even after a successful submission,
            # so the next cycle checks the real transfer ID)
            return None, msgStr
    # check transfer with real transfer IDs
    # get transfer groups
    groups = jobspec.get_groups_of_input_files(skip_ready=True)
    for transferID in groups:
        if transferID != self.dummy_transfer_id:
            # get transfer task
            tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID)
            # return a temporary error when failed to get task
            if not tmpStat:
                errStr = 'failed to get transfer task'
                tmpLog.error(errStr)
                return None, errStr
            # return a temporary error when task is missing
            if transferID not in transferTasks:
                errStr = 'transfer task ID - {} is missing'.format(transferID)
                tmpLog.error(errStr)
                return None, errStr
            # succeeded in finding a transfer task by tranferID
            if transferTasks[transferID]['status'] == 'SUCCEEDED':
                tmpLog.debug('transfer task {} succeeded'.format(transferID))
                self.set_FileSpec_status(jobspec,'finished')
                return True, ''
            # failed
            if transferTasks[transferID]['status'] == 'FAILED':
                errStr = 'transfer task {} failed'.format(transferID)
                tmpLog.error(errStr)
                self.set_FileSpec_status(jobspec,'failed')
                return False, errStr
            # another status
            tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status'])
            tmpLog.debug(tmpStr)
            return None, ''
def submit_workers(self, workspec_list):
    """Submit a list of workers to HTCondor.

    Pipeline: (1) prepare per-worker submission data in a thread pool
    (_handle_one_worker), (2) submit each prepared worker via the
    module-level submit_a_worker helper, (3) propagate the attributes
    returned by submission back onto each workspec.

    :param workspec_list: list of WorkSpec objects to submit
    :return: list of per-worker (status, message) results
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')
    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))
    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    try:
        os.mkdir(log_subdir_path)
    except OSError as e:
        # already-existing directory is fine; anything else is fatal
        if e.errno != errno.EEXIST:
            raise
        else:
            pass
    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    def _handle_one_worker(workspec):
        # Build the submission data dict for a single worker: choose a CE
        # (ATLAS Grid CE mode), read the SDF template, and derive log URLs.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
        is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                           or this_panda_queue_dict.get('capability', '') == 'ucore'
        ce_info_dict = dict()
        batch_log_dict = dict()
        special_par = ''
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                # keep only ACTIVE CEs of a supported flavour
                if not (_queue_dict.get('ce_endpoint')
                        and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                # prefer a named queue over the 'default' entry for the same endpoint
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            # NOTE(review): n_qualified_ce is never used afterwards
            n_qualified_ce = len(ce_auxilary_dict)
            queue_status_dict = self.dbInterface.get_queue_status(self.queueName)
            worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(self.queueName)
            ce_weight_dict = _get_ce_weight_dict(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                                 queue_status_dict=queue_status_dict,
                                                 worker_ce_stats_dict=worker_ce_stats_dict)
            # good CEs which can be submitted to, duplicate by weight
            good_ce_weighted_list = []
            for _ce_endpoint in ce_auxilary_dict.keys():
                good_ce_weighted_list.extend([_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
            tmpLog.debug('queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'.format(
                            queue_status_dict, worker_ce_stats_dict, ce_weight_dict))
            # weighted random choice of CE; fall back to any known CE
            if len(good_ce_weighted_list) > 0:
                ce_info_dict = ce_auxilary_dict[random.choice(good_ce_weighted_list)].copy()
            else:
                tmpLog.info('No good CE endpoint left. Choose an arbitrary CE endpoint')
                ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
            ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
            ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
            # NOTE(review): ce_version_str is never used afterwards
            ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
            # strip the ":port" suffix to get the bare hostname
            ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
            tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
            # pick the flavour-specific SDF template when a template dir is configured
            if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
        # template for batch script
        tmpFile = open(self.templateFile)
        sdf_template = tmpFile.read()
        tmpFile.close()
        # get batch_log, stdout, stderr filename from the template
        # NOTE(review): batch_log_value/stdout_value/stderr_value stay unbound
        # if the template lacks log/output/error lines - TODO confirm templates
        for _line in sdf_template.split('\n'):
            if _line.startswith('#'):
                continue
            _match_batch_log = re.match('log = (.+)', _line)
            _match_stdout = re.match('output = (.+)', _line)
            _match_stderr = re.match('error = (.+)', _line)
            if _match_batch_log:
                batch_log_value = _match_batch_log.group(1)
                continue
            if _match_stdout:
                stdout_value = _match_stdout.group(1)
                continue
            if _match_stderr:
                stderr_value = _match_stderr.group(1)
                continue
        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue
        # URLs for log files
        if not (self.logBaseURL is None):
            if workspec.batchID:
                batchID = workspec.batchID
                guess = False
            else:
                batchID = ''
                guess = True
            batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
            batch_log = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, batch_log_filename)
            batch_stdout = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stdout_path_file_name)
            batch_stderr = '{0}/{1}/{2}'.format(self.logBaseURL, log_subdir, stderr_path_filename)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            batch_log_dict['batch_log'] = batch_log
            batch_log_dict['batch_stdout'] = batch_stdout
            batch_log_dict['batch_stderr'] = batch_stderr
            batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
            tmpLog.debug('Done set_log_file before submission')
        tmpLog.debug('Done jobspec attribute setting')
        # set data dict consumed by submit_a_worker
        data = {'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'log_subdir': log_subdir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
                'harvester_queue_config': harvester_queue_config,
                'is_unified_queue': is_unified_queue,
                'condor_schedd': self.condorSchedd,
                'condor_pool': self.condorPool,
                }
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy the attribute dict returned by submission onto the workspec.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')
    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))
    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))
    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)
    tmpLog.debug('done')
    return retList
def submit_k8s_worker(self, work_spec):
    """Submit a single worker as a Kubernetes job.

    Reads the job yaml template, resolves container image / executable /
    proxy, and asks the k8s client to create the job.

    :param work_spec: WorkSpec of the worker to submit
    :return: (status, error_string) tuple; (True, '') on success
    """
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))

    # TODO: consider if we want to upload the yaml file to PanDA cache
    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = panda_queues_dict.get(self.queueName)['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # submit the worker
        # NOTE(review): rsp is unused; batchID below is taken from the input
        # yaml_content rather than yaml_content_final returned here - confirm
        # the job name is not modified during creation
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, container_image,
            executable, args, cert, cert_in_secret=use_secret,
            cpu_adjust_ratio=self.cpuAdjustRatio,
            memory_adjust_ratio=self.memoryAdjustRatio, max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        # remember the k8s job name as the batch ID
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')
    return tmp_return_value
def main():
    """Deploy harvester onto a remote machine over SSH/SFTP.

    Downloads harvester and all of its pip dependencies locally, copies
    the wheels/tarballs to a remote build directory, and pip-installs them
    there with --no-index. Connection parameters come from the middleware
    section of the queue configuration.
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument('--remoteDir', action='store', dest='remoteDir', default='harvester',
                        help='directory on the remote target machine where harvester is installed')
    parser.add_argument('--remoteBuildDir', action='store', dest='remoteBuildDir', default='harvester_build',
                        help='directory on the remote target machine where harvester is build')
    parser.add_argument('--remotePythonSetup', action='store', dest='remotePythonSetup', default='',
                        help='python setup on remote target machine')
    parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True,
                        help='the name of queue where harvester is installed')
    parser.add_argument('--middleware', action='store', dest='middleware', default='rpc',
                        help='middleware to access the remote target machine')
    options = parser.parse_args()
    # remove ~/ which doesn't work with sftp
    options.remoteDir = re.sub('^~/', '', options.remoteDir)
    options.remoteBuildDir = re.sub('^~/', '', options.remoteBuildDir)
    # get queue
    qcm = QueueConfigMapper()
    qcm.load_data()
    queueConfig = qcm.get_queue(options.queueName)
    if queueConfig is None:
        print('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName))
        sys.exit(1)
    # get middleware
    if not hasattr(queueConfig, options.middleware):
        print('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware, options.queueName))
        sys.exit(1)
    middleware = getattr(queueConfig, options.middleware)
    # get ssh parameters; optional keys fall back to defaults
    sshHost = middleware['remoteHost']
    try:
        sshPort = middleware['remotePort']
    except Exception:
        sshPort = 22
    sshUserName = middleware['sshUserName']
    try:
        sshPassword = middleware['sshPassword']
    except Exception:
        sshPassword = None
    privateKey = None
    passPhrase = None
    # key-based auth is required when no password is configured
    if sshPassword is None:
        try:
            privateKey = middleware['privateKey']
        except Exception:
            print("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware))
            sys.exit(1)
        try:
            passPhrase = middleware['passPhrase']
        except Exception:
            passPhrase = None
    try:
        jumpHost = middleware['jumpHost']
    except Exception:
        jumpHost = None
    try:
        jumpPort = middleware['jumpPort']
    except Exception:
        jumpPort = 22
    # ssh
    sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword,
                                    passPhrase, privateKey, jumpHost, jumpPort)
    # get remote python version (e.g. "27" or "36"), used for pip download
    # NOTE(review): exec_out[1].read() returns bytes on Python 3 - confirm
    # this tool runs under Python 2 or decode before formatting
    exec_out = sshClient.exec_command(';'.join([
        options.remotePythonSetup,
        """python -c 'import sys;print("{0}{1}".format(*(sys.version_info[:2])))' """
    ]))
    remotePythonVer = exec_out[1].read().rstrip()
    sshClient.close()
    print('remote python version : {0}'.format(remotePythonVer))
    # make tmp dir
    with TemporaryDirectory() as tmpDir:
        harvesterGit = "git+git://github.com/PanDAWMS/panda-harvester.git"
        # get all dependencies; the downloaded files are deleted right away -
        # only the package names from pip's output are kept
        print("getting dependencies")
        p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(tmpDir, harvesterGit),
                             stdout=subprocess.PIPE, shell=True)
        stdout, stderr = p.communicate()
        packages = []
        for line in stdout.split('\n'):
            if line.startswith('Successfully downloaded'):
                packages = line.split()[2:]
        packages.append(harvesterGit)
        packages.append('pip')
        # harvester itself is re-downloaded from git, not from the dependency list
        # NOTE(review): raises ValueError if 'pandaharvester' is not in the list
        packages.remove('pandaharvester')
        # download packages for the remote python version
        print("pip download to {0}".format(tmpDir))
        for package in packages:
            print("getting {0}".format(package))
            ret = subprocess.call("pip download --no-deps --python-version {0} -d {1} {2}".format(remotePythonVer, tmpDir, package),
                                  shell=True)
            if ret != 0:
                print("ERROR: failed to download {0}".format(package))
                sys.exit(1)
        # sftp the downloaded files to a clean remote build dir
        sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword,
                                        passPhrase, privateKey, jumpHost, jumpPort)
        try:
            sshClient.exec_command('rm -rf {0}; mkdir -p {0}'.format(options.remoteBuildDir))
        except Exception:
            pass
        sftp = sshClient.open_sftp()
        for name in os.listdir(tmpDir):
            path = os.path.join(tmpDir, name)
            if os.path.isdir(path):
                continue
            remotePath = os.path.join(options.remoteBuildDir, name)
            print("copy {0} to {1}".format(name, remotePath))
            sftp.put(path, remotePath)
        # install from the copied files only (no index access on the remote side)
        print("install harvester")
        buildDir = options.remoteBuildDir
        if not buildDir.startswith('/'):
            buildDir = '~/' + buildDir
        exec_out = sshClient.exec_command(';'.join([
            options.remotePythonSetup,
            'cd {0}'.format(options.remoteDir),
            'pip install pip pandaharvester --no-index --find-links {0}'.format(buildDir)
        ]))
        print(exec_out[1].read())
        print(exec_out[2].read())
        sshClient.close()
def trigger_stage_out(self, jobspec):
    """Stage out the job's output files by direct filesystem copy.

    Copies each output file from fileSpec.path to an RSE-style destination
    <RSE_dstPath>/<scope>/<md5[0:2]>/<md5[2:4]>/<lfn>, creating destination
    directories (group-writable, setgid) as needed.

    :param jobspec: job specification with output FileSpecs
    :return: (True, '') on success
    :raises Error: when one or more copies failed
    """
    # make logger
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,
                                                                threading.current_thread().ident),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    self.objstoreID = int(queueConfig.stager['objStoreID_ES'])
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # FIX: was "self.RSE_dstpath" (lower-case p) while the copy loop reads
    # self.RSE_dstPath - the configured destination path was never applied
    self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            # FIX: dropped the no-op .format(fileSpec.lfn, fileSpec.scope) -
            # the message has no placeholders
            msgStr = "printed first 25 files skipping the rest"
            tmpLog.debug(msgStr)
        # rucio-style deterministic path from the md5 of "scope:lfn"
        # (renamed from "hash", which shadowed the builtin)
        hash_obj = hashlib.md5()
        hash_obj.update('%s:%s' % (scope, fileSpec.lfn))
        hash_hex = hash_obj.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # copy the source file from source to destination skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # Set the file spec status
            if self.changeFileStatusOnSuccess:
                fileSpec.status = 'finished'
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        # group-writable with setgid so later writers share the group
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # Set the file spec status
                    if self.changeFileStatusOnSuccess:
                        self.set_FileSpec_status(jobspec, 'finished')
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0], error[1], error[2]))
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return True, ''
def trigger_stage_out(self, jobspec):
    """Trigger a bulk Globus transfer of the job's output files.

    Verifies endpoint activation, builds rucio-style destination paths for
    every output file, and submits one labelled transfer task; the task ID
    is recorded on the job's file groups and on each FileSpec.

    :param jobspec: job specification with output files to stage out
    :return: (status, message) - True on success or already queued,
             False on a hard error, the handler's status on Globus errors
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # default return
    tmpRetVal = (True, '')
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # test we have a Globus Transfer Client
    if not self.tc:
        errStr = 'failed to get Globus Transfer Client'
        tmpLog.error(errStr)
        return False, errStr
    # get label
    label = self.make_label(jobspec)
    tmpLog.debug('label={0}'.format(label))
    # get transfer tasks
    tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label)
    if not tmpStat:
        errStr = 'failed to get transfer tasks'
        tmpLog.error(errStr)
        return False, errStr
    # check if already queued
    if label in transferTasks:
        tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label])))
        return True, ''
    # set the Globus destination Endpoint id and path will get them from Agis eventually
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
    self.srcEndpoint = queueConfig.stager['srcEndpoint']
    self.Globus_srcPath = self.basePath
    self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
    self.dstEndpoint = queueConfig.stager['dstEndpoint']
    # Test the endpoints and create the transfer data class
    errMsg = None
    try:
        # Test endpoints for activation
        tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint)
        tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint)
        if tmpStatsrc and tmpStatdst:
            errStr = 'source Endpoint and destination Endpoint activated'
            tmpLog.debug(errStr)
        else:
            errMsg = ''
            if not tmpStatsrc:
                errMsg += ' source Endpoint not activated '
            if not tmpStatdst:
                errMsg += ' destination Endpoint not activated '
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
        # both endpoints activated now prepare to transfer data
        tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint,
                             label=label, sync_level="checksum")
    except Exception:
        # FIX: was a bare "except:" clause
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        tmpRetVal = (errStat, errMsg)
        return tmpRetVal
    # loop over all files
    fileAttrs = jobspec.get_output_file_attributes()
    lfns = []
    for fileSpec in jobspec.outFiles:
        scope = fileAttrs[fileSpec.lfn]['scope']
        # rucio-style deterministic path from the md5 of "scope:lfn"
        # (renamed from "hash", which shadowed the builtin)
        hash_obj = hashlib.md5()
        hash_obj.update('%s:%s' % (scope, fileSpec.lfn))
        hash_hex = hash_obj.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        # add files to transfer object - tdata
        if os.access(srcURL, os.R_OK):
            tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL))
            tdata.add_item(srcURL, dstURL)
            lfns.append(fileSpec.lfn)
        else:
            errMsg = "source file {} does not exist".format(srcURL)
            tmpLog.error(errMsg)
            tmpRetVal = (False, errMsg)
            return tmpRetVal
    # submit transfer
    try:
        transfer_result = self.tc.submit_transfer(tdata)
        # check status code and message
        tmpLog.debug(str(transfer_result))
        if transfer_result['code'] == "Accepted":
            # succeeded - record the transfer ID, used for later lookup
            transferID = transfer_result['task_id']
            tmpLog.debug('successfully submitted id={0}'.format(transferID))
            jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}})
            # set the transfer ID on each output FileSpec
            for fileSpec in jobspec.outFiles:
                if fileSpec.fileAttributes is None:  # FIX: was "== None"
                    fileSpec.fileAttributes = {}
                fileSpec.fileAttributes['transferID'] = transferID
        else:
            tmpRetVal = (False, transfer_result['message'])
    except Exception:
        errStat, errMsg = globus_utils.handle_globus_exception(tmpLog)
        if errMsg is None:
            errtype, errvalue = sys.exc_info()[:2]
            errMsg = "{0} {1}".format(errtype.__name__, errvalue)
        tmpRetVal = (errStat, errMsg)
    # return
    tmpLog.debug('done')
    return tmpRetVal
def submit_workers(self, workspec_list):
    """Submit workers to ARC CEs (aCT-style, jobs always pre-fetched).

    For each job of each worker: resolve the CEs from the cached panda
    queue info, convert the job parameters to XRSL, pick the proxy by
    prodSourceLabel, and submit via _arc_submit.

    :param workspec_list: list of WorkSpec objects with attached jobs
    :return: list of per-job (status, message) results
    """
    retlist = []
    # Get queue info from DB
    pandaqueues = self.dbproxy.get_cache("panda_queues.json", None)
    if pandaqueues is None:
        raise Exception("Failed to get panda queue info from database")
    pandaqueues = pandaqueues.data
    osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None)
    if osmap is None:
        raise Exception("Failed to get Object Store info from database")
    osmap = osmap.data
    for workspec in workspec_list:
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmplog = arclog.log
        # Assume for aCT that jobs are always pre-fetched (no late-binding)
        for jobspec in workspec.get_jobspec_list():
            tmplog.debug("JobSpec: {0}".format(jobspec.values_map()))
            if jobspec.computingSite not in pandaqueues:
                retlist.append((False, "No queue information for {0}".format(jobspec.computingSite)))
                continue
            # Get CEs from panda queue info
            # List of (endpoint, queue) tuples
            arcces = []
            for endpoint in pandaqueues[jobspec.computingSite]['queues']:
                ce_endpoint = endpoint['ce_endpoint']
                if not re.search('://', ce_endpoint):
                    ce_endpoint = 'gsiftp://%s' % ce_endpoint
                ce_queue = endpoint['ce_queue_name']
                arcces.append((ce_endpoint, ce_queue))
            if not arcces:
                # FIX: message was "No CEs defined for %{0}" - the stray '%'
                # produced "...for %SITE" after formatting
                retlist.append((False, "No CEs defined for {0}".format(jobspec.computingSite)))
                continue
            # Set true pilot or not
            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(jobspec.computingSite)
            pandaqueues[jobspec.computingSite]['truepilot'] = 'running' in queueconfig.noHeartbeat
            # Set log URL for GTAG env in job description
            logbaseurl = queueconfig.submitter.get('logBaseURL')
            logsubdir = self._set_logdir(jobspec.computingSite)
            logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None
            tmplog.debug("Converting to ARC XRSL format")
            arcxrsl = ARCParser(jobspec.jobParams,
                                jobspec.computingSite,
                                pandaqueues[jobspec.computingSite],
                                logfileurl,
                                self.schedulerid,
                                osmap,
                                '/tmp',  # tmpdir, TODO common tmp dir
                                None,  #jobSpec.eventranges, # TODO event ranges
                                tmplog)
            arcxrsl.parse()
            xrsl = arcxrsl.getXrsl()
            tmplog.debug("ARC xrsl: {0}".format(xrsl))
            # Set the files to be downloaded at the end of the job
            downloadfiles = 'gmlog/errors'
            if 'logFile' in jobspec.jobParams:
                downloadfiles += ';%s' % jobspec.jobParams['logFile'].replace('.tgz', '')
            if not pandaqueues[jobspec.computingSite]['truepilot']:
                downloadfiles += ';jobSmallFiles.tgz'
            # Set certificate: user jobs run with the pilot proxy, the rest
            # with the production proxy
            userconfig = arc.UserConfig(self.cred_type)
            proxyrole = ''
            if jobspec.jobParams['prodSourceLabel'] == 'user':
                userconfig.ProxyPath(str(self.certs['pilot']))
                proxyrole = 'pilot'
            else:
                userconfig.ProxyPath(str(self.certs['production']))
                proxyrole = 'production'
            tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath()))
            try:
                tmplog.debug("Submission targets: {0}".format(arcces))
                arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog)
                tmplog.info("ARC CE job id {0}".format(arcjob.JobID))
                arc_utils.arcjob2workspec(arcjob, workspec)
                workspec.workAttributes['arcdownloadfiles'] = downloadfiles
                workspec.workAttributes['proxyrole'] = proxyrole
                workspec.workAttributes['logsubdir'] = logsubdir
                workspec.batchID = arcjob.JobID
                tmplog.debug(workspec.workAttributes)
                result = (True, '')
            except Exception as exc:
                tmplog.error(traceback.format_exc())
                result = (False, "Failed to submit ARC job: {0}".format(str(exc)))
            retlist.append(result)
    return retlist
def __init__(self, **kwarg):
    """Construct the stager plugin.

    All keyword arguments are forwarded to the BaseStager constructor;
    a QueueConfigMapper is kept for later per-queue configuration lookups.
    """
    BaseStager.__init__(self, **kwarg)
    self.queue_config_mapper = QueueConfigMapper()
def check_status(self, jobspec):
    """Check the second-hop (Rucio) transfer status of a job's output files.

    Delegates first to GlobusBulkStager.check_status for the first hop.
    If that succeeded, each transfer group of the job is inspected:
    groups already 'hopped' are left alone, 'hopping' groups have their
    Rucio replication rule polled, and groups without a rule yet get a
    transient dataset plus a replication rule created and are marked
    'hopping'.

    :param jobspec: job specification whose output file groups are checked
    :return: (True, msg) when all transfers are done,
             (False, msg) on a fatal transfer failure,
             (None, msg) for temporary errors or transfers still running
    """
    # make logger
    tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='check_status')
    tmpLog.debug('executing base check_status')
    tmpStat, tmpMsg = GlobusBulkStager.check_status(self, jobspec)
    tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg))
    if tmpStat is not True:
        # first hop not finished (or failed) - nothing more to do yet
        return tmpStat, tmpMsg
    # get transfer groups
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        return tmpStat, tmpMsg
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if srcRSE is set
    # FIX: srcRSE must be pre-initialized, otherwise the "srcRSE is None"
    # test below raised NameError when the queue config had no srcRSE key
    srcRSE = None
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    # get destination endpoint - first 'pr' storage of the job's nucleus site
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # log source and destination RSEs (duplicate debug line removed)
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # test that srcRSE and dstRSE are defined
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Destination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # set the location of the files in fileSpec.objstoreID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    # NOTE(review): self.Yodajob is assumed to be initialized elsewhere
    # (e.g. in __init__) - confirm
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # create the Rucio Client
    try:
        rucioAPI = RucioClient()
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        # FIX: the original message formatted datasetScope/datasetName which
        # are defined only inside the loop below, so this path raised NameError
        tmpMsg = 'failed to create Rucio Client'
        return None, tmpMsg
    # loop over all transfers
    tmpStat = True
    tmpMsg = ''
    for transferID in groups:
        if transferID is None:
            continue
        datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, transferID)
        datasetScope = 'transient'
        # lock so only one thread works on this transfer group
        have_db_lock = self.dbInterface.get_object_lock(transferID, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(transferID)
            tmpLog.debug(msgStr)
            return None, msgStr
        # get transfer status
        groupStatus = self.dbInterface.get_file_group_status(transferID)
        if 'hopped' in groupStatus:
            # already succeeded
            pass
        elif 'failed' in groupStatus:
            # transfer failure
            tmpStat = False
            tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName)
        elif 'hopping' in groupStatus:
            # check rucio rule
            ruleStatus = 'FAILED'
            try:
                tmpLog.debug('check state for {0}:{1}'.format(datasetScope, datasetName))
                for ruleInfo in rucioAPI.list_did_rules(datasetScope, datasetName):
                    if ruleInfo['rse_expression'] != dstRSE:
                        continue
                    ruleStatus = ruleInfo['state']
                    tmpLog.debug('got state={0}'.format(ruleStatus))
                    if ruleStatus == 'OK':
                        break
            except DataIdentifierNotFound:
                tmpLog.error('dataset not found')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                ruleStatus = None
            if ruleStatus in ['FAILED', 'CANCELED']:
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(datasetScope, datasetName, ruleStatus)
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'failed')
            elif ruleStatus == 'OK':
                # update successful file group status
                self.dbInterface.update_file_group_status(transferID, 'hopped')
            else:
                # replicating or temporary error
                tmpStat = None
                tmpMsg = 'replicating or temporary error for {0}:{1}'.format(datasetScope, datasetName)
        else:
            # no rule yet - make rucio rule
            fileSpecs = self.dbInterface.get_files_with_group_id(transferID)
            fileList = []
            for fileSpec in fileSpecs:
                tmpFile = dict()
                tmpFile['scope'] = datasetScope
                tmpFile['name'] = fileSpec.lfn
                tmpFile['bytes'] = fileSpec.fsize
                tmpFile['adler32'] = fileSpec.chksum
                if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                    tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                else:
                    tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                fileList.append(tmpFile)
                # fall back to the file's object store for the source RSE
                if srcRSE is None and fileSpec.objstoreID is not None:
                    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                    srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
            try:
                # register dataset
                tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                             .format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
                try:
                    rucioAPI.add_dataset(datasetScope, datasetName,
                                         meta={'hidden': True},
                                         lifetime=30 * 24 * 60 * 60,
                                         rse=srcRSE
                                         )
                except DataIdentifierAlreadyExists:
                    # ignore even if the dataset already exists
                    pass
                except Exception:
                    errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, datasetName, srcRSE)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.error(errMsg)
                    raise
                # add files to dataset, 500 files at a time
                numfiles = len(fileList)
                maxfiles = 500
                # FIX: floor division; '/' yields a float under Python 3 and
                # range(float) raises TypeError
                numslices = numfiles // maxfiles
                if (numfiles % maxfiles) > 0:
                    numslices = numslices + 1
                start = 0
                for i in range(numslices):
                    try:
                        stop = start + maxfiles
                        if stop > numfiles:
                            stop = numfiles
                        rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                         'name': datasetName,
                                                         'dids': fileList[start:stop],
                                                         'rse': srcRSE}],
                                                       ignore_duplicate=True)
                        start = stop
                    except FileAlreadyExists:
                        # ignore if files already exist
                        pass
                    except Exception:
                        errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, datasetName, srcRSE, fileList)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.error(errMsg)
                        return None, errMsg
                # add replication rule to the destination RSE
                try:
                    tmpDID = dict()
                    tmpDID['scope'] = datasetScope
                    tmpDID['name'] = datasetName
                    tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                           lifetime=30 * 24 * 60 * 60)
                    ruleIDs = tmpRet[0]
                    tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, str(ruleIDs)))
                except DuplicateRule:
                    # ignore duplicated rule
                    tmpLog.debug('rule is already available')
                except Exception:
                    errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
                    core_utils.dump_error_message(tmpLog)
                    tmpLog.debug(errMsg)
                    return None, errMsg
                # update file group status
                self.dbInterface.update_file_group_status(transferID, 'hopping')
            except Exception:
                core_utils.dump_error_message(tmpLog)
                # treat as a temporary error
                tmpStat = None
                tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        # release lock
        self.dbInterface.release_object_lock(transferID)
        # escape if already failed
        if tmpStat is False:
            break
    # all done
    if tmpStat is True:
        self.set_FileSpec_status(jobspec, 'finished')
    tmpLog.debug('done with {0} : {1}'.format(tmpStat, tmpMsg))
    return tmpStat, tmpMsg
def __init__(self, **kwarg):
    """Construct the plugin.

    The default logBaseURL is assigned before PluginBase.__init__ runs, so
    that a value coming in through the plugin configuration can replace it.
    # NOTE(review): default-before-init override pattern assumed - confirm
    """
    self.logBaseURL = 'http://localhost/test'
    PluginBase.__init__(self, **kwarg)
    self.queue_config_mapper = QueueConfigMapper()
timeout=self.__worker_update_timeout) tmp_log.debug('worker update for {0} ended with {1} {2}'.format( batch_id, r.status_code, r.text)) end_time = time.time() tmp_log.debug('done (took {0})'.format(end_time - start_time)) except: tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) if __name__ == "__main__": """ Quick tests """ from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper queue_config_mapper = QueueConfigMapper() apfmon = Apfmon(queue_config_mapper) apfmon.create_factory() apfmon.create_labels() worker_a = WorkSpec() worker_a.batchID = 1 worker_a.computingSite = 'CERN-PROD-DEV_UCORE' worker_a.computingElement = 'bla1' worker_a.workAttributes = { "batchLog": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log", "stdErr": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err", "stdOut":
def trigger_stage_out(self, jobspec):
    """Trigger stage-out of a job's output files via local copy + Rucio rule.

    Each output file is copied from fileSpec.path into a destination path
    built from the Rucio deterministic (md5 of scope:lfn) layout under the
    configured RSE_dstPath.  The copied files are collected into a new
    transient dataset and a replication rule to the destination RSE
    (derived from the job's nucleus) is created; the rule ID becomes the
    file group ID tracked in the harvester DB.

    :param jobspec: job specification with output FileSpecs to stage out
    :return: (True, msg) on success, (None, msg) on temporary error;
             raises on local copy errors
    """
    # make logger
    tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID, threading.current_thread().ident),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    # initialize some values
    tmpStat = None
    tmpMsg = ''
    srcRSE = None
    dstRSE = None
    datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, str(uuid.uuid4()))
    datasetScope = 'transient'
    # check that jobspec.computingSite is defined
    if jobspec.computingSite is None:
        # not found
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    else:
        tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    # write to debug log queueConfig.stager
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # check queueConfig stager section to see if jobtype is set
    if 'jobtype' in queueConfig.stager:
        if queueConfig.stager['jobtype'] == "Yoda":
            self.Yodajob = True
    # get destination endpoint - first 'pr' storage of the job's nucleus site
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    # NOTE(review): self.Yodajob is assumed to be initialized elsewhere
    # (e.g. in __init__) - confirm
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # FIX: the destination path was stored in self.RSE_dstpath but read back
    # below as self.RSE_dstPath (AttributeError unless set elsewhere); use one
    # consistent name and keep the old lowercase alias for compatibility
    self.RSE_dstPath = queueConfig.stager['RSE_dstPath']
    self.RSE_dstpath = self.RSE_dstPath
    # check queueConfig stager section to see if srcRSE is set
    if 'srcRSE' in queueConfig.stager:
        srcRSE = queueConfig.stager['srcRSE']
    else:
        tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file')
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    # loop over the output files and copy the files
    ifile = 0
    errors = []
    fileList = []
    lfns = []
    fileSpec_list = jobspec.get_output_file_specs(skip_done=False)
    msgStr = '#(jobspec.get_output_file_specs(skip_done=False)) = {0}'\
        .format(len(fileSpec_list))
    tmpLog.debug(msgStr)
    # first pass - informational logging only
    for fileSpec in fileSpec_list:
        msgstr = 'fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}'\
            .format(datasetScope, fileSpec.lfn, fileSpec.fsize, fileSpec.chksum)
        if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
            msgstr += ' guid - {0}'.format(fileSpec.fileAttributes['guid'])
        tmpLog.debug(msgstr)
    #for fileSpec in jobspec.get_output_file_specs(skip_done=True):
    for fileSpec in jobspec.get_output_file_specs(skip_done=False):
        scope = 'panda'
        if fileSpec.scope is not None:
            scope = fileSpec.scope
        # for Yoda job set the scope to transient
        if self.Yodajob:
            scope = 'transient'
        # only print to log file first 25 files
        if ifile < 25:
            msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope)
            tmpLog.debug(msgStr)
        if ifile == 25:
            # FIX: dropped a no-op .format(...) on a placeholder-free literal
            msgStr = "printed first 25 files skipping the rest"
            tmpLog.debug(msgStr)
        # deterministic (rucio-style) destination path from md5 of scope:lfn
        hasher = hashlib.md5()
        # FIX: hashlib requires bytes under Python 3 - encode explicitly
        # (also renamed to avoid shadowing the builtin 'hash')
        hasher.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
        hash_hex = hasher.hexdigest()
        correctedscope = "/".join(scope.split('.'))
        srcURL = fileSpec.path
        dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath,
                                                                   scope=correctedscope,
                                                                   hash1=hash_hex[0:2],
                                                                   hash2=hash_hex[2:4],
                                                                   lfn=fileSpec.lfn)
        if ifile < 25:
            tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL))
        tmpFile = dict()
        # copy the source file from source to destination, skip over if file already exists
        if os.path.exists(dstURL):
            tmpLog.debug('Already copied file {0}'.format(dstURL))
            # save for adding to rucio dataset
            tmpFile['scope'] = datasetScope
            tmpFile['name'] = fileSpec.lfn
            tmpFile['bytes'] = fileSpec.fsize
            tmpFile['adler32'] = fileSpec.chksum
            if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
            else:
                tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
            tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
            fileList.append(tmpFile)
            lfns.append(fileSpec.lfn)
            # get source RSE if not already set
            if srcRSE is None and fileSpec.objstoreID is not None:
                ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
        else:
            if os.path.exists(srcURL):
                # check if destination directory exists if not create it
                dstDIR = os.path.dirname(dstURL)
                try:
                    if not os.path.exists(dstDIR):
                        os.makedirs(dstDIR)
                        mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
                        mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID
                        os.chmod(dstDIR, mode)
                    # copy the source file to destination file
                    shutil.copy2(srcURL, dstURL)
                    # save for adding to rucio dataset
                    tmpFile['scope'] = datasetScope
                    tmpFile['name'] = fileSpec.lfn
                    tmpFile['bytes'] = fileSpec.fsize
                    tmpFile['adler32'] = fileSpec.chksum
                    if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                        tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']}
                    else:
                        tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn))
                    tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn))
                    fileList.append(tmpFile)
                    lfns.append(fileSpec.lfn)
                    # get source RSE if not already set
                    if srcRSE is None and fileSpec.objstoreID is not None:
                        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
                        srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0]
                        tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE))
                except (IOError, os.error) as why:
                    errors.append((srcURL, dstURL, str(why)))
            else:
                errors.append((srcURL, dstURL, 'Source file missing'))
        ifile += 1
    # test that srcRSE and dstRSE are defined
    tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
    errStr = ''
    if srcRSE is None:
        errStr = 'Source RSE is not defined '
    if dstRSE is None:
        errStr = errStr + ' Destination RSE is not defined'
    if (srcRSE is None) or (dstRSE is None):
        tmpLog.error(errStr)
        return None, errStr
    # test to see if there are any files to add to dataset
    if len(fileList) == 0:
        errStr = 'There are no files to add to database'
        tmpLog.error(errStr)
        return None, errStr
    # print out the file list
    tmpLog.debug('fileList - {0}'.format(fileList))
    # create the dataset, add files to it and create a transfer rule
    try:
        # register dataset
        rucioAPI = RucioClient()
        tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                     .format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60)))
        try:
            rucioAPI.add_dataset(datasetScope, datasetName,
                                 meta={'hidden': True},
                                 lifetime=30 * 24 * 60 * 60,
                                 rse=srcRSE
                                 )
        except DataIdentifierAlreadyExists:
            # ignore even if the dataset already exists
            pass
        except Exception:
            errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, datasetName, srcRSE)
            core_utils.dump_error_message(tmpLog)
            tmpLog.error(errMsg)
            return None, errMsg
        # add files to dataset, 500 files at a time
        numfiles = len(fileList)
        maxfiles = 500
        # FIX: floor division; '/' yields a float under Python 3 and
        # range(float) raises TypeError
        numslices = numfiles // maxfiles
        if (numfiles % maxfiles) > 0:
            numslices = numslices + 1
        start = 0
        for i in range(numslices):
            try:
                stop = start + maxfiles
                if stop > numfiles:
                    stop = numfiles
                rucioAPI.add_files_to_datasets([{'scope': datasetScope,
                                                 'name': datasetName,
                                                 'dids': fileList[start:stop],
                                                 'rse': srcRSE}],
                                               ignore_duplicate=True)
                start = stop
            except FileAlreadyExists:
                # ignore if files already exist
                pass
            except Exception:
                errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, datasetName, srcRSE, fileList)
                core_utils.dump_error_message(tmpLog)
                tmpLog.error(errMsg)
                return None, errMsg
        # add rule
        ruleIDs = None
        try:
            tmpDID = dict()
            tmpDID['scope'] = datasetScope
            tmpDID['name'] = datasetName
            tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                   lifetime=30 * 24 * 60 * 60)
            ruleIDs = tmpRet[0]
            tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, str(ruleIDs)))
            # group the output files together by the Rucio transfer rule
            jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns, 'groupStatus': 'pending'}})
            msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs, lfns)
            tmpLog.debug(msgStr)
            tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)')
            tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True), ruleIDs, 'transferring')
            tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)')
            tmpStat = True
            tmpMsg = 'created Rucio rule successfully'
        except DuplicateRule:
            # ignore duplicated rule
            tmpLog.debug('rule is already available')
        except Exception:
            errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName)
            core_utils.dump_error_message(tmpLog)
            tmpLog.debug(errMsg)
            return None, errMsg
        # update file group status
        # FIX: ruleIDs was unbound here when DuplicateRule was raised above,
        # turning a benign duplicate rule into a NameError; guard on it
        if ruleIDs is not None:
            self.dbInterface.update_file_group_status(ruleIDs, 'transferring')
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        tmpStat = None
        tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
    # Now test for any errors
    if errors:
        for error in errors:
            tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0], error[1], error[2]))
        # NOTE(review): 'Error' is not defined in this block; presumably
        # imported at file level (e.g. shutil.Error) - confirm
        raise Error(errors)
    # otherwise we are OK
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
def submit_workers(self, workspec_list):
    """Submit workers to HTCondor (optionally via ATLAS Grid CEs).

    Prepares per-worker submission data (SDF template, chosen CE, chosen
    schedd/pool, log URLs) in parallel threads, submits via the
    module-level submit_a_worker, then propagates the returned attributes
    back onto each workspec.

    :param workspec_list: list of WorkSpec objects to submit
    :return: list of per-worker return values from submit_a_worker
    """
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp (one dir per hour)
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    # create the log dir locally only when not spooling to a remote schedd
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            # already existing is fine; anything else is fatal
            if e.errno != errno.EEXIST:
                raise
            else:
                pass

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    # (treat a falsy corecount - None/0 - as 1)
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'

    # get override requirements from queue configured
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # If ATLAS Grid CE mode used
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            # keep only active CEs of a supported flavour
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            # prefer a named queue over the 'default' one for the same endpoint
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # Get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # Build the submission data dict for one worker (CE choice, SDF
        # template, schedd/pool choice, log-file URLs); returns the dict.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {'workspec': workspec,
                'to_submit': to_submit,}
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # go on info of the CE
                ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '')
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                # NOTE(review): ce_version_str is currently unused
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                # strip a trailing ':port' to get the bare hostname
                ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got CE endpoint: "{1}", flavour: "{2}"'.format(self.queueName, ce_endpoint_from_queue, ce_flavour_str))
                # pick a per-flavour SDF template when a template dir is configured
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    # ceHostname/ceEndpoint not configured - leave dict empty
                    pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                # NOTE(review): batch_log_value/stdout_value/stderr_value stay
                # unbound if the template lacks log/output/error lines; they are
                # only read below when log_base_url is set - confirm templates
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # Choose from Condor schedd and central managers
                if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
                    if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                        condor_schedd, condor_pool = random.choice(list(zip(self.condorSchedd, self.condorPool)))
                    else:
                        condor_schedd = random.choice(self.condorSchedd)
                        condor_pool = self.condorPool
                else:
                    condor_schedd = self.condorSchedd
                    condor_pool = self.condorPool
                # Log Base URL: substitute the chosen schedd's hostname
                # (user@ prefix and :port suffix stripped) for the placeholder
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                             lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                             condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if not (log_base_url is None):
                    # without a batchID yet, filenames must be guessed
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
            # set data dict
            data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'executable_file': self.executableFile,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': self.x509UserProxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                })
        return data

    def _propagate_attributes(workspec, tmpVal):
        # Copy the attribute dict returned by submit_a_worker back onto
        # the workspec; returns the submission return value.
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # map(_handle_one_worker, workspec_list)
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
    tmpLog.debug('{0} workers handled'.format(nWorkers))

    # exec with mcore
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retValList = thread_pool.map(submit_a_worker, dataIterator)
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
def check_stage_out_status(self, jobspec):
    """Check Rucio transfer status for the output files of one job.

    Resolves the destination object store for the job's nucleus from the
    cached AGIS data, tags every output FileSpec with that objstoreID, then
    polls the Rucio replication rule of each output file group and updates
    file/group status accordingly.

    :param jobspec: job specification whose output file transfers are checked
    :return: (True, '') when all rules are complete,
             (False, msg) on a failed transfer,
             (None, msg) when the check should be retried later
    """
    tmpStat = True
    tmpMsg = ''
    # per-job logger; thread id included since many checker threads run at once
    tmpLog = self.make_logger(baseLogger,
                              'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,
                                                                threading.current_thread().ident),
                              method_name='check_stage_out_status')
    tmpLog.debug('start')
    # the computing site is needed to pick the right queue configuration
    if jobspec.computingSite is None:
        tmpLog.error('jobspec.computingSite is not defined')
        return False, 'jobspec.computingSite is not defined'
    tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite))
    # get the queueConfig and corresponding objStoreID_ES
    queueConfigMapper = QueueConfigMapper()
    queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
    tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite, queueConfig.stager))
    # Yoda jobs use a special path convention.
    # NOTE(review): self.Yodajob is only ever set to True here; it is assumed
    # to be initialised (presumably False) elsewhere, e.g. in __init__ -- confirm.
    if 'jobtype' in queueConfig.stager and queueConfig.stager['jobtype'] == "Yoda":
        self.Yodajob = True
    # resolve the destination RSE for the job's nucleus from the AGIS cache
    nucleus = jobspec.jobParams['nucleus']
    agis = self.dbInterface.get_cache('panda_queues.json').data
    dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0]
    # map the RSE name to its object store ID
    # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
    ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
    self.objstoreID = ddm[dstRSE]['id']
    if self.Yodajob:
        self.pathConvention = int(queueConfig.stager['pathConvention'])
        tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID, self.objstoreID, self.pathConvention))
    else:
        self.pathConvention = None
        tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID, self.objstoreID))
    # set the location of the files in fileSpec.objstoreID
    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
    # the output files are grouped by Rucio rule ID
    groups = jobspec.get_groups_of_output_files()
    if len(groups) == 0:
        tmpLog.debug('No Rucio Rules')
        return None, 'No Rucio Rules'
    tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups), groups))
    try:
        rucioAPI = RucioClient()
    except Exception:
        # transient client problem -- ask the caller to retry later
        tmpLog.error('failure to get Rucio Client try again later')
        return None, 'failure to get Rucio Client try again later'
    # loop over the Rucio rules
    for rucioRule in groups:
        if rucioRule is None:
            continue
        # serialize checks of the same rule across harvester threads
        have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120)
        if not have_db_lock:
            msgStr = 'escape since {0} is locked by another thread'.format(rucioRule)
            tmpLog.debug(msgStr)
            return None, msgStr
        # status of the file group as recorded in the harvester DB
        groupStatus = self.dbInterface.get_file_group_status(rucioRule)
        tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule, groupStatus))
        if 'transferred' in groupStatus:
            # already succeeded - set the fileSpec status for these files
            self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
        elif 'failed' in groupStatus:
            # transfer failure recorded earlier
            # (bug fix: the original formatted undefined names datasetScope and
            # datasetName here, raising NameError; report the rule id instead)
            tmpStat = False
            tmpMsg = 'rucio rule for {0} already failed'.format(rucioRule)
        elif 'transferring' in groupStatus or 'pending' in groupStatus:
            # transfer started in Rucio - poll the rule state
            try:
                result = rucioAPI.get_replication_rule(rucioRule, False)
                if result['state'] == "OK":
                    # files transfered to nucleus
                    tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule))
                    self.dbInterface.update_file_group_status(rucioRule, 'transferred')
                    # set the fileSpec status for these files
                    self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention)
                    self.set_FileSpec_status(jobspec, 'finished')
                elif result['state'] == "FAILED":
                    # failed Rucio Transfer
                    tmpStat = False
                    tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
                    self.set_FileSpec_status(jobspec, 'failed')
                elif result['state'] == 'STUCK':
                    tmpStat = None
                    tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule)
                    tmpLog.debug(tmpMsg)
            except Exception:
                # lookup failed -- treat as "check again later"
                tmpStat = None
                tmpMsg = 'Could not get information or Rucio Rule {0}'.format(rucioRule)
                tmpLog.error(tmpMsg)
        # release the lock
        if have_db_lock:
            tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule))
            release_db_lock = self.dbInterface.release_object_lock(rucioRule)
            if release_db_lock:
                tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule))
                have_db_lock = False
            else:
                msgStr = ' Could not release DB lock for {}'.format(rucioRule)
                tmpLog.error(msgStr)
                return None, msgStr
    tmpLog.debug('stop')
    return tmpStat, tmpMsg
fork_child_pid = os.fork() if fork_child_pid != 0: signal_utils.set_suicide_handler(None) os.wait() else: if len(sys.argv) not in (2, 4): print("Wrong number of parameters. You can either:") print(" - specify the queue name") print( " - specify the queue name, jobType (managed, user) and resourceType (SCORE, SCORE_HIMEM, MCORE, MCORE_HIMEM)" ) sys.exit(0) queueName = sys.argv[1] queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) if queueConfig.prodSourceLabel in ('user', 'managed'): jobType = queueConfig.prodSourceLabel else: jobType = 'managed' # default, can be overwritten by parameters resourceType = 'SCORE' # default, can be overwritten by parameters if len(sys.argv) == 4: # jobType should be 'managed' or 'user'. If not specified will default to a production job if sys.argv[2] in ('user', 'managed'): jobType = sys.argv[2] else: print('value for jobType not valid, defaulted to {0}'.format(
def __init__(self, **kwarg):
    """Initialise the plugin and attach a queue-configuration mapper.

    All keyword arguments are forwarded unchanged to PluginBase.
    """
    # let the plugin base class consume the constructor keywords first
    PluginBase.__init__(self, **kwarg)
    # mapper used later to look up per-queue configuration
    mapper = QueueConfigMapper()
    self.queue_config_mapper = mapper