예제 #1
0
    def master_prepare(self,app,appconfig):
        '''Prepare the master job'''

        from pandatools import Client
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('ExecutablePandaRTHandler master_prepare called for %s', job.getFQID('.')) 

        # set chirp variables
        if configPanda['chirpconfig'] or configPanda['chirpserver']:
            setChirpVariables()

#       Pack inputsandbox
        inputsandbox = 'sources.%s.tar' % commands.getoutput('uuidgen 2> /dev/null') 
        inpw = job.getInputWorkspace()
        # add user script to inputsandbox
        if hasattr(job.application.exe, "name"):
            if not job.application.exe in job.inputsandbox:
                job.inputsandbox.append(job.application.exe)

        for fname in [f.name for f in job.inputsandbox]:
            fname.rstrip(os.sep)
            path = fname[:fname.rfind(os.sep)]
            f = fname[fname.rfind(os.sep)+1:]
            rc, output = commands.getstatusoutput('tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
            if rc:
                logger.error('Packing inputsandbox failed with status %d',rc)
                logger.error(output)
                raise ApplicationConfigurationError('Packing inputsandbox failed.')
        if len(job.inputsandbox) > 0:
            rc, output = commands.getstatusoutput('gzip %s' % (inpw.getPath(inputsandbox)))
            if rc:
                logger.error('Packing inputsandbox failed with status %d',rc)
                logger.error(output)
                raise ApplicationConfigurationError('Packing inputsandbox failed.')
            inputsandbox += ".gz"
        else:
            inputsandbox = None

#       Upload Inputsandbox
        if inputsandbox:
            logger.debug('Uploading source tarball ...')
            uploadSources(inpw.getPath(),os.path.basename(inputsandbox))
            self.inputsandbox = inputsandbox
        else:
            self.inputsandbox = None

#       input dataset
        if job.inputdata:
            if job.inputdata._name != 'DQ2Dataset':
                raise ApplicationConfigurationError('PANDA application supports only DQ2Datasets')

        # run brokerage here if not splitting
        if not job.splitter:
            from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
            runPandaBrokerage(job)
        elif job.splitter._name not in ['DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask']:
            raise ApplicationConfigurationError('Panda splitter must be DQ2JobSplitter or ArgSplitter')
        
        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError('site is still AUTO after brokerage!')

#       output dataset
        if job.outputdata:
            if job.outputdata._name != 'DQ2OutputDataset':
                raise ApplicationConfigurationError('Panda backend supports only DQ2OutputDataset')
        else:
            logger.info('Adding missing DQ2OutputDataset')
            job.outputdata = DQ2OutputDataset()

        job.outputdata.datasetname,outlfn = dq2outputdatasetname(job.outputdata.datasetname, job.id, job.outputdata.isGroupDS, job.outputdata.groupname)

        self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

        try:
            Client.addDataset(job.outputdata.datasetname,False,location=self.outDsLocation)
            logger.info('Output dataset %s registered at %s'%(job.outputdata.datasetname,self.outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, location=self.outDsLocation)
        except exceptions.SystemExit:
            raise BackendError('Panda','Exception in Client.addDataset %s: %s %s'%(job.outputdata.datasetname,sys.exc_info()[0],sys.exc_info()[1]))

        # handle the libds
        if job.backend.libds:
            self.libDataset = job.backend.libds
            self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
            self.library = self.fileBO.lfn
        elif job.backend.bexec:
            self.libDataset = job.outputdata.datasetname+'.lib'
            self.library = '%s.tgz' % self.libDataset
            try:
                Client.addDataset(self.libDataset,False,location=self.outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset, location=self.outDsLocation)
                logger.info('Lib dataset %s registered at %s'%(self.libDataset,self.outDsLocation))
            except exceptions.SystemExit:
                raise BackendError('Panda','Exception in Client.addDataset %s: %s %s'%(self.libDataset,sys.exc_info()[0],sys.exc_info()[1]))

        # collect extOutFiles
        self.extOutFile = []
        for tmpName in job.outputdata.outputdata:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.outputsandbox:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.backend.extOutFile:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        # create build job
        if job.backend.bexec != '':
            jspec = JobSpec()
            jspec.jobDefinitionID   = job.id
            jspec.jobName           = commands.getoutput('uuidgen 2> /dev/null')
            jspec.transformation    = '%s/buildGen-00-00-01' % Client.baseURLSUB
            if Client.isDQ2free(job.backend.site):
                jspec.destinationDBlock = '%s/%s' % (job.outputdata.datasetname,self.libDataset)
                jspec.destinationSE     = 'local'
            else:
                jspec.destinationDBlock = self.libDataset
                jspec.destinationSE     = job.backend.site
            jspec.prodSourceLabel   = configPanda['prodSourceLabelBuild']
            jspec.processingType    = configPanda['processingType']
            jspec.assignedPriority  = configPanda['assignedPriorityBuild']
            jspec.computingSite     = job.backend.site
            jspec.cloud             = job.backend.requirements.cloud
            jspec.jobParameters     = '-o %s' % (self.library)
            if self.inputsandbox:
                jspec.jobParameters     += ' -i %s' % (self.inputsandbox)
            else:
                raise ApplicationConfigurationError('Executable on Panda with build job defined, but inputsandbox is emtpy !')
            matchURL = re.search('(http.*://[^/]+)/',Client.baseURLCSRVSSL)
            if matchURL:
                jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
            if job.backend.bexec != '':
                jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(job.backend.bexec)
                jspec.jobParameters += ' -r %s ' % '.'
                

            fout = FileSpec()
            fout.lfn  = self.library
            fout.type = 'output'
            fout.dataset = self.libDataset
            fout.destinationDBlock = self.libDataset
            jspec.addFile(fout)

            flog = FileSpec()
            flog.lfn = '%s.log.tgz' % self.libDataset
            flog.type = 'log'
            flog.dataset = self.libDataset
            flog.destinationDBlock = self.libDataset
            jspec.addFile(flog)
            return jspec
        else:
            return None
예제 #2
0
    def master_prepare(self,app,appmasterconfig):

        # PandaTools
        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('AthenaMCPandaRTHandler master_prepare called for %s', job.getFQID('.'))
        usertag = configDQ2['usertag']
        #usertag='user09'
        nickname = getNickname(allowMissingNickname=True)
        self.libDataset = '%s.%s.ganga.%s_%d.lib._%06d' % (usertag,nickname,commands.getoutput('hostname').split('.')[0],int(time.time()),job.id)
#        self.userprefix='%s.%s.ganga' % (usertag,gridProxy.identity())
        sources = 'sources.%s.tar.gz' % commands.getoutput('uuidgen 2> /dev/null') 
        self.library = '%s.lib.tgz' % self.libDataset

        # check DBRelease
        # if job.backend.dbRelease != '' and job.backend.dbRelease.find(':') == -1:
         #   raise ApplicationConfigurationError(None,"ERROR : invalid argument for backend.dbRelease. Must be 'DatasetName:FileName'")

#       unpack library
        logger.debug('Creating source tarball ...')        
        tmpdir = '/tmp/%s' % commands.getoutput('uuidgen 2> /dev/null')
        os.mkdir(tmpdir)

        inputbox=[]
        if os.path.exists(app.transform_archive):
            # must add a condition on size.
            inputbox += [ File(app.transform_archive) ]
        if app.evgen_job_option:
            self.evgen_job_option=app.evgen_job_option
            if os.path.exists(app.evgen_job_option):
                # locally modified job option file to add to the input sand box
                inputbox += [ File(app.evgen_job_option) ]
                self.evgen_job_option=app.evgen_job_option.split("/")[-1]

         
#       add input sandbox files
        if (job.inputsandbox):
            for file in job.inputsandbox:
                inputbox += [ file ]
#        add option files
        for extFile in job.backend.extOutFile:
            try:
                shutil.copy(extFile,tmpdir)
            except IOError:
                os.makedirs(tmpdir)
                shutil.copy(extFile,tmpdir)
#       fill the archive
        for opt_file in inputbox:
            try:
                shutil.copy(opt_file.name,tmpdir)
            except IOError:
                os.makedirs(tmpdir)
                shutil.copy(opt_file.name,tmpdir)
#       now tar it up again

        inpw = job.getInputWorkspace()
        rc, output = commands.getstatusoutput('tar czf %s -C %s .' % (inpw.getPath(sources),tmpdir))
        if rc:
            logger.error('Packing sources failed with status %d',rc)
            logger.error(output)
            raise ApplicationConfigurationError(None,'Packing sources failed.')

        shutil.rmtree(tmpdir)

#       upload sources

        logger.debug('Uploading source tarball ...')
        try:
            cwd = os.getcwd()
            os.chdir(inpw.getPath())
            rc, output = Client.putFile(sources)
            if output != 'True':
                logger.error('Uploading sources %s failed. Status = %d', sources, rc)
                logger.error(output)
                raise ApplicationConfigurationError(None,'Uploading archive failed')
        finally:
            os.chdir(cwd)      


        # Use Panda's brokerage
##         if job.inputdata and len(app.sites)>0:
##             # update cloud, use inputdata's
##             from dq2.info.TiersOfATLAS import whichCloud,ToACache
##             inclouds=[]
##             for site in app.sites:
##                 cloudSite=whichCloud(app.sites[0])
##                 if cloudSite not in inclouds:
##                     inclouds.append(cloudSite)
##             # now converting inclouds content into proper brokering stuff.
##             outclouds=[]
##             for cloudSite in inclouds:
##                 for cloudID, eachCloud in ToACache.dbcloud.iteritems():
##                     if cloudSite==eachCloud:
##                         cloud=cloudID
##                         outclouds.append(cloud)
##                         break
                    
##             print outclouds
##             # finally, matching with user's wishes
##             if len(outclouds)>0:
##                 if not job.backend.requirements.cloud: # no user wish, update
##                     job.backend.requirements.cloud=outclouds[0]
##                 else:
##                     try:
##                         assert job.backend.requirements.cloud in outclouds
##                     except:
##                         raise ApplicationConfigurationError(None,'Input dataset not available in target cloud %s. Please try any of the following %s' % (job.backend.requirements.cloud, str(outclouds)))
                                                            
        from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
        
        runPandaBrokerage(job)
        
        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(None,'site is still AUTO after brokerage!')

        # output dataset preparation and registration
        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
        except:
            raise ApplicationConfigurationError(None,"Could not extract output dataset location from job.backend.site value: %s. Aborting" % job.backend.site)
        if not app.dryrun:
            for outtype in app.outputpaths.keys():
                dset=string.replace(app.outputpaths[outtype],"/",".")
                dset=dset[1:]
                # dataset registration must be done only once.
                print "registering output dataset %s at %s" % (dset,outDsLocation)
                try:
                    Client.addDataset(dset,False,location=outDsLocation)
                    dq2_set_dataset_lifetime(dset, location=outDsLocation)
                except:
                    raise ApplicationConfigurationError(None,"Fail to create output dataset %s. Aborting" % dset)
            # extend registration to build job lib dataset:
            print "registering output dataset %s at %s" % (self.libDataset,outDsLocation)

            try:
                Client.addDataset(self.libDataset,False,location=outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset, outDsLocation)
            except:
                raise ApplicationConfigurationError(None,"Fail to create output dataset %s. Aborting" % self.libDataset)


        ###
        cacheVer = "-AtlasProduction_" + str(app.prod_release)
            
        logger.debug("master job submit?")
        self.outsite=job.backend.site
        if app.se_name and app.se_name != "none" and not self.outsite:
            self.outsite=app.se_name

       
        #       create build job
        jspec = JobSpec()
        jspec.jobDefinitionID   = job.id
        jspec.jobName           = commands.getoutput('uuidgen 2> /dev/null')
        jspec.AtlasRelease      = 'Atlas-%s' % app.atlas_rel
        jspec.homepackage       = 'AnalysisTransforms'+cacheVer#+nightVer
        jspec.transformation    = '%s/buildJob-00-00-03' % Client.baseURLSUB # common base to Athena and AthenaMC jobs: buildJob is a pilot job which takes care of all inputs for the real jobs (in prepare()
        jspec.destinationDBlock = self.libDataset
        jspec.destinationSE     = job.backend.site
        jspec.prodSourceLabel   = 'panda'
        jspec.assignedPriority  = 2000
        jspec.computingSite     = job.backend.site
        jspec.cloud             = job.backend.requirements.cloud
#        jspec.jobParameters     = self.args not known yet
        jspec.jobParameters     = '-o %s' % (self.library)
        if app.userarea:
            print app.userarea
            jspec.jobParameters     += ' -i %s' % (os.path.basename(app.userarea))
        else:
            jspec.jobParameters     += ' -i %s' % (sources)
        jspec.cmtConfig         = AthenaUtils.getCmtConfig(athenaVer=app.atlas_rel)
        
        matchURL = re.search('(http.*://[^/]+)/',Client.baseURLSSL)
        if matchURL:
            jspec.jobParameters += ' --sourceURL %s' % matchURL.group(1)

        fout = FileSpec()
        fout.lfn  = self.library
        fout.type = 'output'
        fout.dataset = self.libDataset
        fout.destinationDBlock = self.libDataset
        jspec.addFile(fout)

        flog = FileSpec()
        flog.lfn = '%s.log.tgz' % self.libDataset
        flog.type = 'log'
        flog.dataset = self.libDataset
        flog.destinationDBlock = self.libDataset
        jspec.addFile(flog)
        #print "MASTER JOB DETAILS:",jspec.jobParameters

        return jspec
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
        """Prepare the specific aspec of each subjob.
           Returns: subjobconfig list of objects understood by backends."""

        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec
        from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
        from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs
        
        # make sure we have the correct siteType
        refreshPandaSpecs()

        job = app._getParent()
        masterjob = job._getRoot()

        logger.debug('ProdTransPandaRTHandler prepare called for %s',
                     job.getFQID('.'))

        job.backend.actualCE = job.backend.site
        job.backend.requirements.cloud = Client.PandaSites[job.backend.site]['cloud']

        # check that the site is in a submit-able status
        if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
            allowed_sites = job.backend.list_ddm_sites()

        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
            tmpDsExist = False
            if (configPanda['processingType'].startswith('gangarobot') or configPanda['processingType'].startswith('hammercloud')):
                #if Client.getDatasets(job.outputdata.datasetname):
                if getDatasets(job.outputdata.datasetname):
                    tmpDsExist = True
                    logger.info('Re-using output dataset %s'%job.outputdata.datasetname)
            if not configPanda['specialHandling']=='ddm:rucio' and not  configPanda['processingType'].startswith('gangarobot') and not configPanda['processingType'].startswith('hammercloud') and not configPanda['processingType'].startswith('rucio_test'):
                Client.addDataset(job.outputdata.datasetname,False,location=outDsLocation,allowProdDisk=True,dsExist=tmpDsExist)
            logger.info('Output dataset %s registered at %s'%(job.outputdata.datasetname,outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
        except exceptions.SystemExit:
            raise BackendError('Panda','Exception in adding dataset %s: %s %s'%(job.outputdata.datasetname,sys.exc_info()[0],sys.exc_info()[1]))
        
        # JobSpec.
        jspec = JobSpec()
        jspec.currentPriority = app.priority
        jspec.jobDefinitionID = masterjob.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.coreCount = app.core_count
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
        jspec.homepackage = app.home_package
        jspec.transformation = app.transformation
        jspec.destinationDBlock = job.outputdata.datasetname
        if job.outputdata.location:
            jspec.destinationSE = job.outputdata.location
        else:
            jspec.destinationSE = job.backend.site
        if job.inputdata:
            jspec.prodDBlock = job.inputdata.dataset[0]
        else:
            jspec.prodDBlock = 'NULL'
        if app.prod_source_label:
            jspec.prodSourceLabel = app.prod_source_label
        else:
            jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
        jspec.processingType = configPanda['processingType']
        jspec.specialHandling = configPanda['specialHandling']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.cmtConfig = app.atlas_cmtconfig
        if app.dbrelease == 'LATEST':
            try:
                latest_dbrelease = getLatestDBReleaseCaching()
            except:
                from pandatools import Client
                latest_dbrelease = Client.getLatestDBRelease()
            m = re.search('(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
            if m:
                self.dbrelease_dataset = m.group(1)
                self.dbrelease = m.group(2)
            else:
                raise ApplicationConfigurationError(None, "Error retrieving LATEST DBRelease. Try setting application.dbrelease manually.")
        else:
            self.dbrelease_dataset = app.dbrelease_dataset
            self.dbrelease = app.dbrelease
        jspec.jobParameters = app.job_parameters

        if self.dbrelease:
            if self.dbrelease == 'current':
                jspec.jobParameters += ' --DBRelease=current' 
            else:
                if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                    jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
                else:
                    jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
                dbspec = FileSpec()
                dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
                dbspec.dataset = self.dbrelease_dataset
                dbspec.prodDBlock = jspec.prodDBlock
                dbspec.type = 'input'
                jspec.addFile(dbspec)

        if job.inputdata:
            m = re.search('(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)',
                          job.inputdata.dataset[0])
            if not m:
                logger.error("Error retrieving run number from dataset name")
                #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
                runnumber = 105200
            else:
                runnumber = int(m.group(2))
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --runNumber %d' % runnumber
            else:
                jspec.jobParameters += ' RunNumber=%d' % runnumber
        
        # Output files.
        randomized_lfns = []
        ilfn = 0
        for lfn, lfntype in zip(app.output_files,app.output_type):
            ofspec = FileSpec()
            if app.randomize_lfns:
                randomized_lfn = lfn + ('.%s.%d.%s' % (job.backend.site, int(time.time()), commands.getoutput('uuidgen 2> /dev/null')[:4] ) )
            else:
                randomized_lfn = lfn
            ofspec.lfn = randomized_lfn
            randomized_lfns.append(randomized_lfn)
            ofspec.destinationDBlock = jspec.destinationDBlock
            ofspec.destinationSE = jspec.destinationSE
            ofspec.dataset = jspec.destinationDBlock
            ofspec.type = 'output'
            jspec.addFile(ofspec)
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --output%sFile %s' % (lfntype, randomized_lfns[ilfn])
            else:
                jspec.jobParameters += ' output%sFile=%s' % (lfntype, randomized_lfns[ilfn])
            ilfn=ilfn+1

        # Input files.
        if job.inputdata:
            for guid, lfn, size, checksum, scope in zip(job.inputdata.guids, job.inputdata.names, job.inputdata.sizes, job.inputdata.checksums, job.inputdata.scopes):
                ifspec = FileSpec()
                ifspec.lfn = lfn
                ifspec.GUID = guid
                ifspec.fsize = size
                ifspec.md5sum = checksum
                ifspec.scope = scope
                ifspec.dataset = jspec.prodDBlock
                ifspec.prodDBlock = jspec.prodDBlock
                ifspec.type = 'input'
                jspec.addFile(ifspec)
            if app.input_type:
                itype = app.input_type
            else:
                itype = m.group(5)
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --input%sFile %s' % (itype, ','.join(job.inputdata.names))
            else:
                jspec.jobParameters += ' input%sFile=%s' % (itype, ','.join(job.inputdata.names))

        # Log files.
        lfspec = FileSpec()
        lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
        lfspec.destinationDBlock = jspec.destinationDBlock
        lfspec.destinationSE  = jspec.destinationSE
        lfspec.dataset = jspec.destinationDBlock
        lfspec.type = 'log'
        jspec.addFile(lfspec)
        
        return jspec
예제 #4
0
    def master_prepare(self, app, appconfig):
        '''Prepare the master job'''

        from pandatools import Client
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('ExecutablePandaRTHandler master_prepare called for %s',
                     job.getFQID('.'))

        # set chirp variables
        if configPanda['chirpconfig'] or configPanda['chirpserver']:
            setChirpVariables()

#       Pack inputsandbox
        inputsandbox = 'sources.%s.tar' % commands.getoutput(
            'uuidgen 2> /dev/null')
        inpw = job.getInputWorkspace()
        # add user script to inputsandbox
        if hasattr(job.application.exe, "name"):
            if not job.application.exe in job.inputsandbox:
                job.inputsandbox.append(job.application.exe)

        for fname in [f.name for f in job.inputsandbox]:
            fname.rstrip(os.sep)
            path = fname[:fname.rfind(os.sep)]
            f = fname[fname.rfind(os.sep) + 1:]
            rc, output = commands.getstatusoutput(
                'tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Packing inputsandbox failed.')
        if len(job.inputsandbox) > 0:
            rc, output = commands.getstatusoutput('gzip %s' %
                                                  (inpw.getPath(inputsandbox)))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Packing inputsandbox failed.')
            inputsandbox += ".gz"
        else:
            inputsandbox = None

#       Upload Inputsandbox
        if inputsandbox:
            logger.debug('Uploading source tarball ...')
            uploadSources(inpw.getPath(), os.path.basename(inputsandbox))
            self.inputsandbox = inputsandbox
        else:
            self.inputsandbox = None

#       input dataset
        if job.inputdata:
            if job.inputdata._name != 'DQ2Dataset':
                raise ApplicationConfigurationError(
                    None, 'PANDA application supports only DQ2Datasets')

        # run brokerage here if not splitting
        if not job.splitter:
            from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
            runPandaBrokerage(job)
        elif job.splitter._name not in [
                'DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask'
        ]:
            raise ApplicationConfigurationError(
                None, 'Panda splitter must be DQ2JobSplitter or ArgSplitter')

        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(
                None, 'site is still AUTO after brokerage!')

#       output dataset
        if job.outputdata:
            if job.outputdata._name != 'DQ2OutputDataset':
                raise ApplicationConfigurationError(
                    None, 'Panda backend supports only DQ2OutputDataset')
        else:
            logger.info('Adding missing DQ2OutputDataset')
            job.outputdata = DQ2OutputDataset()

        job.outputdata.datasetname, outlfn = dq2outputdatasetname(
            job.outputdata.datasetname, job.id, job.outputdata.isGroupDS,
            job.outputdata.groupname)

        self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

        try:
            Client.addDataset(job.outputdata.datasetname,
                              False,
                              location=self.outDsLocation)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, self.outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname,
                                     location=self.outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.addDataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # handle the libds
        if job.backend.libds:
            self.libDataset = job.backend.libds
            self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
            self.library = self.fileBO.lfn
        elif job.backend.bexec:
            self.libDataset = job.outputdata.datasetname + '.lib'
            self.library = '%s.tgz' % self.libDataset
            try:
                Client.addDataset(self.libDataset,
                                  False,
                                  location=self.outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset,
                                         location=self.outDsLocation)
                logger.info('Lib dataset %s registered at %s' %
                            (self.libDataset, self.outDsLocation))
            except exceptions.SystemExit:
                raise BackendError(
                    'Panda', 'Exception in Client.addDataset %s: %s %s' %
                    (self.libDataset, sys.exc_info()[0], sys.exc_info()[1]))

        # collect extOutFiles
        self.extOutFile = []
        for tmpName in job.outputdata.outputdata:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.outputsandbox:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.backend.extOutFile:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        # create build job
        if job.backend.bexec != '':
            jspec = JobSpec()
            jspec.jobDefinitionID = job.id
            jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
            jspec.transformation = '%s/buildGen-00-00-01' % Client.baseURLSUB
            if Client.isDQ2free(job.backend.site):
                jspec.destinationDBlock = '%s/%s' % (
                    job.outputdata.datasetname, self.libDataset)
                jspec.destinationSE = 'local'
            else:
                jspec.destinationDBlock = self.libDataset
                jspec.destinationSE = job.backend.site
            jspec.prodSourceLabel = configPanda['prodSourceLabelBuild']
            jspec.processingType = configPanda['processingType']
            jspec.assignedPriority = configPanda['assignedPriorityBuild']
            jspec.computingSite = job.backend.site
            jspec.cloud = job.backend.requirements.cloud
            jspec.jobParameters = '-o %s' % (self.library)
            if self.inputsandbox:
                jspec.jobParameters += ' -i %s' % (self.inputsandbox)
            else:
                raise ApplicationConfigurationError(
                    None,
                    'Executable on Panda with build job defined, but inputsandbox is emtpy !'
                )
            matchURL = re.search('(http.*://[^/]+)/', Client.baseURLCSRVSSL)
            if matchURL:
                jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
            if job.backend.bexec != '':
                jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(
                    job.backend.bexec)
                jspec.jobParameters += ' -r %s ' % '.'

            fout = FileSpec()
            fout.lfn = self.library
            fout.type = 'output'
            fout.dataset = self.libDataset
            fout.destinationDBlock = self.libDataset
            jspec.addFile(fout)

            flog = FileSpec()
            flog.lfn = '%s.log.tgz' % self.libDataset
            flog.type = 'log'
            flog.dataset = self.libDataset
            flog.destinationDBlock = self.libDataset
            jspec.addFile(flog)
            return jspec
        else:
            return None
예제 #5
0
 def retry(self,JobsetID,newSite=False,newOpts={},noSubmit=False,ignoreDuplication=False,useJobsetID=False,retryBuild=False,reproduceFiles=[],unsetRetryID=False):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
         self.gridPassPhrase,
         False,
         self.verbose,
         useCache=True)
     # force update just in case
     self.status(JobsetID,True)
     # set an empty map since mutable default value is used
     if newOpts == {}:
         newOpts = {}
     # get jobset
     newJobsetID = -1
     jobList = self.getJobIDsWithSetID(JobsetID)
     if jobList == None:
         # works only for jobsetID
         if useJobsetID:
             return
         # works with jobID   
         isJobset = False
         jobList = [JobsetID]
     else:
         isJobset = True
         tmpMsg = "ID=%s is composed of JobID=" % JobsetID
         for tmpJobID in jobList:
             tmpMsg += '%s,' % tmpJobID
         tmpMsg = tmpMsg[:-1]
         tmpLog.info(tmpMsg)
     for JobID in jobList:    
         # get job info from local repository
         localJob = self.getJobInfo(JobID)
         if localJob == None:
             tmpLog.warning("JobID=%s not found in local repository. Synchronization may be needed" % JobID)            
             return None
         # for JEDI
         if localJob.isJEDI():
             status,out = Client.retryTask(
                     localJob.jediTaskID,
                     verbose=self.verbose,
                     properErrorCode=True,
                     newParams=newOpts)
             if status != 0:
                 tmpLog.error(status)
                 tmpLog.error(out)
                 tmpLog.error("Failed to retry TaskID=%s" % localJob.jediTaskID)
                 return False
             tmpStat,tmpDiag = out
             if (not tmpStat in [0,True] and newOpts == {}) or (newOpts != {} and tmpStat != 3):
                 tmpLog.error(tmpDiag)
                 tmpLog.error("Failed to retry TaskID=%s" % localJob.jediTaskID)
                 return False
             tmpLog.info(tmpDiag)
             continue
         # skip running job
         if localJob.dbStatus != 'frozen':
             tmpLog.info('Retry failed subjobs in running jobId=%s' % JobID)
             status,out = Client.retryFailedJobsInActive(JobID,verbose=self.verbose)
             if status != 0:
                 tmpLog.error(status)
                 tmpLog.error(out)
                 tmpLog.error("Failed to retry JobID=%s" % JobID)
             else:
                 job = self.status(JobID)
             if isJobset:
                 continue
             else:
                 return
         # skip already retried
         if localJob.retryID != '0':
             if isJobset:
                 tmpLog.info('Skip JobID=%s since already retried by JobID=%s JobsetID=%s' % \
                             (JobID,localJob.retryID,localJob.retryJobsetID))
                 continue
             else:
                 tmpLog.warning('This job was already retried by JobID=%s' % localJob.retryID)
                 return
         # check status of buildJob
         if not retryBuild and not localJob.buildStatus in ['','finished']:
             tmpMsgStr = 'Cannot retry since status of buildJob %s is %s (!= finished). ' \
                         % (localJob.PandaID.split(',')[0],localJob.buildStatus)
             tmpMsgStr += 'Please execute %s with the same input/output datasets (or containers). ' % localJob.jobType
             tmpMsgStr += 'It will run only on failed/cancelled/unused input files '
             tmpMsgStr += 'and append output files to the output dataset container. '
             tmpMsgStr += 'Or you may set retryBuild=True in pbook.retry() '                
             tmpLog.warning(tmpMsgStr)
             if isJobset:
                 continue
             else:
                 return
         # check opts for newSite
         if newSite or newOpts != {}:
             if not localJob.outDS.endswith('/') and not newOpts.has_key('outDS') and not newOpts.has_key('--outDS'):
                 tmpLog.warning('You need to specify --outDS in newOpts to retry at new site unless container is used as output')
                 return
         # get list of failed jobs
         pandaIDs  = localJob.PandaID.split(',')
         statusList= localJob.jobStatus.split(',')
         jobList = []
         for idx in range(len(pandaIDs)):
             # check status unless reproduce files
             if reproduceFiles == [] and not statusList[idx] in ['failed','cancelled']:
                 continue
             jobList.append(pandaIDs[idx])
         # no failed job
         if jobList == []:
             if isJobset:
                 tmpLog.info('Skip JobID=%s since no failed jobs' % JobID)                    
                 continue
             else:
                 tmpLog.info('No failed jobs to be retried for JobID=%s' % JobID)
                 return
         # get full job spec
         tmpLog.info("Retrying JobID=%s ..." % JobID)
         tmpLog.info("Getting job info")
         idxJL  = 0
         nQuery = 500
         pandaJobs = []
         while idxJL < len(jobList):
             # avoid burst query
             tmpLog.info(" %5s/%s" % (idxJL,len(jobList)))                
             status,oTmp = Client.getFullJobStatus(
                     jobList[idxJL:idxJL+nQuery],
                     verbose=self.verbose)
             if status != 0:
                 tmpLog.error(status)
                 tmpLog.error(oTmp)
                 tmpLog.error("Cannot get job info from Panda server")
                 return
             pandaJobs += oTmp
             idxJL += nQuery
             time.sleep(1)
         tmpLog.info(" %5s/%s" % (len(jobList),len(jobList)))
         # get PandaIDs to reproduce files
         if reproduceFiles != []:
             # change wildcard to .* for regexp
             reproduceFilePatt = []
             for tmpReproduceFile in reproduceFiles:
                 if '*' in tmpReproduceFile:
                     tmpReproduceFile = tmpReproduceFile.replace('*','.*')
                 reproduceFilePatt.append(tmpReproduceFile)
             # get list of jobs which produced interesting files    
             tmpJobList = []
             tmpPandaJobs = []
             for tmpPandaJob in pandaJobs:
                 # check names
                 tmpMatchFlag = False
                 for tmpFile in tmpPandaJob.Files:
                     if tmpFile.type == 'output' and tmpFile.status == 'ready':
                         for tmpReproduceFile in reproduceFilePatt:
                             # normal matching
                             if tmpReproduceFile == tmpFile.lfn:
                                 tmpMatchFlag = True
                                 break
                             # wild card
                             if '*' in tmpReproduceFile and \
                                re.search('^'+tmpReproduceFile,tmpFile.lfn) != None:
                                 tmpMatchFlag = True
                                 break
                         if tmpMatchFlag:
                             break
                 # append
                 if tmpMatchFlag:
                     tmpJobList.append(tmpPandaJob.PandaID)
                     tmpPandaJobs.append(tmpPandaJob)
             # use new list
             jobList = tmpJobList
             pandaJobs = tmpPandaJobs
             if jobList == []:
                 tmpLog.info("No jobs to reproduce files : Jobs in JobID=%s didn't produce lost files" % JobID)
                 continue
         # jobdefID
         newJobdefID = PsubUtils.readJobDefID()
         # reset some parameters
         retryJobs    = []
         retrySite    = None
         retryElement = None
         retryDestSE  = None
         outDsName    = None
         shadowList   = []
         oldLibDS     = None
         newLibDS     = None
         newLibTgz    = None
         rebroMap     = {}
         for idx in range(len(jobList)):
             job = pandaJobs[idx]
             # skip exired
             if job == None:
                 tmpLog.warning("Could not retry jobs older than 30 days : JobID=%s (PandaID=%s) expired" \
                                % (JobID,jobList[idxJob]))
                 return
             # skip jobs reassigned by rebrokerage
             if (job.jobStatus == 'cancelled' and job.taskBufferErrorCode in [105,'105']) or \
                    (job.jobStatus == 'failed' and job.taskBufferErrorCode in [106,'106']):
                 # extract JobIDs of reassigned jobs
                 tmpM = re.search('JobsetID=(\d+) JobID=(\d+)',job.taskBufferErrorDiag)
                 if tmpM != None:
                     tmpRebKey = (tmpM.group(1),tmpM.group(2))
                     if not rebroMap.has_key(tmpRebKey):
                         rebroMap[tmpRebKey] = 0
                     # count # of reassigned jobs
                     rebroMap[tmpRebKey] += 1
                 continue
             # get shadow list
             if (not ignoreDuplication) and outDsName == None and job.prodSourceLabel == 'user':
                 # look for dataset for log since it doesn't have suffix even when --individualOutDS is used
                 for tmpFile in job.Files:
                     if tmpFile.type == 'log':
                         outDsName = tmpFile.dataset
                         break
                 # output dataset was not found    
                 if outDsName == None:
                     tmpLog.error("Could not get output dataset name for JobID=%s (PandaID=%s)" \
                                  % (JobID,job.PandaID))
                     return
                 # get files in shadow
                 if outDsName.endswith('/'):
                     shadowList = Client.getFilesInShadowDataset(
                             outDsName,
                             Client.suffixShadow,
                             self.verbose)
                 else:
                     # disable duplication check mainly for old overlay jobs since non-signal files are wrongly skipped
                     #shadowList = Client.getFilesInShadowDatasetOld(outDsName,Client.suffixShadow,self.verbose)
                     pass
             # unify sitename
             if retrySite == None:
                 retrySite    = job.computingSite
                 retryElement = job.computingElement
                 retryDestSE  = job.destinationSE
             # reset
             job.jobStatus           = None
             job.commandToPilot      = None
             job.startTime           = None
             job.endTime             = None
             job.attemptNr           = 1+job.attemptNr
             for attr in job._attributes:
                 if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'):
                     setattr(job,attr,None)
             job.transExitCode       = None
             job.computingSite       = retrySite
             job.computingElement    = retryElement
             job.destinationSE       = retryDestSE
             job.dispatchDBlock      = None
             if not unsetRetryID:
                 job.jobExecutionID  = JobID
             job.jobDefinitionID     = newJobdefID
             job.parentID            = job.PandaID
             if job.jobsetID != ['NULL',None,-1]:
                 if not unsetRetryID:
                     job.sourceSite  = job.jobsetID
                 job.jobsetID        = newJobsetID
             skipInputList = []
             numUsedFiles = 0
             # loop over all files    
             for file in job.Files:
                 file.rowID = None
                 if file.type == 'input':
                     # protection against wrong sync which doesn't update buildStatus correctly
                     if not retryBuild and file.lfn.endswith('.lib.tgz') and file.GUID == 'NULL':
                         tmpLog.warning('GUID for %s is unknown. Cannot retry when corresponding buildJob failed' \
                                        % file.lfn)
                         return
                     if not retryBuild or not file.lfn.endswith('.lib.tgz'):
                         file.status = 'ready'
                     # set new lib dataset    
                     if retryBuild and file.lfn.endswith('.lib.tgz'):
                         if newLibTgz != None:
                             file.lfn            = newLibTgz
                             file.dataset        = newLibDS
                             file.dispatchDBlock = newLibDS
                     # check with shadow for non lib.tgz/DBR 
                     tmpDbrMatch = re.search('^DBRelease-.*\.tar\.gz$',file.lfn)
                     if tmpDbrMatch == None and not file.lfn.endswith('.lib.tgz'):
                         if file.lfn in shadowList:
                             skipInputList.append(file)
                         else:
                             numUsedFiles += 1
                 elif file.type in ('output','log'):
                     file.destinationSE = retryDestSE
                     file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
                     # add retry num
                     if file.dataset.endswith('/') or job.prodSourceLabel == 'panda':
                         oldOutDsName = file.destinationDBlock
                         retryDsPatt = '_r'
                         if reproduceFiles != []:
                             retryDsPatt = '_rp'
                         retryMatch = re.search(retryDsPatt+'(\d+)$',file.destinationDBlock)
                         if retryMatch == None:
                             file.destinationDBlock += (retryDsPatt+'1')
                         else:
                             tmpDestinationDBlock = re.sub(retryDsPatt+'(\d+)$','',file.destinationDBlock)
                             file.destinationDBlock = tmpDestinationDBlock + retryDsPatt + '%d' % (1+int(retryMatch.group(1)))
                         if job.processingType == 'usermerge':
                             job.jobParameters = job.jobParameters.replace(' %s ' % oldOutDsName,
                                                                           ' %s ' % file.destinationDBlock)
                         # use new dataset name for buildXYZ
                         if job.prodSourceLabel == 'panda':
                             if file.lfn.endswith('.lib.tgz'):
                                 # get new libDS and lib.tgz names
                                 oldLibDS  = file.dataset
                                 file.dataset = file.destinationDBlock
                                 newLibDS = file.dataset
                                 file.lfn = re.sub(oldLibDS,newLibDS,file.lfn)
                                 newLibTgz = file.lfn
                             else:
                                 file.dataset = file.destinationDBlock                                    
                     # add attempt nr
                     oldName  = file.lfn
                     if job.prodSourceLabel == 'panda' and file.lfn.endswith('.lib.tgz'):
                         continue
                     else:
                         # append attempt number at the tail 
                         file.lfn = re.sub("\.\d+$","",file.lfn)
                         file.lfn = "%s.%d" % (file.lfn,job.attemptNr)
                     newName  = file.lfn
                     # modify jobParameters
                     job.jobParameters = re.sub("'%s'" % oldName ,"'%s'" % newName,
                                                job.jobParameters)
                     # look for output in trf
                     oldGenelicName = re.sub('\.\d+$','',oldName)
                     match = re.search(oldGenelicName+'(\.\d+)*(%20|")',job.jobParameters)
                     if match != None:
                         job.jobParameters = job.jobParameters.replace(match.group(0),newName+match.group(2))
             # change lib.tgz name
             if retryBuild and newLibDS != None:
                 job.jobParameters = re.sub(oldLibDS,newLibDS,job.jobParameters)
                 # change destinationDBlock
                 if job.prodSourceLabel == 'panda':
                     job.destinationDBlock = newLibDS
             # all files are used by others
             if numUsedFiles == 0 and skipInputList != []:
                 continue
             # remove skipped files
             strSkipped = ''
             for tmpFile in skipInputList:
                 strSkipped += '%s,' % tmpFile.lfn
                 job.Files.remove(tmpFile)
             strSkipped = strSkipped[:-1]
             # modify jobpar
             if strSkipped != '':
                 optionToSkipFiles = '--skipInputByRetry'
                 if not optionToSkipFiles in job.jobParameters:
                     # just append
                     job.jobParameters += "%s=%s " % (optionToSkipFiles,strSkipped)
                 else:
                     # extract already skipped files
                     tmpMatch = re.search("(%s=[^ ]+)",job.jobParameters)
                     if tmpMatch == None:
                         tmpLog.error("Failed to extract arg of %s for PandaID=%s" \
                                      % (optionToSkipFiles,job.PandaID))
                         return
                     # replace
                     job.jobParameters = re.sub(tmpMatch.group(1),"%s,%s" % (tmpMatch.group(1),optionToSkipFiles),
                                                job.jobParameters)
             if self.verbose:
                 tmpLog.debug(job.jobParameters)
             # append
             retryJobs.append(job)
         # info on rebrokeage    
         if rebroMap != {}:
             for tmpRebKey,tmpRebNumJobs in rebroMap.iteritems():
                 tmpRebSetID,tmpRebJobID = tmpRebKey
                 tmpLog.info('Skip %s jobs since JobID=%s JobsetID=%s already reassigned them to another site' % \
                             (tmpRebNumJobs,tmpRebJobID,tmpRebSetID))
             if retryJobs == []:
                 tmpLog.info("No more jobs to be retried for JobID=%s" % JobID)
                 if isJobset:
                     continue
                 else:
                     return
         # all input files were or are being used by other jobs
         if retryJobs == []:
             tmpLog.info('All input files were or are being used by other jobs for the same output. No jobs to be retried. If you need to ignore duplication check (e.g., using the same EVNT file for multiple simulation subjobs), set ignoreDuplication=True. i.e. retry(123,ignoreDuplication=True)')
             if isJobset:
                 continue
             else:
                 return
         # check voms role
         if not retryJobs[0].workingGroup in ['NULL',None,'']:
             # VOMS role was used 
             if not "--workingGroup" in job.metadata:
                 # extract voms roles from metadata
                 match =  re.search("--voms( |=)[ \"]*([^ \"]+)",job.metadata)
                 if match != None:
                     vomsRoles = match.group(2)
                 else:
                     vomsRoles = "atlas:/atlas/%s/Role=production" % retryJobs[0].workingGroup
             # regenerate proxy with VOMS roles
             try:
                 tmpLog.info("Checking proxy role to resubmit %s jobs" % retryJobs[0].workingGroup)
                 self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
                         self.gridPassPhrase,
                         False,
                         self.verbose,vomsRoles,
                         useCache=True)
             except:
                 tmpLog.error("Failed to generate a proxy with %s" % vomsRoles)
                 return
         # check runtime env for new site submission
         if (newSite or newOpts != {}):
             if retryJobs[0].processingType == 'pathena' or '--useAthenaPackages' in retryJobs[0].metadata:
                 from pandatools import AthenaUtils
                 stA,retA = AthenaUtils.getAthenaVer()
                 if not stA:
                     tmpLog.error("Failed to get Athena rel/cache version in current runtime env")
                     return
                 athenaVer = retA['athenaVer']
                 cacheVer  = retA['cacheVer']
                 nightVer  = retA['nightVer']
                 wrongSetup = False
                 if retryJobs[0].AtlasRelease != 'Atlas-%s' % athenaVer:
                     wrongSetup = True
                     errMsg =  "Current Athena version Atlas-%s is inconsitent with the previous submission %s. " % (athenaVer,retryJobs[0].AtlasRelease)
                 elif retryJobs[0].homepackage != 'AnalysisTransforms'+cacheVer+nightVer:
                     wrongSetup = True                        
                     errMsg =  "Current cache version %s is inconsitent with the previous submission. " % cacheVer.replace('-','').replace('_','-')
                 if wrongSetup:    
                     errMsg += 'You need to have the same runtime env as before since all job spec need to be re-created to send jobs to a new site. '
                     errMsg += 'Please setup Athena correctly and restart pbook'                        
                     tmpLog.error(errMsg)
                     return
         # test mode
         if noSubmit:
             continue
         # invoke pathena/prun to send job to new site
         if (newSite or newOpts != {}) and retryJobs[0].processingType != 'usermerge':
             # set parent jobID and jobsetID
             newOpts['provenanceID'] = retryJobs[0].jobExecutionID
             newOpts['panda_parentJobsetID'] = retryJobs[0].sourceSite
             tmpLog.info("Constructing job spec again to be sent to another site ...")
             comStat= PsubUtils.execWithModifiedParams(retryJobs,newOpts,self.verbose,newSite)
             if comStat == 0:
                 # update database
                 time.sleep(2)
                 self.sync()
             else:
                 tmpLog.error("Failed to submit jobs to Panda server")                
             return
         # register datasets
         tmpOutDsLocation = Client.PandaSites[retryJobs[-1].computingSite]['ddm']
         addedDataset = []
         shadowDSname = None
         for tmpFile in retryJobs[-1].Files:
             if tmpFile.type in ['output','log'] and tmpFile.dataset.endswith('/'):
                 # add shadow
                 """
                 removed shadow
                 if shadowDSname == None and tmpFile.type == 'log':
                     shadowDSname = "%s%s" % (tmpFile.destinationDBlock,Client.suffixShadow)
                     Client.addDataset(shadowDSname,self.verbose)
                 """    
                 # add datasets    
                 if not tmpFile.destinationDBlock in addedDataset:
                     # create dataset
                     Client.addDataset(
                             tmpFile.destinationDBlock,
                             self.verbose,
                             location=tmpOutDsLocation,
                             dsCheck=False)
                     # add to container
                     Client.addDatasetsToContainer(
                             tmpFile.dataset,
                             [tmpFile.destinationDBlock],
                             self.verbose)
                     # append
                     addedDataset.append(tmpFile.destinationDBlock)
         # register libDS
         if retryBuild and newLibDS != None:
             Client.addDataset(
                     newLibDS,
                     self.verbose,
                     location=tmpOutDsLocation,
                     dsCheck=False)
         # submit
         tmpLog.info("Submitting job ...")            
         status,out = Client.submitJobs(retryJobs,verbose=self.verbose)
         if out == None or status != 0:
             tmpLog.error(status)
             tmpLog.error(out)
             tmpLog.error("Failed to submit jobs to Panda server")
             return
         # update database
         pandaIDstatus = {}
         newJobID = None
         for items in out:
             # get newJobID
             if newJobID == None:
                 newJobID = items[1]
             # check PandaID
             PandaID = items[0]
             if PandaID == 'NULL':
                 tmpLog.error("Panda server returned wrong IDs. It may have a temporary problem")
                 return
             # set newJobsetID
             if newJobsetID in [None,-1]:
                 newJobsetID = items[2]['jobsetID']
             # dummy statuso
             pandaIDstatus[PandaID] = ('defined','NULL')
         # set retry ID
         if not unsetRetryID:
             localJob.retryID = newJobID
             if not newJobsetID in [None,-1,'NULL']:
                 localJob.retryJobsetID = newJobsetID
             try:
                 PdbUtils.updateJobDB(localJob,self.verbose)
             except:
                 tmpLog.error("Failed to set retryID for JobID=%s" % JobID)
                 return
         # set new paramers
         newLocalJob = PdbUtils.convertPtoD(retryJobs,pandaIDstatus)
         newLocalJob.JobID = newJobID
         if not newJobsetID in [None,-1,'NULL']:
             newLocalJob.groupID = newJobsetID
         newLocalJob.creationTime = datetime.datetime.utcnow()
         # insert to DB
         try:
             PdbUtils.insertJobDB(newLocalJob,self.verbose)
         except:
             tmpLog.error("Failed to insert JobID=%s to local repository" % newJobID)
             return
         # write new jobdefID
         PsubUtils.writeJobDefID(newJobID)
         # done
         tmpMsg = 'Done. New JobID=%s' % newJobID
         if not newJobsetID in [None,-1,'NULL']:
             tmpMsg += " JobsetID=%s" % newJobsetID
         tmpLog.info(tmpMsg)
예제 #6
0
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
        """Prepare the specific aspec of each subjob.
           Returns: subjobconfig list of objects understood by backends."""

        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec
        from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
        from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs

        # make sure we have the correct siteType
        refreshPandaSpecs()

        job = app._getParent()
        masterjob = job._getRoot()

        logger.debug('ProdTransPandaRTHandler prepare called for %s',
                     job.getFQID('.'))

        job.backend.actualCE = job.backend.site
        job.backend.requirements.cloud = Client.PandaSites[
            job.backend.site]['cloud']

        # check that the site is in a submit-able status
        if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
            allowed_sites = job.backend.list_ddm_sites()

        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
            tmpDsExist = False
            if (configPanda['processingType'].startswith('gangarobot') or
                    configPanda['processingType'].startswith('hammercloud')):
                #if Client.getDatasets(job.outputdata.datasetname):
                if getDatasets(job.outputdata.datasetname):
                    tmpDsExist = True
                    logger.info('Re-using output dataset %s' %
                                job.outputdata.datasetname)
            if not configPanda[
                    'specialHandling'] == 'ddm:rucio' and not configPanda[
                        'processingType'].startswith(
                            'gangarobot'
                        ) and not configPanda['processingType'].startswith(
                            'hammercloud') and not configPanda[
                                'processingType'].startswith('rucio_test'):
                Client.addDataset(job.outputdata.datasetname,
                                  False,
                                  location=outDsLocation,
                                  allowProdDisk=True,
                                  dsExist=tmpDsExist)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in adding dataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # JobSpec.
        jspec = JobSpec()
        jspec.currentPriority = app.priority
        jspec.jobDefinitionID = masterjob.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.coreCount = app.core_count
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
        jspec.homepackage = app.home_package
        jspec.transformation = app.transformation
        jspec.destinationDBlock = job.outputdata.datasetname
        if job.outputdata.location:
            jspec.destinationSE = job.outputdata.location
        else:
            jspec.destinationSE = job.backend.site
        if job.inputdata:
            jspec.prodDBlock = job.inputdata.dataset[0]
        else:
            jspec.prodDBlock = 'NULL'
        if app.prod_source_label:
            jspec.prodSourceLabel = app.prod_source_label
        else:
            jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
        jspec.processingType = configPanda['processingType']
        jspec.specialHandling = configPanda['specialHandling']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.cmtConfig = app.atlas_cmtconfig
        if app.dbrelease == 'LATEST':
            try:
                latest_dbrelease = getLatestDBReleaseCaching()
            except:
                from pandatools import Client
                latest_dbrelease = Client.getLatestDBRelease()
            m = re.search('(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
            if m:
                self.dbrelease_dataset = m.group(1)
                self.dbrelease = m.group(2)
            else:
                raise ApplicationConfigurationError(
                    None,
                    "Error retrieving LATEST DBRelease. Try setting application.dbrelease manually."
                )
        else:
            self.dbrelease_dataset = app.dbrelease_dataset
            self.dbrelease = app.dbrelease
        jspec.jobParameters = app.job_parameters

        if self.dbrelease:
            if self.dbrelease == 'current':
                jspec.jobParameters += ' --DBRelease=current'
            else:
                if jspec.transformation.endswith(
                        "_tf.py") or jspec.transformation.endswith("_tf"):
                    jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (
                        self.dbrelease, )
                else:
                    jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (
                        self.dbrelease, )
                dbspec = FileSpec()
                dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
                dbspec.dataset = self.dbrelease_dataset
                dbspec.prodDBlock = jspec.prodDBlock
                dbspec.type = 'input'
                jspec.addFile(dbspec)

        if job.inputdata:
            m = re.search('(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)',
                          job.inputdata.dataset[0])
            if not m:
                logger.error("Error retrieving run number from dataset name")
                #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
                runnumber = 105200
            else:
                runnumber = int(m.group(2))
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --runNumber %d' % runnumber
            else:
                jspec.jobParameters += ' RunNumber=%d' % runnumber

        # Output files.
        randomized_lfns = []
        ilfn = 0
        for lfn, lfntype in zip(app.output_files, app.output_type):
            ofspec = FileSpec()
            if app.randomize_lfns:
                randomized_lfn = lfn + (
                    '.%s.%d.%s' %
                    (job.backend.site, int(time.time()),
                     commands.getoutput('uuidgen 2> /dev/null')[:4]))
            else:
                randomized_lfn = lfn
            ofspec.lfn = randomized_lfn
            randomized_lfns.append(randomized_lfn)
            ofspec.destinationDBlock = jspec.destinationDBlock
            ofspec.destinationSE = jspec.destinationSE
            ofspec.dataset = jspec.destinationDBlock
            ofspec.type = 'output'
            jspec.addFile(ofspec)
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --output%sFile %s' % (
                    lfntype, randomized_lfns[ilfn])
            else:
                jspec.jobParameters += ' output%sFile=%s' % (
                    lfntype, randomized_lfns[ilfn])
            ilfn = ilfn + 1

        # Input files.
        if job.inputdata:
            for guid, lfn, size, checksum, scope in zip(
                    job.inputdata.guids, job.inputdata.names,
                    job.inputdata.sizes, job.inputdata.checksums,
                    job.inputdata.scopes):
                ifspec = FileSpec()
                ifspec.lfn = lfn
                ifspec.GUID = guid
                ifspec.fsize = size
                ifspec.md5sum = checksum
                ifspec.scope = scope
                ifspec.dataset = jspec.prodDBlock
                ifspec.prodDBlock = jspec.prodDBlock
                ifspec.type = 'input'
                jspec.addFile(ifspec)
            if app.input_type:
                itype = app.input_type
            else:
                itype = m.group(5)
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --input%sFile %s' % (itype, ','.join(
                    job.inputdata.names))
            else:
                jspec.jobParameters += ' input%sFile=%s' % (itype, ','.join(
                    job.inputdata.names))

        # Log files.
        lfspec = FileSpec()
        lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
        lfspec.destinationDBlock = jspec.destinationDBlock
        lfspec.destinationSE = jspec.destinationSE
        lfspec.dataset = jspec.destinationDBlock
        lfspec.type = 'log'
        jspec.addFile(lfspec)

        return jspec
예제 #7
0
    def retry(self,
              JobsetID,
              newSite=False,
              newOpts={},
              noSubmit=False,
              ignoreDuplication=True):
        # get logger
        tmpLog = PLogger.getPandaLogger()
        # check proxy
        self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
            self.gridPassPhrase, False, self.verbose)
        # get jobset
        newJobsetID = -1
        jobList = self.getJobIDsWithSetID(JobsetID)
        if jobList == None:
            isJobset = False
            jobList = [JobsetID]
        else:
            isJobset = True
            tmpMsg = "JobsetID=%s is composed of JobID=" % JobsetID
            for tmpJobID in jobList:
                tmpMsg += '%s,' % tmpJobID
            tmpMsg = tmpMsg[:-1]
            tmpLog.info(tmpMsg)
        for JobID in jobList:
            # get job info from local repository
            localJob = self.getJobInfo(JobID)
            if localJob == None:
                tmpLog.warning(
                    "JobID=%s not found in local repository. Synchronization may be needed"
                    % JobID)
                return None
            # skip running job
            if localJob.dbStatus != 'frozen':
                tmpLog.warning('Cannot retry running jobs')
                if isJobset:
                    continue
                else:
                    return
            # skip already retried
            if localJob.retryID != '0':
                if isJobset:
                    tmpLog.info('Skip JobID=%s since already retried by JobID=%s JobsetID=%s' % \
                                (JobID,localJob.retryID,localJob.retryJobsetID))
                    continue
                else:
                    tmpLog.warning('This job was already retried by JobID=%s' %
                                   localJob.retryID)
                    return
            # check status of buildJob
            if not localJob.buildStatus in ['', 'finished']:
                tmpMsgStr = 'Cannot retry since status of buildJob %s is %s (!= finished). ' \
                            % (localJob.PandaID.split(',')[0],localJob.buildStatus)
                tmpMsgStr += 'Please execute %s with the same input/output datasets (or containers). ' % localJob.jobType
                tmpMsgStr += 'It will run only on failed/cancelled/unused input files '
                tmpMsgStr += 'and append output files to the output dataset container'
                tmpLog.warning(tmpMsgStr)
                if isJobset:
                    continue
                else:
                    return
            # check opts for newSite
            if newSite:
                if not localJob.outDS.endswith('/') and not newOpts.has_key(
                        'outDS') and not newOpts.has_key('--outDS'):
                    tmpLog.warning(
                        'You need to specify --outDS in newOpts to retry at new site unless container is used as output'
                    )
                    return
            # get list of failed jobs
            pandaIDs = localJob.PandaID.split(',')
            statusList = localJob.jobStatus.split(',')
            jobList = []
            for idx in range(len(pandaIDs)):
                # check status
                if not statusList[idx] in ['failed', 'cancelled']:
                    continue
                jobList.append(pandaIDs[idx])
            # no failed job
            if jobList == []:
                if isJobset:
                    tmpLog.info('Skip JobID=%s since no failed jobs' % JobID)
                    continue
                else:
                    tmpLog.info('No failed jobs to be retried for JobID=%s' %
                                JobID)
                    return
            # get full job spec
            tmpLog.info("Retrying JobID=%s ..." % JobID)
            tmpLog.info("Getting job info")
            idxJL = 0
            nQuery = 500
            pandaJobs = []
            while idxJL < len(jobList):
                # avoid burst query
                tmpLog.info(" %5s/%s" % (idxJL, len(jobList)))
                status, oTmp = Client.getFullJobStatus(jobList[idxJL:idxJL +
                                                               nQuery],
                                                       verbose=self.verbose)
                if status != 0:
                    tmpLog.error(status)
                    tmpLog.error(oTmp)
                    tmpLog.error("Cannot get job info from Panda server")
                    return
                pandaJobs += oTmp
                idxJL += nQuery
                time.sleep(1)
            tmpLog.info(" %5s/%s" % (len(jobList), len(jobList)))
            # jobdefID
            newJobdefID = PsubUtils.readJobDefID()
            # reset some parameters
            retryJobs = []
            retrySite = None
            retryElement = None
            retryDestSE = None
            outDsName = None
            shadowList = []
            for idx in range(len(jobList)):
                job = pandaJobs[idx]
                # skip exired
                if job == None:
                    tmpLog.warning("Could not retry jobs older than 30 days : JobID=%s (PandaID=%s) expired" \
                                   % (JobID,jobList[idxJob]))
                    return
                # get shadow list
                if (not ignoreDuplication
                    ) and outDsName == None and job.prodSourceLabel == 'user':
                    # look for dataset for log since it doesn't have suffix even when --individualOutDS is used
                    for tmpFile in job.Files:
                        if tmpFile.type == 'log':
                            outDsName = tmpFile.dataset
                            break
                    # output dataset was not found
                    if outDsName == None:
                        tmpLog.error("Could not get output dataset name for JobID=%s (PandaID=%s)" \
                                     % (JobID,job.PandaID))
                        return
                    # get files in shadow
                    if outDsName.endswith('/'):
                        shadowList = Client.getFilesInShadowDataset(
                            outDsName, Client.suffixShadow, self.verbose)
                    else:
                        # disable duplication check mainly for old overlay jobs since non-signal files are wrongly skipped
                        #shadowList = Client.getFilesInShadowDatasetOld(outDsName,Client.suffixShadow,self.verbose)
                        pass
                # unify sitename
                if retrySite == None:
                    retrySite = job.computingSite
                    retryElement = job.computingElement
                    retryDestSE = job.destinationSE
                # reset
                job.jobStatus = None
                job.commandToPilot = None
                job.startTime = None
                job.endTime = None
                job.attemptNr = 1 + job.attemptNr
                for attr in job._attributes:
                    if attr.endswith('ErrorCode') or attr.endswith(
                            'ErrorDiag'):
                        setattr(job, attr, None)
                job.transExitCode = None
                job.computingSite = retrySite
                job.computingElement = retryElement
                job.destinationSE = retryDestSE
                job.dispatchDBlock = None
                job.jobExecutionID = JobID
                job.jobDefinitionID = newJobdefID
                job.parentID = job.PandaID
                if job.jobsetID != ['NULL', None, -1]:
                    job.sourceSite = job.jobsetID
                    job.jobsetID = newJobsetID
                skipInputList = []
                numUsedFiles = 0
                for file in job.Files:
                    file.rowID = None
                    if file.type == 'input':
                        # protection against wrong sync which doesn't update buildStatus correctly
                        if file.lfn.endswith(
                                '.lib.tgz') and file.GUID == 'NULL':
                            tmpLog.warning('GUID for %s is unknown. Cannot retry when corresponding buildJob failed' \
                                           % file.lfn)
                            return
                        file.status = 'ready'
                        # check with shadow for non lib.tgz/DBR
                        tmpDbrMatch = re.search('^DBRelease-.*\.tar\.gz$',
                                                file.lfn)
                        if tmpDbrMatch == None and not file.lfn.endswith(
                                '.lib.tgz'):
                            if file.lfn in shadowList:
                                skipInputList.append(file)
                            else:
                                numUsedFiles += 1
                    elif file.type in ('output', 'log'):
                        file.destinationSE = retryDestSE
                        file.destinationDBlock = re.sub(
                            '_sub\d+$', '', file.destinationDBlock)
                        # add retry num
                        if file.dataset.endswith('/'):
                            retryMatch = re.search('_r(\d+)$',
                                                   file.destinationDBlock)
                            if retryMatch == None:
                                file.destinationDBlock += '_r1'
                            else:
                                tmpDestinationDBlock = re.sub(
                                    '_r(\d+)$', '', file.destinationDBlock)
                                file.destinationDBlock = tmpDestinationDBlock + '_r%d' % (
                                    1 + int(retryMatch.group(1)))
                        # add attempt nr
                        oldName = file.lfn
                        file.lfn = re.sub("\.\d+$", "", file.lfn)
                        file.lfn = "%s.%d" % (file.lfn, job.attemptNr)
                        newName = file.lfn
                        # modify jobParameters
                        job.jobParameters = re.sub("'%s'" % oldName,
                                                   "'%s'" % newName,
                                                   job.jobParameters)
                        # look for output in trf
                        oldGenelicName = re.sub('\.\d+$', '', oldName)
                        match = re.search(oldGenelicName + '(\.\d+)*(%20|")',
                                          job.jobParameters)
                        if match != None:
                            job.jobParameters = job.jobParameters.replace(
                                match.group(0), newName + match.group(2))
                # all files are used by others
                if numUsedFiles == 0 and skipInputList != []:
                    continue
                # remove skipped files
                strSkipped = ''
                for tmpFile in skipInputList:
                    strSkipped += '%s,' % tmpFile.lfn
                    job.Files.remove(tmpFile)
                strSkipped = strSkipped[:-1]
                # modify jobpar
                if strSkipped != '':
                    optionToSkipFiles = '--skipInputByRetry'
                    if not optionToSkipFiles in job.jobParameters:
                        # just append
                        job.jobParameters += "%s=%s " % (optionToSkipFiles,
                                                         strSkipped)
                    else:
                        # extract already skipped files
                        tmpMatch = re.search("(%s=[^ ]+)", job.jobParameters)
                        if tmpMatch == None:
                            tmpLog.error("Failed to extract arg of %s for PandaID=%s" \
                                         % (optionToSkipFiles,job.PandaID))
                            return
                        # replace
                        job.jobParameters = re.sub(
                            tmpMatch.group(1),
                            "%s,%s" % (tmpMatch.group(1), optionToSkipFiles),
                            job.jobParameters)
                if self.verbose:
                    tmpLog.debug(job.jobParameters)
                # append
                retryJobs.append(job)
            # all input files were or are being used by other jobs
            if retryJobs == []:
                tmpLog.info(
                    'All input files were or are being used by other jobs for the same output. No jobs to be retried. If you need to ignore duplication check (e.g., using the same EVNT file for multiple simulation subjobs), set ignoreDuplication=True. i.e. retry(123,ignoreDuplication=True)'
                )
                return
# check voms role
            if not retryJobs[0].workingGroup in ['NULL', None, '']:
                # VOMS role was used
                if not "--workingGroup" in job.metadata:
                    # extract voms roles from metadata
                    match = re.search("--voms( |=)[ \"]*([^ \"]+)",
                                      job.metadata)
                    if match != None:
                        vomsRoles = match.group(2)
                    else:
                        vomsRoles = "atlas:/atlas/%s/Role=production" % retryJobs[
                            0].workingGroup
# regenerate proxy with VOMS roles
                    try:
                        tmpLog.info("Checking proxy role to resubmit %s jobs" %
                                    retryJobs[0].workingGroup)
                        self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
                            self.gridPassPhrase, False, self.verbose,
                            vomsRoles)
                    except:
                        tmpLog.error("Failed to generate a proxy with %s" %
                                     vomsRoles)
                        return
            # check runtime env for new site submission
            if newSite:
                if retryJobs[
                        0].processingType == 'pathena' or '--useAthenaPackages' in retryJobs[
                            0].metadata:
                    from pandatools import AthenaUtils
                    stA, retA = AthenaUtils.getAthenaVer()
                    if not stA:
                        tmpLog.error(
                            "Failed to get Athena rel/cache version in current runtime env"
                        )
                        return
                    athenaVer = retA['athenaVer']
                    cacheVer = retA['cacheVer']
                    nightVer = retA['nightVer']
                    wrongSetup = False
                    if retryJobs[0].AtlasRelease != 'Atlas-%s' % athenaVer:
                        wrongSetup = True
                        errMsg = "Current Athena version Atlas-%s is inconsitent with the previous submission %s. " % (
                            athenaVer, retryJobs[0].AtlasRelease)
                    elif retryJobs[
                            0].homepackage != 'AnalysisTransforms' + cacheVer + nightVer:
                        wrongSetup = True
                        errMsg = "Current cache version %s is inconsitent with the previous submission. " % cacheVer.replace(
                            '-', '').replace('_', '-')
                    if wrongSetup:
                        errMsg += 'You need to have the same runtime env as before since all job spec need to be re-created to send jobs to a new site. '
                        errMsg += 'Please setup Athena correctly and restart pbook'
                        tmpLog.error(errMsg)
                        return
# test mode
            if noSubmit:
                continue
            # invoke pathena/prun to send job to new site
            if newSite:
                tmpLog.info(
                    "Constrcuting job spec again to be sent to another site ..."
                )
                comStat = PsubUtils.execWithModifiedParams(
                    retryJobs, newOpts, self.verbose)
                if comStat == 0:
                    # update database
                    time.sleep(2)
                    self.sync()
                else:
                    tmpLog.error("Failed to submit jobs to Panda server")
                return
            # register datasets
            tmpOutDsLocation = Client.PandaSites[
                retryJobs[-1].computingSite]['ddm']
            addedDataset = []
            shadowDSname = None
            for tmpFile in retryJobs[-1].Files:
                if tmpFile.type in ['output', 'log'
                                    ] and tmpFile.dataset.endswith('/'):
                    # add shadow
                    if shadowDSname == None and tmpFile.type == 'log':
                        shadowDSname = "%s%s" % (tmpFile.destinationDBlock,
                                                 Client.suffixShadow)
                        Client.addDataset(shadowDSname, self.verbose)
                    # add datasets
                    if not tmpFile.destinationDBlock in addedDataset:
                        # create dataset
                        Client.addDataset(tmpFile.destinationDBlock,
                                          self.verbose,
                                          location=tmpOutDsLocation)
                        # add to container
                        Client.addDatasetsToContainer(
                            tmpFile.dataset, [tmpFile.destinationDBlock],
                            self.verbose)
                        # append
                        addedDataset.append(tmpFile.destinationDBlock)
            # submit
            tmpLog.info("Submitting job ...")
            status, out = Client.submitJobs(retryJobs, verbose=self.verbose)
            if out == None or status != 0:
                tmpLog.error(status)
                tmpLog.error(out)
                tmpLog.error("Failed to submit jobs to Panda server")
                return
            # update database
            pandaIDstatus = {}
            newJobID = None
            for items in out:
                # get newJobID
                if newJobID == None:
                    newJobID = items[1]
                # check PandaID
                PandaID = items[0]
                if PandaID == 'NULL':
                    tmpLog.error(
                        "Panda server returned wrong IDs. It may have a temporary problem"
                    )
                    return
                # set newJobsetID
                if newJobsetID in [None, -1]:
                    newJobsetID = items[2]['jobsetID']
                # dummy statuso
                pandaIDstatus[PandaID] = ('defined', 'NULL')
            # set retry ID
            localJob.retryID = newJobID
            if not newJobsetID in [None, -1, 'NULL']:
                localJob.retryJobsetID = newJobsetID
            try:
                PdbUtils.updateJobDB(localJob, self.verbose)
            except:
                tmpLog.error("Failed to set retryID for JobID=%s" % JobID)
                return
            # set new paramers
            newLocalJob = PdbUtils.convertPtoD(retryJobs, pandaIDstatus)
            newLocalJob.JobID = newJobID
            if not newJobsetID in [None, -1, 'NULL']:
                newLocalJob.groupID = newJobsetID
            newLocalJob.creationTime = datetime.datetime.utcnow()
            # insert to DB
            try:
                PdbUtils.insertJobDB(newLocalJob, self.verbose)
            except:
                tmpLog.error("Failed to insert JobID=%s to local repository" %
                             newJobID)
                return
            # write new jobdefID
            PsubUtils.writeJobDefID(newJobID)
            # done
            tmpMsg = 'Done. New JobID=%s' % newJobID
            if not newJobsetID in [None, -1, 'NULL']:
                tmpMsg += " JobsetID=%s" % newJobsetID
            tmpLog.info(tmpMsg)
예제 #8
0
    def master_prepare(self, app, appmasterconfig):

        # PandaTools
        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('AthenaMCPandaRTHandler master_prepare called for %s',
                     job.getFQID('.'))
        usertag = configDQ2['usertag']
        #usertag='user09'
        nickname = getNickname(allowMissingNickname=True)
        self.libDataset = '%s.%s.ganga.%s_%d.lib._%06d' % (
            usertag, nickname, commands.getoutput('hostname').split('.')[0],
            int(time.time()), job.id)
        #        self.userprefix='%s.%s.ganga' % (usertag,gridProxy.identity())
        sources = 'sources.%s.tar.gz' % commands.getoutput(
            'uuidgen 2> /dev/null')
        self.library = '%s.lib.tgz' % self.libDataset

        # check DBRelease
        # if job.backend.dbRelease != '' and job.backend.dbRelease.find(':') == -1:
        #   raise ApplicationConfigurationError(None,"ERROR : invalid argument for backend.dbRelease. Must be 'DatasetName:FileName'")

        #       unpack library
        logger.debug('Creating source tarball ...')
        tmpdir = '/tmp/%s' % commands.getoutput('uuidgen 2> /dev/null')
        os.mkdir(tmpdir)

        inputbox = []
        if os.path.exists(app.transform_archive):
            # must add a condition on size.
            inputbox += [File(app.transform_archive)]
        if app.evgen_job_option:
            self.evgen_job_option = app.evgen_job_option
            if os.path.exists(app.evgen_job_option):
                # locally modified job option file to add to the input sand box
                inputbox += [File(app.evgen_job_option)]
                self.evgen_job_option = app.evgen_job_option.split("/")[-1]

#       add input sandbox files
        if (job.inputsandbox):
            for file in job.inputsandbox:
                inputbox += [file]
#        add option files
        for extFile in job.backend.extOutFile:
            try:
                shutil.copy(extFile, tmpdir)
            except IOError:
                os.makedirs(tmpdir)
                shutil.copy(extFile, tmpdir)
#       fill the archive
        for opt_file in inputbox:
            try:
                shutil.copy(opt_file.name, tmpdir)
            except IOError:
                os.makedirs(tmpdir)
                shutil.copy(opt_file.name, tmpdir)
#       now tar it up again

        inpw = job.getInputWorkspace()
        rc, output = commands.getstatusoutput('tar czf %s -C %s .' %
                                              (inpw.getPath(sources), tmpdir))
        if rc:
            logger.error('Packing sources failed with status %d', rc)
            logger.error(output)
            raise ApplicationConfigurationError(None,
                                                'Packing sources failed.')

        shutil.rmtree(tmpdir)

        #       upload sources

        logger.debug('Uploading source tarball ...')
        try:
            cwd = os.getcwd()
            os.chdir(inpw.getPath())
            rc, output = Client.putFile(sources)
            if output != 'True':
                logger.error('Uploading sources %s failed. Status = %d',
                             sources, rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Uploading archive failed')
        finally:
            os.chdir(cwd)

        # Use Panda's brokerage
##         if job.inputdata and len(app.sites)>0:
##             # update cloud, use inputdata's
##             from dq2.info.TiersOfATLAS import whichCloud,ToACache
##             inclouds=[]
##             for site in app.sites:
##                 cloudSite=whichCloud(app.sites[0])
##                 if cloudSite not in inclouds:
##                     inclouds.append(cloudSite)
##             # now converting inclouds content into proper brokering stuff.
##             outclouds=[]
##             for cloudSite in inclouds:
##                 for cloudID, eachCloud in ToACache.dbcloud.iteritems():
##                     if cloudSite==eachCloud:
##                         cloud=cloudID
##                         outclouds.append(cloud)
##                         break

##             print outclouds
##             # finally, matching with user's wishes
##             if len(outclouds)>0:
##                 if not job.backend.requirements.cloud: # no user wish, update
##                     job.backend.requirements.cloud=outclouds[0]
##                 else:
##                     try:
##                         assert job.backend.requirements.cloud in outclouds
##                     except:
##                         raise ApplicationConfigurationError(None,'Input dataset not available in target cloud %s. Please try any of the following %s' % (job.backend.requirements.cloud, str(outclouds)))

        from GangaPanda.Lib.Panda.Panda import runPandaBrokerage

        runPandaBrokerage(job)

        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(
                None, 'site is still AUTO after brokerage!')

        # output dataset preparation and registration
        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
        except:
            raise ApplicationConfigurationError(
                None,
                "Could not extract output dataset location from job.backend.site value: %s. Aborting"
                % job.backend.site)
        if not app.dryrun:
            for outtype in app.outputpaths.keys():
                dset = string.replace(app.outputpaths[outtype], "/", ".")
                dset = dset[1:]
                # dataset registration must be done only once.
                print "registering output dataset %s at %s" % (dset,
                                                               outDsLocation)
                try:
                    Client.addDataset(dset, False, location=outDsLocation)
                    dq2_set_dataset_lifetime(dset, location=outDsLocation)
                except:
                    raise ApplicationConfigurationError(
                        None,
                        "Fail to create output dataset %s. Aborting" % dset)
            # extend registration to build job lib dataset:
            print "registering output dataset %s at %s" % (self.libDataset,
                                                           outDsLocation)

            try:
                Client.addDataset(self.libDataset,
                                  False,
                                  location=outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset, outDsLocation)
            except:
                raise ApplicationConfigurationError(
                    None, "Fail to create output dataset %s. Aborting" %
                    self.libDataset)

        ###
        cacheVer = "-AtlasProduction_" + str(app.prod_release)

        logger.debug("master job submit?")
        self.outsite = job.backend.site
        if app.se_name and app.se_name != "none" and not self.outsite:
            self.outsite = app.se_name

        #       create build job
        jspec = JobSpec()
        jspec.jobDefinitionID = job.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_rel
        jspec.homepackage = 'AnalysisTransforms' + cacheVer  #+nightVer
        jspec.transformation = '%s/buildJob-00-00-03' % Client.baseURLSUB  # common base to Athena and AthenaMC jobs: buildJob is a pilot job which takes care of all inputs for the real jobs (in prepare()
        jspec.destinationDBlock = self.libDataset
        jspec.destinationSE = job.backend.site
        jspec.prodSourceLabel = 'panda'
        jspec.assignedPriority = 2000
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        #        jspec.jobParameters     = self.args not known yet
        jspec.jobParameters = '-o %s' % (self.library)
        if app.userarea:
            print app.userarea
            jspec.jobParameters += ' -i %s' % (os.path.basename(app.userarea))
        else:
            jspec.jobParameters += ' -i %s' % (sources)
        jspec.cmtConfig = AthenaUtils.getCmtConfig(athenaVer=app.atlas_rel)

        matchURL = re.search('(http.*://[^/]+)/', Client.baseURLSSL)
        if matchURL:
            jspec.jobParameters += ' --sourceURL %s' % matchURL.group(1)

        fout = FileSpec()
        fout.lfn = self.library
        fout.type = 'output'
        fout.dataset = self.libDataset
        fout.destinationDBlock = self.libDataset
        jspec.addFile(fout)

        flog = FileSpec()
        flog.lfn = '%s.log.tgz' % self.libDataset
        flog.type = 'log'
        flog.dataset = self.libDataset
        flog.destinationDBlock = self.libDataset
        jspec.addFile(flog)
        #print "MASTER JOB DETAILS:",jspec.jobParameters

        return jspec