Example #1
    def convert(self):
        '''Convert the condition in a JDL specification'''

        requirements = []

        if self.sites:
            ce_requirement = ' ||\n     '.join([
                'other.GlueCEUniqueID == "%s"' % ce for ce in getCEsForSites(
                    self.sites, self.excluded_sites, CREAM=True)
            ])
            if not ce_requirement:
                raise BackendError(
                    'CREAM',
                    'Job cannot be submitted as no valid site has been specified.'
                )
            requirements.append('( %s )' % ce_requirement)

        if self.os:
            os_name = self.os.lower()
            if os_name == 'slc3':
                requirements.append(slc3_req)
            elif os_name == 'slc4':
                requirements.append(slc4_req)
            else:
                raise BackendError(
                    'LCG',
                    'Job cannot be submitted as unknown OS %s has been requested.'
                    % self.os)

        return requirements
Example #2
    def _setup_bulk_subjobs(self, dirac_ids, dirac_script):
        """
        This is the old bulk submit method which is used to construct the subjobs for a parametric job
        Args:
            dirac_ids (list): This is a list of the Dirac ids which have been created
            dirac_script (str): Name of the dirac script which contains the job jdl
        """
        with open(dirac_script, 'r') as f:
            parametric_datasets = get_parametric_datasets(f.read().split('\n'))
        if len(parametric_datasets) != len(dirac_ids):
            raise BackendError(
                'Dirac',
                'Mismatch between the number of datasets defined in the dirac API script and those returned by DIRAC'
            )

        master_job = self.getJobObject()
        master_job.subjobs = []
        for i in range(len(dirac_ids)):
            j = Job()
            j.copyFrom(master_job)
            j.splitter = None
            j.backend.id = dirac_ids[i]
            j.id = i
            j.inputdata = self._setup_subjob_dataset(parametric_datasets[i])
            j.status = 'submitted'
            j.time.timenow('submitted')
            master_job.subjobs.append(j)
        return True
Example #3
    def _common_submit(self, dirac_script):
        '''Submit the job via the Dirac server.
        Args:
            dirac_script (str): filename of the JDL which is to be submitted to DIRAC
        '''
        j = self.getJobObject()
        self.id = None
        self.actualCE = None
        self.status = None
        self.extraInfo = None
        self.statusInfo = ''
        j.been_queued = False
        dirac_cmd = """execfile(\'%s\')""" % dirac_script

        try:
            result = execute(dirac_cmd, cred_req=self.credential_requirements)
        except GangaDiracError as err:

            err_msg = 'Error submitting job to Dirac: %s' % str(err)
            logger.error(err_msg)
            logger.error("\n\n===\n%s\n===\n" % dirac_script)
            logger.error("\n\n====\n")
            with open(dirac_script, 'r') as file_in:
                logger.error("%s" % file_in.read())
            logger.error("\n====\n")
            raise BackendError('Dirac', err_msg)

        idlist = result
        if type(idlist) is list:
            return self._setup_bulk_subjobs(idlist, dirac_script)

        self.id = idlist
        return type(self.id) == int
Example #4
def dirac_inputdata(app, hasOtherInputData=False):
    """ Construct the JDL component which requests the inputdata for a job
    Args:
        app (IApplication): app which belongs to the job of interest
        hasOtherInputData (bool): This is used to stop BannedSites being added to the JDL structure through backend.settings
    """
    job = app.getJobObject()
    input_data = None
    parametricinput_data = None

    inputLFNs = []

    if not job.inputdata and (not job.master or not job.master.inputdata):
        return input_data, parametricinput_data

    wanted_job = job
    if not job.inputdata and job.master and job.master.inputdata is not None and job.master.inputdata:
        wanted_job = job.master

    inputLFNs = [
        'LFN:' + this_file.lfn for this_file in wanted_job.inputdata
        if isinstance(this_file, DiracFile)
    ]

    # master job with a splitter reaching prepare, hence bulk submit
    if not job.master and job.splitter:
        parametricinput_data = dirac_parametric_split(app)
        if parametricinput_data is not None and len(
                parametricinput_data) > getConfig('DIRAC')['MaxDiracBulkJobs']:
            raise BackendError(
                'Dirac',
                'Number of bulk submission jobs \'%s\' exceeds the maximum allowed \'%s\'; if more are needed please modify your config. Note there is a hard limit in Dirac of currently 1000.'
                % (len(parametricinput_data),
                   getConfig('DIRAC')['MaxDiracBulkJobs']))
        # master job with no splitter or subjob already split proceed as normal
        else:
            input_data = inputLFNs

    if 'Destination' not in job.backend.settings and not inputLFNs and not hasOtherInputData:
        t1_sites = getConfig('DIRAC')['noInputDataBannedSites']
        logger.info(
            'Job has no inputdata (T1 sites will be banned to help avoid overloading them).'
        )
        if 'BannedSites' in job.backend.settings:
            job.backend.settings['BannedSites'].extend(t1_sites)
            job.backend.settings['BannedSites'] = unique(
                job.backend.settings['BannedSites'])
        else:
            if t1_sites:
                job.backend.settings['BannedSites'] = t1_sites[:]

    if not input_data and not parametricinput_data:
        input_data = inputLFNs

    return input_data, parametricinput_data
Example #5
def get_job_ident(dirac_script_lines):
    '''parse the dirac script for the label given to the job object'''
    target_line = [
        line for line in dirac_script_lines if line.find('Job()') >= 0]
    if len(target_line) != 1:
        raise BackendError(
            'Dirac', 'Could not determine the identifier of the Dirac Job object in API script')

    return target_line[0].split('=', 1)[0].strip()
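
A hypothetical usage sketch (not taken from the Ganga sources): given the lines of a generated DIRAC API script, get_job_ident() returns the variable name to which the Job() object was bound.

dirac_script_lines = [
    "from DIRAC.Interfaces.API.Job import Job",
    "j = Job()",
    "j.setName('my_job')",
]
assert get_job_ident(dirac_script_lines) == 'j'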
Example #6
def retrievePandaJobs(job, jIDs):
    '''
    Method for retrieving the panda job ids of panda jobs given a jobDefId
    '''
    from pandatools import Client

    ick = False
    jstatus = ''
    num_pjobs = 0

    logger.debug("retrievePandaJobs jIDs=%s" % jIDs)

    # get status from Panda server
    rc, jobsStatus = Client.getFullJobStatus(jIDs, False)
    if rc:
        logger.error('Return code %d retrieving job status information.', rc)
        raise BackendError(
            'Jedi', 'Return code %d retrieving job status information.' % rc)

    for status in jobsStatus:
        if not status: continue

        jstatus = status.jobStatus
        if status.jobStatus is None:
            logger.warning('No panda jobs expected')
            job.backend.pandajobs = []

        elif status.jobStatus in [
                "defined", "activated", "running", "failed", "finished",
                "holding", "assigned"
        ]:
            logger.debug('Panda jobs are running')
            logger.debug("PandaID: %d" % status.PandaID)

            pjobj = JediPandaJob()
            pjobj.id = status.PandaID
            pjobj.url = 'http://panda.cern.ch/?job=%d' % status.PandaID
            pjobj.jobSpec = dict(zip(status._attributes, status.values()))
            for k in pjobj.jobSpec.keys():
                if type(pjobj.jobSpec[k]) not in [type(''), type(1)]:
                    pjobj.jobSpec[k] = str(pjobj.jobSpec[k])

            if pjobj not in job.backend.pandajobs:
                job.backend.pandajobs.append(pjobj)
            else:
                logger.debug("Panda job %s already exists locally" % pjobj.id)

            num_pjobs += 1
        else:
            logger.warning(
                "getFullJobStatus returned unsupported status %s for Panda job %s "
                % (status.jobStatus, status.PandaID))

        ick = True

    return (ick, jstatus, num_pjobs)
Example #7
    def kill(self):
        """Kill a Dirac job"""
        if not self.id:
            return None
        dirac_cmd = 'kill(%d)' % self.id
        try:
            result = execute(dirac_cmd, cred_req=self.credential_requirements)
        except GangaDiracError as err:
            raise BackendError('Dirac', 'Could not kill job: %s' % err)
        return True
Example #8
def get_parametric_datasets(dirac_script_lines):
    '''parse the dirac script and retrieve the parametric inputdataset'''
    method_str = '.setParametricInputData('

    def parametric_input_filter(API_line):
        return API_line.find(method_str) >= 0
        # return API_line.find('.setParametricInputData(') >= 0

    parametric_line = filter(parametric_input_filter, dirac_script_lines)
    if len(parametric_line) == 0:
        raise BackendError(
            'Dirac', 'No "setParametricInputData()" lines in dirac API')
    if len(parametric_line) > 1:
        raise BackendError(
            'Dirac', 'Multiple "setParametricInputData()" lines in dirac API')

    end_method_marker = parametric_line[0].find(method_str) + len(method_str)
    dataset_str = parametric_line[0][end_method_marker:-1]
    return eval(dataset_str)
Example #9
    def list_sites_cloud(self, cloudName='', blacklist=True, req_str=''):

        if cloudName:
            cloudID = cloudName
        else:
            cloudID = self.cloud

        try:
            cloud = self._cloudNameList[cloudID]
        except KeyError:
            cloud = cloudID

        sites = getSites(cloud)

        # exclude sites
        for site in self.excluded_sites:
            if site in sites:
                sites.remove(site)

        # blacklist
        if blacklist:
            _refreshCESEInfo()
            for bad_site in CESEInfo['blacklist']:
                try:
                    sites.remove(bad_site)
                except ValueError:
                    pass

        if (req_str != '') and (self._name == 'AtlasLCGRequirements'):
            # check release requirements
            old_software = self.software
            self.software = [req_str]
            be = self._getParent()
            matches = be.get_requirement_matches()

            # find the ces for each site
            new_sites = []
            for site in sites:
                ces = getCEsForSites([site])
                for ce in ces:
                    if ce in matches:
                        new_sites.append(site)
                        break

            sites = new_sites

        if sites:
            return sites
        raise BackendError(
            'LCG',
            'Could not find any sites for selected cloud %s. Allowed clouds: %s'
            % (cloud, self._cloudNameList.keys()))
Example #10
def checkForRebrokerage(string):
    import re
    matchObj = re.match(
        r'reassigned to another site by rebrokerage. new PandaID=(\d+) JobsetID=(\d+) JobID=(\d+)',
        string)
    if matchObj:
        newPandaID = long(matchObj.group(1))
        newJobsetID = long(matchObj.group(2))
        newJobID = long(matchObj.group(3))
        return newPandaID
    raise BackendError(
        'Jedi',
        'Error getting new PandaID for rebrokered job. Report to DA Help')
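
A hypothetical usage sketch; the message format follows the regular expression above, and the concrete IDs are invented for illustration (Python 2, since the parser uses long()).

rebrokerage_msg = ('reassigned to another site by rebrokerage. '
                   'new PandaID=1234567 JobsetID=89 JobID=42')
assert checkForRebrokerage(rebrokerage_msg) == 1234567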
Example #11
def createContainer(name):
    from pandatools import Client
    # don't create containers for HC datasets
    if not configPanda['processingType'].startswith(
            'gangarobot') and not configPanda['processingType'].startswith(
                'hammercloud'):
        try:
            Client.createContainer(name, False)
            logger.info('Created output container %s' % name)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.createContainer %s: %s %s' %
                (name, sys.exc_info()[0], sys.exc_info()[1]))
Example #12
def test_all_exceptions(gpi):
    """Create all exceptions and make sure they behave correctly"""

    import Ganga.Core.exceptions
    test_str = "My Test Error"

    def exception_test(err_name):
        """Run tests on the given exception"""
        err_type = getattr(Ganga.Core.exceptions, err_name)
        err_obj = err_type(test_str)
        assert test_str in str(err_obj)

    err_list = [
        "GangaException", "GangaFileError", "PluginError",
        "ApplicationConfigurationError", "ApplicationPrepareError",
        "IncompleteJobSubmissionError", "IncompleteKillError",
        "JobManagerError", "GangaAttributeError", "GangaValueError",
        "GangaIOError", "SplitterError", "ProtectedAttributeError",
        "ReadOnlyObjectError", "TypeMismatchError", "SchemaError",
        "SchemaVersionError", "CredentialsError", "CredentialRenewalError",
        "InvalidCredentialError", "ExpiredCredentialError"
    ]

    for e in err_list:
        exception_test(e)

    # check the BackendError
    from Ganga.Core.exceptions import BackendError
    err = BackendError("TestBackend", test_str)
    assert "TestBackend" in str(err)
    assert test_str in str(err)

    # check the InaccessibleObjectError
    from Ganga.Core.exceptions import InaccessibleObjectError, JobManagerError
    from Ganga.Core.GangaRepository import getRegistry
    err = InaccessibleObjectError(
        getRegistry('jobs').repository, 0,
        JobManagerError("My JobManagerError"))
    assert "jobs" in str(err)
    assert "#0" in str(err)
    assert "My JobManagerError" in str(err)

    # check the RepositoryError
    from Ganga.Core.exceptions import RepositoryError
    from Ganga.Core.GangaRepository import getRegistry
    RepositoryError(getRegistry('jobs').repository, test_str)

    # Construct another to check the except clause in the exception is called
    RepositoryError(getRegistry('jobs').repository, test_str)
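
A minimal sketch of the raise-and-catch pattern the backend examples above rely on; submit_to_backend() is a hypothetical helper, and only the two-argument BackendError constructor (backend name, message) is taken from the test above.

from Ganga.Core.exceptions import BackendError

def submit_to_backend(jdl_path):
    # Hypothetical stand-in for a submission call that failed
    raise BackendError('Dirac', 'Could not submit %s' % jdl_path)

try:
    submit_to_backend('/tmp/job.jdl')
except BackendError as err:
    print('Submission failed: %s' % err)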
Example #13
    def command(klass, cmd, soutfile=None, allowed_exit=None):
        if allowed_exit is None:
            allowed_exit = [0]
        rc, soutfile, ef = shell_cmd(cmd, soutfile, allowed_exit)
        if not ef:
            logger.error(
                'Problem submitting batch job. Maybe your chosen batch system is not available or you have configured it wrongly'
            )
            with open(soutfile) as sout_file:
                logger.error(sout_file.read())
                raiseable = BackendError(
                    klass._name,
                    'It seems that %s commands are not installed properly:%s' %
                    (klass._name, sout_file.readline()))
        return rc, soutfile
Example #14
def dirac_outputfile_jdl(output_files, empty_SE_check):
    """
    This constructs the setOutputData such that the data will be sent to the chosen SE/Token
    In the case that the empty_SE_check is True it will raise an exception if the defaultSE is empty
    In the case that it's False an empty SE is allowed.
    Args:
        output_files (list): List of IGangaFile objects which are requested from job.outputfiles
        empty_SE_check (bool): If this is True then throw exception if DiracFile objects don't have any defaultSE set
    """

    _output_files = [
        this_file for this_file in output_files
        if isinstance(this_file, DiracFile)
    ]

    file_SE_dict = {}

    for this_file in _output_files:
        if this_file.defaultSE not in file_SE_dict:
            file_SE_dict[this_file.defaultSE] = []
        file_SE_dict[this_file.defaultSE].append(this_file.namePattern)

    per_SE_JDL = '''j.setOutputData(###OUTPUTDATA###, outputPath='###OUTPUT_PATH###', outputSE=###OUTPUT_SE###)'''
    total_JDL = ''

    for outputSE, namePatterns in file_SE_dict.iteritems():

        myLine = str(per_SE_JDL)
        myLine = myLine.replace('###OUTPUTDATA###', str(namePatterns))
        if outputSE != '':
            myLine = myLine.replace('###OUTPUT_SE###', str([outputSE]))
        else:
            if empty_SE_check:
                ## empty_SE_check is True: an empty defaultSE is an error
                raise BackendError(
                    "Dirac",
                    "Can't submit a DIRAC job with DiracFile outputfile without setting a defaultSE."
                )
            myLine = myLine.replace('###OUTPUT_SE###', str([]))

        total_JDL += myLine + "\n"

    return total_JDL
Example #15
    def checkReport(self, jobDoc):

        job = self.getJobObject()

        config = Config.getConfig('Metrics')
        location = config['location']
        if not os.path.exists(location):
            raise BackendError('CRAB',
                               'Location file %s does not exist.' % (location))

        config = ConfigParser()
        config.read(location)

        PARAMS = [('status', 'status')]

        if config.has_section('report'):
            PARAMS += config.items('report')
        else:
            logger.warning('No report in metrics')

        for n, v in PARAMS:
            if v:
                job.backend.report[v] = jobDoc.getAttribute(v)
Example #16
    def master_resubmit(self, jobs):
        '''Resubmit failed Jedi job'''
        from pandatools import Client

        jobIDs = {}
        for job in jobs:
            jobIDs[job.backend.id] = job

        allJobIDs = jobIDs.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                raise BackendError(
                    'Jedi',
                    'Return code %d retrieving job status information.' %
                    status)

            # Retrieve job
            job = jobIDs[jediTaskDict['jediTaskID']]

            newJobsetID = -1  # get jobset
            retryJobs = []  # jspecs
            resubmittedJobs = []  # ganga jobs

            if jediTaskDict['status'] in [
                    'failed', 'killed', 'cancelled', 'aborted', 'broken',
                    'finished'
            ]:
                retryJobs.append(job)
                resubmittedJobs.append(jID)
            #elif jediTaskDict['status'] == 'finished':
            #    pass
            else:
                logger.warning("Cannot resubmit. Jedi task %s is status %s." %
                               (jID, jediTaskDict['status']))
                return False

            # submit
            if len(retryJobs) == 0:
                logger.warning("No failed jobs to resubmit")
                return False

            status, out = Client.retryTask(jID, verbose=False)
            if status != 0:
                logger.error(status)
                logger.error(out)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            tmpStat, tmpDiag = out
            if not tmpStat:
                logger.error(tmpDiag)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            logger.info(tmpDiag)

            job.backend.status = None
            job.backend.jobSpec = {}
            job.updateStatus('submitted')

        logger.info('Resubmission successful')
        return True
Example #17
    def master_prepare(self, app, appconfig):
        '''Prepare the master job'''

        from pandatools import Client
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('ExecutablePandaRTHandler master_prepare called for %s',
                     job.getFQID('.'))

        # set chirp variables
        if configPanda['chirpconfig'] or configPanda['chirpserver']:
            setChirpVariables()

#       Pack inputsandbox
        inputsandbox = 'sources.%s.tar' % commands.getoutput(
            'uuidgen 2> /dev/null')
        inpw = job.getInputWorkspace()
        # add user script to inputsandbox
        if hasattr(job.application.exe, "name"):
            if job.application.exe not in job.inputsandbox:
                job.inputsandbox.append(job.application.exe)

        for fname in [f.name for f in job.inputsandbox]:
            fname = fname.rstrip(os.sep)
            path = fname[:fname.rfind(os.sep)]
            f = fname[fname.rfind(os.sep) + 1:]
            rc, output = commands.getstatusoutput(
                'tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    'Packing inputsandbox failed.')
        if len(job.inputsandbox) > 0:
            rc, output = commands.getstatusoutput('gzip %s' %
                                                  (inpw.getPath(inputsandbox)))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    'Packing inputsandbox failed.')
            inputsandbox += ".gz"
        else:
            inputsandbox = None

#       Upload Inputsandbox
        if inputsandbox:
            logger.debug('Uploading source tarball ...')
            uploadSources(inpw.getPath(), os.path.basename(inputsandbox))
            self.inputsandbox = inputsandbox
        else:
            self.inputsandbox = None

#       input dataset
        if job.inputdata:
            if job.inputdata._name != 'DQ2Dataset':
                raise ApplicationConfigurationError(
                    'PANDA application supports only DQ2Datasets')

        # run brokerage here if not splitting
        if not job.splitter:
            from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
            runPandaBrokerage(job)
        elif job.splitter._name not in [
                'DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask'
        ]:
            raise ApplicationConfigurationError(
                'Panda splitter must be DQ2JobSplitter or ArgSplitter')

        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(
                'site is still AUTO after brokerage!')

#       output dataset
        if job.outputdata:
            if job.outputdata._name != 'DQ2OutputDataset':
                raise ApplicationConfigurationError(
                    'Panda backend supports only DQ2OutputDataset')
        else:
            logger.info('Adding missing DQ2OutputDataset')
            job.outputdata = DQ2OutputDataset()

        job.outputdata.datasetname, outlfn = dq2outputdatasetname(
            job.outputdata.datasetname, job.id, job.outputdata.isGroupDS,
            job.outputdata.groupname)

        self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

        try:
            Client.addDataset(job.outputdata.datasetname,
                              False,
                              location=self.outDsLocation)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, self.outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname,
                                     location=self.outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.addDataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # handle the libds
        if job.backend.libds:
            self.libDataset = job.backend.libds
            self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
            self.library = self.fileBO.lfn
        elif job.backend.bexec:
            self.libDataset = job.outputdata.datasetname + '.lib'
            self.library = '%s.tgz' % self.libDataset
            try:
                Client.addDataset(self.libDataset,
                                  False,
                                  location=self.outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset,
                                         location=self.outDsLocation)
                logger.info('Lib dataset %s registered at %s' %
                            (self.libDataset, self.outDsLocation))
            except exceptions.SystemExit:
                raise BackendError(
                    'Panda', 'Exception in Client.addDataset %s: %s %s' %
                    (self.libDataset, sys.exc_info()[0], sys.exc_info()[1]))

        # collect extOutFiles
        self.extOutFile = []
        for tmpName in job.outputdata.outputdata:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.outputsandbox:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.backend.extOutFile:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        # create build job
        if job.backend.bexec != '':
            jspec = JobSpec()
            jspec.jobDefinitionID = job.id
            jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
            jspec.transformation = '%s/buildGen-00-00-01' % Client.baseURLSUB
            if Client.isDQ2free(job.backend.site):
                jspec.destinationDBlock = '%s/%s' % (
                    job.outputdata.datasetname, self.libDataset)
                jspec.destinationSE = 'local'
            else:
                jspec.destinationDBlock = self.libDataset
                jspec.destinationSE = job.backend.site
            jspec.prodSourceLabel = configPanda['prodSourceLabelBuild']
            jspec.processingType = configPanda['processingType']
            jspec.assignedPriority = configPanda['assignedPriorityBuild']
            jspec.computingSite = job.backend.site
            jspec.cloud = job.backend.requirements.cloud
            jspec.jobParameters = '-o %s' % (self.library)
            if self.inputsandbox:
                jspec.jobParameters += ' -i %s' % (self.inputsandbox)
            else:
                raise ApplicationConfigurationError(
                    'Executable on Panda with build job defined, but inputsandbox is empty!'
                )
            matchURL = re.search('(http.*://[^/]+)/', Client.baseURLCSRVSSL)
            if matchURL:
                jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
            if job.backend.bexec != '':
                jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(
                    job.backend.bexec)
                jspec.jobParameters += ' -r %s ' % '.'

            fout = FileSpec()
            fout.lfn = self.library
            fout.type = 'output'
            fout.dataset = self.libDataset
            fout.destinationDBlock = self.libDataset
            jspec.addFile(fout)

            flog = FileSpec()
            flog.lfn = '%s.log.tgz' % self.libDataset
            flog.type = 'log'
            flog.dataset = self.libDataset
            flog.destinationDBlock = self.libDataset
            jspec.addFile(flog)
            return jspec
        else:
            return None
Example #18
def dirac_inputdata(app):
    job = stripProxy(app).getJobObject()
    input_data = None
    parametricinput_data = None

    inputLFNs = []

    if hasattr(job.inputdata, 'getLFNs'):
        inputLFNs = job.inputdata.getLFNs()

    if job.master:
        logger.debug("job.master.inputdata: %s " % str(job.master.inputdata))
    logger.debug("job.inputdata: %s" % str(job.inputdata))
    if hasattr(job.inputdata, 'getLFNs'):
        logger.debug("getLFNs(): %s" % job.inputdata.getLFNs())

    has_input_DiracFile = False
    for this_file in job.inputfiles:
        if isType(this_file, DiracFile):
            has_input_DiracFile = True
            break
    if job.master and not has_input_DiracFile:
        for this_file in job.master.inputfiles:
            if isType(this_file, DiracFile):
                has_input_DiracFile = True
                break

    if len(inputLFNs) > 0:
        # master job with a splitter reaching prepare, hence bulk submit
        if not job.master and job.splitter:
            parametricinput_data = dirac_parametric_split(app)
            if parametricinput_data is not None and len(
                    parametricinput_data) > getConfig(
                        'DIRAC')['MaxDiracBulkJobs']:
                raise BackendError(
                    'Dirac',
                    'Number of bulk submission jobs \'%s\' exceeds the maximum allowed \'%s\'; if more are needed please modify your config. Note there is a hard limit in Dirac of currently 1000.'
                    % (len(parametricinput_data),
                       getConfig('DIRAC')['MaxDiracBulkJobs']))
        # master job with no splitter or subjob already split proceed as normal
        else:
            input_data = job.inputdata.getLFNs()

    elif 'Destination' not in job.backend.settings and not has_input_DiracFile:
        ##THIS IS NOT VERY DIRAC CENTRIC
        ##PLEASE WHEN TIME MOVE TO LHCBDIRAC where T1 is more applicable rcurrie
        ##Also editing the settings on the fly is asking for potential problems, should avoid
        t1_sites = getConfig('DIRAC')['noInputDataBannedSites']
        logger.info(
            'Job has no inputdata (T1 sites will be banned to help avoid overloading them).'
        )
        if 'BannedSites' in job.backend.settings:
            job.backend.settings['BannedSites'].extend(t1_sites)
            job.backend.settings['BannedSites'] = unique(
                job.backend.settings['BannedSites'])
        else:
            job.backend.settings['BannedSites'] = t1_sites[:]

    #import traceback
    # traceback.print_stack()

    return input_data, parametricinput_data
Example #19
    def _internal_job_finalisation(job, updated_dirac_status):
        """
        This method performs the main job finalisation
        Args:
            job (Job): This is the job we want to finalise
            updated_dirac_status (str): String representing the Ganga finalisation state of the job failed/completed
        """

        if updated_dirac_status == 'completed':
            start = time.time()
            # firstly update job to completing
            DiracBase._getStateTime(job, 'completing')
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us

            job.updateStatus('completing')
            if job.master:
                job.master.updateMasterJobStatus()

            output_path = job.getOutputWorkspace().getPath()

            logger.info('Contacting DIRAC for job: %s' % job.fqid)
            # Contact dirac which knows about the job
            job.backend.normCPUTime, getSandboxResult, file_info_dict, completeTimeResult = execute(
                "finished_job(%d, '%s')" % (job.backend.id, output_path),
                cred_req=job.backend.credential_requirements)

            now = time.time()
            logger.info(
                '%0.2fs taken to download output from DIRAC for Job %s' %
                ((now - start), job.fqid))

            #logger.info('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict))
            #logger.info('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult))
            #logger.info('Job ' + job.fqid + ' normCPUTime: ' + str(job.backend.normCPUTime))

            # Set DiracFile metadata
            wildcards = [
                f.namePattern for f in job.outputfiles.get(DiracFile)
                if regex.search(f.namePattern) is not None
            ]

            lfn_store = os.path.join(
                output_path,
                getConfig('Output')['PostProcessLocationsFileName'])

            # Make the file on disk with a nullop...
            if not os.path.isfile(lfn_store):
                with open(lfn_store, 'w'):
                    pass

            if job.outputfiles.get(DiracFile):

                # Now we can iterate over the contents of the file without touching it
                with open(lfn_store, 'ab') as postprocesslocationsfile:
                    if not hasattr(file_info_dict, 'keys'):
                        logger.error("Error understanding OutputDataInfo: %s" %
                                     str(file_info_dict))
                        raise GangaDiracError(
                            "Error understanding OutputDataInfo: %s" %
                            str(file_info_dict))

                    ## Caution: it is not clear atm whether this 'Value' is an LHCb-ism or a bug
                    list_of_files = file_info_dict.get('Value',
                                                       file_info_dict.keys())

                    for file_name in list_of_files:
                        file_name = os.path.basename(file_name)
                        info = file_info_dict.get(file_name)
                        #logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info)))

                        if not hasattr(info, 'get'):
                            logger.error(
                                "Error getting OutputDataInfo for: %s" %
                                str(job.getFQID('.')))
                            logger.error(
                                "Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!"
                            )
                            logger.error("Err: %s" % str(info))
                            logger.error("file_info_dict: %s" %
                                         str(file_info_dict))
                            raise GangaDiracError(
                                "Error getting OutputDataInfo")

                        valid_wildcards = [
                            wc for wc in wildcards
                            if fnmatch.fnmatch(file_name, wc)
                        ]
                        if not valid_wildcards:
                            valid_wildcards.append('')

                        for wc in valid_wildcards:
                            #logger.debug("wildcard: %s" % str(wc))

                            DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (
                                wc, file_name,
                                info.get('LFN', 'Error Getting LFN!'),
                                str(info.get('LOCATIONS', ['NotAvailable'])),
                                info.get('GUID', 'NotAvailable'))
                            #logger.debug("DiracFileData: %s" % str(DiracFileData))
                            postprocesslocationsfile.write(DiracFileData)
                            postprocesslocationsfile.flush()

                logger.debug("Written: %s" % open(lfn_store, 'r').readlines())

            # check outputsandbox downloaded correctly
            if not result_ok(getSandboxResult):
                logger.warning('Problem retrieving outputsandbox: %s' %
                               str(getSandboxResult))
                DiracBase._getStateTime(job, 'failed')
                if job.status in ['removed', 'killed']:
                    return
                elif (job.master
                      and job.master.status in ['removed', 'killed']):
                    return  # user changed it under us
                job.updateStatus('failed')
                if job.master:
                    job.master.updateMasterJobStatus()
                raise BackendError(
                    'Dirac', 'Problem retrieving outputsandbox: %s' %
                    str(getSandboxResult))

            # finally update job to completed
            DiracBase._getStateTime(job, 'completed', completeTimeResult)
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('completed')
            if job.master:
                job.master.updateMasterJobStatus()
            now = time.time()
            logger.debug('Job ' + job.fqid + ' Time for complete update : ' +
                         str(now - start))

        elif updated_dirac_status == 'failed':
            # firstly update status to failed
            DiracBase._getStateTime(job, 'failed')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('failed')
            if job.master:
                job.master.updateMasterJobStatus()

            # if requested try downloading outputsandbox anyway
            if configDirac['failed_sandbox_download']:
                execute("getOutputSandbox(%d,'%s')" %
                        (job.backend.id, job.getOutputWorkspace().getPath()),
                        cred_req=job.backend.credential_requirements)
        else:
            logger.error("Job #%s Unexpected dirac status '%s' encountered" %
                         (job.getFQID('.'), updated_dirac_status))
Example #20
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
        """Prepare the specific aspec of each subjob.
           Returns: subjobconfig list of objects understood by backends."""

        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec
        from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
        from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs
        
        # make sure we have the correct siteType
        refreshPandaSpecs()

        job = app._getParent()
        masterjob = job._getRoot()

        logger.debug('ProdTransPandaRTHandler prepare called for %s',
                     job.getFQID('.'))

        job.backend.actualCE = job.backend.site
        job.backend.requirements.cloud = Client.PandaSites[job.backend.site]['cloud']

        # check that the site is in a submit-able status
        if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
            allowed_sites = job.backend.list_ddm_sites()

        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
            tmpDsExist = False
            if (configPanda['processingType'].startswith('gangarobot') or configPanda['processingType'].startswith('hammercloud')):
                #if Client.getDatasets(job.outputdata.datasetname):
                if getDatasets(job.outputdata.datasetname):
                    tmpDsExist = True
                    logger.info('Re-using output dataset %s'%job.outputdata.datasetname)
            if not configPanda['specialHandling']=='ddm:rucio' and not  configPanda['processingType'].startswith('gangarobot') and not configPanda['processingType'].startswith('hammercloud') and not configPanda['processingType'].startswith('rucio_test'):
                Client.addDataset(job.outputdata.datasetname,False,location=outDsLocation,allowProdDisk=True,dsExist=tmpDsExist)
            logger.info('Output dataset %s registered at %s'%(job.outputdata.datasetname,outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
        except exceptions.SystemExit:
            raise BackendError('Panda','Exception in adding dataset %s: %s %s'%(job.outputdata.datasetname,sys.exc_info()[0],sys.exc_info()[1]))
        
        # JobSpec.
        jspec = JobSpec()
        jspec.currentPriority = app.priority
        jspec.jobDefinitionID = masterjob.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.coreCount = app.core_count
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
        jspec.homepackage = app.home_package
        jspec.transformation = app.transformation

        # set the transfer type (e.g. for directIO tests)
        if job.backend.requirements.transfertype != '':
            jspec.transferType = job.backend.requirements.transfertype

        jspec.destinationDBlock = job.outputdata.datasetname
        if job.outputdata.location:
            jspec.destinationSE = job.outputdata.location
        else:
            jspec.destinationSE = job.backend.site
        if job.inputdata:
            jspec.prodDBlock = job.inputdata.dataset[0]
        else:
            jspec.prodDBlock = 'NULL'
        if app.prod_source_label:
            jspec.prodSourceLabel = app.prod_source_label
        else:
            jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
        jspec.processingType = configPanda['processingType']
        if job.backend.requirements.specialHandling:
            jspec.specialHandling = job.backend.requirements.specialHandling
        else:
            jspec.specialHandling = configPanda['specialHandling']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.cmtConfig = app.atlas_cmtconfig
        if app.dbrelease == 'LATEST':
            try:
                latest_dbrelease = getLatestDBReleaseCaching()
            except:
                from pandatools import Client
                latest_dbrelease = Client.getLatestDBRelease()
            m = re.search(r'(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
            if m:
                self.dbrelease_dataset = m.group(1)
                self.dbrelease = m.group(2)
            else:
                raise ApplicationConfigurationError("Error retrieving LATEST DBRelease. Try setting application.dbrelease manually.")
        else:
            self.dbrelease_dataset = app.dbrelease_dataset
            self.dbrelease = app.dbrelease
        jspec.jobParameters = app.job_parameters

        if self.dbrelease:
            if self.dbrelease == 'current':
                jspec.jobParameters += ' --DBRelease=current' 
            else:
                if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                    jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
                else:
                    jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
                dbspec = FileSpec()
                dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
                dbspec.dataset = self.dbrelease_dataset
                dbspec.prodDBlock = jspec.prodDBlock
                dbspec.type = 'input'
                jspec.addFile(dbspec)

        if job.inputdata:
            m = re.search(r'(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)',
                          job.inputdata.dataset[0])
            if not m:
                logger.error("Error retrieving run number from dataset name")
                #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
                runnumber = 105200
            else:
                runnumber = int(m.group(2))
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --runNumber %d' % runnumber
            else:
                jspec.jobParameters += ' RunNumber=%d' % runnumber
        
        # Output files.
        randomized_lfns = []
        ilfn = 0
        for lfn, lfntype in zip(app.output_files,app.output_type):
            ofspec = FileSpec()
            if app.randomize_lfns:
                randomized_lfn = lfn + ('.%s.%d.%s' % (job.backend.site, int(time.time()), commands.getoutput('uuidgen 2> /dev/null')[:4] ) )
            else:
                randomized_lfn = lfn

            ofspec.lfn = randomized_lfn
            randomized_lfns.append(randomized_lfn)
            ofspec.destinationDBlock = jspec.destinationDBlock
            ofspec.destinationSE = jspec.destinationSE
            ofspec.dataset = jspec.destinationDBlock
            ofspec.type = 'output'
            jspec.addFile(ofspec)

            # remove the first section of the file name if it matches the file type
            if len(randomized_lfn.split('.')) > 1 and randomized_lfn.split('.')[0].find(lfntype) != -1:
                randomized_lfn = '.'.join(randomized_lfn.split('.')[1:])

            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --output%sFile %s' % (lfntype, randomized_lfn)
            else:
                jspec.jobParameters += ' output%sFile=%s' % (lfntype, randomized_lfn)
            ilfn += 1

        # Input files.
        if job.inputdata:
            for guid, lfn, size, checksum, scope in zip(job.inputdata.guids, job.inputdata.names, job.inputdata.sizes, job.inputdata.checksums, job.inputdata.scopes):
                ifspec = FileSpec()
                ifspec.lfn = lfn
                ifspec.GUID = guid
                ifspec.fsize = size
                ifspec.md5sum = checksum
                ifspec.scope = scope
                ifspec.dataset = jspec.prodDBlock
                ifspec.prodDBlock = jspec.prodDBlock
                ifspec.type = 'input'
                jspec.addFile(ifspec)
            if app.input_type:
                itype = app.input_type
            else:
                itype = m.group(5)

            # Change inputfile parameter depending on input type
            if job.backend.requirements.transfertype.upper() == 'DIRECT':
                tmp_in_file = "@tmpin_" + job.inputdata.dataset[0].split(':')[-1]
            else:
                tmp_in_file = ','.join(job.inputdata.names)

            # set the inputfile parameter
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --input%sFile %s' % (itype, tmp_in_file)
            else:
                jspec.jobParameters += ' input%sFile=%s' % (itype, tmp_in_file)

        # Log files.
        lfspec = FileSpec()
        lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
        lfspec.destinationDBlock = jspec.destinationDBlock
        lfspec.destinationSE  = jspec.destinationSE
        lfspec.dataset = jspec.destinationDBlock
        lfspec.type = 'log'
        jspec.addFile(lfspec)
        
        return jspec
Example #21
    def _resubmit(self):
        """Resubmit a DIRAC job"""
        j = self.getJobObject()
        parametric = False
        script_path = os.path.join(j.getInputWorkspace().getPath(),
                                   'dirac-script.py')
        # Check old script
        if j.master is None and not os.path.exists(script_path):
            raise BackendError('Dirac',
                               'No "dirac-script.py" found in j.inputdir')

        if j.master is not None and not os.path.exists(script_path):
            script_path = os.path.join(j.master.getInputWorkspace().getPath(),
                                       'dirac-script.py')
            if not os.path.exists(script_path):
                raise BackendError(
                    'Dirac',
                    'No "dirac-script.py" found in j.inputdir or j.master.inputdir'
                )
            parametric = True

        # Read old script
        with open(script_path, 'r') as f:
            script = f.read()

        # Create new script - ##note instead of using get_parametric_dataset
        # could just use j.inputdata.
        if parametric is True:
            parametric_datasets = get_parametric_datasets(script.split('\n'))
            if j.master:
                if len(parametric_datasets) != len(j.master.subjobs):
                    raise BackendError(
                        'Dirac',
                        'number of parametric datasets defined in API script doesn\'t match number of master.subjobs'
                    )
            if j.inputdata and len(j.inputdata) > 0:
                _input_files = [
                    f for f in j.inputdata if not isType(f, DiracFile)
                ]
            else:
                _input_files = []
            if set(parametric_datasets[j.id]).symmetric_difference(
                    set([f.namePattern for f in _input_files])):
                raise BackendError(
                    'Dirac',
                    'Mismatch between dirac-script and job attributes.')
            script = script.replace(
                '.setParametricInputData(%s)' % str(parametric_datasets),
                '.setInputData(%s)' % str(parametric_datasets[j.id]))
            script = script.replace('%n', str(j.id))  # name

        start_user_settings = '# <-- user settings\n'
        new_script = script[:script.find(start_user_settings) +
                            len(start_user_settings)]

        job_ident = get_job_ident(script.split('\n'))
        for key, value in self.settings.iteritems():
            if str(key).startswith('set'):
                _key = key[3:]
            else:
                _key = key
            if type(value) is str:
                template = '%s.set%s("%s")\n'
            else:
                template = '%s.set%s(%s)\n'
            new_script += template % (job_ident, str(_key), str(value))
        new_script += script[script.find('# user settings -->'):]

        # Save new script
        new_script_filename = os.path.join(j.getInputWorkspace().getPath(),
                                           'dirac-script.py')
        with open(new_script_filename, 'w') as f:
            f.write(new_script)
            f.flush()
        return self._common_submit(new_script_filename)
Example #22
    def parseResults(self):

        job = self.getJobObject()

        server = CRABServer()
        try:
            server.status(job)
            server.getOutput(job)
        except:
            logger.error('Could not get the output of the job.')
            # Let's not raise this yet (in case of a double call).
            # raise CRABServerError('Impossible to get the output of the job')

        workdir = job.inputdata.ui_working_dir
        index = int(job.id) + 1
        doc_path = '%s/res/crab_fjr_%d.xml' % (workdir, index)

        if not os.path.exists(doc_path):
            logger.error('FJR %s not found.' % (doc_path))
            return

        try:
            doc = parse(doc_path)
        except:
            logger.error("Could not parse document. File not present?")
            return
        status = doc.firstChild.getAttribute("Status")

        if status in ["Failed"]:
            self.postMortem(job)
            job.updateStatus('failed')
        elif status in ["Success"]:
            if job.status == 'submitting':
                job.updateStatus('submitted')
            job.updateStatus('completed')
        else:
            logger.warning("UNKNOWN PARSE STATUS: " + str(status))

        config = Config.getConfig('Metrics')
        location = config['location']
        if not os.path.exists(location):
            raise BackendError('CRAB',
                               'Location file %s does not exist.' % (location))

        config = ConfigParser()
        config.read(location)

        # Iterate over all of them
        SECTIONS = config.sections()
        if 'report' in SECTIONS:
            SECTIONS.remove('report')

        # Only five sections work here...
        for section in SECTIONS:

            if section not in job.backend.fjr:
                job.backend.fjr[section] = {}

            performancereport = doc.getElementsByTagName(
                "PerformanceReport")[0]
            performancesummary = performancereport.getElementsByTagName(
                "PerformanceSummary")
            for pfs in performancesummary:
                if pfs.getAttribute("Metric") == section:
                    metrics = pfs.getElementsByTagName("Metric")
                    for metric in metrics:
                        name = metric.getAttribute("Name")
                        if config.has_option(section, name):
                            # Because the names use a minus instead of an underscore, we have to do this workaround
                            # to send them to the DB.
                            name = config.get(section, name)
                            if name:
                                job.backend.fjr[section][
                                    name] = metric.getAttribute("Value")
Example #23
def dirac_outputfile_jdl(output_files, empty_SE_check):
    """
    This constructs the setOutputData such that the data will be sent to the chosen SE/Token
    In the case that the empty_SE_check is True it will raise an exception if the defaultSE is empty
    In the case that it's False an empty SE is allowed.
    Args:
        output_files (list): List of IGangaFile objects which are requested from job.outputfiles
        empty_SE_check (bool): If this is True then throw exception if DiracFile objects don't have any defaultSE set
    """

    _output_files = [
        this_file for this_file in output_files
        if isinstance(this_file, DiracFile)
    ]

    file_SE_dict = {}

    for this_file in _output_files:

        # Group files by destination SE
        if this_file.defaultSE not in file_SE_dict:
            file_SE_dict[this_file.defaultSE] = {}

        # Then group them by remoteDir
        remoteDir = this_file.expandString(this_file.remoteDir)
        if remoteDir not in file_SE_dict[this_file.defaultSE]:
            file_SE_dict[this_file.defaultSE][remoteDir] = []

        # Now can construct string to upload the file
        file_SE_dict[this_file.defaultSE][remoteDir].append(
            this_file.namePattern)

    per_SE_JDL = '''j.setOutputData(###OUTPUTDATA###, outputPath='###OUTPUT_PATH###', outputSE=###OUTPUT_SE###)'''
    total_JDL = ''

    ganga_defined_output_path = ""

    if output_files:
        job = output_files[0].getJobObject()
        if getConfig('DIRAC')['useGangaPath']:
            ganga_defined_output_path = 'GangaJob_%s/OutputFiles' % job.getFQID(
                '/')

    # Loop over all SE
    for outputSE, remote_dirs in file_SE_dict.iteritems():

        # Loop over all paths for the LFN
        for remote_dir, namePatterns in remote_dirs.iteritems():

            myLine = str(per_SE_JDL)
            myLine = myLine.replace('###OUTPUTDATA###', str(namePatterns))
            if outputSE != '':
                myLine = myLine.replace('###OUTPUT_SE###', str([outputSE]))
            else:
                if empty_SE_check:
                    ## empty_SE_check is True: an empty defaultSE is an error
                    raise BackendError(
                        "Dirac",
                        "Can't submit a DIRAC job with DiracFile outputfile without setting a defaultSE."
                    )
                myLine = myLine.replace('###OUTPUT_SE###', str([]))

            relative_path = ''
            if getConfig('DIRAC')['useGangaPath']:
                relative_path = ganga_defined_output_path

            if remote_dir:
                relative_path = remote_dir

            myLine = myLine.replace('###OUTPUT_PATH###', relative_path)

            total_JDL += myLine + "\n"

    return total_JDL
Example #24
    def master_updateMonitoringInformation(jobs):
        '''Monitor jobs'''
        from pandatools import Client

        #active_status = [ None, 'defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent', 'starting', 'running', 'holding', 'transferring' ]

        submitting_status = []
        active_status = [
            None, 'registered', 'waiting', 'defined', 'pending', 'assigning',
            'ready', 'scouting', 'running', 'holding', 'merging', 'prepared',
            'aborting', 'finishing'
        ]

        inactive_status = ['finished', 'aborted', 'broken', 'failed', 'done']

        # Find jobs to be monitored
        jobdict = {}
        for job in jobs:
            # add a delay as Panda can be a little slow in sorting out a new Task
            if job.backend.id and job.backend.status in active_status and (
                (datetime.datetime.utcnow() -
                 job.time.timestamps["submitted"]).seconds > 120):
                jobdict[job.backend.id] = job

        logger.debug("jobdict = %s" % jobdict)

        # Monitor active Jedi tasks
        allJobIDs = jobdict.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                #raise BackendError('Jedi','Return code %d retrieving job status information.' % status)
                continue
            # Retrieve job
            job = jobdict[jediTaskDict['jediTaskID']]
            # Store associated Panda jobs
            if job.backend.pandajobs:
                pandaJobIDs[job.backend.id] = [
                    pj.id for pj in job.backend.pandajobs
                ]
            else:
                pandaJobIDs[
                    jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
            logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

            # Fill the output data dataset list
            if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
                for ds in jediTaskDict['outDS'].split(','):
                    if ds not in job.outputdata.datasetList:
                        job.outputdata.datasetList.append(ds)

            # Jedi job status has changed
            if job.backend.status != jediTaskDict['status']:
                logger.debug('Job %s has changed status from %s to %s',
                             job.getFQID('.'), job.backend.status,
                             jediTaskDict['status'])
                job.backend.status = jediTaskDict['status']
                job.backend.reason = jediTaskDict['statistics']

                # Now update Jedi job status
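                # (summary of the mapping below:
                #   registered/waiting/defined/pending/assigning/ready -> submitted
                #   scouting/running/holding/merging/prepared          -> running
                #   done -> completed, failed/finished -> failed,
                #   aborted/broken/cancelled -> killed unless already final)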
                if jediTaskDict['status'] in [
                        'registered', 'waiting', 'defined', 'pending',
                        'assigning', 'ready'
                ]:
                    job.updateStatus('submitted')
                elif jediTaskDict['status'] in [
                        'scouting', 'running', 'holding', 'merging', 'prepared'
                ]:
                    job.updateStatus('running')
                elif jediTaskDict['status'] in ['done']:
                    job.updateStatus('completed')
                elif jediTaskDict['status'] in ['failed', 'finished']:
                    job.updateStatus('failed')
                elif jediTaskDict['status'] in [
                        'aborted', 'broken', 'cancelled'
                ] and job.status not in ['completed', 'failed']:
                    job.updateStatus('killed')
                else:
                    logger.warning('Unexpected Jedi task status %s',
                                   jediTaskDict['status'])

            # Check if associated Panda job exist and monitor them
            if not job.backend.pandajobs:
                jdefids = pandaJobIDs[jID]
                # skip if there are no Panda jobs yet
                if not jdefids:
                    continue
                tot_num_mjobs = 0

                do_master_update = True
                ick, status, num_mjobs = retrievePandaJobs(job, jdefids)
                logger.debug('retrievePandaJobs returns: %s %s' %
                             (repr(ick), status))
                if not ick:
                    logger.debug(
                        'Panda job retrieval failure for Jedi task %s with PandaIds %s'
                        % (job.backend.id, jdefids))
                    do_master_update = False

                tot_num_mjobs += num_mjobs
                logger.debug('Job %s retrieved %d Panda jobs' %
                             (job.getFQID('.'), tot_num_mjobs))
            # Now monitor the already attached Panda jobs
            else:
                jdefids = [pj.id for pj in job.backend.pandajobs]
                rc, jobsStatus = Client.getFullJobStatus(jdefids, False)
                if rc:
                    logger.error(
                        'Return code %d retrieving job status information.',
                        rc)
                    raise BackendError(
                        'Jedi',
                        'Return code %d retrieving job status information.' %
                        rc)

                for status in jobsStatus:
                    if not status: continue

                    for pjob in job.backend.pandajobs:
                        if pjob.id == status.PandaID:
                            # skip if no status change
                            if pjob.status == status.jobStatus:
                                continue
                            # Else update job record
                            pjob.jobSpec = dict(
                                zip(status._attributes, status.values()))

                            for k in pjob.jobSpec.keys():
                                if type(pjob.jobSpec[k]) not in [
                                        type(''), type(1)
                                ]:
                                    pjob.jobSpec[k] = str(pjob.jobSpec[k])

                            logger.debug(
                                'Job %s with Panda job %s has changed status from %s to %s',
                                job.getFQID('.'), pjob.id, pjob.status,
                                status.jobStatus)
                            pjob.status = status.jobStatus
                            pjob.exitcode = str(status.transExitCode)
                            pjob.piloterrorcode = str(status.pilotErrorCode)
                            pjob.reason = ''
                            for k in pjob.jobSpec.keys():
                                if k.endswith('ErrorDiag'
                                              ) and pjob.jobSpec[k] != 'NULL':
                                    pjob.reason += '%s: %s, ' % (
                                        k, str(pjob.jobSpec[k]))
                            #if job.backend.jobSpec['transExitCode'] != 'NULL':
                            pjob.reason += 'transExitCode: %s' % pjob.jobSpec[
                                'transExitCode']

                            if status.jobStatus in [
                                    'defined', 'unknown', 'assigned',
                                    'waiting', 'activated', 'sent'
                            ]:
                                logger.debug('Panda job %s %s' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus in [
                                    'starting', 'running', 'holding',
                                    'transferring', 'merging'
                            ]:
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus in ['finished']:
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus == 'failed':
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                                # check for server side retry
                                if 'taskBufferErrorDiag' in pjob.jobSpec and pjob.jobSpec[
                                        'taskBufferErrorDiag'].find(
                                            "PandaID=") != -1:
                                    # grab the new panda ID
                                    newPandaID = long(
                                        pjob.jobSpec['taskBufferErrorDiag'].
                                        split("=")[1])
                                    pjob.id = newPandaID
                                    pjob.status = None
                                    pjob.url = 'http://panda.cern.ch/?job=%d' % newPandaID
                            elif status.jobStatus == 'cancelled' and pjob.status not in [
                                    'completed', 'failed'
                            ]:  # bug 67716
                                logger.debug('Panda job %s cancelled' %
                                             pjob.id)
                                if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec[
                                        'taskBufferErrorDiag']:
                                    newPandaID = checkForRebrokerage(
                                        pjob.jobSpec['taskBufferErrorDiag'])
                                    logger.warning(
                                        "Subjob rebrokered by Panda server. Job %d moved to %d."
                                        % (pjob.id, newPandaID))
                                    pjob.id = newPandaID
                                    pjob.status = None
                            else:
                                logger.warning('Unexpected job status %s',
                                               status.jobStatus)
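
# A minimal, standalone polling sketch (illustrative only, not part of the
# backend class above); it reuses the same Client.getJediTaskDetails call as
# the monitoring loop to print the current state of a single Jedi task:
def print_jedi_task_status(task_id):
    from pandatools import Client
    status, task = Client.getJediTaskDetails(
        {'jediTaskID': task_id}, False, True, verbose=False)
    if status != 0:
        print('Failed to get task details for %s' % task_id)
        return
    print('Task %s is %s (%s)' % (task_id, task['status'], task.get('statistics', '')))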
Example #25
0
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):

        logger.debug("Prepare")

        inputsandbox, outputsandbox = sandbox_prepare(app, appsubconfig,
                                                      appmasterconfig,
                                                      jobmasterconfig)

        job = stripProxy(app).getJobObject()

        if job.inputdata:
            if not job.splitter:
                if len(job.inputdata) > 100:
                    raise BackendError(
                        "Dirac",
                        "You're submitting a job to Dirac with no splitter and more than 100 files, please add a splitter and try again!"
                    )

        outputfiles = [
            this_file for this_file in job.outputfiles
            if isType(this_file, DiracFile)
        ]

        data_str = 'import os\n'
        data_str += 'execfile(\'data.py\')\n'

        if hasattr(job, '_splitter_data'):
            data_str += job._splitter_data
        inputsandbox.append(FileBuffer('data-wrapper.py', data_str))
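        # data-wrapper.py therefore just does "import os; execfile('data.py')",
        # plus any per-subjob snippet injected by the splitter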

        input_data = []

        # Can't wait to get rid of this when people no longer specify
        # inputdata in the options file
        #######################################################################
        # splitters ensure that subjobs pick up inputdata from the job rather
        # than from the optsfiles, but we need to take care of unsplit jobs
        if not job.master:
            share_path = os.path.join(get_share_path(app), 'inputdata',
                                      'options_data.pkl')

            if not job.inputdata:
                if os.path.exists(share_path):
                    f = open(share_path, 'r+b')
                    job.inputdata = pickle.load(f)
                    f.close()

        #######################################################################

        # Can't wait to get rid of this when people no longer specify
        # outputsandbox or outputdata in the options file
        #######################################################################
        share_path = os.path.join(get_share_path(app), 'output',
                                  'options_parser.pkl')

        if os.path.exists(share_path):
            #        if not os.path.exists(share_path):
            # raise GangaException('could not find the parser')
            f = open(share_path, 'r+b')
            parser = pickle.load(f)
            f.close()

            outbox, outdata = parser.get_output(job)

            from Ganga.GPIDev.Lib.File import FileUtils
            from Ganga.GPIDev.Base.Filters import allComponentFilters

            fileTransform = allComponentFilters['gangafiles']
            outdata_files = [
                fileTransform(this_file, None) for this_file in outdata
                if not FileUtils.doesFileExist(this_file, job.outputfiles)
            ]
            job.non_copyable_outputfiles.extend([
                output_file for output_file in outdata_files
                if not isType(output_file, DiracFile)
            ])
            outbox_files = [
                fileTransform(this_file, None) for this_file in outbox
                if not FileUtils.doesFileExist(this_file, job.outputfiles)
            ]
            job.non_copyable_outputfiles.extend([
                outbox_file for outbox_file in outbox_files
                if not isType(outbox_file, DiracFile)
            ])

            outputsandbox.extend(
                [f.namePattern for f in job.non_copyable_outputfiles])

            outputsandbox.extend([
                f.namePattern for f in job.outputfiles
                if not isType(f, DiracFile)
            ])
            outputsandbox = unique(outputsandbox)  # + outbox[:])
        #######################################################################

        input_data_dirac, parametricinput_data = dirac_inputdata(
            job.application)

        if input_data_dirac is not None:
            for f in input_data_dirac:
                if isType(f, DiracFile):
                    input_data.append(f.lfn)
                elif isType(f, str):
                    input_data.append(f)
                else:
                    raise ApplicationConfigurationError(
                        "Don't know How to handle anythig other than DiracFiles or strings to LFNs!"
                    )

        commandline = "python ./gaudipython-wrapper.py"
        if is_gaudi_child(app):
            commandline = 'gaudirun.py '
            commandline += ' '.join([str(arg) for arg in app.args])
            commandline += ' options.pkl data-wrapper.py'
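            # e.g. with app.args == ['-T'] (hypothetical) this yields
            #   'gaudirun.py -T options.pkl data-wrapper.py'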
        logger.debug('Command line: %s: ', commandline)

        gaudi_script_path = os.path.join(job.getInputWorkspace().getPath(),
                                         "gaudi-script.py")

        script_generator(
            gaudi_script_template(),
            #remove_unreplaced = False,
            outputfile_path=gaudi_script_path,
            PLATFORM=app.platform,
            COMMAND=commandline,
            XMLSUMMARYPARSING=getXMLSummaryScript()  # ,
            #OUTPUTFILESINJECTEDCODE = getWNCodeForOutputPostprocessing(job, '    ')
        )

        #logger.debug( "input_data %s" % str( input_data ) )

        # We want to propagate the ancestor depth to DIRAC when we have
        # inputdata set
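        # (an explicit AncestorDepth in backend.settings wins over the dataset's own depth)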
        if job.inputdata is not None and isType(job.inputdata, LHCbDataset):

            # As the RT Handler we already know we have a Dirac backend
            if type(job.backend.settings) is not dict:
                raise ApplicationConfigurationError(
                    'backend.settings should be a dict')

            if 'AncestorDepth' in job.backend.settings:
                ancestor_depth = job.backend.settings['AncestorDepth']
            else:
                ancestor_depth = job.inputdata.depth
        else:
            ancestor_depth = 0

        lhcbdirac_script_template = lhcbdiracAPI_script_template()

        lhcb_dirac_outputfiles = lhcbdirac_outputfile_jdl(outputfiles)

        # It is no longer necessary to use lhcbdiracAPI_script_template as we now
        # do our own uploads to Dirac; remove after the Ganga 6 release.
        # NOTE special case for replicas: the replicate string must be empty for
        # no replication.
        dirac_script = script_generator(
            lhcbdirac_script_template,
            DIRAC_IMPORT=
            'from LHCbDIRAC.Interfaces.API.DiracLHCb import DiracLHCb',
            DIRAC_JOB_IMPORT=
            'from LHCbDIRAC.Interfaces.API.LHCbJob import LHCbJob',
            DIRAC_OBJECT='DiracLHCb()',
            JOB_OBJECT='LHCbJob()',
            NAME=mangle_job_name(app),
            APP_NAME=stripProxy(app).appname,
            APP_VERSION=app.version,
            APP_SCRIPT=gaudi_script_path,
            APP_LOG_FILE='Ganga_%s_%s.log' %
            (stripProxy(app).appname, app.version),
            INPUTDATA=input_data,
            PARAMETRIC_INPUTDATA=parametricinput_data,
            OUTPUT_SANDBOX=API_nullifier(outputsandbox),
            OUTPUTFILESSCRIPT=lhcb_dirac_outputfiles,
            # job.fqid,#outputdata_path,
            OUTPUT_PATH="",
            SETTINGS=diracAPI_script_settings(job.application),
            DIRAC_OPTS=job.backend.diracOpts,
            PLATFORM=app.platform,
            REPLICATE='True'
            if getConfig('DIRAC')['ReplicateOutputData'] else '',
            ANCESTOR_DEPTH=ancestor_depth,
            ## This is to be modified in the final 'submit' function in the backend
            ## The backend also handles any DiracFile inputfiles as appropriate
            INPUT_SANDBOX='##INPUT_SANDBOX##')
        logger.debug("prepare: LHCbGaudiDiracRunTimeHandler")

        return StandardJobConfig(dirac_script,
                                 inputbox=unique(inputsandbox),
                                 outputbox=unique(outputsandbox))
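
# A minimal sketch of the placeholder substitution assumed above: the working
# hypothesis is that script_generator fills '###KEY###' markers in the template
# with the keyword values passed to it, in the same spirit as the per_SE_JDL
# replacement shown earlier. Illustrative only:
def fill_template(template, **values):
    # replace every '###KEY###' marker with the string form of its value
    for key, value in values.items():
        template = template.replace('###%s###' % key, str(value))
    return template

# e.g. fill_template("j.setName('###NAME###')", NAME='my_job')
# returns "j.setName('my_job')"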