Example #1
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        groups = CMSGroupMapper.map_user_to_groups(dagAd["CRAB_UserHN"])
        if groups:
            dagAd["CMSGroups"] = groups

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])
        dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
        dagAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["TaskType"] = "ROOT"
        dagAd["X509UserProxy"] = info['user_proxy']

        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
            if not parent:
                resultAds = []
                schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                if resultAds:
                    id = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results)
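The parent/child split around HTCondorUtils.AuthenticatedSubprocess is easier to follow with a rough sketch of the fork-and-pipe pattern it appears to wrap: the child branch (parent is falsy) performs the spooled submission while authenticated with the user proxy and reports "OK" through the pipe, and the parent reads that status after the with block. The sketch below is illustrative only; every name and detail is an assumption, not the actual HTCondorUtils implementation.

    import os
    from contextlib import contextmanager

    @contextmanager
    def authenticated_subprocess(user_proxy):
        # Illustrative stand-in for HTCondorUtils.AuthenticatedSubprocess (assumption).
        rfd, wfd = os.pipe()
        pid = os.fork()
        if pid:
            # Parent: never runs the submission body, only reads the child's status.
            os.close(wfd)
            yield pid, os.fdopen(rfd)
        else:
            # Child: authenticate with the proxy, run the body, report the outcome.
            os.close(rfd)
            os.environ['X509_USER_PROXY'] = user_proxy
            try:
                yield 0, None
                os.write(wfd, b"OK")
            except Exception as ex:  # pylint: disable=broad-except
                os.write(wfd, str(ex).encode())
            finally:
                os._exit(0)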
Example #2
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        groups = CMSGroupMapper.map_user_to_groups(dagAd["CRAB_UserHN"])
        if groups:
            dagAd["CMSGroups"] = groups

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])
        dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
        dagAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["TaskType"] = "ROOT"
        dagAd["X509UserProxy"] = info['user_proxy']

        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
            if not parent:
                resultAds = []
                schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                if resultAds:
                    id = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results)
Example #3
    def executeInternal(self, *args, **kwargs):
        """Internal execution to submit to selected scheduler
           Before submission it does duplicate check to see if
           task was not submitted by previous time"""
        if not htcondor:
            raise Exception("Unable to import HTCondor module")

        task = kwargs["task"]
        workflow = task["tm_taskname"]
        info = args[0][0]
        # self.logger.debug("Task input information: %s" % str(info))
        dashboardParams = args[0][1]
        inputFiles = args[0][2]

        self.logger.debug("Starting duplicate check")
        dup = self.duplicateCheck(task)
        self.logger.debug("Duplicate check finished %s", dup)
        if dup is not None:
            return dup

        cwd = os.getcwd()
        os.chdir(kwargs["tempDir"])

        info["inputFilesString"] = ", ".join(inputFiles)
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info["outputFilesString"] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info["resthost"] = '"%s"' % (self.server["host"])
        # info['resthost'] = self.config.TaskWorker.resturl
        info["resturinoapi"] = '"%s"' % (self.restURInoAPI)

        try:
            info["remote_condor_setup"] = ""
            if task["tm_collector"]:
                self.backendurls["htcondorPool"] = task["tm_collector"]
            loc = HTCondorLocator.HTCondorLocator(self.backendurls)
            address = ""
            schedd = ""
            try:
                self.logger.debug("Getting schedd object")
                schedd, address = loc.getScheddObjNew(task["tm_schedd"])
                self.logger.debug("Got schedd object")
            except Exception as exp:
                msg = "The CRAB server backend was not able to contact the Grid scheduler."
                msg += " Please try again later."
                msg += " Message from the scheduler: %s" % (str(exp))
                self.logger.exception("%s: %s", workflow, msg)
                raise TaskWorkerException(msg)

            try:
                dummyAddress = loc.scheddAd["Machine"]
            except:
                raise TaskWorkerException("Unable to get schedd address for task %s" % (task["tm_taskname"]))

            # Get location of schedd-specific environment script from schedd ad.
            info["remote_condor_setup"] = loc.scheddAd.get("RemoteCondorSetup", "")

            info["CMSGroups"] = set.union(
                CMSGroupMapper.map_user_to_groups(kwargs["task"]["tm_username"]), kwargs["task"]["user_groups"]
            )
            self.logger.info("User %s mapped to local groups %s." % (kwargs["task"]["tm_username"], info["CMSGroups"]))

            self.logger.debug("Finally submitting to the schedd")
            if address:
                self.clusterId = self.submitDirect(schedd, "dag_bootstrap_startup.sh", arg, info)
            else:
                raise TaskWorkerException("Not able to get schedd address.")
            self.logger.debug("Submission finished")
        finally:
            os.chdir(cwd)

        configreq = {"workflow": kwargs["task"]["tm_taskname"], "status": "SUBMITTED", "subresource": "success"}
        self.logger.debug("Pushing information centrally %s", configreq)
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        self.sendDashboardJobs(dashboardParams, info["apmon"])

        return Result.Result(task=kwargs["task"], result=(-1))
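In this variant the payload arrives as a single positional container, so info, dashboardParams and inputFiles are read back as args[0][0], args[0][1] and args[0][2]. A tiny runnable illustration of that unpacking, with made-up dummy values (the real TaskWorker call site may differ):

    def demo(*args, **kwargs):
        # Mirrors how executeInternal above unpacks its payload (illustration only).
        info, dashboardParams, inputFiles = args[0][0], args[0][1], args[0][2]
        return info, dashboardParams, inputFiles, kwargs['task']

    # The caller presumably passes one sequence carrying the three items:
    print(demo(({'scratch': '/tmp'}, {}, ['RunJobs.dag']), task={'tm_taskname': 'demo_task'}))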
Example #4
    def executeInternal(self, info, dashboardParams, inputFiles, **kwargs):
        """Internal execution to submit to selected scheduler
           Before submission it does duplicate check to see if
           task was not submitted by previous time"""
        if not htcondor:
            raise Exception("Unable to import HTCondor module")

        task = kwargs['task']
        workflow = task['tm_taskname']

        cwd = os.getcwd()
        os.chdir(kwargs['tempDir'])

        info['start_time'] = task['tm_start_time']
        info['inputFilesString'] = ", ".join(inputFiles + ['subdag.ad'])
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info['outputFilesString'] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info['resthost'] = '"%s"' % (self.server['host'])
        #info['resthost'] = self.config.TaskWorker.resturl
        info['resturinoapi'] = '"%s"' % (self.restURInoAPI)

        try:
            info['remote_condor_setup'] = ''
            if task['tm_collector']:
                self.backendurls['htcondorPool'] = task['tm_collector']
            loc = HTCondorLocator.HTCondorLocator(self.backendurls)
            address = ""
            schedd = ""
            try:
                self.logger.debug("Getting schedd object")
                schedd, address = loc.getScheddObjNew(task['tm_schedd'])
                self.logger.debug("Got schedd object")
            except Exception as exp:
                msg = "The CRAB server backend was not able to contact the Grid scheduler."
                msg += " Please try again later."
                msg += " Message from the scheduler: %s" % (str(exp))
                self.logger.exception("%s: %s", workflow, msg)
                raise TaskWorkerException(msg, retry=True)

            try:
                dummyAddress = loc.scheddAd['Machine']
            except:
                raise TaskWorkerException("Unable to get schedd address for task %s" % (task['tm_taskname']), retry=True)

            # Get location of schedd-specific environment script from schedd ad.
            info['remote_condor_setup'] = loc.scheddAd.get("RemoteCondorSetup", "")

            info["CMSGroups"] = set.union(CMSGroupMapper.map_user_to_groups(kwargs['task']['tm_username']), kwargs['task']['user_groups'])
            self.logger.info("User %s mapped to local groups %s.", kwargs['task']['tm_username'], info["CMSGroups"])

            self.logger.debug("Finally submitting to the schedd")
            if address:
                self.clusterId = self.submitDirect(schedd, 'dag_bootstrap_startup.sh', arg, info)
            else:
                raise TaskWorkerException("Not able to get schedd address.", retry=True)
            self.logger.debug("Submission finished")
        finally:
            os.chdir(cwd)

        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "SUBMITTED",
                     'subresource': 'success',
                     'clusterid' : self.clusterId } # the condor cluster id of the DAG (actually of dag_bootstrap.sh, which becomes the DAG if everything goes well)
        self.logger.debug("Pushing information centrally %s", configreq)
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        self.sendDashboardJobs(dashboardParams, info['apmon'])

        return Result.Result(task=kwargs['task'], result=(-1))
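Compared to the previous example, the failure paths here raise TaskWorkerException with retry=True, marking the error as transient. A minimal sketch of how a dispatcher could honour that flag; this is an assumption about the caller's behaviour, not the actual TaskWorker code:

    class TaskWorkerException(Exception):
        # Minimal stand-in for the real exception class (assumption).
        def __init__(self, msg, retry=False):
            super(TaskWorkerException, self).__init__(msg)
            self.retry = retry

    def run_action(action, task):
        # Hypothetical dispatcher: transient failures get retried, the rest fail the task.
        try:
            return action(task)
        except TaskWorkerException as twe:
            if twe.retry:
                print("transient failure for %s, will retry later: %s" % (task, twe))
            else:
                print("permanent failure for %s: %s" % (task, twe))

    def flaky_submit(task):
        raise TaskWorkerException("schedd not reachable", retry=True)

    run_action(flaky_submit, "demo_task")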
Example #5
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## Add job and postjob log URLs
        job_retry = "%s.%s" % (self.job_id, crab_retry)
        new_submit_text += '+CRAB_JobLogURL = %s\n' % classad.quote(
            os.path.join(self.userWebDirPrx, "job_out." + job_retry + ".txt"))
        new_submit_text += '+CRAB_PostJobLogURL = %s\n' % classad.quote(
            os.path.join(self.userWebDirPrx, "postjob." + job_retry + ".txt"))
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = map(str, self.task_ad['CRAB_ResubmitList'])
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory = None
        numcores = None
        priority = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
                maxjobruntime = int(
                    str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
            elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
                maxjobruntime = int(
                    str(self.task_ad.lookup('MaxWallTimeMinsTail')))
            elif 'MaxWallTimeMinsRun' in self.task_ad:
                maxjobruntime = int(
                    str(self.task_ad.lookup('MaxWallTimeMinsRun')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
            if str(self.job_id) == '0':  #jobids can be like 1-1 for subjobs
                priority = 20  #the maximum for splitting jobs
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) - 1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory = self.resubmit_info[inkey].get('maxmemory')
            numcores = self.resubmit_info[inkey].get('numcores')
            priority = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory'] = maxmemory
        self.resubmit_info[outkey]['numcores'] = numcores
        self.resubmit_info[outkey]['priority'] = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey][
            'CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad

        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        savelogs = 0 if self.stage == 'probe' else self.task_ad.lookup(
            'CRAB_SaveLogsFlag')
        saveoutputs = 0 if self.stage == 'probe' else self.task_ad.lookup(
            'CRAB_TransferOutputs')
        new_submit_text += '+CRAB_TransferOutputs = {0}\n+CRAB_SaveLogsFlag = {1}\n'.format(
            saveoutputs, savelogs)
        if maxjobruntime is not None:
            new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(
                maxjobruntime)
            new_submit_text += '+MaxWallTimeMinsRun = %s\n' % str(
                maxjobruntime)  # how long it can run
            new_submit_text += '+MaxWallTimeMins = %s\n' % str(
                maxjobruntime)  # how long a slot it can match to
        # no plus sign for the next 3 attributes, since they are standard Condor attributes
        if maxmemory is not None:
            new_submit_text += 'RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += 'RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += 'JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
        pre_job_prio = 1
        if int(self.job_id.split('-')[0]) <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(
            self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry,
                                          use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(
                self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%s.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
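For orientation, everything assembled in new_submit_text is plain HTCondor submit-file text that gets prepended to the generic Job.submit. Under assumed values (job_id '3', crab_retry 1, a 1315-minute / 2000 MB / 1-core task with JobPrio 10 that transfers outputs and saves logs), the generated Job.3.submit would presumably start roughly as below, followed by the DESIRED_SITES lines from redo_sites, the CMSGroups attribute and the full generic Job.submit content. The values and the <placeholders> are illustrative only:

    +CRAB_Retry = 1
    +CRAB_JobLogURL = "<userWebDirPrx>/job_out.3.1.txt"
    +CRAB_PostJobLogURL = "<userWebDirPrx>/postjob.3.1.txt"
    +CRAB_TransferOutputs = 1
    +CRAB_SaveLogsFlag = 1
    +EstimatedWallTimeMins = 1315
    +MaxWallTimeMinsRun = 1315
    +MaxWallTimeMins = 1315
    RequestMemory = 2000
    RequestCpus = 1
    JobPrio = 10
    +PreJobPrio1 = 0
    +PostJobPrio1 = -<QDate>
    +PostJobPrio2 = 1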
Example #6
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = map(str, self.task_ad['CRAB_ResubmitList'])
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory     = None
        numcores      = None
        priority      = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
            elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsTail')))
            elif 'MaxWallTimeMins' in self.task_ad:
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
            if str(self.job_id) == '0': #jobids can be like 1-1 for subjobs
                priority = 20 #the maximum for splitting jobs
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) -  1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory     = self.resubmit_info[inkey].get('maxmemory')
            numcores      = self.resubmit_info[inkey].get('numcores')
            priority      = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory']     = maxmemory
        self.resubmit_info[outkey]['numcores']      = numcores
        self.resubmit_info[outkey]['priority']      = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        if maxjobruntime is not None:
            new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(maxjobruntime)
            new_submit_text += '+MaxWallTimeMins = (JobStatus=?=1) ? EstimatedWallTimeMins : %s\n' % str(maxjobruntime)
        if maxmemory is not None:
            new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += '+JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
        pre_job_prio = 1
        if int(self.job_id.split('-')[0]) <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%s.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
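The least obvious part of the use_resubmit_info branch is the choice of inkey: for retry N it starts from the parameters recorded for retry N-1 (or from retry 0 on the very first pass) and walks backwards until it finds a retry that actually recorded values. The same lookup, restated as a small self-contained snippet with a worked example:

    def pick_resubmit_key(resubmit_info, crab_retry):
        # Start from the previous retry (or 0 for the first submission) and fall
        # back to older retries until one with recorded parameters is found.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        return inkey

    # Retries 0 and 2 recorded parameters, retry 3 did not:
    info = {'0': {'maxmemory': 2000}, '2': {'maxmemory': 2500}}
    assert pick_resubmit_key(info, 4) == '2'   # retry 4 falls back past the missing '3'
    assert pick_resubmit_key(info, 1) == '0'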
Example #7
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = self.task_ad['CRAB_ResubmitList']
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory = None
        numcores = None
        priority = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMins' in self.task_ad:
                maxjobruntime = int(str(
                    self.task_ad.lookup('MaxWallTimeMins')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) - 1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory = self.resubmit_info[inkey].get('maxmemory')
            numcores = self.resubmit_info[inkey].get('numcores')
            priority = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory'] = maxmemory
        self.resubmit_info[outkey]['numcores'] = numcores
        self.resubmit_info[outkey]['priority'] = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey][
            'CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        if maxjobruntime is not None:
            new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
        if maxmemory is not None:
            new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += '+JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
        pre_job_prio = 1
        if self.job_id <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(
            self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry,
                                          use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(
                self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%d.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
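The CMSGroups value is passed through classad.quote so that it lands in the submit file as a properly quoted ClassAd string literal rather than a bare token. A tiny illustration; the group names are made up, and the quoting behaviour is as I understand the classad bindings:

    import classad

    groups = "highprio,express"          # hypothetical group string from CMSGroupMapper
    line = '+CMSGroups = %s\n' % classad.quote(groups)
    # line should now read: +CMSGroups = "highprio,express"
    print(line)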
Example #8
    def executeInternal(self, info, dashboardParams, inputFiles, **kwargs):
        """Internal execution to submit to selected scheduler
           Before submission it does duplicate check to see if
           task was not submitted by previous time"""
        if not htcondor:
            raise Exception("Unable to import HTCondor module")

        task = kwargs['task']
        workflow = task['tm_taskname']

        cwd = os.getcwd()
        os.chdir(kwargs['tempDir'])

        info['start_time'] = task['tm_start_time']
        info['inputFilesString'] = ", ".join(inputFiles + ['subdag.ad'])
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info['outputFilesString'] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info['resthost'] = '"%s"' % (self.crabserver.server['host'])
        info['dbinstance'] = '"%s"' % (self.crabserver.getDbInstance())

        try:
            info['remote_condor_setup'] = ''
            if task['tm_collector']:
                self.backendurls['htcondorPool'] = task['tm_collector']
            loc = HTCondorLocator.HTCondorLocator(self.backendurls)
            address = ""
            schedd = ""
            try:
                self.logger.debug("Getting schedd object")
                schedd, address = loc.getScheddObjNew(task['tm_schedd'])
                self.logger.debug("Got schedd object")
            except Exception as exp:
                msg = "The CRAB server backend was not able to contact the Grid scheduler."
                msg += " Please try again later."
                msg += " Message from the scheduler: %s" % (str(exp))
                self.logger.exception("%s: %s", workflow, msg)
                raise TaskWorkerException(msg, retry=True)

            try:
                dummyAddress = loc.scheddAd['Machine']
            except:
                raise TaskWorkerException("Unable to get schedd address for task %s" % (task['tm_taskname']), retry=True)

            # Get location of schedd-specific environment script from schedd ad.
            info['remote_condor_setup'] = loc.scheddAd.get("RemoteCondorSetup", "")

            info["CMSGroups"] = set.union(CMSGroupMapper.map_user_to_groups(kwargs['task']['tm_username']), kwargs['task']['user_groups'])
            self.logger.info("User %s mapped to local groups %s.", kwargs['task']['tm_username'], info["CMSGroups"])
            if not info["CMSGroups"]:
                raise TaskWorkerException("CMSGroups can not be empty. Failing task %s" % (task['tm_taskname']), retry=True)

            self.logger.debug("Finally submitting to the schedd")
            if address:
                try:
                    self.clusterId = self.submitDirect(schedd, 'dag_bootstrap_startup.sh', arg, info)
                except Exception as submissionError:
                    msg = "Something went wrong: %s \n" % str(submissionError)
                    if self.clusterId:
                        msg += "But a dagman_bootstrap was submitted with clusterId %s." % self.clusterId
                    else:
                        msg += 'No clusterId was returned to DagmanSubmitter.'
                    msg += " Clean up condor queue before trying again."
                    self.logger.error(msg)
                    constrain = 'crab_reqname=="%s"' % kwargs['task']['tm_taskname']
                    constrain = str(constrain)  # beware unicode, it breaks htcondor binding
                    self.logger.error("Sending: condor_rm -constrain '%s'", constrain)
                    schedd.act(htcondor.JobAction.Remove, constrain)
                    # raise again to communicate failure upstream
                    raise submissionError
            else:
                raise TaskWorkerException("Not able to get schedd address.", retry=True)
            self.logger.debug("Submission finished")
        finally:
            os.chdir(cwd)

        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "SUBMITTED",
                     'subresource': 'success',
                     'clusterid' : self.clusterId} # the condor cluster id of dag_bootstrap.sh
        self.logger.debug("Pushing information centrally %s", configreq)
        data = urllib.urlencode(configreq)
        self.crabserver.post(api='workflowdb', data=data)

        self.sendDashboardJobs(dashboardParams, info['apmon'])

        return Result.Result(task=kwargs['task'], result='OK')
Example #9
    def executeInternal(self, info, dashboardParams, inputFiles, **kwargs):
        """Internal execution to submit to selected scheduler
           Before submission it does duplicate check to see if
           task was not submitted by previous time"""
        if not htcondor:
            raise Exception("Unable to import HTCondor module")

        task = kwargs['task']
        workflow = task['tm_taskname']

        cwd = os.getcwd()
        os.chdir(kwargs['tempDir'])

        info['start_time'] = task['tm_start_time']
        info['inputFilesString'] = ", ".join(inputFiles + ['subdag.ad'])
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info['outputFilesString'] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info['resthost'] = '"%s"' % (self.server['host'])
        #info['resthost'] = self.config.TaskWorker.resturl
        info['resturinoapi'] = '"%s"' % (self.restURInoAPI)

        try:
            info['remote_condor_setup'] = ''
            if task['tm_collector']:
                self.backendurls['htcondorPool'] = task['tm_collector']
            loc = HTCondorLocator.HTCondorLocator(self.backendurls)
            address = ""
            schedd = ""
            try:
                self.logger.debug("Getting schedd object")
                schedd, address = loc.getScheddObjNew(task['tm_schedd'])
                self.logger.debug("Got schedd object")
            except Exception as exp:
                msg = "The CRAB server backend was not able to contact the Grid scheduler."
                msg += " Please try again later."
                msg += " Message from the scheduler: %s" % (str(exp))
                self.logger.exception("%s: %s", workflow, msg)
                raise TaskWorkerException(msg, retry=True)

            try:
                dummyAddress = loc.scheddAd['Machine']
            except:
                raise TaskWorkerException("Unable to get schedd address for task %s" % (task['tm_taskname']), retry=True)

            # Get location of schedd-specific environment script from schedd ad.
            info['remote_condor_setup'] = loc.scheddAd.get("RemoteCondorSetup", "")

            info["CMSGroups"] = set.union(CMSGroupMapper.map_user_to_groups(kwargs['task']['tm_username']), kwargs['task']['user_groups'])
            self.logger.info("User %s mapped to local groups %s.", kwargs['task']['tm_username'], info["CMSGroups"])

            self.logger.debug("Finally submitting to the schedd")
            if address:
                self.clusterId = self.submitDirect(schedd, 'dag_bootstrap_startup.sh', arg, info)
            else:
                raise TaskWorkerException("Not able to get schedd address.", retry=True)
            self.logger.debug("Submission finished")
        finally:
            os.chdir(cwd)

        configreq = {'workflow': kwargs['task']['tm_taskname'],
                     'status': "SUBMITTED",
                     'subresource': 'success',
                     'clusterid' : self.clusterId } # the condor cluster id of the DAG (actually of dag_bootstrap.sh, which becomes the DAG if everything goes well)
        self.logger.debug("Pushing information centrally %s", configreq)
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        self.sendDashboardJobs(dashboardParams, info['apmon'])

        return Result.Result(task=kwargs['task'], result=(-1))
Example #10
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = self.task_ad['CRAB_ResubmitList']
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory     = None
        numcores      = None
        priority      = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMins' in self.task_ad:
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) -  1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory     = self.resubmit_info[inkey].get('maxmemory')
            numcores      = self.resubmit_info[inkey].get('numcores')
            priority      = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory']     = maxmemory
        self.resubmit_info[outkey]['numcores']      = numcores
        self.resubmit_info[outkey]['priority']      = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        if maxjobruntime is not None:
            new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
        if maxmemory is not None:
            new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += '+JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
        pre_job_prio = 1
        if self.job_id <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## This is used to send the location of the logfiles to the dashboard
        try:
            storage_rules = htcondor.param['CRAB_StorageRules']
        except:
            storage_rules = "^/home/remoteGlidein,http://submit-5.t2.ucsd.edu/CSstoragePath"
        new_submit_text += '+CRAB_UserWebDir = "%s"\n' % getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)

        try:
            with open('proxied_webdir') as fd:
                proxied_webdir = fd.read()
            new_submit_text += '+CRAB_UserWebDirPrx = "%s"\n' % proxied_webdir
        except IOError as e:
            self.logger.error(("'I/O error(%s): %s', when looking for the proxied_webdir file. Might be normal"
                         " if the schedd does not have a proxiedurl in the REST external config." % (e.errno, e.strerror)))
        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%d.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
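The fallback CRAB_StorageRules value pairs a path pattern with a URL prefix, which getWebdirForDb presumably uses to translate the task's location into a browsable web directory. A rough sketch of that kind of rewrite, purely as an assumption about the rule format (the real getWebdirForDb takes the CRAB_ReqName and may work differently):

    import re

    def webdir_from_rules(local_path, storage_rules):
        # Assumed semantics: each rule is "pattern,replacement"; apply the first
        # pattern that matches as a regex substitution. Not the real implementation.
        for rule in storage_rules.split(';'):
            pattern, replacement = rule.split(',', 1)
            if re.search(pattern, local_path):
                return re.sub(pattern, replacement, local_path)
        return local_path

    rules = "^/home/remoteGlidein,http://submit-5.t2.ucsd.edu/CSstoragePath"
    print(webdir_from_rules("/home/remoteGlidein/190101_120000:user_task", rules))
    # -> http://submit-5.t2.ucsd.edu/CSstoragePath/190101_120000:user_task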