Example #1
def makeAds( config ):
    reversed_mapping = config['reversed_mapping']

    needs_site = defaultdict(set)
    for workflow, tasks in config['modifications'].items():
        for taskname, specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            exp = 'regexp(target.WMAgent_SubTaskName, %s)'% classad.quote(str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s"% taskname)
                anAd["eval_set_DESIRED_Sites"] = str(",".join(specs['ReplaceSiteWhitelist']))
                print(anAd)
            elif "AddWhitelist" in specs:
                for site in specs['AddWhitelist']:
                    needs_site[site].add(taskname)
 

    for site in needs_site:
        if site not in reversed_mapping: continue
        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = str("Overflow rule to go to %s"%site)
        anAd["OverflowTasknames"] = map(str, needs_site[site])
        #exp = classad.ExprTree('regexp(%s, ExtDESIRED_Sites) && member(target.WMAgent_SubTaskName, OverflowTasknames)' % classad.quote(str(site)))
        exprs = ['regexp(%s, ExtDESIRED_Sites)'% classad.quote(str(origin)) for origin in reversed_mapping[site]]
        exp = classad.ExprTree('member(target.WMAgent_SubTaskName, OverflowTasknames) && ( %s )' % str("||".join( exprs )))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        #anAd["eval_set_DESIRED_Sites"] = classad.Function("strcat", str(",".join( reversed_mapping[site]+[''] )), classad.Attribute("ExtDESIRED_Sites"))
        anAd["eval_set_DESIRED_Sites"] = classad.Function("strcat", str(site), classad.Attribute("ExtDESIRED_Sites"))
        print(anAd)
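
A note on why these routing-ad snippets consistently wrap raw strings in classad.quote() before splicing them into an expression: quote() returns a double-quoted, escaped ClassAd string literal, so a value containing quotes cannot break out of the surrounding expression. A minimal sketch (the task name is made up):

import classad

taskname = 'reco_2017 "v2"'  # hypothetical value containing a quote
exp = 'regexp(target.WMAgent_SubTaskName, %s)' % classad.quote(taskname)
print(classad.ExprTree(exp))  # the inner quotes arrive safely escaped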
Example #2
def getSubmitFileAdditions(resource_ad):
    """Returns additions to a submit file (as list of strings) to make the
    submitted job match the given resource

    """
    global _logger

    lines = []
    if 'grid_resource' in resource_ad:
        lines.append('+GridResource = %s' % classad.quote(resource_ad['grid_resource']))
    if 'Transform' in resource_ad:
        transform_ad = resource_ad['Transform']
        set_lines = []
        copy_lines = []
        eval_set_lines = []
        for key, value in transform_ad.items():
            if key.startswith('set_'):
                set_lines.append('+%s = %s' % (key.replace('set_', '', 1), value))
            elif key.startswith('copy_'):
                if value in resource_ad:
                    copy_lines.append('+%s = %s' % (key.replace('copy_', '', 1), resource_ad[value]))
                else:
                    _logger.warning("Ignoring '%s': '%s' missing from Resource Ad" % (key, value))
            elif key.startswith('eval_set_'):
                eval_set_lines.append('+%s = %s' % (key.replace('eval_set_', '', 1), resource_ad.eval(value)))
            elif key.startswith('delete_'):
                _logger.warning("Ignoring '%s': 'delete' transforms not supported", key)
            else:
                _logger.warning("Ignoring '%s': unknown transform type", key)
        lines += copy_lines + set_lines + eval_set_lines
    return lines
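
For context, a hedged usage sketch of the helper above, with a hand-built resource ad (attribute values are made up; the Transform sub-ad is what drives the set_/copy_/eval_set_ handling):

import classad

resource_ad = classad.ClassAd()
resource_ad['grid_resource'] = 'condor localhost localhost'
resource_ad['Transform'] = classad.ClassAd({'set_DESIRED_Sites': '"T2_US_Nebraska"'})
for line in getSubmitFileAdditions(resource_ad):
    print(line)  # e.g. +GridResource = "condor localhost localhost"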
Example #3
    def updateSiteInformation(self, jobs, siteName, excludeSite):
        """
        _updateSiteInformation_

        Allow or disallow jobs to run at a site.
        Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.

        Kill the job if, after removing the site from its allowed sites, it has nowhere left to run.

        Parameters:    excludeSite = False when moving to Normal
                       excludeSite = True when moving to Down, Draining or Aborted
        """
        schedd = htcondor.Schedd()

        jobtokill = []
        try:
            itobj = schedd.xquery('WMAgent_AgentName =?= %s && JobStatus =?= 1' % classad.quote(self.agent),
                                  ['ClusterId', 'ProcId', 'DESIRED_Sites', 'ExtDESIRED_Sites'])
        except Exception as ex:
            logging.error("Failed to query condor schedd.")
            logging.exception(ex)
            return jobtokill
        else:
            jobInfo = {}
            for jobAd in itobj:
                gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
                jobInfo[gridId] = jobAd
            for job in jobs:
                jobAd = jobInfo.get(job['gridid'], None)
                if jobAd:
                    desiredSites = jobAd.get('DESIRED_Sites', '').split(',')
                    extDesiredSites = jobAd.get('ExtDESIRED_Sites', '').split(',')
                    if excludeSite:
                        # Remove siteName from DESIRED_Sites if job has it
                        if siteName in desiredSites:
                            if len(desiredSites) > 1:
                                desiredSites.remove(siteName)
                                desiredSites = ','.join(desiredSites)
                                try:
                                    schedd.edit([job['gridid']], 'DESIRED_Sites', classad.ExprTree('"%s"' % desiredSites))
                                except Exception as ex:
                                    logging.error("Failed to edit sites for job %s" % job['gridid'])
                                    logging.exception(ex)
                            else:
                                jobtokill.append(job)
                    else:
                        # Add siteName to DESIRED_Sites if ExtDESIRED_Sites has it (moving back to Normal)
                        if siteName not in desiredSites and siteName in extDesiredSites:
                            desiredSites.append(siteName)
                            desiredSites = ','.join(sorted(desiredSites))
                            try:
                                schedd.edit([job['gridid']], 'DESIRED_Sites', classad.ExprTree('"%s"' % desiredSites))
                            except Exception as ex:
                                logging.error("Failed to edit sites for job %s" % job['gridid'])
                                logging.exception(ex)

        return jobtokill
Example #4
    def getClassAds(self):
        """
        _getClassAds_

        Grab CONDOR classAds using CONDOR-PYTHON

        This looks at the schedd running on the
        submit host, whose jobs can then be edited/removed.
        """

        jobInfo = {}
        schedd = condor.Schedd()

        try:
            logging.debug("Start: Retrieving classAds using Condor Python XQuery")
            itobj = schedd.xquery(
                'WMAgent_JobID =!= "UNDEFINED" && WMAgent_AgentName == %s' % classad.quote(str(self.agent)),
                ["JobStatus", "EnteredCurrentStatus", "JobStartDate", "QDate", "DESIRED_Sites",
                 "ExtDESIRED_Sites", "MATCH_EXP_JOBGLIDEIN_CMSSite", "WMAgent_JobID"]
                )
            logging.debug("Finish: Retrieving classAds using Condor Python XQuery")
        except Exception:
            msg = "Query to condor schedd failed in PyCondorPlugin"
            logging.debug(msg)
            return None, None
        else:
            for slicedAds in grouper(itobj, 1000):
                for jobAd in slicedAds:
                    ### This condition ignores jobs that are Removed, but stay in the X state
                    ### For manual condor_rm removal, the job won't be in the queue
                    ### and the status of the job will be read from the condor log
                    if jobAd["JobStatus"] == 3:
                        continue
                    else:
                        ## For some strange race condition, schedd sometimes does not publish StartDate for a Running Job
                        ## Get the entire classad for such a job
                        ## Do not crash WMA, wait for next polling cycle to get all the info.
                        if jobAd["JobStatus"] == 2 and jobAd.get("JobStartDate") is None:
                            logging.debug("THIS SHOULD NOT HAPPEN. JobStartDate is MISSING from the CLASSAD.")
                            logging.debug("Could be caused by some race condition. Wait for the next Polling Cycle")
                            logging.debug("%s", str(jobAd))
                            continue

                        tmpDict = {}
                        tmpDict["JobStatus"] = int(jobAd.get("JobStatus", 100))
                        tmpDict["stateTime"] = int(jobAd["EnteredCurrentStatus"])
                        tmpDict["runningTime"] = int(jobAd.get("JobStartDate", 0))
                        tmpDict["submitTime"] = int(jobAd["QDate"])
                        tmpDict["DESIRED_Sites"] = jobAd["DESIRED_Sites"]
                        tmpDict["ExtDESIRED_Sites"] = jobAd["ExtDESIRED_Sites"]
                        tmpDict["runningCMSSite"] = jobAd.get("MATCH_EXP_JOBGLIDEIN_CMSSite", None)
                        tmpDict["WMAgentID"] = int(jobAd["WMAgent_JobID"])
                        jobInfo[tmpDict["WMAgentID"]] = tmpDict

            logging.info("Retrieved %i classAds", len(jobInfo))

        return jobInfo, schedd
Example #5
def get_schedd_ads(environ):
    pool = _get_pool(environ)
    coll = htcondor.Collector(pool)
    if pool:
        name = _get_name(environ)
        if name:
            return [coll.query(htcondor.AdTypes.Schedd, "Name=?=%s" % classad.quote(name))[0]]
        else:
            return coll.query(htcondor.AdTypes.Schedd, "true")
    return [coll.locate(htcondor.DaemonTypes.Schedd)]
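
For reference, the same lookups without the WSGI environ plumbing; a sketch assuming a reachable central manager (host names are hypothetical):

import htcondor
import classad

coll = htcondor.Collector("cm.example.org")  # hypothetical pool
named = coll.query(htcondor.AdTypes.Schedd,
                   "Name =?= %s" % classad.quote("schedd.example.org"))
local = htcondor.Collector().locate(htcondor.DaemonTypes.Schedd)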
Example #6
    def killWorkflowJobs(self, workflow):
        """
        _killWorkflowJobs_

        Kill all the jobs belonging to a specific workflow.
        """
        sd = condor.Schedd()
        logging.debug("Going to remove all the jobs for workflow %s", workflow)
        sd.act(condor.JobAction.Remove, 'WMAgent_RequestName == %s' % classad.quote(str(workflow)))

        return
Example #7
    def updateJobInformation(self, workflow, task, **kwargs):
        """
        _updateJobInformation_

        Update job information for all jobs in the workflow and task,
        the change will take effect if the job is Idle or becomes idle.

        The currently supported changes are only priority for which both the task (taskPriority)
        and workflow priority (requestPriority) must be provided.
        """
        sd = condor.Schedd()
        if 'taskPriority' in kwargs and 'requestPriority' in kwargs:
            # Do a priority update
            priority = (int(kwargs['requestPriority']) + int(kwargs['taskPriority']) * self.maxTaskPriority)
            try:
                sd.edit('WMAgent_JobID =!= "UNDEFINED" && WMAgent_SubTaskName == %s && WMAgent_RequestName == %s && JobPrio != %d' %
                        (classad.quote(str(task)), classad.quote(str(workflow)), int(priority)), "JobPrio", classad.Literal(int(priority)))
            except Exception:
                msg = "Couldn't edit classAd to change job priority for WMAgent_SubTaskName=%s, WMAgent_RequestName=%s" % (task, workflow)
                logging.debug(msg)

        return
Example #8
def makeReleaseAds(config):
    """
    Create a set of rules to release a task to match
    """
    for task, where in config.get('release', {}).items():
        anAd = classad.ClassAd()
        anAd["Name"] = str("Releasing task %s"%(task))
        anAd["GridResource"] = "condor localhost localhost"
        exp = '(HasBeenSetHeld is true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(task))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        anAd["copy_Held_DESIRED_Sites"] = "DESIRED_Sites"
        anAd["set_HasBeenRouted"] = False
        anAd["set_HasBeenSetHeld"] = False
        print(anAd)
Example #9
def main():
    opts = parse_opts()

    users = set()
    for line in open(opts.local_users):
        line = line.strip()
        if line.startswith("#"): continue
        users.add(line)

    collectors = set()
    for pool in opts.pool:
        coll = htcondor.Collector(pool)
        collectors.add(coll)
        if not opts.quiet: print("Querying collector %s for schedds matching" % pool, opts.const, file=sys.stderr)

    reqs = '(JobStatus == 1) && stringListMember(%s, DESIRED_Sites)' % classad.quote(opts.site)
    idle_count = {}
    for user in users:
        if user == "*": continue
        idle_count.setdefault(user, 0)
    user_map = {}
    if not opts.quiet: print("Schedd job requirements:", reqs, file=sys.stderr)
    for coll in collectors:
        for schedd_ad in coll.query(htcondor.AdTypes.Schedd, opts.const, ['MyAddress', 'CondorVersion', 'Name', 'ScheddIpAddr']):
            if not opts.quiet: print("Querying", schedd_ad.get('Name', "Unknown"), file=sys.stderr)
            schedd = htcondor.Schedd(schedd_ad)
            try:
                if opts.jobs_only:
                    schedd_data = schedd.xquery(requirements=reqs, projection=["x509userproxysubject", "CRAB_UserHN", "JobStatus"])
                else:
                    schedd_data = schedd.xquery(requirements=reqs, projection=["x509userproxysubject", "CRAB_UserHN", "JobStatus"], opts=htcondor.QueryOpts.AutoCluster)
            except RuntimeError as e:
                if not opts.quiet: print("Error querying %s: %s" % (schedd_ad.get('Name', "Unknown"), e), file=sys.stderr)
                continue
            if not opts.jobs_only:
                for cluster in schedd_data:
                    user = cluster.get("CRAB_UserHN")
                    if (user in users) or ("*" in users):
                        idle_count.setdefault(user, 0)
                        idle_count[user] += int(cluster.get("JobCount", 0))
                        if 'x509userproxysubject' in cluster:
                            user_map[user] = cluster['x509userproxysubject']
            if opts.jobs_only:
                for job in schedd_data:
                    user = job.get("CRAB_UserHN")
                    if (user in users) or ("*" in users):
                        idle_count.setdefault(user, 0)
                        idle_count[user] += 1
                        if 'x509userproxysubject' in job:
                            user_map[user] = job['x509userproxysubject']
Example #10
    def updateJobInformation(self, workflow, task, **kwargs):
        """
        _updateJobInformation_

        Update job information for all jobs in the workflow and task,
        the change will take effect if the job is Idle or becomes idle.

        The currently supported changes are only priority for which both the task (taskPriority)
        and workflow priority (requestPriority) must be provided.
        """
        schedd = htcondor.Schedd()

        if 'taskPriority' in kwargs and 'requestPriority' in kwargs:
            newPriority = int(kwargs['requestPriority']) + int(kwargs['taskPriority']) * self.maxTaskPriority
            try:
                constraint = "WMAgent_SubTaskName =?= %s" % classad.quote(str(task))
                constraint += " && WMAgent_RequestName =?= %s" % classad.quote(str(workflow))
                constraint += " && JobPrio =!= %d" % newPriority
                schedd.edit(constraint, 'JobPrio', classad.Literal(newPriority))
            except Exception as ex:
                logging.error("Failed to update JobPrio for WMAgent_SubTaskName=%s", task)
                logging.exception(ex)

        return
Example #11
    def killWorkflowJobs(self, workflow):
        """
        _killWorkflowJobs_

        Kill all the jobs belonging to a specific workflow.
        """
        logging.info("Going to remove all the jobs for workflow %s", workflow)

        schedd = htcondor.Schedd()

        try:
            schedd.act(htcondor.JobAction.Remove, "WMAgent_RequestName == %s" % classad.quote(str(workflow)))
        except RuntimeError:
            logging.warning("Error while killing jobs on the schedd: WMAgent_RequestName=%s", workflow)

        return
Example #12
    def executeAll(self, joblist=None, attributes=None, values=None):
        """ Given equal sized lists of job ids, attributes and values,
            executes in one large transaction a single qedit for each job.
        """
        global disk_cache
        joblist = joblist or []
        attributes = attributes or []
        values = values or []
        if not (len(joblist) == len(attributes) == len(values)):
            raise QueryError(
                "Arguments to QEdit.executeAll should have the same length")
        try:
            htcondor_full_reload()
            if self.pool_name:
                collector = htcondor.Collector(str(self.pool_name))
            else:
                collector = htcondor.Collector()

            if self.schedd_name:
                schedd_ad = disk_cache.get(self.schedd_name + '.locate')
                if schedd_ad is None:
                    schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd,
                                                 self.schedd_name)
                    disk_cache.save(self.schedd_name + '.locate', schedd_ad)

                schedd = htcondor.Schedd(schedd_ad)
            else:
                schedd = htcondor.Schedd()
            with schedd.transaction() as _:
                for jobid, attr, val in zip(joblist, attributes, values):
                    schedd.edit([jobid], attr, classad.quote(val))
        except Exception as ex:
            s = 'default'
            if self.schedd_name is not None:
                s = self.schedd_name
            p = 'default'
            if self.pool_name is not None:
                p = self.pool_name
            try:
                j1 = jobid
                j2 = attr
                j3 = val
            except NameError:
                j1 = j2 = j3 = 'unknown'
            err_str = 'Error querying schedd %s in pool %s using python bindings (qedit of job/attr/val %s/%s/%s): %s' % (
                s, p, j1, j2, j3, ex)
            raise QueryError(err_str)
Example #13
    def getClassAds(self):
        """
        _getClassAds_

        Grab CONDOR classAds using CONDOR-PYTHON

        This looks at the schedd running on the
        submit host, whose jobs can then be edited/removed.
        """

        jobInfo = {}
        schedd = condor.Schedd()

        try:
            logging.debug("Start: Retrieving classAds using Condor Python XQuery")
            itobj = schedd.xquery(
                'WMAgent_JobID =!= "UNDEFINED" && WMAgent_AgentName == %s' % classad.quote(str(self.agent)),
                ["JobStatus", "EnteredCurrentStatus", "JobStartDate", "QDate", "DESIRED_Sites",
                 "ExtDESIRED_Sites", "MATCH_EXP_JOBGLIDEIN_CMSSite", "WMAgent_JobID"]
                )
            logging.debug("Finish: Retrieving classAds using Condor Python XQuery")
        except Exception:
            msg = "Query to condor schedd failed in PyCondorPlugin"
            logging.error(msg)
            return None, None
        else:
            for jobAd in itobj:
                ### This condition ignores jobs that are Removed, but stay in the X state
                ### For manual condor_rm removal, the job won't be in the queue
                ### and the status of the job will be read from the condor log
                if jobAd["JobStatus"] == 3:
                    continue
                else:
                    tmpDict = {}
                    tmpDict["JobStatus"] = int(jobAd.get("JobStatus", 100))
                    tmpDict["stateTime"] = int(jobAd["EnteredCurrentStatus"])
                    tmpDict["runningTime"] = int(jobAd.get("JobStartDate", 0))
                    tmpDict["submitTime"] = int(jobAd["QDate"])
                    tmpDict["DESIRED_Sites"] = jobAd["DESIRED_Sites"]
                    tmpDict["ExtDESIRED_Sites"] = jobAd["ExtDESIRED_Sites"]
                    tmpDict["runningCMSSite"] = jobAd.get("MATCH_EXP_JOBGLIDEIN_CMSSite", None)
                    tmpDict["WMAgentID"] = int(jobAd["WMAgent_JobID"])
                    jobInfo[tmpDict["WMAgentID"]] = tmpDict

            logging.info("Retrieved %i classAds", len(jobInfo))

        return jobInfo, schedd
Example #14
def shared_submit_descriptors(unique_id=None, requirements=None):
    return {
        "executable":
        THIS_FILE.as_posix(),
        "My.Is_Transfer_Job":
        "true",
        "My.WantFlocking":
        "true",
        "keep_claim_idle":
        "300",
        "request_disk":
        "1GB",
        "requirements":
        requirements if requirements is not None else "true",
        "My.UniqueID":
        "{}".format(classad.quote(unique_id) if unique_id is not None else ''),
    }
Example #15
def shared_submit_descriptors(
    executable: Optional[Path] = None,
    unique_id: Optional[str] = None,
    requirements: Optional[str] = None,
) -> Dict[str, str]:
    if executable is None:
        executable = THIS_FILE

    return {
        "executable": executable.as_posix(),
        "keep_claim_idle": "300",
        "request_disk": "1GB",
        "requirements": requirements or "true",
        "My.Is_Transfer_Job": "true",
        "My.WantFlocking":
        "true",  # special attribute for the CHTC pool, not necessary at other sites
        "My.UniqueID": classad.quote(unique_id) if unique_id else "",
    }
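
A hedged usage sketch of this variant: the returned dict is merged into an htcondor.Submit description (the arguments value and unique_id are made up):

import htcondor

sub = htcondor.Submit({
    "arguments": "exec '$(remote_file)'",
    **shared_submit_descriptors(unique_id="xfer-123"),
})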
Example #16
def makeAds(config):
    reversed_mapping = config['reversed_mapping']

    needs_site = defaultdict(set)
    for workflow, tasks in config['modifications'].items():
        for taskname, specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            exp = '(HasBeenReplaced isnt true)  && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s"% taskname)
                #if ("T2_CH_CERN_HLT" in specs['ReplaceSiteWhitelist']) and not g_is_cern: specs['ReplaceSiteWhitelist'].remove("T2_CH_CERN_HLT")
                anAd["eval_set_DESIRED_Sites"] = str(",".join(specs['ReplaceSiteWhitelist']))
                anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
                anAd["set_HasBeenReplaced"] = True
                anAd["set_HasBeenRouted"] = False
                print(anAd)
            elif "AddWhitelist" in specs:
                for site in specs['AddWhitelist']:
                    needs_site[site].add(taskname)
 

    for site in needs_site:
        if site not in reversed_mapping: continue
        #if site == "T2_CH_CERN_HLT" and not g_is_cern: continue
        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = str("Overflow rule to go to %s"%site)
        anAd["OverflowTasknames"] = map(str, needs_site[site])
        overflow_names_escaped = anAd.lookup('OverflowTasknames').__repr__()
        del anAd['OverflowTaskNames']
        exprs = ['regexp(%s, target.ExtDESIRED_Sites)'% classad.quote(str(origin)) for origin in reversed_mapping[site]]
        exp = classad.ExprTree('member(target.WMAgent_SubTaskName, %s) && ( %s ) && (target.HasBeenRouted_%s =!= true)' % (overflow_names_escaped, str("||".join( exprs )), str(site)))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        anAd["copy_DESIRED_Sites"] = "Prev_DESIRED_Sites"
        anAd["eval_set_DESIRED_Sites"] = classad.ExprTree('ifThenElse(sortStringSet("") isnt error, sortStringSet(strcat(%s, ",", Prev_DESIRED_Sites)), strcat(%s, ",", Prev_DESIRED_Sites))' % (classad.quote(str(site)), classad.quote(str(site))))
        anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
        anAd['set_HasBeenRouted'] = False
        anAd['set_HasBeenRouted_%s' % str(site)] = True
        print(anAd)
Example #17
def makeHoldAds(config):
    """
    Create a set of rules to hold a task from matching
    """
    for task, where in config.get('hold', {}).items():
        # task is the task name
        # where is either an empty list=all sites, or a list of sites (not implemented)
        anAd = classad.ClassAd()
        anAd["Name"] = str("Holding task %s from %s"%(task, where))
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        exp = '(HasBeenSetHeld isnt true)  && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(task))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        ## we use the site whitelist to prevent matching
        anAd["copy_DESIRED_Sites"] = "Held_DESIRED_Sites"
        anAd["set_DESIRED_Sites"] = "T2_NW_NOWHERE"
        anAd["set_HasBeenRouted"] = False
        anAd["set_HasBeenSetHeld"] = True
        print(anAd)
Example #18
    def killWorkflowJobs(self, workflow):
        """
        _killWorkflowJobs_

        Kill all the jobs belonging to a specific workflow.
        """
        logging.info("Going to remove all the jobs for workflow %s", workflow)

        schedd = htcondor.Schedd()

        try:
            schedd.act(htcondor.JobAction.Remove,
                       "WMAgent_RequestName == %s" % classad.quote(workflow))
        except RuntimeError:
            logging.warning(
                "Error while killing jobs on the schedd: WMAgent_RequestName=%s",
                workflow)

        return
Example #19
def makeHoldAds(config):
    """
    Create a set of rules to hold a task from matching
    """
    for task, where in config.get('hold', {}).items():
        # task is the task name
        # where is either an empty list=all sites, or a list of sites (not implemented)
        anAd = classad.ClassAd()
        anAd["Name"] = str("Holding task %s from %s" % (task, where))
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        exp = '(HasBeenSetHeld isnt true)  && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(
            str(task))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        ## we use the site whitelist to prevent matching
        anAd["copy_DESIRED_Sites"] = "Held_DESIRED_Sites"
        anAd["set_DESIRED_Sites"] = "T2_NW_NOWHERE"
        anAd["set_HasBeenRouted"] = False
        anAd["set_HasBeenSetHeld"] = True
        print(anAd)
Example #20
    def executeAll(self, joblist=None, attributes=None, values=None):
        """ Given equal sized lists of job ids, attributes and values,
            executes in one large transaction a single qedit for each job.
        """
        joblist = joblist or []
        attributes = attributes or []
        values = values or []
        if not (len(joblist) == len(attributes) == len(values)):
            raise QueryError("Arguments to QEdit.executeAll should have the same length")
        try:
            htcondor.reload_config()
            if self.pool_name:
                collector = htcondor.Collector(str(self.pool_name))
            else:
                collector = htcondor.Collector()

            if self.schedd_name:
                schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd,
                                             self.schedd_name)
                schedd = htcondor.Schedd(schedd_ad)
            else:
                schedd = htcondor.Schedd()
            with schedd.transaction() as _:
                for jobid, attr, val in zip(joblist, attributes, values):
                    schedd.edit([jobid], attr, classad.quote(val))
        except Exception as ex:
            s = 'default'
            if self.schedd_name is not None:
                s = self.schedd_name
            p = 'default'
            if self.pool_name is not None:
                p = self.pool_name
            try:
                j1 = jobid
                j2 = attr
                j3 = val
            except NameError:
                j1 = j2 = j3 = 'unknown'
            err_str = 'Error querying schedd %s in pool %s using python bindings (qedit of job/attr/val %s/%s/%s): %s' % (s, p, j1, j2, j3, ex)
            raise QueryError(err_str)
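
A hedged usage sketch, assuming an already-configured QEdit instance (construction details are outside this snippet; the job id, attribute and value are made up):

qedit.executeAll(joblist=["1234.0"],
                 attributes=["JobPrio"],
                 values=["85000"])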
Example #21
def updateClassAd(collector, daemon, name, statistics='All:2', direct=False):
    """Returns an updated ClassAd from a HTCondor daemon"""

    # Begin building a list of keyword arguments.
    kwargs = {'statistics': statistics}

    if direct:
        # If we're doing a direct query, daemon_type and name must be defined.
        kwargs['daemon_type'] = htcondor.DaemonTypes.names[daemon]
        kwargs['name'] = name

        # Do a direct query on the given daemon at the given hostname.
        # Return the ClassAd and the number of ClassAds queried (always 1).
        return (collector.directQuery(**kwargs), 1)

    else:
        # If we're doing a regular query, ad_type must be defined
        # and the ClassAd should be constrained to the given hostname.
        kwargs['ad_type'] = htcondor.AdTypes.names[daemon]
        if name:
            constraint = 'Name =?= {0}'.format(classad.quote(name))
            kwargs['constraint'] = constraint
        else:
            name = '(unspecified)'

        # Do a regular query on the given daemon at the given hostname.
        ads = collector.query(**kwargs)

        # A regular query can give multiple ClassAds, e.g. if there are multiple
        # daemons running on the same host (rare, usually duplicate daemon).

        if len(ads) == 0:  # if no ClassAds, then exit
            sys.stderr.write(
                ('Error: Received {0} ClassAds from the {1} named {2} '
                 'from the Collector at {3}.\n').format(
                     len(ads), daemon, name, POOL))
            sys.exit(1)

        # Return the first ClassAd and the number of ClassAds queried.
        return (ads[0], len(ads))
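
A hedged call sketch (daemon type and host names are made up; POOL is a module-level constant referenced only in the error path):

import htcondor

collector = htcondor.Collector("cm.example.org")  # hypothetical central manager
ad, count = updateClassAd(collector, "Schedd", "schedd.example.org")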
Example #22
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        if info["CMSGroups"]:
            dagAd["CMSGroups"] = ','.join(info["CMSGroups"])
        else:
            dagAd["CMSGroups"] = classad.Value.Undefined

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["X509UserProxy"] = info['user_proxy']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']

        dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode('ascii', 'ignore'))
        dagAd['CRAB_TaskLifetimeDays'] = TASKLIFETIME // 24 // 60 // 60
        dagAd['CRAB_TaskEndTime'] = int(info["start_time"]) + TASKLIFETIME
        #For task management info see https://github.com/dmwm/CRABServer/issues/4681#issuecomment-302336451
        dagAd["LeaveJobInQueue"] = classad.ExprTree("true")
        dagAd["PeriodicHold"] = classad.ExprTree("time() > CRAB_TaskEndTime")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"

        with open('subdag.ad', 'w') as fd:
            for k, v in dagAd.items():
                if k == 'X509UserProxy':
                    v = os.path.basename(v)
                if isinstance(v, str):
                    value = classad.quote(v)
                elif isinstance(v, classad.ExprTree):
                    value = repr(v)
                elif isinstance(v, list):
                    value = "{{{0}}}".format(json.dumps(v)[1:-1])
                else:
                    value = v
                fd.write('+{0} = {1}\n'.format(k, value))

        dagAd["TaskType"] = "ROOT"
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])

        condorIdDict = {}
        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy'], pickleOut=True, outputObj=condorIdDict, logger=self.logger) as (parent, rpipe):
            if not parent:
                resultAds = []
                condorIdDict['ClusterId'] = schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                # editing the LeaveJobInQueue since the remote submit overwrites it
                # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
                if resultAds:
                    id_ = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree("true"))

        try:
            results = pickle.load(rpipe)
        except EOFError:
            #Do not want to retry this since error may happen after submit (during edit for example).
            #And this can cause the task to be submitted twice (although we have a protection in the duplicatedCheck)
            raise TaskWorkerException("Timeout executing condor submit command.", retry=False)

        #notice that the clusterId might be set even if there was a failure. This is if the schedd.submit succeeded, but the spool call failed
        if 'ClusterId' in results.outputObj:
            self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj['ClusterId'])
        if results.outputMessage != "OK":
            self.logger.debug("Now printing the environment used for submission:\n" + "-"*70 + "\n" + results.environmentStr + "-"*70)
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True)

        #if we don't raise exception above the id is here
        return results.outputObj['ClusterId']
Example #23
def main():
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

    args = parse_args()

    print("Called with args: {}".format(args))

    if args.cmd == "sync":
        if args.unique_id:
            schedd = htcondor.Schedd()
            existing_job = schedd.query(
                constraint="UniqueId == {} && JobStatus =!= 4".format(
                    classad.quote(args.unique_id)),
                attr_list=[],
                limit=1,
            )
            if len(existing_job) > 0:
                logging.warning(
                    'Jobs already found in queue with UniqueId == "%s", exiting',
                    args.unique_id,
                )
                sys.exit()
        print("Will synchronize {} at source to {} at destination".format(
            args.src, args.dest))
        cluster_id = submit_outer_dag(
            args.working_dir,
            args.src,
            args.dest,
            requirements=read_requirements_file(args.requirements_file)
            or args.requirements,
            unique_id=args.unique_id,
            test_mode=args.test_mode,
        )
        print("Parent job running in cluster {}".format(cluster_id))
    elif args.cmd == "generate":
        logging.info("Generating file listing for %s", args.src)
        generate_file_listing(args.src,
                              Path("source_manifest.txt"),
                              test_mode=args.test_mode)
    elif args.cmd == "write_subdag":
        logging.info(
            "Generating SUBGDAG for transfer of %s->%s",
            args.source_prefix,
            args.dest_prefix,
        )
        write_inner_dag(
            args.source_prefix,
            args.source_manifest,
            args.dest_prefix,
            requirements=read_requirements_file(args.requirements_file)
            or args.requirements,
            test_mode=args.test_mode,
            unique_id=args.unique_id,
        )
    elif args.cmd == "exec":
        xfer_exec(args.src)
    elif args.cmd == "verify":
        with args.json.open(mode="r") as f:
            cmd_info = json.load(f)
        # Split the DAG job name to get the cmd_info key
        info = cmd_info[args.fileid.split(":")[-1]]
        verify(
            Path(info["dest_prefix"]),
            Path(info["dest"]),
            Path("{}.metadata".format(info['src_file_noslash'])),
            Path(info["transfer_manifest"]),
        )
    elif args.cmd == "verify_remote":
        verify_remote(args.src)
    elif args.cmd == "analyze":
        analyze(args.transfer_manifest)
Example #24
def makeOverflowAds(config):
    # Mapping from source to a list of destinations.
    # key can be read by site in values
    reversed_mapping = config['reversed_mapping']

    overflow_tasks = {}
    for workflow, tasks in config.get('modifications', {}).items():
        for taskname, specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            exp = '(HasBeenReplaced isnt true)  && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(
                str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            add_whitelist = specs.get("AddWhitelist")
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s" % taskname)
                anAd["eval_set_DESIRED_Sites"] = str(",".join(
                    specs['ReplaceSiteWhitelist']))
                anAd['set_Rank'] = classad.ExprTree(
                    "stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
                anAd["set_HasBeenReplaced"] = True
                anAd["set_HasBeenRouted"] = False
                print(anAd)
            elif add_whitelist:
                add_whitelist.sort()
                add_whitelist_key = ",".join(add_whitelist)
                tasks = overflow_tasks.setdefault(add_whitelist_key, [])
                tasks.append(taskname)

    # Create a source->dests mapping from the provided reverse_mapping.
    source_to_dests = {}
    for dest, sources in reversed_mapping.items():
        for source in sources:
            dests = source_to_dests.setdefault(source, set())
            dests.add(dest)
    tmp_source_to_dests = source_to_dests

    # For each unique set of site whitelists, create a new rule.  Each task
    # should appear on just one of these ads, meaning it should only get routed
    # once.
    for whitelist_sites, tasks in overflow_tasks.items():
        ## these are the sites that need to be added in whitelist.
        whitelist_sites_set = set(whitelist_sites.split(","))

        # Create an updated source_to_dests, where the dests are filtered
        # on the whitelist.
        source_to_dests = {}
        for source, dests in tmp_source_to_dests.items():
            new_dests = [str(i) for i in dests if i in whitelist_sites_set]
            if new_dests:
                source_to_dests[str(source)] = new_dests

        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = "Master overflow rule to run at %s in addition" % str(
            whitelist_sites)

        # ClassAds trick to create a properly-formatted ClassAd list.
        anAd["OverflowTasknames"] = map(str, tasks)
        overflow_names_escaped = anAd.lookup('OverflowTasknames').__repr__()
        del anAd['OverflowTaskNames']

        exp = classad.ExprTree(
            'member(target.WMAgent_SubTaskName, %s) && (HasBeenRouted_Overflow isnt true)'
            % overflow_names_escaped)
        anAd["Requirements"] = classad.ExprTree(str(exp))
        anAd["copy_DESIRED_Sites"] = "Pre_DESIRED_Sites"
        anAd["eval_set_DESIRED_Sites"] = classad.ExprTree(
            'ifThenElse(siteMapping("", []) isnt error, siteMapping(Pre_DESIRED_Sites, %s), Pre_DESIRED_Sites)'
            % str(classad.ClassAd(source_to_dests)))

        # Where possible, prefer to run at a site where the input can be read locally.
        anAd['set_Rank'] = classad.ExprTree(
            "stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
        anAd['set_HasBeenRouted'] = False
        anAd['set_HasBeenRouted_Overflow'] = True
        print(anAd)
Example #25
    def test_quote(self):
        self.assertEqual(classad.quote("foo"), '"foo"')
        self.assertEqual(classad.quote('"foo'), '"\\"foo"')
        for i in ["foo", '"foo', '"\\"foo']:
            self.assertEqual(i, classad.unquote(classad.quote(i)))
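
The round trip is the property under test; at a Python prompt (values illustrative):

>>> import classad
>>> classad.quote('say "hi"')
'"say \\"hi\\""'
>>> classad.unquote(classad.quote('say "hi"'))
'say "hi"'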
Example #26
    def customizePerJob(self, job):
        """
        JDL additions just for this implementation. Over-ridden in sub-classes
        These are the Glide-in specific bits
        """
        jdl = []
        jobCE = job['location']
        if not jobCE:
            # Then we ended up with a site that doesn't exist?
            logging.error("Job for non-existant site %s", job['location'])
            return jdl

        if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
            strg = ','.join(map(str, job.get('possibleSites')))
            jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))

        if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
            strg = ','.join(map(str, job.get('potentialSites')))
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

        if job.get('proxyPath'):
            jdl.append('x509userproxy = %s\n' % job['proxyPath'])

        jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])
        match = GROUP_NAME_RE.match(job['requestName'])
        if match:
            jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
        else:
            jdl.append('+CMSGroups = undefined\n')

        jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
        jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

        # Handling for AWS, cloud and opportunistic resources
        jdl.append('+AllowOpportunistic = %s\n' % job.get('allowOpportunistic', False))

        # dataset info
        if job.get('inputDataset'):
            jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
        else:
            jdl.append('+DESIRED_CMSDataset = undefined\n')
        if job.get('inputDatasetLocations'):
            jdl.append('+DESIRED_CMSDataLocations = "%s"\n' % ','.join(job['inputDatasetLocations']))
        else:
            jdl.append('+DESIRED_CMSDataLocations = undefined\n')

        # HighIO jobs
        jdl.append('+Requestioslots = %d\n' % job.get('highIOjob', 0))

        # Performance and resource estimates
        numberOfCores = job.get('numberOfCores', 1)
        requestMemory = int(job['estimatedMemoryUsage']) if job.get('estimatedMemoryUsage', None) else 1000
        requestDisk = int(job['estimatedDiskUsage']) if job.get('estimatedDiskUsage', None) else 20*1000*1000*numberOfCores
        maxWallTimeMins = int(job['estimatedJobTime'])/60.0 if job.get('estimatedJobTime', None) else 12*60
        jdl.append('request_memory = %d\n' % requestMemory)
        jdl.append('request_disk = %d\n' % requestDisk)
        jdl.append('+MaxWallTimeMins = %d\n' % maxWallTimeMins)

        # How many cores job is using
        jdl.append('machine_count = 1\n')
        jdl.append('request_cpus = %s\n' % numberOfCores)

        # Add OS requirements for jobs
        if job.get('scramArch') is not None and job.get('scramArch').startswith("slc6_"):
            jdl.append('+REQUIRED_OS = "rhel6"\n')
        else:
            jdl.append('+REQUIRED_OS = "any"\n')

        return jdl
Example #27
def test_quote(input, expected):
    assert classad.quote(input) == expected
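
The parametrize decorator was lost above; presumably it is pytest.mark.parametrize, along these lines (cases borrowed from Example #25):

import pytest
import classad

@pytest.mark.parametrize("input, expected", [
    ("foo", '"foo"'),
    ('"foo', '"\\"foo"'),
])
def test_quote(input, expected):
    assert classad.quote(input) == expected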
Example #28
def makeOverflowAds(config):
    # Mapping from source to a list of destinations.
    reversed_mapping = config['reversed_mapping']

    overflow_tasks = {}
    for workflow, tasks in config['modifications'].items():
        for taskname, specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            exp = '(HasBeenReplaced isnt true)  && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            add_whitelist = specs.get("AddWhitelist")
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s"% taskname)
                anAd["eval_set_DESIRED_Sites"] = str(",".join(specs['ReplaceSiteWhitelist']))
                anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
                anAd["set_HasBeenReplaced"] = True
                anAd["set_HasBeenRouted"] = False
                print(anAd)
            elif add_whitelist:
                add_whitelist.sort()
                add_whitelist_key = ",".join(add_whitelist)
                tasks = overflow_tasks.setdefault(add_whitelist_key, [])
                tasks.append(taskname)

    # Create a source->dests mapping from the provided reverse_mapping.
    source_to_dests = {}
    for dest, sources in reversed_mapping.items():
        for source in sources:
            dests = source_to_dests.setdefault(source, set())
            dests.add(dest)
    tmp_source_to_dests = source_to_dests

    # For each unique set of site whitelists, create a new rule.  Each task
    # should appear on just one of these ads, meaning it should only get routed
    # once.
    for whitelist_sites, tasks in overflow_tasks.items():
        whitelist_sites_set = set(whitelist_sites.split(","))

        # Create an updated source_to_dests, where the dests are filtered
        # on the whitelist.
        source_to_dests = {}
        for source, dests in tmp_source_to_dests.items():
            new_dests = [str(i) for i in dests if i in whitelist_sites_set]
            if new_dests:
                source_to_dests[str(source)] = new_dests

        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = "Master overflow rule for %s" % str(whitelist_sites)

        # ClassAds trick to create a properly-formatted ClassAd list.
        anAd["OverflowTasknames"] = map(str, tasks)
        overflow_names_escaped = anAd.lookup('OverflowTasknames').__repr__()
        del anAd['OverflowTaskNames']

        exp = classad.ExprTree('member(target.WMAgent_SubTaskName, %s) && (HasBeenRouted_Overflow isnt true)' % overflow_names_escaped)
        anAd["Requirements"] = classad.ExprTree(str(exp))
        # siteMapping will apply the source->dest rules, given the current set of sources in ExtDESIRED_Sites.
        anAd["eval_set_DESIRED_Sites"] = classad.ExprTree('ifThenElse(siteMapping("", []) isnt error, siteMapping(ExtDESIRED_Sites, %s), ExtDESIRED_Sites)' % str(classad.ClassAd(source_to_dests)))

        # Where possible, prefer to run at a site where the input can be read locally.
        anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
        anAd['set_HasBeenRouted'] = False
        anAd['set_HasBeenRouted_Overflow'] = True
        print(anAd)
Example #29
def make_inner_dag(
    direction: TransferDirection,
    requirements: Optional[str],
    transfer_cmd_info: T_CMD_INFO,
    verify_cmd_info: T_CMD_INFO,
    unique_id: Optional[str] = None,
    test_mode: bool = False,
):
    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    inner_dag = dags.DAG(
        max_jobs_by_category={"TRANSFER_JOBS": 1} if test_mode else None)

    tof = [METADATA_FILE_NAME]
    tor = {METADATA_FILE_NAME: "$(flattened_name).metadata"}

    pull_tof = [SANDBOX_FILE_NAME]
    pull_tor = {SANDBOX_FILE_NAME: "$(flattened_name)"}

    shared_descriptors = shared_submit_descriptors(unique_id=unique_id,
                                                   requirements=requirements)

    inner_dag.layer(
        name=direction,
        submit_description=htcondor.Submit({
            "output":
            "$(flattened_name).out",
            "error":
            "$(flattened_name).err",
            "log":
            "transfer_file.log",
            "arguments":
            classad.quote("{} '$(remote_file)'".format(
                DIRECTION_TO_COMMAND[direction])),
            "should_transfer_files":
            "yes",
            "transfer_input_files":
            "$(local_file)" if direction is TransferDirection.PUSH else "",
            "transfer_output_files":
            ", ".join(tof + (
                pull_tof if direction is TransferDirection.PULL else [])),
            "transfer_output_remaps":
            classad.quote(" ; ".join(
                "{} = {}".format(k, v) for k, v in {
                    **tor,
                    **(pull_tor if direction is TransferDirection.PULL else {}),
                }.items())),
            **shared_descriptors,
        }),
        vars=transfer_cmd_info,
        post=dags.Script(
            executable=THIS_FILE,
            arguments=[
                Commands.POST_TRANSFER,
                "--cmd-info",
                TRANSFER_COMMANDS_FILE_NAME,
                "--key",
                "$JOB",
            ],
        ),
    )

    inner_dag.layer(
        name="verify",
        submit_description=htcondor.Submit({
            "output":
            "$(flattened_name).out",
            "error":
            "$(flattened_name).err",
            "log":
            "verify_file.log",
            "arguments":
            classad.quote("{} '$(remote_file)'".format(
                Commands.GET_REMOTE_METADATA)),
            "should_transfer_files":
            "yes",
            "transfer_output_files":
            ", ".join(tof),
            "transfer_output_remaps":
            classad.quote(" ; ".join("{} = {}".format(k, v)
                                     for k, v in tor.items())),
            **shared_descriptors,
        }),
        vars=verify_cmd_info,
        post=dags.Script(
            executable=THIS_FILE,
            arguments=[
                Commands.POST_TRANSFER,
                "--cmd-info",
                VERIFY_COMMANDS_FILE_NAME,
                "--key",
                "$JOB",
                "--only-verify",
            ],
        ),
    )

    logging.info("Inner DAG shape:\n{}".format(inner_dag.describe()))

    return inner_dag
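
A hedged follow-on sketch of how the returned DAG would typically be materialized and turned into a submit object (directory name made up):

from pathlib import Path
import htcondor
import htcondor.dags as dags

inner_dag = make_inner_dag(direction, requirements, transfer_cmd_info, verify_cmd_info)
dag_file = dags.write_dag(inner_dag, Path("inner_dag"))  # writes inner_dag/dagfile.dag
dag_submit = htcondor.Submit.from_dag(str(dag_file), {"force": 1})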
    "transfer_input_files":
    "$(item)",
    "output":
    "test-$(ProcID).out",
    "error":
    "test-$(ProcID).err",
    "request_cpus":
    "1",
    "request_memory":
    "1GB",
    "request_disk":
    "1GB",
    "hold":
    "true",
    "My.HoldReason":
    classad.quote("Spooling input files"),
    "My.HoldReasonCode":
    "16",
    "My.LeaveJobInQueue":
    f"JobStatus == {COMPLETED} && ( {COMPLETION_DATE} =?= UNDEFINED || {COMPLETION_DATE} == 0 || ((time() - {COMPLETION_DATE}) < {REMOVAL_DELAY}) )",
    "transfer_output_remaps":
    classad.quote(
        "_condor_stdout=test-$(ProcID).out ; _condor_stderr=test-$(ProcID).err"
    ),
})

collector = htcondor.Collector("cm.chtc.wisc.edu")

schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd,
                             "submittest0000.chtc.wisc.edu")
Example #31
    def track(self, jobs):
        """
        _track_

        Track the jobs while in condor
        This returns a three-way ntuple
        First, the total number of jobs still running
        Second, the jobs that need to be changed
        Third, the jobs that need to be completed
        """
        jobInfo = {}
        changeList = []
        completeList = []
        runningList = []

        # get info about all active and recent jobs
        logging.debug("SimpleCondorPlugin is going to track %s jobs",
                      len(jobs))

        schedd = htcondor.Schedd()

        logging.debug("Start: Retrieving classAds using Condor Python XQuery")
        try:
            itobj = schedd.xquery(
                "WMAgent_AgentName == %s" % classad.quote(self.agent), [
                    'ClusterId', 'ProcId', 'JobStatus',
                    'MachineAttrGLIDEIN_CMSSite0'
                ])
            for jobAd in itobj:
                gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
                jobStatus = SimpleCondorPlugin.exitCodeMap().get(
                    jobAd.get('JobStatus'), 'Unknown')
                location = jobAd.get('MachineAttrGLIDEIN_CMSSite0', None)
                jobInfo[gridId] = (jobStatus, location)
        except Exception as ex:
            logging.error(
                "Query to condor schedd failed in SimpleCondorPlugin.")
            logging.error("Returning empty lists for all job types...")
            logging.exception(ex)
            return runningList, changeList, completeList

        logging.debug("Finished retrieving %d classAds from Condor",
                      len(jobInfo))

        # now go over the jobs and see what we have
        for job in jobs:

            # if the schedd doesn't know a job, consider it complete
            # doing any further checks is not cost effective
            if job['gridid'] not in jobInfo:
                (newStatus, location) = ('Completed', None)
            else:
                (newStatus, location) = jobInfo[job['gridid']]

            # check for status changes
            if newStatus != job['status']:

                # update location info for Idle->Running transition
                if newStatus == 'Running' and job['status'] == 'Idle':
                    if location:
                        job['location'] = location
                        logging.debug(
                            "JobAdInfo: Job location for jobid=%i gridid=%s changed to %s",
                            job['jobid'], job['gridid'], location)

                job['status'] = newStatus
                job['status_time'] = int(time.time())
                logging.debug(
                    "JobAdInfo: Job status for jobid=%i gridid=%s changed to %s",
                    job['jobid'], job['gridid'], job['status'])
                changeList.append(job)

            job['globalState'] = SimpleCondorPlugin.stateMap().get(newStatus)

            # stop tracking finished jobs
            if job['globalState'] in ['Complete', 'Error']:
                completeList.append(job)
            else:
                runningList.append(job)

        logging.debug(
            "SimpleCondorPlugin tracking : %i/%i/%i (Executing/Changing/Complete)",
            len(runningList), len(changeList), len(completeList))

        return runningList, changeList, completeList
Example #32
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## Add job and postjob log URLs
        job_retry = "%s.%s" % (self.job_id, crab_retry)
        new_submit_text += '+CRAB_JobLogURL = %s\n' % classad.quote(
            os.path.join(self.userWebDirPrx, "job_out." + job_retry + ".txt"))
        new_submit_text += '+CRAB_PostJobLogURL = %s\n' % classad.quote(
            os.path.join(self.userWebDirPrx, "postjob." + job_retry + ".txt"))
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = map(str, self.task_ad['CRAB_ResubmitList'])
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory = None
        numcores = None
        priority = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
                maxjobruntime = int(
                    str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
            elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
                maxjobruntime = int(
                    str(self.task_ad.lookup('MaxWallTimeMinsTail')))
            elif 'MaxWallTimeMinsRun' in self.task_ad:
                maxjobruntime = int(
                    str(self.task_ad.lookup('MaxWallTimeMinsRun')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
            if str(self.job_id) == '0':  #jobids can be like 1-1 for subjobs
                priority = 20  #the maximum for splitting jobs
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) - 1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory = self.resubmit_info[inkey].get('maxmemory')
            numcores = self.resubmit_info[inkey].get('numcores')
            priority = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory'] = maxmemory
        self.resubmit_info[outkey]['numcores'] = numcores
        self.resubmit_info[outkey]['priority'] = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad

        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        savelogs = 0 if self.stage == 'probe' else self.task_ad.lookup(
            'CRAB_SaveLogsFlag')
        saveoutputs = 0 if self.stage == 'probe' else self.task_ad.lookup(
            'CRAB_TransferOutputs')
        new_submit_text += '+CRAB_TransferOutputs = {0}\n+CRAB_SaveLogsFlag = {1}\n'.format(
            saveoutputs, savelogs)
        if maxjobruntime is not None:
            new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(
                maxjobruntime)
            new_submit_text += '+MaxWallTimeMinsRun = %s\n' % str(
                maxjobruntime)  # how long it can run
            new_submit_text += '+MaxWallTimeMins = %s\n' % str(
                maxjobruntime)  # how long a slot can it match to
        # no plus sign for next 3 attributes, since those are Condor standard ones
        if maxmemory is not None:
            new_submit_text += 'RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += 'RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += 'JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
        pre_job_prio = 1
        if int(self.job_id.split('-')[0]) <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(
            self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry,
                                          use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(
                self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%s.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
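
The inkey walk-back above is the subtle part of reusing resubmit_info: when the entry for the previous retry is missing, the code falls back to the most recent retry that was recorded. A self-contained sketch of the same lookup (a hypothetical helper, for illustration only):

def previous_retry_params(resubmit_info, crab_retry):
    # Start from the previous retry number (or 0 for the first attempt)
    # and walk back until a recorded entry is found.
    inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
    while inkey not in resubmit_info and int(inkey) > 0:
        inkey = str(int(inkey) - 1)
    return resubmit_info.get(inkey, {})

# previous_retry_params({'0': {'maxmemory': 2000}}, crab_retry=3)
# -> {'maxmemory': 2000}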
Exemplo n.º 33
0
def make_inner_dag(
    requirements: Optional[str],
    xfer_cmd_info: T_CMD_INFO,
    verify_cmd_info: T_CMD_INFO,
    unique_id: Optional[str] = None,
    test_mode: bool = False,
):

    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    inner_dag = dags.DAG(
        max_jobs_by_category={"TRANSFER_JOBS": 1} if test_mode else None)

    inner_dag.layer(
        name="xfer",
        submit_description=htcondor.Submit({
            "output": "$(src_file_noslash).out",
            "error": "$(src_file_noslash).err",
            "log": "xfer_file.log",
            "arguments": classad.quote("exec '$(src_file)'"),
            "should_transfer_files": "yes",
            "transfer_output_files": "{}, metadata".format(SANDBOX_FILE_NAME),
            "transfer_output_remaps": classad.quote(
                "{} = $(dest); metadata = $(src_file_noslash).metadata".format(
                    SANDBOX_FILE_NAME)),
            **shared_submit_descriptors(unique_id, requirements),
        }),
        vars=xfer_cmd_info,
        post=dags.Script(
            executable=THIS_FILE,
            arguments=[
                "verify", "--json=xfer_commands.json", "--fileid", "$JOB"
            ],
        ),
    )

    inner_dag.layer(
        name="verify",
        submit_description=htcondor.Submit({
            "output": "$(src_file_noslash).out",
            "error": "$(src_file_noslash).err",
            "log": "verify_file.log",
            "arguments": classad.quote("verify_remote '$(src_file)'"),
            "should_transfer_files": "yes",
            "transfer_output_files": "metadata",
            "transfer_output_remaps": classad.quote(
                "metadata = $(src_file_noslash).metadata"),
            **shared_submit_descriptors(unique_id, requirements),
        }),
        vars=verify_cmd_info,
        post=dags.Script(
            executable=THIS_FILE,
            arguments=[
                "verify", "--json=verify_commands.json", "--fileid", "$JOB"
            ],
        ),
    )

    return inner_dag
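
A DAG built this way still has to be written to disk and handed to DAGMan. A hedged sketch of that step, assuming the write_dag and Submit.from_dag helpers of recent htcondor bindings (dag_dir and the helper name are assumptions for illustration):

import htcondor
import htcondor.dags as dags

def submit_inner_dag(inner_dag, dag_dir):
    # Write the DAG input files into dag_dir, then submit the resulting
    # .dag file through a DAGMan scheduler-universe job.
    dag_file = dags.write_dag(inner_dag, dag_dir)
    submit = htcondor.Submit.from_dag(str(dag_file), {"force": 1})
    return htcondor.Schedd().submit(submit)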
def main():
    opts = parse_opts()

    users = set()
    for line in open(opts.local_users):
        line = line.strip()
        if line.startswith("#"): continue
        users.add(line)

    collectors = set()
    for pool in opts.pool:
        coll = htcondor.Collector(pool)
        collectors.add(coll)
        if not opts.quiet:
            print("Querying collector %s for schedds matching" % pool,
                  opts.const, file=sys.stderr)

    reqs = '(JobStatus == 1) && stringListMember(%s, DESIRED_Sites)' % classad.quote(
        opts.site)
    idle_count = {}
    for user in users:
        if user == "*": continue
        idle_count.setdefault(user, 0)
    user_map = {}
    if not opts.quiet: print("Schedd job requirements:", reqs, file=sys.stderr)
    for coll in collectors:
        for schedd_ad in coll.query(
                htcondor.AdTypes.Schedd, opts.const,
            ['MyAddress', 'CondorVersion', 'Name', 'ScheddIpAddr']):
            if not opts.quiet:
                print("Querying", schedd_ad.get('Name', "Unknown"),
                      file=sys.stderr)
            schedd = htcondor.Schedd(schedd_ad)
            try:
                if opts.jobs_only:
                    schedd_data = schedd.xquery(requirements=reqs,
                                                projection=[
                                                    "x509userproxysubject",
                                                    "CRAB_UserHN", "JobStatus"
                                                ])
                else:
                    schedd_data = schedd.xquery(
                        requirements=reqs,
                        projection=[
                            "x509userproxysubject", "CRAB_UserHN", "JobStatus"
                        ],
                        opts=htcondor.QueryOpts.AutoCluster)
            except RuntimeError as e:
                if not opts.quiet:
                    print("Error querying %s: %s" % (
                        schedd_ad.get('Name', "Unknown"), e), file=sys.stderr)
                continue  # without a successful query there is no schedd_data to process
            if not opts.jobs_only:
                for cluster in schedd_data:
                    user = cluster.get("CRAB_UserHN")
                    if (user in users) or ("*" in users):
                        idle_count.setdefault(user, 0)
                        idle_count[user] += int(cluster.get("JobCount", 0))
                        if 'x509userproxysubject' in cluster:
                            user_map[user] = cluster['x509userproxysubject']
            if opts.jobs_only:
                for job in schedd_data:
                    user = job.get("CRAB_UserHN")
                    if (user in users) or ("*" in users):
                        idle_count.setdefault(user, 0)
                        idle_count[user] += 1
                        if 'x509userproxysubject' in job:
                            user_map[user] = job['x509userproxysubject']
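
The script accumulates idle_count and user_map, but the reporting step is not shown; a minimal sketch of what it might look like (the output format is an assumption):

import sys

def report_idle_counts(idle_count, user_map):
    # Print users with the most idle jobs first, with their proxy DN if known.
    for user in sorted(idle_count, key=idle_count.get, reverse=True):
        dn = user_map.get(user, "unknown DN")
        print("%6d idle jobs for %s (%s)" % (idle_count[user], user, dn),
              file=sys.stderr)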
Exemplo n.º 35
0
def main():
    """
    Need a doc string here.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(
            os.environ["_CONDOR_JOB_AD"]):
        printLog(
            "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist"
        )
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" %
             os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOne(fd)
    printLog("Parsed ad: %s" % ad)

    # instantiate a server object to talk with crabserver
    host = ad['CRAB_RestHost']
    dbInstance = ad['CRAB_DbInstance']
    cert = ad['X509UserProxy']
    crabserver = CRABRest(host, cert, cert, retry=3, userAgent='CRABSchedd')
    crabserver.setDbInstance(dbInstance)

    checkTaskInfo(crabserver, ad)

    # is this the first time this script runs for this task ? (it runs at each resubmit as well !)
    if not os.path.exists('WEB_DIR'):
        makeWebDir(ad)
        printLog(
            "Webdir has been set up. Uploading the webdir URL to the REST")

        retries = 0
        exitCode = 1
        maxRetries = 3
        while retries < maxRetries and exitCode != 0:
            exitCode = uploadWebDir(crabserver, ad)
            if exitCode != 0:
                time.sleep(retries * 20)
            retries += 1
        if exitCode != 0:
            printLog(
                "Exiting AdjustSites because the webdir upload failed %d times."
                % maxRetries)
            sys.exit(1)
        printLog(
            "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
            % exitCode)

        saveProxiedWebdir(crabserver, ad)
        printLog("Proxied webdir saved")

    printLog(
        "Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(
        ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log",
                            "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(
                        adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(
                    adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parseOne(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Exemplo n.º 36
0
    def updateSiteInformation(self, jobs, siteName, excludeSite):
        """
        _updateSiteInformation_

        Allow or disallow jobs to run at a site.
        Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.

        Kill job if after removing site from allowed sites it has nowhere to run.

        Parameters:    excludeSite = False when moving to Normal
                       excludeSite = True when moving to Down, Draining or Aborted
        """
        schedd = htcondor.Schedd()

        jobtokill = []
        try:
            itobj = schedd.xquery(
                'WMAgent_AgentName =?= %s && JobStatus =?= 1' %
                classad.quote(self.agent),
                ['ClusterId', 'ProcId', 'DESIRED_Sites', 'ExtDESIRED_Sites'])
        except Exception as ex:
            logging.error("Failed to query condor schedd.")
            logging.exception(ex)
            return jobtokill
        else:
            jobInfo = {}
            for jobAd in itobj:
                gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
                jobInfo[gridId] = jobAd
            for job in jobs:
                jobAd = jobInfo.get(job['gridid'], None)
                if jobAd:
                    desiredSites = jobAd.get('DESIRED_Sites').split(',')
                    extDesiredSites = jobAd.get('ExtDESIRED_Sites').split(',')
                    if excludeSite:
                        # Remove siteName from DESIRED_Sites if job has it
                        if siteName in desiredSites:
                            if len(desiredSites) > 1:
                                desiredSites.remove(siteName)
                                desiredSites = ','.join(desiredSites)
                                try:
                                    schedd.edit([job['gridid']],
                                                'DESIRED_Sites',
                                                classad.ExprTree('"%s"' %
                                                                 desiredSites))
                                except Exception as ex:
                                    logging.error(
                                        "Failed to edit sites for job %s" %
                                        job['gridid'])
                                    logging.exception(ex)
                            else:
                                jobtokill.append(job)
                    else:
                        # Add siteName to DESIRED_Sites if ExtDESIRED_Sites has it (moving back to Normal)
                        if siteName not in desiredSites and siteName in extDesiredSites:
                            desiredSites.append(siteName)
                            desiredSites = ','.join(sorted(desiredSites))
                            try:
                                schedd.edit([job['gridid']], 'DESIRED_Sites',
                                            classad.ExprTree('"%s"' %
                                                             desiredSites))
                            except Exception as ex:
                                logging.error(
                                    "Failed to edit sites for job %s" %
                                    job['gridid'])
                                logging.exception(ex)

        return jobtokill
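
The whitelist surgery above can be factored into a pure function, which makes the kill condition explicit; a hypothetical sketch:

def toggle_site(desired_sites, site, exclude):
    """Return the new comma-separated DESIRED_Sites value, or None when
    removing the site would leave the job with nowhere to run (in which
    case the caller should kill it). Hypothetical helper mirroring the
    logic above."""
    sites = desired_sites.split(',')
    if exclude:
        if site in sites:
            sites.remove(site)
        if not sites:
            return None
    elif site not in sites:
        sites.append(site)
    return ','.join(sorted(sites))

# toggle_site('T1_US_FNAL,T2_CH_CERN', 'T2_CH_CERN', exclude=True) -> 'T1_US_FNAL'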
Exemplo n.º 37
0
    def customizePerJob(self, job):
        """
        JDL additions just for this implementation. Over-ridden in sub-classes
        These are the Glide-in specific bits
        """
        jdl = []
        jobCE = job['location']
        if not jobCE:
            # Then we ended up with a site that doesn't exist?
            logging.error("Job for non-existant site %s", job['location'])
            return jdl

        if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
            strg = ','.join(map(str, job.get('possibleSites')))
            jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))

        if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
            strg = ','.join(map(str, job.get('potentialSites')))
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

        if job.get('proxyPath'):
            jdl.append('x509userproxy = %s\n' % job['proxyPath'])

        jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])
        match = GROUP_NAME_RE.match(job['requestName'])
        if match:
            jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
        else:
            jdl.append('+CMSGroups = undefined\n')

        jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
        jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

        # Handling for AWS, cloud and opportunistic resources
        jdl.append('+AllowOpportunistic = %s\n' % job.get('allowOpportunistic', False))

        # dataset info
        if job.get('inputDataset'):
            jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
        else:
            jdl.append('+DESIRED_CMSDataset = undefined\n')
        if job.get('inputDatasetLocations'):
            jdl.append('+DESIRED_CMSDataLocations = "%s"\n' % ','.join(job['inputDatasetLocations']))
        else:
            jdl.append('+DESIRED_CMSDataLocations = undefined\n')

        # HighIO and repack jobs handling
        highio = 1 if job['taskType'] in ["Merge", "Cleanup", "LogCollect"] else 0
        repackjob = 1 if job['taskType'] == 'Repack' else 0
        jdl.append('+Requestioslots = %d\n' % highio)
        jdl.append('+RequestRepackslots = %d\n' % repackjob)

        # Performance and resource estimates (including JDL magic tweaks)
        origCores = job.get('numberOfCores', 1)
        estimatedMins = int(job['estimatedJobTime']/60.0) if job.get('estimatedJobTime') else 12*60
        estimatedMinsSingleCore = estimatedMins * origCores
        # For now, assume a 15 minute job startup overhead -- condor will round this up further
        jdl.append('+EstimatedSingleCoreMins = %d\n' % estimatedMinsSingleCore)
        jdl.append('+OriginalMaxWallTimeMins = %d\n' % estimatedMins)
        jdl.append('+MaxWallTimeMins = WMCore_ResizeJob ? (EstimatedSingleCoreMins/RequestCpus + 15) : OriginalMaxWallTimeMins\n')

        requestMemory = int(job['estimatedMemoryUsage']) if job.get('estimatedMemoryUsage', None) else 1000
        jdl.append('+OriginalMemory = %d\n' % requestMemory)
        jdl.append('+ExtraMemory = %d\n' % self.extraMem)
        jdl.append('+RequestMemory = OriginalMemory + ExtraMemory * (WMCore_ResizeJob ? (RequestCpus-OriginalCpus) : 0)\n')

        requestDisk = int(job['estimatedDiskUsage']) if job.get('estimatedDiskUsage', None) else 20*1000*1000*origCores
        jdl.append('request_disk = %d\n' % requestDisk)

        # Set up JDL for multithreaded jobs.
        # By default, RequestCpus will evaluate to whatever CPU request was in the workflow.
        # If the job is labelled as resizable, then the logic is more complex:
        # - If the job is running in a slot with N cores, this should evaluate to N
        # - If the job is being matched against a machine, match all available CPUs, provided
        # they are between min and max CPUs.
        # - Otherwise, just use the original CPU count.
        jdl.append('machine_count = 1\n')
        minCores = int(job.get('minCores', max(1, origCores/2)))
        maxCores = max(int(job.get('maxCores', origCores)), origCores)
        jdl.append('+MinCores = %d\n' % minCores)
        jdl.append('+MaxCores = %d\n' % maxCores)
        # Advertise the original CPU setting, in case someone needs this for monitoring
        jdl.append('+OriginalCpus = %d\n' % origCores)
        # Prefer slots that are closest to our MaxCores without going over.
        # If the slot size is _greater_ than our MaxCores, we prefer not to
        # use it - we might unnecessarily fragment the slot.
        jdl.append('rank = isUndefined(Cpus) ? 0 : ifThenElse(Cpus > MaxCores, -Cpus, Cpus)\n')
        # Record the number of CPUs utilized at match time.  We'll use this later
        # for monitoring and accounting.  Defaults to 0; once matched, it'll
        # put an attribute in the job  MATCH_EXP_JOB_GLIDEIN_Cpus = 4
        jdl.append('+JOB_GLIDEIN_Cpus = "$$(Cpus:0)"\n')
        # Make sure the resize request stays within MinCores and MaxCores.
        jdl.append('+RequestResizedCpus = (Cpus>MaxCores) ? MaxCores : ((Cpus < MinCores) ? MinCores : Cpus)\n')
        # If the job is running, then we should report the matched CPUs in RequestCpus - but only if there are sane
        # values.  Otherwise, we just report the original CPU request
        jdl.append('+JobCpus = ((JobStatus =!= 1) && (JobStatus =!= 5) && !isUndefined(MATCH_EXP_JOB_GLIDEIN_Cpus) && (int(MATCH_EXP_JOB_GLIDEIN_Cpus) isnt error)) ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus\n')

        # Cpus is taken from the machine ad - hence it is only defined when we are doing negotiation.
        # Otherwise, we use either the cores in the running job (if available) or the original cores.
        jdl.append('+RequestCpus = WMCore_ResizeJob ? (!isUndefined(Cpus) ? RequestResizedCpus : JobCpus) : OriginalCpus\n')

        jdl.append('+WMCore_ResizeJob = %s\n' % bool(job.get('resizeJob', False)))


        # Add OS requirements for jobs
        if job.get('scramArch') is not None and job.get('scramArch').startswith("slc6_"):
            jdl.append('+REQUIRED_OS = "rhel6"\n')
        else:
            jdl.append('+REQUIRED_OS = "any"\n')

        return jdl
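
The returned list is concatenated into the job's JDL text; a minimal sketch of that final step (the file name is an assumption for illustration):

def append_custom_jdl(jdl_lines, filename='condor.jdl'):
    # Each entry already ends in a newline, so a plain join suffices.
    with open(filename, 'a') as fd:
        fd.write(''.join(jdl_lines))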
Exemplo n.º 38
0
    def getJobParameters(self, jobList):
        """
        _getJobParameters_

        Return a list of dictionaries with submit parameters per job.
        """

        undefined = 'UNDEFINED'
        jobParameters = []

        for job in jobList:
            ad = {}

            ad['initial_Dir'] = job['cache_dir']
            ad['transfer_input_files'] = "%s,%s/%s,%s" % (
                job['sandbox'], job['packageDir'], 'JobPackage.pkl',
                self.unpacker)
            ad['Arguments'] = "%s %i %s" % (os.path.basename(
                job['sandbox']), job['id'], job["retry_count"])
            ad['transfer_output_files'] = "Report.%i.pkl,wmagentJob.log" % job[
                "retry_count"]

            # Do not define Requirements and X509 ads for Volunteer resources
            if self.reqStr and "T3_CH_Volunteer" not in job.get(
                    'possibleSites'):
                ad['Requirements'] = self.reqStr

            ad['My.x509userproxy'] = classad.quote(self.x509userproxy)
            sites = ','.join(sorted(job.get('possibleSites')))
            ad['My.DESIRED_Sites'] = classad.quote(str(sites))
            sites = ','.join(sorted(job.get('potentialSites')))
            ad['My.ExtDESIRED_Sites'] = classad.quote(str(sites))
            ad['My.CMS_JobRetryCount'] = str(job['retry_count'])
            ad['My.WMAgent_RequestName'] = classad.quote(job['request_name'])
            match = re.compile("^[a-zA-Z0-9_]+_([a-zA-Z0-9]+)-").match(
                job['request_name'])
            if match:
                ad['My.CMSGroups'] = classad.quote(match.groups()[0])
            else:
                ad['My.CMSGroups'] = undefined
            ad['My.WMAgent_JobID'] = str(job['jobid'])
            ad['My.WMAgent_SubTaskName'] = classad.quote(job['task_name'])
            ad['My.CMS_JobType'] = classad.quote(job['task_type'])
            ad['My.CMS_Type'] = classad.quote(activityToType(job['activity']))

            # Handling for AWS, cloud and opportunistic resources
            ad['My.AllowOpportunistic'] = str(
                job.get('allowOpportunistic', False))
            if job.get('inputDataset'):
                ad['My.DESIRED_CMSDataset'] = classad.quote(
                    job['inputDataset'])
            else:
                ad['My.DESIRED_CMSDataset'] = undefined
            if job.get('inputDatasetLocations'):
                sites = ','.join(sorted(job['inputDatasetLocations']))
                ad['My.DESIRED_CMSDataLocations'] = classad.quote(str(sites))
            else:
                ad['My.DESIRED_CMSDataLocations'] = undefined
            if job.get('inputPileup'):
                cmsPileups = ','.join(sorted(job['inputPileup']))
                ad['My.DESIRED_CMSPileups'] = classad.quote(str(cmsPileups))
            else:
                ad['My.DESIRED_CMSPileups'] = undefined
            # HighIO and repack jobs
            ad['My.Requestioslots'] = str(
                1 if job['task_type'] in
                ["Merge", "Cleanup", "LogCollect"] else 0)
            ad['My.RequestRepackslots'] = str(1 if job['task_type'] ==
                                              'Repack' else 0)
            # Performance and resource estimates (including JDL magic tweaks)
            origCores = job.get('numberOfCores', 1)
            estimatedMins = int(
                job['estimatedJobTime'] /
                60.0) if job.get('estimatedJobTime') else 12 * 60
            estimatedMinsSingleCore = estimatedMins * origCores
            # For now, assume a 15 minute job startup overhead -- condor will round this up further
            ad['My.EstimatedSingleCoreMins'] = str(estimatedMinsSingleCore)
            ad['My.OriginalMaxWallTimeMins'] = str(estimatedMins)
            ad['My.MaxWallTimeMins'] = 'WMCore_ResizeJob ? (EstimatedSingleCoreMins/RequestCpus + 15) : OriginalMaxWallTimeMins'
            requestMemory = int(job['estimatedMemoryUsage']) if job.get(
                'estimatedMemoryUsage', None) else 1000
            ad['My.OriginalMemory'] = str(requestMemory)
            ad['My.ExtraMemory'] = str(self.extraMem)
            ad['request_memory'] = 'OriginalMemory + ExtraMemory * (WMCore_ResizeJob ? (RequestCpus-OriginalCpus) : 0)'
            requestDisk = int(job['estimatedDiskUsage']) if job.get(
                'estimatedDiskUsage', None) else 20 * 1000 * 1000 * origCores
            ad['request_disk'] = str(requestDisk)
            # Set up JDL for multithreaded jobs.
            # By default, RequestCpus will evaluate to whatever CPU request was in the workflow.
            # If the job is labelled as resizable, then the logic is more complex:
            # - If the job is running in a slot with N cores, this should evaluate to N
            # - If the job is being matched against a machine, match all available CPUs, provided
            # they are between min and max CPUs.
            # - Otherwise, just use the original CPU count.
            ad['My.MinCores'] = str(job.get('minCores', max(1, origCores / 2)))
            ad['My.MaxCores'] = str(
                max(int(job.get('maxCores', origCores)), origCores))
            ad['My.OriginalCpus'] = str(origCores)
            # Prefer slots that are closest to our MaxCores without going over.
            # If the slot size is _greater_ than our MaxCores, we prefer not to
            # use it - we might unnecessarily fragment the slot.
            ad['Rank'] = 'isUndefined(Cpus) ? 0 : ifThenElse(Cpus > MaxCores, -Cpus, Cpus)'
            # Record the number of CPUs utilized at match time.  We'll use this later
            # for monitoring and accounting.  Defaults to 0; once matched, it'll
            # put an attribute in the job  MATCH_EXP_JOB_GLIDEIN_Cpus = 4
            ad['My.JOB_GLIDEIN_Cpus'] = classad.quote("$$(Cpus:0)")
            # Make sure the resize request stays within MinCores and MaxCores.
            ad['My.RequestResizedCpus'] = '(Cpus>MaxCores) ? MaxCores : ((Cpus < MinCores) ? MinCores : Cpus)'
            # If the job is running, then we should report the matched CPUs in RequestCpus - but only if there are sane
            # values.  Otherwise, we just report the original CPU request
            ad['My.JobCpus'] = (
                '((JobStatus =!= 1) && (JobStatus =!= 5) && !isUndefined(MATCH_EXP_JOB_GLIDEIN_Cpus) '
                '&& (int(MATCH_EXP_JOB_GLIDEIN_Cpus) isnt error)) ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus'
            )
            # Cpus is taken from the machine ad - hence it is only defined when we are doing negotiation.
            # Otherwise, we use either the cores in the running job (if available) or the original cores.
            ad['request_cpus'] = 'WMCore_ResizeJob ? (!isUndefined(Cpus) ? RequestResizedCpus : JobCpus) : OriginalCpus'
            ad['My.WMCore_ResizeJob'] = str(job.get('resizeJob', False))
            taskPriority = int(job.get('taskPriority', 1))
            priority = int(job.get('wf_priority', 0))
            ad['My.JobPrio'] = str(int(priority + taskPriority * 1))
            ad['My.PostJobPrio1'] = str(
                int(-1 * len(job.get('potentialSites', []))))
            ad['My.PostJobPrio2'] = str(int(-1 * job['task_id']))
            # Add OS requirements for jobs
            requiredOSes = self.scramArchtoRequiredOS(job.get('scramArch'))
            ad['My.REQUIRED_OS'] = classad.quote(requiredOSes)
            cmsswVersions = ','.join(job.get('swVersion'))
            ad['My.CMSSW_Versions'] = classad.quote(cmsswVersions)

            jobParameters.append(ad)

        return jobParameters
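
One way these per-job dictionaries can become actual submissions is to merge each onto a shared base description; a sketch under that assumption (the 'shared' dict carrying executable, universe, log and so on is hypothetical):

import htcondor

def submit_job_parameters(schedd, shared, jobParameters):
    # Merge the common settings with each job's attributes and submit
    # them one at a time, collecting the resulting cluster ids.
    cluster_ids = []
    for params in jobParameters:
        desc = dict(shared)
        desc.update(params)
        result = schedd.submit(htcondor.Submit(desc))
        cluster_ids.append(result.cluster())
    return cluster_ids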
Exemplo n.º 39
0
# (Reconstruction: the fragment is missing its property-testing harness;
#  the 'hypothesis' decorator below is an assumption about the original.)
from hypothesis import given, strategies as st

@given(st.text())
def test_quote_unquote_is_symmetric(input):
    assert classad.unquote(classad.quote(input)) == input
Exemplo n.º 40
0
    def test_quote(self):
        self.assertEqual(classad.quote("foo"), '"foo"')
        self.assertEqual(classad.quote('"foo'), '"\\"foo"')
        for i in ["foo", '"foo', '"\\"foo']:
            self.assertEqual(i, classad.unquote(classad.quote(i)))
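
The round-trip property these tests pin down is what makes classad.quote safe for untrusted values, where naive string interpolation breaks; for example:

import classad

value = 'T1_"rogue"_Site'
naive = '"%s"' % value        # malformed: the inner quote ends the string early
safe = classad.quote(value)   # '"T1_\\"rogue\\"_Site"', a valid ClassAd literal
assert classad.unquote(safe) == value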
Exemplo n.º 41
0
    def customizePerJob(self, job):
        """
        JDL additions just for this implementation. Over-ridden in sub-classes
        These are the Glide-in specific bits
        """
        jdl = []
        jobCE = job['location']
        if not jobCE:
            # Then we ended up with a site that doesn't exist?
            logging.error("Job for non-existant site %s", job['location'])
            return jdl

        if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
            strg = ','.join([str(x) for x in job.get('possibleSites')])
            jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))

        if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
            strg = ','.join([str(x) for x in job.get('potentialSites')])
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

        if job.get('proxyPath'):
            jdl.append('x509userproxy = %s\n' % job['proxyPath'])

        jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])
        match = GROUP_NAME_RE.match(job['requestName'])
        if match:
            jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
        else:
            jdl.append('+CMSGroups = undefined\n')

        jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
        jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

        # Handling for AWS, cloud and opportunistic resources
        jdl.append('+AllowOpportunistic = %s\n' %
                   job.get('allowOpportunistic', False))

        # dataset info
        if job.get('inputDataset'):
            jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
        else:
            jdl.append('+DESIRED_CMSDataset = undefined\n')
        if job.get('inputDatasetLocations'):
            jdl.append('+DESIRED_CMSDataLocations = "%s"\n' %
                       ','.join(job['inputDatasetLocations']))
        else:
            jdl.append('+DESIRED_CMSDataLocations = undefined\n')

        # HighIO and repack jobs handling
        highio = 1 if job['taskType'] in ["Merge", "Cleanup", "LogCollect"
                                          ] else 0
        repackjob = 1 if job['taskType'] == 'Repack' else 0
        jdl.append('+Requestioslots = %d\n' % highio)
        jdl.append('+RequestRepackslots = %d\n' % repackjob)

        # Performance and resource estimates (including JDL magic tweaks)
        origCores = job.get('numberOfCores', 1)
        estimatedMins = int(job['estimatedJobTime'] /
                            60.0) if job.get('estimatedJobTime') else 12 * 60
        estimatedMinsSingleCore = estimatedMins * origCores
        # For now, assume a 15 minute job startup overhead -- condor will round this up further
        jdl.append('+EstimatedSingleCoreMins = %d\n' % estimatedMinsSingleCore)
        jdl.append('+OriginalMaxWallTimeMins = %d\n' % estimatedMins)
        jdl.append(
            '+MaxWallTimeMins = WMCore_ResizeJob ? (EstimatedSingleCoreMins/RequestCpus + 15) : OriginalMaxWallTimeMins\n'
        )

        requestMemory = int(job['estimatedMemoryUsage']) if job.get(
            'estimatedMemoryUsage', None) else 1000
        jdl.append('+OriginalMemory = %d\n' % requestMemory)
        jdl.append('+ExtraMemory = %d\n' % self.extraMem)
        jdl.append(
            '+RequestMemory = OriginalMemory + ExtraMemory * (WMCore_ResizeJob ? (RequestCpus-OriginalCpus) : 0)\n'
        )

        requestDisk = int(job['estimatedDiskUsage']) if job.get(
            'estimatedDiskUsage', None) else 20 * 1000 * 1000 * origCores
        jdl.append('request_disk = %d\n' % requestDisk)

        # Set up JDL for multithreaded jobs.
        # By default, RequestCpus will evaluate to whatever CPU request was in the workflow.
        # If the job is labelled as resizable, then the logic is more complex:
        # - If the job is running in a slot with N cores, this should evaluate to N
        # - If the job is being matched against a machine, match all available CPUs, provided
        # they are between min and max CPUs.
        # - Otherwise, just use the original CPU count.
        jdl.append('machine_count = 1\n')
        minCores = int(job.get('minCores', max(1, origCores / 2)))
        maxCores = max(int(job.get('maxCores', origCores)), origCores)
        jdl.append('+MinCores = %d\n' % minCores)
        jdl.append('+MaxCores = %d\n' % maxCores)
        # Advertise the original CPU setting, in case someone needs this for monitoring
        jdl.append('+OriginalCpus = %d\n' % origCores)
        # Prefer slots that are closest to our MaxCores without going over.
        # If the slot size is _greater_ than our MaxCores, we prefer not to
        # use it - we might unnecessarily fragment the slot.
        jdl.append(
            'rank = isUndefined(Cpus) ? 0 : ifThenElse(Cpus > MaxCores, -Cpus, Cpus)\n'
        )
        # Record the number of CPUs utilized at match time.  We'll use this later
        # for monitoring and accounting.  Defaults to 0; once matched, it'll
        # put an attribute in the job  MATCH_EXP_JOB_GLIDEIN_Cpus = 4
        jdl.append('+JOB_GLIDEIN_Cpus = "$$(Cpus:0)"\n')
        # Make sure the resize request stays within MinCores and MaxCores.
        jdl.append(
            '+RequestResizedCpus = (Cpus>MaxCores) ? MaxCores : ((Cpus < MinCores) ? MinCores : Cpus)\n'
        )
        # If the job is running, then we should report the matched CPUs in RequestCpus - but only if there are sane
        # values.  Otherwise, we just report the original CPU request
        jdl.append(
            '+JobCpus = ((JobStatus =!= 1) && (JobStatus =!= 5) && !isUndefined(MATCH_EXP_JOB_GLIDEIN_Cpus) '
            '&& (int(MATCH_EXP_JOB_GLIDEIN_Cpus) isnt error)) ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus\n'
        )

        # Cpus is taken from the machine ad - hence it is only defined when we are doing negotiation.
        # Otherwise, we use either the cores in the running job (if available) or the original cores.
        jdl.append(
            '+RequestCpus = WMCore_ResizeJob ? (!isUndefined(Cpus) ? RequestResizedCpus : JobCpus) : OriginalCpus\n'
        )

        jdl.append('+WMCore_ResizeJob = %s\n' %
                   bool(job.get('resizeJob', False)))

        # Add OS requirements for jobs
        requiredOSes = self.scramArchtoRequiredOS(job.get('scramArch'))
        jdl.append('+REQUIRED_OS = "%s"\n' % requiredOSes)

        return jdl
Exemplo n.º 42
0
    def track(self, jobs):
        """
        _track_

        Track the jobs while in condor
        This returns a three-way ntuple
        First, the total number of jobs still running
        Second, the jobs that need to be changed
        Third, the jobs that need to be completed
        """
        jobInfo = {}
        changeList = []
        completeList = []
        runningList = []

        # get info about all active and recent jobs
        logging.debug("SimpleCondorPlugin is going to track %s jobs", len(jobs))

        schedd = htcondor.Schedd()

        logging.debug("Start: Retrieving classAds using Condor Python XQuery")
        try:
            itobj = schedd.xquery("WMAgent_AgentName == %s" % classad.quote(self.agent),
                                  ['ClusterId', 'ProcId', 'JobStatus', 'MATCH_EXP_JOBGLIDEIN_CMSSite'])
            for jobAd in itobj:
                gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
                jobStatus = SimpleCondorPlugin.exitCodeMap().get(jobAd.get('JobStatus'), 'Unknown')
                location = jobAd.get('MATCH_EXP_JOBGLIDEIN_CMSSite', None)
                jobInfo[gridId] = (jobStatus, location)
        except Exception as ex:
            logging.error("Query to condor schedd failed in SimpleCondorPlugin.")
            logging.error("Returning empty lists for all job types...")
            logging.exception(ex)
            return runningList, changeList, completeList

        logging.debug("Finished retrieving %d classAds from Condor", len(jobInfo))

        # now go over the jobs and see what we have
        for job in jobs:

            # if the schedd doesn't know a job, consider it complete
            # doing any further checks is not cost effective
            if job['gridid'] not in jobInfo:
                (newStatus, location) = ('Completed', None)
            else:
                (newStatus, location) = jobInfo[job['gridid']]

            # check for status changes
            if newStatus != job['status']:

                # update location info for Idle->Running transition
                if newStatus == 'Running' and job['status'] == 'Idle':
                    if location:
                        job['location'] = location
                        logging.debug("JobAdInfo: Job location for jobid=%i gridid=%s changed to %s", job['jobid'], job['gridid'], location)

                job['status'] = newStatus
                job['status_time'] = int(time.time())
                logging.debug("JobAdInfo: Job status for jobid=%i gridid=%s changed to %s", job['jobid'], job['gridid'], job['status'])
                changeList.append(job)

            job['globalState'] = SimpleCondorPlugin.stateMap().get(newStatus)

            # stop tracking finished jobs
            if job['globalState'] in ['Complete', 'Error']:
                completeList.append(job)
            else:
                runningList.append(job)

        logging.debug("SimpleCondorPlugin tracking : %i/%i/%i (Executing/Changing/Complete)",
                      len(runningList), len(changeList), len(completeList))

        return runningList, changeList, completeList
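
exitCodeMap and stateMap are referenced above but not shown. Based on the standard HTCondor JobStatus integers, a plausible reconstruction of the first map would be (an assumption, not the plugin's actual code):

def exitCodeMap():
    # Standard HTCondor JobStatus codes; 'Idle' and 'Running' are the
    # names track() relies on for the Idle->Running transition.
    return {0: 'Unexpanded', 1: 'Idle', 2: 'Running', 3: 'Removed',
            4: 'Completed', 5: 'Held', 6: 'TransferOutput', 7: 'Suspended'}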
Exemplo n.º 43
0
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        if info["CMSGroups"]:
            dagAd["CMSGroups"] = ','.join(info["CMSGroups"])
        else:
            dagAd["CMSGroups"] = classad.Value.Undefined

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["X509UserProxy"] = info['user_proxy']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']

        dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode('ascii', 'ignore'))
        dagAd['CRAB_TaskLifetimeDays'] = TASKLIFETIME // 24 // 60 // 60
        dagAd['CRAB_TaskEndTime'] = int(info["start_time"]) + TASKLIFETIME
        #For task management info see https://github.com/dmwm/CRABServer/issues/4681#issuecomment-302336451
        dagAd["LeaveJobInQueue"] = classad.ExprTree("true")
        dagAd["PeriodicHold"] = classad.ExprTree("time() > CRAB_TaskEndTime")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"

        with open('subdag.ad', 'w') as fd:
            for k, v in dagAd.items():
                if k == 'X509UserProxy':
                    v = os.path.basename(v)
                if isinstance(v, str):
                    value = classad.quote(v)
                elif isinstance(v, classad.ExprTree):
                    value = repr(v)
                elif isinstance(v, list):
                    value = "{{{0}}}".format(json.dumps(v)[1:-1])
                else:
                    value = v
                fd.write('+{0} = {1}\n'.format(k, value))

        dagAd["TaskType"] = "ROOT"
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])

        condorIdDict = {}
        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy'], pickleOut=True, outputObj=condorIdDict, logger=self.logger) as (parent, rpipe):
            if not parent:
                resultAds = []
                condorIdDict['ClusterId'] = schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                # editing the LeaveJobInQueue since the remote submit overwrites it
                # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
                if resultAds:
                    id_ = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree("true"))

        try:
            results = pickle.load(rpipe)
        except EOFError:
            #Do not want to retry this since error may happen after submit (during edit for example).
            #And this can cause the task to be submitted twice (although we have a protection in the duplicatedCheck)
            raise TaskWorkerException("Timeout executing condor submit command.", retry=False)

        #notice that the clusterId might be set even if there was a failure. This is if the schedd.submit succeded, but the spool  call failed
        if 'ClusterId' in results.outputObj:
            self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj['ClusterId'])
        if results.outputMessage != "OK":
            self.logger.debug("Now printing the environment used for submission:\n" + "-"*70 + "\n" + results.environmentStr + "-"*70)
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True)

        #if we don't raise exception above the id is here
        return results.outputObj['ClusterId']
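
Policy expressions like the OnExitRemove above are easy to sanity-check by evaluating them against a toy ad; for example (illustrative values only):

import classad

job = classad.ClassAd()
job['ExitCode'] = 1
job['OnExitRemove'] = classad.ExprTree(
    '(ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >= 0 && ExitCode <= 2))')
# Evaluate the expression in the context of the ad: exit code 1 is removable.
print(job.eval('OnExitRemove'))  # -> True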
Exemplo n.º 44
0
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = self.task_ad['CRAB_ResubmitList']
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory = None
        numcores = None
        priority = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMins' in self.task_ad:
                maxjobruntime = int(str(
                    self.task_ad.lookup('MaxWallTimeMins')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) - 1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory = self.resubmit_info[inkey].get('maxmemory')
            numcores = self.resubmit_info[inkey].get('numcores')
            priority = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory'] = maxmemory
        self.resubmit_info[outkey]['numcores'] = numcores
        self.resubmit_info[outkey]['priority'] = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        if maxjobruntime is not None:
            new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
        if maxmemory is not None:
            new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += '+JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
        pre_job_prio = 1
        if self.job_id <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(
            self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry,
                                          use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(
                self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%d.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
Exemplo n.º 45
0
    def parse_job_set_file(self, job_set_file):
        commands = {"name", "iterator", "job"}
        iterator_types = {"table"}
        lineno = 0
        with open(job_set_file, "rt") as f:
            while f:

                line = f.readline()
                if line == "":
                    break
                lineno += 1

                line = line.strip()
                if line == "" or line.startswith("#"):
                    continue

                try:
                    command = line.split()[0].split("=")[0].casefold()
                except IndexError:
                    raise IndexError(
                        f"""Malformed command in {job_set_file} at line {lineno}."""
                    )

                if command not in commands:
                    raise ValueError(
                        f"""Unrecognized command "{command}" in {job_set_file} at line {lineno}."""
                    )

                if command == "name":
                    if self.name is not None:
                        raise ValueError(
                            f"""Job set name can only be set once, second name found in {job_set_file} at line {lineno}."""
                        )
                    try:
                        value = line.split("#")[0].split("=")[1].strip()
                    except IndexError:
                        raise IndexError(
                            f"""Malformed {command} command in {job_set_file} at line {lineno}."""
                        )
                    if value.strip() == "":
                        raise ValueError(
                            f"""Blank job set name found in {job_set_file} at line {lineno}."""
                        )

                    self.name = value

                elif command == "iterator":
                    if self.itemdata is not None:
                        raise ValueError(
                            f"""Job set iterator can only be set once, second iterator found in {job_set_file} at line {lineno}."""
                        )

                    try:
                        value = line.split("#")[0].split("=")[1].strip()
                    except IndexError:
                        raise IndexError(
                            f"""Malformed {command} command in {job_set_file} at line {lineno}."""
                        )

                    if len(value.split()) < 3:
                        raise ValueError(
                            f"""Unparseable iterator "{value}" in {job_set_file} at line {lineno}."""
                        )

                    iterator_type = value.split()[0]
                    if iterator_type not in iterator_types:
                        raise ValueError(
                            f"""Unknown iterator type "{iterator_type}" in {job_set_file} at line {lineno}."""
                        )

                    if iterator_type == "table":

                        # Get the column names
                        iterator_names = value.replace(",", " ").split()[1:-1]
                        iterator_names = [x.strip() for x in iterator_names]

                        # Read the iterator values into a itemdata list of dicts
                        iterator_source = value.split()[-1]
                        if iterator_source == "{":
                            inline = "{"
                            inlineno = 0
                            inline_data = ""
                            while inline != "":
                                inline = f.readline()
                                inlineno += 1

                                if inline.strip() == "":
                                    continue

                                if inline.split("#")[0].strip() == "}":
                                    break

                                # Assume that a newly opened bracket without
                                # a closing bracket means that there was an error.
                                try:
                                    if inline.split(
                                            "#")[0].split()[-1].strip() == "{":
                                        raise ValueError(
                                            f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                                        )
                                except IndexError:
                                    pass  # Let the parser handle this situation

                                inline_data += inline
                            else:
                                raise ValueError(
                                    f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                                )
                            self.itemdata = self.parse_columnar_itemdata(
                                iterator_names,
                                inline_data,
                                lineno=lineno,
                                fname=job_set_file)
                            lineno += inlineno
                        else:
                            try:
                                with open(iterator_source, "rt") as f_iter:
                                    self.itemdata = self.parse_columnar_itemdata(
                                        iterator_names,
                                        f_iter.read(),
                                        fname=iterator_source)
                            except IOError as e:
                                raise IOError(
                                    f"Error opening table file {iterator_source} in {job_set_file} at line {lineno}:\n{str(e)}"
                                )

                elif command == "job":
                    try:
                        value = " ".join(
                            line.split("#")[0].strip().split()[1:])
                    except IndexError:
                        raise IndexError(
                            f"""Malformed {command} command in {job_set_file} at line {lineno}."""
                        )

                    # Get the variable name mappings
                    mappings = []
                    if len(value.split()) > 1:
                        mapping_strs = ",".join(value.split()[:-1])
                        if "=" in mapping_strs:
                            for mapping_str in mapping_strs.split(","):
                                mapping = tuple(
                                    x.strip() for x in mapping_str.split("="))
                                if len(mapping) != 2:
                                    raise ValueError(
                                        f"""Unsupported mapping "{mapping_str}" in {job_set_file} at line {lineno}."""
                                    )
                                mappings.append(mapping)
                        else:
                            raise ValueError(
                                f"""Unsupported mapping "{' '.join(value.split()[:-1])}" in {job_set_file} at line {lineno}."""
                            )
                    mappings = dict(mappings)

                    # Read the job submit description into a Submit object
                    job_source = value.split()[-1]
                    if job_source == "{":
                        inline = "{"
                        inlineno = 0
                        inline_data = ""
                        while inline != "":
                            inline = f.readline()
                            inlineno += 1

                            if inline.strip() == "":
                                continue

                            if inline.split("#")[0].strip() == "}":
                                break

                            # Assume that a newly opened bracket without
                            # a closing bracket means that there was an error.
                            try:
                                if inline.split(
                                        "#")[0].split()[-1].strip() == "{":
                                    raise ValueError(
                                        f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                                    )
                            except IndexError:
                                pass  # Let the parser handle this situation

                            inline_data += inline.lstrip()
                        else:
                            raise ValueError(
                                f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                            )
                        lineno += inlineno
                        submit_obj = htcondor.Submit(inline_data)
                        # Record that this submit came from a job set (JSM_HTC_JOBSET_SUBMIT)
                        submit_obj.setSubmitMethod(JSM_HTC_JOBSET_SUBMIT, True)
                    else:
                        try:
                            with open(job_source, "rt") as f_sub:
                                submit_obj = htcondor.Submit(f_sub.read())
                                # Record that this submit came from a job set (JSM_HTC_JOBSET_SUBMIT)
                                submit_obj.setSubmitMethod(
                                    JSM_HTC_JOBSET_SUBMIT, True)
                        except IOError as e:
                            raise IOError(
                                f"Error opening submit description file {job_source} in {job_set_file} at line {lineno}:\n{str(e)}"
                            )

                    # Remap variables in the Submit object
                    submit_obj = self.remap_submit_variables(
                        mappings, submit_obj)

                    # Store each job
                    self.jobs.append(submit_obj)

        # Add job set name to each job's Submit object
        for i_job, job in enumerate(self.jobs):
            job["MY.JobSetName"] = classad.quote(self.name)
            job["MY.InJobSet"] = True
Exemplo n.º 46
0
    def submitDirect(self, schedd, cmd, arg, info):  # pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        if info["CMSGroups"]:
            dagAd["CMSGroups"] = ",".join(info["CMSGroups"])
        else:
            dagAd["CMSGroups"] = classad.Value.Undefined

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        # dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["X509UserProxy"] = info["user_proxy"]
        dagAd["Requirements"] = classad.ExprTree("true || false")
        dagAd["TaskType"] = "ROOT"
        dagAd["Environment"] = classad.ExprTree(
            'strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")'
            % " ".join(info["additional_environment_options"].split(";"))
        )
        dagAd["RemoteCondorSetup"] = info["remote_condor_setup"]

        with open("subdag.ad", "w") as fd:
            for k, v in dagAd.items():
                if k == "X509UserProxy":
                    v = os.path.basename(v)
                if isinstance(v, basestring):
                    value = classad.quote(v)
                elif isinstance(v, classad.ExprTree):
                    value = repr(v)
                else:
                    value = v
                fd.write("+{0} = {1}\n".format(k, value))

        dagAd["Out"] = str(os.path.join(info["scratch"], "request.out"))
        dagAd["Err"] = str(os.path.join(info["scratch"], "request.err"))
        dagAd["Cmd"] = cmd
        dagAd["Args"] = arg
        dagAd["TransferInput"] = str(info["inputFilesString"])
        dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode("ascii", "ignore"))
        # Putting JobStatus == 4 since LeaveJobInQueue is for completed jobs (probably redundant)
        LEAVE_JOB_IN_QUEUE_EXPR = "(JobStatus == 4) && ((time()-CRAB_TaskSubmitTime) < %s)" % TASKLIFETIME
        dagAd["LeaveJobInQueue"] = classad.ExprTree(LEAVE_JOB_IN_QUEUE_EXPR)
        # Removing a task after the expiration date no matter what its status is
        dagAd["PeriodicRemove"] = classad.ExprTree("((time()-CRAB_TaskSubmitTime) > %s)" % TASKLIFETIME)
        dagAd["TransferOutput"] = info["outputFilesString"]
        dagAd["OnExitRemove"] = classad.ExprTree(
            "( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))"
        )
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")

        condorIdDict = {}
        with HTCondorUtils.AuthenticatedSubprocess(info["user_proxy"], pickleOut=True, outputObj=condorIdDict) as (
            parent,
            rpipe,
        ):
            if not parent:
                resultAds = []
                condorIdDict["ClusterId"] = schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                # editing the LeaveJobInQueue since the remote submit overwrites it
                # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
                if resultAds:
                    id_ = "%s.%s" % (resultAds[0]["ClusterId"], resultAds[0]["ProcId"])
                    schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree(LEAVE_JOB_IN_QUEUE_EXPR))

        results = pickle.load(rpipe)

        # notice that the clusterId might be set even if there was a failure: this happens
        # when schedd.submit succeeded but the subsequent spool call failed
        if "ClusterId" in results.outputObj:
            self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj["ClusterId"])
        if results.outputMessage != "OK":
            self.logger.debug(
                "Now printing the environment used for submission:\n"
                + "-" * 70
                + "\n"
                + results.environmentStr
                + "-" * 70
            )
            raise TaskWorkerException(
                "Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True
            )

        # if we don't raise exception above the id is here
        return results.outputObj["ClusterId"]
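
The subdag.ad loop above encodes three serialization rules for old-style ads: plain strings are quoted, ExprTree values are written verbatim, and anything else is written as-is. A minimal standalone sketch of the same rules (Python 3 spelling; the attributes are illustrative):

import classad

ad = {
    "TaskType": "ROOT",                                 # string  -> quoted
    "CRAB_Attempt": 0,                                  # integer -> raw
    "Requirements": classad.ExprTree("true || false"),  # expression -> repr()
}
for k, v in ad.items():
    if isinstance(v, str):
        value = classad.quote(v)
    elif isinstance(v, classad.ExprTree):
        value = repr(v)
    else:
        value = v
    print("+{0} = {1}".format(k, value))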
Exemplo n.º 47
0
def main():
    """
    Entry point for AdjustSites: parse the job ad, set up the webdir and upload
    its URL to the REST, clear the automatic blacklist, and adjust DAG files for
    any requested resubmissions.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    if exitCode != 0:
        printLog("Exiting AdjustSites because the webdir upload failed three times.")
        sys.exit(1)

    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)

    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until HTCondor 8.1.6; on
                # older versions we simply run without locking.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Exemplo n.º 48
0
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = map(str, self.task_ad['CRAB_ResubmitList'])
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory     = None
        numcores      = None
        priority      = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
            elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsTail')))
            elif 'MaxWallTimeMins' in self.task_ad:
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
            if str(self.job_id) == '0':  # job ids can be like 1-1 for subjobs
                priority = 20  # the maximum, used for splitting jobs
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) - 1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory     = self.resubmit_info[inkey].get('maxmemory')
            numcores      = self.resubmit_info[inkey].get('numcores')
            priority      = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory']     = maxmemory
        self.resubmit_info[outkey]['numcores']      = numcores
        self.resubmit_info[outkey]['priority']      = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        if maxjobruntime is not None:
            new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(maxjobruntime)
            new_submit_text += '+MaxWallTimeMins = (JobStatus=?=1) ? EstimatedWallTimeMins : %s\n' % str(maxjobruntime)
        if maxmemory is not None:
            new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += '+JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs of this task ahead of jobs from other tasks with the same JobPrio.
        pre_job_prio = 1
        if int(self.job_id.split('-')[0]) <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%s.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
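
When use_resubmit_info is set, the inkey loop above walks back from the previous retry number until it finds a retry whose parameters were recorded. A standalone sketch of just that lookup, with illustrative resubmit_info content:

# Minimal sketch (not CRAB code) of the retry-key fallback used above.
resubmit_info = {"0": {"maxmemory": 2000}, "2": {"maxmemory": 2500}}
crab_retry = 4

inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
while inkey not in resubmit_info and int(inkey) > 0:
    inkey = str(int(inkey) - 1)
print(resubmit_info[inkey].get("maxmemory"))  # -> 2500, recorded at retry 2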
Exemplo n.º 49
0
    def updateSiteInformation(self, jobs, siteName, excludeSite):
        """
        _updateSiteInformation_

        Allow or disallow jobs to run at a site.
        Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.

        Kill job if after removing site from allowed sites it has nowhere to run.

        Parameters:    excludeSite = False when moving to Normal
                       excludeSite = True when moving to Down, Draining or Aborted
        """
        sd = htcondor.Schedd()
        jobIdToKill = []
        jobtokill = []
        origSiteLists = set()

        try:
            itobj = sd.xquery('WMAgent_AgentName =?= %s && JobStatus =?= 1' % classad.quote(self.agent),
                              ['WMAgent_JobID', 'DESIRED_Sites', 'ExtDESIRED_Sites'])

            for jobAd in itobj:
                jobAdId = jobAd.get('WMAgent_JobID')
                desiredSites = jobAd.get('DESIRED_Sites')
                extDesiredSites = jobAd.get('ExtDESIRED_Sites')
                if excludeSite and siteName == desiredSites:
                    jobIdToKill.append(jobAdId)
                else:
                    origSiteLists.add((desiredSites, extDesiredSites))
            logging.info("Set of %d site list condor combinations", len(origSiteLists))
        except Exception as ex:
            msg = "Failed to query condor schedd: %s" % str(ex)
            logging.exception(msg)
            return jobtokill

        with sd.transaction() as txn:
            for siteStrings in origSiteLists:
                desiredList = set([site.strip() for site in siteStrings[0].split(",")])
                extDesiredList = set([site.strip() for site in siteStrings[1].split(",")])

                if excludeSite and siteName not in desiredList:
                    continue
                elif not excludeSite and (siteName in desiredList or siteName not in extDesiredList):
                    continue
                elif excludeSite:
                    desiredList.remove(siteName)
                    extDesiredList.add(siteName)
                else:  # well, then include
                    desiredList.add(siteName)
                    extDesiredList.remove(siteName)

                # now put it back in the string format expected by condor
                desiredList = ",".join(desiredList)
                extDesiredList = ",".join(extDesiredList)

                try:
                    sd.edit('DESIRED_Sites =?= %s && ExtDESIRED_Sites =?= %s' % (classad.quote(siteStrings[0]),
                                                                                 classad.quote(siteStrings[1])),
                            "DESIRED_Sites", classad.quote(str(desiredList)))
                    sd.edit('DESIRED_Sites =?= %s && ExtDESIRED_Sites =?= %s' % (classad.quote(siteStrings[0]),
                                                                                 classad.quote(siteStrings[1])),
                            "ExtDESIRED_Sites", classad.quote(str(extDesiredList)))
                except RuntimeError as ex:
                    msg = 'Failed to edit job sites via condor; possibly no matching jobs were left in the queue: %s' % str(ex)
                    logging.warning(msg)

        # now update the list of jobs to be killed
        jobtokill = [job for job in jobs if job['id'] in jobIdToKill]

        return jobtokill
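
The body of the transaction above is a symmetric swap between the two site lists. A minimal pure-function sketch of that swap (not WMAgent code; site names are illustrative, and the output is sorted only to make it deterministic):

def move_site(desired, ext_desired, site, exclude):
    # exclude=True moves `site` out of the allowed list; False moves it back in.
    desired, ext_desired = set(desired), set(ext_desired)
    if exclude and site in desired:
        desired.remove(site)
        ext_desired.add(site)
    elif not exclude and site in ext_desired and site not in desired:
        desired.add(site)
        ext_desired.remove(site)
    return ",".join(sorted(desired)), ",".join(sorted(ext_desired))

print(move_site({"T1_US_FNAL", "T2_DE_DESY"}, {"T2_DE_DESY"}, "T1_US_FNAL", True))
# -> ('T2_DE_DESY', 'T1_US_FNAL,T2_DE_DESY')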
Exemplo n.º 50
0
def classad_quote(input_value):
    import classad
    return classad.quote(str(input_value))
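
classad.quote wraps a value in ClassAd string syntax and escapes embedded quotes, which is what makes it safe to splice user-supplied values into the constraint strings used throughout these examples:

import classad

print(classad.quote("T2_US_Nebraska"))  # -> "T2_US_Nebraska"
print(classad.quote('say "hi"'))        # -> "say \"hi\""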
Exemplo n.º 51
0
def find_job_event_logs(
    users=None,
    cluster_ids=None,
    files=None,
    batches=None,
    collector=None,
    schedd=None,
):
    """
    Discover job event logs to read events from based on various methods.

    Parameters
    ----------
    users
        Find job event logs for these user's active jobs.
    cluster_ids
        Find job event logs for these clusters.
    files
        Find these job event logs (basically, these just get passed straight through).
    batches
        Find job event logs for these batch names.
    collector
        Query this collector to find the schedd.
        Defaults to the local collector.
    schedd
        Query this schedd for users, cluster_ids, and batches.
        Defaults to the local schedd.
    """
    if users is None:
        users = []
    if cluster_ids is None:
        cluster_ids = []
    if files is None:
        files = []
    if batches is None:
        batches = []

    constraint = " || ".join(
        itertools.chain(
            ("Owner == {}".format(classad.quote(u)) for u in users),
            ("ClusterId == {}".format(cid) for cid in cluster_ids),
            ("JobBatchName == {}".format(b) for b in batches),
        ))

    clusters = set()
    event_logs = set()
    batch_names = {}
    already_warned_missing_log = set()
    dagman_job_cluster_id_to_log_path = {}
    dagman_job_cluster_ids = set()

    for file in files:
        event_logs.add(os.path.abspath(file))

    for ad in get_ads(constraint, collector, schedd):
        cluster_id = ad["ClusterId"]
        clusters.add(cluster_id)

        batch_names[cluster_id] = ad.get("JobBatchName")

        if "DAGManNodesLog" in ad:
            log_path = dagman_job_cluster_id_to_log_path[cluster_id] = ad[
                "DAGManNodesLog"]
        elif "UserLog" in ad:
            log_path = ad["UserLog"]
        else:
            if cluster_id not in already_warned_missing_log:
                warning(
                    "Cluster {} does not have a job event log file (set log=<path> in the submit description)"
                    .format(cluster_id))
                already_warned_missing_log.add(cluster_id)
            continue

        if not os.path.isabs(log_path):
            log_path = os.path.abspath(os.path.join(ad["Iwd"], log_path))

        event_logs.add(log_path)

        # this job is the actual DAGMan controller job
        if "OtherJobRemoveRequirements" in ad:
            dagman_job_cluster_ids.add(cluster_id)

    return (
        clusters,
        constraint,
        event_logs,
        batch_names,
        dagman_job_cluster_id_to_log_path,
        dagman_job_cluster_ids,
    )
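
A hypothetical invocation, assuming a reachable schedd plus the get_ads and warning helpers this function depends on (the user and cluster values are made up). The constraint ORs the per-user and per-cluster clauses, and event_logs comes back as a set of absolute paths:

clusters, constraint, event_logs, batch_names, dag_logs, dag_ids = find_job_event_logs(
    users=["alice"],
    cluster_ids=[1234],
)
print(constraint)  # Owner == "alice" || ClusterId == 1234
for path in sorted(event_logs):
    print(path)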
Exemplo n.º 52
0
    def customizePerJob(self, job):
        """
        JDL additions just for this implementation. Over-ridden in sub-classes
        These are the Glide-in specific bits
        """
        jdl = []
        jobCE = job['location']
        if not jobCE:
            # Then we ended up with a site that doesn't exist?
            logging.error("Job for non-existant site %s", job['location'])
            return jdl

        if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
            strg = ','.join(map(str, job.get('possibleSites')))
            jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))

        if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
            strg = ','.join(map(str, job.get('potentialSites')))
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
        else:
            jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

        if job.get('proxyPath'):
            jdl.append('x509userproxy = %s\n' % job['proxyPath'])

        jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])
        match = GROUP_NAME_RE.match(job['requestName'])
        if match:
            jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
        else:
            jdl.append('+CMSGroups = undefined\n')

        jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
        jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

        # Handling for AWS, cloud and opportunistic resources
        jdl.append('+AllowOpportunistic = %s\n' %
                   job.get('allowOpportunistic', False))

        # dataset info
        if job.get('inputDataset'):
            jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
        else:
            jdl.append('+DESIRED_CMSDataset = undefined\n')
        if job.get('inputDatasetLocations'):
            jdl.append('+DESIRED_CMSDataLocations = "%s"\n' %
                       ','.join(job['inputDatasetLocations']))
        else:
            jdl.append('+DESIRED_CMSDataLocations = undefined\n')

        # HighIO and repack jobs handling
        highio = 1 if job['taskType'] in ["Merge", "Cleanup", "LogCollect"
                                          ] else 0
        repackjob = 1 if job['taskType'] == 'Repack' else 0
        jdl.append('+Requestioslots = %d\n' % highio)
        jdl.append('+RequestRepackslots = %d\n' % repackjob)

        # Performance and resource estimates
        numberOfCores = job.get('numberOfCores', 1)
        requestMemory = int(job['estimatedMemoryUsage']) if job.get(
            'estimatedMemoryUsage', None) else 1000
        requestDisk = int(job['estimatedDiskUsage']) if job.get(
            'estimatedDiskUsage', None) else 20 * 1000 * 1000 * numberOfCores
        maxWallTimeMins = int(job['estimatedJobTime']) / 60.0 if job.get(
            'estimatedJobTime', None) else 12 * 60
        jdl.append('request_memory = %d\n' % requestMemory)
        jdl.append('request_disk = %d\n' % requestDisk)
        jdl.append('+MaxWallTimeMins = %d\n' % maxWallTimeMins)

        # How many cores job is using
        jdl.append('machine_count = 1\n')
        jdl.append('request_cpus = %s\n' % numberOfCores)

        # Add OS requirements for jobs
        if job.get('scramArch') is not None and job.get(
                'scramArch').startswith("slc6_"):
            jdl.append('+REQUIRED_OS = "rhel6"\n')
        else:
            jdl.append('+REQUIRED_OS = "any"\n')

        return jdl
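
To make the output concrete, here is a hypothetical job dictionary and selected JDL lines customizePerJob would emit for it, assuming submitWMSMode is off and GROUP_NAME_RE (defined elsewhere) does not match the made-up request name:

# Illustrative input; all values are made up.
job = {
    'location': 'T2_US_Nebraska',
    'requestName': 'some_request_name',
    'taskName': 'SomeTask',
    'taskType': 'Merge',
    'numberOfCores': 4,
    'estimatedMemoryUsage': 4000,
}
# Selected lines the method would append for this job:
#   +DESIRED_Sites = "T2_US_Nebraska"
#   +ExtDESIRED_Sites = "T2_US_Nebraska"
#   +CMSGroups = undefined
#   +Requestioslots = 1           (Merge counts as a high-IO task type)
#   +RequestRepackslots = 0
#   request_memory = 4000
#   request_disk = 80000000       (20M-per-core default times 4 cores)
#   +MaxWallTimeMins = 720        (12-hour default when no estimate is given)
#   request_cpus = 4
#   +REQUIRED_OS = "any"          (no scramArch in the job dict)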
Exemplo n.º 53
0
    def updateSiteInformation(self, jobs, siteName, excludeSite):
        """
        _updateSiteInformation_

        Allow or disallow jobs to run at a site.
        Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.

        Kill job if after removing site from allowed sites it has nowhere to run.

        Parameters:    excludeSite = False when moving to Normal
                       excludeSite = True when moving to Down, Draining or Aborted
        """
        sd = htcondor.Schedd()
        jobIdToKill = []
        jobtokill = []
        origSiteLists = set()

        try:
            itobj = sd.xquery(
                'WMAgent_AgentName =?= %s && JobStatus =?= 1' %
                classad.quote(self.agent),
                ['WMAgent_JobID', 'DESIRED_Sites', 'ExtDESIRED_Sites'])

            for jobAd in itobj:
                jobAdId = jobAd.get('WMAgent_JobID')
                desiredSites = jobAd.get('DESIRED_Sites')
                extDesiredSites = jobAd.get('ExtDESIRED_Sites')
                if excludeSite and siteName == desiredSites:
                    jobIdToKill.append(jobAdId)
                else:
                    origSiteLists.add((desiredSites, extDesiredSites))
            logging.info("Set of %d site list condor combinations",
                         len(origSiteLists))
        except Exception as ex:
            msg = "Failed to query condor schedd: %s" % str(ex)
            logging.exception(msg)
            return jobtokill

        with sd.transaction() as dummyTxn:
            for siteStrings in origSiteLists:
                desiredList = set(
                    [site.strip() for site in siteStrings[0].split(",")])
                extDesiredList = set(
                    [site.strip() for site in siteStrings[1].split(",")])

                if excludeSite and siteName not in desiredList:
                    continue
                elif not excludeSite and (siteName in desiredList
                                          or siteName not in extDesiredList):
                    continue
                elif excludeSite:
                    desiredList.remove(siteName)
                    extDesiredList.add(siteName)
                else:  # well, then include
                    desiredList.add(siteName)
                    extDesiredList.remove(siteName)

                # now put it back in the string format expected by condor
                desiredListStr = ",".join(desiredList)
                extDesiredListStr = ",".join(extDesiredList)

                try:
                    sd.edit(
                        'DESIRED_Sites =?= %s && ExtDESIRED_Sites =?= %s' %
                        (classad.quote(siteStrings[0]),
                         classad.quote(siteStrings[1])), "DESIRED_Sites",
                        classad.quote(str(desiredListStr)))
                    sd.edit(
                        'DESIRED_Sites =?= %s && ExtDESIRED_Sites =?= %s' %
                        (classad.quote(siteStrings[0]),
                         classad.quote(siteStrings[1])), "ExtDESIRED_Sites",
                        classad.quote(str(extDesiredListStr)))
                except RuntimeError as ex:
                    msg = 'Failed to edit job sites via condor; possibly no matching jobs were left in the queue: %s' % str(ex)
                    logging.warning(msg)

        # now update the list of jobs to be killed
        jobtokill = [job for job in jobs if job['id'] in jobIdToKill]

        return jobtokill
Exemplo n.º 54
0
    def alter_submit(self, crab_retry):
        """
        Copy the content of the generic file Job.submit into a job-specific file
        Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
        Add also parameters that can be overwritten at each manual job resubmission
        (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
        """
        ## Start the Job.<job_id>.submit content with the CRAB_Retry.
        new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
        msg = "Setting CRAB_Retry = %s" % (crab_retry)
        self.logger.info(msg)
        ## For the parameters that can be overwritten at each manual job resubmission,
        ## read them from the task ad, unless there is resubmission information there
        ## and this job is not one that has to be resubmitted, in which case we should
        ## use the same parameters (site black- and whitelists, requested memory, etc)
        ## as used by the previous job retry (which are saved in self.resubmit_info).
        CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
        use_resubmit_info = False
        resubmit_jobids = []
        if 'CRAB_ResubmitList' in self.task_ad:
            resubmit_jobids = self.task_ad['CRAB_ResubmitList']
            try:
                resubmit_jobids = set(resubmit_jobids)
                if resubmit_jobids and self.job_id not in resubmit_jobids:
                    use_resubmit_info = True
            except TypeError:
                resubmit_jobids = True
        ## If there is no resubmit_info, we can of course not use it.
        if not self.resubmit_info:
            use_resubmit_info = False
        ## Get the resubmission parameters.
        maxjobruntime = None
        maxmemory     = None
        numcores      = None
        priority      = None
        if not use_resubmit_info:
            #if 'MaxWallTimeMins_RAW' in self.task_ad:
            #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
            #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
            #        self.resubmit_info['maxjobruntime'] = maxjobruntime
            if 'MaxWallTimeMins' in self.task_ad:
                maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
            if 'RequestMemory' in self.task_ad:
                maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
            if 'RequestCpus' in self.task_ad:
                numcores = int(str(self.task_ad.lookup('RequestCpus')))
            if 'JobPrio' in self.task_ad:
                priority = int(str(self.task_ad['JobPrio']))
        else:
            inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
            while inkey not in self.resubmit_info and int(inkey) > 0:
                inkey = str(int(inkey) - 1)
            maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
            maxmemory     = self.resubmit_info[inkey].get('maxmemory')
            numcores      = self.resubmit_info[inkey].get('numcores')
            priority      = self.resubmit_info[inkey].get('priority')
        ## Save the (new) values of the resubmission parameters in self.resubmit_info
        ## for the current job retry number.
        outkey = str(crab_retry)
        if outkey not in self.resubmit_info:
            self.resubmit_info[outkey] = {}
        self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
        self.resubmit_info[outkey]['maxmemory']     = maxmemory
        self.resubmit_info[outkey]['numcores']      = numcores
        self.resubmit_info[outkey]['priority']      = priority
        self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
        self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
        ## Add the resubmission parameters to the Job.<job_id>.submit content.
        if maxjobruntime is not None:
            new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
        if maxmemory is not None:
            new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
        if numcores is not None:
            new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
        if priority is not None:
            new_submit_text += '+JobPrio = %s\n' % (str(priority))

        ## Within the schedd, order the first few jobs of this task ahead of jobs from other tasks with the same JobPrio.
        pre_job_prio = 1
        if self.job_id <= 5:
            pre_job_prio = 0
        new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

        ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
        ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
        ## run the job with the higher PostJobPrio1.
        new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

        ## Order retries before all other jobs in this task
        new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

        ## This is used to send the location of the log files to dashboard
        try:
            storage_rules = htcondor.param['CRAB_StorageRules']
        except KeyError:  # CRAB_StorageRules not defined in the condor config
            storage_rules = "^/home/remoteGlidein,http://submit-5.t2.ucsd.edu/CSstoragePath"
        new_submit_text += '+CRAB_UserWebDir = "%s"\n' % getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)

        try:
            with open('proxied_webdir') as fd:
                proxied_webdir = fd.read()
            new_submit_text += '+CRAB_UserWebDirPrx = "%s"\n' % proxied_webdir
        except IOError as e:
            self.logger.error("I/O error(%s): %s when looking for the proxied_webdir file. Might be normal"
                              " if the schedd does not have a proxiedurl in the REST external config." % (e.errno, e.strerror))
        ## Add the site black- and whitelists and the DESIRED_SITES to the
        ## Job.<job_id>.submit content.
        new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

        ## Add group information:
        username = self.task_ad.get('CRAB_UserHN')
        if 'CMSGroups' in self.task_ad:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
        elif username:
            groups = CMSGroupMapper.map_user_to_groups(username)
            if groups:
                new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

        ## Finally add (copy) all the content of the generic Job.submit file.
        with open("Job.submit", 'r') as fd:
            new_submit_text += fd.read()
        ## Write the Job.<job_id>.submit file.
        with open("Job.%d.submit" % (self.job_id), 'w') as fd:
            fd.write(new_submit_text)
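
The CRAB_StorageRules value above looks like a "pattern,replacement" pair. Assuming (this file does not confirm it) that getWebdirForDb applies the pair as a regex rewrite of the local webdir path, a sketch of that rewrite:

import re

def webdir_for_db(local_webdir, storage_rules):
    # Hypothetical reading of the rule format: "regex,replacement".
    pattern, replacement = storage_rules.split(",", 1)
    return re.sub(pattern, replacement, local_webdir)

print(webdir_for_db(
    "/home/remoteGlidein/181010_123456:user_task",
    "^/home/remoteGlidein,http://submit-5.t2.ucsd.edu/CSstoragePath",
))
# -> http://submit-5.t2.ucsd.edu/CSstoragePath/181010_123456:user_task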